<div>Dear Ensembl Devs</div><div><br></div><div>I am struggling with an issue of the memory footprint of a script.</div><div>I have installed a local version of Ensembl V64 and have created a script that basically retrieves all known variation on any given chromosome and outputs to files some relevant information regarding allele frequencies and any annotations connected with each variation.</div>

<div><br></div><div>Because I am looping though all variants and printing the data gathered to files I did not expect the memory footprint of the script to be as large as it is and it seems that for big chromosomes the script keeps increasing the memory allocated to him until it will eventually crash.</div>

<div><br></div><div>Can any of you find where I have gone wrong on my code?</div><div>I appreciate any insights into what I might be doing wrong.</div><div><br></div><div>Best regards</div><div><br></div><div>      Duarte Molha</div>

<div><br></div><div>Script:</div><div><br></div><div><div>#!/usr/bin/perl -w</div><div><br></div><div><br></div><div>use lib '/array/dept/compbio/ensembl/current_api/ensembl-variation/modules';<span class="Apple-tab-span" style="white-space:pre">            </span># points to latest ensemblt core API</div>

<div><br></div><div>use Bio::EnsEMBL::Registry;</div><div>use Statistics::Descriptive;</div><div>use Term::ProgressBar;</div><div>use IO::File;</div><div>use strict;</div><div><br></div><div>my $host     = 'myHost';</div>

<div>my $user     =  'user';</div><div>my $password = 'pass';</div><div>my $port     = 3306;</div><div><br></div><div># get registry</div><div>my $reg = 'Bio::EnsEMBL::Registry';</div><div>    </div>

<div><br></div><div>$reg->load_registry_from_db(</div><div><span class="Apple-tab-span" style="white-space:pre">       </span>-host       => $host,</div><div><span class="Apple-tab-span" style="white-space:pre">     </span>-user       => $user,</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>-pass       => $password,</div><div><span class="Apple-tab-span" style="white-space:pre"> </span>-port       => $port,</div><div><span class="Apple-tab-span" style="white-space:pre">     </span>-species    => 'Homo sapiens',</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>-verbose    => 1,</div><div>);</div><div><br></div><div>my $vfa   = $reg->get_adaptor('Homo sapiens', 'variation', 'variationfeature');</div>

<div>my $va    = $reg->get_adaptor('Homo sapiens', 'variation', 'variation');</div><div>my $sa    = $reg->get_adaptor('Homo sapiens', 'core', 'slice');</div><div>my $vaa   = $reg->get_adaptor('Homo sapiens', 'variation', 'variationannotation');</div>

<div><br></div><div>my $freqs_file;</div><div>my $annot_file;</div><div><br></div><div>my $chr = shift;</div><div><br></div><div><br></div><div>my $core_mca = $reg->get_adaptor('Homo sapiens', 'core', 'metacontainer');</div>

<div>unless ($core_mca){</div><div><span class="Apple-tab-span" style="white-space:pre">    </span>print STDERR "Failed to connect to local database\n";</div><div><span class="Apple-tab-span" style="white-space:pre">      </span>die;</div>

<div>}</div><div><br></div><div><br></div><div>$freqs_file = new IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_freqs.txt");</div><div>$annot_file = new IO::File(">/ReferenceData/NGS_preprocessed_data/v64/".$chr."_annots.txt");</div>

<div><br></div><div>run_data($chr);</div><div><br></div><div>$freqs_file->close();</div><div>$annot_file->close();</div><div><br></div><div>#main sub to run all the vars found on each inputr chr</div><div>sub run_data{</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>my $chr = shift;</div><div><span class="Apple-tab-span" style="white-space:pre">     </span></div><div><span class="Apple-tab-span" style="white-space:pre">     </span>#get slice for chr</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>my $slice = "";</div><div><span class="Apple-tab-span" style="white-space:pre">    </span>eval { $slice = $sa->fetch_by_region('chromosome', $chr); };</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span># if failed, try to get any seq region</div><div><span class="Apple-tab-span" style="white-space:pre">       </span>if(!defined($slice)) {</div><div><span class="Apple-tab-span" style="white-space:pre">               </span>$slice = $sa->fetch_by_region(undef, $chr);</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">    </span></div><div><span class="Apple-tab-span" style="white-space:pre">     </span># get all vars</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>my @vfas = @{$vfa->fetch_all_by_Slice($slice)};</div><div><span class="Apple-tab-span" style="white-space:pre">   </span></div><div><span class="Apple-tab-span" style="white-space:pre">     </span># progress bar initialization</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>my $number_of_entries = scalar (@vfas);</div><div><span class="Apple-tab-span" style="white-space:pre">      </span>my $progress = Term::ProgressBar->new ({count => $number_of_entries ,</div>

<div><span class="Apple-tab-span" style="white-space:pre">                                              </span>name => "Gathering Data for chr $chr :", </div><div><span class="Apple-tab-span" style="white-space:pre">                                               </span>ETA => 'linear', </div>

<div><span class="Apple-tab-span" style="white-space:pre">                                      </span>});</div><div><span class="Apple-tab-span" style="white-space:pre">  </span>$progress->max_update_rate(1);</div><div><span class="Apple-tab-span" style="white-space:pre">    </span>my $next_update = 0;</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>my $i = 0;</div><div><span class="Apple-tab-span" style="white-space:pre">   </span></div><div><span class="Apple-tab-span" style="white-space:pre">     </span># loop though all vars found</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>foreach my $vf (@vfas) {</div><div><span class="Apple-tab-span" style="white-space:pre">             </span>#get freq data for var</div><div><span class="Apple-tab-span" style="white-space:pre">               </span>my @data1 = get_freqs_for_existing($vf);</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>#print to file</div><div><span class="Apple-tab-span" style="white-space:pre">               </span>if ($data1[0]){</div><div><span class="Apple-tab-span" style="white-space:pre">                      </span>print $freqs_file @data1;</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">            </span></div><div><span class="Apple-tab-span" style="white-space:pre">             </span>#get annotation data for far</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>my @data2 = get_phenotype_info($vf);</div><div><span class="Apple-tab-span" style="white-space:pre">         </span>#print to file</div><div><span class="Apple-tab-span" style="white-space:pre">               </span>if ($data2[0]){</div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>print $annot_file @data2;</div><div><span class="Apple-tab-span" style="white-space:pre">            </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">            </span></div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>#progress bar iteration</div><div><span class="Apple-tab-span" style="white-space:pre">              </span>$i++;</div><div><span class="Apple-tab-span" style="white-space:pre">                </span>$next_update = $progress->update() if ($i > $next_update);</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">    </span></div><div><span class="Apple-tab-span" style="white-space:pre">     </span>#update progress bar</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>$progress->update($number_of_entries) if $number_of_entries >= $next_update;</div><div>}</div><div><br></div><div><br></div><div># sub to retrieve allele frequency info for input variation</div>

<div>sub get_freqs_for_existing{</div><div><span class="Apple-tab-span" style="white-space:pre">    </span>my $vf = shift;</div><div><br></div><div><span class="Apple-tab-span" style="white-space:pre">     </span>my %data;</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>my @return_data;</div><div><span class="Apple-tab-span" style="white-space:pre">     </span>foreach my $allele (@{$vf->get_all_Alleles()}) {</div><div><span class="Apple-tab-span" style="white-space:pre">          </span></div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>next unless defined $allele->{population} || defined $allele->{'_population_id'};</div><div><span class="Apple-tab-span" style="white-space:pre">              </span>my $pop_name = $allele->population->name;</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span></div><div><span class="Apple-tab-span" style="white-space:pre">             </span>#populate hash if allele frequency info is available</div><div><span class="Apple-tab-span" style="white-space:pre">         </span>if ($allele->allele && $allele->frequency){</div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>#merge hapmap data and 1kg frequeny data</div><div><span class="Apple-tab-span" style="white-space:pre">                     </span>if ($pop_name =~ /^(CSHL-HAPMAP)|(1000)/i){</div>

<div><span class="Apple-tab-span" style="white-space:pre">                              </span>if ($pop_name =~ /^CSHL-HAPMAP/i){</div><div><span class="Apple-tab-span" style="white-space:pre">                                   </span>push(@{$data{hap}{$allele->allele}}, $allele->frequency);</div>

<div><span class="Apple-tab-span" style="white-space:pre">                              </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">                            </span>if($pop_name =~ /^1000/i){</div><div><span class="Apple-tab-span" style="white-space:pre">                                   </span>push(@{$data{"1kg"}{$allele->allele}}, $allele->frequency);</div>

<div><span class="Apple-tab-span" style="white-space:pre">                              </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">                            </span>push(@{$data{any}{$allele->allele}}, $allele->frequency);</div><div><span class="Apple-tab-span" style="white-space:pre">                      </span>}</div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>#gather frequency data for any population</div><div><span class="Apple-tab-span" style="white-space:pre">                    </span>push(@{$data{$pop_name}{$allele->allele}}, $allele->frequency);</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">    </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">    </span></div><div>

<span class="Apple-tab-span" style="white-space:pre"> </span># get statistics</div><div><span class="Apple-tab-span" style="white-space:pre">     </span>foreach my $pop (keys %data){</div><div><span class="Apple-tab-span" style="white-space:pre">                </span>foreach my $allele (keys %{$data{$pop}}){</div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>my $stat = Statistics::Descriptive::Sparse->new();</div><div><span class="Apple-tab-span" style="white-space:pre">                        </span>$stat->add_data(@{$data{$pop}{$allele}}); </div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>my $mean =  sprintf "%.3f", $stat->mean();</div><div><span class="Apple-tab-span" style="white-space:pre">                      </span>my $std  =  sprintf "%.3f", $stat->standard_deviation();</div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>my $min  =  sprintf "%.3f", $stat->min();</div><div><span class="Apple-tab-span" style="white-space:pre">                       </span>push (@return_data, $vf->variation_name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\t".$pop."\t".$allele."\t".$mean."\t".$min."\t".$std."\n");</div>

<div><span class="Apple-tab-span" style="white-space: pre; ">           </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">    </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">    </span></div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>return ((@return_data) ? @return_data : undef);</div><div><br></div><div>}</div><div><br></div><div># sub to get all available annotation for input variation</div>

<div>sub get_phenotype_info{</div><div><span class="Apple-tab-span" style="white-space:pre">        </span>my $vf = shift;</div><div><span class="Apple-tab-span" style="white-space:pre">      </span>my @alleles = split /\//, $vf->allele_string;</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>my $var = $vf->variation();</div><div><span class="Apple-tab-span" style="white-space:pre">       </span>my %vas;</div><div><span class="Apple-tab-span" style="white-space:pre">     </span>my @return_data;</div>

<div><br></div><div><span class="Apple-tab-span" style="white-space:pre">     </span>#get all variation annotations available</div><div><span class="Apple-tab-span" style="white-space:pre">     </span>for my $va (@{ $var->get_all_VariationAnnotations() }) {</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>my $risk_allele = "-";</div><div><span class="Apple-tab-span" style="white-space:pre">             </span>foreach my $allele (@alleles){</div><div><span class="Apple-tab-span" style="white-space:pre">                       </span>if ($va->associated_variant_risk_allele){</div>

<div><span class="Apple-tab-span" style="white-space:pre">                              </span>my $risk_allele_annot = (split /\-/, $va->associated_variant_risk_allele)[1];</div><div><span class="Apple-tab-span" style="white-space:pre">                             </span>if ($risk_allele_annot && $risk_allele_annot eq $allele){</div>

<div><span class="Apple-tab-span" style="white-space:pre">                                      </span>$risk_allele = $risk_allele_annot;</div><div><span class="Apple-tab-span" style="white-space:pre">                                   </span>last;</div><div><span class="Apple-tab-span" style="white-space:pre">                                </span>}</div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">            </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">            </span>my $associated_genes = $va->associated_gene || "-";</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>my $pheno_desc       = $va->phenotype_description || "-";</div><div><span class="Apple-tab-span" style="white-space:pre">               </span>my $pheno_source     = $va->source_name || "-";</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>my $pheno_study      = $va->study_description || "-";</div><div><span class="Apple-tab-span" style="white-space:pre">           </span>my $pheno_study_reference = $va->external_reference || "-";</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>my $string = join "\t", $associated_genes, $pheno_desc, $pheno_source, $pheno_study, $pheno_study_reference;</div><div><span class="Apple-tab-span" style="white-space:pre">               </span>push (@{$vas{$risk_allele}}, $string);</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span>}</div><div><br></div><div><span class="Apple-tab-span" style="white-space:pre">   </span>#retrieve all annotations info gathered </div><div><span class="Apple-tab-span" style="white-space:pre">     </span>if (%vas){</div>

<div><span class="Apple-tab-span" style="white-space:pre">              </span>foreach my $allele (keys %vas){</div><div><span class="Apple-tab-span" style="white-space:pre">                      </span>foreach my $annotation (@{$vas{$allele}}){</div>
<div>
<span class="Apple-tab-span" style="white-space:pre">                         </span>push (@return_data, $var->name."\t"."chr".$vf->slice->seq_region_name().":".$vf->seq_region_start().'-'.$vf->seq_region_end()."\tannot\t".$allele."\t".$annotation."\n");</div>

<div><span class="Apple-tab-span" style="white-space:pre">                      </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">            </span>}</div><div><span class="Apple-tab-span" style="white-space:pre">    </span>}</div>

<div><span class="Apple-tab-span" style="white-space:pre">      </span></div><div><span class="Apple-tab-span" style="white-space:pre">     </span>return ((@return_data) ? @return_data : undef);</div><div>}</div></div>