c4onastick has asked for the wisdom of the Perl Monks concerning the following question:
gene 337..2799 /gene="thrA" /locus_tag="t0002" /db_xref="GeneID:1066974" CDS 337..2799 /gene="thrA" /locus_tag="t0002" /note="multifunctional homotetrameric enzyme that catalyzes the phosphorylation of aspartate to for +m aspartyl-4-phosphate as well as conversion of asp +artate semialdehyde to homoserine; functions in a number + of amino acid biosynthetic pathways" /codon_start=1 /transl_table=11 /product="bifunctional aspartokinase I/homeserine dehydrogenase I" /protein_id="NP_803887.1" /db_xref="GI:29140545" /db_xref="GeneID:1066974" /translation="MRVLKFGGTSVANAERFLRVADILESNSRQGQVAT +VLSAPAKIT NHLVAMIEKTIGGQDALPNISDAERIFSDLLAGLASAQPGFPLARLKMV +VEQEFAQIK HVLHGISLLGQCPDSINAALICRGEKMSIAIMAGLLEARGHRVTVIDPV +EKLLAVGHY LESTVDIAESTRRIAASQIPADHMILMAGFTAGNEKGELVVLGRNGSDY +SAAVLAACL RADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHP +RTITPIAQF QIPCLIKNTGNPQAPGTLIGASSDDDNLPVKGISNLNNMAMFSVSGPGM +KGMIGMAAR VFAAMSRAGISVVLITQSSSEYSISFCVPQSDCARARRAMQDEFYLELK +EGLLEPLAV TERLAIISVVGDGMRTLRGISAKFFAALARANINIVAIAQGSSERSISV +VVNNDDATT GVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQTWLKNKHIDLR +VCGVANSKA LLTNVHGLNLDNWQAELAQANAPFNLGRLIRLVKEYHLLNPVIVDCTSS +QAVADQYAD FLREGFHVVTPNKKANTSSMDYYHQLRFAAAQSRRKFLYDTNVGAGLPV +IENLQNLLN AGDELQKFSGILSGSLSFIFGKLEEGMSLSQATALAREMGYTEPDPRDD +LSGMDVARK LLILARETGRELELSDIVIEPVLPDEFDASGDVTAFMAHLPQLDDAFAA +RVAKARDEG KVLRYVGNIEEDGVCRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLP +LVLRGYGAG NDVTAAGVFADLLRTLSWKLGV" gene 2801..3730 /gene="thrB" /locus_tag="t0003" /db_xref="GeneID:1066981" CDS 2801..3730 /gene="thrB" /locus_tag="t0003" /note="catalyzes the formation of O-phospho-L-hom +oserine from L-homoserine in threonine biosynthesis from +asparate" /codon_start=1 /transl_table=11 /product="homoserine kinase" /protein_id="NP_803888.1" /db_xref="GI:29140546" /db_xref="GeneID:1066981" /translation="MVKVYAPASSANMSVGFDVLGAAVTPVDGTLLGDV +VSVEAADHF RLHNLGRFADKLPPEPRENIVYQCWERFCQALGKTIPVAMTLEKNMPIG +SGLGSSACS VVAALVAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGM +QLMIEENGI ISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAG +FIHACYSRQ PQLAAALMKDVIAEPYRARLLPGFSQARQAVSEIGALASGISGSGPTLF +ALCDKPETA QRVADWLSKHYLQNQEGFVHICRLDTAGARVVG"
ORIGIN 1 agagattacg tctggttgca agagatcata acaggggaaa ttgattgaaa ataaa +tatat 61 cgccagcagc acatgaacaa gtttcggaat gtgatcaatt taaaaattta ttgac +ttagg 121 cgggcagata ctttaaccaa tataggaata caagacagac aaataaaaat gacag +agtac 181 acaacatcca tgaaccgcat cagcaccacc accattacca ccatcaccat tacca +caggt ... 4791781 acgcgcgcgc cttttacgcc tgctaaccac tctggaggcg gccgatgacc acaaa +ttaac 4791841 cgactggcta caacagcgaa tcggcctgct gggacagcga gatacggcaa tgttg +caccg 4791901 tttggtccat gatattgaaa aaaaactaac aaaataacgt gttgtaattt ttaaa +ataat 4791961 a //
Other than a prototype error when I prototyped the html_out sub it works great. Thanks again!#!/usr/bin/perl use warnings; use strict; use Getopt::Std; local $/; our %opts; getopts('hf:', \%opts); die("Usage: uv_mutant.pl -f <file>.gbk\nAdd -h for html output\n") unl +ess $opts{f}; my $file = $opts{f}; my $genome; my $total_mutations; open(FH, $file) or die "File couldn't be opened"; my $contents = <FH>; close(FH); #Extract the entire genome $contents =~ m#ORIGIN(.+?)//#s or die "No genome data found."; $genome = $1; # Remove extraneous characters, make it one big long string to use sub +str position on it $genome =~ s/[\d\s]+//g; # Calculate total possible mutations while( $genome =~ /[ct](?=[ct])/g ) { $total_mutations++; } #print "\nTotal possible mutations (pyramidine dimerizations): $total_ +mutations\n\n"; # Extract all the gene definitions, end at protein translation. my @genes; @genes = $contents =~ m#(?<!/)gene.+?/translation#sg; my %mutant_genes; # Set up output #printf "%-20s%-10s%-25s%-15s%s\n", "Gene", "GeneID", "Possible Mutati +ons", "Probability", "Protein Product"; #print "=" x 85, "\n"; foreach my $gene (@genes) { # filter out complement genes if( $gene =~ m/gene\s+(\d+)\.\.(\d+)/ ) { my $gene_mutations = 0; my $gene_name; my $geneid = "Unknown"; #GeneID for online db my $gene_product = "Unknown"; #Encoded protein my $start = $1; $start--; #increment down for substr my $end = $2; $end--; #increment down for substr my $length = $end - $start; ($gene_name) = $gene =~ m#/gene="([^"]+)"#; unless( defined($gene_name) ) { $gene_name = "Unknown$length"; } my $sub_genome = substr($genome, $start, $length); $gene_mutations = () = $sub_genome =~/[ct](?=[ct])/g; my $probability = $gene_mutations/$total_mutations*100; #Pull out GeneID (if exists) if( $gene =~ m#/db_xref="GeneID:(\d+)"# ) { $geneid = $1; } #Pull out Protein Product, if exists if( $gene =~ m#/product="([^"]+)"# ) { $gene_product = $1; $gene_product =~ s/\n\s*/ /g; #Clear out newlines and inde +ntation } $mutant_genes{$gene_name} = { gene_id => $geneid, prod_pro => $gene_product, gene_mutants => $gene_mutations, mutant_prob => $probability }; #printf "%-20s%-10d%-25d%.5f%% %s\n", $gene_name, $geneid, $ge +ne_mutations, $probability, $gene_product; } } if($opts{h}) { html_out($total_mutations, %mutant_genes) }else{ print "UV Mutation (pyramidine dimerization) Analysis\n"; print "Total possible mutations in genome: $total_mutations\n\n"; print "\nGenes sorted by UV mutation probability:\n", "=" x 65, "\ +n"; foreach (sort by_descending_probability keys %mutant_genes) { printf "%-20s%.5f%% %s\n", $_, $mutant_genes{$_}{mutant_prob +}, $mutant_genes{$_}{prod_pro}; } } sub by_descending_probability { $mutant_genes{$b}{mutant_prob} <=> $mutant_genes{$a}{mutant_prob}; } sub html_out { my $total_muts = shift; print "<html>\n<head>\n<body>\n"; print "<h1>UV Mutant Analysis</h1>\n"; print "<b>Total Possible mutations in Genome: $total_muts </b><br +/>\n"; print "<b>Gene mutations sorted by decending probability of mutati +on</b><br />\n"; print "<table>\n<tr><td>Gene</td><td>Possible Gene Mutations</td>< +td>Mutation Probability (%)</td><td>Gene Product</td></tr>\n"; #my %mutant_genes = shift; #Gives an odd numbered hash assignment +error when prototyped foreach (sort by_descending_probability keys %mutant_genes) { print "<tr><td><a href=\"http://www.ncbi.nlm.nih.gov/entrez/qu +ery.fcgi?db=gene&cmd=Retrieve&dopt=Graphics&list_uids=$mutant_genes{$ +_}{gene_id}\">$_</a></td><td>$mutant_genes{$_}{gene_mutants}</td><td> +$mutant_genes{$_}{mutant_prob}</td><td>$mutant_genes{$_}{prod_pro}</t +d></tr>\n"; } print "</table>\n"; print "</body>\n</html>"; }
#!/usr/bin/perl use warnings; use strict; use Getopt::Std; use Number::Format qw(:subs); undef $/; my $num_precision = 5; our %opts; getopts('hf:', \%opts); die("Usage: uv_mutant.pl -f <file>.gbk\nAdd -h for html output\n") unl +ess $opts{f}; my $file = $opts{f}; my $genome; open(FH, '<', $file) or die "File couldn't be opened: $!"; my $contents = <FH>; close(FH); #Extract the entire genome $contents =~ m#ORIGIN(.+?)//#s or die "No genome data found."; $genome = $1; # Remove extraneous characters, make it one big long string to use sub +str position on it $genome =~ s/[\d\r\n\s]+//g; # Calculate total possible mutations my %mutations = find_possible_mutations($genome); # Extract all the gene definitions, end at protein translation. my @genes; @genes = $contents =~ m#(?<!/)gene.+?/translation#sg; my %mutant_genes; # Set up output #printf "%-20s%-10s%-25s%-15s%s\n", "Gene", "GeneID", "Possible Mutati +ons", "Probability", "Protein Product"; #print "=" x 85, "\n"; foreach my $gene (@genes) { # filter out complement genes if( $gene =~ m/gene\s+(\d+)\.\.(\d+)/ ) { my $gene_name; my $geneid = "Unknown"; #GeneID for online db my $gene_product = "Unknown"; #Encoded protein my $start = $1 - 1; #$start--; #increment down for substr my $end = $2 - 1; #$end--; #increment down for substr my $length = $end - $start; ($gene_name) = $gene =~ m#/gene="([^"]+)"#; unless( defined($gene_name) ) { $gene_name = "Unknown$length"; } my $sub_genome = substr($genome, $start, $length); my %gene_mutations = find_possible_mutations($sub_genome); my %probability = ( ptt => format_number($gene_mutations{t +t}/$mutations{tt}*100, $num_precision), pct => format_number($gene_mutations{ct}/$mutation +s{ct}*100, $num_precision), pcc => format_number($gene_mutations{cc}/$mutation +s{cc}*100, $num_precision), ptotal => format_number($gene_mutations{total}/$mu +tations{total}*100, $num_precision) ); #Pull out GeneID (if exists) if( $gene =~ m#/db_xref="GeneID:(\d+)"# ) { $geneid = $1; } #Pull out Protein Product, if exists if( $gene =~ m#/product="([^"]+)"# ) { $gene_product = $1; $gene_product =~ s/\n\s*/ /g; #Clear out newlines and inde +ntation } $mutant_genes{$gene_name} = { gene_id => $geneid, prod_pro => $gene_product, %gene_mutations, %probability }; } } if($opts{h}) { html_out($mutations{total}, \%mutant_genes) }else{ print "UV Mutation (pyramidine dimerization) Analysis\n"; print "Total possible mutations in genome: $mutations{total}\n\n"; print "\nGenes sorted by UV mutation probability:\n", "=" x 65, "\ +n"; foreach (sort by_descending_probability keys %mutant_genes) { printf "%-20s%.5f%% %s\n", $_, $mutant_genes{$_}{ptt}, $muta +nt_genes{$_}{prod_pro}; } } sub by_descending_probability { $mutant_genes{$b}{ptt} <=> $mutant_genes{$a}{ptt}; } sub html_out { my ($total_muts, $mutant_ref) = @_; my %mutant_genes = %{$mutant_ref}; print "<html>\n<head>\n<body>\n"; print "<h1>UV Mutant Analysis</h1>\n"; print "<b>Total Possible mutations in Genome: $total_muts</b><br / +>\n"; print "<b>Gene mutations sorted by decending probability of mutati +on</b><br />\n"; print "<table>\n<tr><td>Gene</td><td>Possible Gene Mutations TT</t +d><td>Possible Gene Mutations CT</td><td>Possible Gene Mutations CC</ +td><td>Mutation Probability TT(%)</td><td>Mutation Probability CT(%)< +/td><td>Mutation Probability CC(%)</td><td>Gene Product</td></tr>\n"; #my %mutant_genes = shift; #Gives an odd numbered hash assignment +error when prototyped foreach (sort by_descending_probability keys %mutant_genes) { print "<tr><td><a href=\"http://www.ncbi.nlm.nih.gov/entrez/qu +ery.fcgi?db=gene&cmd=Retrieve&dopt=Graphics&list_uids=$mutant_genes{$ +_}{gene_id}\">$_</a></td><td>$mutant_genes{$_}{tt}</td><td>$mutant_ge +nes{$_}{ct}</td><td>$mutant_genes{$_}{cc}</td><td>$mutant_genes{$_}{p +tt}</td><td>$mutant_genes{$_}{pct}</td><td>$mutant_genes{$_}{pcc}</td +><td>$mutant_genes{$_}{prod_pro}</td></tr>\n"; } print "</table>\n"; print "</body>\n</html>"; } sub find_possible_mutations { my $genome = shift; my %mutations = ( tt => 0, ct => 0, cc => 0, total => 0 ); # Set all values to zero to start incase + no possible sites found. # Find all possible Thymidine dimerizations (most common dimerizat +ion) while( $genome =~ /t(?=t)/g ) { $mutations{tt}++; } # Find all possible heterogeneous dimerization sites (less common) while( $genome =~ /c(?=t)/g ) { $mutations{ct}++; } while( $genome =~ /t(?=c)/g ) { $mutations{ct}++; } # Find all possible Cystine dimerization sites (least common) while( $genome =~ /c(?=c)/g ) { $mutations{cc}++; } # Store the total mutations for later calculations $mutations{total} = $mutations{tt} + $mutations{ct} + $mutations{c +c}; return %mutations; }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Genome UV Mutation Script -- Critique
by kyle (Abbot) on Mar 14, 2007 at 21:39 UTC | |
|
Re: Genome UV Mutation Script -- Critique
by lima1 (Curate) on Mar 14, 2007 at 22:22 UTC | |
by BrowserUk (Patriarch) on Mar 15, 2007 at 04:54 UTC | |
by lima1 (Curate) on Mar 15, 2007 at 09:20 UTC | |
|
Re: Genome UV Mutation Script -- Critique
by jdporter (Paladin) on Mar 14, 2007 at 21:45 UTC | |
by kyle (Abbot) on Mar 14, 2007 at 21:54 UTC | |
by jdporter (Paladin) on Mar 15, 2007 at 01:42 UTC | |
|
Re: Genome UV Mutation Script -- Critique
by jwkrahn (Abbot) on Mar 14, 2007 at 22:06 UTC | |
|
Re: Genome UV Mutation Script -- Critique
by Anno (Deacon) on Mar 14, 2007 at 22:28 UTC | |
|
Re: Genome UV Mutation Script -- Critique
by c4onastick (Friar) on Mar 15, 2007 at 03:24 UTC |