gene 337..2799 /gene="thrA" /locus_tag="t0002" /db_xref="GeneID:1066974" CDS 337..2799 /gene="thrA" /locus_tag="t0002" /note="multifunctional homotetrameric enzyme that catalyzes the phosphorylation of aspartate to form aspartyl-4-phosphate as well as conversion of aspartate semialdehyde to homoserine; functions in a number of amino acid biosynthetic pathways" /codon_start=1 /transl_table=11 /product="bifunctional aspartokinase I/homeserine dehydrogenase I" /protein_id="NP_803887.1" /db_xref="GI:29140545" /db_xref="GeneID:1066974" /translation="MRVLKFGGTSVANAERFLRVADILESNSRQGQVATVLSAPAKIT NHLVAMIEKTIGGQDALPNISDAERIFSDLLAGLASAQPGFPLARLKMVVEQEFAQIK HVLHGISLLGQCPDSINAALICRGEKMSIAIMAGLLEARGHRVTVIDPVEKLLAVGHY LESTVDIAESTRRIAASQIPADHMILMAGFTAGNEKGELVVLGRNGSDYSAAVLAACL RADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQF QIPCLIKNTGNPQAPGTLIGASSDDDNLPVKGISNLNNMAMFSVSGPGMKGMIGMAAR VFAAMSRAGISVVLITQSSSEYSISFCVPQSDCARARRAMQDEFYLELKEGLLEPLAV TERLAIISVVGDGMRTLRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATT GVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQTWLKNKHIDLRVCGVANSKA LLTNVHGLNLDNWQAELAQANAPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYAD FLREGFHVVTPNKKANTSSMDYYHQLRFAAAQSRRKFLYDTNVGAGLPVIENLQNLLN AGDELQKFSGILSGSLSFIFGKLEEGMSLSQATALAREMGYTEPDPRDDLSGMDVARK LLILARETGRELELSDIVIEPVLPDEFDASGDVTAFMAHLPQLDDAFAARVAKARDEG KVLRYVGNIEEDGVCRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAG NDVTAAGVFADLLRTLSWKLGV" gene 2801..3730 /gene="thrB" /locus_tag="t0003" /db_xref="GeneID:1066981" CDS 2801..3730 /gene="thrB" /locus_tag="t0003" /note="catalyzes the formation of O-phospho-L-homoserine from L-homoserine in threonine biosynthesis from asparate" /codon_start=1 /transl_table=11 /product="homoserine kinase" /protein_id="NP_803888.1" /db_xref="GI:29140546" /db_xref="GeneID:1066981" /translation="MVKVYAPASSANMSVGFDVLGAAVTPVDGTLLGDVVSVEAADHF RLHNLGRFADKLPPEPRENIVYQCWERFCQALGKTIPVAMTLEKNMPIGSGLGSSACS VVAALVAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEENGI ISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQ 
PQLAAALMKDVIAEPYRARLLPGFSQARQAVSEIGALASGISGSGPTLFALCDKPETA QRVADWLSKHYLQNQEGFVHICRLDTAGARVVG" ##

##

ORIGIN      
        1 agagattacg tctggttgca agagatcata acaggggaaa ttgattgaaa ataaatatat
       61 cgccagcagc acatgaacaa gtttcggaat gtgatcaatt taaaaattta ttgacttagg
      121 cgggcagata ctttaaccaa tataggaata caagacagac aaataaaaat gacagagtac
      181 acaacatcca tgaaccgcat cagcaccacc accattacca ccatcaccat taccacaggt
      ...
  4791781 acgcgcgcgc cttttacgcc tgctaaccac tctggaggcg gccgatgacc acaaattaac
  4791841 cgactggcta caacagcgaa tcggcctgct gggacagcga gatacggcaa tgttgcaccg
  4791901 tttggtccat gatattgaaa aaaaactaac aaaataacgt gttgtaattt ttaaaataat
  4791961 a
//

##

##

#!/usr/bin/perl

use warnings;
use strict;
use Getopt::Std;
# Slurp mode: with the input record separator undefined, one read returns
# the whole file as a single string.
local $/;

# Command-line switches: -f takes a value (the input file), -h is a flag
# that selects the HTML report further down.
our %opts;
getopts('hf:', \%opts);

die("Usage: uv_mutant.pl -f .gbk\nAdd -h for html output\n") unless $opts{f};
my $file = $opts{f};
my $genome;
# Start from zero so the total is still a number when no site is found.
my $total_mutations = 0;

# Three-argument open with a lexical handle: the file name can never be
# read as an open mode, and the handle is not a package-wide bareword.
open(my $gbk_fh, '<', $file) or die "File couldn't be opened: $!";
my $contents = <$gbk_fh>;
close($gbk_fh);
die "File couldn't be read: $!" unless defined $contents;

# Extract the entire genome: everything between the ORIGIN keyword and the
# first "//" that follows it.
$contents =~ m#ORIGIN(.+?)//#s or die "No genome data found.";
$genome = $1;

# Remove position numbers and whitespace so the sequence is one long string
# that substr positions can be applied to.
$genome =~ s/[\d\s]+//g;

# Count every position where a c or t is immediately followed by a c or t.
# The lookahead leaves the second base unconsumed, so overlapping sites
# (for example the two sites in "ttt") are each counted.
while( $genome =~ /[ct](?=[ct])/g ) {
	$total_mutations++;
}

#print "\nTotal possible mutations (pyramidine dimerizations): $total_mutations\n\n";

# Extract all the gene definitions, end at protein translation.
my @genes;
@genes = $contents =~ m#(? $geneid,
						prod_pro => $gene_product,
						gene_mutants => $gene_mutations,
						mutant_prob => $probability
						};

		#printf "%-20s%-10d%-25d%.5f%% %s\n", $gene_name, $geneid, $gene_mutations, $probability, $gene_product;
	}
}


# Choose the report format: HTML when -h was given, plain text otherwise.
if ( $opts{h} ) {
	html_out( $total_mutations, %mutant_genes );
}
else {
	# Title, genome-wide total, then a ruled heading for the gene list.
	print "UV Mutation (pyramidine dimerization) Analysis\n",
		"Total possible mutations in genome: $total_mutations\n\n",
		"\nGenes sorted by UV mutation probability:\n",
		"=" x 65, "\n";

	# One line per gene: name, probability to five decimals, product.
	for my $gene_name ( sort by_descending_probability keys %mutant_genes ) {
		my $entry = $mutant_genes{$gene_name};
		printf "%-20s%.5f%%   %s\n", $gene_name, $entry->{mutant_prob}, $entry->{prod_pro};
	}
}

# Sort comparator for gene names: orders them by the mutant_prob value
# stored for each name in %mutant_genes, largest first. It reads the sort
# variables $a and $b, so it is only meaningful as the comparator of sort.
sub by_descending_probability {
	my $prob_of_a = $mutant_genes{$a}{mutant_prob};
	my $prob_of_b = $mutant_genes{$b}{mutant_prob};
	return $prob_of_b <=> $prob_of_a;
}

# Print the analysis as an HTML page on STDOUT.
#
# Arguments: the first argument is the genome-wide count of possible
# mutation sites. Any further arguments are ignored; the table rows are read
# from the file-level %mutant_genes hash (keyed by gene name, with
# gene_mutants, mutant_prob and prod_pro entries) and ordered with
# by_descending_probability.
#
# NOTE(review): the text under review printed no tags and had the row
# template after the loop, where $_ no longer holds a gene name. The markup
# below is a reconstruction -- confirm it against the intended page layout.
sub html_out {
	my $total_muts = shift;

	# Escape the characters that are significant in HTML; undef becomes an
	# empty cell instead of raising an uninitialized-value warning.
	my $escape = sub {
		my $text = defined $_[0] ? $_[0] : '';
		$text =~ s/&/&amp;/g;
		$text =~ s/</&lt;/g;
		$text =~ s/>/&gt;/g;
		$text =~ s/"/&quot;/g;
		return $text;
	};

	print "<html>\n<head>\n<title>UV Mutant Analysis</title>\n</head>\n<body>\n";
	print "<h1>UV Mutant Analysis</h1>\n";
	print "<p>Total Possible mutations in Genome: ", $escape->($total_muts), "</p>\n";
	print "<p>Gene mutations sorted by descending probability of mutation</p>\n";
	print "<table border=\"1\">\n";
	print "<tr>",
		( map { "<th>$_</th>" } 'Gene', 'Possible Gene Mutations', 'Mutation Probability (%)', 'Gene Product' ),
		"</tr>\n";

	# One table row per gene, most probable first.
	foreach my $gene_name (sort by_descending_probability keys %mutant_genes) {
		my $entry = $mutant_genes{$gene_name};
		my @cells = ( $gene_name, @{$entry}{qw(gene_mutants mutant_prob prod_pro)} );
		print "<tr>", ( map { "<td>" . $escape->($_) . "</td>" } @cells ), "</tr>\n";
	}

	print "</table>\n</body>\n</html>\n";
}


##

##

#!/usr/bin/perl

use warnings;
use strict;
use Getopt::Std;
use Number::Format qw(:subs);
# Slurp mode: with the input record separator undefined, one read returns
# the whole file as a single string.
undef $/;

# Number of decimal places passed to format_number for the percentages.
my $num_precision = 5;

# Command-line switches: -f takes a value (the input file), -h is a flag
# that selects the HTML report further down.
our %opts;
getopts('hf:', \%opts);

die("Usage: uv_mutant.pl -f .gbk\nAdd -h for html output\n") unless $opts{f};
my $file = $opts{f};
my $genome;

# Lexical handle instead of the bareword FH, so the handle is not a
# package-wide global.
open(my $gbk_fh, '<', $file) or die "File couldn't be opened: $!";
my $contents = <$gbk_fh>;
close($gbk_fh);
die "File couldn't be read: $!" unless defined $contents;

# Extract the entire genome: everything between the ORIGIN keyword and the
# first "//" that follows it.
$contents =~ m#ORIGIN(.+?)//#s or die "No genome data found.";
$genome = $1;

# Remove position numbers and whitespace so the sequence is one long string
# that substr positions can be applied to.
$genome =~ s/[\d\r\n\s]+//g;

# Genome-wide counts of possible dimerization sites (tt, ct, cc, total).
my %mutations = find_possible_mutations($genome);

# Extract all the gene definitions, end at protein translation.
my @genes;
@genes = $contents =~ m#(? format_number($gene_mutations{tt}/$mutations{tt}*100, $num_precision),
					pct => format_number($gene_mutations{ct}/$mutations{ct}*100, $num_precision),
					pcc => format_number($gene_mutations{cc}/$mutations{cc}*100, $num_precision),
					ptotal => format_number($gene_mutations{total}/$mutations{total}*100, $num_precision)
				);
		
		#Pull out GeneID (if exists)
		if( $gene =~ m#/db_xref="GeneID:(\d+)"# ) {
			$geneid = $1;
		}
		#Pull out Protein Product, if exists
		if( $gene =~ m#/product="([^"]+)"# ) {
			$gene_product = $1;
			$gene_product =~ s/\n\s*/ /g; #Clear out newlines and indentation
		}

		$mutant_genes{$gene_name} = {	gene_id => $geneid,
						prod_pro => $gene_product,
						%gene_mutations,
						%probability
						};
	}
}


# Choose the report format: HTML when -h was given, plain text otherwise.
if ( $opts{h} ) {
	html_out( $mutations{total}, \%mutant_genes );
}
else {
	# Title, genome-wide total, then a ruled heading for the gene list.
	print "UV Mutation (pyramidine dimerization) Analysis\n",
		"Total possible mutations in genome: $mutations{total}\n\n",
		"\nGenes sorted by UV mutation probability:\n",
		"=" x 65, "\n";

	# One line per gene: name, TT probability to five decimals, product.
	for my $gene_name ( sort by_descending_probability keys %mutant_genes ) {
		my $entry = $mutant_genes{$gene_name};
		printf "%-20s%.5f%%   %s\n", $gene_name, $entry->{ptt}, $entry->{prod_pro};
	}
}

# Sort comparator for gene names: orders them by the ptt value stored for
# each name in %mutant_genes, largest first. It reads the sort variables $a
# and $b, so it is only meaningful as the comparator of sort.
sub by_descending_probability {
	my $ptt_of_a = $mutant_genes{$a}{ptt};
	my $ptt_of_b = $mutant_genes{$b}{ptt};
	return $ptt_of_b <=> $ptt_of_a;
}

# Print the analysis as an HTML page on STDOUT.
#
# Arguments:
#   $total_muts - genome-wide count of possible mutation sites.
#   $mutant_ref - reference to a hash keyed by gene name; each value is a
#                 hash reference with tt, ct, cc (site counts), ptt, pct,
#                 pcc (percentages) and prod_pro (gene product).
#
# Rows are ordered by ptt, largest first. Both the ordering and the cell
# values come from $mutant_ref, so the routine depends only on its
# arguments.
#
# NOTE(review): the text under review printed no tags and had the row
# template after the loop, where $_ no longer holds a gene name. The markup
# below is a reconstruction -- confirm it against the intended page layout.
sub html_out {
	my ($total_muts, $mutant_ref) = @_;
	$mutant_ref ||= {};

	# Escape the characters that are significant in HTML; undef becomes an
	# empty cell instead of raising an uninitialized-value warning.
	my $escape = sub {
		my $text = defined $_[0] ? $_[0] : '';
		$text =~ s/&/&amp;/g;
		$text =~ s/</&lt;/g;
		$text =~ s/>/&gt;/g;
		$text =~ s/"/&quot;/g;
		return $text;
	};

	my @headings = (
		'Gene',
		'Possible Gene Mutations TT',
		'Possible Gene Mutations CT',
		'Possible Gene Mutations CC',
		'Mutation Probability TT(%)',
		'Mutation Probability CT(%)',
		'Mutation Probability CC(%)',
		'Gene Product',
	);

	print "<html>\n<head>\n<title>UV Mutant Analysis</title>\n</head>\n<body>\n";
	print "<h1>UV Mutant Analysis</h1>\n";
	print "<p>Total Possible mutations in Genome: ", $escape->($total_muts), "</p>\n";
	print "<p>Gene mutations sorted by descending probability of mutation</p>\n";
	print "<table border=\"1\">\n";
	print "<tr>", ( map { "<th>$_</th>" } @headings ), "</tr>\n";

	# One table row per gene, highest TT probability first.
	my @gene_names = sort { $mutant_ref->{$b}{ptt} <=> $mutant_ref->{$a}{ptt} } keys %{$mutant_ref};
	foreach my $gene_name (@gene_names) {
		my $entry = $mutant_ref->{$gene_name};
		my @cells = ( $gene_name, @{$entry}{qw(tt ct cc ptt pct pcc prod_pro)} );
		print "<tr>", ( map { "<td>" . $escape->($_) . "</td>" } @cells ), "</tr>\n";
	}

	print "</table>\n</body>\n</html>\n";
}

# Count the adjacent-pyrimidine sites in a DNA sequence.
#
# Argument: the sequence as one string with nothing between the bases.
#           Matching is case-insensitive; undef is treated as an empty
#           sequence.
# Returns:  a flat key/value list (assign it to a hash) with the keys
#             tt    - positions where t is followed by t
#                     (thymine dimerization, the most common)
#             ct    - positions where c is followed by t, or t by c
#                     (heterogeneous dimerization, less common)
#             cc    - positions where c is followed by c
#                     (cytosine dimerization, the least common)
#             total - the sum of the three counts
#           Every key is present and is 0 when no such site exists.
#
# Overlapping sites are counted separately: "ttt" holds two tt sites.
sub find_possible_mutations {
	my $genome = shift;
	my %mutations = ( 	tt => 0,
				ct => 0,
				cc => 0,
				total => 0 ); # Set all values to zero to start in case no possible sites are found.
	return %mutations unless defined $genome;

	# One scan over the sequence: consume a single pyrimidine and peek at the
	# next base with a lookahead, so that base can start the following site.
	while( $genome =~ /([ct])(?=([ct]))/gi ) {
		my $pair = lc "$1$2";
		# "tc" and "ct" are the same heterogeneous site type.
		$pair = 'ct' if $pair eq 'tc';
		$mutations{$pair}++;
	}

	# Store the total mutations for later calculations
	$mutations{total} = $mutations{tt} + $mutations{ct} + $mutations{cc};
	return %mutations;
}

Gene	Possible Gene Mutations	Mutation Probability (%)	Gene Product
$_	$mutant_genes{$_}{gene_mutants}	$mutant_genes{$_}{mutant_prob}	$mutant_genes{$_}{prod_pro}

Gene	Possible Gene Mutations TT	Possible Gene Mutations CT	Possible Gene Mutations CC	Mutation Probability TT(%)	Mutation Probability CT(%)	Mutation Probability CC(%)	Gene Product
$_	$mutant_genes{$_}{tt}	$mutant_genes{$_}{ct}	$mutant_genes{$_}{cc}	$mutant_genes{$_}{ptt}	$mutant_genes{$_}{pct}	$mutant_genes{$_}{pcc}	$mutant_genes{$_}{prod_pro}