#!/usr/bin/perl
#
# uv_mutant.pl - UV mutation (pyrimidine dimer) site analysis of a GenBank file.
#
# Counts every adjacent-pyrimidine site (tt, ct/tc, cc) in the genome found in
# the ORIGIN section, counts the same sites inside each gene, and reports each
# gene's share of the genome-wide sites as a percentage.
#
# Options:
#   -f FILE   GenBank flat file to analyse (required)
#   -h        print the report as HTML instead of plain text
#
# NOTE(review): the copy of this script that was reviewed had lost its line
# breaks and every "<...>" span (the file read, the gene-extraction regex and
# loop header, and all HTML tags). Those parts are reconstructed below and are
# individually marked; confirm them against the upstream version.
use warnings;
use strict;
use Getopt::Std;
use Number::Format qw(:subs);

# Number of decimal places used when formatting percentages.
my $num_precision = 5;

# Command-line switches, filled in by getopts() inside main().
our %opts;

# Run only when executed as a script, so the subs can be loaded on their own.
main() unless caller;

# Entry point: parse options, read the GenBank file, analyse it, print report.
# Dies with a usage message when -f is missing, when the file cannot be
# opened, or when no ORIGIN ... // sequence section is present.
sub main {
    getopts('hf:', \%opts);
    die("Usage: uv_mutant.pl -f .gbk\nAdd -h for html output\n") unless $opts{f};
    my $file = $opts{f};

    # Slurp the whole file; $/ is localised so no other reader is affected.
    open(my $fh, '<', $file) or die "File couldn't be opened: $!";
    my $contents = do { local $/; <$fh> };
    close($fh);
    $contents = '' unless defined $contents;    # empty file reads as undef

    # Extract the entire genome.
    $contents =~ m#ORIGIN(.+?)//#s or die "No genome data found.";
    my $genome = $1;

    # Remove position numbers and whitespace so the sequence is one long
    # string that substr() positions can be applied to.
    $genome =~ s/[\d\s]+//g;

    # Calculate total possible mutations in the whole genome.
    my %mutations = find_possible_mutations($genome);

    # Extract all the gene definitions, ending at the protein translation.
    # NOTE(review): reconstructed pattern. Each record runs from a "gene"
    # feature key to the first /translation= qualifier, and is not allowed to
    # run on into the next "gene" feature, so genes without a translation are
    # skipped rather than swallowing their neighbour.
    my @genes = $contents
        =~ m#^ {5}gene\s+((?:(?!^ {5}gene\s).)+?)/translation=#gms;

    my %mutant_genes;
    foreach my $gene (@genes) {

        # NOTE(review): reconstructed. Uses the first start..end pair of the
        # location; strand (complement) and later join() segments are ignored.
        next unless $gene =~ m#(\d+)\.\.>?(\d+)#;
        my ($start, $end) = ($1, $2);
        next if $start < 1 or $end < $start or $end > length $genome;

        # NOTE(review): reconstructed. Genes are keyed by their /gene name.
        next unless $gene =~ m#/gene="([^"]+)"#;
        my $gene_name = $1;

        # GenBank positions are 1-based and inclusive.
        my $gene_seq = substr($genome, $start - 1, $end - $start + 1);
        my %gene_mutations = find_possible_mutations($gene_seq);

        my %probability = (
            ptt    => percent_of($gene_mutations{tt},    $mutations{tt}),
            pct    => percent_of($gene_mutations{ct},    $mutations{ct}),
            pcc    => percent_of($gene_mutations{cc},    $mutations{cc}),
            ptotal => percent_of($gene_mutations{total}, $mutations{total}),
        );

        # Pull out GeneID (if it exists).
        my $geneid;
        if ($gene =~ m#/db_xref="GeneID:(\d+)"#) {
            $geneid = $1;
        }

        # Pull out the protein product (if it exists); default to an empty
        # string so the reports never print an undefined value.
        my $gene_product = '';
        if ($gene =~ m#/product="([^"]+)"#) {
            $gene_product = $1;
            $gene_product =~ s/\n\s*/ /g;    # clear out newlines and indentation
        }

        $mutant_genes{$gene_name} = {
            gene_id  => $geneid,
            prod_pro => $gene_product,
            %gene_mutations,
            %probability,
        };
    }

    if ($opts{h}) {
        html_out($mutations{total}, \%mutant_genes);
    }
    else {
        print "UV Mutation (pyramidine dimerization) Analysis\n";
        print "Total possible mutations in genome: $mutations{total}\n\n";
        print "\nGenes sorted by UV mutation probability:\n", "=" x 65, "\n";
        foreach my $name (genes_by_descending_probability(\%mutant_genes)) {
            printf "%-20s%.5f%% %s\n", $name, $mutant_genes{$name}{ptt},
                $mutant_genes{$name}{prod_pro};
        }
    }
    return;
}

# Return $part as a percentage of $whole, formatted with format_number() to
# $num_precision decimal places. A zero $whole yields 0 instead of dying with
# a division-by-zero error.
sub percent_of {
    my ($part, $whole) = @_;
    my $percent = $whole ? $part / $whole * 100 : 0;
    return format_number($percent, $num_precision);
}

# Return the gene names of the given hash ref (name => { ptt => ... }) sorted
# by descending TT mutation probability; ties are broken by name so that the
# output order is repeatable.
sub genes_by_descending_probability {
    my ($genes_ref) = @_;
    return sort {
        $genes_ref->{$b}{ptt} <=> $genes_ref->{$a}{ptt} or $a cmp $b
    } keys %{$genes_ref};
}

# Escape the characters that are significant in HTML text and attributes.
# An undefined value becomes the empty string.
sub _html_escape {
    my $text = shift;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Print the analysis as an HTML page on STDOUT.
#   $total_muts - total number of possible mutation sites in the genome
#   $mutant_ref - hash ref: gene name => { tt, ct, cc, ptt, pct, pcc, prod_pro }
# NOTE(review): the original markup was stripped from the reviewed copy; the
# tags below are a reconstruction, the visible text is unchanged.
sub html_out {
    my ($total_muts, $mutant_ref) = @_;

    print "<!DOCTYPE html>\n<html>\n<body>\n";
    print "<h1>UV Mutant Analysis</h1>\n";
    print "<p>Total Possible mutations in Genome: $total_muts</p>\n";
    print "<p>Gene mutations sorted by decending probability of mutation</p>\n";

    my @headings = (
        'Gene',
        'Possible Gene Mutations TT',
        'Possible Gene Mutations CT',
        'Possible Gene Mutations CC',
        'Mutation Probability TT(%)',
        'Mutation Probability CT(%)',
        'Mutation Probability CC(%)',
        'Gene Product',
    );
    print "<table border=\"1\">\n<tr><th>", join('</th><th>', @headings),
        "</th></tr>\n";

    # Sort on the hash that was passed in, not on any file-level variable.
    foreach my $name (genes_by_descending_probability($mutant_ref)) {
        my $gene  = $mutant_ref->{$name};
        my @cells = (
            _html_escape($name),
            @{$gene}{qw(tt ct cc ptt pct pcc)},
            _html_escape($gene->{prod_pro}),    # free text from the input file
        );
        print '<tr><td>', join('</td><td>', @cells), "</td></tr>\n";
    }

    print "</table>\n";
    print "</body>\n</html>\n";
    return;
}

# Count the possible pyrimidine-dimer sites in a nucleotide sequence.
#   $genome - sequence string; matched case-insensitively, undef counts as ''
# Returns a hash (as a list) with the keys:
#   tt    - adjacent thymine pairs (most common dimerization)
#   ct    - heterogeneous pairs, both "ct" and "tc" (less common)
#   cc    - adjacent cytosine pairs (least common)
#   total - sum of the three counts
# Overlapping sites are all counted ("ttt" holds two tt sites) because the
# second base is only looked at, not consumed, by the lookahead.
sub find_possible_mutations {
    my $genome = shift;
    $genome = '' unless defined $genome;
    $genome = lc $genome;

    # Start every count at zero in case no possible sites are found.
    my %mutations = (tt => 0, ct => 0, cc => 0, total => 0);

    $mutations{tt}++ while $genome =~ /t(?=t)/g;
    $mutations{ct}++ while $genome =~ /c(?=t)|t(?=c)/g;
    $mutations{cc}++ while $genome =~ /c(?=c)/g;

    # Store the total for later percentage calculations.
    $mutations{total} = $mutations{tt} + $mutations{ct} + $mutations{cc};

    return %mutations;
}

1;