#A perl script that takes in as arguments, the names of two alignments files in msf format, #compares the sites in all of them, and calculates mutual information between sites. #The files have to be in msf format. the calculations are done under the assumption that each site #evolved independently to the other sites in its alignmnet. # #Author : Kostantinos Gkogkoglou #e-mail: kg66@le.ac.uk #!/usr/bin/perl -w use warnings; use strict; use Mutual; #load the necessary packages #use Switch; #use Math::ElFun; #use POSIX qw(log10); use Bio::AlignIO; use Bio::Align::AlignI; use Bio::SeqIO; use Bio::Seq; use Mutual; use List::Util; #initializing the necessary variables. my $inputfilename=''; #inputfile for 1st sequence my $inputfilename2=''; my $outputfilename=''; #outputfile for both sequences my $TIM ; #the TIM alignment my $PER ; my $aln=''; #the first alignment my $aln2=''; my $out=''; #the output my $out2=''; my @seq = (); #the array that stores all the sequence objects from the first alignment my @seq2 = (); my $site; #positions in the alignments #creating inputfiles for the user to type the alignment filenames and the name of the outputfile. $inputfilename = $ARGV[0]; $inputfilename2 = $ARGV[1]; # Using the new file method from the Bio::AlignIO to load the sequences from the allignment file. $TIM = Bio::AlignIO->new(-file => $inputfilename ); $PER = Bio::AlignIO->new(-file => $inputfilename2); #Creating an AlignI object from the ALignIO input. Now we can manipulate the sequence in many possible ways $aln= $TIM->next_aln; $aln2 =$PER->next_aln; #Sorting the sequences alphabetically #$aln->sort_alphabetically; $aln2->sort_alphabetically; #each_seq method. Gets an array of sequences for each sequence of the array. @seq = $aln->each_seq(); @seq2 = $aln2->each_seq(); #calculating the length of the alignment(length of sequences). It will be used in order to calculate the frequencies later on. my $no_of_sequences= $aln->no_sequences; #will be used to calculate the frequencies #my $no_of_sequences2 = $aln2->no_sequences; my $length1 = $aln->length; my $length2 = $aln2->length; my $no_of_residues = $aln->no_residues; my $no_of_residues2 = $aln2->no_residues; #the following method displays the sequence as a string . Just doing that in order to see what's inside my array. my $a; for ($a=1; $a<=1000; $a++){ print "CURRENT ROUND IS :$a\n"; my @shuffled = fisher_yates_shuffle(\@seq); #my @shuffled = fisher_yates_shuffle(\@seq); foreach my $seq(@seq){ my $id = $seq->display_id; print "$id\n"; } #accecsing all the individual aminoacids of the 1st alignment sequence. Print all aminoacids for each site of the alignment #my $seq; my $j; my @site1; for ($j=1;$j<= $length1;$j++){ my $site; foreach my $seq(@seq){ my $res = $seq->subseq($j,$j); $site .=$res; } push (@site1, $site); } #doing the same for the second alignment file. my $i; my @site2; for ($i=1;$i<= $length2;$i++){ my $site; foreach my $seq(@seq2){ my $res = $seq->subseq($i,$i); $site .=$res; } push (@site2, $site); } my $site1; my $site2; my $l=0; foreach $site1 (@site1) { my @site1_parts = split(//, $site1); $l++; my $j=0; my $mutual; my $Hx= get_Hx($site1); #print "$Hx\n"; foreach $site2 (@site2) {#begining of loop2 for each site 2 $j++; my $Hy = get_Hx($site2); my $Hxy; my %counts; my @count_collection; my @site2_parts = split(//, $site2); my $i = @site1_parts; $counts{get_index($site1_parts[$i], $site2_parts[$i])}++ while ($i--); push(@count_collection, \%counts); foreach my $counts (@count_collection) { my $total; foreach (sort keys %$counts) { #print "$counts{$_}\n"; my $freq = $counts->{$_}/length($site1); my $MI = -$freq * log_2$freq; #$total +=$freq; $Hxy +=$MI; } #print "$Hxy \ $Hx\ $Hy \n"; $mutual = $Hx + $Hy - $Hxy; } print " $l,$j,$mutual\n"; #print " $l,$j\n"; #print($_, ': ', $counts{$_}, $/) foreach (sort keys %counts); #print($/); }#end of loop2 for each site2 } }#end of the procedure of shuffling exit; # # #### package Mutual; use strict; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(log_2 get_index get_Hx fisher_yates_shuffle); # fisher_yates_shuffle( \@array ) : # generate a random permutation of @array in place sub fisher_yates_shuffle { my $array = shift; my $i; for ($i = @$array; --$i; ) { my $j = int rand ($i+1); next if $i == $j; @$array[$i,$j] = @$array[$j,$i]; } } # fisher_yates_shuffle( \@array ); # permutes @array in place sub log_2 { my $base = 2; my $value = $_[0]; return log($value)/log($base); } sub get_index{ $_[0] lt $_[1] ? ($_[0].$_[1]) : ($_[1].$_[0]) } sub get_Hx{ my $site = $_[0]; my $no_of_sequences =length($site); my $count_A=0 ; my $count_C=0 ; my $count_D=0 ; my $count_E=0 ; my $count_F=0 ; my $count_G=0 ; my $count_H=0 ; my $count_I=0 ; my $count_J=0 ; my $count_K=0 ; my $count_L=0 ; my $count_M=0 ; my $count_N=0 ; my $count_P=0 ; my $count_Q=0 ; my $count_R=0 ; my $count_S=0 ; my $count_T=0 ; my $count_V=0 ; my $count_W=0 ; my $count_Y=0 ; my $count_ =0 ; my $sum1=0; my $sum2=0; my $sum3=0; my $sum4=0; my $sum5=0; my $sum6=0; my $sum7=0; my $sum8=0; my $sum9=0; my $sum10=0; my $sum11=0; my $sum12=0; my $sum13=0; my $sum14=0; my $sum15=0; my $sum16=0; my $sum17=0; my $sum18=0; my $sum19=0; my $sum20=0; my $sum21=0; while($site =~m/A/g){ $count_A++; } while($site =~m/C/g){ $count_C++; } while($site =~m/D/g){ $count_D++; } while($site =~m/E/g){ $count_E++; } while($site =~m/F/g){ $count_F++; } while($site =~m/G/g){ $count_G++; } while($site =~m/H/g){ $count_H++; } while($site =~m/I/g){ $count_I++; } while($site =~m/K/g){ $count_K++; } while($site =~m/L/g){ $count_L++; } while($site =~m/M/g){ $count_M++; } while($site =~m/N/g){ $count_N++; } while($site =~m/P/g){ $count_P++; } while($site =~m/Q/g){ $count_Q++; } while($site =~m/R/g){ $count_R++; } while($site =~m/S/g){ $count_S++; } while($site =~m/T/g){ $count_T++; } while($site =~m/V/g){ $count_V++; } while($site =~m/W/g){ $count_W++; } while($site =~m/Y/g){ $count_Y++; } while($site =~m/\./g){ $count_++; } while($site =~m/\_/g){ $count_++; } #if there is a counter with +ve value in any position of the array, then calculate the mutual information #value and store it there. if ($count_A > 0){ $sum1= -($count_A/$no_of_sequences)*log_2($count_A/$no_of_sequences); } if ($count_C > 0){ $sum2= -($count_C/$no_of_sequences)*log_2($count_C/$no_of_sequences); } if ($count_D > 0){ $sum3= -($count_D/$no_of_sequences)*log_2($count_D/$no_of_sequences); } if ($count_E > 0){ $sum4= -($count_E/$no_of_sequences)*log_2($count_E/$no_of_sequences); } if ($count_F > 0){ $sum5= -($count_F/$no_of_sequences)*log_2($count_F/$no_of_sequences); } if ($count_G > 0){ $sum6= -($count_G/$no_of_sequences)*log_2($count_G/$no_of_sequences); } if ($count_H > 0){ $sum7= -($count_H/$no_of_sequences)*log_2($count_H/$no_of_sequences) ; } if ($count_I > 0){ $sum8= -($count_I/$no_of_sequences)*log_2($count_I/$no_of_sequences) ; } if ($count_J > 0){ $sum9= -($count_J/$no_of_sequences)*log_2($count_J/$no_of_sequences); } if ($count_K > 0){ $sum10= -($count_K/$no_of_sequences)*log_2($count_K/$no_of_sequences); } if ($count_L > 0){ $sum11=-($count_L/$no_of_sequences)*log_2($count_L/$no_of_sequences); } if ($count_M > 0){ $sum12=-($count_M/$no_of_sequences)*log_2($count_M/$no_of_sequences); } if ($count_N > 0){ $sum13= -($count_N/$no_of_sequences)*log_2($count_N/$no_of_sequences); } if ($count_P > 0){ $sum14= -($count_P/$no_of_sequences)*log_2($count_P/$no_of_sequences); } if ($count_R > 0){ $sum15= -($count_R/$no_of_sequences)*log_2($count_R/$no_of_sequences); } if ($count_S > 0){ $sum16= -($count_S/$no_of_sequences)*log_2($count_S/$no_of_sequences); } if ($count_T > 0){ $sum17= -($count_T/$no_of_sequences)*log_2($count_T/$no_of_sequences); } if ($count_V > 0){ $sum18= -($count_V/$no_of_sequences)*log_2($count_V/$no_of_sequences); } if ($count_W > 0){ $sum19= -($count_W/$no_of_sequences)*log_2($count_W/$no_of_sequences); } if ($count_Y > 0){ $sum20= -($count_Y/$no_of_sequences)*log_2($count_Y/$no_of_sequences); } if ($count_ > 0){ $sum21= -($count_/$no_of_sequences)*log_2($count_/$no_of_sequences); } my $Hx = $sum1+$sum2+$sum3+$sum4+$sum5+$sum6+$sum7+$sum8+$sum9+$sum10+$sum11+$sum12+$sum13+$sum14 +$sum15+$sum16+$sum17+$sum18+$sum19+$sum20+$sum21; return $Hx; }#end of foreach 1;