#A perl script that takes in as arguments, the names of two alignments files in msf format, #compares the sites in all of them, and calculates mutual information between sites. #The files have to be in msf format. the calculations are done under the assumption that each site #evolved independently to the other sites in its alignmnet. # #Author : Kostantinos Gkogkoglou #e-mail: kg66@le.ac.uk #!/usr/bin/perl -w use warnings; use strict; use Mutual; #load the necessary packages #use Switch; #use Math::ElFun; #use POSIX qw(log10); use Bio::AlignIO; use Bio::Align::AlignI; use Bio::SeqIO; use Bio::Seq; use Mutual; use List::Util; #initializing the necessary variables. my $inputfilename=''; #inputfile for 1st sequence my $inputfilename2=''; my $outputfilename=''; #outputfile for both sequences my $TIM ; #the TIM alignment my $PER ; my $aln=''; #the first alignment my $aln2=''; my $out=''; #the output my $out2=''; my @seq = (); #the array that stores all the sequence objects from the first alignment my @seq2 = (); my $site; #positions in the alignments #creating inputfiles for the user to type the alignment filenames and the name of the outputfile. $inputfilename = $ARGV[0]; $inputfilename2 = $ARGV[1]; # Using the new file method from the Bio::AlignIO to load the sequences from the allignment file. $TIM = Bio::AlignIO->new(-file => $inputfilename ); $PER = Bio::AlignIO->new(-file => $inputfilename2); #Creating an AlignI object from the ALignIO input. Now we can manipulate the sequence in many possible ways $aln= $TIM->next_aln; $aln2 =$PER->next_aln; #Sorting the sequences alphabetically #$aln->sort_alphabetically; $aln2->sort_alphabetically; #each_seq method. Gets an array of sequences for each sequence of the array. @seq = $aln->each_seq(); @seq2 = $aln2->each_seq(); #calculating the length of the alignment(length of sequences). It will be used in order to calculate the frequencies later on. my $no_of_sequences= $aln->no_sequences; #will be used to calculate the frequencies #my $no_of_sequences2 = $aln2->no_sequences; my $length1 = $aln->length; my $length2 = $aln2->length; my $no_of_residues = $aln->no_residues; my $no_of_residues2 = $aln2->no_residues; #the following method displays the sequence as a string . Just doing that in order to see what's inside my array. my $a; for ($a=1; $a<=1000; $a++){ print "CURRENT ROUND IS :$a\n"; my @shuffled = fisher_yates_shuffle(\@seq); #my @shuffled = fisher_yates_shuffle(\@seq); foreach my $seq(@seq){ my $id = $seq->display_id; print "$id\n"; } #accecsing all the individual aminoacids of the 1st alignment sequence. Print all aminoacids for each site of the alignment #my $seq; my $j; my @site1; for ($j=1;$j<= $length1;$j++){ my $site; foreach my $seq(@seq){ my $res = $seq->subseq($j,$j); $site .=$res; } push (@site1, $site); } #doing the same for the second alignment file. my $i; my @site2; for ($i=1;$i<= $length2;$i++){ my $site; foreach my $seq(@seq2){ my $res = $seq->subseq($i,$i); $site .=$res; } push (@site2, $site); } my $site1; my $site2; my $l=0; foreach $site1 (@site1) { my @site1_parts = split(//, $site1); $l++; my $j=0; my $mutual; my $Hx= get_Hx($site1); #print "$Hx\n"; foreach $site2 (@site2) {#begining of loop2 for each site 2 $j++; my $Hy = get_Hx($site2); my $Hxy; my %counts; my @count_collection; my @site2_parts = split(//, $site2); my $i = @site1_parts; $counts{get_index($site1_parts[$i], $site2_parts[$i])}++ while ($i--); push(@count_collection, \%counts); foreach my $counts (@count_collection) { my $total; foreach (sort keys %$counts) { #print "$counts{$_}\n"; my $freq = $counts->{$_}/length($site1); my $MI = -$freq * log_2$freq; #$total +=$freq; $Hxy +=$MI; } #print "$Hxy \ $Hx\ $Hy \n"; $mutual = $Hx + $Hy - $Hxy; } print " $l,$j,$mutual\n"; #print " $l,$j\n"; #print($_, ': ', $counts{$_}, $/) foreach (sort keys %counts); #print($/); }#end of loop2 for each site2 } }#end of the procedure of shuffling exit; # #