#A perl script that takes in as arguments, the names of two alignments files in msf format,
#compares the sites in all of them, and calculates mutual information between sites.
#The files have to be in msf format. the calculations are done under the assumption that each site
#evolved independently to the other sites in its alignmnet.
#
#Author : Kostantinos Gkogkoglou
#e-mail: kg66@le.ac.uk


#!/usr/bin/perl -w
use warnings;
use strict;
use Mutual;
#load the necessary packages
#use Switch;
#use Math::ElFun;
#use POSIX qw(log10);
use Bio::AlignIO;
use Bio::Align::AlignI;
use Bio::SeqIO;
use Bio::Seq;
use Mutual;
use List::Util;
#initializing the necessary variables.

my $inputfilename='';              #inputfile for 1st sequence
my $inputfilename2='';
my $outputfilename='';             #outputfile for both sequences
my $TIM ;                          #the TIM alignment
my $PER ;
my $aln='';                        #the first alignment
my $aln2='';
my $out='';                        #the output
my $out2='';
my @seq = ();                      #the array that stores all the sequence objects from the first alignment
my @seq2 = ();


my $site;                          #positions in the alignments

#creating inputfiles for the user to type the alignment filenames and the name of the outputfile.

$inputfilename = $ARGV[0];
$inputfilename2 = $ARGV[1];

# Using the new file method from the Bio::AlignIO to load the sequences from the allignment file. 

 
$TIM  = Bio::AlignIO->new(-file => $inputfilename );
$PER  = Bio::AlignIO->new(-file => $inputfilename2);

#Creating an AlignI object from the ALignIO input. Now we can manipulate the sequence in many possible ways 
$aln= $TIM->next_aln;
$aln2 =$PER->next_aln;
#Sorting the sequences alphabetically

#$aln->sort_alphabetically;
$aln2->sort_alphabetically;

#each_seq method. Gets an array of sequences for each sequence of the array.

@seq = $aln->each_seq();
@seq2 = $aln2->each_seq();



#calculating the length of the alignment(length of sequences). It will be used in order to calculate the frequencies later on.
my $no_of_sequences= $aln->no_sequences; #will be used to calculate the frequencies 
#my $no_of_sequences2 = $aln2->no_sequences;
my $length1 = $aln->length;
my $length2 = $aln2->length;
my $no_of_residues = $aln->no_residues;
my $no_of_residues2 = $aln2->no_residues;

#the following method displays the sequence as a string . Just doing that in order to see what's inside my array.
my $a;
for ($a=1; $a<=1000; $a++){
   print "CURRENT ROUND IS :$a\n";
my @shuffled = fisher_yates_shuffle(\@seq);


   
#my @shuffled = fisher_yates_shuffle(\@seq);
foreach my $seq(@seq){
   my $id = $seq->display_id;
   print "$id\n";
   
   
}
 














#accecsing all the individual aminoacids of the 1st alignment sequence. Print all aminoacids for each site of the alignment
#my $seq;
my $j;
my @site1;
    for ($j=1;$j<= $length1;$j++){
       my $site;
       
       foreach my $seq(@seq){
       my  $res = $seq->subseq($j,$j);
         $site .=$res;
         }
       push (@site1, $site);
      
      }
    #doing the same for the second alignment file.
my $i;
my @site2;
    for ($i=1;$i<= $length2;$i++){
       my $site;
       
       foreach my $seq(@seq2){
       my  $res = $seq->subseq($i,$i);
         $site .=$res;
         }
       push (@site2, $site);
 
      }
    
    

my $site1;
my $site2;


my $l=0;
foreach $site1 (@site1) {
   my @site1_parts = split(//, $site1);
    $l++;
    my $j=0;
    my $mutual;
    my $Hx= get_Hx($site1);
    #print "$Hx\n";
    
   foreach $site2 (@site2) {#begining of loop2 for each site 2
           $j++;
      my $Hy = get_Hx($site2);
        
        my $Hxy;
        my %counts;
        my @count_collection;
       
        my @site2_parts = split(//, $site2);

        my $i = @site1_parts;
        $counts{get_index($site1_parts[$i], $site2_parts[$i])}++ while ($i--);
          push(@count_collection, \%counts);
          foreach my $counts (@count_collection) {
            my $total;
            foreach (sort keys %$counts) {
  #print "$counts{$_}\n";
        my $freq = $counts->{$_}/length($site1);
        my $MI = -$freq * log_2$freq; #$total +=$freq;
        $Hxy +=$MI;
        
  }
            #print "$Hxy \ $Hx\ $Hy \n"; 
            $mutual = $Hx + $Hy - $Hxy;
}
        
        
        
        print " $l,$j,$mutual\n";
       #print " $l,$j\n";
        #print($_, ': ', $counts{$_}, $/) foreach (sort keys %counts);
        #print($/);
   }#end of loop2 for each site2
 
   
}

}#end of the procedure of shuffling
exit;
#
#