#!/usr/bin/perl -w
use Data::Dumper;
use String::Substrings;
use Algorithm::Loops 'NestedLoops';
use List::Compare;
use Bio::SeqIO;
use List::MoreUtils qw(uniq);
use sp_neg_eff;
use Devel::Size qw (size total_size);

my $f        = $ARGV[0];
my $file     = "files/$f.fasta"; 
my $nofseq 	 = scalar(getSeqfromfasta2lmers($file));
my $name     =  (split(/\./,$file))[0]; 
my $file_neg = "$name.rsat";

my @param_groups = gen_param($file,$file_neg,$nofseq);
#print Dumper \@param_groups;
# Some comment

my ( $count, %result) = 1;

foreach  ( @param_groups )
{
    print "TAG: ParamGroup" . $count ,"\n";
    print STDERR "Processing ParamGroup $count\n";                          
    my $output = main_process( @{$_}{
                                        qw/
                                           file
                                           file_neg
                                           submt_len          
                                           submt_d            
                                           e
                                           W_size             
                                           lp                   
                                           support_threshold  
                                           min_inst_lower     
                                           min_inst_upper      
                                           polyTA_lim         
                                           poly_lim           
                                        /
                                    });
      $result{'ParamGroup'.$count++} = $output;

}               

#----------------

sub main_process {

    my (
        $file,
        $file_neg,
        $submt_len, 
        $submt_d,  
        $e,
        $W_size, 	
        $lp,     
        $support_threshold, 
        $min_inst_lower,   
        $min_inst_upper,  
        $polyTA_lim,     
        $poly_lim
       ) = @_;      


        my @input_seqs 	       = getSeqfromfasta2lmers($file);
        my $ip 		           = @input_seqs;
        my @mcands    = getlmersfromseq(\@input_seqs,$W_size);

        my $mc 		           = @mcands;
        my $skip               = 0; 


        my %bighash;    # HoAoH with motif-candidate as key, then Windows as key,
                        # motif-cand => [ { motif inst (windows) 
                        #                   => seqn1-positions--submot_score-submotifs}
                        #                 {  motif inst (windows)
                        #                   => seqn2-positions--submot_score-submotifs} ...]

        my @all_array;
        my $count_mcand; # mcand to be processed in apriori

        my $seqlen;
        #my $count_win;

        foreach my $mcands (@mcands) {
            #Submotifs of motif (window)
            my @Wsub = substrings $mcands, $submt_len;    
            my %fhash;
            my $seqno;
            my @winfromallseq;

            foreach my $seqs (@input_seqs) {
                $seqlen = length $seqs;

                #windows taken from each motif Candidate
                my @line_W = substrings $seqs, $W_size;    
                my $count_lineW = @line_W;
                my %candwin_with_submot;

                $seqno += 1;

                for ( my $i = 0; $i <= $#line_W; $i++ ) {
                    my @submt_Wfrom_line = substrings $line_W[$i], $submt_len;

                    # array of submotifs from each windows taken from line

                    my @matches;
                    my @matches_and_pos;

                    for ( my $k = 0; $k <= $#Wsub; $k++ ) {
                        my $dif = hd( $submt_Wfrom_line[$k], $Wsub[$k] );

                        if ( $dif <= $submt_d ) {
                            push @matches, $submt_Wfrom_line[$k];
                        }
                        my $mcount = @matches;
                    }

                    if (@matches) {
                        $candwin_with_submot{ $line_W[$i] . " "
                                . ( $seqno - 1 ) . " "
                                . ( $i - $seqlen ) } = [@matches];
                    }
                    else {
                        $i += $skip;
                    }

                }

                # a HoA: motif inst (windows) => seqno-position--submot_score-submotifs
                my %hash; 

                foreach my $key ( keys %candwin_with_submot ) {

                    # @key elem contain motif-pos-seqno
                    my @key_elems = split( /\s/, $key );    
                    my $score = score_dup( $key_elems[0],
                        $candwin_with_submot{$key} );

                    my @array;

                    if ( $score >= $lp ) {
                        push @array,
                            (
                              $key_elems[1], 
                              $key_elems[2], 
                              $score,
                              @{ $candwin_with_submot{$key} },
                            );
                        $hash{ $key_elems[0] } = [@array];
                    }
                }

                #filtered hash for windows from every seqs
                %fhash = filter(%hash);    

                push @winfromallseq, {%fhash};

                #$bighash{$mcands} = [ @winfromallseq ];
                #print Dumper \@winfromallseq ;

            }  # -----  end foreach $seqs  -----

            my %output_hash;
            my $align_score;
            my @to_align;
            my %sb_hash;

            for my $winfromallseq (@winfromallseq) {

                if ($winfromallseq) {

                    for my $mcand_nstr ( keys %{$winfromallseq} ) {
                        my @tmp_ar;
                        push @tmp_ar, @{ $winfromallseq->{$mcand_nstr} };
                        my $sid  = $tmp_ar[0];
                        my $ipos = $tmp_ar[1];

                        my @sbmt_slice = @tmp_ar[ 3 .. $#tmp_ar ];
                        $sb_hash{ $sid . ',' . $ipos . ',' . $mcand_nstr }
                            = [@sbmt_slice];

                    }

                }

            }

            my $no_ins = scalar( keys %sb_hash );

            if (   $no_ins >= $min_inst_lower
                && $no_ins <= $min_inst_upper )
            {
                $count_mcand++;
                print STDERR "Motif Candidates Number $count_mcand\n"
                    if ( $count_mcand % 10 == 0 );

                # Apriori_space subroutine 
                # is other complex process
                # taken from "sp" package


                my @ap_array = apriori_space( \%sb_hash, $W_size, $submt_len, $submt_d,
                    $e, $support_threshold );

                if (@ap_array) {
                    prn_final_sorted(\@ap_array); 
                }

            }

        }    # -----  end foreach $mcands  -----


 
        return;
    
   # The code fails after the first main 
   # Process is completed.

} #------sub main_process-----	



#------------------------------------------------------------------------------
#---------                        Subroutines                  ----------------
#------------------------------------------------------------------------------
sub gen_param {
    my ( $file, $file_neg, $nofseq ) = @_;
    my @nq;
    my @wlen_group = ( 8, 15, 20 );
    my @frac = ( 0.8, 0.5 );
    my @param_groups;

    push @nq, ( $nofseq, ( $nofseq * 1.5 ) );

    foreach my $wlen (@wlen_group) {
        foreach my $fract (@frac) {
            foreach my $q (@nq) {
                my $rec = {};

                $rec->{'file'}              = $file;
                $rec->{'file_neg'}          = $file_neg;
                $rec->{'submt_len'}         = 5;
                $rec->{'submt_d'}           = 1;
                $rec->{'e'}                 = 0;
                $rec->{'W_size'}            = $wlen;
                $rec->{'lp'}                = $fract * $wlen;
                $rec->{'support_threshold'} = $q;
                $rec->{'min_inst_lower'}    = $q;
                $rec->{'min_inst_upper'}    = ( 3 * $q );
                $rec->{'polyTA_lim'}        = 0.8;
                $rec->{'poly_lim'}          = 0.8;

                push @param_groups, $rec;
            }    # -----  end foreach  -----

        }    # -----  end foreach  -----

    }

    return @param_groups;
}

sub prn_final_sorted {
    my $AoA = shift;

    print ">instances\n";
    my $count = 0;

    foreach my $aref ( sort { $b->[1] <=> $a->[1] } @$AoA ) {
        $count++;
        print "PATTERN: $aref->[0], $aref->[1]\n";
        print "CONSENSUS: $aref->[-1]\n\n";

        foreach my $ins ( @{ $aref->[2] } ) {
            print "$ins\n";
        }    # -----  end foreach  -----

        print "Alignment\n";

        foreach my $k ( keys %{ $aref->[3] } ) {
            print "$k  $aref->[3]{$k}\n";
        }

        print "=============\n\n";

        last if ( $count == 100 );

    }

    return;
}    # ----------  end of subroutine prn_final_sort  ----------


sub append_n 
{     #fastest
      my ( $str, $array ) = @_;
      my $nstring = "N" x length($str);

      foreach my $sbstr ( @$array )
      {
        my $ofs = 0;
        while ( ( my $pos = index $str, $sbstr, $ofs ) > -1 ) {
          substr ($nstring, $pos, length ($sbstr)) = $sbstr;
          $ofs = $pos + 1;
        }
      }
      return $nstring;
}



sub getlmersfromseq
{
	my  ($seqsarr,$l)= @_;
	my  @lmers;
	 	
	@lmers = map {substrings $_, $l} @{$seqsarr};
	my @uniq_lmers = uniq @lmers;
	return @uniq_lmers;
}

sub getSeqfromfasta2lmers
{
    #designed for getting sequences in to array from a file  (fasta format), 
    #input: file name 
    
    my $file = shift;
    my @seqs= ();

    open INFILE, "<$file" or die "$0:  Can't open file $file: $!";
    my $in = Bio::SeqIO->new(-format => 'fasta',
                         -noclose => 1 ,
                         -fh => \*INFILE);

    while ( my $seq = $in->next_seq() ) {
	push @seqs, $seq->seq();
    } #end while
       
    return @seqs;

}

sub filter
{       # Filter duplicated Windows 
    my %hoa = @_;
     my %filtered_hoa;
 	my @key = (sort {$hoa{$a}[1] <=> $hoa{$b}[1] } keys %hoa);	
	my @flag; 
	
	foreach my $key (@key)
	{  
	    my @submt_only = @{$hoa{$key}}[3..$#{$hoa{$key}}]; 
	    
	    # check if the current array is equiv. with *one* before
            my $lc = List::Compare->new(\@flag,\@submt_only);
	    my $eq = $lc->is_LeqvlntR;
			
	    if ($eq == 0)
	    {   #if they are not equivalent, assign them to a hash
		$filtered_hoa{$key} = $hoa{$key};
	    }
	    else
	    {	#otherwise skip them
		next;
	    }
	    @flag = @submt_only;
	}

	return %filtered_hoa;	
}	


sub print_bypos {
    my %hoa = @_;

    # sort by position of motifs in sequence
    foreach my $key ( sort { $hoa{$a}[1] <=> $hoa{$b}[1] } keys %hoa ) {
        print "$key $hoa{$key}[1] ";

        foreach my $i ( 3 .. $#{ $hoa{$key} } ) {
            print " $hoa{$key}[$i]";
        }
        print "\n";
    }

}    # ----------  end of subroutine print_bypos  ----------

sub score_dup 
{
      my ( $str, $array ) = @_;
      my $offset = 0;
      my $mask = "\0" x length $str;

      for my $frag ( @$array ) {
	( my $idx = index $str, $frag, $offset ) >= 0 or next;
	    substr $mask, $idx, length $frag, $frag;
	    $offset = $idx + 1;
	}

      return $mask =~ tr/\0//c;
}


sub score_nodup 
{
    my ($str, $array) = @_;
    my $vec = "\0" x length($str);
    for (@$array) {
        my $idx = index $str, $_;
        # Matching substrings are padded into position with nulls
        $vec |= ("\0" x $idx) . $_;
    }
    # Matching characters become nulls; others non-nulls
    $vec ^= $str;
    # Count nulls
    $vec =~ tr/\0//;
}