#!/usr/bin/perl -w use Data::Dumper; use String::Substrings; use Algorithm::Loops 'NestedLoops'; use List::Compare; use Bio::SeqIO; use List::MoreUtils qw(uniq); use sp_neg_eff; use Devel::Size qw (size total_size); my $f = $ARGV[0]; my $file = "files/$f.fasta"; my $nofseq = scalar(getSeqfromfasta2lmers($file)); my $name = (split(/\./,$file))[0]; my $file_neg = "$name.rsat"; my @param_groups = gen_param($file,$file_neg,$nofseq); #print Dumper \@param_groups; # Some comment my ( $count, %result) = 1; foreach ( @param_groups ) { print "TAG: ParamGroup" . $count ,"\n"; print STDERR "Processing ParamGroup $count\n"; my $output = main_process( @{$_}{ qw/ file file_neg submt_len submt_d e W_size lp support_threshold min_inst_lower min_inst_upper polyTA_lim poly_lim / }); $result{'ParamGroup'.$count++} = $output; } #---------------- sub main_process { my ( $file, $file_neg, $submt_len, $submt_d, $e, $W_size, $lp, $support_threshold, $min_inst_lower, $min_inst_upper, $polyTA_lim, $poly_lim ) = @_; my @input_seqs = getSeqfromfasta2lmers($file); my $ip = @input_seqs; my @mcands = getlmersfromseq(\@input_seqs,$W_size); my $mc = @mcands; my $skip = 0; my %bighash; # HoAoH with motif-candidate as key, then Windows as key, # motif-cand => [ { motif inst (windows) # => seqn1-positions--submot_score-submotifs} # { motif inst (windows) # => seqn2-positions--submot_score-submotifs} ...] my @all_array; my $count_mcand; # mcand to be processed in apriori my $seqlen; #my $count_win; foreach my $mcands (@mcands) { #Submotifs of motif (window) my @Wsub = substrings $mcands, $submt_len; my %fhash; my $seqno; my @winfromallseq; foreach my $seqs (@input_seqs) { $seqlen = length $seqs; #windows taken from each motif Candidate my @line_W = substrings $seqs, $W_size; my $count_lineW = @line_W; my %candwin_with_submot; $seqno += 1; for ( my $i = 0; $i <= $#line_W; $i++ ) { my @submt_Wfrom_line = substrings $line_W[$i], $submt_len; # array of submotifs from each windows taken from line my @matches; my @matches_and_pos; for ( my $k = 0; $k <= $#Wsub; $k++ ) { my $dif = hd( $submt_Wfrom_line[$k], $Wsub[$k] ); if ( $dif <= $submt_d ) { push @matches, $submt_Wfrom_line[$k]; } my $mcount = @matches; } if (@matches) { $candwin_with_submot{ $line_W[$i] . " " . ( $seqno - 1 ) . " " . ( $i - $seqlen ) } = [@matches]; } else { $i += $skip; } } # a HoA: motif inst (windows) => seqno-position--submot_score-submotifs my %hash; foreach my $key ( keys %candwin_with_submot ) { # @key elem contain motif-pos-seqno my @key_elems = split( /\s/, $key ); my $score = score_dup( $key_elems[0], $candwin_with_submot{$key} ); my @array; if ( $score >= $lp ) { push @array, ( $key_elems[1], $key_elems[2], $score, @{ $candwin_with_submot{$key} }, ); $hash{ $key_elems[0] } = [@array]; } } #filtered hash for windows from every seqs %fhash = filter(%hash); push @winfromallseq, {%fhash}; #$bighash{$mcands} = [ @winfromallseq ]; #print Dumper \@winfromallseq ; } # ----- end foreach $seqs ----- my %output_hash; my $align_score; my @to_align; my %sb_hash; for my $winfromallseq (@winfromallseq) { if ($winfromallseq) { for my $mcand_nstr ( keys %{$winfromallseq} ) { my @tmp_ar; push @tmp_ar, @{ $winfromallseq->{$mcand_nstr} }; my $sid = $tmp_ar[0]; my $ipos = $tmp_ar[1]; my @sbmt_slice = @tmp_ar[ 3 .. $#tmp_ar ]; $sb_hash{ $sid . ',' . $ipos . ',' . $mcand_nstr } = [@sbmt_slice]; } } } my $no_ins = scalar( keys %sb_hash ); if ( $no_ins >= $min_inst_lower && $no_ins <= $min_inst_upper ) { $count_mcand++; print STDERR "Motif Candidates Number $count_mcand\n" if ( $count_mcand % 10 == 0 ); # Apriori_space subroutine # is other complex process # taken from "sp" package my @ap_array = apriori_space( \%sb_hash, $W_size, $submt_len, $submt_d, $e, $support_threshold ); if (@ap_array) { prn_final_sorted(\@ap_array); } } } # ----- end foreach $mcands ----- return; # The code fails after the first main # Process is completed. } #------sub main_process----- #------------------------------------------------------------------------------ #--------- Subroutines ---------------- #------------------------------------------------------------------------------ sub gen_param { my ( $file, $file_neg, $nofseq ) = @_; my @nq; my @wlen_group = ( 8, 15, 20 ); my @frac = ( 0.8, 0.5 ); my @param_groups; push @nq, ( $nofseq, ( $nofseq * 1.5 ) ); foreach my $wlen (@wlen_group) { foreach my $fract (@frac) { foreach my $q (@nq) { my $rec = {}; $rec->{'file'} = $file; $rec->{'file_neg'} = $file_neg; $rec->{'submt_len'} = 5; $rec->{'submt_d'} = 1; $rec->{'e'} = 0; $rec->{'W_size'} = $wlen; $rec->{'lp'} = $fract * $wlen; $rec->{'support_threshold'} = $q; $rec->{'min_inst_lower'} = $q; $rec->{'min_inst_upper'} = ( 3 * $q ); $rec->{'polyTA_lim'} = 0.8; $rec->{'poly_lim'} = 0.8; push @param_groups, $rec; } # ----- end foreach ----- } # ----- end foreach ----- } return @param_groups; } sub prn_final_sorted { my $AoA = shift; print ">instances\n"; my $count = 0; foreach my $aref ( sort { $b->[1] <=> $a->[1] } @$AoA ) { $count++; print "PATTERN: $aref->[0], $aref->[1]\n"; print "CONSENSUS: $aref->[-1]\n\n"; foreach my $ins ( @{ $aref->[2] } ) { print "$ins\n"; } # ----- end foreach ----- print "Alignment\n"; foreach my $k ( keys %{ $aref->[3] } ) { print "$k $aref->[3]{$k}\n"; } print "=============\n\n"; last if ( $count == 100 ); } return; } # ---------- end of subroutine prn_final_sort ---------- sub append_n { #fastest my ( $str, $array ) = @_; my $nstring = "N" x length($str); foreach my $sbstr ( @$array ) { my $ofs = 0; while ( ( my $pos = index $str, $sbstr, $ofs ) > -1 ) { substr ($nstring, $pos, length ($sbstr)) = $sbstr; $ofs = $pos + 1; } } return $nstring; } sub getlmersfromseq { my ($seqsarr,$l)= @_; my @lmers; @lmers = map {substrings $_, $l} @{$seqsarr}; my @uniq_lmers = uniq @lmers; return @uniq_lmers; } sub getSeqfromfasta2lmers { #designed for getting sequences in to array from a file (fasta format), #input: file name my $file = shift; my @seqs= (); open INFILE, "<$file" or die "$0: Can't open file $file: $!"; my $in = Bio::SeqIO->new(-format => 'fasta', -noclose => 1 , -fh => \*INFILE); while ( my $seq = $in->next_seq() ) { push @seqs, $seq->seq(); } #end while return @seqs; } sub filter { # Filter duplicated Windows my %hoa = @_; my %filtered_hoa; my @key = (sort {$hoa{$a}[1] <=> $hoa{$b}[1] } keys %hoa); my @flag; foreach my $key (@key) { my @submt_only = @{$hoa{$key}}[3..$#{$hoa{$key}}]; # check if the current array is equiv. with *one* before my $lc = List::Compare->new(\@flag,\@submt_only); my $eq = $lc->is_LeqvlntR; if ($eq == 0) { #if they are not equivalent, assign them to a hash $filtered_hoa{$key} = $hoa{$key}; } else { #otherwise skip them next; } @flag = @submt_only; } return %filtered_hoa; } sub print_bypos { my %hoa = @_; # sort by position of motifs in sequence foreach my $key ( sort { $hoa{$a}[1] <=> $hoa{$b}[1] } keys %hoa ) { print "$key $hoa{$key}[1] "; foreach my $i ( 3 .. $#{ $hoa{$key} } ) { print " $hoa{$key}[$i]"; } print "\n"; } } # ---------- end of subroutine print_bypos ---------- sub score_dup { my ( $str, $array ) = @_; my $offset = 0; my $mask = "\0" x length $str; for my $frag ( @$array ) { ( my $idx = index $str, $frag, $offset ) >= 0 or next; substr $mask, $idx, length $frag, $frag; $offset = $idx + 1; } return $mask =~ tr/\0//c; } sub score_nodup { my ($str, $array) = @_; my $vec = "\0" x length($str); for (@$array) { my $idx = index $str, $_; # Matching substrings are padded into position with nulls $vec |= ("\0" x $idx) . $_; } # Matching characters become nulls; others non-nulls $vec ^= $str; # Count nulls $vec =~ tr/\0//; }