comment on

Dear BrowserUk,
Basically what my code is doing is to take sets of DNA input sequences, and find a conserved substrings within them.

The size of the variable expand, because for each length "W" strings from the input sequence I collect the again the substrings of it.

So here I ran the "main_process" subroutine multiple time given parameter sets (generated with gen_param subroutine).

Dont' be overwhelmed with my code below. You can ignore much of it. The out of memory message only occur after it completes the first set of parameters, and then it breaks. See the last portion of "main_process" subroutine.

Really hope to hear from you again.

#!/usr/bin/perl -w
use Data::Dumper;
use String::Substrings;
use Algorithm::Loops 'NestedLoops';
use List::Compare;
use Bio::SeqIO;
use List::MoreUtils qw(uniq);
use sp_neg_eff;
use Devel::Size qw (size total_size);

my $f        = $ARGV[0];
my $file     = "files/$f.fasta"; 
my $nofseq      = scalar(getSeqfromfasta2lmers($file));
my $name     =  (split(/\./,$file))[0]; 
my $file_neg = "$name.rsat";

my @param_groups = gen_param($file,$file_neg,$nofseq);
#print Dumper \@param_groups;
# Some comment

my ( $count, %result) = 1;

foreach  ( @param_groups )
{
    print "TAG: ParamGroup" . $count ,"\n";
    print STDERR "Processing ParamGroup $count\n";                    
+      
    my $output = main_process( @{$_}{
                                        qw/
                                           file
                                           file_neg
                                           submt_len          
                                           submt_d            
                                           e
                                           W_size             
                                           lp                   
                                           support_threshold  
                                           min_inst_lower     
                                           min_inst_upper      
                                           polyTA_lim         
                                           poly_lim           
                                        /
                                    });
      $result{'ParamGroup'.$count++} = $output;

}               

#----------------

sub main_process {

    my (
        $file,
        $file_neg,
        $submt_len, 
        $submt_d,  
        $e,
        $W_size,     
        $lp,     
        $support_threshold, 
        $min_inst_lower,   
        $min_inst_upper,  
        $polyTA_lim,     
        $poly_lim
       ) = @_;      


        my @input_seqs            = getSeqfromfasta2lmers($file);
        my $ip                    = @input_seqs;
        my @mcands    = getlmersfromseq(\@input_seqs,$W_size);

        my $mc                    = @mcands;
        my $skip               = 0; 


        my %bighash;    # HoAoH with motif-candidate as key, then Wind
+ows as key,
                        # motif-cand => [ { motif inst (windows) 
                        #                   => seqn1-positions--submot
+_score-submotifs}
                        #                 {  motif inst (windows)
                        #                   => seqn2-positions--submot
+_score-submotifs} ...]

        my @all_array;
        my $count_mcand; # mcand to be processed in apriori

        my $seqlen;
        #my $count_win;

        foreach my $mcands (@mcands) {
            #Submotifs of motif (window)
            my @Wsub = substrings $mcands, $submt_len;    
            my %fhash;
            my $seqno;
            my @winfromallseq;

            foreach my $seqs (@input_seqs) {
                $seqlen = length $seqs;

                #windows taken from each motif Candidate
                my @line_W = substrings $seqs, $W_size;    
                my $count_lineW = @line_W;
                my %candwin_with_submot;

                $seqno += 1;

                for ( my $i = 0; $i <= $#line_W; $i++ ) {
                    my @submt_Wfrom_line = substrings $line_W[$i], $su
+bmt_len;

                    # array of submotifs from each windows taken from 
+line

                    my @matches;
                    my @matches_and_pos;

                    for ( my $k = 0; $k <= $#Wsub; $k++ ) {
                        my $dif = hd( $submt_Wfrom_line[$k], $Wsub[$k]
+ );

                        if ( $dif <= $submt_d ) {
                            push @matches, $submt_Wfrom_line[$k];
                        }
                        my $mcount = @matches;
                    }

                    if (@matches) {
                        $candwin_with_submot{ $line_W[$i] . " "
                                . ( $seqno - 1 ) . " "
                                . ( $i - $seqlen ) } = [@matches];
                    }
                    else {
                        $i += $skip;
                    }

                }

                # a HoA: motif inst (windows) => seqno-position--submo
+t_score-submotifs
                my %hash; 

                foreach my $key ( keys %candwin_with_submot ) {

                    # @key elem contain motif-pos-seqno
                    my @key_elems = split( /\s/, $key );    
                    my $score = score_dup( $key_elems[0],
                        $candwin_with_submot{$key} );

                    my @array;

                    if ( $score >= $lp ) {
                        push @array,
                            (
                              $key_elems[1], 
                              $key_elems[2], 
                              $score,
                              @{ $candwin_with_submot{$key} },
                            );
                        $hash{ $key_elems[0] } = [@array];
                    }
                }

                #filtered hash for windows from every seqs
                %fhash = filter(%hash);    

                push @winfromallseq, {%fhash};

                #$bighash{$mcands} = [ @winfromallseq ];
                #print Dumper \@winfromallseq ;

            }  # -----  end foreach $seqs  -----

            my %output_hash;
            my $align_score;
            my @to_align;
            my %sb_hash;

            for my $winfromallseq (@winfromallseq) {

                if ($winfromallseq) {

                    for my $mcand_nstr ( keys %{$winfromallseq} ) {
                        my @tmp_ar;
                        push @tmp_ar, @{ $winfromallseq->{$mcand_nstr}
+ };
                        my $sid  = $tmp_ar[0];
                        my $ipos = $tmp_ar[1];

                        my @sbmt_slice = @tmp_ar[ 3 .. $#tmp_ar ];
                        $sb_hash{ $sid . ',' . $ipos . ',' . $mcand_ns
+tr }
                            = [@sbmt_slice];

                    }

                }

            }

            my $no_ins = scalar( keys %sb_hash );

            if (   $no_ins >= $min_inst_lower
                && $no_ins <= $min_inst_upper )
            {
                $count_mcand++;
                print STDERR "Motif Candidates Number $count_mcand\n"
                    if ( $count_mcand % 10 == 0 );

                # Apriori_space subroutine 
                # is other complex process
                # taken from "sp" package


                my @ap_array = apriori_space( \%sb_hash, $W_size, $sub
+mt_len, $submt_d,
                    $e, $support_threshold );

                if (@ap_array) {
                    prn_final_sorted(\@ap_array); 
                }

            }

        }    # -----  end foreach $mcands  -----


 
        return;
    
   # The code fails after the first main 
   # Process is completed.

} #------sub main_process-----    



#---------------------------------------------------------------------
+---------
#---------                        Subroutines                  -------
+---------
#---------------------------------------------------------------------
+---------
sub gen_param {
    my ( $file, $file_neg, $nofseq ) = @_;
    my @nq;
    my @wlen_group = ( 8, 15, 20 );
    my @frac = ( 0.8, 0.5 );
    my @param_groups;

    push @nq, ( $nofseq, ( $nofseq * 1.5 ) );

    foreach my $wlen (@wlen_group) {
        foreach my $fract (@frac) {
            foreach my $q (@nq) {
                my $rec = {};

                $rec->{'file'}              = $file;
                $rec->{'file_neg'}          = $file_neg;
                $rec->{'submt_len'}         = 5;
                $rec->{'submt_d'}           = 1;
                $rec->{'e'}                 = 0;
                $rec->{'W_size'}            = $wlen;
                $rec->{'lp'}                = $fract * $wlen;
                $rec->{'support_threshold'} = $q;
                $rec->{'min_inst_lower'}    = $q;
                $rec->{'min_inst_upper'}    = ( 3 * $q );
                $rec->{'polyTA_lim'}        = 0.8;
                $rec->{'poly_lim'}          = 0.8;

                push @param_groups, $rec;
            }    # -----  end foreach  -----

        }    # -----  end foreach  -----

    }

    return @param_groups;
}

sub prn_final_sorted {
    my $AoA = shift;

    print ">instances\n";
    my $count = 0;

    foreach my $aref ( sort { $b->[1] <=> $a->[1] } @$AoA ) {
        $count++;
        print "PATTERN: $aref->[0], $aref->[1]\n";
        print "CONSENSUS: $aref->[-1]\n\n";

        foreach my $ins ( @{ $aref->[2] } ) {
            print "$ins\n";
        }    # -----  end foreach  -----

        print "Alignment\n";

        foreach my $k ( keys %{ $aref->[3] } ) {
            print "$k  $aref->[3]{$k}\n";
        }

        print "=============\n\n";

        last if ( $count == 100 );

    }

    return;
}    # ----------  end of subroutine prn_final_sort  ----------


sub append_n 
{     #fastest
      my ( $str, $array ) = @_;
      my $nstring = "N" x length($str);

      foreach my $sbstr ( @$array )
      {
        my $ofs = 0;
        while ( ( my $pos = index $str, $sbstr, $ofs ) > -1 ) {
          substr ($nstring, $pos, length ($sbstr)) = $sbstr;
          $ofs = $pos + 1;
        }
      }
      return $nstring;
}



sub getlmersfromseq
{
    my  ($seqsarr,$l)= @_;
    my  @lmers;
         
    @lmers = map {substrings $_, $l} @{$seqsarr};
    my @uniq_lmers = uniq @lmers;
    return @uniq_lmers;
}

sub getSeqfromfasta2lmers
{
    #designed for getting sequences in to array from a file  (fasta fo
+rmat), 
    #input: file name 
    
    my $file = shift;
    my @seqs= ();

    open INFILE, "<$file" or die "$0:  Can't open file $file: $!";
    my $in = Bio::SeqIO->new(-format => 'fasta',
                         -noclose => 1 ,
                         -fh => \*INFILE);

    while ( my $seq = $in->next_seq() ) {
    push @seqs, $seq->seq();
    } #end while
       
    return @seqs;

}

sub filter
{       # Filter duplicated Windows 
    my %hoa = @_;
     my %filtered_hoa;
     my @key = (sort {$hoa{$a}[1] <=> $hoa{$b}[1] } keys %hoa);    
    my @flag; 
    
    foreach my $key (@key)
    {  
        my @submt_only = @{$hoa{$key}}[3..$#{$hoa{$key}}]; 
        
        # check if the current array is equiv. with *one* before
            my $lc = List::Compare->new(\@flag,\@submt_only);
        my $eq = $lc->is_LeqvlntR;
            
        if ($eq == 0)
        {   #if they are not equivalent, assign them to a hash
        $filtered_hoa{$key} = $hoa{$key};
        }
        else
        {    #otherwise skip them
        next;
        }
        @flag = @submt_only;
    }

    return %filtered_hoa;    
}    


sub print_bypos {
    my %hoa = @_;

    # sort by position of motifs in sequence
    foreach my $key ( sort { $hoa{$a}[1] <=> $hoa{$b}[1] } keys %hoa )
+ {
        print "$key $hoa{$key}[1] ";

        foreach my $i ( 3 .. $#{ $hoa{$key} } ) {
            print " $hoa{$key}[$i]";
        }
        print "\n";
    }

}    # ----------  end of subroutine print_bypos  ----------

sub score_dup 
{
      my ( $str, $array ) = @_;
      my $offset = 0;
      my $mask = "\0" x length $str;

      for my $frag ( @$array ) {
    ( my $idx = index $str, $frag, $offset ) >= 0 or next;
        substr $mask, $idx, length $frag, $frag;
        $offset = $idx + 1;
    }

      return $mask =~ tr/\0//c;
}


sub score_nodup 
{
    my ($str, $array) = @_;
    my $vec = "\0" x length($str);
    for (@$array) {
        my $idx = index $str, $_;
        # Matching substrings are padded into position with nulls
        $vec |= ("\0" x $idx) . $_;
    }
    # Matching characters become nulls; others non-nulls
    $vec ^= $str;
    # Count nulls
    $vec =~ tr/\0//;
}
[download]

---
neversaint and everlastingly indebted.......

In reply to Re^6: Attempt to free temp prematurely and unreferenced scalar by neversaint
in thread Attempt to free temp prematurely and unreferenced scalar by neversaint

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.