############# Begin WEED subroutine ############## #### # This subroutine sifts through the arrays of all the matches to the # sites found in SEARCHFASTA, and organizes them into 'sets' corresponding # to each hit and all of the other hits occurring within $span of it. # The format of the incoming hash, %matches, is a HoHoA. The breakdown # is as follows: Initial hash keys are FASTA_ID lines. The Arrays stored # as the values for these keys are hashes. The number of key/value pairs # corresponds to the number of user input search patterns. The values # for these keys are arrays, which contain the position of m//g matches # within that FASTA_ID for that search pattern. # The output of the subroutine will be a more complex structure. We will # be splitting the subhashes into arrays of (identically organized) subhashes. # We want to gather together all groups of m//g hits (of all the patterns) # that occur within $span distance of each other, keeping their hash # organization. Each group will correspond to an array element. #### sub WEED { my ($span, %matches) = @_; my ($site, $fasta, $sitekey) = ''; my ($setscounter,$lowerlimit,$upperlimit,$set,$yes) = 0; my %sets = (); my @newfast; my $i = 0; while ($i < $num) { $sortedKeys[$i] = $i; $i++; } # Rather than the unsorted method of (keys %matches), I iterate through # the fastheaders array, which stores the fasta_ids of interest, in their # order within the file. This array will need updating at the end of this # subroutine for my $fasta_id (@fastheaders) { $setscounter = 0; # We only want to keep groups that contain matches for all patterns next unless defined %{$matches{$fasta_id}}; # keep the user updated on program progression print "Currently analysing results from sequence:\n $fasta_id\n"; # The key $site below is an index value (0, 1, 2, ..), so incrementing # in numerical order makes the most sense. It corresponds to the user- # input patterns that were searched for for my $site ( @sortedKeys ) { # We only want to keep groups that contain matches for all patterns last unless @{$matches{$fasta_id}{$site}}; $i = 0; # We are looking for all matches within $span of EVERY match, so # for each match, find it's position, and the position $span away WEEDLOW: for my $low (@{$matches{$fasta_id}{$site}}) { $lowerlimit = $low + 0; $upperlimit = $span + $lowerlimit; # Now, any and all matches between $lowerlimit and $upperlimit should be # saved into an %HoA for my $sitekey ( @sortedKeys ) { next unless defined @{$matches{$fasta_id}{$sitekey}}; my @arrayA = (); # this part should be self-explanatory, though it is the core of the # sub. For each element, where is it located? If it is below the lower # limit (only occurring when $lowerlimit defined by a different $sitekey) # then proceed to the next. If it is above the upperlimit, then there # won't be any more within the array, so skip to next sitekey for my $hit (@{$matches{$fasta_id}{$sitekey}}) { next unless ($hit >= $lowerlimit); last unless ($hit <= $upperlimit); # the only reason for the $ggg is that I was having referencing problems, and # this seemed to alleviate those. I do not understand why. my $ggg = $hit + 0; push (@arrayA, $ggg); $ggg = 0; next; } # Now, if we saved matches to an array above, then we want to keep them. # setcounter keeps track of how many sets there are, with the sets being # subhashes containing arrays of hits from ALL sitekeys. if (@arrayA) { @{$sets{$fasta_id}[$setscounter]{$sitekey}} = @arrayA; @arrayA = (); } # However, if we did not save any matches, then this whole set ($setscounter) # is pointless and should be discarded. else { %{$sets{$fasta_id}[$setscounter]} = (); next WEEDLOW; } } # a possibly redundant check to make sure we have matches from ALL patterns for my $checkhash (keys %{$sets{$fasta_id}[$setscounter]}) { unless (defined $sets{$fasta_id}[$setscounter]{$checkhash}[0]) { %{$sets{$fasta_id}[$setscounter]} = (); last; } } # and another... if (scalar keys %{$sets{$fasta_id}[$setscounter]} < $num) { %{$sets{$fasta_id}[$setscounter]} = (); } # Finally, if we are sure this is a hit, then increment setscounter # to get the next set $setscounter++ if scalar %{$sets{$fasta_id}[$setscounter]}; # closes for my low } #closes for my site } # Originally, I was getting arrays of empty subhashes, since $setscounter was # being initialized, though never having a subhash assignedto it. Therefore, # I threw this in. If there is a subhash present as the last element of the array, # the scalar call will come back with the bits being used (1/8, etc). Otherwise, # it will return 0. If it's 0, we pop that out, and there's no array remaining. if (@{$sets{$fasta_id}}) { pop @{$sets{$fasta_id}} unless scalar %{$sets{$fasta_id}[$#{$sets{$fasta_id}}]}; } # this will reset the @fastarray array, since we have possibly removed fasta_id keys from # our hash of matches. if (@{$sets{$fasta_id}}) { push(@newfast,$h); } # similarly, we want to get rid of empty hash elements. else { delete $sets{$fasta_id}; } #closes for fastarray } @fastarray=(); # as @fastarray is global, this will have global effect, and we do not need to return it. @fastarray=@newfast; return %sets; #closes subroutine } ############# End WEED subroutine ################ #### for my $hit (@{$matches{$fasta_id}{$sitekey}}) { next unless ($hit >= $lowerlimit); last unless ($hit <= $upperlimit); #### sub WEED { my ($span, %matches) = @_; my %sets; my @newfast; my @sortedKeys; my $i = 0; while ($i < $num) { $sortedKeys[$i] = $i; $i++; } # Rather than the unsorted method of (keys %matches), I iterate through # the fastheaders array, which stores the fasta_ids of interest, in their # order within the file. This array will need updating at the end of this # subroutine WEEDFAST: for my $fasta_id (@fastheaders) { my $setscounter = 0; # We only want to keep groups that contain matches for all patterns next unless defined %{$matches{$fasta_id}}; # keep the user updated on program progression print "Currently analysing results from sequence:\n $fasta_id\n"; # The key $site below is an index value (0, 1, 2, ..), so incrementing # in numerical order makes the most sense. It corresponds to the user- # input patterns that were searched for for my $site ( @sortedKeys ) { # We only want to keep groups that contain matches for all patterns last unless defined @{$matches{$fasta_id}{$site}}; # We are looking for all matches within $span of EVERY match, so # for each match, find it's position, and the position $span away WEEDLOW: for my $low (@{$matches{$fasta_id}{$site}}) { my $lowerlimit = $low + 0; my $upperlimit = $span + $lowerlimit; # Now, any and all matches between $lowerlimit and $upperlimit should be # saved into an %HoA for my $sitekey ( @sortedKeys ) { next unless defined @{$matches{$fasta_id}{$sitekey}}; # this part should be self-explanatory, though it is the core of the # sub. For each element, where is it located? If it is below the lower # limit (only occurring when $lowerlimit defined by a different $sitkey) # then proceed to the next. If it is above the upperlimit, then there # won't be any more within the array, so skip to next sitekey @{$sets{$fasta_id}[$setscounter]{$sitekey}} = grep { $_ >= $lowerlimit and $_ <= $upperlimit } @{$matches{$fasta_id}{$sitekey}}; unless (@{$sets{$fasta_id}[$setscounter]{$sitekey}}) { %{$sets{$fasta_id}[$setscounter]} = (); next WEEDLOW; } } # a possibly redundant check to make sure we have matches from ALL patterns if (scalar keys %{$sets{$fasta_id}[$setscounter]} < $num) { %{$sets{$fasta_id}[$setscounter]} = (); } # Finally, if we are sure this is a hit, then increment setscounter # to get the next set $setscounter++ if scalar %{$sets{$fasta_id}[$setscounter]}; # closes for my low } #closes for my site } # Originally, I was getting arrays of empty subhashes, since $setscounter was # being initialized, though never having a subhash assigned to it. Therefore, # I threw this in. If there is a subhash present as the last element of the array, # the scalar call will come back with the bits being used (1/8, etc). Otherwise, # it will return 0. If it's 0, we pop that out, and there's no array remaining. unless (defined @{$sets{$fasta_id}}) { delete $sets{$fasta_id}; next WEEDFAST; } if (@{$sets{$fasta_id}}) { pop @{$sets{$fasta_id}} unless scalar %{$sets{$fasta_id}[$#{$sets{$fasta_id}}]}; } # this will reset the @fastarray array, since we have possibly removed fasta_id keys from # our hash of matches. if (@{$sets{$fasta_id}}) { push(@newfast,$fasta_id); } # similarly, we want to get rid of empty hash elements. else { delete $sets{$fasta_id}; } #closes for fastarray } # as @fastarray is global, this will have global effect, and we do not need to return it. @fastarray=@newfast; return %sets; #closes subroutine }