############# Begin WEED subroutine ##############
####
# This subroutine sifts through the arrays of all the matches to the
# sites found in SEARCHFASTA, and organizes them into 'sets' corresponding
# to each hit and all of the other hits occurring within $span of it.
# The format of the incoming hash, %matches, is a HoHoA. The breakdown
# is as follows: Initial hash keys are FASTA_ID lines. The Arrays stored
# as the values for these keys are hashes. The number of key/value pairs
# corresponds to the number of user input search patterns. The values
# for these keys are arrays, which contain the position of m//g matches
# within that FASTA_ID for that search pattern.
# The output of the subroutine will be a more complex structure. We will
# be splitting the subhashes into arrays of (identically organized) subhashes.
# We want to gather together all groups of m//g hits (of all the patterns)
# that occur within $span distance of each other, keeping their hash
# organization. Each group will correspond to an array element.
####
sub WEED {
my ($span, %matches) = @_;
my ($site, $fasta, $sitekey) = '';
my ($setscounter,$lowerlimit,$upperlimit,$set,$yes) = 0;
my %sets = ();
my @newfast;
my $i = 0;
while ($i < $num) {
$sortedKeys[$i] = $i;
$i++;
}
# Rather than the unsorted method of (keys %matches), I iterate through
# the fastheaders array, which stores the fasta_ids of interest, in their
# order within the file. This array will need updating at the end of this
# subroutine
for my $fasta_id (@fastheaders) {
$setscounter = 0;
# We only want to keep groups that contain matches for all patterns
next unless defined %{$matches{$fasta_id}};
# keep the user updated on program progression
print "Currently analysing results from sequence:\n $fasta_id\n";
# The key $site below is an index value (0, 1, 2, ..), so incrementing
# in numerical order makes the most sense. It corresponds to the user-
# input patterns that were searched for
for my $site ( @sortedKeys ) {
# We only want to keep groups that contain matches for all patterns
last unless @{$matches{$fasta_id}{$site}};
$i = 0;
# We are looking for all matches within $span of EVERY match, so
# for each match, find it's position, and the position $span away
WEEDLOW: for my $low (@{$matches{$fasta_id}{$site}}) {
$lowerlimit = $low + 0;
$upperlimit = $span + $lowerlimit;
# Now, any and all matches between $lowerlimit and $upperlimit should be
# saved into an %HoA
for my $sitekey ( @sortedKeys ) {
next unless defined @{$matches{$fasta_id}{$sitekey}};
my @arrayA = ();
# this part should be self-explanatory, though it is the core of the
# sub. For each element, where is it located? If it is below the lower
# limit (only occurring when $lowerlimit defined by a different $sitekey)
# then proceed to the next. If it is above the upperlimit, then there
# won't be any more within the array, so skip to next sitekey
for my $hit (@{$matches{$fasta_id}{$sitekey}}) {
next unless ($hit >= $lowerlimit);
last unless ($hit <= $upperlimit);
# the only reason for the $ggg is that I was having referencing problems, and
# this seemed to alleviate those. I do not understand why.
my $ggg = $hit + 0;
push (@arrayA, $ggg);
$ggg = 0;
next;
}
# Now, if we saved matches to an array above, then we want to keep them.
# setcounter keeps track of how many sets there are, with the sets being
# subhashes containing arrays of hits from ALL sitekeys.
if (@arrayA) {
@{$sets{$fasta_id}[$setscounter]{$sitekey}} = @arrayA;
@arrayA = ();
}
# However, if we did not save any matches, then this whole set ($setscounter)
# is pointless and should be discarded.
else {
%{$sets{$fasta_id}[$setscounter]} = ();
next WEEDLOW;
}
}
# a possibly redundant check to make sure we have matches from ALL patterns
for my $checkhash (keys %{$sets{$fasta_id}[$setscounter]}) {
unless (defined $sets{$fasta_id}[$setscounter]{$checkhash}[0]) {
%{$sets{$fasta_id}[$setscounter]} = ();
last;
}
}
# and another...
if (scalar keys %{$sets{$fasta_id}[$setscounter]} < $num) {
%{$sets{$fasta_id}[$setscounter]} = ();
}
# Finally, if we are sure this is a hit, then increment setscounter
# to get the next set
$setscounter++ if scalar %{$sets{$fasta_id}[$setscounter]};
# closes for my low
}
#closes for my site
}
# Originally, I was getting arrays of empty subhashes, since $setscounter was
# being initialized, though never having a subhash assignedto it. Therefore,
# I threw this in. If there is a subhash present as the last element of the array,
# the scalar call will come back with the bits being used (1/8, etc). Otherwise,
# it will return 0. If it's 0, we pop that out, and there's no array remaining.
if (@{$sets{$fasta_id}}) {
pop @{$sets{$fasta_id}} unless scalar %{$sets{$fasta_id}[$#{$sets{$fasta_id}}]};
}
# this will reset the @fastarray array, since we have possibly removed fasta_id keys from
# our hash of matches.
if (@{$sets{$fasta_id}}) {
push(@newfast,$h);
}
# similarly, we want to get rid of empty hash elements.
else {
delete $sets{$fasta_id};
}
#closes for fastarray
}
@fastarray=();
# as @fastarray is global, this will have global effect, and we do not need to return it.
@fastarray=@newfast;
return %sets;
#closes subroutine
}
############# End WEED subroutine ################
####
for my $hit (@{$matches{$fasta_id}{$sitekey}}) {
next unless ($hit >= $lowerlimit);
last unless ($hit <= $upperlimit);
####
sub WEED {
my ($span, %matches) = @_;
my %sets;
my @newfast;
my @sortedKeys;
my $i = 0;
while ($i < $num) {
$sortedKeys[$i] = $i;
$i++;
}
# Rather than the unsorted method of (keys %matches), I iterate through
# the fastheaders array, which stores the fasta_ids of interest, in their
# order within the file. This array will need updating at the end of this
# subroutine
WEEDFAST: for my $fasta_id (@fastheaders) {
my $setscounter = 0;
# We only want to keep groups that contain matches for all patterns
next unless defined %{$matches{$fasta_id}};
# keep the user updated on program progression
print "Currently analysing results from sequence:\n $fasta_id\n";
# The key $site below is an index value (0, 1, 2, ..), so incrementing
# in numerical order makes the most sense. It corresponds to the user-
# input patterns that were searched for
for my $site ( @sortedKeys ) {
# We only want to keep groups that contain matches for all patterns
last unless defined @{$matches{$fasta_id}{$site}};
# We are looking for all matches within $span of EVERY match, so
# for each match, find it's position, and the position $span away
WEEDLOW: for my $low (@{$matches{$fasta_id}{$site}}) {
my $lowerlimit = $low + 0;
my $upperlimit = $span + $lowerlimit;
# Now, any and all matches between $lowerlimit and $upperlimit should be
# saved into an %HoA
for my $sitekey ( @sortedKeys ) {
next unless defined @{$matches{$fasta_id}{$sitekey}};
# this part should be self-explanatory, though it is the core of the
# sub. For each element, where is it located? If it is below the lower
# limit (only occurring when $lowerlimit defined by a different $sitkey)
# then proceed to the next. If it is above the upperlimit, then there
# won't be any more within the array, so skip to next sitekey
@{$sets{$fasta_id}[$setscounter]{$sitekey}} = grep {
$_ >= $lowerlimit and $_ <= $upperlimit
} @{$matches{$fasta_id}{$sitekey}};
unless (@{$sets{$fasta_id}[$setscounter]{$sitekey}}) {
%{$sets{$fasta_id}[$setscounter]} = ();
next WEEDLOW;
}
}
# a possibly redundant check to make sure we have matches from ALL patterns
if (scalar keys %{$sets{$fasta_id}[$setscounter]} < $num) {
%{$sets{$fasta_id}[$setscounter]} = ();
}
# Finally, if we are sure this is a hit, then increment setscounter
# to get the next set
$setscounter++ if scalar %{$sets{$fasta_id}[$setscounter]};
# closes for my low
}
#closes for my site
}
# Originally, I was getting arrays of empty subhashes, since $setscounter was
# being initialized, though never having a subhash assigned to it. Therefore,
# I threw this in. If there is a subhash present as the last element of the array,
# the scalar call will come back with the bits being used (1/8, etc). Otherwise,
# it will return 0. If it's 0, we pop that out, and there's no array remaining.
unless (defined @{$sets{$fasta_id}}) {
delete $sets{$fasta_id};
next WEEDFAST;
}
if (@{$sets{$fasta_id}}) {
pop @{$sets{$fasta_id}} unless scalar %{$sets{$fasta_id}[$#{$sets{$fasta_id}}]};
}
# this will reset the @fastarray array, since we have possibly removed fasta_id keys from
# our hash of matches.
if (@{$sets{$fasta_id}}) {
push(@newfast,$fasta_id);
}
# similarly, we want to get rid of empty hash elements.
else {
delete $sets{$fasta_id};
}
#closes for fastarray
}
# as @fastarray is global, this will have global effect, and we do not need to return it.
@fastarray=@newfast;
return %sets;
#closes subroutine
}