############# Begin WEED subroutine ##############      
      
####
#  This subroutine sifts through the arrays of all the matches to the 
#  sites found in SEARCHFASTA, and organizes them into 'sets' corresponding
#  to each hit and all of the other hits occurring within $span of it.
#  The format of the incoming hash, %matches, is a HoHoA.  The breakdown
#  is as follows:  Initial hash keys are FASTA_ID lines.  The Arrays stored
#  as the values for these keys are hashes.  The number of key/value pairs
#  corresponds to the number of user input search patterns.  The values 
#  for these keys are arrays, which contain the position of m//g matches
#  within that FASTA_ID for that search pattern.
#  The output of the subroutine will be a more complex structure.  We will
#  be splitting the subhashes into arrays of (identically organized) subhashes.
#  We want to gather together all groups of m//g hits (of all the patterns)
#  that occur within $span distance of each other, keeping their hash
#  organization.  Each group will correspond to an array element.
####

sub WEED {

 my ($span, %matches) = @_;
 my ($site, $fasta, $sitekey) = '';
 my ($setscounter,$lowerlimit,$upperlimit,$set,$yes) = 0;
 my %sets = ();
 my @newfast;
 
 my $i = 0;
 
 while ($i < $num) {
   $sortedKeys[$i] = $i;
   $i++;
   }

 # Rather than the unsorted method of (keys %matches), I iterate through
 # the fastheaders array, which stores the fasta_ids of interest, in their
 # order within the file. This array will need updating at the end of this 
 # subroutine
 for my $fasta_id (@fastheaders) {
  $setscounter = 0;
  
  # We only want to keep groups that contain matches for all patterns
  next unless defined %{$matches{$fasta_id}};
  
  # keep the user updated on program progression
  print "Currently analysing results from sequence:\n     $fasta_id\n";
   
   #  The key $site below is an index value (0, 1, 2, ..), so incrementing
   #  in numerical order makes the most sense.  It corresponds to the user-
   #  input patterns that were searched for
   for my $site ( @sortedKeys ) {

     #  We only want to keep groups that contain matches for all patterns
     last unless @{$matches{$fasta_id}{$site}};
     $i = 0;
    
     #  We are looking for all matches within $span of EVERY match, so 
     #  for each match, find it's position, and the position $span away
          
     WEEDLOW: for my $low (@{$matches{$fasta_id}{$site}}) {
       $lowerlimit = $low + 0;
       $upperlimit = $span + $lowerlimit;
    
       #  Now, any and all matches between $lowerlimit and $upperlimit should be 
       #  saved into an %HoA
       
       for my $sitekey (  @sortedKeys ) {
          next unless defined @{$matches{$fasta_id}{$sitekey}};
          my @arrayA = ();
    
       # this part should be self-explanatory, though it is the core of the
       # sub.  For each element, where is it located?  If it is below the lower
       # limit (only occurring when $lowerlimit defined by a different $sitekey)
       # then proceed to the next.  If it is above the upperlimit, then there
       # won't be any more within the array, so skip to next sitekey
      
          for my $hit (@{$matches{$fasta_id}{$sitekey}}) {
            next unless ($hit >= $lowerlimit);
            last unless ($hit <= $upperlimit);
     
       # the only reason for the $ggg is that I was having referencing problems, and
       # this seemed to alleviate those.  I do not understand why.
       
            my $ggg = $hit + 0;
            push (@arrayA, $ggg);
            $ggg = 0;
            next;
            }
       # Now, if we saved matches to an array above, then we want to keep them.    
       # setcounter keeps track of how many sets there are, with the sets being
       # subhashes containing arrays of hits from ALL sitekeys.
          if (@arrayA) {
            @{$sets{$fasta_id}[$setscounter]{$sitekey}} = @arrayA;
            @arrayA = ();
            }
       # However, if we did not save any matches, then this whole set ($setscounter)
       # is pointless and should be discarded.
            else {
              %{$sets{$fasta_id}[$setscounter]} = ();
              next WEEDLOW;
              } 
          } 
 
       # a possibly redundant check to make sure we have matches from ALL patterns
    
       for my $checkhash (keys %{$sets{$fasta_id}[$setscounter]}) {
         unless (defined $sets{$fasta_id}[$setscounter]{$checkhash}[0]) {
           %{$sets{$fasta_id}[$setscounter]} = ();
           last;
           }
         }
       # and another...
       
       if (scalar keys %{$sets{$fasta_id}[$setscounter]} < $num) {
         %{$sets{$fasta_id}[$setscounter]} = ();
         } 
       # Finally, if we are sure this is a hit, then increment setscounter
       # to get the next set 
       $setscounter++ if scalar %{$sets{$fasta_id}[$setscounter]};
         
       # closes for my low
       }
    #closes for my site
    }
  
  # Originally, I was getting arrays of empty subhashes, since $setscounter was
  # being initialized, though never having a subhash assignedto it.  Therefore, 
  # I threw this in.  If there is a subhash present as the last element of the array,
  # the scalar call will come back with the bits being used (1/8, etc).  Otherwise,
  # it will return 0.  If it's 0, we pop that out, and there's no array remaining.

  if (@{$sets{$fasta_id}}) {
    pop @{$sets{$fasta_id}} unless scalar %{$sets{$fasta_id}[$#{$sets{$fasta_id}}]};
    }

# this will reset the @fastarray array, since we have possibly removed fasta_id keys from
# our hash of matches.

  if (@{$sets{$fasta_id}}) {
    push(@newfast,$h);
    }
  # similarly, we want to get rid of empty hash elements.  
  else {
    delete $sets{$fasta_id};
    }
  
  #closes for fastarray
  }
 @fastarray=();

# as @fastarray is global, this will have global effect, and we do not need to return it.

 @fastarray=@newfast;
 return %sets;
 #closes subroutine
 }

############# End WEED subroutine ################

##</code><code>##

          for my $hit (@{$matches{$fasta_id}{$sitekey}}) {
            next unless ($hit >= $lowerlimit);
            last unless ($hit <= $upperlimit);

##</code><code>##

sub WEED {

 my ($span, %matches) = @_;
 my %sets;
 my @newfast;
 my @sortedKeys;
 my $i = 0;
 
 while ($i < $num) {
   $sortedKeys[$i] = $i;
   $i++;
   }
 
 # Rather than the unsorted method of (keys %matches), I iterate through
 # the fastheaders array, which stores the fasta_ids of interest, in their
 # order within the file. This array will need updating at the end of this 
 # subroutine

 WEEDFAST: for my $fasta_id (@fastheaders) {
  my $setscounter = 0;
  
  # We only want to keep groups that contain matches for all patterns
 
  next unless defined %{$matches{$fasta_id}};
  
  # keep the user updated on program progression
 
  print "Currently analysing results from sequence:\n     $fasta_id\n";
   
   #  The key $site below is an index value (0, 1, 2, ..), so incrementing
   #  in numerical order makes the most sense.  It corresponds to the user-
   #  input patterns that were searched for
 
   for my $site ( @sortedKeys ) {

     #  We only want to keep groups that contain matches for all patterns
  
     last unless defined @{$matches{$fasta_id}{$site}};
     
     #  We are looking for all matches within $span of EVERY match, so 
     #  for each match, find it's position, and the position $span away
          
     WEEDLOW: for my $low (@{$matches{$fasta_id}{$site}}) {
       my $lowerlimit = $low + 0;
       my $upperlimit = $span + $lowerlimit;
    
       #  Now, any and all matches between $lowerlimit and $upperlimit should be 
       #  saved into an %HoA
       
       for my $sitekey (  @sortedKeys ) {
          next unless defined @{$matches{$fasta_id}{$sitekey}};
    
       # this part should be self-explanatory, though it is the core of the
       # sub.  For each element, where is it located?  If it is below the lower
       # limit (only occurring when $lowerlimit defined by a different $sitkey)
       # then proceed to the next.  If it is above the upperlimit, then there
       # won't be any more within the array, so skip to next sitekey
      
          @{$sets{$fasta_id}[$setscounter]{$sitekey}} = grep {
            $_ >= $lowerlimit and $_ <= $upperlimit
            }  @{$matches{$fasta_id}{$sitekey}};
          
          unless (@{$sets{$fasta_id}[$setscounter]{$sitekey}}) {
            %{$sets{$fasta_id}[$setscounter]} = ();
            next WEEDLOW;
            }
          } 
 
       # a possibly redundant check to make sure we have matches from ALL patterns
       
       if (scalar keys %{$sets{$fasta_id}[$setscounter]} < $num) {
         %{$sets{$fasta_id}[$setscounter]} = ();
         } 
 
       # Finally, if we are sure this is a hit, then increment setscounter
       # to get the next set 
       
       $setscounter++ if scalar %{$sets{$fasta_id}[$setscounter]};
         
       # closes for my low
       }
    #closes for my site
    }
  
  # Originally, I was getting arrays of empty subhashes, since $setscounter was
  # being initialized, though never having a subhash assigned to it.  Therefore, 
  # I threw this in.  If there is a subhash present as the last element of the array,
  # the scalar call will come back with the bits being used (1/8, etc).  Otherwise,
  # it will return 0.  If it's 0, we pop that out, and there's no array remaining.
  
  unless (defined @{$sets{$fasta_id}}) {
    delete $sets{$fasta_id};
    next WEEDFAST;
    }

  if (@{$sets{$fasta_id}}) {
    pop @{$sets{$fasta_id}} unless scalar %{$sets{$fasta_id}[$#{$sets{$fasta_id}}]};
    }

# this will reset the @fastarray array, since we have possibly removed fasta_id keys from
# our hash of matches.

  if (@{$sets{$fasta_id}}) {
    push(@newfast,$fasta_id);
    }
  # similarly, we want to get rid of empty hash elements.  
  else {
    delete $sets{$fasta_id};
    }
  
  #closes for fastarray
  }
 
# as @fastarray is global, this will have global effect, and we do not need to return it.

 @fastarray=@newfast;
 return %sets;
 #closes subroutine
 }