# Step through each term found in the parsed content and proceed with
# term indexing. This fragment relies on $self (the indexer object) and
# $url (the document currently being indexed) supplied by the enclosing
# scope.

foreach my $term ( split /\s+/, $self->{'_content'} ) {

    # Normalise the search term, allowing only characters in the range of
    # a-z, A-Z, digits and the underscore character. All terms are then
    # dropped to lowercase to improve the likelihood of matching search
    # results.

    $term = $self->_normalise( $term );
    next unless length $term;

    my ( $stem ) = @{ Lingua::Stem::stem( $term ) };

    # Increment the frequency counters for the stemmed term - The
    # _index_count is the count of the number of documents which the
    # stemmed term appears in (not the total count of all appearances of
    # the stemmed term in all documents) while _index_frequency is the
    # number of occurrences of the stemmed term in the current document
    # indexed by $url
    #
    # The hash _index_stem is important to prevent duplicate document
    # counting for documents which may have a stemmed term appear more
    # than once.

    ++$self->{'_index_count'}{$stem}
        unless $self->{'_index_stem'}{$stem}++;
    ++$self->{'_index_frequency'}{$stem}{$url};
}

####

# weights( $self )
#
# Calculate a ranking weight for every (stemmed term, document) pair held
# in the index and store the results in $self->{'_tied_weight'}.
#
# For each stem the weight of a document is
#
#     frequency of stem in document * log( visited documents / documents containing stem )
#
# formatted to two decimal places.
#
# Reads:   _crawl_visited, _index_count, _index_frequency
# Writes:  _tied_weight ( stem => { url => weight } )
# Returns: nothing. If no documents have been visited there is nothing
#          meaningful to rank (and log(0) would be fatal), so the method
#          returns without touching _tied_weight.

sub weights {
    my ( $self ) = @_;

    # The number of visited documents does not change while weighting, so
    # count the keys once rather than once per stem/document pair.

    my $document_total = scalar keys %{ $self->{'_crawl_visited'} };
    return unless $document_total > 0;

    # Step through each stemmed term indexed

    foreach my $stem ( keys %{ $self->{'_index_count'} } ) {

        # The logarithmic factor depends only on the stem, not on the
        # individual document, so it is computed once per stem.

        my $rarity = log( $document_total / $self->{'_index_count'}{$stem} );

        # Step through each document in which the stemmed term $stem
        # appears, calculate its weight and store this ranking in the
        # %weights hash.

        my $frequency_in = $self->{'_index_frequency'}{$stem};
        my %weights;
        foreach my $url ( keys %{ $frequency_in } ) {
            $weights{$url} = sprintf "%.2f", $frequency_in->{$url} * $rarity;
        }

        # Store ranking score in tied hash - Note the fashion by which the
        # hash reference is built first and then assigned to the
        # MLDBM-tied hash. This is required due to the limitations of the
        # Perl TIEHASH interface which has no support for
        # multi-dimensional ties.

        $self->{'_tied_weight'}{$stem} = \%weights;
    }

    return;
}