# Step through each term found in the parsed content and proceed with
# term indexing.
foreach my $term ( split /\s+/, $self->{'_content'} ) {

    # Normalise the search term, allowing only characters in the range of
    # a-z, A-Z, digits and the underscore character.  All terms are then
    # dropped to lowercase to improve the likelihood of matching search
    # results.
    $term = $self->_normalise( $term );

    # Skip terms which normalise to nothing at all (for example, the empty
    # leading field produced by split when the content starts with
    # whitespace).
    next unless defined $term && length $term;

    # Lingua::Stem::stem returns an array reference; only a single term is
    # passed in, so only the first element is of interest.
    my ( $stem ) = @{ Lingua::Stem::stem( $term ) };

    # Increment the frequency counters for the stemmed term - The
    # _index_count is the count of the number of documents which the stemmed
    # term appears in (not the total count of all appearances of the stemmed
    # term in all documents) while _index_frequency is the number of
    # occurrences of the stemmed term in the current document indexed by
    # $url.
    #
    # The hash _index_stem is important to prevent duplicate document
    # counting for documents which may have a stemmed term appear more than
    # once - The post-increment returns the previous value, so the document
    # count is only incremented the first time the stem is seen.
    # NOTE(review): _index_stem is presumably reset for each document; the
    # reset is not visible in this excerpt - confirm.
    $self->{'_index_count'}{$stem}++ unless $self->{'_index_stem'}{$stem}++;
    $self->{'_index_frequency'}{$stem}{$url}++;
}
The term index weights are subsequently calculated from these stem and per-page counts. The only additional variable in this subroutine that needs explanation is the _crawl_visited hash reference: this hash, indexed by content source URI, stores meta information about the content, so the number of keys it holds is used as the total number of documents when the weights are calculated.
# Calculate the weight of every indexed stem for each document in which it
# appears and store the results in the _tied_weight hash.
#
# The weight of a stem within a document is
#
#     frequency * log( documents / count )
#
# where frequency is the _index_frequency entry for the stem and document,
# count is the _index_count entry for the stem, and documents is the number
# of keys in _crawl_visited.  Each weight is formatted to two decimal places.
#
# Takes the object as its only argument and returns nothing of significance.
# If no documents have been visited there is nothing to weigh against and the
# subroutine returns without touching _tied_weight.
sub weights {
    my ( $self ) = @_;

    # The total number of documents does not change while weighting, so it
    # is counted once rather than once per stem and document.
    my $documents = scalar keys %{ $self->{'_crawl_visited'} || {} };

    # log(0) is a fatal error in Perl ("Can't take log of 0"), so there is
    # nothing sensible to calculate without at least one visited document.
    return unless $documents;

    # Step through each stemmed term indexed
    foreach my $stem ( keys %{ $self->{'_index_count'} || {} } ) {

        # A zero or missing document count would be a fatal division by
        # zero; such a stem cannot be weighted and is skipped.
        my $count = $self->{'_index_count'}{$stem};
        next unless $count;

        # This factor depends only upon the stem, not upon the document.
        my $inverse_frequency = log( $documents / $count );

        # Step through each document in which the stemmed term $stem
        # appears, calculate its weight and store this ranking in the
        # %weights hash.
        my $frequency_of = $self->{'_index_frequency'}{$stem} || {};
        my %weights;
        foreach my $url ( keys %{$frequency_of} ) {
            $weights{$url} = sprintf '%.2f',
                $frequency_of->{$url} * $inverse_frequency;
        }

        # Store ranking score in tied hash - Note the fashion by which the
        # hash reference is built first and then assigned to the MLDBM-tied
        # hash.  This is required due to the limitations of the Perl TIEHASH
        # interface which has no support for multi-dimensional ties.
        $self->{'_tied_weight'}{$stem} = \%weights;
    }

    return;
}
As outlined in this node, the Perlfect search engine, which is written in Perl, may also prove to be a useful reference.
# Prints 570, the decimal value of the binary literal 1000111010.
perl -le 'print oct("0b1000111010")'
In reply to Re: term weight
by rob_au
in thread term weight
by Anonymous Monk
| For: | Use: | ||
| & | &amp; | ||
| < | &lt; | ||
| > | &gt; | ||
| [ | &#91; | ||
| ] | &#93; |