in reply to Re: Re: Re: Re: Simple Text Indexing
in thread Simple Text Indexing
Comments are inline.
use strict;
use warnings;
use Storable;
use Scalar::Util qw(looks_like_number);

# Package globals. $TEXTS and $INDEX are declared for compatibility with the
# original script but are not used anywhere in this file.
our ( $CONTEXT_LINES, $TEXTS, %STOPWORDS, $INDEX );

# Number of lines of context an indexed offset should cover.
$CONTEXT_LINES = 3;

# Words that are never indexed. Prefer reading these from a file.
@STOPWORDS{
    qw(
        the a an of and on in by with at he after
        into their is that they for to it them which
    )
} = ();

# Prefer globbing texts/*.txt over "chdir 'text'; glob '*.txt'".
my @files = glob "texts/*.txt";

if (@files) {
    # Map each file name to its word index (see index_file below).
    my %file_idx = map { ; $_ => index_file( $_, $CONTEXT_LINES ) } @files;

    # TODO(review): this path is relative to the current directory, while the
    # inputs are read from ./texts -- confirm "../index" is really intended.
    store( \%file_idx, "../index/text.idx" )
        or die "Couldn't store index to ../index/text.idx: $!";
}
else {
    # Do not clobber an existing index with an empty one.
    warn "No files matched texts/*.txt; index not written\n";
}

=pod

Shape of the stored structure: one entry per indexed file, mapping each
word to the list of byte offsets at which its context starts.

    {
        'texts/foo.txt' => {
            word    => [ 1, 3, 5, 6 ],
            another => [ 5, 7, 2 ],
        },
        'texts/barfoo.txt' => { ... },
    }

=cut

# index_file( $filename, $lines_of_context )
#
# Builds a word index for one text file.
#
#   $filename          path of the file to read
#   $lines_of_context  optional; number of lines a context window spans.
#                      Falls back to 1 when missing, non-numeric or <= 0.
#
# Returns a hash reference mapping each lower-cased word to an array
# reference of byte offsets, one per occurrence. Each offset is the start
# of the line that opens the context window for that occurrence: the
# window ends on the line containing the word, except near the top of the
# file where it starts at the first line.
#
# Stopwords (%STOPWORDS), page references such as "p.12", rules of five or
# more dashes, and tokens that are empty after punctuation stripping are
# not indexed.
#
# Dies if the file cannot be opened; warns if it cannot be closed.
sub index_file {
    my ( $filename, $lines_of_context ) = @_;

    $lines_of_context = 1
        unless defined $lines_of_context
        && looks_like_number($lines_of_context)
        && $lines_of_context > 0;

    open my $fh, "<", $filename
        or die "Couldn't open $filename: $!";

    my @line_starts;    # start offsets of the lines inside the window
    my %index;

    # Offset of the line about to be read. It must be captured *before*
    # the read: tell() after the read points at the following line.
    my $line_start = tell $fh;

    while ( my $line = <$fh> ) {
        push @line_starts, $line_start;
        $line_start = tell $fh;

        # Until the window is full, context begins at the first line seen;
        # afterwards it slides forward one line per line read.
        my $offset =
            @line_starts < $lines_of_context
            ? $line_starts[0]
            : shift @line_starts;

        # Prefer ' ' over /\s+/ here: it also discards leading whitespace.
        # See "split" in perlfunc.
        for my $word ( split ' ', $line ) {
            $word = lc $word;

            # Prefer character classes to alternation when possible.
            # Strips one trailing comma/period and any bracket, paren,
            # semicolon, colon or exclamation mark.
            $word =~ s/ [,.] \z | [\]\[();:!] //gx;

            next if $word eq ''              # nothing left but punctuation
                or exists $STOPWORDS{$word}
                # Prefer \d+ to (\d)+ (unless that is *really* what you mean).
                or $word =~ /p\.\d+/
                # No trailing '?', which would make the match always succeed.
                or $word =~ /-{5,}/;

            push @{ $index{$word} }, $offset;
        }
    }

    close $fh
        or warn "Couldn't close $filename: $!";

    return \%index;
}
|
|---|