s[\b($word)\b][$1]g; #### Highlighting works by this call: $text = highlight( \$text, $stemmer_function, set_match_regexp( \%heads ) ); First, this builds the three regular expressions: ------------------------------------------------- sub set_match_regexp { my $header = shift; my ($query, $wc, $ignoref, $ignorel ) = @{$header}{'parsed words', qw/wordcharacters ignorefirstchar ignorelastchar/}; $wc = quotemeta $wc; # build regexp to match against the query words # If a trailing "*" is found then do a wild card search. my $match_string = join '|', map { substr( $_, -1, 1 ) eq '*' ? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?" : quotemeta } # get rid of operator words and characters grep { ! /^(and|or|not|["()=])$/oi } split /\s+/, $query; # These are to remove and capture the ignorefirst # and ingnorelast chars for ( $ignoref, $ignorel ) { if ( $_ ) { $_ = quotemeta; $_ = "([$_]*)"; } else { $_ = '()'; } } # Avoid /i matching... $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing # return the three compiled expressions # 1) used to split source into swish/non-swish words # 2) used to strip ignorefirst/ignorelast # 3) used to match final word against query word(s) return ( qr/([^$wc]+)/o, qr/^$ignoref([$wc]+?)$ignorel$/io, qr/^$match_string$/, ); And here's the time killer: --------------------------- sub highlight { my ( $text_ref, $stemmer_function, $wc_regexp, $extract_regexp, $match_regexp ) = @_; my $last = 0; # Split source into swish-words and non-swish words. # Every other element is a swish-word due to the capture my @words = split /$wc_regexp/, $$text_ref; # This keeps track of what words to print (context printing) my @flags; $flags[$#words] = 0; # Extend array. my $occurrences = $Occurrences; # limit number of matches my $pos = 0; while ( $pos <= $#words ) { # pull out the (begin_chars, match word, end_chars) if ( $words[$pos] =~ /$extract_regexp/ ) { my ( $begin, $word, $end ) = ( $1, $2, $3 ); # Stem word if enabled, else just use lower case my $test = $stemmer_function ? $stemmer_function->($word) : lc $word; $test ||= lc $word; # Not check if word matches if ( $test =~ /$match_regexp/ ) { $words[$pos] = "$begin$word$end"; # turn on display on both sides of word my $start = $pos - $Show_Words + 1; my $end = $pos + $Show_Words - 1; if ( $start < 0 ) { $end = $end - $start; $start = 0; } $end = $#words if $end > $#words; $flags[$_]++ for $start .. $end; # All done, and mark where to stop looking if ( $occurrences-- <= 0 ) { $last = $end; last; } } } $pos += 2; # Skip to next wordchar word } # Now build array of just words to display my @output; my $printing; my $first = 1; my $some_printed; for my $i ( 0 ..$#words ) { if ( $last && $i >= $last && $i < $#words ) { push @output, '...'; last; } # Is word marked to display? if ( $flags[$i] ) { push @output, '...' if !$printing++ && !$first; push @output, $words[$i]; $some_printed++; } else { $printing = 0; } $first = 0; } # if none highlighted show $Min_Words if ( !$some_printed ) { for my $i ( 0 .. $Min_Words ) { last if $i >= $#words; push @output, $words[$i]; } } push @output,'...' if !$printing; return join '', @output; }