s[\b($word)\b][$1]g;
####
Highlighting works by this call:
$text =
highlight( \$text, $stemmer_function, set_match_regexp( \%heads ) );
First, this builds the three regular expressions:
-------------------------------------------------
sub set_match_regexp {
my $header = shift;
my ($query, $wc, $ignoref, $ignorel ) =
@{$header}{'parsed words',
qw/wordcharacters ignorefirstchar ignorelastchar/};
$wc = quotemeta $wc;
# build regexp to match against the query words
# If a trailing "*" is found then do a wild card search.
my $match_string =
join '|',
map { substr( $_, -1, 1 ) eq '*'
? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?"
: quotemeta
}
# get rid of operator words and characters
grep { ! /^(and|or|not|["()=])$/oi }
split /\s+/, $query;
# These are to remove and capture the ignorefirst
# and ingnorelast chars
for ( $ignoref, $ignorel ) {
if ( $_ ) {
$_ = quotemeta;
$_ = "([$_]*)";
} else {
$_ = '()';
}
}
# Avoid /i matching...
$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
# return the three compiled expressions
# 1) used to split source into swish/non-swish words
# 2) used to strip ignorefirst/ignorelast
# 3) used to match final word against query word(s)
return (
qr/([^$wc]+)/o,
qr/^$ignoref([$wc]+?)$ignorel$/io,
qr/^$match_string$/,
);
And here's the time killer:
---------------------------
sub highlight {
my ( $text_ref, $stemmer_function,
$wc_regexp, $extract_regexp, $match_regexp ) = @_;
my $last = 0;
# Split source into swish-words and non-swish words.
# Every other element is a swish-word due to the capture
my @words = split /$wc_regexp/, $$text_ref;
# This keeps track of what words to print (context printing)
my @flags;
$flags[$#words] = 0; # Extend array.
my $occurrences = $Occurrences; # limit number of matches
my $pos = 0;
while ( $pos <= $#words ) {
# pull out the (begin_chars, match word, end_chars)
if ( $words[$pos] =~ /$extract_regexp/ ) {
my ( $begin, $word, $end ) = ( $1, $2, $3 );
# Stem word if enabled, else just use lower case
my $test = $stemmer_function
? $stemmer_function->($word)
: lc $word;
$test ||= lc $word;
# Not check if word matches
if ( $test =~ /$match_regexp/ ) {
$words[$pos] = "$begin$word$end";
# turn on display on both sides of word
my $start = $pos - $Show_Words + 1;
my $end = $pos + $Show_Words - 1;
if ( $start < 0 ) {
$end = $end - $start;
$start = 0;
}
$end = $#words if $end > $#words;
$flags[$_]++ for $start .. $end;
# All done, and mark where to stop looking
if ( $occurrences-- <= 0 ) {
$last = $end;
last;
}
}
}
$pos += 2; # Skip to next wordchar word
}
# Now build array of just words to display
my @output;
my $printing;
my $first = 1;
my $some_printed;
for my $i ( 0 ..$#words ) {
if ( $last && $i >= $last && $i < $#words ) {
push @output, '...';
last;
}
# Is word marked to display?
if ( $flags[$i] ) {
push @output, '...' if !$printing++ && !$first;
push @output, $words[$i];
$some_printed++;
} else {
$printing = 0;
}
$first = 0;
}
# if none highlighted show $Min_Words
if ( !$some_printed ) {
for my $i ( 0 .. $Min_Words ) {
last if $i >= $#words;
push @output, $words[$i];
}
}
push @output,'...' if !$printing;
return join '', @output;
}