# find_hits $file_name, @keywords; # # Returns a list of "score: line" strinsg. Example: # # If file "foo.txt" contains: # # Larry Wall, Programming Perl, Reference, x, y, perl # Peter Scott, Perl Medic, Legacy herding, z, a, perl # # A call to this function might look like this: # # my @hits = find_hits "foo.txt", "medic"; # # And it would return a one-element list: # # '32: Peter Scot...a, perl' sub find_hits( $ @ ) { my ( $file, @keywords ) = @_; # compose a regex for quick rejection of non-matching lines: my $any_keyword_re = join '|', map quotemeta $_, @keywords; $any_keyword_re = qr/$any_keyword_re/i; # and a keyword for the whole phrase my $phrase_re = join '\W+', @keywords; $phrase_re = qr/$phrase_re/i; # open input file for read open my $in, $file or croak "opening $file for read: $!"; my @rv; # for accumulating return values while ( <$in> ) { # reject lines with no matches out of hand next unless m/$any_keyword_re/; # any match at all is one point. my $score = 1; # split into fields for further scoring. my ( undef, $title, $desc, undef, undef, $keys ) = split /\t/; # title matches are worth 5 points each while ( $title =~ m/$any_keyword_re/g ) { $score += 5 } # description matches are only 1 point while ( $desc =~ m/$any_keyword_re/g ) { $score += 1 } # keyword matches are 4 points while ( $keys =~ m/$any_keyword_re/g ) { $score += 4 } # phrase matches (against entire line) are 10 points while ( m/$phrase_re/g ) { $score += 10 } # multiple matches are worth 10x the number # of keywords that matched. my $n_matches = () = m/$any_keyword_re/g; # see perlfaq4 if ( $n_matches > 1 ) { $score += 10*$n_matches } # finally, format $score and save for returning # to the caller push @rv, sprintf "%03d: %s", $score, $_; } return @rv; }