#!/usr/bin/perl -w # Small CGI program that does a normal perlmonks.org search # and weighs the result depending on number of matched words # and if it is a root node or a reply. # TODO: remove common words like 'in' and 'from'? # BUG: If search results in just one node, this will report 'none found'. use strict; use CGI; use LWP::Simple; use HTML::TokeParser; # Buffering off $| = 1; my $q = CGI->new; # Print out the html print $q->header; print $q->start_html(-title => 'Perlmonks weighted search'); print $q->h1('Perlmonks weighted search'); print '

Search perlmonks.org:

'; print $q->start_form(-method => 'get', -action => $q->self_url); print $q->textfield(-name => 'query'); print $q->br, $q->br; print $q->submit(-name => 'search', -value => 'Search'); print $q->end_form; # Do we have a search? if($q->param('search') eq 'Search') { print $q->hr; my $search = $q->param('query'); # Do a search on perlmonks and weigh it: my @list = &get_weighted( $search ); # Did we get any results? if(@list) { print "

Weighted results for $search:

"; # Print the list print ''; } # No results :( else { print "

No results were found for $search.

"; } } print $q->end_html; # The do-it-all sub sub get_weighted { # The search query: my $search = shift; # Split it up my @query = split ' ', $search; # ...and regroup (prob not necessary) my $search_words = join '%20', @query; # Fetch from the real thing: my $html = get "http://www.perlmonks.org/index.pl?node=$search_words" or die "$!"; my $parser = HTML::TokeParser->new(\$html); # Place to store our node titles my @results = (); # We only want links within a list: while($parser->get_tag('li')) { # The first tag after the
  • my $token = $parser->get_token(); # Avoid keyword nodelet next if($token->[1] !~ /^a$/i); # Get url for listing my $href = $token->[2]{href}; # Get the node title $token = $parser->get_token; # Sanity check next if($token->[0] ne 'T'); # Put together for display my $link = "$token->[1]"; # Add to list push @results, $link; } ################ # Here is the 'important' code: ################ # Temporary placeholder my @unsorted = (); for(0..$#results) { # Copy to two-dimensional array for later sorting: $unsorted[$_][1] = $results[$_]; # Weight list depending on how many matching words it contains: foreach my $word (@query) { # Just ++ made it more unpredictable... $unsorted[$_][0] += ($unsorted[$_][0] || 1) if($results[$_] =~ /\b$word\b/i) } # Weight down the answers, not perfect # since it will not correctly take care of # Re(5) and friends. If that matters. my @re_count = $results[$_] =~ /(Re:)/gi; $unsorted[$_][0] -= @re_count; $unsorted[$_][1] .= "($unsorted[$_][0])"; # Maybe we should weight up 'Answer:' too? } # Now weigh the unsorted list: my @weighted = sort{ $$b[0] <=> $$a[0] } @unsorted; return @weighted; }