#!/usr/bin/perl -w

# Small CGI program that does a normal perlmonks.org search
# and weighs the result depending on number of matched words
# and if it is a root node or a reply.


# TODO: remove common words like 'in' and 'from'?

# BUG: If search results in just one node, this will report 'none found'.

use strict;
use CGI;
use LWP::Simple;
use HTML::TokeParser;

# Buffering off
$| = 1;

my $q = CGI->new;

# Print out the html

print $q->header;

print $q->start_html(-title => 'Perlmonks weighted search');

print $q->h1('Perlmonks weighted search');

print '<p>Search <a href="http://www.perlmonks.org">perlmonks.org</a>:</p>';

print $q->start_form(-method => 'get', -action => $q->self_url);

print $q->textfield(-name => 'query');

print $q->br, $q->br;

print $q->submit(-name => 'search', -value => 'Search');

print $q->end_form;


# Do we have a search?
if($q->param('search') eq 'Search')
{
	print $q->hr;

	my $search = $q->param('query');

	# Do a search on perlmonks and weigh it:
	my @list = &get_weighted( $search );
	
	# Did we get any results?
	if(@list)
	{
		print "<p>Weighted results for <i>$search</i>:</p>";

		# Print the list
		print '<ul>';
		foreach(@list)
		{
			print "<li>$$_[1]</li>";
		}
		print '</ul>';
	}

	# No results :(
	else
	{
		print "<p>No results were found for <i>$search</i>.</p>";	
	}
}

print $q->end_html;


# The do-it-all sub

sub get_weighted
{
	# The search query:
	my $search = shift;
	
	# Split it up
	my @query = split ' ', $search;

	# ...and regroup (prob not necessary)
	my $search_words = join '%20', @query;


	# Fetch from the real thing:
	my $html = get "http://www.perlmonks.org/index.pl?node=$search_words" or die "$!";


	my $parser = HTML::TokeParser->new(\$html);

	# Place to store our node titles
	my @results = ();

	# We only want links within a list:
	while($parser->get_tag('li'))
	{
		# The first tag after the <li>
		my $token = $parser->get_token();

		# Avoid keyword nodelet
		next if($token->[1] !~ /^a$/i);

		# Get url for listing
		my $href = $token->[2]{href};

		# Get the node title
		$token = $parser->get_token;

		# Sanity check
		next if($token->[0] ne 'T');

		# Put together for display
		my $link = "<a href=\"http://www.perlmonks.org$href\">$token->[1]</a>";

		# Add to list
		push @results, $link;	

	}


	################

	# Here is the 'important' code:

	################

	# Temporary placeholder
	my @unsorted = ();

	for(0..$#results)
	{
		# Copy to two-dimensional array for later sorting:
		$unsorted[$_][1] = $results[$_];

		# Weight list depending on how many matching words it contains:
		foreach my $word (@query)
		{
			# Just ++ made it more unpredictable...
			$unsorted[$_][0] += ($unsorted[$_][0] || 1) if($results[$_] =~ /\b$word\b/i)
		}

		# Weight down the answers, not perfect
		# since it will not correctly take care of
		# Re(5) and friends. If that matters.
		my @re_count = $results[$_] =~ /(Re:)/gi;
		$unsorted[$_][0] -= @re_count;

		$unsorted[$_][1] .= "($unsorted[$_][0])";

		# Maybe we should weight up 'Answer:' too?
	}
	
	# Now weigh the unsorted list:
	my @weighted = sort{ $$b[0] <=> $$a[0] } @unsorted;
	
	return @weighted;
}