package VROOM::Search::Google;

use strict;
use VROOM::Search qw(escape_query unescape_sequence);
use Time::HiRes qw(gettimeofday);

@VROOM::Search::Google::ISA = qw(VROOM::Search);

#
# prepare_request($query, \%params)
#
# Builds the HTTP GET request for a search and stores it in
# $self->{request}.
#
#   $query   search string; passed through escape_query() (imported from
#            VROOM::Search) before being placed in the URL.
#   $params  hash reference of extra URL parameters. The special key
#            'baseurl' selects the host part of the URL (default
#            'http://www.google.com') and is not sent as a parameter.
#            'hl' defaults to 'en'. Both defaults are written back into
#            the caller's hash.
#
# Parameter values are appended to the URL exactly as given; nothing in
# this sub escapes them, so callers must pass values that are already
# URL-safe.
#
# Side effects: sets $self->{baseurl}, $self->{initime}, $self->{endtime}
# and $self->{request}.
#
# Returns the newly created HTTP::Request object.
#
sub prepare_request {
    my $self   = shift;
    my $query  = escape_query(shift);
    my $params = shift;

    my $default_base = 'http://www.google.com';
    $params->{baseurl} = $default_base unless defined $params->{baseurl};
    $params->{hl}      = 'en'          unless defined $params->{hl};

    my $uri = $self->{baseurl} = $params->{baseurl};
    $uri .= '/search?q=' . $query;

    # Sorted keys give a reproducible URL and do not depend on the state
    # of the hash's internal each() iterator.
    for my $name (sort keys %$params) {
        # Exact comparison: only the 'baseurl' key itself is special, not
        # every parameter whose name happens to contain that substring.
        next if $name eq 'baseurl';
        $uri .= '&' . $name . '=' . $params->{$name};
    }

    # Start and end time begin as the same timestamp.
    $self->{initime} = $self->{endtime} = [gettimeofday];

    return $self->{request} = HTTP::Request->new(GET => $uri);
}

sub store_results { my $self = shift; my $res = shift; $self->{endtime} = [gettimeofday]; if ($res->code != 200) { $self->{request} = undef; return undef; } # # Google doesn't return Content-Length, # so ($res->headers)->content_length will be zero. We're forced to # use Perl function - length. # $self->{fetch}++; $self->{pgsize} += length($res->content); # # If we reach here, HTTP response is OK. Proceed to parse the html # document for search results # my ($HIT, $ENTRY, $NEXT) = (0, 1, 2); my $rank = $self->count; my $hits = 0; my $wish = $HIT; my $result = undef; foreach (split(/(
|\n|<\/div>)/i, $res->content)) {
next if /^$/; # short circuit for blank lines
last if $wish == $NEXT;
if ($self->count == $self->maximum) {
$self->{request} = undef;
return $hits;
}
#print "#################################################\n";
#print $_, "\n";
#
# Ah,found some results. Get approximate results and wish to
# see the title/url of the first result.
#
if ($wish == $HIT && /Results.*?of.*?([0-9,]+).*?\./i) {
my $count = $1;
$self->approximate($count);
$wish = $ENTRY;
}
#
# Extract the url/title and wish to have abstract text
#
elsif ($wish == $ENTRY &&
/^(.*?)<\/a>.*?