Re^3: searching via www::search on alltheweb

Replies are listed 'Best First'.
Re^4: searching via www::search on alltheweb by coder45 (Initiate) on Feb 08, 2006 at 15:46 UTC
I haven't been able to get vroom working, although I have manually copied and pasted the pm files into the appropriate c:\perl\site\lib\vroom\search\google.pm and c:\perl\site\lib\vroom\vroom.pm directories, it still says vroom\search\google not found and lists the above directories as to where it should be found, the package I am using is this: package VROOM::Search::Google; use strict; use VROOM::Search qw(escape_query unescape_sequence); use Time::HiRes qw(gettimeofday); @VROOM::Search::Google::ISA = qw(VROOM::Search); sub prepare_request { my $self = shift; my $query = escape_query(shift); my $params = shift; my $uri = 'http://www.google.com'; $params->{baseurl} = $uri unless defined $params->{baseurl}; $params->{hl} = 'en' unless defined $params->{hl}; $self->{baseurl} = $uri = $params->{baseurl}; $uri .= '/search?q='.$query; while (my ($name, $value) = each %$params) { next if $name =~ /baseurl/; $uri .= '&'.$name.'='.$value; } $self->{initime} = $self->{endtime} = [gettimeofday]; $self->{request} = new HTTP::Request(GET => $uri); } sub store_results { my $self = shift; my $res = shift; $self->{endtime} = [gettimeofday]; if ($res->code != 200) { $self->{request} = undef; return undef; } # # Google doesn't return Content-Length, # so ($res->headers)->content_length will be zero. We're forced to # use Perl function - length. # $self->{fetch}++; $self->{pgsize} += length($res->content); # # If we reach here, HTTP response is OK. Proceed to parse the html # document for search results # my ($HIT, $ENTRY, $NEXT) = (0, 1, 2); my $rank = $self->count; my $hits = 0; my $wish = $HIT; my $result = undef; foreach (split(/(<p>\|\n\|<\/div>)/i, $res->content)) { next if /^$/; # short circuit for blank lines last if $wish == $NEXT; if ($self->count == $self->maximum) { $self->{request} = undef; return $hits; } #print "#################################################\n"; #print $_, "\n"; # # Ah,found some results. 
Get approximate results and wish to # see the title/url of the first result. # if ($wish == $HIT && /Results.?of.?([0-9,]+).?\./i) { my $count = $1; $self->approximate($count); $wish = $ENTRY; } # # Extract the url/title and wish to have abstract text # elsif ($wish == $ENTRY && /^<a href=(.?)>(.?)<\/a><br><font.?>(.?)$/i) { my $url = $1; my $title = $2; my $abstract = $3; $url =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; $url =~ s/(^http:\/\/\|\/(index.htm\|index.html)$)//g; $title =~ s/<.?>//g; $result = new VROOM::Search::Result; $result->url($url); $result->title(unescape_sequence($title)); $result->text(unescape_sequence($abstract)); $result->rank(++$rank); $result->engine('Google'); $self->add_result($result); $self->{pool}->insert($result) if $self->{pool}; $hits++; } # # Extract the url for the next page # elsif ($wish == $ENTRY && /<td nowrap><a href=(.?)>.?<span.?>Next<\/span><\/a>/i) +{ $self->{request}->uri($self->{baseurl}.$1); $wish = $NEXT; } } # # This is important. It signals the search agent not to fetch more + pages. # $self->{request} = undef if $wish != $NEXT; return $hits; } 1; __END__ [download] my perl code is this: `#! Perl\bin\perl -w use VROOM::Search::Google; open FILE1, "> sample1.txt" or die "$!"; my $oSearch = new VROOM::Search::Google( ); my $sQuery = VROOM::Search::Google::escape_query('"telefonos" "mundial +"'); $oSearch->native_query($sQuery); while ( my $oResult = $oSearch->next_result() ) { print "Adding: ", $oResult->url, "\n"; print FILE1 $oResult->url, "\n"; } print ref($oSearch);` [download] I have still not been able to figure out how to amend the languages hl = es in tis case instead of en. Any help will be appreciated.	[reply] [d/l] [select]

Replies are listed 'Best First'.

Re^4: searching via www::search on alltheweb
by coder45 (Initiate) on Feb 08, 2006 at 15:46 UTC


package VROOM::Search::Google;

use strict;

use VROOM::Search qw(escape_query unescape_sequence);
use Time::HiRes   qw(gettimeofday);

@VROOM::Search::Google::ISA = qw(VROOM::Search);

#
# Build the HTTP GET request for a Google search and store it in
# $self->{request}.
#
# Arguments (positional):
#   $self   - the search object
#   $query  - the query string; it is passed through escape_query() here
#   $params - optional hash reference of extra URL parameters. Two keys
#             are special:
#               baseurl - scheme + host to query (default
#                         'http://www.google.com'); it is remembered in
#                         $self->{baseurl} and never sent as a parameter
#               hl      - interface language, default 'en'
#             Missing defaults are written back into this hash.
#
# Side effects: sets $self->{baseurl}, $self->{initime},
# $self->{endtime} and $self->{request}.
#
# Returns the new HTTP::Request object.
#
sub prepare_request
{
    my $self   = shift;
    my $query  = escape_query(shift);
    my $params = shift;
    my $uri    = 'http://www.google.com';

    $params->{baseurl} = $uri unless defined $params->{baseurl};
    $params->{hl}      = 'en' unless defined $params->{hl};

    $self->{baseurl} = $uri = $params->{baseurl};

    $uri .= '/search?q='.$query;

    #
    # Iterate over sorted keys rather than each(): the resulting URL is
    # then deterministic and does not depend on where a previous each()
    # left the hash iterator.
    #
    for my $name (sort keys %$params) {
        # Exact comparison: only the 'baseurl' key itself is internal.
        # A pattern match would also drop any name containing it.
        next if $name eq 'baseurl';
        $uri .= '&'.$name.'='.$params->{$name};
    }

    # Both timestamps start out as the same moment; store_results
    # replaces endtime when the response arrives.
    $self->{initime} = $self->{endtime} = [gettimeofday];

    return $self->{request} = HTTP::Request->new(GET => $uri);
}

#
# Parse one page of Google results out of an HTTP response and record
# every hit on the search object.
#
# Arguments (positional):
#   $self - the search object
#   $res  - the response object; code() and content() are called on it
#
# Side effects: updates $self->{endtime}, $self->{fetch} and
# $self->{pgsize}; calls $self->approximate() with the reported total;
# adds each hit through $self->add_result() (and $self->{pool}->insert()
# when a pool is set); leaves $self->{request} pointing at the next
# page, or sets it to undef when there is nothing more to fetch.
#
# Returns the number of hits added from this page, or undef when the
# response status is not 200.
#
sub store_results
{
    my $self = shift;
    my $res  = shift;

    $self->{endtime} = [gettimeofday];

    if ($res->code != 200) {
        $self->{request} = undef;
        # Kept as an explicit undef (not a bare return) so the value
        # handed back to existing callers is unchanged.
        return undef;
    }

    #
    # Google doesn't return Content-Length,
    # so ($res->headers)->content_length will be zero. We're forced to
    # use Perl function - length.
    #
    $self->{fetch}++;
    $self->{pgsize} += length($res->content);

    #
    # If we reach here, HTTP response is OK. Proceed to parse the html
    # document for search results
    #
    # $wish is a small state machine: $HIT waits for the "Results ... of N"
    # banner, $ENTRY collects result rows, $NEXT means the link to the
    # following page was found and parsing can stop.
    #
    my ($HIT, $ENTRY, $NEXT) = (0, 1, 2);
    my $rank = $self->count;
    my $hits = 0;
    my $wish = $HIT;

    #
    # A named loop variable is used instead of $_ because the body calls
    # several methods between matches.
    #
    foreach my $chunk (split(/(<p>|\n|<\/div>)/i, $res->content)) {
        next if $chunk =~ /^$/;     # short circuit for blank lines
        last if $wish == $NEXT;

        if ($self->count == $self->maximum) {
            $self->{request} = undef;
            return $hits;
        }

        #
        # Ah, found some results. Get approximate results and wish to
        # see the title/url of the first result.
        #
        if ($wish == $HIT && $chunk =~ /Results.*?of.*?([0-9,]+).*?\./i) {
            my $count = $1;
            $self->approximate($count);
            $wish = $ENTRY;
        }
        #
        # Extract the url/title and wish to have abstract text
        #
        elsif ($wish == $ENTRY &&
               $chunk =~ /^<a href=(.*?)>(.*?)<\/a><br><font.*?>(.*?)$/i) {

            my $url      = $1;
            my $title    = $2;
            my $abstract = $3;

            # Decode %XX escapes, then drop the leading scheme and a
            # trailing "/" or "/index.htm(l)". The dot is escaped so
            # only a literal "." matches.
            $url   =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
            $url   =~ s{^http://|/(?:index\.html?)*$}{}g;
            $title =~ s/<.*?>//g;

            my $result = VROOM::Search::Result->new;
            $result->url($url);
            $result->title(unescape_sequence($title));
            $result->text(unescape_sequence($abstract));
            $result->rank(++$rank);
            $result->engine('Google');

            $self->add_result($result);
            $self->{pool}->insert($result) if $self->{pool};
            $hits++;
        }
        #
        # Extract the url for the next page
        #
        elsif ($wish == $ENTRY &&
               $chunk =~ /<td nowrap><a href=(.*?)>.*?<span.*?>Next<\/span><\/a>/i) {

            $self->{request}->uri($self->{baseurl}.$1);

            $wish = $NEXT;
        }
    }

    #
    # This is important. It signals the search agent not to fetch more
    # pages.
    #
    $self->{request} = undef if $wish != $NEXT;

    return $hits;
}

1;

__END__
[download]

#! Perl\bin\perl -w

#
# Run a Google search through VROOM::Search::Google, echo the URL of
# every result to STDOUT and write the same URLs, one per line, to
# sample1.txt. Finally print the class name of the search object.
#

use strict;
use warnings;

use VROOM::Search::Google;

open my $out_fh, '>', 'sample1.txt' or die "$!";

my $oSearch = VROOM::Search::Google->new();

#
# The phrase list must stay on one line: a line break inside the quotes
# becomes part of the query that is sent.
#
# NOTE(review): prepare_request already runs escape_query on the query
# it receives; if native_query forwards $sQuery unchanged, escaping it
# here as well would double-encode it - confirm against VROOM::Search.
#
my $sQuery = VROOM::Search::Google::escape_query('"telefonos" "mundial"');

#
# NOTE(review): prepare_request only defaults hl to 'en' when its params
# hash has no hl entry. Presumably native_query accepts an options hash
# reference (e.g. { hl => 'es' }) and passes it on - confirm.
#
$oSearch->native_query($sQuery);

while ( my $oResult = $oSearch->next_result() ) {
    print "Adding: ", $oResult->url, "\n";
    print {$out_fh} $oResult->url, "\n";
}

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "close sample1.txt: $!";

print ref($oSearch);
[download]

[reply]
[d/l]
[select]