in reply to WWW::Spyder check url before loading it?
```perl
package Dutchman70;

use strict;
use warnings;
use parent 'WWW::Spyder';

use Carp qw(carp);
use Digest::MD5 qw(md5_base64);
use HTTP::Request;
use URI::URL;

our $VERBOSITY = 0;    # local verbosity flag; the parent module keeps its own

sub crawl {
    my $self     = shift;
    my $opts     = shift || undef;
    my $excludes = [];

    # Exclude list option.
    if ( ref( $opts->{exclude} ) eq 'ARRAY' ) {
        $excludes = $opts->{exclude};
    }

    while ('I have pages to get...') {

        $self->_exit_check and return;

        my $skip_url = 0;
        my $enQ      = undef;

        # Report a page with a 404 error in the title if report_broken_links
        # is enabled. Also keep processing if we're looking for img src tags.
        if ( $self->{report_broken_links} || $self->{image_checking} ) {
            $enQ = $self->_choose_courteously || $self->_just_choose
                or return;    # guard: queue exhausted, don't call ->url on undef
        }
        else {
            $enQ = $self->_choose_courteously
                || $self->_just_choose
                || return;
        }

        my $url = $enQ->url;

        # Skip this URL if it matches anything in our exclude list.
        for (@$excludes) {
            $skip_url = 1 if $url =~ m/$_/;
        }
        next if $skip_url;

        $self->url($url);
        $self->_current_enQ($enQ);

        print "GET'ing: $url\n" if $VERBOSITY;

        my $response = $self->UA->request    # no redirects &c is simple_request
            ( HTTP::Request->new( GET => "$url" ) );

        print STDERR "\a" if $self->bell;

        $response or carp "$url failed GET!" and next;

        push @{ $self->{_courtesy_Queue} }, $enQ->domain;
        shift @{ $self->{_courtesy_Queue} }
            if $self->{_courtesy_Queue}
            and @{ $self->{_courtesy_Queue} } > 100;

        my $head = $response->headers_as_string;
        $head or carp "$url has no HEAD!" and next;    # no headless webpages
        length($head) > 1_024 and $head = substr( $head, 0, 1_024 );
        print $head, "\n" if $VERBOSITY > 2;

        my $base;
        eval { $base = $response->base };
        $base
            or carp "$url has no discernible BASE!"
            and next;                                  # no baseless webpages

        # WE SHOULD also look for <HTML> because some servers that we might
        # want to look at don't properly report the content-type.

        # Start over unless this is something we can read.
        my $title       = '';
        my $description = '';
        my $is_image    = 0;

        # Make an exception for images.
        if ( $self->{image_checking} ) {
            if ( $head =~ /Content-Type:\s*image/i ) {
                my ($img_size) = $head =~ /Content-Length:\s*(\d+)/i;
                if ( !defined $img_size or $img_size <= 0 ) {
                    # Zero-length (or unreported) image body: treat as broken.
                    # Note that "next" restarts the loop here, so this title
                    # never reaches the broken-links check below.
                    $title = $description = '404 Not Found';
                    next;
                }
                else {
                    $is_image = 1;
                }
            }
        }
        else {
            lc($head) =~ /content-type:\s?(?:text|html)/
                or carp "$url doesn't look like TEXT or HTML!"
                and next;    # no weird media, movies, flash, etc
        }

        ($title) = $head =~ m,[Tt]itle:\s*(.+)\n, unless $title;
        ($description) = $head =~ /[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i
            unless $description;

        # Add this link to our dead-links list if the title matches a
        # standard "404 Not Found" error.
        if ( $title && $self->{report_broken_links} ) {
            push @{ $self->{broken_links} }, $url
                if $title =~ /^\s*404\s+Not\s+Found\s*$/;
        }

        $description = $self->_snip($description) if $description;

        my $page = $response->content
            or carp "Failed to fetch $url."
            and next;    # no empty pages, start over with next url

        $self->{_current_Bytes} = length($page);
        $self->spyder_data( $self->{_current_Bytes} );

        # We are going to use a digest to prevent parsing identical
        # content received via a different URL.
        my $digest = md5_base64($page);    # unique microtag of the page

        # So if we've seen it before, start over with the next URL.
        $self->{_page_Memory}{$digest}++
            and carp "Seen this page's content before: $url"
            and next;

        $self->{_page_content} = $page;

        print "PARSING: $url\n" if $VERBOSITY > 1;

        $self->{_spydered}{$url}++;
        $self->html_parser->parse($page);
        $self->html_parser->eof;

        $self->{_adjustment} = $self->_parse_for_terms if $self->terms;

        # Make links absolute and fix bad spacing in link names, then turn
        # them into Enqueue objects.
        for my $pair ( @{ $self->{_enqueue_Objects} } ) {
            my $abs_url;
            eval { $abs_url = URI::URL::url( $pair->[0], $base )->abs };
            my $name = $self->_snip( $pair->[1] );
            $pair = WWW::Spyder::Enqueue->new( "$abs_url", $name );
        }

        # Put links into the queue(s).
        $self->_stack_urls() if $self->_links;

        # Clean up text a bit. Should this be here...?
        if ( $self->{_text} and ${ $self->{_text} } ) {
            ${ $self->{_text} } =~ s/(?:\s*[\r\n]){3,}/\n\n/g;
        }

        # In the future the Page object should be installed like parsers,
        # as a reusable container.
        my $Page = WWW::Spyder::Page->new(
            title       => $title,
            text        => $self->{_text},
            raw         => \$page,
            url         => $enQ->url,
            domain      => $enQ->domain,
            link_name   => undef,
            link        => undef,
            description => $description || '',
            pages_enQs  => $self->_enqueue,
        );

        $self->_reset;    # <<-- clear out things that might remain
        return $Page;
    }
}

1;
```
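For context, a minimal driver sketch showing how the `exclude` option and the `broken_links` list from the code above could be used. The constructor arguments (`seed => ...`) and the `WWW::Spyder::Page` accessors are assumptions based on WWW::Spyder's usual synopsis, not something this code guarantees; check your version's docs.

```perl
#!/usr/bin/perl
use strict;
use warnings;
use Dutchman70;

# Assumed constructor style; WWW::Spyder's options vary by version.
my $spyder = Dutchman70->new( seed => 'http://example.com/' );
$spyder->{report_broken_links} = 1;    # flag read directly by crawl() above

# crawl() returns one WWW::Spyder::Page per call; URLs matching any
# exclude pattern are skipped before the GET is issued.
while ( my $page = $spyder->crawl( { exclude => [ qr/\.pdf$/i, qr/logout/i ] } ) ) {
    print $page->url, "\n";
}

# URLs whose fetched title was literally "404 Not Found".
print "broken: $_\n" for @{ $spyder->{broken_links} || [] };
```

Since crawl() interpolates each exclude entry straight into `m/$_/`, plain strings work too, but precompiled `qr//` patterns keep any regex metacharacters in the URLs-to-skip list from misfiring.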
Re^2: WWW::Spyder check url before loading it?
by dutchman70 (Initiate) on Feb 16, 2010 at 00:45 UTC

Re^2: WWW::Spyder check url before loading it?
by dutchman70 (Initiate) on Feb 15, 2010 at 01:54 UTC