in reply to WWW::Spyder check url before loading it?
```perl
package Dutchman70;

use strict;
use warnings;
use parent 'WWW::Spyder';

use Carp qw(carp);
use Digest::MD5 qw(md5_base64);
use HTTP::Request;
use URI::URL;

our $VERBOSITY = 0;    # local verbosity flag; the parent module keeps its own

sub crawl {
    my $self     = shift;
    my $opts     = shift || undef;
    my $excludes = [];

    # Exclude list option.
    if ( ref( $opts->{exclude} ) eq 'ARRAY' ) {
        $excludes = $opts->{exclude};
    }

    while ('I have pages to get...') {

        $self->_exit_check and return;

        my $skip_url = 0;
        my $enQ      = undef;

        # Report a page with a 404 error in the title if report_broken_links
        # is enabled. Also keep processing if we're looking for img src tags.
        if ( $self->{report_broken_links} || $self->{image_checking} ) {
            $enQ = $self->_choose_courteously || $self->_just_choose
                or return;    # guard: queue exhausted, don't call ->url on undef
        }
        else {
            $enQ = $self->_choose_courteously
                || $self->_just_choose
                || return;
        }

        my $url = $enQ->url;

        # Skip this URL if it matches anything in our exclude list.
        for (@$excludes) {
            $skip_url = 1 if $url =~ m/$_/;
        }
        next if $skip_url;

        $self->url($url);
        $self->_current_enQ($enQ);

        print "GET'ing: $url\n" if $VERBOSITY;

        my $response = $self->UA->request    # no redirects &c is simple_request
            ( HTTP::Request->new( GET => "$url" ) );

        print STDERR "\a" if $self->bell;

        $response or carp "$url failed GET!" and next;

        push @{ $self->{_courtesy_Queue} }, $enQ->domain;
        shift @{ $self->{_courtesy_Queue} }
            if $self->{_courtesy_Queue}
            and @{ $self->{_courtesy_Queue} } > 100;

        my $head = $response->headers_as_string;
        $head or carp "$url has no HEAD!" and next;    # no headless webpages
        length($head) > 1_024 and $head = substr( $head, 0, 1_024 );
        print $head, "\n" if $VERBOSITY > 2;

        my $base;
        eval { $base = $response->base };
        $base
            or carp "$url has no discernible BASE!"
            and next;                                  # no baseless webpages

        # WE SHOULD also look for <HTML> because some servers that we might
        # want to look at don't properly report the content-type.

        # Start over unless this is something we can read.
        my $title       = '';
        my $description = '';
        my $is_image    = 0;

        # Make an exception for images.
        if ( $self->{image_checking} ) {
            if ( $head =~ /Content-Type:\s*image/i ) {
                my ($img_size) = $head =~ /Content-Length:\s*(\d+)/i;
                if ( !defined $img_size or $img_size <= 0 ) {
                    # Zero-length (or unreported) image body: treat as broken.
                    # Note that "next" restarts the loop here, so this title
                    # never reaches the broken-links check below.
                    $title = $description = '404 Not Found';
                    next;
                }
                else {
                    $is_image = 1;
                }
            }
        }
        else {
            lc($head) =~ /content-type:\s?(?:text|html)/
                or carp "$url doesn't look like TEXT or HTML!"
                and next;    # no weird media, movies, flash, etc
        }

        ($title) = $head =~ m,[Tt]itle:\s*(.+)\n, unless $title;
        ($description) = $head =~ /[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i
            unless $description;

        # Add this link to our dead-links list if the title matches a
        # standard "404 Not Found" error.
        if ( $title && $self->{report_broken_links} ) {
            push @{ $self->{broken_links} }, $url
                if $title =~ /^\s*404\s+Not\s+Found\s*$/;
        }

        $description = $self->_snip($description) if $description;

        my $page = $response->content
            or carp "Failed to fetch $url."
            and next;    # no empty pages, start over with next url

        $self->{_current_Bytes} = length($page);
        $self->spyder_data( $self->{_current_Bytes} );

        # We are going to use a digest to prevent parsing identical
        # content received via a different URL.
        my $digest = md5_base64($page);    # unique microtag of the page

        # So if we've seen it before, start over with the next URL.
        $self->{_page_Memory}{$digest}++
            and carp "Seen this page's content before: $url"
            and next;

        $self->{_page_content} = $page;

        print "PARSING: $url\n" if $VERBOSITY > 1;

        $self->{_spydered}{$url}++;
        $self->html_parser->parse($page);
        $self->html_parser->eof;

        $self->{_adjustment} = $self->_parse_for_terms if $self->terms;

        # Make links absolute and fix bad spacing in link names, then turn
        # them into Enqueue objects.
        for my $pair ( @{ $self->{_enqueue_Objects} } ) {
            my $abs_url;
            eval { $abs_url = URI::URL::url( $pair->[0], $base )->abs };
            my $name = $self->_snip( $pair->[1] );
            $pair = WWW::Spyder::Enqueue->new( "$abs_url", $name );
        }

        # Put links into the queue(s).
        $self->_stack_urls() if $self->_links;

        # Clean up text a bit. Should this be here...?
        if ( $self->{_text} and ${ $self->{_text} } ) {
            ${ $self->{_text} } =~ s/(?:\s*[\r\n]){3,}/\n\n/g;
        }

        # In the future the Page object should be installed like parsers,
        # as a reusable container.
        my $Page = WWW::Spyder::Page->new(
            title       => $title,
            text        => $self->{_text},
            raw         => \$page,
            url         => $enQ->url,
            domain      => $enQ->domain,
            link_name   => undef,
            link        => undef,
            description => $description || '',
            pages_enQs  => $self->_enqueue,
        );

        $self->_reset;    # <<-- clear out things that might remain
        return $Page;
    }
}

1;
```
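For context, a minimal driver sketch showing how the `exclude` option and the `broken_links` list from the code above could be used. The constructor arguments (`seed => ...`) and the `WWW::Spyder::Page` accessors are assumptions based on WWW::Spyder's usual synopsis, not something this code guarantees; check your version's docs.

```perl
#!/usr/bin/perl
use strict;
use warnings;
use Dutchman70;

# Assumed constructor style; WWW::Spyder's options vary by version.
my $spyder = Dutchman70->new( seed => 'http://example.com/' );
$spyder->{report_broken_links} = 1;    # flag read directly by crawl() above

# crawl() returns one WWW::Spyder::Page per call; URLs matching any
# exclude pattern are skipped before the GET is issued.
while ( my $page = $spyder->crawl( { exclude => [ qr/\.pdf$/i, qr/logout/i ] } ) ) {
    print $page->url, "\n";
}

# URLs whose fetched title was literally "404 Not Found".
print "broken: $_\n" for @{ $spyder->{broken_links} || [] };
```

Since crawl() interpolates each exclude entry straight into `m/$_/`, plain strings work too, but precompiled `qr//` patterns keep any regex metacharacters in the URLs-to-skip list from misfiring.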
Re^2: WWW::Spyder check url before loading it?
by dutchman70 (Initiate) on Feb 16, 2010 at 00:45 UTC

Re^2: WWW::Spyder check url before loading it?
by dutchman70 (Initiate) on Feb 15, 2010 at 01:54 UTC