Analyze for what?
package Dutchman70;
use strict;
use warnings;
use parent 'WWW::Spyder';

# The overridden crawl() below uses several modules that WWW::Spyder loads
# for itself; the subclass needs its own copies to compile under strict.
use Carp;
use Digest::MD5 qw(md5_base64);
use HTTP::Request;
use URI::URL;

our $VERBOSITY = 0;    # crawl() reads a package global; set > 0 for progress output

sub crawl {
    my $self     = shift;
    my $opts     = shift || {};
    my $excludes = [];

    # Exclude list option.
    if ( ref($opts->{exclude}) eq 'ARRAY' ) {
        $excludes = $opts->{exclude};
    }

    while ('I have pages to get...') {
        $self->_exit_check and return;
        my $skip_url = 0;
        my $enQ;

        # Report a page with a 404 error in the title if report_broken_links
        # is enabled. Also keep processing if we're looking for img src tags.
        if ( $self->{report_broken_links} || $self->{image_checking} ) {
            $enQ = $self->_choose_courteously || $self->_just_choose;
        }
        else {
            $enQ = $self->_choose_courteously || $self->_just_choose || return;
        }

        my $url = $enQ->url;

        # Skip this URL if it matches anything in our excluded list.
        for (@$excludes) {
            $skip_url = 1 if $url =~ m/$_/;
        }
        next if $skip_url;

        $self->url($url);
        $self->_current_enQ($enQ);

        print "GET'ing: $url\n" if $VERBOSITY;

        my $response = $self->UA->request    # no redirects &c is simple_
            ( HTTP::Request->new( GET => "$url" ) );

        print STDERR "\a" if $self->bell;
        $response or carp "$url failed GET!" and next;

        push @{ $self->{_courtesy_Queue} }, $enQ->domain;
        shift @{ $self->{_courtesy_Queue} }
            if $self->{_courtesy_Queue}
            and @{ $self->{_courtesy_Queue} } > 100;

        my $head = $response->headers_as_string;
        $head or carp "$url has no HEAD!" and next;    # no headless webpages
        length($head) > 1_024 and $head = substr( $head, 0, 1_024 );
        print $head, "\n" if $VERBOSITY > 2;

        my $base;
        eval { $base = $response->base };
        $base or carp "$url has no discernible BASE!" and next;    # no baseless webpages

        # WE SHOULD also look for <HTML> because some servers that we might
        # want to look at don't properly report the content-type.

        # Start over unless this is something we can read.
        my $title       = '';
        my $description = '';
        my $is_image    = 0;

        # Make an exception for images.
        if ( $self->{image_checking} ) {
            if ( $head =~ /Content-Type:\s*image/i ) {
                my ($img_size) = $head =~ /Content-Length:\s*(\d+)/i;
                if ( !defined $img_size or $img_size <= 0 ) {
                    # Record the dead image here: "next" restarts the loop
                    # before the title-based broken-link check below runs.
                    push @{ $self->{broken_links} }, $url
                        if $self->{report_broken_links};
                    next;
                }
                else {
                    $is_image = 1;
                }
            }
        }
        else {
            lc($head) =~ /content-type:\s?(?:text|html)/
                or carp "$url doesn't look like TEXT or HTML!"
                and next;    # no weird media, movies, flash, etc
        }

        ($title) = $head =~ m,[Tt]itle:\s*(.+)\n, unless $title;
        ($description) = $head =~ /[^:]*?DESCRIPTION:\s*((?:[^\n]+(?:\n )?)+)/i
            unless $description;

        # Add this link to our dead links list if the title matches a
        # standard "404 Not Found" error.
        if ( $title && $self->{report_broken_links} ) {
            push @{ $self->{broken_links} }, $url
                if $title =~ /^\s*404\s+Not\s+Found\s*$/;
        }

        $description = $self->_snip($description) if $description;

        my $page = $response->content
            or carp "Failed to fetch $url."
            and next;    # no empty pages, start over with next url

        $self->{_current_Bytes} = length($page);
        $self->spyder_data( $self->{_current_Bytes} );

        # We are going to use a digest to prevent parsing identical content
        # received via a different URL.
        my $digest = md5_base64($page);    # unique microtag of the page

        # So if we've seen it before, start over with the next URL.
        $self->{_page_Memory}{$digest}++
            and carp "Seen this page's content before: $url"
            and next;

        $self->{_page_content} = $page;
        print "PARSING: $url\n" if $VERBOSITY > 1;
        $self->{_spydered}{$url}++;

        $self->html_parser->parse($page);
        $self->html_parser->eof;
        $self->{_adjustment} = $self->_parse_for_terms if $self->terms;

        # Make links absolute and fix bad spacing in link names, then turn
        # them into an Enqueue object.
        for my $pair ( @{ $self->{_enqueue_Objects} } ) {
            my $link;    # renamed from $url to avoid shadowing the outer variable
            eval { $link = URI::URL::url( $pair->[0], $base )->abs };
            my $name = $self->_snip( $pair->[1] );
            $pair = WWW::Spyder::Enqueue->new( "$link", $name );
        }

        # Put links into the queue(s).
        $self->_stack_urls() if $self->_links;

        # Clean up text a bit. Should this be here...?
        if ( $self->{_text} and ${ $self->{_text} } ) {
            ${ $self->{_text} } =~ s/(?:\s*[\r\n]){3,}/\n\n/g;
        }

        # In the future the Page object should be installed like parsers as
        # a reusable container. Return one page per call.
        my $Page = WWW::Spyder::Page->new(
            title       => $title,
            text        => $self->{_text},
            raw         => \$page,
            url         => $enQ->url,
            domain      => $enQ->domain,
            link_name   => undef,
            link        => undef,
            description => $description || '',
            pages_enQs  => $self->_enqueue,
        );
        $self->_reset;    # <<-- clear out things that might remain
        return $Page;
    }
}

1;
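For context, here is a minimal driver sketch showing how the subclass might be called. The constructor arguments (seed => ...) and the WWW::Spyder::Page accessors are assumptions inferred from the code above, not documented usage, and setting report_broken_links directly on the object hash mirrors how the override reads it; check the module's POD before relying on any of this.

    #!/usr/bin/perl
    use strict;
    use warnings;
    use Dutchman70;

    # ASSUMPTION: the inherited constructor accepts a seed URL this way.
    my $spyder = Dutchman70->new( seed => 'http://example.org/' );
    $spyder->{report_broken_links} = 1;    # flag read by the overridden crawl()

    # crawl() returns one WWW::Spyder::Page per call; pass the exclude
    # patterns the override supports (matched with $url =~ m/$_/).
    while ( my $page = $spyder->crawl( { exclude => [ qr/\.(?:pdf|zip)$/i ] } ) ) {
        printf "%s => %s\n", $page->title || '(untitled)', $page->url;
    }

    print "Broken link: $_\n" for @{ $spyder->{broken_links} || [] };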

In reply to Re: WWW::Spyder check url before loading it? by Anonymous Monk
in thread WWW::Spyder check url before loading it? by dutchman70
