Okay, here is the pertinent parts of my code:

First is a small container object

package Spider::URL; use strict; sub new{ my ($class,$url,$target,$current_depth,$referer) = @_; my $object = bless {}, $class; $object->{url} = $url; $object->{current_depth} = $current_depth; $object->{target} = $target; $object->{referer} = $referer; return $object; } 1;

Next is the object that inherits from LWP::Parallel::UserAgent:

package Spider::LWP; use strict; use Exporter(); use HTTP::Request; use LWP::Parallel::UserAgent qw(:CALLBACK); use Spider::URL; use File::Path; use Data::Dump qw(dump); use vars qw (@ISA @EXPORT); @ISA = qw(LWP::Parallel::UserAgent Exporter); @EXPORT = @LWP::Parallel::UserAgent::EXPORT_OK; sub new { my ($class,$depth,$path,$max_sockets) = @_; my $object = bless(LWP::Parallel::UserAgent->new(),$class); $object->{current} = {}; $object->{queue} = {}; $object->{finished} = {}; $object->{failed} = {}; $object->{depth} = $depth; $object->{path} = $path; $object->{completed} = 0; $object->{max_sockets} = $max_sockets; return $object; } sub on_return { my ($self, $request, $response, $entry) = @_; if ($response->is_success) { $response->content($response->decoded_content); $self->{finished}->{$request->url->as_string} = $self- +>{current}->{$request->url->as_string}; delete $self->{current}->{$request->url->as_string}; $self->_process_return($response); # print "\n\nWoa! Request to ",$request->url," returned +code ", $response->code, ": ", $response->message, "\n"; } else { on_failure($self, $request, $response, $entry); } $self->discard_entry($entry); return; } sub _process_return{ my ($self,$response) = @_; my $html = $response->header('content-type') =~ /text\/html/i; my $key = $response->request->url->as_string; my $path = _build_path($self->{path},$self->{finished}->{$key} +->{target}); _write_page($path,$response->request->uri,$response->content,$ +html); my $maxed = $self->{depth} - $self->{finished}->{$key}->{curre +nt_depth}; if($html){ my $urls = _get_urls($response->content,$response->bas +e,$maxed,);#\$ignore,\$exclude);#biggest waiting point foreach my $url(@$urls){ if($url !~ /\w/){next;} if(! defined $self->{current}->{$$url[0]} and ! defined $self->{queue}->{$$url[0]} and ! defined $self->{finished}->{$$url[0]}){ $self->{queue}->{$$url[0]} = Spider::U +RL->new($$url[0],$self->{finished}->{$key}->{target},$self->{finished +}->{$key}->{current_depth}+1); #print "Putting $$url[0] into the queue\n"; } } } my @keys = keys %{$self->{queue}}; while((keys %{$self->{current}}) < $self->{max_sockets} and $# +keys > -1){ $self->{current}->{$keys[0]} = $self->{queue}->{$keys[ +0]}; #print "Putting $keys[0] into the current\n"; delete $self->{queue}->{$keys[0]}; my $req = HTTP::Request->new('GET', "$keys[0]"); $req->header('Accept-Encoding'=>'gzip,x-gzip,x-bzip2,d +eflate,compress,base64,quoted-printable'); $self->register($req); @keys = keys %{$self->{queue}}; } if($#keys < 0){$self->{completed} = 1;} }

Last is the main.pl which initializes and calls everything:

#!/usr/bin/perl use lib "."; use Spider::LWP; #Overloaded version of LWP use HTTP::Request; use Googly::Search; use Getopt::Long; use File::Slurp; use Data::Dump qw(dump); sub main{ my @list = $ARGV[0]; #starting dir of lists of participating URLS. foreach(@list){ if($_ !~ /\w/){next;} chomp; $urls = GetListOfSta rtingURLS($_); my $path_addition = $_; $path_addition =~ s/[\.\s+\\\/\:\*\"\?\<\>\|]+/_/g; $path_addition =~ s/[\_\s*]$//; foreach my $url(@$urls){ my $req = HTTP::Request->new('GET', $url); $ua->{current}->{$req->uri->as_string} = Spide +r::URL->new($req->uri->as_string,$path_addition,0); $req->header('Accept-Encoding'=>'gzip,x-gzip,x +-bzip2,deflate,compress,base64,quoted-printable'); $ua->register ($req); } } $SIG{INT} = sub { #print dump $ua; print_queue($ua->{queue},"QUEUE"); print_queue($ua->{current},"CURRENT"); print_queue($ua->{failed},"FAILED"); print_queue($ua->{finished},"FINISHED"); exit; }; $ua->wait(300); # block until we are all finished or until ever +ything has stopped for 5 minutes }

In reply to Re^2: Problems with 'discard_entry' in LWP::Parallel::Useragent by perlmonkey2
in thread LWP::Parallel::UserAgent discard_entry does not delete entry objects by perlmonkey2

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.