Since it looks like you are trying to create a multi-threaded spider, here is a crude one to get you started. It is simple, it doesn't actually store what it fetches anywhere and could be much cleverer about making links absolute etc., but it demonstrates a few ideas.
Usage:
perl -s script.pl [-THREADS=2] url [url...]
Use ^C to have it abort cleanly and produce some simple stats about what it did.
#! perl -slw
use strict;
use threads;
use threads::shared;
use Thread::Queue;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI;
$|=1;
our $THREADS ||= 2;
sub fetcher {
my $tid = threads->self->tid;
my( $Qwork, $hashref ) = @_;
my $ua = new LWP::UserAgent;
my $p = new HTML::LinkExtor;
while( my $next = $Qwork->dequeue ) {
next if exists $hashref->{ $next };
my $res = $ua->get( $next );
$hashref->{ $next } = $res->code;
print "$tid: Failed to fetch $next", next unless $res->is_succ
+ess;
$p->parse( $res->content );
print "Fetched $next";
my @links = map{
my( $tag, %attrs ) = @$_;
$tag eq 'a'
? $attrs{ href }
: ()
} $p->links;
printf "$tid: Stacked %d links\n", scalar @links;
$Qwork->enqueue( @links );
}
}
## A place to store the urls visited.
## Checked by all threads
my %urls : shared;
## A queue of links to fetch and follow
## Added to and read by all threads
my $Qwork = new Thread::Queue;
## Prime the pump
$Qwork->enqueue( @ARGV );
## Start the fetchers
my @fetchers = map{
threads->create( \&fetcher, $Qwork, \%urls )
} 1 .. $THREADS;
## A flag to abort the main thread and clean up
my $stop = 0;
## ^C aborts
$SIG{INT} = sub{
## Empty the work queue
$Qwork->dequeue while $Qwork->pending;
## And stack an undef for each fetcher to abort their main loops
$Qwork->enqueue( (undef) x @fetchers );
$stop = 1;
};
## Log some progress information
printf "Fetched: %d (in Q: %d)\n", scalar keys %urls, $Qwork->pending
+
while not $stop and sleep 5;
## Wait for the fetchers to stop
$_->join for @fetchers;
## Gen some stats
my( %stats, $total );
++$total, ++$stats{ $_ } for values %urls;
## And print them
print "Atempted: $total";
print "Status: $_ ( $stats{ $_ } )" for sort{$a<=>$b} keys %stats;
Be careful where you let it loose.
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
Lingua non convalesco, consenesco et abolesco. -- Rule 1 has a caveat! -- Who broke the cabal?
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.
|