in reply to Get 10,000 web pages fast
Using Windows' fork emulation for this is not a good idea.
A pool of threads will be far more efficient (on Windows), and is hardly more complex:
#! perl -slw
use strict;
use threads;
use threads::shared;
use Thread::Queue;
use LWP::Simple;

sub worker {
    my( $Q, $curCountRef ) = @_;
    while( my $work = $Q->dequeue ) {
        my( $url, $name ) = split $;, $work;
        my $rc = getstore( $url, $name );
        warn( "Failed to fetch $url: $rc\n" ), next if $rc != RC_OK;
        lock $$curCountRef;
        printf STDERR "%06d:($name) fetched\n", ++$$curCountRef;
    }
}

our $W       //= 20;
our $urlFile //= 'urls.txt';

my $Q = new Thread::Queue;
my $curCount :shared = 0;

my @threads = map{
    threads->create( \&worker, $Q, \$curCount );
} 1 .. $W;

open URLS, '<', $urlFile or die "$urlFile : $!";

my $fileNo = 0;
while( my $work = <URLS> ) {
    chomp $work;
    $Q->enqueue( sprintf "%s$;./tmp/saved.%06d", $work, ++$fileNo );
    sleep 1 while $Q->pending > $W;
}
close URLS;

$Q->enqueue( ( undef ) x $W );
$_->join for @threads;
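An aside on the queue items: each one is just the URL and the target filename joined with $;, Perl's subscript separator ("\034" by default), which the worker splits apart again. A minimal, standalone sketch of that packing scheme (the URL and filename here are made up for illustration):

use strict;
use warnings;

# Pack a url and its target filename into one string using the subscript
# separator $; as the field delimiter, then split it back out -- the same
# scheme the worker above relies on.
my $url  = 'http://www.example.com/index.html';   # hypothetical url
my $name = './tmp/saved.000001';                  # hypothetical filename

my $item = join $;, $url, $name;
my( $u, $n ) = split $;, $item;

print "url:  $u\n";
print "file: $n\n";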
By leaving the URLs in a file, the script reads them on demand, which avoids filling memory with many copies of them. As posted, it expects the URLs to be in a file called ./urls.txt and runs a pool of 20 workers (thisScript -W=20 -urlFile=./urls.txt). The retrieved pages are written to ./tmp/saved.nnnnnn. Adjust to suit.
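For completeness, the -W and -urlFile settings work through perl's -s switch on the shebang line, which turns -name=value arguments into package variables before the script body runs. A minimal, standalone sketch, assuming it is saved under a made-up name like demo.pl and invoked the same way as above:

#! perl -slw
use strict;

# -s makes "-W=50" set $main::W and "-urlFile=./mylist.txt" set
# $main::urlFile before this code runs; //= (perl 5.10+) supplies the
# defaults when the switches are omitted.
our $W       //= 20;
our $urlFile //= 'urls.txt';

print "workers:  $W";
print "url file: $urlFile";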