#! perl -slw
use strict;
use threads;
use threads::shared;
use Thread::Queue;
use LWP::Simple;

## A semaphore to serialise access to stdout
my $sem : shared;

## (2 to 4) * $noOfCores
## depending upon your bandwidth, server response times
## and how hard you feel you should hit them!
my $noOfThreads = 10;

my $firstURL = 'http://www.example.com/thePage.htm';

sub retrieveInfo {
    my( $serialNo, $content ) = @_;

    ## parseContent() is a placeholder; do something with the info here
    my $info = parseContent( $content );

    ## Obtain exclusive access to STDOUT
    lock $sem;

    ## Print the info prefixed with the serial no.
    printf "%05d:%s\n", $serialNo, $info;
    return;
}

sub listParse {
    my( $url, $Qout ) = @_;

    ## Serial no incremented each time a link is found.
    my $serialNo = 0;

    ## Get the first page
    my $content = get $url;

    ## Find the links and push them onto the queue
    ## (the elided pattern should capture the link URL in $1)
    while( $content =~ m[...]g ) {
        ## Queue the link prefixed by its serial no
        $Qout->enqueue( ++$serialNo . ':' . $1 );
    }

    ## Push 1 undef per thread to terminate their loops
    $Qout->enqueue( (undef) x $noOfThreads );
}

sub getHTML {
    my( $Qin ) = @_;

    ## Read links until the undef terminator arrives
    while( defined( my $item = $Qin->dequeue ) ) {
        ## Split off and remember the serial no
        my( $serialNo, $link ) = split ':', $item, 2;

        ## Fetch the content
        my $content = get $link;

        ## And process it, passing along the serial no
        retrieveInfo( $serialNo, $content );
    }
}

## Redirect STDOUT via the system sort utility
## and via another pipe to the OLE/Excel script
open STDOUT, '|sort | perl excelOLE.pl' or die $!;

## Create the queue
my $Qlinks = new Thread::Queue;

## Start the threads.
my @threads = map {
    threads->create( \&getHTML, $Qlinks );
} 1 .. $noOfThreads;

## Fetch and parse the first page; queue the links
listParse( $firstURL, $Qlinks );

## Join the threads
$_->join for @threads;

## Ensure the pipe gets flushed
## so that sort can do its thing
close STDOUT;
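
The excelOLE.pl script on the far end of the pipe is only named above, not shown. Purely as a sketch of what that consumer could look like (the Win32::OLE usage and the two-column layout are my assumptions, not the original author's code), it would read the sorted "serial:info" lines from STDIN and drop them into a worksheet:

#! perl -slw
## excelOLE.pl -- hypothetical consumer sketch, not part of the original post
use strict;
use Win32::OLE;

## Attach to (or start) Excel; 'Quit' closes it when the object is destroyed
my $xl = Win32::OLE->new( 'Excel.Application', 'Quit' )
    or die "Cannot start Excel: " . Win32::OLE->LastError;
$xl->{Visible} = 1;

my $book  = $xl->Workbooks->Add;
my $sheet = $book->Worksheets( 1 );

my $row = 1;
while( my $line = <STDIN> ) {
    chomp $line;

    ## Undo the "serial:info" prefixing done by retrieveInfo()
    my( $serialNo, $info ) = split ':', $line, 2;

    ## Assumed layout: serial no in column A, info in column B
    $sheet->Cells( $row, 1 )->{Value} = $serialNo;
    $sheet->Cells( $row, 2 )->{Value} = $info;
    ++$row;
}

Because sort has already ordered the lines by their zero-padded serial numbers, the rows land in the worksheet in the same order the links were found on the original page, regardless of which thread fetched them first.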