Event-driven systems are also handy for parallel processing,
provided you have cooperative modules to do the work. This program
reads a list of URLs on STDIN and dumps their responses to STDOUT.
The program uses a cooperative HTTP user agent to run parallel web
requests. It can take advantage of POE::Component::Client::DNS (a
cooperative host resolver) if it is also installed.
Sample use:
perl -pe 's!^(\S+).*!http://$1.com/!' /usr/share/dict/words \
| perl perlmonks-url-fetcher.perl
-- Rocco Caputo - troc@pobox.com - poe.perl.org
#!/usr/bin/perl
# Fetch all manner of URLs from STDIN; dumping the text of their
# responses on STDOUT.

use warnings;
use strict;

# Number of requests to run at once.  Declared with an empty prototype
# so it can be used as a bareword constant (e.g. 1..MAX_PARALLEL).
sub MAX_PARALLEL () { 8 }

use POE;                            # Cooperative multitasking framework.
use POE::Component::Client::HTTP;   # Non-blocking HTTP requests module.
use HTTP::Request::Common qw(GET);  # GET() builds the requests we post.

### Spawn the HTTP client component.  It will be named "ua", which is
### short for "useragent".
POE::Component::Client::HTTP->spawn(Alias => 'ua');

### Start the session that will use the HTTP client.  The _start event
### is fired by POE to kick-start a session.  got_response is the event
### this program asks the "ua" component to send back for each request.
POE::Session->create(
    inline_states => {
        _start       => \&initialize_session,
        got_response => \&handle_response,
    }
);

### Run the session that will visit pages.  The run() function will
### not return until the session is through processing its last URL.
$poe_kernel->run();
exit 0;
### Handle the _start event by setting up the session and starting an
### initial number of requests.  As each request finishes, another
### will be started in its place (see the got_response handler).
###
### Reads up to MAX_PARALLEL lines from STDIN, one URL per line, and
### posts a "request" event to the "ua" component for each of them.
### Stops early if STDIN runs out of lines first.
###
### The $_[KERNEL] parameter convention is strange but useful.  See:
### http://poe.perl.org/?POE_FAQ/Why_does_POE_pass_parameters_as_array_slices
sub initialize_session {
    my $kernel = $_[KERNEL];

    for (1 .. MAX_PARALLEL) {
        my $next_url = <STDIN>;
        last unless defined $next_url;    # STDIN is exhausted.
        chomp $next_url;

        $kernel->post(
            "ua",              # Post the request to the user agent.
            "request",         # It is a request we're posting.
            "got_response",    # The ua response should be "got_response".
            GET($next_url),    # The HTTP::Request to process.
        );
    }

    return;
}
### Receive a response and just dump it as_string() for demonstration
### purposes.  Once dumped, it attempts to read and request yet
### another URL.  The parameter convention is strange but useful
### again; this time pulling off only the values we need using a slice
### of @_.
###
### ARG0 is a packet (array reference) whose first element is the
### original HTTP::Request; ARG1 is a packet whose first element is the
### resulting HTTP::Response.
sub handle_response {
    my ($kernel, $req_packet, $resp_packet) = @_[KERNEL, ARG0, ARG1];

    my $http_request  = $req_packet->[0];     # Original HTTP::Request
    my $http_response = $resp_packet->[0];    # Resulting HTTP::Response

    # Prefix every line of the dumped response with "| " so that each
    # response forms a visually distinct box in the output.
    my $response_string = $http_response->as_string();
    $response_string =~ s/^/| /mg;

    print ",---------- ", $http_request->uri, " ----------\n";
    print $response_string;
    print "`", '-' x 78, "\n";

    # Start another request if it's available, or let the list of
    # pending URLs run out.  The session will stop when it does run out.
    my $next_url = <STDIN>;
    if (defined $next_url) {
        chomp $next_url;
        $kernel->post(ua => request => got_response => GET($next_url));
    }

    return;
}
|