# Fetch the page whose URL is given as the first command-line argument,
# extract every <a href> link from it, download all linked pages in parallel
# and save each response body as "<n>.html" in the current directory.
#
# Usage: perl <script> URL
use strict;
use warnings;

use Data::Dumper;
use LWP::Parallel::UserAgent;
use HTTP::Request;
use HTML::SimpleLinkExtor;
use LWP::Parallel::Protocol::http;
use LWP::UserAgent;
use URI;

# Alias LWP::UserAgent's _new_response into LWP::Parallel::UserAgent.
# NOTE(review): presumably a compatibility shim for LWP releases where the
# parallel agent can no longer find this helper -- confirm before removing.
{
    no warnings qw(once redefine);
    *LWP::Parallel::UserAgent::_new_response = \&LWP::UserAgent::_new_response;
}

my $url = $ARGV[0];
die "Usage: $0 URL\n" unless defined $url && length $url;

my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4b) Gecko/20030514';

my $ua = LWP::UserAgent->new;
$ua->agent($browser);

# debugging messages. See 'perldoc LWP::Debug'
# use LWP::Debug qw(+);

# Fetch the start page; there is nothing to crawl if this fails.
my $response = $ua->request( HTTP::Request->new( GET => $url ) );
die "Cannot fetch $url: " . $response->status_line . "\n"
    unless $response->is_success;

# Pull the anchor targets out of the start page.
my $extor = HTML::SimpleLinkExtor->new();
$extor->parse( $response->content );
my @a_hrefs = $extor->a;

# Resolve every link against the page's base URI so that relative paths,
# "../" segments and protocol-relative "//host/path" links all become proper
# absolute URLs. Only http/https links are kept (mailto:, javascript:, ...
# cannot be fetched), and duplicates are removed *after* normalisation.
my $base = $response->base;
my %saw;
my @urls;
for my $href (@a_hrefs) {
    next unless defined $href && length $href;

    my $abs    = URI->new_abs( $href, $base )->canonical;
    my $scheme = $abs->scheme;
    next unless defined $scheme && $scheme =~ /\Ahttps?\z/;

    # The fragment is never sent to the server, so "page#a" and "page#b"
    # are the same download.
    $abs->fragment(undef);

    my $link = $abs->as_string;
    push @urls, $link unless $saw{$link}++;
}

my $pua = LWP::Parallel::UserAgent->new();
$pua->in_order(0);
$pua->duplicates(1);
$pua->timeout(2);
$pua->max_req(100);
$pua->max_hosts(100);
$pua->redirect(1);

# register() returns a response object only when the request could not be
# queued; such URLs are reported and not counted as valid.
my $urlcount = 0;
for my $link (@urls) {
    if ( my $res = $pua->register( HTTP::Request->new( GET => $link ) ) ) {
        print STDERR $res->error_as_HTML;
        next;
    }
    $urlcount++;
}

print "Total valid (unique) urls found: $urlcount\n\n";

# Block until every queued request has finished (or timed out), then write
# each response body to its own numbered file.
my $entries   = $pua->wait();
my $pagecount = 1;
for my $key ( keys %$entries ) {
    my $res = $entries->{$key}->response;

    print "Fetching link $pagecount\n\n";
    warn "Link $pagecount returned: " . $res->status_line . "\n"
        unless $res->is_success;

    my $file = "$pagecount.html";
    open my $out, '>', $file or die "Cannot open $file for writing: $!\n";
    binmode $out;    # content() is raw bytes; write it unmodified
    print {$out} $res->content or die "Cannot write to $file: $!\n";
    close $out or die "Cannot close $file: $!\n";

    $pagecount++;
}