use strict;
use warnings;
use diagnostics;

use HTML::TableExtract;
use WWW::Mechanize;
# Import Time::HiRes's sleep() so the fractional delays below actually work.
use Time::HiRes qw(sleep);

$|++;

my $huffdata = "huff_data.txt";
open(my $fh, "+>>", $huffdata)
    or die "unable to open $huffdata: $!";

my $mech = WWW::Mechanize->new;

# Skeleton URL
my $url = "http://fundrace.huffingtonpost.com/neighbors.php?type=name&lname=SMITH";

my $pagecount = 0;
my $off       = 0;

print "Page ";

SLURP:
while (1) {
    sleep(rand(10));

    my $text;

    # Build URL for the currently-required page. off=0 works fine.
    my $current_url = "$url&off=" . ($off * 100);

    # Stay in the loop until we've successfully gotten the page or died.
    my $need_to_get = 1;
    while ($need_to_get) {
        $mech->get($current_url);
        die "Failed to get $current_url on page $pagecount: @{[$mech->status]}"
            unless $mech->success;

        # We got another page.
        $pagecount++;
        print "$pagecount ...";
        $text = $mech->content;

        # Successfully ran out of entries. Blow out of BOTH loops.
        if ($text =~ /No Contributions Found/) {
            last SLURP;
        }

        # Hiccup at the site. Try this one again.
        if ($text =~ /An error occurred in processing your request/sm) {
            print "(oops)";
            next;
        }

        # Try to parse the table. Reload if this fails. (Takes care of "featured".)
        my $te;
        eval {
            $te = HTML::TableExtract->new(
                headers => [qw(Donor Contribution Address)],
            );
            $te->parse($text);
        };
        if ($@) {
            print "(parse failed: $@) ";
            next;
        }

        my @rows = eval { $te->rows };
        if ($@) {
            print "(extract failed: $@) ";
            next;
        }

        # Add a newline to make sure the entries are actually separated.
        foreach my $row (@rows) {
            print $fh join(",", @$row), "\n";
        }

        # Done with this one; drop out of the retry loop.
        $need_to_get = 0;
    }

    # Move up to the next page.
    $off++;
}