use strict;
use warnings;
use diagnostics;

use HTML::TableExtract;
use WWW::Mechanize;
# Import Time::HiRes's sleep() so the fractional delays below actually work.
use Time::HiRes qw(sleep);

$|++;

my $huffdata = "huff_data.txt";
open(my $fh, "+>>", $huffdata)
    or die "unable to open $huffdata: $!";

my $mech = WWW::Mechanize->new;

# Skeleton URL
my $url = "http://fundrace.huffingtonpost.com/neighbors.php?type=name&lname=SMITH";

my $pagecount = 0;
my $off       = 0;

print "Page ";

SLURP:
while (1) {
    sleep(rand(10));

    my $text;

    # Build URL for the currently-required page. off=0 works fine.
    my $current_url = "$url&off=" . ($off * 100);

    # Stay in the loop until we've successfully gotten the page or died.
    my $need_to_get = 1;
    while ($need_to_get) {
        $mech->get($current_url);
        die "Failed to get $current_url on page $pagecount: @{[$mech->status]}"
            unless $mech->success;

        # We got another page.
        $pagecount++;
        print "$pagecount ...";
        $text = $mech->content;

        # Successfully ran out of entries. Blow out of BOTH loops.
        if ($text =~ /No Contributions Found/) {
            last SLURP;
        }

        # Hiccup at the site. Try this one again.
        if ($text =~ /An error occurred in processing your request/sm) {
            print "(oops)";
            next;
        }

        # Try to parse the table. Reload if this fails. (Takes care of "featured".)
        my $te;
        eval {
            $te = HTML::TableExtract->new(
                headers => [qw(Donor Contribution Address)],
            );
            $te->parse($text);
        };
        if ($@) {
            print "(parse failed: $@) ";
            next;
        }

        my @rows = eval { $te->rows };
        if ($@) {
            print "(extract failed: $@) ";
            next;
        }

        # Add a newline to make sure the entries are actually separated.
        foreach my $row (@rows) {
            print $fh join(",", @$row), "\n";
        }

        # Done with this one; drop out of the retry loop.
        $need_to_get = 0;
    }

    # Move up to the next page.
    $off++;
}