#!/usr/bin/perl
package PDAScraper;
use strict;
use Exporter;
use vars qw($VERSION @ISA @EXPORT);
$VERSION = 0.00;
@ISA = qw( Exporter );
@EXPORT = qw( &scrape );
use URI::URL;
use HTML::TreeBuilder;
use HTML::Template;
use LWP::UserAgent;
my $ua = LWP::UserAgent->new();
# $ua->proxy( ['http'], 'http://foo.bar:8080/' ); # If needed.
### Grab the template for the 'index.html' file. Stored
### in-module here, not necessarily the best way but it
### cuts down on external files.
my @html_array = <DATA>;
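### The template itself isn't reproduced in this listing; a minimal
### sketch of what it needs to contain (an assumption, using the
### parameter names scrape() fills in below) would look like:
###
###   <html><head><title><TMPL_VAR NAME=sitename></title></head>
###   <body><h1><TMPL_VAR NAME=sitename></h1>
###   <TMPL_LOOP NAME=links>
###     <a href="<TMPL_VAR NAME=localfile>"><TMPL_VAR NAME=text></a><br>
###   </TMPL_LOOP>
###   </body></html>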
sub new {
### PerlMonk jdporter clued me in on how
### to Do This Properly. There are sub-modules
### which contain the rules for each website.
### Thanks J D.
my ( $pkg, $rules ) = @_;
bless { rules => $rules, }, $pkg;
}
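### The rules are passed in by package name, so a minimal driver
### script (not part of this listing, just an illustration of the
### intended calling convention) could be:
###
###   use PDAScraper;
###   use PDAScraper::YahooTV;
###   my $scraper = PDAScraper->new( 'PDAScraper::YahooTV' );
###   $scraper->scrape();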
sub scrape {
my $self = shift;
my $obj = $self->{'rules'}->config();
my $template = undef;
my @all_links = ();
my @good_links = ();
my $tree = undef;
my $chunk = undef;
my $file_number = 0;
print "getting " . $obj->{'name'} . $/;
my $response = $ua->get( $obj->{'start_from'} );
### get the front page which has the links
unless ( $response->is_success() ) {
warn
"Failed to get starter page: $obj->{'start_from'}\n";
return;
} else {
print "Got starter page: $obj->{'start_from'}\n";
}
my $page = $response->content();
if ( $obj->{'chunk_spec'} ) {
### if we're parsing the HTML the Good Way using TreeBuilder
$chunk = HTML::TreeBuilder->new_from_content( $page );
$chunk->elementify();
$chunk = $chunk->look_down( @{ $obj->{'chunk_spec'} } );
} elsif ( $obj->{'chunk_regex'} ) {
### if we're parsing the HTML the Bad Way using a regex
### capture the interesting part directly rather than
### trusting $1 after a possibly-failed match:
my ( $match ) = ( $page =~ $obj->{'chunk_regex'} );
unless ( defined $match ) {
print "Regex failed to match\n";
return;
}
$chunk = HTML::TreeBuilder->new_from_content( $match );
$chunk->elementify();
} else {
### if we're just grabbing the whole page,
### probably not a good idea, but see
### link_spec below for a way of filtering links
$chunk = HTML::TreeBuilder->new_from_content( $page );
$chunk->elementify();
}
if ( defined( $obj->{'link_spec'} ) ) {
### If we've got a TreeBuilder link filter to grab only
### the links which match a certain format
@all_links =
$chunk->look_down( '_tag', 'a',
@{ $obj->{'link_spec'} } );
} else {
@all_links = $chunk->look_down( '_tag', 'a' );
}
print "found " . scalar( @all_links ) . " links.\n";
for ( @all_links ) {
### Avoid three problem conditions -- no text means
### we've probably got image links (often duplicates)
### -- "#" as the href means a JavaScript link --
### and tags with no HREF are also no use to us:
next
unless ( defined( $_->attr( 'href' ) )
&& $_->as_text() ne ''
&& $_->attr( 'href' ) ne '#' );
my $href = $_->attr( 'href' );
### It's expected that we'll need to transform
### the URL from regular to print-friendly:
if ( defined( $obj->{'url_regex'} )
&& ref( $obj->{'url_regex'}->[1] ) eq 'CODE' )
{
### PerlMonk Roy Johnson is my saviour here.
### Solution to the problem of some url regexes
### needing backreferences and some not.
$href =~ s{$obj->{'url_regex'}->[0]}
{${$obj->{'url_regex'}->[1]->()}}e;
} elsif ( defined( $obj->{'url_regex'} ) ) {
### If there is a regex object at all:
$href =~ s{$obj->{'url_regex'}->[0]}
{$obj->{'url_regex'}->[1]};
}
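### (Two shapes of url_regex are handled above: a plain
### [ pattern, replacement-string ] pair as in
### PDAScraper::YahooTV below, or [ pattern, coderef ] as in
### PDAScraper::Slate, for when the replacement needs a
### capture from the pattern.)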
### Transform the URL from relative to absolute:
my $url = URI::URL->new( $href, $obj->{'start_from'} );
my $abs_url = $url->abs();
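### (URI::URL resolves relative hrefs against the page they came
### from; e.g. a hypothetical href of "page2.html" found on
### http://example.com/news/ comes out as
### http://example.com/news/page2.html.)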
### Make a data structure with all the stuff we're
### going to get on the next pass:
push(
@good_links,
{
text => $_->as_text(),
url => "$abs_url"
}
);
}
print "found " . scalar( @good_links ) . " 'good' links.\n";
if ( scalar( @good_links ) == 0 ) {
print "No 'good' links found.\n";
return;
}
( my $foldername = $obj->{'name'} ) =~ s/\W//g;
### Make a foldername with no non-word chars
unless ( -e "$ENV{'HOME'}/scrape" ) {
### Make a scrape folder if there isn't one
mkdir "$ENV{'HOME'}/scrape" || die "$!";
}
unless ( -e "$ENV{'HOME'}/scrape/$foldername" ) {
### Make a folder for this content if there isn't one
mkdir "$ENV{'HOME'}/scrape/$foldername" || die "$!";
}
foreach ( @good_links ) {
my $response = $ua->get( $_->{'url'} );
unless ( $response->is_success() ) {
warn "didn't get " . $_->{'url'} . "$!" . $/;
return;
} else {
print "got " . $_->{'url'} . $/;
}
my $page = $response->content();
### TO DO: arbitrary number of further regexes
### in case users want to clean content up more?
### Filenames sprintf'd for neatness only:
my $local_file = sprintf( "%03d.html", $file_number );
### add a localfile value to the AoH for use in the index:
$_->{localfile} = $local_file;
### Print out the actual content page locally:
open( PAGE,
">$ENV{'HOME'}/scrape/$foldername/$local_file" )
|| die "$!";
print PAGE $page;
close( PAGE );
$file_number++;
}
### [die_on_bad_params is off because the AoH contains
### one item we don't need, the original URL]
$template = HTML::Template->new(
arrayref => \@html_array,
debug => 0,
die_on_bad_params => 0
);
### Use the name and the links array to fill out the template:
$template->param(
links => \@good_links,
sitename => $obj->{'name'}
);
### Output the index page locally:
open( INDEX, ">$ENV{'HOME'}/scrape/$foldername/index.html" )
|| die "$!";
unless ( print INDEX $template->output() ) {
print "Error in HTML::Template output\n";
return;
}
close( INDEX );
print "Finished scraping $obj->{'name'}\n\n";
### Clean up after HTML::Tree as recommended
$chunk->delete();
}
1;
__DATA__
####
package PDAScraper::YahooTV;
# PDAScraper.pm rules for scraping the
# Yahoo TV website
sub config {
return {
name => 'Yahoo TV',
start_from => 'http://news.yahoo.com/i/763',
chunk_spec => [ "_tag", "div", "id", "indexstories" ],
url_regex => [ '/[^/]*$', '&printer=1' ]
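### (the substitution in scrape() replaces the final '/segment'
### of each story URL with '&printer=1' to reach the
### printer-friendly page)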
};
}
1;
####
package PDAScraper::Slate;
# PDAScraper.pm rules for scraping the
# Slate website
sub config {
return {
name => 'Slate',
start_from =>
'http://www.slate.com/id/2065896/view/2057069/',
url_regex => [
'/id/(\d+)/',
sub { \ "/toolbar.aspx?action=print&id=$1" }
],
chunk_regex =>
qr{