#!/usr/bin/perl

package PDAScraper;

use strict;
use Exporter;
use vars qw($VERSION @ISA @EXPORT);
$VERSION = 0.00;
@ISA     = qw( Exporter );
@EXPORT  = qw( &scrape );

use URI::URL;
use HTML::TreeBuilder;
use HTML::Template;
use LWP::UserAgent;

my $ua = LWP::UserAgent->new();
# $ua->proxy( ['http'], 'http://foo.bar:8080/' ); # If needed.

### Grab the template for the 'index.html' file. Stored
### in-module after __DATA__ -- not necessarily the best
### way, but it cuts down on external files.
my @html_array = <DATA>;

sub new {
    ### PerlMonk jdporter clued me in on how
    ### to Do This Properly. There are sub-modules
    ### which contain the rules for each website.
    ### Thanks J D.
    my ( $pkg, $rules ) = @_;
    bless { rules => $rules }, $pkg;
}

sub scrape {
    my $self        = shift;
    my $obj         = $self->{'rules'}->config();
    my $template    = undef;
    my @all_links   = ();
    my @good_links  = ();
    my $chunk       = undef;
    my $file_number = 0;

    print "getting " . $obj->{'name'} . $/;

    ### Get the front page, which has the links:
    my $response = $ua->get( $obj->{'start_from'} );
    unless ( $response->is_success() ) {
        warn "Failed to get starter page: $obj->{'start_from'}\n";
        return;
    }
    else {
        print "Got starter page: $obj->{'start_from'}\n";
    }
    my $page = $response->content();

    if ( $obj->{'chunk_spec'} ) {
        ### If we're parsing the HTML the Good Way, using TreeBuilder:
        $chunk = HTML::TreeBuilder->new_from_content( $page );
        $chunk->elementify();
        $chunk = $chunk->look_down( @{ $obj->{'chunk_spec'} } );
        unless ( defined $chunk ) {
            print "chunk_spec matched nothing\n";
            return;
        }
    }
    elsif ( $obj->{'chunk_regex'} ) {
        ### If we're parsing the HTML the Bad Way, using a regex.
        ### (Test the match directly rather than trusting $1,
        ### which would otherwise hold a stale value from any
        ### earlier successful match.)
        unless ( $page =~ $obj->{'chunk_regex'} && defined $1 ) {
            print "Regex failed to match\n";
            return;
        }
        $chunk = HTML::TreeBuilder->new_from_content( $1 );
        $chunk->elementify();
    }
    else {
        ### If we're just grabbing the whole page --
        ### probably not a good idea, but see
        ### link_spec below for a way of filtering links:
        $chunk = HTML::TreeBuilder->new_from_content( $page );
        $chunk->elementify();
    }

    if ( defined( $obj->{'link_spec'} ) ) {
        ### If we've got a TreeBuilder link filter, grab only
        ### the links which match a certain format:
        @all_links = $chunk->look_down( '_tag', 'a', @{ $obj->{'link_spec'} } );
    }
    else {
        @all_links = $chunk->look_down( '_tag', 'a' );
    }
    print "found " . scalar( @all_links ) . " links.\n";

    for ( @all_links ) {
        ### Avoid three problem conditions: no text means
        ### we've probably got image links (often duplicates);
        ### "#" as the href means a JavaScript link; and
        ### tags with no href are also no use to us.
        next
            unless ( defined( $_->attr( 'href' ) )
            && $_->as_text() ne ''
            && $_->attr( 'href' ) ne '#' );
        my $href = $_->attr( 'href' );

        ### It's expected that we'll need to transform
        ### the URL from regular to print-friendly:
        if ( defined( $obj->{'url_regex'} )
            && ref( $obj->{'url_regex'}->[1] ) eq 'CODE' )
        {
            ### PerlMonk Roy Johnson is my saviour here:
            ### a solution to the problem of some URL regexes
            ### needing backreferences and some not.
            $href =~ s{$obj->{'url_regex'}->[0]}
                      {${ $obj->{'url_regex'}->[1]->() }}e;
        }
        elsif ( defined( $obj->{'url_regex'} ) ) {
            ### If there is a regex entry at all:
            $href =~ s{$obj->{'url_regex'}->[0]}
                      {$obj->{'url_regex'}->[1]};
        }

        ### Transform the URL from relative to absolute:
        my $url     = URI::URL->new( $href, $obj->{'start_from'} );
        my $abs_url = $url->abs();

        ### Make a data structure with all the stuff we're
        ### going to get on the next pass:
        push( @good_links, { text => $_->as_text(), url => "$abs_url" } );
    }
    print "found " . scalar( @good_links ) . " 'good' links.\n";
" 'good' links.\n"; if ( scalar( @good_links ) == 0 ) { print "No 'good' links found.\n"; return; } ( my $foldername = $obj->{'name'} ) =~ s/\W//g; ### Make a foldername with no non-word chars unless ( -e "$ENV{'HOME'}/scrape" ) { ### Make a scrape folder if there isn't one mkdir "$ENV{'HOME'}/scrape" || die "$!"; } unless ( -e "$ENV{'HOME'}/scrape/$foldername" ) { ### Make a folder for this content if there isn't one mkdir "$ENV{'HOME'}/scrape/$foldername" || die "$!"; } foreach ( @good_links ) { my $response = $ua->get( $_->{'url'} ); unless ( $response->is_success() ) { warn "didn't get " . $_->{'url'} . "$!" . $/; return; } else { print "got " . $_->{'url'} . $/; } my $page = $response->content(); ### TO DO: arbitrary number of further regexes ### in case users want to clean content up more? ### Filenames sprintf'd for neatness only: my $local_file = sprintf( "%03d.html", $file_number ); ### add a localfile value to the AoH for use in the index: $_->{localfile} = $local_file; ### Print out the actual content page locally: open( PAGE, ">$ENV{'HOME'}/scrape/$foldername/$local_file" ) || die "$!"; print PAGE $page; close( PAGE ); $file_number++; } ### [die_on_bad_params is off because the AoH contains ### one item we don't need, the original URL] $template = HTML::Template->new( arrayref => \@html_array, debug => 0, die_on_bad_params => 0 ); ### Use the name and the links array to fill out the template: $template->param( links => \@good_links, sitename => $obj->{'name'} ); ### Output the index page locally: open( INDEX, ">$ENV{'HOME'}/scrape/$foldername/index.html" ) || die "$!"; unless ( print INDEX $template->output() ) { print "Error in HTML::Template output\n"; return; } close( INDEX ); print "Finished scraping $obj->{'name'}\n\n"; ### Clean up after HTML::Tree as recommended $chunk->delete(); } 1; __DATA__ <tmpl_var name="sitename">
<html>
<head><title><tmpl_var name="sitename"></title></head>
<body>
<!-- Minimal index template: the markup here is a reconstruction
     (the original was mangled in transit), but the loop and
     variable names match the param() call in scrape() above. -->
<h3><tmpl_var name="sitename"></h3>
<tmpl_loop name="links">
<p><a href="<tmpl_var name=localfile>"><tmpl_var name="text"></a></p>
</tmpl_loop>
</body>
</html>
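####

### A standalone sketch (not part of the module) showing how the
### chunk_spec and link_spec arguments drive look_down(). The HTML
### here is made up for illustration; the mechanics are the same
### as in scrape() above.

#!/usr/bin/perl
use strict;
use warnings;
use HTML::TreeBuilder;

my $page = <<'HTML';
<html><body>
<div id="headlines">
<a href="/story/1">First story</a>
<a href="/story/1"><img src="x.gif"></a>
<a href="#">JavaScript link</a>
</div>
<div id="footer"><a href="/about">About</a></div>
</body></html>
HTML

my $tree = HTML::TreeBuilder->new_from_content( $page );
$tree->elementify();

### A chunk_spec is just a list of look_down() criteria --
### tag/attribute pairs -- so this isolates the headlines div
### and ignores the footer:
my $chunk = $tree->look_down( "_tag", "div", "id", "headlines" );

### link_spec criteria are appended after the '_tag', 'a' pair;
### a coderef criterion is called with each candidate element
### as its first argument:
my @links = $chunk->look_down(
    '_tag', 'a',
    sub { $_[0]->as_text ne '' }    # drops the bare image link
);

print $_->attr( 'href' ), " => ", $_->as_text, $/ for @links;
$tree->delete();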
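####

### One more standalone scrap: the relative-to-absolute step.
### URI::URL resolves an href against the page it came from,
### which is why scrape() passes start_from as the base.
### The URLs here are made up.

#!/usr/bin/perl
use strict;
use warnings;
use URI::URL;

my $url = URI::URL->new( '/i/763/story.html&printer=1',
    'http://news.yahoo.com/i/763' );
print $url->abs(), "\n";
# prints: http://news.yahoo.com/i/763/story.html&printer=1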
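####

### A standalone sketch of how HTML::Template fills the index
### from the array-of-hashes. The data is made up, but the param
### names (sitename, links, localfile, text) are the ones
### scrape() actually uses.

#!/usr/bin/perl
use strict;
use warnings;
use HTML::Template;

my @lines = split /^/, <<'TEMPLATE';
<h3><tmpl_var name="sitename"></h3>
<tmpl_loop name="links">
<p><a href="<tmpl_var name=localfile>"><tmpl_var name="text"></a></p>
</tmpl_loop>
TEMPLATE

### die_on_bad_params is off for the same reason as in scrape():
### each hash carries a 'url' key the template never mentions.
my $template = HTML::Template->new(
    arrayref          => \@lines,
    die_on_bad_params => 0,
);

$template->param(
    sitename => 'Example Site',
    links    => [
        {   text      => 'First story',
            url       => 'http://example.com/1',
            localfile => '000.html'
        },
        {   text      => 'Second story',
            url       => 'http://example.com/2',
            localfile => '001.html'
        },
    ],
);

print $template->output();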
####

package PDAScraper::YahooTV;

# PDAScraper.pm rules for scraping the
# Yahoo TV website

sub config {
    return {
        name       => 'Yahoo TV',
        start_from => 'http://news.yahoo.com/i/763',
        chunk_spec => [ "_tag", "div", "id", "indexstories" ],
        url_regex  => [ '/[^/]*$', '&printer=1' ]
    };
}

1;

####

package PDAScraper::Slate;

# PDAScraper.pm rules for scraping the
# Slate website

sub config {
    return {
        name       => 'Slate',
        start_from => 'http://www.slate.com/id/2065896/view/2057069/',
        url_regex  => [
            '/id/(\d+)/',
            sub { \ "/toolbar.aspx?action=print&id=$1" }
        ],
        # The literal markup that belongs around (.*?) was
        # eaten when this listing was rendered as HTML;
        # restore Slate's own start/end delimiters here.
        chunk_regex => qr{(.*?)}
    };
}

1;

####

package PDAScraper::NewScientist;

# PDAScraper.pm rules for scraping the
# New Scientist website

sub config {
    return {
        name       => 'New Scientist Headlines',
        start_from => 'http://www.newscientist.com/news.ns',
        chunk_spec => [ "_tag", "div", "id", "newslisting" ],
        url_regex  => [ '$', '&print=true' ]
    };
}

1;

####

#!/usr/bin/perl

use strict;
use warnings;

use PDAScraper;

use PDAScraper::YahooTV;
my $YM_Scraper = PDAScraper->new('PDAScraper::YahooTV') || die "$!";
$YM_Scraper->scrape();

use PDAScraper::Slate;
my $Slate_Scraper = PDAScraper->new('PDAScraper::Slate') || die "$!";
$Slate_Scraper->scrape();

use PDAScraper::NewScientist;
my $NS_Scraper = PDAScraper->new('PDAScraper::NewScientist') || die "$!";
$NS_Scraper->scrape();

####

package PDAScraper::Foo;

# PDAScraper.pm rules for scraping the
# Foo website

sub config {
    return {
        name => 'Foo',
        # Name of the website. Arbitrary text.

        start_from => 'http://www.foo.com/news/',
        # URL where the scraper should find the links.

        url_regex => [ '$', '&print=1' ],
        # This is the simple form of the url_regex, which
        # is used to change a regular link to a "print-friendly"
        # link. Simple because no backreferences are needed
        # on the right-hand side.

        # url_regex => [
        #     '/id/(\d+)/',
        #     sub { \ "/toolbar.aspx?action=print&id=$1" }
        # ],
        # This is the complex form of the url_regex, using
        # a sub to return the replacement because it needs
        # to evaluate a backreference, i.e. $1, $2 etc.

        chunk_spec => [ "_tag", "div", "id", "headlines" ],
        # A list of arguments to HTML::Element's look_down()
        # method. This one will return the first HTML::Element
        # object for a div tag whose "id" attribute has the
        # value "headlines".

        # If you can't use a chunk_spec, you'll have to use a
        # chunk_regex:
        chunk_regex => qr{(.*?)}s,
        # A regular expression which returns your desired
        # chunk of the page as $1. Using chunk_spec is better.
        # (As with the Slate rules above, the literal markup
        # around (.*?) was lost in HTML rendering; supply the
        # page's real start/end delimiters.)

        link_spec => [ sub { $_[0]->as_text ne 'FULL STORY' } ]
        # All links are grabbed from the page chunk by default,
        # but link_spec lets you add HTML::Element filtering --
        # here, for example, rejecting any link whose text is
        # "FULL STORY" -- and you could also reject links on
        # any attribute; see HTML::Element.
    };
}

1;
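####

### A standalone sketch of the two url_regex forms, using made-up
### URLs. The coderef form exists because a replacement mentioning
### $1 can't be stored in the config as a plain string ($1 would
### interpolate too early, before any match); storing a sub defers
### the interpolation until /e calls it, as in scrape() above.

#!/usr/bin/perl
use strict;
use warnings;

### Simple form: plain pattern and replacement strings.
my $simple = [ '$', '&print=1' ];
my $href1  = 'http://www.foo.com/news/today.html';
$href1 =~ s{$simple->[0]}{$simple->[1]};
print "$href1\n";
# prints: http://www.foo.com/news/today.html&print=1

### Complex form: the sub runs after the match, so $1 is set;
### it returns a string reference which /e dereferences.
my $complex = [
    '/id/(\d+)/',
    sub { \ "/toolbar.aspx?action=print&id=$1" }
];
my $href2 = 'http://www.slate.com/id/2065896/';
$href2 =~ s{$complex->[0]}{${ $complex->[1]->() }}e;
print "$href2\n";
# prints: http://www.slate.com/toolbar.aspx?action=print&id=2065896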