#!/usr/bin/perl

package PDAScraper;

use strict;
use Exporter;
use vars qw($VERSION @ISA @EXPORT);
$VERSION = 0.00;
@ISA     = qw( Exporter );
@EXPORT  = qw( &scrape );

use URI::URL;
use HTML::TreeBuilder;
use HTML::Template;
use LWP::UserAgent;

my $ua = LWP::UserAgent->new();
# $ua->proxy( ['http'], 'http://foo.bar:8080/' ); # If needed.

### Grab the template for the 'index.html' file. Stored
### in-module after __DATA__ -- not necessarily the best
### way, but it cuts down on external files.
my @html_array = <DATA>;

sub new {
    ### PerlMonk jdporter clued me in on how
    ### to Do This Properly. There are sub-modules
    ### which contain the rules for each website.
    ### Thanks J D.
    my ( $pkg, $rules ) = @_;
    bless { rules => $rules }, $pkg;
}

sub scrape {
    my $self        = shift;
    my $obj         = $self->{'rules'}->config();
    my $template    = undef;
    my @all_links   = ();
    my @good_links  = ();
    my $chunk       = undef;
    my $file_number = 0;

    print "getting " . $obj->{'name'} . $/;

    ### Get the front page, which has the links:
    my $response = $ua->get( $obj->{'start_from'} );
    unless ( $response->is_success() ) {
        warn "Failed to get starter page: $obj->{'start_from'}\n";
        return;
    }
    else {
        print "Got starter page: $obj->{'start_from'}\n";
    }
    my $page = $response->content();

    if ( $obj->{'chunk_spec'} ) {
        ### If we're parsing the HTML the Good Way, using TreeBuilder:
        $chunk = HTML::TreeBuilder->new_from_content( $page );
        $chunk->elementify();
        $chunk = $chunk->look_down( @{ $obj->{'chunk_spec'} } );
        unless ( defined $chunk ) {
            print "chunk_spec matched nothing\n";
            return;
        }
    }
    elsif ( $obj->{'chunk_regex'} ) {
        ### If we're parsing the HTML the Bad Way, using a regex.
        ### (Test the match directly rather than trusting $1,
        ### which would otherwise hold a stale value from any
        ### earlier successful match.)
        unless ( $page =~ $obj->{'chunk_regex'} && defined $1 ) {
            print "Regex failed to match\n";
            return;
        }
        $chunk = HTML::TreeBuilder->new_from_content( $1 );
        $chunk->elementify();
    }
    else {
        ### If we're just grabbing the whole page --
        ### probably not a good idea, but see
        ### link_spec below for a way of filtering links:
        $chunk = HTML::TreeBuilder->new_from_content( $page );
        $chunk->elementify();
    }

    if ( defined( $obj->{'link_spec'} ) ) {
        ### If we've got a TreeBuilder link filter, grab only
        ### the links which match a certain format:
        @all_links = $chunk->look_down( '_tag', 'a', @{ $obj->{'link_spec'} } );
    }
    else {
        @all_links = $chunk->look_down( '_tag', 'a' );
    }
    print "found " . scalar( @all_links ) . " links.\n";

    for ( @all_links ) {
        ### Avoid three problem conditions: no text means
        ### we've probably got image links (often duplicates);
        ### "#" as the href means a JavaScript link; and
        ### tags with no href are also no use to us.
        next
            unless ( defined( $_->attr( 'href' ) )
            && $_->as_text() ne ''
            && $_->attr( 'href' ) ne '#' );
        my $href = $_->attr( 'href' );

        ### It's expected that we'll need to transform
        ### the URL from regular to print-friendly:
        if ( defined( $obj->{'url_regex'} )
            && ref( $obj->{'url_regex'}->[1] ) eq 'CODE' )
        {
            ### PerlMonk Roy Johnson is my saviour here:
            ### a solution to the problem of some URL regexes
            ### needing backreferences and some not.
            $href =~ s{$obj->{'url_regex'}->[0]}
                      {${ $obj->{'url_regex'}->[1]->() }}e;
        }
        elsif ( defined( $obj->{'url_regex'} ) ) {
            ### If there is a regex entry at all:
            $href =~ s{$obj->{'url_regex'}->[0]}
                      {$obj->{'url_regex'}->[1]};
        }

        ### Transform the URL from relative to absolute:
        my $url     = URI::URL->new( $href, $obj->{'start_from'} );
        my $abs_url = $url->abs();

        ### Make a data structure with all the stuff we're
        ### going to get on the next pass:
        push( @good_links, { text => $_->as_text(), url => "$abs_url" } );
    }
    print "found " . scalar( @good_links ) . " 'good' links.\n";
" 'good' links.\n"; if ( scalar( @good_links ) == 0 ) { print "No 'good' links found.\n"; return; } ( my $foldername = $obj->{'name'} ) =~ s/\W//g; ### Make a foldername with no non-word chars unless ( -e "$ENV{'HOME'}/scrape" ) { ### Make a scrape folder if there isn't one mkdir "$ENV{'HOME'}/scrape" || die "$!"; } unless ( -e "$ENV{'HOME'}/scrape/$foldername" ) { ### Make a folder for this content if there isn't one mkdir "$ENV{'HOME'}/scrape/$foldername" || die "$!"; } foreach ( @good_links ) { my $response = $ua->get( $_->{'url'} ); unless ( $response->is_success() ) { warn "didn't get " . $_->{'url'} . "$!" . $/; return; } else { print "got " . $_->{'url'} . $/; } my $page = $response->content(); ### TO DO: arbitrary number of further regexes ### in case users want to clean content up more? ### Filenames sprintf'd for neatness only: my $local_file = sprintf( "%03d.html", $file_number ); ### add a localfile value to the AoH for use in the index: $_->{localfile} = $local_file; ### Print out the actual content page locally: open( PAGE, ">$ENV{'HOME'}/scrape/$foldername/$local_file" ) || die "$!"; print PAGE $page; close( PAGE ); $file_number++; } ### [die_on_bad_params is off because the AoH contains ### one item we don't need, the original URL] $template = HTML::Template->new( arrayref => \@html_array, debug => 0, die_on_bad_params => 0 ); ### Use the name and the links array to fill out the template: $template->param( links => \@good_links, sitename => $obj->{'name'} ); ### Output the index page locally: open( INDEX, ">$ENV{'HOME'}/scrape/$foldername/index.html" ) || die "$!"; unless ( print INDEX $template->output() ) { print "Error in HTML::Template output\n"; return; } close( INDEX ); print "Finished scraping $obj->{'name'}\n\n"; ### Clean up after HTML::Tree as recommended $chunk->delete(); } 1; __DATA__ <tmpl_var name="sitename">
<html>
<head><title><tmpl_var name="sitename"></title></head>
<body>
<!-- Minimal index template: the markup here is a reconstruction
     (the original was mangled in transit), but the loop and
     variable names match the param() call in scrape() above. -->
<h3><tmpl_var name="sitename"></h3>
<tmpl_loop name="links">
<p><a href="<tmpl_var name=localfile>"><tmpl_var name="text"></a></p>
</tmpl_loop>
</body>
</html>
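####

### A standalone sketch (not part of the module) showing how the
### chunk_spec and link_spec arguments drive look_down(). The HTML
### here is made up for illustration; the mechanics are the same
### as in scrape() above.

#!/usr/bin/perl
use strict;
use warnings;
use HTML::TreeBuilder;

my $page = <<'HTML';
<html><body>
<div id="headlines">
<a href="/story/1">First story</a>
<a href="/story/1"><img src="x.gif"></a>
<a href="#">JavaScript link</a>
</div>
<div id="footer"><a href="/about">About</a></div>
</body></html>
HTML

my $tree = HTML::TreeBuilder->new_from_content( $page );
$tree->elementify();

### A chunk_spec is just a list of look_down() criteria --
### tag/attribute pairs -- so this isolates the headlines div
### and ignores the footer:
my $chunk = $tree->look_down( "_tag", "div", "id", "headlines" );

### link_spec criteria are appended after the '_tag', 'a' pair;
### a coderef criterion is called with each candidate element
### as its first argument:
my @links = $chunk->look_down(
    '_tag', 'a',
    sub { $_[0]->as_text ne '' }    # drops the bare image link
);

print $_->attr( 'href' ), " => ", $_->as_text, $/ for @links;
$tree->delete();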
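####

### One more standalone scrap: the relative-to-absolute step.
### URI::URL resolves an href against the page it came from,
### which is why scrape() passes start_from as the base.
### The URLs here are made up.

#!/usr/bin/perl
use strict;
use warnings;
use URI::URL;

my $url = URI::URL->new( '/i/763/story.html&printer=1',
    'http://news.yahoo.com/i/763' );
print $url->abs(), "\n";
# prints: http://news.yahoo.com/i/763/story.html&printer=1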
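####

### A standalone sketch of how HTML::Template fills the index
### from the array-of-hashes. The data is made up, but the param
### names (sitename, links, localfile, text) are the ones
### scrape() actually uses.

#!/usr/bin/perl
use strict;
use warnings;
use HTML::Template;

my @lines = split /^/, <<'TEMPLATE';
<h3><tmpl_var name="sitename"></h3>
<tmpl_loop name="links">
<p><a href="<tmpl_var name=localfile>"><tmpl_var name="text"></a></p>
</tmpl_loop>
TEMPLATE

### die_on_bad_params is off for the same reason as in scrape():
### each hash carries a 'url' key the template never mentions.
my $template = HTML::Template->new(
    arrayref          => \@lines,
    die_on_bad_params => 0,
);

$template->param(
    sitename => 'Example Site',
    links    => [
        {   text      => 'First story',
            url       => 'http://example.com/1',
            localfile => '000.html'
        },
        {   text      => 'Second story',
            url       => 'http://example.com/2',
            localfile => '001.html'
        },
    ],
);

print $template->output();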
####

package PDAScraper::YahooTV;

# PDAScraper.pm rules for scraping the
# Yahoo TV website

sub config {
    return {
        name       => 'Yahoo TV',
        start_from => 'http://news.yahoo.com/i/763',
        chunk_spec => [ "_tag", "div", "id", "indexstories" ],
        url_regex  => [ '/[^/]*$', '&printer=1' ]
    };
}

1;

####

package PDAScraper::Slate;

# PDAScraper.pm rules for scraping the
# Slate website

sub config {
    return {
        name       => 'Slate',
        start_from => 'http://www.slate.com/id/2065896/view/2057069/',
        url_regex  => [
            '/id/(\d+)/',
            sub { \ "/toolbar.aspx?action=print&id=$1" }
        ],
        # The literal markup that belongs around (.*?) was
        # eaten when this listing was rendered as HTML;
        # restore Slate's own start/end delimiters here.
        chunk_regex => qr{(.*?)}
    };
}

1;

####

package PDAScraper::NewScientist;

# PDAScraper.pm rules for scraping the
# New Scientist website

sub config {
    return {
        name       => 'New Scientist Headlines',
        start_from => 'http://www.newscientist.com/news.ns',
        chunk_spec => [ "_tag", "div", "id", "newslisting" ],
        url_regex  => [ '$', '&print=true' ]
    };
}

1;

####

#!/usr/bin/perl

use strict;
use warnings;

use PDAScraper;

use PDAScraper::YahooTV;
my $YM_Scraper = PDAScraper->new('PDAScraper::YahooTV') || die "$!";
$YM_Scraper->scrape();

use PDAScraper::Slate;
my $Slate_Scraper = PDAScraper->new('PDAScraper::Slate') || die "$!";
$Slate_Scraper->scrape();

use PDAScraper::NewScientist;
my $NS_Scraper = PDAScraper->new('PDAScraper::NewScientist') || die "$!";
$NS_Scraper->scrape();

####

package PDAScraper::Foo;

# PDAScraper.pm rules for scraping the
# Foo website

sub config {
    return {
        name => 'Foo',
        # Name of the website. Arbitrary text.

        start_from => 'http://www.foo.com/news/',
        # URL where the scraper should find the links.

        url_regex => [ '$', '&print=1' ],
        # This is the simple form of the url_regex, which
        # is used to change a regular link to a "print-friendly"
        # link. Simple because no backreferences are needed
        # on the right-hand side.

        # url_regex => [
        #     '/id/(\d+)/',
        #     sub { \ "/toolbar.aspx?action=print&id=$1" }
        # ],
        # This is the complex form of the url_regex, using
        # a sub to return the replacement because it needs
        # to evaluate a backreference, i.e. $1, $2 etc.

        chunk_spec => [ "_tag", "div", "id", "headlines" ],
        # A list of arguments to HTML::Element's look_down()
        # method. This one will return the first HTML::Element
        # object for a div tag whose "id" attribute has the
        # value "headlines".

        # If you can't use a chunk_spec, you'll have to use a
        # chunk_regex:
        chunk_regex => qr{(.*?)}s,
        # A regular expression which returns your desired
        # chunk of the page as $1. Using chunk_spec is better.
        # (As with the Slate rules above, the literal markup
        # around (.*?) was lost in HTML rendering; supply the
        # page's real start/end delimiters.)

        link_spec => [ sub { $_[0]->as_text ne 'FULL STORY' } ]
        # All links are grabbed from the page chunk by default,
        # but link_spec lets you add HTML::Element filtering --
        # here, for example, rejecting any link whose text is
        # "FULL STORY" -- and you could also reject links on
        # any attribute; see HTML::Element.
    };
}

1;
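####

### A standalone sketch of the two url_regex forms, using made-up
### URLs. The coderef form exists because a replacement mentioning
### $1 can't be stored in the config as a plain string ($1 would
### interpolate too early, before any match); storing a sub defers
### the interpolation until /e calls it, as in scrape() above.

#!/usr/bin/perl
use strict;
use warnings;

### Simple form: plain pattern and replacement strings.
my $simple = [ '$', '&print=1' ];
my $href1  = 'http://www.foo.com/news/today.html';
$href1 =~ s{$simple->[0]}{$simple->[1]};
print "$href1\n";
# prints: http://www.foo.com/news/today.html&print=1

### Complex form: the sub runs after the match, so $1 is set;
### it returns a string reference which /e dereferences.
my $complex = [
    '/id/(\d+)/',
    sub { \ "/toolbar.aspx?action=print&id=$1" }
];
my $href2 = 'http://www.slate.com/id/2065896/';
$href2 =~ s{$complex->[0]}{${ $complex->[1]->() }}e;
print "$href2\n";
# prints: http://www.slate.com/toolbar.aspx?action=print&id=2065896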