#!/usr/bin/perl package PDAScraper; use strict; use Exporter; use vars qw($VERSION @ISA @EXPORT); $VERSION = 0.00; @ISA = qw( Exporter ); @EXPORT = qw( &scrape ); use URI::URL; use HTML::TreeBuilder; use HTML::Template; use LWP::UserAgent; my $ua = LWP::UserAgent->new(); # $ua->proxy( ['http'], 'http://foo.bar:8080/' ); # If needed. ### Grab the template for the 'index.html' file. Stored ### in-module here, not necessarily the best way but it ### cuts down on external files. my @html_array = ; sub new { ### PerlMonk jdporter clued me in on how ### to Do This Properly. There are sub-modules ### which contain the rules for each website. ### Thanks J D. my ( $pkg, $rules ) = @_; bless { rules => $rules, }, $pkg; } sub scrape { my $self = shift; my $obj = $self->{'rules'}->config(); my $template = undef; my @all_links = (); my @good_links = (); my $tree = undef; my $chunk = undef; my $file_number = 0; print "getting " . $obj->{'name'} . $/; my $response = $ua->get( $obj->{'start_from'} ); ### get the front page which has the links unless ( $response->is_success() ) { warn "Failed to get starter page: $obj->{'start_from'}\n"; return; } else { print "Got starter page: $obj->{'start_from'}\n"; } my $page = $response->content(); if ( $obj->{'chunk_spec'} ) { ### if we're parsing the HTML the Good Way using TreeBuilder $chunk = HTML::TreeBuilder->new_from_content( $page ); $chunk->elementify() || die "$!"; $chunk = $chunk->look_down( @{ $obj->{'chunk_spec'} } ); } elsif ( $obj->{'chunk_regex'} ) { ### if we're parsing the HTML the Bad Way using a regex $page =~ $obj->{'chunk_regex'}; unless (defined $1){ print "Regex failed to match\n"; return; } $chunk = HTML::TreeBuilder->new_from_content( $1 ); $chunk->elementify(); } else { ### if we're just grabbing the whole page, ### probably not a good idea, but see ### link_spec below for a way of filtering links $chunk = HTML::TreeBuilder->new_from_content( $page ); $chunk->elementify(); } if ( defined( $obj->{'link_spec'} ) ) { ### If we've got a TreeBuilder link filter to grab only ### the links which match a certain format @all_links = $chunk->look_down( '_tag', 'a', @{ $obj->{'link_spec'} } ); } else { @all_links = $chunk->look_down( '_tag', 'a' ); } print "found " . scalar( @all_links ) . " links.\n"; for ( @all_links ) { ### Avoid three problem conditions -- no text means ### we've probably got image links (often duplicates) ### -- "#" as the href means a JavaScript link -- ### and tags with no HREF are also no use to us: next unless ( defined( $_->attr( 'href' ) ) && $_->as_text() ne '' && $_->attr( 'href' ) ne '#' ); my $href = $_->attr( 'href' ); ### It's expected that we'll need to transform ### the URL from regular to print-friendly: if ( defined( $obj->{'url_regex'} ) && ref( $obj->{'url_regex'}->[1] ) eq 'CODE' ) { ### PerlMonk Roy Johnson is my saviour here. ### Solution to the problem of some url regexes ### needing backreferences and some not. $href =~ s{$obj->{'url_regex'}->[0]} {${$obj->{'url_regex'}->[1]->()}}e; } elsif ( defined( $obj->{'url_regex'} ) ) { ### If there is a regex object at all: $href =~ s{$obj->{'url_regex'}->[0]} {$obj->{'url_regex'}->[1]}; } ### Transform the URL from relative to absolute: my $url = URI::URL->new( $href, $obj->{'start_from'} ); my $abs_url = $url->abs(); ### Make a data structure with all the stuff we're ### going to get on the next pass: push( @good_links, { text => $_->as_text(), url => "$abs_url" } ); } print "found " . scalar( @good_links ) . " 'good' links.\n"; if ( scalar( @good_links ) == 0 ) { print "No 'good' links found.\n"; return; } ( my $foldername = $obj->{'name'} ) =~ s/\W//g; ### Make a foldername with no non-word chars unless ( -e "$ENV{'HOME'}/scrape" ) { ### Make a scrape folder if there isn't one mkdir "$ENV{'HOME'}/scrape" || die "$!"; } unless ( -e "$ENV{'HOME'}/scrape/$foldername" ) { ### Make a folder for this content if there isn't one mkdir "$ENV{'HOME'}/scrape/$foldername" || die "$!"; } foreach ( @good_links ) { my $response = $ua->get( $_->{'url'} ); unless ( $response->is_success() ) { warn "didn't get " . $_->{'url'} . "$!" . $/; return; } else { print "got " . $_->{'url'} . $/; } my $page = $response->content(); ### TO DO: arbitrary number of further regexes ### in case users want to clean content up more? ### Filenames sprintf'd for neatness only: my $local_file = sprintf( "%03d.html", $file_number ); ### add a localfile value to the AoH for use in the index: $_->{localfile} = $local_file; ### Print out the actual content page locally: open( PAGE, ">$ENV{'HOME'}/scrape/$foldername/$local_file" ) || die "$!"; print PAGE $page; close( PAGE ); $file_number++; } ### [die_on_bad_params is off because the AoH contains ### one item we don't need, the original URL] $template = HTML::Template->new( arrayref => \@html_array, debug => 0, die_on_bad_params => 0 ); ### Use the name and the links array to fill out the template: $template->param( links => \@good_links, sitename => $obj->{'name'} ); ### Output the index page locally: open( INDEX, ">$ENV{'HOME'}/scrape/$foldername/index.html" ) || die "$!"; unless ( print INDEX $template->output() ) { print "Error in HTML::Template output\n"; return; } close( INDEX ); print "Finished scraping $obj->{'name'}\n\n"; ### Clean up after HTML::Tree as recommended $chunk->delete(); } 1; __DATA__ <tmpl_var name="sitename">