package PDAScraper::Foo;

# PDAScraper.pm rules for scraping the
# Foo website

sub config {
    return {
        name => 'Foo',
        # Name of the website. Arbitrary text.

        start_from => 'http://www.foo.com/news/',
        # URL where the scraper should find the links.

        url_regex => [ '$', '&print=1' ],
        # This is the simple form of the url_regex, which
        # is used to change a regular link to a "print-friendly"
        # link. Simple because there are no backreferences
        # needed on the RHS.

        # url_regex => [
        #     '/id/(\d+)/',
        #     sub { \ "/toolbar.aspx?action=print&id=$1" }
        # ],
        # This is the complex form of the url_regex, using
        # a sub as the replacement because it needs to evaluate
        # a backreference, i.e. $1, $2, etc.
        # (See the url_regex sketch after this listing.)

        chunk_spec => [ "_tag", "div", "id", "headlines" ],
        # A list of arguments to HTML::Element's look_down()
        # method. This one will return an HTML::Element object
        # matching the first div tag whose "id" attribute has
        # the value "headlines".
        # (See the look_down sketch after this listing.)

        # If you can't use a chunk_spec, you'll have to use a
        # chunk_regex:
        # chunk_regex => qr{<div id="headlines">(.*?)</div>}s,
        # A regular expression which returns your desired
        # chunk of the page as $1. Using chunk_spec is better.

        link_spec => [ sub { $_[0]->as_text ne 'FULL STORY' } ],
        # All links are grabbed from the page chunk by default,
        # but link_spec allows you to add HTML::Element
        # filtering: here, for example, rejecting links of the
        # form <a href="...">FULL STORY</a>, but you could also
        # reject them on any attribute; see HTML::Element.
    };
}

1;
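# --------------------------------------------------------------------
# Usage sketch (a separate script, not part of the module above).
# A minimal illustration of how a scraper *might* apply a url_regex
# [ pattern, replacement ] pair to turn a story link into its
# print-friendly form. The rewrite_url() helper and the sample URLs
# are hypothetical; PDAScraper's real dispatch code may differ (note
# that the commented-out complex form above returns a string
# reference via "\", while this sketch assumes a plain string).

use strict;
use warnings;

sub rewrite_url {
    my ( $url, $url_regex ) = @_;
    my ( $pattern, $replacement ) = @{$url_regex};

    if ( ref $replacement eq 'CODE' ) {
        # The sub runs after the match, so $1, $2, ... are available.
        $url =~ s/$pattern/$replacement->()/e;
    }
    else {
        # Plain-string replacement, e.g. appending '&print=1'.
        $url =~ s/$pattern/$replacement/;
    }
    return $url;
}

# Simple form: anchor on end-of-string, append a query parameter.
print rewrite_url(
    'http://www.foo.com/news/story.html?id=1234',
    [ '$', '&print=1' ]
), "\n";    # http://www.foo.com/news/story.html?id=1234&print=1

# Complex form: rebuild the path from the captured story id.
print rewrite_url(
    'http://www.foo.com/news/id/1234/',
    [ '/id/(\d+)/', sub { "/toolbar.aspx?action=print&id=$1" } ]
), "\n";    # http://www.foo.com/news/toolbar.aspx?action=print&id=1234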
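# --------------------------------------------------------------------
# Usage sketch (a separate script, not part of the module above).
# A minimal illustration of how chunk_spec and link_spec *might* be
# applied. The driver logic and the inline HTML are assumptions for
# demonstration only; HTML::TreeBuilder / HTML::Element and their
# new_from_content(), look_down(), attr() and as_text() methods are
# real CPAN API.

use strict;
use warnings;
use HTML::TreeBuilder;

my %config = (
    chunk_spec => [ "_tag", "div", "id", "headlines" ],
    link_spec  => [ sub { $_[0]->as_text ne 'FULL STORY' } ],
);

# Hypothetical page content standing in for a fetched news page.
my $html = <<'HTML';
<html><body>
  <div id="headlines">
    <a href="/news/story1.html">Big headline</a>
    <a href="/news/story1.html">FULL STORY</a>
  </div>
</body></html>
HTML

my $tree = HTML::TreeBuilder->new_from_content($html);

# chunk_spec is handed straight to look_down(): the first <div>
# whose "id" attribute is "headlines".
my $chunk = $tree->look_down( @{ $config{chunk_spec} } );

# Grab every link in the chunk, then let each link_spec sub veto
# links it doesn't want (here, the duplicate "FULL STORY" link).
my @links = $chunk->look_down( _tag => 'a' );
for my $filter ( @{ $config{link_spec} } ) {
    @links = grep { $filter->($_) } @links;
}

print $_->attr('href'), "\n" for @links;    # /news/story1.html (once)

$tree->delete;    # free the parse tree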