package PDAScraper::Foo; # PDAScraper.pm rules for scraping the # Foo website sub config { return { name => 'Foo', # Name of the website. Arbitrary text. start_from => 'http://www.foo.com/news/', # URL where the scraper should find the links. url_regex => [ '$', '&print=1' ], # This is the simple form of the url_regex, which # is used to change a regular link to a "print-friendly" # link. Simple because there are no backreferences # needed on the RHS. # url_regex => [ # '/id/(\d+)/', # sub { \ "/toolbar.aspx?action=print&id=$1" } # ], # This is the complex form of the url_regex, using # a sub to return because it needs to evaluate a # backreference i.e. $1, $2 etc. chunk_spec => [ "_tag", "div", "id", "headlines" ], # A list of arguments to HTML::Element's look_down() # method. This one will return an HTML::Element object # matching the first div tag having the attribute # "id" with value "headlines". # If you can't use a chunk_spec, you'll have to use a # chunk_regex: chunk_regex => qr{