in reply to Re: Re: Using URI::URL
in thread Using URI::URL
use strict;
use warnings;

use WWW::Mechanize;
use WWW::RobotRules;
use LWP::Simple;

my $SITE = 'http://www.unlocalhost.com';

my $rules      = WWW::RobotRules->new('bot/1.0');
my $robot_url  = "$SITE/robots.txt";
my $robot_data = LWP::Simple::get($robot_url);
$rules->parse($robot_url, $robot_data) if $robot_data;

for ('disallow.txt', 'allow.txt') {
    my $url = "$SITE/$_";
    if ($rules->allowed($url)) {
        my $mech = WWW::Mechanize->new;
        $mech->get($url);
        print "$url:\n", $mech->content;
    }
    else {
        print "$url:\ndenied\n";
    }
}
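Both the script above and the subclass below assume the site actually forbids one of the two files. A robots.txt along these lines would produce that split (hypothetical, since www.unlocalhost.com is only a stand-in host):

User-agent: *
Disallow: /disallow.txt

WWW::RobotRules matches these records against the agent name passed to new(), with '*' matching any agent.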
Or roll the same checks into a subclass, WWW::Mechanize::Polite:

package WWW::Mechanize::Polite;

use base 'WWW::Mechanize';
use WWW::RobotRules;

sub new {
    my $self = shift->SUPER::new(@_);
    $self->{robo_rules} = WWW::RobotRules->new($self->agent());
    return $self;
}

sub parse_robots {
    my ($self, $url) = @_;
    $self->get($url);
    $self->{robo_rules}->parse($url, $self->content);
}

sub polite_get {
    my ($self, $url) = @_;
    if ($self->{robo_rules}->allowed($url)) {
        $self->get($url);
    }
    else {
        undef $self->{content};
    }
}

1;
And a client script:

use WWW::Mechanize::Polite;

my $SITE = 'http://www.unlocalhost.com';
my $mech = WWW::Mechanize::Polite->new;

$mech->parse_robots("$SITE/robots.txt");

for ('allow.txt', 'disallow.txt') {
    my $url = "$SITE/$_";
    $mech->polite_get($url);
    print "$url:\n", $mech->content ? $mech->content : "denied\n";
}
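With that hypothetical robots.txt in place, the client script should fetch allow.txt and fall through to the "denied" branch for the other URL, roughly:

http://www.unlocalhost.com/allow.txt:
... contents of allow.txt ...
http://www.unlocalhost.com/disallow.txt:
denied

The denial shows up because polite_get() clears the cached $self->{content}, so content() comes back false in the ternary.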
jeffa
L-LL-L--L-LL-L--L-LL-L-- -R--R-RR-R--R-RR-R--R-RR B--B--B--B--B--B--B--B-- H---H---H---H---H---H--- (the triplet paradiddle with high-hat)