use strict;
use warnings;
use WWW::Mechanize;
use WWW::RobotRules;
use LWP::Simple;

my $SITE = 'http://www.unlocalhost.com';

# fetch and parse the site's robots.txt
my $rules      = WWW::RobotRules->new('bot/1.0');
my $robot_url  = "$SITE/robots.txt";
my $robot_data = LWP::Simple::get($robot_url);
$rules->parse($robot_url, $robot_data) if $robot_data;

# only fetch a URL if the rules allow it
for ('disallow.txt', 'allow.txt') {
    my $url = "$SITE/$_";
    if ($rules->allowed($url)) {
        my $mech = WWW::Mechanize->new;
        $mech->get($url);
        print "$url:\n", $mech->content;
    }
    else {
        print "$url:\ndenied\n";
    }
}
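For reference, both scripts assume the test site serves a robots.txt along these lines (unlocalhost.com is just a stand-in host, and this file is only a guess at what the test used: disallow.txt blocked, allow.txt left open):

User-agent: *
Disallow: /disallow.txt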
package WWW::Mechanize::Polite;
use strict;
use warnings;
use base 'WWW::Mechanize';
use WWW::RobotRules;

sub new {
    my $self = shift->SUPER::new(@_);
    # build the rules object with the same agent string the mech uses
    $self->{robo_rules} = WWW::RobotRules->new($self->agent());
    return $self;
}

sub parse_robots {
    my ($self, $url) = @_;
    $self->get($url);
    $self->{robo_rules}->parse($url, $self->content);
}

sub polite_get {
    my ($self, $url) = @_;
    if ($self->{robo_rules}->allowed($url)) {
        $self->get($url);
    }
    else {
        # disallowed -- clear the content so the caller can tell
        undef $self->{content};
    }
}

1;

And a client script:

use strict;
use warnings;
use WWW::Mechanize::Polite;

my $SITE = 'http://www.unlocalhost.com';
my $mech = WWW::Mechanize::Polite->new;
$mech->parse_robots("$SITE/robots.txt");

for ('allow.txt', 'disallow.txt') {
    my $url = "$SITE/$_";
    $mech->polite_get($url);
    print "$url:\n", $mech->content ? $mech->content : "denied\n";
}
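Assuming disallow.txt is blocked by robots.txt as in the sample above, the client script would print something like this (the allow.txt body obviously depends on the site):

http://www.unlocalhost.com/allow.txt:
<contents of allow.txt>
http://www.unlocalhost.com/disallow.txt:
denied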
jeffa
L-LL-L--L-LL-L--L-LL-L-- -R--R-RR-R--R-RR-R--R-RR B--B--B--B--B--B--B--B-- H---H---H---H---H---H--- (the triplet paradiddle with high-hat)