use strict;
use warnings;

use WWW::Mechanize;
use WWW::RobotRules;
use LWP::Simple;

my $SITE = 'http://www.unlocalhost.com';

# Fetch the site's robots.txt and feed it to WWW::RobotRules.
my $rules      = WWW::RobotRules->new('bot/1.0');
my $robot_url  = "$SITE/robots.txt";
my $robot_data = LWP::Simple::get($robot_url);
$rules->parse($robot_url, $robot_data) if $robot_data;

# Fetch each test page only if the rules allow it.
for ('disallow.txt', 'allow.txt') {
    my $url = "$SITE/$_";
    if ($rules->allowed($url)) {
        my $mech = WWW::Mechanize->new;
        $mech->get($url);
        print "$url:\n", $mech->content;
    }
    else {
        print "$url:\ndenied\n";
    }
}

####

package WWW::Mechanize::Polite;
use strict;
use warnings;

use base 'WWW::Mechanize';
use WWW::RobotRules;

# Build a normal WWW::Mechanize object, plus a rule set keyed to our agent name.
sub new {
    my $self = shift->SUPER::new(@_);
    $self->{robo_rules} = WWW::RobotRules->new( $self->agent() );
    return $self;
}

# Fetch and parse robots.txt, remembering its rules for later requests.
sub parse_robots {
    my ($self, $url) = @_;
    $self->get($url);
    $self->{robo_rules}->parse($url, $self->content);
}

# Fetch a URL only if robots.txt allows it; otherwise clear any old content.
sub polite_get {
    my ($self, $url) = @_;
    if ($self->{robo_rules}->allowed($url)) {
        $self->get($url);
    }
    else {
        undef $self->{content};
    }
}

1;

####

use strict;
use warnings;

use WWW::Mechanize::Polite;

my $SITE = 'http://www.unlocalhost.com';
my $mech = WWW::Mechanize::Polite->new;
$mech->parse_robots("$SITE/robots.txt");

for ('allow.txt', 'disallow.txt') {
    my $url = "$SITE/$_";
    $mech->polite_get($url);
    print "$url:\n", $mech->content ? $mech->content : "denied\n";
}
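
For context, both scripts assume www.unlocalhost.com serves a robots.txt that blocks disallow.txt while leaving allow.txt fetchable. A minimal, hypothetical robots.txt expressing that policy (not part of the original listings) would be:

# Hypothetical robots.txt for the example site: any agent may fetch
# allow.txt, but disallow.txt is off limits.
User-agent: *
Disallow: /disallow.txt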