use strict;
use warnings;
use WWW::Mechanize;
use WWW::RobotRules;
use LWP::Simple;
my $SITE = 'http://www.unlocalhost.com';
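# Fetch the site's robots.txt and hand it to WWW::RobotRules so we can
# ask which URLs our bot ('bot/1.0') is allowed to request.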
my $rules = WWW::RobotRules->new('bot/1.0');
my $robot_url = "$SITE/robots.txt";
my $robot_data = LWP::Simple::get($robot_url);
$rules->parse($robot_url, $robot_data) if $robot_data;
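# Try one URL that robots.txt should forbid and one it should allow;
# fetch only the URLs the rules permit.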
for ('disallow.txt', 'allow.txt') {
    my $url = "$SITE/$_";
    if ($rules->allowed($url)) {
        my $mech = WWW::Mechanize->new;
        $mech->get($url);
        print "$url:\n", $mech->content;
    }
    else {
        print "$url:\ndenied\n";
    }
}
####
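# WWW::Mechanize::Polite: a WWW::Mechanize subclass that keeps its own
# WWW::RobotRules object and refuses to fetch URLs that robots.txt forbids.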
package WWW::Mechanize::Polite;
use strict;
use warnings;
use base 'WWW::Mechanize';
use WWW::RobotRules;
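# Build the underlying Mechanize object as usual, then attach a RobotRules
# parser keyed to this agent's user-agent string.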
sub new {
    my $self = shift->SUPER::new(@_);
    $self->{robo_rules} = WWW::RobotRules->new($self->agent());
    return $self;
}
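# Fetch a robots.txt URL and load its rules into the RobotRules object.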
sub parse_robots {
    my ($self, $url) = @_;
    $self->get($url);
    $self->{robo_rules}->parse($url, $self->content);
}
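# Fetch a URL only if robots.txt allows it; otherwise clear the stored
# page content so callers can tell the request was refused.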
sub polite_get {
    my ($self, $url) = @_;
    if ($self->{robo_rules}->allowed($url)) {
        $self->get($url);
    }
    else {
        # Reach into Mechanize's internal {content} slot so that content()
        # returns a false value for a disallowed URL.
        undef $self->{content};
    }
}
1;
####
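# The same two test fetches, this time through the polite subclass,
# which performs the robots.txt check itself.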
use strict;
use warnings;
use WWW::Mechanize::Polite;
my $SITE = 'http://www.unlocalhost.com';
my $mech = WWW::Mechanize::Polite->new;
$mech->parse_robots("$SITE/robots.txt");
for ('allow.txt', 'disallow.txt') {
    my $url = "$SITE/$_";
    $mech->polite_get($url);
    print "$url:\n", $mech->content ? $mech->content : "denied\n";
}