#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI;
use HTTP::Request;
my $site = shift @ARGV; # Why not just take a URI as the argument?
my $domain = shift @ARGV; # Never used below; the crawl is limited to $first_uri's site instead.

# $site should really be a URI already.
# Assume it's an HTTP URL if no scheme is given.
if ($site !~ /^\w+:/) {
    $site = "http://$site";
}
my $first_uri = URI->new($site)->canonical;
my @to_visit = ( $first_uri );
my %seen = map { $_ => 1 } @to_visit;
my $ua = LWP::UserAgent->new();
while (@to_visit) {
    my $uri = shift(@to_visit);

    # Make the parser.
    my $p = HTML::LinkExtor->new(sub {
        my ($tag, %attr) = @_;

        # Only interested in A elements.
        return if $tag ne 'a';

        # Only interested in the HREF attribute.
        return if not exists $attr{href};

        my $link_uri = URI->new_abs($attr{href}, $uri)->canonical;

        # Ignore links outside of the domain.
        return if $link_uri->rel($first_uri) eq $link_uri;

        # Ignore links already in the queue and links already visited.
        return if $seen{$link_uri}++;

        push @to_visit, $link_uri;
    });

    my $response = $ua->request(
        HTTP::Request->new(GET => $uri),
        sub { $p->parse($_[0]) },
    );
    $p->eof;
}
# %seen now holds every link found within the site.
my @links = keys %seen;
print "$_\n" for sort @links;
You should use LWP::RobotUA instead of LWP::UserAgent for this kind of application, so that the crawler honours robots.txt and throttles its own requests.
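As a minimal sketch of that substitution (the agent name, contact address, and one-second delay below are placeholder values, not anything taken from the script above):

use LWP::RobotUA;

# LWP::RobotUA is a drop-in subclass of LWP::UserAgent that honours
# robots.txt and rate-limits requests to each host.
my $ua = LWP::RobotUA->new(
    agent => 'MySpider/0.1',           # hypothetical robot name
    from  => 'webmaster@example.com',  # hypothetical contact address
);
$ua->delay(1/60);  # minimum delay between requests, in minutes (here: one second)

The rest of the script can stay the same, since every LWP::UserAgent method used above works unchanged on an LWP::RobotUA object.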
Untested.
Updated: The call to request was accidentally removed! Oops. Re-added. Added $p->eof.