[skipping ~http://www.sun.com/]
[skipping ]
####
!/usr/bin/perl
use lib "/perl/bin";
use LWP::UserAgent;
use HTML::Parser;
use URI::URL;
## begin configure
@CHECK = # list of initial starting points
qw(~http://www.sun.com/ );
sub PARSE { # verify existance, parse for further URLs
## $_[0] is the absolute URL
$_[0] =~ m!^\.(teleport|stonehenge)\.com/~merlyn! and not
$_[0] =~ /refindex/;
}
sub PING { # verify existence, but don't parse
## $_[0] is the absolute URL
$_[0] =~ m!^(http):!;
}
## end configure
{
package ParseLink;
@ISA = qw(HTML::Parser);
sub start { # called by parse
my $this = shift;
my ($tag, $attr) = @_;
if ($tag eq "a") {
$this->{links}{$attr->{href}}++;
} elsif ($tag eq "img") {
$this->{links}{$attr->{src}}++;
}
}
sub get_links {
my $this = shift;
sort keys %{$this->{links}};
}
} # end of ParseLink
$ua = new LWP::UserAgent;
$ua->agent("hverify/1.0");
$ua->env_proxy;
$| = 1;
MAINLOOP:
while ($thisurl = shift @CHECK) {
$thisurl =~ s/%7e/~/ig; # ugh :-)
next if $did{$thisurl}++;
if (PARSE $thisurl) {
warn "fetching $thisurl\n";
$request = HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
unless ($response->is_success) {
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
next MAINLOOP unless $response->content_type =~ /text\/html/i;
$base = $response->base;
my $p = ParseLink->new;
$p->parse($response->content);
$p->parse(undef);
for $link ($p->get_links) {
$abs = url($link, $base)->abs;
warn "... $link => $abs\n";
push(@CHECK, $abs);
}
next MAINLOOP;
}
if (PING $thisurl) {
warn "verifying $thisurl\n";
for $method (qw(HEAD GET)) {
$request = HTTP::Request ($method,$thisurl);
$response = $ua->request($request); # fetch!
next MAINLOOP if $response->is_success; # ok
}
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
warn "[skipping $thisurl]\n";
}