Appreciate the suggestion.
No, the code I posted would not compile and would fail under use strict, because it was not all of the code, just the heart of it. Sorry, I should have explained that; I was trying to keep the amount of code to review short.
Here is the full code, if you want to see it all (you can see why I truncated it):
#!/usr/bin/perl -Tw
package LinkExtractor;
use strict;
use HTML::Parser;
use LWP::UserAgent;
use URI::URL;
my %opts;
my $startPoint;
my @CHECK;
my $abs;
my $uri;
my $link;
my $base;
my $domain;
# verify existence, parse for further URLs
# The absolute URL is passed into PARSE
sub PARSE
{
my ($url, $domain) = @_;
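# Returns 1 only for http:// URLs that contain the target domain and that
# do not look like images (anything containing "gif" or ending in "jpg").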
if ($url =~ /^http\:\/\/.*($domain)/ and not $url =~ m!gif|jpg$!)
{
return 1;
}
return 0;
}
# Check for existence of the URL but do not parse
# The absolute URL is passed into PING
sub PING {
my ($url) = shift;
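# the pattern match below is the return value: true for http URLs, false otherwise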
$url =~ m!^http:!;
}
# start of ParseLink
{
package ParseLink;
our @ISA = qw(HTML::Parser);
# called by parse
sub start
{
# turn off warnings because the variable $this->{links}{$attr->{href}}
# cannot really be initialized as far as I can see
$^W = 0;
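# (An untested alternative sketch: guarding the assignments instead, e.g.
#    $this->{links}{$attr->{href}} = 1 if defined $attr->{href};
#  should avoid the uninitialized-value warnings without toggling $^W.)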
#my $this = shift;
my ($this, $tag, $attr) = @_;
if ($tag eq "a")
{
$this->{links}{$attr->{href}} = 1;
}
elsif ($tag eq "img")
{
$this->{links}{$attr->{src}} = 1;
}
elsif ($tag eq 'frame')
{
$this->{links}{$attr->{src}} = 1;
}
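# a <meta http-equiv="refresh" content="N; url=..."> tag also counts as a link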
elsif ($tag eq 'meta')
{
if ($attr->{'http-equiv'} && $attr->{'http-equiv'} eq 'refresh')
{
my $refresh_time;
my $url;
($refresh_time, $url) = split(/\;/, $attr->{'content'});
$url =~ s/^.*\=//;
$this->{links}{$url} = 1;
}
}
$^W = 1;
}
sub get_links
{
my $this = shift;
sort keys %{$this->{links}};
}
}
# end of ParseLink
sub extractLinks
{
my $ua = new LWP::UserAgent;
my @return;
my %skippedurl;
my %badurl;
my %goodurl;
my %links;
my $badurls = 0;
my $goodurls = 0;
my $thisurl;
my %did;
my $response;
my $request;
my $method;
my $startPoint = shift || die __FILE__ . ": No starting point set.\n";
my $debug = shift;
push (@CHECK, "$startPoint");
$domain = new URI($startPoint);
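# crude scheme check: only something that looks like an http/ftp URL has a host to extract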
if ($startPoint =~ /^.*tp/)
{
$domain = $domain->host;
}
else
{
return 0;
}
$ua->agent("LinkExtractor/1.0");
$ua->env_proxy;
$| = 1;
{
MAINLOOP:
while ($thisurl = shift @CHECK)
{
my $testnum = scalar(@CHECK);
print "CHECK: $testnum\n";
if ($debug)
{
print "processing $thisurl, $domain\n";
}
$thisurl =~ s/%7e/~/ig; # convert encoded tilde to usable tilde
next if $did{$thisurl}++;
if (PARSE($thisurl,$domain))
{
$goodurl{$thisurl}++;
if ($debug)
{
print "GOODURL: $thisurl\n";
}
if ($debug)
{
warn "fetching $thisurl\n";
}
$request = new HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
unless ($response->is_success)
{
$badurl{$thisurl} = " status ,$response->code, $response->me
+ssage,";
$badurls = 1;
if ($debug)
{
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
}
next MAINLOOP;
}
unless ($response->content_type =~ /text\/html/i)
{
next MAINLOOP;
}
$base = $response->base;
my $p = ParseLink->new;
$p->parse($response->content);
# parse() wants to be passed undef to signal end of input, but that
# triggered a warning, so an empty string is passed instead
$p->parse(""); # signal end of document
for $link ($p->get_links)
{
$abs = url($link, $base)->abs;
if ($debug)
{
warn "... $link => $abs\n";
}
if (PARSE($abs,$domain))
{
print "pushing: $abs\n";
push(@CHECK, $abs);
push(@{ $links{$thisurl} }, $abs);
}
}
next MAINLOOP; # done with this page; only unparsed URLs fall through to PING
}
if (PING $thisurl)
{
if ($debug)
{
warn "verifying $thisurl\n";
}
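# try a cheap HEAD request first, fall back to a full GET if that fails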
for $method (qw(HEAD GET))
{
$request = new HTTP::Request($method,$thisurl);
$response = $ua->request($request); # fetch!
if ($response->is_success)
{
next MAINLOOP;
}
}
if ($debug)
{
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
}
$badurl{$thisurl} = "status ,$response->code, $response->mes
+sage";
$badurls = 1;
next MAINLOOP;
}
if ($debug)
{
warn "[skipping $thisurl]\n";
}
$skippedurl{$thisurl} = 1;
}
push(@return, \%links);
push(@return, \%goodurl);
push(@return, \%badurl);
return \@return;
}
}
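In case it helps to see it in use, here is a minimal sketch of how extractLinks can be called and its return value unpacked (the file name, the example URL, and the debug flag below are only placeholders, not part of the real setup):
#!/usr/bin/perl -w
use strict;
# load the package; assumes this file is saved as LinkExtractor.pl
do "./LinkExtractor.pl";
die "could not load LinkExtractor.pl: $@" if $@;
my $result = LinkExtractor::extractLinks("http://www.example.com/", 1);
my ($links, $goodurl, $badurl) = @$result;
print "good URLs: ", scalar(keys %$goodurl), "\n";
print "bad URLs:  ", scalar(keys %$badurl), "\n";
for my $page (sort keys %$links)
{
print "$page\n";
print " -> $_\n" for @{ $links->{$page} };
}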
I admit it, I am Paco.