in reply to Re: Link Extrator Eating Up All My Memory
in thread Link Extrator Eating Up All My Memory

Appreciate the suggestion.

No, the code would not compile and would fail under use strict because this is not all the code just the heart of it. Sorry, I should have explained that. I was trying to keep the amount of code to review short.

Here is the full code, if you want to see it all (you can see why I truncated it):

#!/usr/bin/perl -Tw package LinkExtractor; use strict; use HTML::Parser; use LWP::UserAgent; use URI::URL; my %opts; my $startPoint; my @CHECK; my $abs; my $uri; my $link; my $base; my $domain; # verify existance, parse for further URLs # The absolution URL is passed into PARSE sub PARSE { my ($url, $domain) = @_; if ($url =~ /^http\:\/\/.*($domain)/ and not $url =~ m!gif|jpg$!) { return 1; } return 0; } # Check for existence of the URL but do not parse # The absolution URL is passed into PING sub PING { my ($url) = shift; $url =~ m!^http:!; } # start of ParseLink { package ParseLink; our @ISA = qw(HTML::Parser); # called by parse sub start { # turn off warn because the variable: $this->{links}{$attr->{hre +f}} cannot really # be initialized as far as I can see $^W = 0; #my $this = shift; my ($this, $tag, $attr) = @_; if ($tag eq "a") { $this->{links}{$attr->{href}} = 1; } elsif ($tag eq "img") { $this->{links}{$attr->{src}} = 1; } elsif ($tag eq 'frame') { $this->{links}{$attr->{src}} = 1; } elsif ($tag eq 'meta') { if ($attr->{'http-equiv'} && $attr->{'http-equiv'} eq 'refresh') { my $refresh_time; my $url; ($refresh_time, $url) = split(/\;/, $attr->{'content'}); $url =~ s/^.*\=//; $this->{links}{$url} = 1; } } $^W = 1; } sub get_links { my $this = shift; sort keys %{$this->{links}}; } } # end of ParseLink sub extractLinks { my $ua = new LWP::UserAgent; my @return; my %skippedurl; my %badurl; my %goodurl; my %links; my $badurls = 0; my $goodurls = 0; my $thisurl; my %did; my $response; my $request; my $method; my $startPoint = shift || die "__FILE__: No starting point set.\n" +; my $debug = shift; push (@CHECK, "$startPoint"); $domain = new URI($startPoint); if ($startPoint =~ /^.*tp/) { $domain = $domain->host; } else { return 0; } $ua->agent("LinkExtractor/1.0"); $ua->env_proxy; $| = 1; { MAINLOOP: while ($thisurl = shift @CHECK) { my $testnum = scalar(@CHECK); print "CHECK: $testnum\n"; if ($debug) { print "processing $thisurl, $domain\n"; } $thisurl =~ s/%7e/~/ig; # convert encoded tilde to usable tilde next if $did{$thisurl}++; if (PARSE($thisurl,$domain)) { $goodurl{$thisurl}++; if ($debug) { print "GOODURL: $thisurl\n"; } if ($debug) { warn "fetching $thisurl\n"; } $request = new HTTP::Request('GET',$thisurl); $response = $ua->request($request); # fetch! unless ($response->is_success) { $badurl{$thisurl} = " status ,$response->code, $response->me +ssage,"; $badurls = 1; if ($debug) { warn "Cannot fetch $thisurl (status ", $response->code, " ", $response->message,")\n"; } next MAINLOOP; } unless ($response->content_type =~ /text\/html/i) { next MAINLOOP; } $base = $response->base; my $p = ParseLink->new; $p->parse($response->content); # parse calls for a passing of undef but triggered a warn # so passed an empty string instead to get rid of the warnin +g msg $p->parse(""); # parse calls for a passing of undef but tri +ggered a warn for $link ($p->get_links) { $abs = url($link, $base)->abs; if ($debug) { warn "... $link => $abs\n"; } if (PARSE($abs,$domain)) { print "pushing: $abs\n"; push(@CHECK, $abs); push(@{ $links{$thisurl} }, $abs); } } } next MAINLOOP; if (PING $thisurl) { if ($debug) { warn "verifying $thisurl\n"; } for $method (qw(HEAD GET)) { $request = new HTTP::Request($method,$thisurl); $response = $ua->request($request); # fetch! if ($response->is_success) { next MAINLOOP; } } if ($debug) { warn "Cannot fetch $thisurl (status ", $response->code, " ", $response->message,")\n"; } $badurl{$thisurl} = "status ,$response->code, $response->mes +sage"; $badurls = 1; next MAINLOOP; } if ($debug) { warn "[skipping $thisurl]\n"; } $skippedurl{$thisurl} = 1; } push(@return, \%links); push(@return, \%goodurl); push(@return, \%badurl); return \@return; } }


I admit it, I am Paco.