in reply to Link Extrator Eating Up All My Memory

Where is the PARSE() subroutine and what does it do? Also, your code fails when I try to 'perl -d' it with 'use strict'. However, that's a minor issue, since most of the warnings are for undeclared variables. ;)

In any event, I suggest you use the HTML::SimpleLinkExtor module instead. From what I've seen of your code, this module should, hopefully, do the job. Remember, there's always a module for every task you undertake! ;-)
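Something along these lines ought to be close (untested, so treat it as a sketch; the URL is just a placeholder, and I'm going from memory that passing a base URL to new() makes the extracted links absolute):

#!/usr/bin/perl
use strict;
use warnings;

use HTML::SimpleLinkExtor;
use LWP::UserAgent;
use HTTP::Request;

my $url = 'http://www.example.com/';    # placeholder starting page

my $ua       = LWP::UserAgent->new;
my $request  = HTTP::Request->new(GET => $url);
my $response = $ua->request($request);
die "Cannot fetch $url: ", $response->status_line, "\n"
    unless $response->is_success;

# passing the base URL to new() should resolve relative links to absolute ones
my $extor = HTML::SimpleLinkExtor->new($response->base);
$extor->parse($response->content);

my @all_links = $extor->links;    # every link found
my @hrefs     = $extor->a;        # just the <a href="..."> targets
my @images    = $extor->img;      # just the <img src="..."> targets

print "$_\n" for @hrefs;

The per-tag accessors save you from writing an HTML::Parser subclass by hand.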

"There is no system but GNU, and Linux is one of its kernels." -- Confession of Faith

Re: Re: Link Extrator Eating Up All My Memory
by jonjacobmoon (Pilgrim) on May 09, 2002 at 23:28 UTC
    Appreciate the suggestion.

    No, the code would not compile and would fail under use strict because this is not all the code, just the heart of it. Sorry, I should have explained that; I was trying to keep the amount of code to review short.

    Here is the full code, if you want to see it all (you can see why I truncated it):

#!/usr/bin/perl -Tw
package LinkExtractor;

use strict;
use HTML::Parser;
use LWP::UserAgent;
use URI::URL;

my %opts;
my $startPoint;
my @CHECK;
my $abs;
my $uri;
my $link;
my $base;
my $domain;

# verify existance, parse for further URLs
# The absolution URL is passed into PARSE
sub PARSE {
    my ($url, $domain) = @_;
    if ($url =~ /^http\:\/\/.*($domain)/ and not $url =~ m!gif|jpg$!) {
        return 1;
    }
    return 0;
}

# Check for existence of the URL but do not parse
# The absolution URL is passed into PING
sub PING {
    my ($url) = shift;
    $url =~ m!^http:!;
}

# start of ParseLink
{
    package ParseLink;
    our @ISA = qw(HTML::Parser);

    # called by parse
    sub start {
        # turn off warn because the variable: $this->{links}{$attr->{href}} cannot really
        # be initialized as far as I can see
        $^W = 0;
        #my $this = shift;
        my ($this, $tag, $attr) = @_;
        if ($tag eq "a") {
            $this->{links}{$attr->{href}} = 1;
        } elsif ($tag eq "img") {
            $this->{links}{$attr->{src}} = 1;
        } elsif ($tag eq 'frame') {
            $this->{links}{$attr->{src}} = 1;
        } elsif ($tag eq 'meta') {
            if ($attr->{'http-equiv'} && $attr->{'http-equiv'} eq 'refresh') {
                my $refresh_time;
                my $url;
                ($refresh_time, $url) = split(/\;/, $attr->{'content'});
                $url =~ s/^.*\=//;
                $this->{links}{$url} = 1;
            }
        }
        $^W = 1;
    }

    sub get_links {
        my $this = shift;
        sort keys %{$this->{links}};
    }
} # end of ParseLink

sub extractLinks {
    my $ua = new LWP::UserAgent;
    my @return;
    my %skippedurl;
    my %badurl;
    my %goodurl;
    my %links;
    my $badurls = 0;
    my $goodurls = 0;
    my $thisurl;
    my %did;
    my $response;
    my $request;
    my $method;
    my $startPoint = shift || die "__FILE__: No starting point set.\n";
    my $debug = shift;

    push(@CHECK, "$startPoint");
    $domain = new URI($startPoint);
    if ($startPoint =~ /^.*tp/) {
        $domain = $domain->host;
    } else {
        return 0;
    }

    $ua->agent("LinkExtractor/1.0");
    $ua->env_proxy;
    $| = 1;

    {
        MAINLOOP:
        while ($thisurl = shift @CHECK) {
            my $testnum = scalar(@CHECK);
            print "CHECK: $testnum\n";
            if ($debug) { print "processing $thisurl, $domain\n"; }
            $thisurl =~ s/%7e/~/ig;    # convert encoded tilde to usable tilde
            next if $did{$thisurl}++;
            if (PARSE($thisurl, $domain)) {
                $goodurl{$thisurl}++;
                if ($debug) { print "GOODURL: $thisurl\n"; }
                if ($debug) { warn "fetching $thisurl\n"; }
                $request  = new HTTP::Request('GET', $thisurl);
                $response = $ua->request($request);    # fetch!
                unless ($response->is_success) {
                    $badurl{$thisurl} = " status ,$response->code, $response->message,";
                    $badurls = 1;
                    if ($debug) {
                        warn "Cannot fetch $thisurl (status ",
                            $response->code, " ", $response->message, ")\n";
                    }
                    next MAINLOOP;
                }
                unless ($response->content_type =~ /text\/html/i) {
                    next MAINLOOP;
                }
                $base = $response->base;
                my $p = ParseLink->new;
                $p->parse($response->content);
                # parse calls for a passing of undef but triggered a warn
                # so passed an empty string instead to get rid of the warning msg
                $p->parse("");    # parse calls for a passing of undef but triggered a warn
                for $link ($p->get_links) {
                    $abs = url($link, $base)->abs;
                    if ($debug) { warn "... $link => $abs\n"; }
                    if (PARSE($abs, $domain)) {
                        print "pushing: $abs\n";
                        push(@CHECK, $abs);
                        push(@{ $links{$thisurl} }, $abs);
                    }
                }
            }
            next MAINLOOP;
            if (PING $thisurl) {
                if ($debug) { warn "verifying $thisurl\n"; }
                for $method (qw(HEAD GET)) {
                    $request  = new HTTP::Request($method, $thisurl);
                    $response = $ua->request($request);    # fetch!
                    if ($response->is_success) { next MAINLOOP; }
                }
                if ($debug) {
                    warn "Cannot fetch $thisurl (status ",
                        $response->code, " ", $response->message, ")\n";
                }
                $badurl{$thisurl} = "status ,$response->code, $response->message";
                $badurls = 1;
                next MAINLOOP;
            }
            if ($debug) { warn "[skipping $thisurl]\n"; }
            $skippedurl{$thisurl} = 1;
        }
        push(@return, \%links);
        push(@return, \%goodurl);
        push(@return, \%badurl);
        return \@return;
    }
}
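
    One thing that probably doesn't help the memory situation: every link that passes PARSE gets pushed onto @CHECK even if it is already queued, since %did only filters URLs as they come off the queue. A rough sketch of filtering before the push (%queued is a new hash, not in the code above; the rest reuses the variable names from extractLinks):

# %queued would be declared once, alongside %did, and remembers
# URLs that have already been pushed onto @CHECK
my %queued;

for $link ($p->get_links) {
    $abs = url($link, $base)->abs;
    if ($debug) { warn "... $link => $abs\n"; }
    # only queue a URL the first time it is seen
    if (PARSE($abs, $domain) and not $queued{$abs}++) {
        print "pushing: $abs\n";
        push(@CHECK, $abs);
        push(@{ $links{$thisurl} }, $abs);
    }
}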


    I admit it, I am Paco.