jonjacobmoon has asked for the wisdom of the Perl Monks concerning the following question:

Been a while. I have been away working on non-Perl projects, but I am back with a problem that has plagued me for a week, so I am at the point where I need a new set of eyes. Perhaps I am missing something obvious.

Sorry if the code is a little long. Some of you may recognize it as a modified routine that Merlyn wrote some time ago. What is happening is that memory usage keeps getting eaten up, and on a particularly large web site the program kills itself before it finishes extracting all the links. I have tried a few things, but frankly I am not even sure where to start. Could this be a memory leak in HTML::Parser or Perl 5.6.1? I am not sure it is, because at several points the memory usage goes down slightly, which suggests it is not really a leak.
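One way to narrow down which structure is actually growing would be to instrument the main loop of the code below, something like this rough sketch (not part of the original program, and assuming the Devel::Size module is available):

use Devel::Size qw(total_size);

# inside the main URL loop, once per URL processed:
printf "queue: %d urls, seen: %d urls, links table: %d bytes\n",
    scalar(@CHECK), scalar(keys %did), total_size(\%links);

If the queue of pending URLs and the links table keep growing while everything else stays roughly flat, that would point toward accumulation in those structures rather than a leak in HTML::Parser.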

Here is the code:

sub extractLinks {
    my $ua = new LWP::UserAgent;
    my @return;
    my %skippedurl;
    my %badurl;
    my %goodurl;
    my %links;
    my $badurls  = 0;
    my $goodurls = 0;
    my $thisurl;
    my %did;
    my $response;
    my $request;
    my $method;
    my $startPoint = shift || die "__FILE__: No starting point set.\n";
    my $debug = shift;

    push (@CHECK, "$startPoint");
    $domain = new URI($startPoint);
    if ($startPoint =~ /^.*tp/) {
        $domain = $domain->host;
    } else {
        return 0;
    }
    $ua->agent("LinkExtractor/1.0");
    $ua->env_proxy;
    $| = 1;

    {
      MAINLOOP:
        while ($thisurl = shift @CHECK) {
            my $testnum = scalar(@CHECK);
            print "CHECK: $testnum\n";
            if ($debug) { print "processing $thisurl, $domain\n"; }
            $thisurl =~ s/%7e/~/ig;    # convert encoded tilde to usable tilde
            next if $did{$thisurl}++;
            if (PARSE($thisurl, $domain)) {
                $goodurl{$thisurl}++;
                if ($debug) { print "GOODURL: $thisurl\n"; }
                if ($debug) { warn "fetching $thisurl\n"; }
                $request  = new HTTP::Request('GET', $thisurl);
                $response = $ua->request($request);    # fetch!
                unless ($response->is_success) {
                    $badurl{$thisurl} = " status ,$response->code, $response->message,";
                    $badurls = 1;
                    if ($debug) {
                        warn "Cannot fetch $thisurl (status ",
                            $response->code, " ", $response->message, ")\n";
                    }
                    next MAINLOOP;
                }
                unless ($response->content_type =~ /text\/html/i) {
                    next MAINLOOP;
                }
                $base = $response->base;
                my $p = ParseLink->new;
                $p->parse($response->content);
                # parse calls for a passing of undef but triggered a warn
                # so passed an empty string instead to get rid of the warning msg
                $p->parse("");    # parse calls for a passing of undef but triggered a warn
                for $link ($p->get_links) {
                    $abs = url($link, $base)->abs;
                    if ($debug) { warn "... $link => $abs\n"; }
                    if (PARSE($abs, $domain)) {
                        print "pushing: $abs\n";
                        push(@CHECK, $abs);
                        push(@{ $links{$thisurl} }, $abs);
                    }
                }
            }
            next MAINLOOP;
            if (PING $thisurl) {
                if ($debug) { warn "verifying $thisurl\n"; }
                for $method (qw(HEAD GET)) {
                    $request  = new HTTP::Request($method, $thisurl);
                    $response = $ua->request($request);    # fetch!
                    if ($response->is_success) {
                        next MAINLOOP;
                    }
                }
                if ($debug) {
                    warn "Cannot fetch $thisurl (status ",
                        $response->code, " ", $response->message, ")\n";
                }
                $badurl{$thisurl} = "status ,$response->code, $response->message";
                $badurls = 1;
                next MAINLOOP;
            }
            if ($debug) { warn "[skipping $thisurl]\n"; }
            $skippedurl{$thisurl} = 1;
        }
        push(@return, \%links);
        push(@return, \%goodurl);
        push(@return, \%badurl);
        return \@return;
    }
}
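A side note on the $p->parse("") workaround above: HTML::Parser also provides an eof method for signalling the end of the document, which sidesteps both the undef warning and the empty-string call. A minimal sketch of that substitution, using the same variables as above:

my $p = ParseLink->new;
$p->parse($response->content);
$p->eof;    # signal end of document instead of $p->parse("")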


I admit it, I am Paco.

Edit kudra, 2002-05-10 Added READMORE

Replies are listed 'Best First'.
Re: Link Extractor Eating Up All My Memory
by vladb (Vicar) on May 09, 2002 at 21:58 UTC
    Where's the PARSE() subroutine, and what does it do? Also, your code fails when I try to 'perl -d' it with 'use strict'. However, that's a minor issue, since most of the warnings are for undeclared variables. ;)

    In any event, I suggest you use the HTML::SimpleLinkExtor module instead. From what I've seen of your code, this module should do the job. Remember, there's almost always a module for the task you undertake! ;-)
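    For reference, here's a rough sketch of how HTML::SimpleLinkExtor might slot in (the URL is just a placeholder, and you would still drive your own crawl queue around it):

use HTML::SimpleLinkExtor;
use LWP::UserAgent;
use HTTP::Request;

my $ua       = LWP::UserAgent->new;
my $response = $ua->request(HTTP::Request->new(GET => 'http://www.example.com/'));  # placeholder URL

my $extor = HTML::SimpleLinkExtor->new;
$extor->parse($response->content);

my @all_links = $extor->links;   # every link found in the page
my @hrefs     = $extor->a;       # just the <a href="..."> targets
my @img_srcs  = $extor->img;     # just the <img src="..."> targets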

    "There is no system but GNU, and Linux is one of its kernels." -- Confession of Faith
      Appreciate the suggestion.

      No, the code would not compile and would fail under use strict, because this is not all the code, just the heart of it. Sorry, I should have explained that; I was trying to keep the amount of code to review short.

      Here is the full code, if you want to see it all (you can see why I truncated it):

#!/usr/bin/perl -Tw

package LinkExtractor;

use strict;
use HTML::Parser;
use LWP::UserAgent;
use URI::URL;

my %opts;
my $startPoint;
my @CHECK;
my $abs;
my $uri;
my $link;
my $base;
my $domain;

# verify existence, parse for further URLs
# The absolute URL is passed into PARSE
sub PARSE {
    my ($url, $domain) = @_;
    if ($url =~ /^http\:\/\/.*($domain)/ and not $url =~ m!gif|jpg$!) {
        return 1;
    }
    return 0;
}

# Check for existence of the URL but do not parse
# The absolute URL is passed into PING
sub PING {
    my ($url) = shift;
    $url =~ m!^http:!;
}

# start of ParseLink
{
    package ParseLink;
    our @ISA = qw(HTML::Parser);

    # called by parse
    sub start {
        # turn off warn because the variable: $this->{links}{$attr->{href}} cannot really
        # be initialized as far as I can see
        $^W = 0;
        #my $this = shift;
        my ($this, $tag, $attr) = @_;
        if ($tag eq "a") {
            $this->{links}{$attr->{href}} = 1;
        } elsif ($tag eq "img") {
            $this->{links}{$attr->{src}} = 1;
        } elsif ($tag eq 'frame') {
            $this->{links}{$attr->{src}} = 1;
        } elsif ($tag eq 'meta') {
            if ($attr->{'http-equiv'} && $attr->{'http-equiv'} eq 'refresh') {
                my $refresh_time;
                my $url;
                ($refresh_time, $url) = split(/\;/, $attr->{'content'});
                $url =~ s/^.*\=//;
                $this->{links}{$url} = 1;
            }
        }
        $^W = 1;
    }

    sub get_links {
        my $this = shift;
        sort keys %{$this->{links}};
    }
}
# end of ParseLink

sub extractLinks {
    my $ua = new LWP::UserAgent;
    my @return;
    my %skippedurl;
    my %badurl;
    my %goodurl;
    my %links;
    my $badurls  = 0;
    my $goodurls = 0;
    my $thisurl;
    my %did;
    my $response;
    my $request;
    my $method;
    my $startPoint = shift || die "__FILE__: No starting point set.\n";
    my $debug = shift;

    push (@CHECK, "$startPoint");
    $domain = new URI($startPoint);
    if ($startPoint =~ /^.*tp/) {
        $domain = $domain->host;
    } else {
        return 0;
    }
    $ua->agent("LinkExtractor/1.0");
    $ua->env_proxy;
    $| = 1;

    {
      MAINLOOP:
        while ($thisurl = shift @CHECK) {
            my $testnum = scalar(@CHECK);
            print "CHECK: $testnum\n";
            if ($debug) { print "processing $thisurl, $domain\n"; }
            $thisurl =~ s/%7e/~/ig;    # convert encoded tilde to usable tilde
            next if $did{$thisurl}++;
            if (PARSE($thisurl, $domain)) {
                $goodurl{$thisurl}++;
                if ($debug) { print "GOODURL: $thisurl\n"; }
                if ($debug) { warn "fetching $thisurl\n"; }
                $request  = new HTTP::Request('GET', $thisurl);
                $response = $ua->request($request);    # fetch!
                unless ($response->is_success) {
                    $badurl{$thisurl} = " status ,$response->code, $response->message,";
                    $badurls = 1;
                    if ($debug) {
                        warn "Cannot fetch $thisurl (status ",
                            $response->code, " ", $response->message, ")\n";
                    }
                    next MAINLOOP;
                }
                unless ($response->content_type =~ /text\/html/i) {
                    next MAINLOOP;
                }
                $base = $response->base;
                my $p = ParseLink->new;
                $p->parse($response->content);
                # parse calls for a passing of undef but triggered a warn
                # so passed an empty string instead to get rid of the warning msg
                $p->parse("");    # parse calls for a passing of undef but triggered a warn
                for $link ($p->get_links) {
                    $abs = url($link, $base)->abs;
                    if ($debug) { warn "... $link => $abs\n"; }
                    if (PARSE($abs, $domain)) {
                        print "pushing: $abs\n";
                        push(@CHECK, $abs);
                        push(@{ $links{$thisurl} }, $abs);
                    }
                }
            }
            next MAINLOOP;
            if (PING $thisurl) {
                if ($debug) { warn "verifying $thisurl\n"; }
                for $method (qw(HEAD GET)) {
                    $request  = new HTTP::Request($method, $thisurl);
                    $response = $ua->request($request);    # fetch!
                    if ($response->is_success) {
                        next MAINLOOP;
                    }
                }
                if ($debug) {
                    warn "Cannot fetch $thisurl (status ",
                        $response->code, " ", $response->message, ")\n";
                }
                $badurl{$thisurl} = "status ,$response->code, $response->message";
                $badurls = 1;
                next MAINLOOP;
            }
            if ($debug) { warn "[skipping $thisurl]\n"; }
            $skippedurl{$thisurl} = 1;
        }
        push(@return, \%links);
        push(@return, \%goodurl);
        push(@return, \%badurl);
        return \@return;
    }
}


      I admit it, I am Paco.