comment on

Appreciate the suggestion.

No, the code would not compile and would fail under use strict because this is not all the code just the heart of it. Sorry, I should have explained that. I was trying to keep the amount of code to review short.

Here is the full code, if you want to see it all (you can see why I truncated it):

#!/usr/bin/perl -Tw

package LinkExtractor;

use strict;

use HTML::Parser;
use LWP::UserAgent;
use URI::URL;

my %opts;
my $startPoint;
my @CHECK;
my $abs;
my $uri;
my $link;
my $base;
my $domain;

# verify existance, parse for further URLs
# The absolution URL is passed into PARSE
sub PARSE
  {
    my ($url, $domain) = @_;

    if ($url =~ /^http\:\/\/.*($domain)/ and not $url =~ m!gif|jpg$!)
      {
    return 1;
      }
    return 0;
}

# Check for existence of the URL but do not parse
# The absolution URL is passed into PING
sub PING {

  my ($url) = shift;
  $url =~ m!^http:!;
}


# start of ParseLink
{
  package ParseLink;
  our @ISA = qw(HTML::Parser);

  # called by parse
  sub start
    {
      # turn off warn because the variable: $this->{links}{$attr->{hre
+f}} cannot really
      # be initialized as far as I can see
      $^W = 0;

      #my $this = shift;
      my ($this, $tag, $attr) = @_;

      if ($tag eq "a")
    {
      $this->{links}{$attr->{href}} = 1;
    }
      elsif ($tag eq "img")
    {
      $this->{links}{$attr->{src}} = 1;
    }
      elsif ($tag eq 'frame')
    {
      $this->{links}{$attr->{src}} = 1;
    }
      elsif ($tag eq 'meta')
    {
      if ($attr->{'http-equiv'} && $attr->{'http-equiv'} eq 'refresh')
        {
          my $refresh_time;
          my $url;

          ($refresh_time, $url) = split(/\;/, $attr->{'content'});
          $url =~ s/^.*\=//;
          $this->{links}{$url} = 1;
      }
      }

      $^W = 1;

    }

  sub get_links
    {
      my $this = shift;
      sort keys %{$this->{links}};
    }
}
# end of ParseLink

sub extractLinks
  {
    my $ua = new LWP::UserAgent;
    my @return;
    my %skippedurl;
    my %badurl;
    my %goodurl;
    my %links;
    my $badurls = 0;
    my $goodurls = 0;
    my $thisurl;
    my %did;
    my $response;
    my $request;
    my $method;

    my $startPoint = shift || die "__FILE__: No starting point set.\n"
+;
    my $debug = shift;

    push (@CHECK, "$startPoint");

    $domain = new URI($startPoint);

    if ($startPoint =~ /^.*tp/)
      {
    $domain = $domain->host;
      }
    else
      {
    return 0;
      }

    $ua->agent("LinkExtractor/1.0");
    $ua->env_proxy;

    $| = 1;

    {
    MAINLOOP:
      while ($thisurl = shift @CHECK)
    {
      my $testnum = scalar(@CHECK);
      print "CHECK: $testnum\n";

      if ($debug)
        {
          print "processing $thisurl, $domain\n";
        }
      $thisurl =~ s/%7e/~/ig; # convert encoded tilde to usable tilde
      next if $did{$thisurl}++;
      if (PARSE($thisurl,$domain))
        {
          $goodurl{$thisurl}++;
          if ($debug)
        {
          print "GOODURL: $thisurl\n";
        }
          if ($debug)
        {
          warn "fetching $thisurl\n";
        }
          $request = new HTTP::Request('GET',$thisurl);
          $response = $ua->request($request); # fetch!

          unless ($response->is_success)
        {
          $badurl{$thisurl} = " status ,$response->code, $response->me
+ssage,";
          $badurls = 1;
          if ($debug)
            {
              warn
            "Cannot fetch $thisurl (status ",
              $response->code, " ", $response->message,")\n";
            }
          next MAINLOOP;
        }
    
          unless ($response->content_type =~ /text\/html/i)
        {
          next MAINLOOP;
        }
    
          $base = $response->base;
          my $p = ParseLink->new;
          $p->parse($response->content);
    
          # parse calls for a passing of undef but triggered a warn
          # so passed an empty string instead to get rid of the warnin
+g msg
          $p->parse("");  # parse calls for a passing of undef but tri
+ggered a warn
    
          for $link ($p->get_links)
        {
          $abs = url($link, $base)->abs;
          if ($debug)
            {
              warn "... $link => $abs\n";
            }
          if (PARSE($abs,$domain))
            {
              print "pushing: $abs\n";
              push(@CHECK, $abs);
              push(@{ $links{$thisurl} }, $abs);
            }
        }
        }
      next MAINLOOP;
    

      if (PING $thisurl)
        {    
          if ($debug)
        {
          warn "verifying $thisurl\n";
        }
          for $method (qw(HEAD GET))
        {
          $request = new HTTP::Request($method,$thisurl);
          $response = $ua->request($request); # fetch!
          if ($response->is_success)
            {
              next MAINLOOP;
            }
        }
    
          if ($debug)
        {
          warn
            "Cannot fetch $thisurl (status ",
              $response->code, " ", $response->message,")\n";
        }
          $badurl{$thisurl} = "status ,$response->code, $response->mes
+sage";
          $badurls = 1;
          next MAINLOOP;
        }

      if ($debug)
        {
          warn "[skipping $thisurl]\n";
        }
      $skippedurl{$thisurl} = 1;
    }

      push(@return, \%links);
      push(@return, \%goodurl);
      push(@return, \%badurl);
      return \@return;
    }
  }
[download]

I admit it, I am Paco.

In reply to Re: Re: Link Extrator Eating Up All My Memory by jonjacobmoon
in thread Link Extrator Eating Up All My Memory by jonjacobmoon

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.