# Excerpt of a loop body (the loop header is not part of this snippet; the
# same four statements appear inside passing() further down, where $token
# comes from HTML::TokeParser's get_tag("a")).
# $token->[1] is used as the hash of the tag's attributes.
# Skip anchors that have no href attribute ...
next unless defined($token->[1]{href});
# ... or no class attribute.
next unless defined($token->[1]{class});
# Keep only links whose href contains "?page=" or whose class contains
# "top10_link".
next unless $token->[1]{href} =~ /\?page=/ || $token->[1]{class} =~ /top10_link/;
# The raw href of the accepted link.
my $urls = $token->[1]{href};
####
# Record a link the first time it is seen and follow it when it is a
# pagination link ("page=" in the URL).
# The membership test uses exists() rather than the truth of the stored value:
# the value stored is the index $i (which starts at 0 in passing()), and a
# stored 0 would otherwise look "unseen" and be followed again every time.
if (!exists $parent{$urls}) {
    # Debug aid: number of recorded URLs before and after the insert.
    my $count = keys %parent;
    $parent{$urls} = $i;
    my $count2 = keys %parent;
    print "$i: count1 is: $count and count2 is $count2\n$urls\n";
    if ($urls =~ /page=/) {
        print "!!!!!!!!!!!!recursing!!!!!!!!!!!!!\n";
        passing($urls);
    }
}
####
#!/usr/bin/perl -w
use strict;
use HTML::TokeParser;
use LWP::Simple;
use URI::URL;
# URLs recorded by passing(), keyed by the canonical (absolute) link URL.
# The value is the index $i the link had on the page where it was stored.
# passing() consults this hash before recording and following a link.
my %parent;
# Crawl one page and follow its pagination links.
#
# Fetches $url and scans every <a> tag.  A link is accepted when it has both
# an href and a class attribute and either the href contains "?page=" or the
# class contains "top10_link".  Each accepted link has everything from
# "&PHPSESSID=" onwards removed, is made absolute with canonical(), and is
# recorded in the file-level %parent hash with its index among the accepted
# links of this page.  A link recorded for the first time whose URL contains
# "page=" is crawled recursively.
#
# Parameters:
#   $url - address of the page to fetch.
# Returns: nothing; results accumulate in %parent.
# Dies:    when the page cannot be fetched.
sub passing {
    my $url = shift;

    # get() signals failure by returning undef.  Test definedness so that an
    # empty page is not mistaken for an error, and name the URL in the
    # message ($! says nothing useful about a failed HTTP request).
    my $data = get($url);
    die "Could not fetch $url" unless defined $data;

    my $p = HTML::TokeParser->new(\$data);

    # Index of the current link, counting only links that pass the filter
    # (the "next" statements below skip the increment at the loop's end).
    my $i = 0;
    while (my $token = $p->get_tag("a")) {
        my $attr = $token->[1];
        next unless defined $attr->{href};
        next unless defined $attr->{class};
        next unless $attr->{href} =~ /\?page=/ || $attr->{class} =~ /top10_link/;

        my $urls = $attr->{href};
        # Drop the session id (and anything after it) so the same page is
        # recognised regardless of the session it was linked from.
        $urls =~ s/&PHPSESSID=.*//g;
        $urls = canonical($urls, "http://www.ash-distribution.co.uk/index.php");

        # Use exists(), not the truth of the value: the first accepted link
        # is stored with index 0, and a truth test would treat it as unseen
        # and crawl it again on every encounter.
        if (!exists $parent{$urls}) {
            $parent{$urls} = $i;
            if ($urls =~ /page=/) {
                print "!!!!!!!!!!!!recursing!!!!!!!!!!!!!\n";
                passing($urls);
            }
        }
        $i++;
    }
    return;
}
# Turn a link into an absolute URL.
#
# Parameters:
#   $_[0] - the link, absolute or relative.
#   $_[1] - base URL against which a relative link is resolved.
# Returns: the link itself when it already starts with "http://"; otherwise
#          the result of url($link)->abs($base) (url() comes from URI::URL),
#          which is used as a string by the callers in this file.
#
# The arguments are copied into lexicals first.  Elements of @_ alias the
# caller's variables, so assigning to $_[0] would change the caller's value
# behind its back and would die on a read-only argument such as a literal.
sub canonical {
    my ($link, $base) = @_;
    return $link if $link =~ m%^http://%;
    return url($link)->abs($base);
}
# Entry point: start the crawl at the category URL below.
my $start_url = "http://www.ash-distribution.co.uk/index.php?c=%20Ink&sc=Epson%20Replacement%20Cartridges";
passing($start_url);