You give the script a web page to start on and it wanders randomly for as many pages as you tell it to. It's fun to watch!
update: I have refactored the original code.
update: I've added keyword functionality if you care about the source.
#!/usr/bin/perl -wT
use strict;
use CGI;
use LWP::Simple;
# Main script: print the form, then random-walk the web starting from the
# submitted URL, listing each page visited until the requested depth is
# reached or there are no more links to follow.

# Unbuffer STDOUT so progress shows up in the browser while pages are fetched.
$|++;

my $q      = CGI->new;
my $v      = $q->Vars;
my $action = $q->url;

# ?view=1 dumps this script's own source as plain text and stops.
if ($v->{view}) {
    print $q->header(-type => 'text/plain'), viewSource();
    exit;
}

# Always emit the form; only start surfing once both fields were submitted.
print $q->header, buildTop($v, $action);
exit unless $v->{start} && $v->{depth};

# The depth comes straight from the query string: accept only a plain
# positive integer and cap it at 25 pages.
my ($depth) = $v->{depth} =~ /^(\d+)$/;
exit unless $depth;
$depth = 25 if $depth > 25;
$v->{depth} = $depth;

my $links = [$v->{start}];
my @cycleBreaker = ();    # URLs already listed, so the walk never loops
print 'Watch! The AutoSurfer is surfing!<ol>';
my $i = 0;
do {
    # randomizeArray() empties @$links; if no candidate below yields a page
    # with links, @$links stays empty and the outer loop terminates.
    my $rand = randomizeArray($links);
    for my $url (@$rand) {
        # Only follow web URLs; LWP would otherwise happily fetch file://
        # and other schemes on behalf of the visitor.
        next unless $url =~ m{^https?://}i;
        next if grep { $_ eq $url } @cycleBreaker;

        # URLs come from the query string or from fetched pages, so they
        # are escaped before being echoed into our own HTML.
        my $shown = $q->escapeHTML($url);

        print "<br>Getting $shown..." if $v->{verbose};
        my $html = get($url);
        if ($html) {
            print "done." if $v->{verbose};
        }
        else {
            print "failed." if $v->{verbose};
            next;
        }

        my $title    = getTitle($html);
        my $allLinks = getLinks($html);
        $links = validateLinks($allLinks);
        cleanLinks($links, $url);

        # A page without usable links is a dead end: try the next candidate.
        if (@$links) {
            # The target URL is URL-encoded so that '&', '#' etc. inside it
            # survive as part of the "start" parameter, and the finished
            # href is then HTML-escaped as a whole.
            my $surf = $q->escapeHTML(
                "$action?start=" . CGI::escape($url) . "&depth=$depth"
            );
            # $title is emitted as-is: getTitle() never returns a '<', and
            # escaping it would double-encode entities copied from the page.
            print qq|<li><a target="_blank" href="$shown">$title</a> - |
                . qq|<a href="$surf">Surf it</a> - |
                . qq|<font size="-1">$shown</font>|;
            $i++;
            push @cycleBreaker, $url;
            last;
        }
    }
} while ($i < $depth && @$links);
print '</ol></body></html>';
# Build the <option> list for the "Surf Depth" drop-down (5..25 in steps
# of 5). The entry numerically equal to $depth, if any, is marked selected.
# Returns the options as one string.
sub buildMenu {
    my ($depth) = @_;
    return join '', map {
        my $mark = ($depth && $_ == $depth) ? ' selected ' : '';
        qq|<option value="$_"$mark>$_|;
    } 5, 10, 15, 20, 25;
}
# Return this script's own source code as a list of lines.
# Dies with the system error message if the script file cannot be opened.
sub viewSource {
    # Three-argument open with a lexical handle: the path in $0 is always
    # treated as a plain file name, never as a mode or pipe specification,
    # and the handle cannot clash with any other package-level name.
    open my $fh, '<', $0 or die "Cannot open $0: $!";
    my @this = <$fh>;
    close $fh;
    return @this;
}
# Build the top of the page: HTML head, heading and the AutoSurfer form,
# pre-filled from the submitted parameters.
#   $v      - hash ref of request parameters (start, depth, verbose)
#   $action - URL of this script, used as form target and "view source" link
# Returns the HTML as a single string (the <body> is left open for the
# caller to continue).
sub buildTop {
    my ($v, $action) = @_;
    my $options = buildMenu($v->{depth});

    # Both values end up inside attribute values. $start is raw user input,
    # so it is HTML-escaped to stop a crafted URL from breaking out of the
    # attribute; $action is escaped as well for the same reason.
    my $start       = CGI::escapeHTML($v->{start} || 'http://');
    my $form_action = CGI::escapeHTML(defined $action ? $action : '');

    my $top = qq|
<html><head>
<title>AutoSurfer</title>
</head><body>
<h2>AutoSurfer</h2>
<form action="$form_action">
<table><tr><td>
Starting Point:
</td><td>
<input name="start" value="$start">
</td></tr><tr><td>
Surf Depth:
</td><td>
<select name="depth">
$options
</select>
</td></tr><tr><td>
Verbose:
</td><td>
<input type="checkbox" name="verbose" value="1"|;
    $top .= ' checked' if $v->{verbose};
    $top .= qq|>
</td></tr></table>
<p>
<input type="submit" value=" Go! ">
<a href="$form_action?view=1">view source</a>
</form>|;
    return $top;
}
# Shuffle the elements of the array referenced by $array.
# The input array is consumed: elements are removed from it one at a time
# at random positions, so it is empty when this returns.
# Returns a reference to a new array holding the elements in random order.
sub randomizeArray {
    my ($array) = @_;
    my @shuffled;
    while (@$array) {
        my $pick = rand @$array;    # splice truncates this to an index
        push @shuffled, splice(@$array, $pick, 1);
    }
    return \@shuffled;
}
# Extract the text of the first <title> element from $html.
# Returns 'Untitled' when there is no title or the title text is false.
sub getTitle {
    my ($html) = @_;
    if ($html =~ m|<\s*title\s*>([^<]+)</\s*title\s*>|is) {
        return $1 if $1;
    }
    return 'Untitled';
}
# Collect link targets from $html: first every href attribute value found
# inside a tag, then every <frame> src value.
# Returns a reference to an array of the raw (unresolved) targets.
sub getLinks {
    my ($html) = @_;
    my @found;
    push @found, $1 while $html =~ m|<[^>]+href=['"]?([^'" >]+)|ig;
    push @found, $1 while $html =~ m|<frame[^>]+src=['"]?([^'" >]+)|ig;
    return \@found;
}
# Reduce a list of raw link targets to the ones worth visiting.
# Duplicates are removed, and empty targets, mailto: links and links that
# end in a non-page file extension (.gif, .pdf, ...) are dropped.
#   $links - reference to an array of link strings
# Returns a reference to a new array; its order is unspecified.
sub validateLinks {
    my ($links) = @_;

    # Hash keys give us the unique set of targets.
    my %uniq;
    @uniq{@$links} = ();

    # The extension test requires a literal dot so that ordinary paths that
    # merely end in those letters (e.g. "/remov", "/blogz") are kept.
    my @urls = grep {
        !/\.(?:dtd|gif|jpg|pdf|css|gz|mov)$/i && !/mailto:/i && $_
    } keys %uniq;
    return \@urls;
}
# Return the scheme-and-host prefix of $url (e.g. "http://example.com"),
# or undef when $url does not start with http:// or https://.
sub getDomain {
    my ($url) = @_;
    # Anchored at the start so a URL embedded in a query string is never
    # mistaken for the host; the host ends at the first '/', '?' or '#'.
    my ($domain) = $url =~ m|^(https?://[^/?#]+)|i;
    return $domain;
}
# Return the "directory" part of a URL: the URL with any query string and
# fragment removed and the final path segment dropped. A URL that ends in
# '/' or that has no path at all is returned without that last step.
sub getPath {
    my ($path) = @_;
    # Query and fragment take no part in resolving relative links, and a
    # '/' inside them must not be mistaken for a path separator.
    $path =~ s/[?#].*//s;
    # Drop the last segment; the look-behind keeps the "//" after the
    # scheme from being treated as a path separator.
    $path =~ s|(?<!/)/[^/]+$||;
    return $path;
}
# Turn the relative links in @$links into absolute URLs, in place.
#   $links - reference to an array of link strings (modified in place)
#   $url   - URL of the page the links were found on
# Links that already start with "http" and empty entries are left alone.
# Root-relative links ("/x") are prefixed with the page's scheme and host;
# everything else is resolved against the page's directory, honouring any
# leading "./" and "../" segments.
sub cleanLinks {
    my ($links, $url) = @_;
    my $domain = getDomain($url);
    $domain = '' unless defined $domain;    # unknown host: keep link as-is
    my $path = getPath($url);

    for my $link (@$links) {    # $link aliases the array element
        next unless $link;
        next if $link =~ /^http/i;

        if ($link =~ m{^/}) {
            $link = $domain . $link;
            next;
        }

        # Directory of the current page, without a trailing slash.
        (my $dir = $path) =~ s{/\z}{};

        $link =~ s{^(?:\./)+}{};
        # Each "../" climbs one directory. The look-behind stops the climb
        # at "scheme://host", so the walk can never eat into the host part.
        while ($link =~ s{^\.\./}{}) {
            $dir =~ s{(?<!/)/[^/]+\z}{};
            $link =~ s{^(?:\./)+}{};
        }
        $link = $dir . '/' . $link;
    }
    return;
}