I have a very old script that I do that with, it uses
LWP to fetch the pages and parses the html with regex (yes,
I know better now, but it works).
At the time I failed to identify any module to do what I,
needed and later I added various bits and pieces to count
words, list external links and so on. So rolling my own
turned out to be the best way to go. Essentially it was:
use strict;
use LWP::UserAgent;
use IO::File;
my $ua = LWP::UserAgent->new;
$ua->agent("Angler/0.1 ");
my $server_root = "/";
my %pages =
(
$server_root => "needs_scan",
# Note some pages that we don't want to visit
"${server_root}login.htm" => "done",
"${server_root}logout" => "done",
"${server_root}table.xls" => "done",
"${server_root}select_view" => "done",
"${server_root}sws/error" => "done",
);
my $default_host = "127.0.0.1";
my $default_port;
my $done_some = 1;
while($done_some)
{
$done_some = 0;
foreach my $path (keys %pages)
{
next if($pages{$path} eq "done");
scan_page($path);
$pages{$path} = "done";
$done_some = 1;
$total_done++;
if(($total_done % 1000) == 0)
{
print "\rDone $total_done";
}
}
}
print "\nDone $total_done pages\n";
# A whole load of reporting in here
exit 0;
sub sub scan_page
{
my($path) = @_;
print STDERR "Scanning: $path\n" if($verbose);
if(!defined $base_name)
{
$base_name = "http://$default_host";
$base_name .= ":" . $default_port if($default_port != 80);
}
# Dir contains the relative path to the page so that
# we can correctly adjust relative links
$dir = $path;
if($dir =~ s#/+[^/]*$#/#)
{
}
else
{
$dir = $server_root;
}
%known_content_types =
(
# Here are the link types that we know about at the start
"text/html" => "HTML",
"image/gif" => "ignore",
"application/octet-stream" => "ignore",
"application/pdf" => "ignore",
) if(!%known_content_types);
my $req = HTTP::Request->new(GET => "$base_name$path");
my $res = $ua->request($req);
if(!defined $res)
{
print STDERR "Failed to load $base_name$path\n";
return;
}
elsif (!$res->is_success)
{
print STDERR "Status ".$res->status_line." when connecting to
+$base_name$path\n";
return;
}
my $page_contents = $res->content();
if(!defined $page_contents || $page_contents eq "")
{
print STDERR "Failed to find content $path\n";
return;
}
if(defined $res->headers()->content_type())
{
my $content = $res->headers()->content_type();
if($content =~ s/;.*$//)
{
# The content type could be something like
# "text/html; charset=iso-8859-1"
}
if(!defined $known_content_types{$content})
{
print STDERR "Unknown content type \"$content\" for $path\
+n";
print STDERR " Ref from ".keys(%{$ref_from{$path}})."\n"
if(defined $ref_from{$path
+});
$known_content_types{$content} = "ignore" if($content ne "
+");
return;
}
return if($known_content_types{$content} ne "HTML");
}
else
{
print STDERR "Cannot find content type for $path\n";
return;
}
# Parse HTML with regex, this bit needs rebuilding
while($page_contents =~ s/href\s*\=\s*\"([^\"]+)\"/**ref_done**/i)
{
my $link_to = $1;
# Remove the within page address
$link_to =~ s/#.*$//;
next if(!$link_to);
if($link_to =~ m#https?:#i)
{
# Link to explicitly external site
if(!defined $external_pages{$link_to})
{
if(!defined $external_report)
{
print STDERR "External page (from $path) $link_to\
+n";
}
else
{
print $external_report "External page (from $path)
+ $link_to\n";
}
$external_pages{$link_to} = "noted";
}
next;
}
elsif($link_to =~ s#^\./+##)
{
# Change from link to . to local
$link_to = $dir.$link_to;
}
elsif($link_to =~ m#javascript:#)
{
# I ain't going in there...
}
elsif($link_to =~ m#^/#)
{
# Absolute path within the web site
}
elsif($link_to =~ m#^(\w[\w\d_\-\/\.\s]+)$#)
{
# Change from rel to absolute
$link_to = $dir.$link_to;
}
else
{
print STDERR "Unparsed href (from $path) => \"$link_to\"\n
+";
next;
}
if(!defined $pages{$link_to})
{
$pages{$link_to} = "needs_scan";
}
}
while($page_contents =~ s/href(.{0,30})/**ref_done**/)
{
my $href_val = $1;
# These are the special cases that we have found previously an
+d
# are not interesting to us
next if($href_val =~ m#<#);
next if($href_val =~ m#^\"$#);
next if($href_val =~ m#^=\"#);
next if($href_val =~ m#^_values#);
next if($href_val =~ m#^\"\s+type=#);
next if($href_val =~ m#[34]_p#);
print STDERR "Cannot parse out (from $path) \"href$1\"\n";
}
}
$num_pages++;
}
Of course that code is chopped out of a much larger script and not tested, but I think it should give you all the bits you need (well except for extracting href values with a proper parser rather than using regex). Hope it helps.
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.