#!perl
=head1 NAME
reshtml - extract image (and other resource) urls from a HTML
=head1 SYNOPSIS
B [B<-b> I] [I...]
=head1 DESCRIPTION
Parses a HTML document, extract the URL of all images and other resources in
it, and print them one per line.
Currently url for images, default style sheets, and favicons are collected.
Scripts, optional style sheets, random hyperlinks and random header links,
applets, netscape low-res image previews, refresh targets, frames and iframes
are ignored, though some of this could change in the future.
The HTML documents are read from the files whose names are given on command
line, or from STDIN if no name is given.
Repeated URLs are printed only once, though no effort is done to recognize
equivalent URLs.
=head1 OPTIONS
=over
=item B<-b> I
Qualify relative urls using I as the base.
Note that a base url given in the HTML document (with the B tag)
is always used this way, no matter whether you give this switch or not.
If no base URL is known, but relative URLs are found, they are output
as is but with a warning. Use B<-b .> to silence this warning.
=item B<-i> I
Read I for a list of URLs and download filenames. The filenames
from second column give the name of HTML files to read and parse, the
URLs in the first column are used as the base URL only. This option
excludes giving filenames or base urls from the command line.
The listfile has the same format as the listfile for L, making
it easier to process HTML files you have downloaded with that utility.
=item B<-P> I
Interpret filenames as relative to directory I. This is most
useful with B<-i>, but can be used otherwise too.
=item B<-v>
Print names of files as they're parsed.
=back
=cut
####
use warnings; use 5.012; use strict;
use IO::Handle;
use Getopt::Long;
use URI;
use File::Spec::Functions "catfile";
use XML::Twig;
our $VERSION = 0.001;
our $gbase = undef;
our $listfname = undef;
our $dirprefix = ".";
our $verbose;
our($versionopt, $helpopt);
Getopt::Long::Configure qw"bundling gnu_compat prefix_pattern=(--|-)";
GetOptions(
"b|base=s" => \$gbase,
"i|listfile=s" => \$listfname,
"P|directory-prefix|prefix=s" => \$dirprefix,
"v|verbose" => \$verbose,
"V|version" => \$versionopt,
"h|help" => \$helpopt,
);
if ($versionopt) {
die "reshtml $VERSION\n";
} elsif ($helpopt) {
my $helpstr = q(Usage: reshtml [-b BASEURL] [FILENAME...]
Parses a HTML document, extract the list of images and other resources
necessary to render it, qualifies URLs to absolute URL using BASEURL as the
base, prints URLs one per line.
);
$helpstr =~ s/\n\s*/\n/g;
die $helpstr;
}
my $base;
my $relwarn;
my $ifname;
my %foundurl;
my $foundurl = sub {
my($u) = @_;
$u = URI->new($u);
if (length($base)) {
$u = $u->abs($base);
} elsif (!$u->scheme) { # relative URL
$relwarn++ or warn "warning: relative URL found and base address unknown, in file $ifname";
}
# we could canonicalize the URL for hashing purposes, but let's ignore that
if (!$foundurl{$u}++) {
print $u, "\n";
}
};
my %twhnd;
$twhnd{"base"} = sub {
my($tw, $e) = @_;
$base = $e->att("href");
1;
};
$twhnd{"link"} = $twhnd{"a"} = $twhnd{"area"} = sub {
my($tw, $e) = @_;
my $rs = $e->att("rel") // "";
my %rw;
for my $rw (split /[ \t\n\f\r]+/, $rs) { # seriously, HTML5 defines everything precisely
$rw =~ y/A-Z/a-z/;
$rw{$rw}++;
}
if (length(my $u = $e->att("href")) && (
$rw{"icon"} || $rw{"stylesheet"} && !$rw{"alternate"}
)) {
&$foundurl($u);
}
1;
};
$twhnd{"img"} = $twhnd{"object"} = sub {
my($tw, $e) = @_;
for my $an ("src", "data") {
if (length(my $u = $e->att($an))) {
&$foundurl($u);
}
}
1;
};
my $do_file = sub {
my($fname, $stdin, $lbase) = @_;
$ifname = $fname;
$verbose and print STDERR "#$fname\n";
$fname = catfile($dirprefix, $fname);
$relwarn = 0;
$base = $lbase // $gbase;
my $twig = XML::Twig->new(twig_handlers => \%twhnd);
if (!$stdin) {
$twig->parsefile_html($fname);
} else {
$twig->parse_html(\*STDIN);
}
STDOUT->flush;
};
if (defined($listfname)) {
@ARGV and die "error: both -i option and input filenames are given";
open my $LIST, "<", $listfname or
die "error opening list file: ($listfname) $!";
my $_;
while (<$LIST>) {
/\S/ or next;
my($lbase, $fname, $rest) = split " ";
length($rest) and die "error: invalid spec in listfile (too many words): $_ -";
if (!defined($fname)) {
$lbase =~ m"/([^/]+)/*$" or
die "error: cannot find suffix or base uri: $lbase";
$fname = $1;
}
&$do_file($fname, 0, $lbase);
}
} elsif (@ARGV) {
for my $fname (@ARGV) {
&$do_file($fname);
}
} else {
&$do_file("-", 1);
}
__END__