use warnings; use 5.012; use strict; use IO::Handle; use Getopt::Long; use URI; use File::Spec::Functions "catfile"; use XML::Twig; our $VERSION = 0.001; our $gbase = undef; our $listfname = undef; our $dirprefix = "."; our $verbose; our($versionopt, $helpopt); Getopt::Long::Configure qw"bundling gnu_compat prefix_pattern=(--|-)"; GetOptions( "b|base=s" => \$gbase, "i|listfile=s" => \$listfname, "P|directory-prefix|prefix=s" => \$dirprefix, "v|verbose" => \$verbose, "V|version" => \$versionopt, "h|help" => \$helpopt, ); if ($versionopt) { die "reshtml $VERSION\n"; } elsif ($helpopt) { my $helpstr = q(Usage: reshtml [-b BASEURL] [FILENAME...] Parses a HTML document, extract the list of images and other resources necessary to render it, qualifies URLs to absolute URL using BASEURL as the base, prints URLs one per line. ); $helpstr =~ s/\n\s*/\n/g; die $helpstr; } my $base; my $relwarn; my $ifname; my %foundurl; my $foundurl = sub { my($u) = @_; $u = URI->new($u); if (length($base)) { $u = $u->abs($base); } elsif (!$u->scheme) { # relative URL $relwarn++ or warn "warning: relative URL found and base address unknown, in file $ifname"; } # we could canonicalize the URL for hashing purposes, but let's ignore that if (!$foundurl{$u}++) { print $u, "\n"; } }; my %twhnd; $twhnd{"base"} = sub { my($tw, $e) = @_; $base = $e->att("href"); 1; }; $twhnd{"link"} = $twhnd{"a"} = $twhnd{"area"} = sub { my($tw, $e) = @_; my $rs = $e->att("rel") // ""; my %rw; for my $rw (split /[ \t\n\f\r]+/, $rs) { # seriously, HTML5 defines everything precisely $rw =~ y/A-Z/a-z/; $rw{$rw}++; } if (length(my $u = $e->att("href")) && ( $rw{"icon"} || $rw{"stylesheet"} && !$rw{"alternate"} )) { &$foundurl($u); } 1; }; $twhnd{"img"} = $twhnd{"object"} = sub { my($tw, $e) = @_; for my $an ("src", "data") { if (length(my $u = $e->att($an))) { &$foundurl($u); } } 1; }; my $do_file = sub { my($fname, $stdin, $lbase) = @_; $ifname = $fname; $verbose and print STDERR "#$fname\n"; $fname = catfile($dirprefix, $fname); $relwarn = 0; $base = $lbase // $gbase; my $twig = XML::Twig->new(twig_handlers => \%twhnd); if (!$stdin) { $twig->parsefile_html($fname); } else { $twig->parse_html(\*STDIN); } STDOUT->flush; }; if (defined($listfname)) { @ARGV and die "error: both -i option and input filenames are given"; open my $LIST, "<", $listfname or die "error opening list file: ($listfname) $!"; my $_; while (<$LIST>) { /\S/ or next; my($lbase, $fname, $rest) = split " "; length($rest) and die "error: invalid spec in listfile (too many words): $_ -"; if (!defined($fname)) { $lbase =~ m"/([^/]+)/*$" or die "error: cannot find suffix or base uri: $lbase"; $fname = $1; } &$do_file($fname, 0, $lbase); } } elsif (@ARGV) { for my $fname (@ARGV) { &$do_file($fname); } } else { &$do_file("-", 1); } __END__