#!c:/Perl/bin/perl -w
use File::Find;
use strict;
# PURPOSE: find_http.pl finds ALL links in which "http" appears inside the quotes of an
# <a href="..."> tag, in all w3c 4.01 compliant *.ht* files in the ARGV[0] directory and its
# subdirs; in other words, it finds absolute links, including both offsite links and local links
# written in absolute notation, each prepended in the output by the line number in which it is found.
#
# ACKNOWLEDGEMENTS: thanks to ikegami, Joost, holli, Scooterm, zaxo, corion and other PerlMonks,
# for their tutelage (but who bear NO responsibility for non-idiomatic constructs or errors).
# Readers should also note that holli warns emphatically NOT to use regexes
# to parse html, "because it is fatally error-prone, unnecessary (there is HTML::Parser)...",
# and that he calls it "fatally error-prone" especially when applied "to ill-formed html."
# The warning has been taken to heart, but was laid aside for
# the purposes of this drill (schodckwm 1/22/05)
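# For comparison, a minimal sketch (not used by this script) of the HTML::Parser route
# that holli recommends; it assumes HTML::Parser is installed and that a filename is
# in $file, and simply prints any <a> href value beginning with "http":
#
#   use HTML::Parser;
#   my $p = HTML::Parser->new(
#       api_version => 3,
#       start_h     => [ sub {
#                            my ( $tag, $attr ) = @_;
#                            return unless $tag eq 'a' and defined $attr->{href};
#                            print "$attr->{href}\n" if $attr->{href} =~ /^http/i;
#                        },
#                        'tagname, attr' ],
#   );
#   $p->parse_file($file);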
use vars qw( $dirname $file @filenames $input @input $linecounter $part @parts $href $offsite $link @found $subdir );
$linecounter = 0;
$href = qr/<a\s+href\s*=\s*"/i; # the start of an anchor tag's href attribute
$dirname = $ARGV[0];
unless ( defined $dirname ) {
print("\n\tUsage: find_http.pl dirname > outfile.txt\n\twhere dirname can be a relative or absolute path.\n\tEven under MS Windows, use of *nix-style '/' forward slashes in the path is recommended,\n\ti.e., from d:, 'd:/foo' or './foo' (or './long/path/to/target')\n");
exit(1);
}
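# For example (hypothetical paths), run from the mirror's parent directory:
#   perl find_http.pl ./mirror_of_site > found_links.txt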
# call &process_file for each file in the directory (& subdirs) in $ARGV[0]
find \&process_file, "$dirname";
&gethtml;
exit();
####
sub process_file {
if ( $_ =~ /\.ht[ml]{1,2}$/i ) { # collect *.htm / *.html files only
$subdir = $File::Find::dir . "/"; # see below at # 1:
push @filenames, $subdir . $_;
}
}
# 1: (from C:/Perl/lib/File/Find.pm)
# $File::Find::dir is the current directory name,
# also from find.pm:
# $_ is the current filename within that directory
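# Note also that File::Find provides $File::Find::name, the complete pathname
# ($File::Find::dir joined with $_), so the two lines in process_file above
# could equivalently be written as one:
#   push @filenames, $File::Find::name if /\.ht[ml]{1,2}$/i;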
####
sub gethtml {
print "\n\t Files found:\n";
foreach $file(@filenames) {
print "\n\t found: $file";
open INFILE, '<', $file or do { warn "\tCan't open $file: $! "; next; };
push @found, "\n\n\t" . $file ."\n";
@input = <INFILE>; # slurp the whole file to @input
$linecounter = 0; # reset linecounter for next file
foreach $input(@input) {
$linecounter++;
@parts = split m!(</a\s*>)!i, $input; # split at closing anchor tags, keeping them as separate parts
foreach $part(@parts) {
if ( $part =~ m%
(            # CAPTURE to $1
$href        # START ON '<a href="' rather than on '<a href="http' alone,
             # because the link may be formatted with extra whitespace, for
             # example, '<a  HREF = "http://...'
.*           # and take the rest of this part; the close tag was split off above
)            # end capture
%xis ) {
$link = $1 . "</a>"; # add the close tag removed by the split
if ( $link =~ /"\s*http/i ) { # keep only links whose quoted href value starts with "http"
push @found, $linecounter;
push @found, $link;
} # end if $link...
} # end if $part...
} # end foreach $part(@parts)
} #end foreach $input(@input)
close INFILE;
} # end foreach $file(@filenames)
&print_found;
}
# end sub gethtml
#####
sub print_found { # renamed from "print" to avoid any confusion with the builtin
print "\n\n\t Found these \"http\" Links \n";
foreach $link(@found) {
my $out = $link . " ";
if ( $link !~ /^\d+$/ ) { # line numbers keep only the trailing space; links and filenames end the line
$out = $out . "\n";
}
print $out;
}
}
#ENDNOTES: With ActiveState perl 5.8.4, this extracts all "http" links from a
# local mirror (Xitami on E:) of a ~1600 page website and writes them to a
# local ATA drive (F: on a P4, 2.4GHz, w2k box) in ~16 seconds.
# The pages searched range from trivial to ~2400 lines of 4.01 html
#
# It would be non-trivial to output to html; in any case, reformatting each
# "Line_number rendered link" pair so that it displays properly in
# a browser is beyond the scope of this exercise.
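# For the curious, one minimal route (a sketch only, not part of this exercise) is to
# emit the same output between <pre> tags, since each captured anchor is already a
# complete <a ...>...</a> element and will render as a live link:
#   print "<html><body><pre>\n";
#   foreach $link (@found) {
#       print $link, ( $link =~ /^\d+$/ ? " " : "\n" );
#   }
#   print "</pre></body></html>\n";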