#!c:/Perl/bin -w
use File::Find;
use strict;

# PURPOSE: find_http.pl finds ALL links in which "http" appears inside an
# '<a href="...">' tag in all w3c 4.01 compliant *.ht* files in the ARGV[0]
# directory and its subdirs; in other words, it finds absolute links, including
# both offsite links and local links written in absolute notation, each
# preceded in the output by the line number in which the link is found.
#
# ACKNOWLEDGEMENTS: thanks to ikegami, Joost, holli, Scooterm, zaxo, corion and
# other PerlMonks for their tutelage (but who bear NO responsibility for
# non-idiomatic constructs or errors). Readers should also note that holli
# warns emphatically NOT to use regexes to parse html ("because it is fatally
# error-prone, unnecessary (there is HTML::Parser)..."), and that he calls it
# "fatally error-prone" especially when applied "to ill-formed html." The
# warning has been taken to heart, but was laid aside for the purposes of this
# drill (schodckwm 1/22/05). A minimal HTML::Parser sketch is appended at the
# end for comparison.

use vars qw( $dirname $file @filenames $input @input $linecounter
             $part @parts $href $offsite $link @found $subdir );

$linecounter = 0;
$href        = qr/<a href="http/i;   # start of an absolute link, e.g. <a href="http://...
$dirname     = $ARGV[0];

unless ( defined $dirname and -d $dirname ) {
    print("\n\tUSAGE: find_http.pl <dirname> > outfile.txt\n"
        . "\twhere dirname can be a relative or absolute path.\n"
        . "\tEven under MS Windows, use of *nix-style '/' forward slashes in the path is recommended,\n"
        . "\ti.e., from d:, 'd:/foo' or './foo' (or './long/path/to/target')\n");
    exit(1);
}

# call &process_file for each file in the directory (& subdirs) in $ARGV[0]
find \&process_file, "$dirname";
&gethtml;
exit();

####

sub process_file {
    if ( $_ =~ /(.+)\.(ht[ml]{1,2})$/i ) {   # *.htm and *.html files
        $subdir = $File::Find::dir . "/";    # see below at # 1:
        push @filenames, $subdir . $_;
    }
    else {
        return;
    }
}

# 1: (from c:/Perl/lib/File/Find.pm)
#    $File::Find::dir is the current directory name;
#    also from Find.pm:
#    $_ is the current filename within that directory

####

sub gethtml {
    print "\n\t Files found:\n";
    foreach $file (@filenames) {
        print "\n\t found: $file";
        open INFILE, '<', $file
            or do { warn "\tCan't open $file: $! "; next };
        push @found, "\n\n\t" . $file . "\n";
        @input = <INFILE>;          # slurp the whole file to @input
        $linecounter = 0;           # reset linecounter for the next file
        foreach $input (@input) {
            $linecounter++;
            # break the line at each closing anchor tag, so each part holds
            # at most one complete link (minus its </a>)
            @parts = split m!(</a>)!i, $input;
            foreach $part (@parts) {
                if ( $part =~ m%
                        (            # CAPTURE to $1
                          $href      # START ON '<a href="http'
                          .*         # and take the rest of the part,
                                     # because the link may be formatted, for
                                     # example, '<a href="http://foo.org" target="_blank">foo'
                        )
                     %x ) {
                    $link = $1 . "</a>";    # add the close tag
                    push @found, $linecounter;
                    push @found, $link;
                }    # end if $part...
            }    # end foreach $part (@parts)
        }    # end foreach $input (@input)
        close INFILE;
    }    # end foreach $file (@filenames)
    print_found(@found);
}    # end sub gethtml

#####

# print the accumulated line numbers and links
sub print_found {
    print "\n\n\t Found these \"http\" Links \n";
    foreach $link (@found) {
        my $out = $link . " ";
        if ( $link !~ /^\d*$/ ) {   # links and file headers get their own line;
            $out = $out . "\n";     # bare line numbers stay on the link's line
        }
        print $out;
    }
}

# ENDNOTES: With ActiveState perl 5.8.4, this extracts all "http" links from a
# local mirror (Xitami on E:) of a ~1600 page website and writes them to a
# local ATA drive (F: on a P4, 2.4GHz, w2k box) in ~16 seconds.
# The pages searched range from trivial to ~2400 lines of 4.01 html.
#
# It would be non-trivial to output to html: reformatting each
# "Line_number rendered link" pair so that a browser displays it properly
# is beyond the scope of this exercise.
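
#####
# APPENDIX (not part of the original drill): a minimal sketch of the
# HTML::Parser approach that holli recommends above, included only for
# comparison. Save it as its own script (the name find_http_parser.pl is
# made up for illustration) and point it at a single file:
#
#     perl find_http_parser.pl some_page.html
#
# It assumes HTML::Parser 3.x is installed (it ships with ActivePerl) and,
# like find_http.pl, prints the line number followed by any href value
# that begins with "http". Unlike find_http.pl, it walks one file, not a
# directory tree.

use strict;
use warnings;
use HTML::Parser;

my $file = shift or die "USAGE: find_http_parser.pl <file.html>\n";

my $parser = HTML::Parser->new(
    api_version => 3,
    # 'tagname' is the lowercased tag, 'attr' a hashref of its attributes,
    # 'line' the line number where the tag starts
    start_h     => [ \&start_tag, 'tagname, attr, line' ],
);

sub start_tag {
    my ( $tag, $attr, $line ) = @_;
    return unless $tag eq 'a' and defined $attr->{href};
    print "$line $attr->{href}\n" if $attr->{href} =~ m/^http/i;
}

$parser->parse_file($file) or die "Can't open $file: $!\n";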