# # Usage: site-check.pl # # Checks for: # 1. Broken links # 2. Orphaned files # use strict; use warnings; use HTML::SimpleLinkExtor; use LWP::Simple; use File::Basename; use File::Spec::Functions; use File::Find (); # Generated by find2pl # for the convenience of &wanted calls, # including -eval statements: use vars qw/*name *dir *prune/; *name = *File::Find::name; *dir = *File::Find::dir; *prune = *File::Find::prune; my %file_seen = (); # True if we've seen a file my @external_links = ();# List of external links my @bad_files = (); # Files we did not see my @full_file_list = ();# List of all the files ######################################################## # wanted -- Called by the find routine, this returns # true if the file is wanted. As a side effect # it records any normal file seen in "full_file_list". ######################################################## sub wanted { if (-f "$name") { push(@full_file_list, $name); } return (1); } ######################################################## # process_file($file) # # Read an html file and extract the tags. # # If the file does not exist, put it in the list of # bad files. ######################################################## no warnings 'recursion'; # Turn off recursion warning sub process_file($); # Needed because this is recursive sub process_file($) { my $file_name = shift; # The file to process my $dir_name = dirname($file_name); # Did we do it already if ($file_seen{$file_name}) { return; } $file_seen{$file_name} = 1; if (! -f $file_name) { push(@bad_files, $file_name); return; } # Skip non-html files if (($file_name !~ /\.html$/) and ($file_name !~ /\.htm$/)) { return; } # The parser object to extract the list my $extractor = HTML::SimpleLinkExtor->new(); # Parse the file $extractor->parse_file($file_name); # The list of all the links in the file my @all_links = $extractor->links(); # Check each link foreach my $cur_link (@all_links) { # Is the link external if ($cur_link =~ /^http:\/\//) { # Put it on the list of external links push(@external_links, { file => $file_name, link => $cur_link}); next; } # Remove the "#name" part of the link # We don't check that if ($cur_link =~ /([^#]*)#/) { $cur_link = $1; } if ($cur_link eq "") { next; } # Get the name of the file my $next_file = "$dir_name/$cur_link"; # Remove any funny characters in the name $next_file = File::Spec->canonpath($next_file); # Follow the links in this file process_file($next_file); } } # Turn on deep recursion warning use warnings 'recursion'; if ($#ARGV != 0) { print STDERR "Usage: $0 \n"; exit (8); } # Top level file my $top_file = $ARGV[0]; if (-d $top_file) { $top_file .= "/index.html"; } if (! -f $top_file) { print STDERR "ERROR: No such file $top_file\n"; exit (8); } # Scan all the links process_file($top_file); print "Broken Internal Links\n"; foreach my $cur_file (sort @bad_files) { print "\t$cur_file\n"; } # Traverse desired filesystems File::Find::find({wanted => \&wanted}, dirname($ARGV[0])); print "Orphan Files\n"; foreach my $cur_file (sort @full_file_list) { if (not defined($file_seen{$cur_file})) { print "\t$cur_file\n"; } } print "Broken External Links\n"; foreach my $cur_file (sort @external_links) { if (not (head($cur_file->{link}))) { print "\t$cur_file->{file} => $cur_file->{link}\n"; } }