#
# Usage: site-check.pl <top-file> 
#
# Checks for:
#	1. Broken links
#	2. Orphaned files
#
use strict;
use warnings;

use HTML::SimpleLinkExtor;
use LWP::Simple;
use File::Basename;
use File::Spec::Functions;
use File::Find ();

# Generated by find2pl
# for the convenience of &wanted calls, 
# including -eval statements:
use vars qw/*name *dir *prune/;
*name   = *File::Find::name;
*dir    = *File::Find::dir;
*prune  = *File::Find::prune;

my %file_seen = ();	# True if we've seen a file
my @external_links = ();# List of external links

my @bad_files = ();	# Files we did not see
my @full_file_list = ();# List of all the files


########################################################
# wanted -- Called by the find routine, this returns
#	true if the file is wanted.  As a side effect
#	it records any normal file seen in "full_file_list".
########################################################
sub wanted {
    if (-f "$name") {
        push(@full_file_list, $name);
    }
    return (1);
}

########################################################
# process_file($file)
#
# Read an html file and extract the tags.  
#
# If the file does not exist, put it in the list of 
# bad files.
########################################################
no warnings 'recursion';	# Turn off recursion warning

sub process_file($);	# Needed because this is recursive
sub process_file($)
{
    my $file_name = shift;	# The file to process
    my $dir_name = dirname($file_name);

    # Did we do it already
    if ($file_seen{$file_name}) {
        return;
    }
    $file_seen{$file_name} = 1;
    if (! -f $file_name) {
        push(@bad_files, $file_name);
	return;
    }

    # Skip non-html files
    if (($file_name !~ /\.html$/) and 
        ($file_name !~ /\.htm$/)) {
        return;
    }
    # The parser object to extract the list
    my $extractor = HTML::SimpleLinkExtor->new();

    # Parse the file
    $extractor->parse_file($file_name);

    # The list of all the links in the file
    my @all_links = $extractor->links();

    # Check each link
    foreach my $cur_link (@all_links) {

	# Is the link external
        if ($cur_link =~ /^http:\/\//) {
	    # Put it on the list of external links
	    push(@external_links, {
		file => $file_name,
	    	link => $cur_link});
	    next;
	}
	# Remove the "#name" part of the link
	# We don't check that
	if ($cur_link =~ /([^#]*)#/) {
	    $cur_link = $1;
	}
	if ($cur_link eq "") {
	    next;
	}
	# Get the name of the file
	my $next_file = "$dir_name/$cur_link";

	# Remove any funny characters in the name
	$next_file = File::Spec->canonpath($next_file);

	# Follow the links in this file
	process_file($next_file);
    }
}
# Turn on deep recursion warning
use warnings 'recursion';	

if ($#ARGV != 0) {
    print STDERR "Usage: $0 <top-file>\n";
    exit (8);
}
        
# Top level file
my $top_file = $ARGV[0];
if (-d $top_file) {
    $top_file .= "/index.html";
}
if (! -f $top_file) {
    print STDERR "ERROR: No such file $top_file\n";
    exit (8);
}

# Scan all the links
process_file($top_file);

print "Broken Internal Links\n";
foreach my $cur_file (sort @bad_files)
{
    print "\t$cur_file\n";
}

# Traverse desired filesystems
File::Find::find({wanted => \&wanted}, dirname($ARGV[0]));

print "Orphan Files\n";
foreach my $cur_file (sort @full_file_list)
{
    if (not defined($file_seen{$cur_file})) {
	print "\t$cur_file\n";
    }
}

print "Broken External Links\n";
foreach my $cur_file (sort @external_links) {
    if (not (head($cur_file->{link}))) {
	print "\t$cur_file->{file} => $cur_file->{link}\n";
    }
}