comment on

I am trying to run the code originally written by Steve Oualline

Description : One of the most vexing problems facing a webmaster is making sure that all the links on their website are correct. Internal links are difficult to deal with. Every time a file is added, removed, or changed on your website, there is the possibility of generating dead links. External links are even worse. Not only are they not under your control, but they disappear without a moment's notice. What's needed is a way of automatically checking a site for links that just don't work.

When I am trying to run the script I am getting the following error. I am using Windows 7.

C:\demo>site-check.pl http://www.codeinside.com
ERROR: No such file http://www.codeinside.com

Please advice on how to run the program successfully. Where am I going wrong?

#
# Usage: site-check.pl <top-file> 
#
# Checks for:
#    1. Broken links
#    2. Orphaned files
#
use strict;
use warnings;

use HTML::SimpleLinkExtor;
use LWP::Simple;
use File::Basename;
use File::Spec::Functions;
use File::Find ();

# Generated by find2pl
# for the convenience of &wanted calls, 
# including -eval statements:
use vars qw/*name *dir *prune/;
*name   = *File::Find::name;
*dir    = *File::Find::dir;
*prune  = *File::Find::prune;

my %file_seen = ();    # True if we've seen a file
my @external_links = ();# List of external links

my @bad_files = ();    # Files we did not see
my @full_file_list = ();# List of all the files


########################################################
# wanted -- Called by the find routine, this returns
#    true if the file is wanted.  As a side effect
#    it records any normal file seen in "full_file_list".
########################################################
sub wanted {
    if (-f "$name") {
        push(@full_file_list, $name);
    }
    return (1);
}

########################################################
# process_file($file)
#
# Read an html file and extract the tags.  
#
# If the file does not exist, put it in the list of 
# bad files.
########################################################
no warnings 'recursion';    # Turn off recursion warning

sub process_file($);    # Needed because this is recursive
sub process_file($)
{
    my $file_name = shift;    # The file to process
    my $dir_name = dirname($file_name);

    # Did we do it already
    if ($file_seen{$file_name}) {
        return;
    }
    $file_seen{$file_name} = 1;
    if (! -f $file_name) {
        push(@bad_files, $file_name);
    return;
    }

    # Skip non-html files
    if (($file_name !~ /\.html$/) and 
        ($file_name !~ /\.htm$/)) {
        return;
    }
    # The parser object to extract the list
    my $extractor = HTML::SimpleLinkExtor->new();

    # Parse the file
    $extractor->parse_file($file_name);

    # The list of all the links in the file
    my @all_links = $extractor->links();

    # Check each link
    foreach my $cur_link (@all_links) {

    # Is the link external
        if ($cur_link =~ /^http:\/\//) {
        # Put it on the list of external links
        push(@external_links, {
        file => $file_name,
            link => $cur_link});
        next;
    }
    # Remove the "#name" part of the link
    # We don't check that
    if ($cur_link =~ /([^#]*)#/) {
        $cur_link = $1;
    }
    if ($cur_link eq "") {
        next;
    }
    # Get the name of the file
    my $next_file = "$dir_name/$cur_link";

    # Remove any funny characters in the name
    $next_file = File::Spec->canonpath($next_file);

    # Follow the links in this file
    process_file($next_file);
    }
}
# Turn on deep recursion warning
use warnings 'recursion';    

if ($#ARGV != 0) {
    print STDERR "Usage: $0 <top-file>\n";
    exit (8);
}
        
# Top level file
my $top_file = $ARGV[0];
if (-d $top_file) {
    $top_file .= "/index.html";
}
if (! -f $top_file) {
    print STDERR "ERROR: No such file $top_file\n";
    exit (8);
}

# Scan all the links
process_file($top_file);

print "Broken Internal Links\n";
foreach my $cur_file (sort @bad_files)
{
    print "\t$cur_file\n";
}

# Traverse desired filesystems
File::Find::find({wanted => \&wanted}, dirname($ARGV[0]));

print "Orphan Files\n";
foreach my $cur_file (sort @full_file_list)
{
    if (not defined($file_seen{$cur_file})) {
    print "\t$cur_file\n";
    }
}

print "Broken External Links\n";
foreach my $cur_file (sort @external_links) {
    if (not (head($cur_file->{link}))) {
    print "\t$cur_file->{file} => $cur_file->{link}\n";
    }
}
[download]

In reply to Website link checker by shajiindia

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.