#!/usr/bin/perl
# Adapted, in part, from Recipe 20.7 of The Perl Cookbook.
#
# Link checker: fetch a page, extract every link it contains, and
# report any links that cannot be resolved.

use strict;
use warnings;

use HTML::LinkExtor;
use LWP::Simple;

my @links;      # holds all links found on target page
my %dead;       # set of unresolvable (dead) links
my %checked;    # cache of already-tested URLs so duplicates are fetched once

my $base_url = shift
    or die "Usage: $0 <url>\n\n",
           "(including scheme prefix, e.g. http://)\n";

unless ( url_exists($base_url) ) {
    die "Unable to test links;\n",
        "$base_url could not be reached\n\n";
}

print "Parsing $base_url...\n";
my $parser = HTML::LinkExtor->new( undef, $base_url );
$parser->parse( get($base_url) );
@links = $parser->links;

print "Checking links";
foreach my $linkarray (@links) {
    print '.';    # show user that something is happening.
    my @element  = @$linkarray;
    my $elt_type = shift @element;
    while (@element) {
        my ( $attr_name, $attr_value ) = splice( @element, 0, 2 );

        # Skip mailto: links, which you shouldn't have anyway.
        # Anchored at the start of the URL so pages whose path merely
        # *contains* "mailto" (e.g. /docs/mailto-help.html) still get checked.
        next if $attr_value =~ /\Amailto:/i;

        # Test each distinct URL only once; repeated links would otherwise
        # trigger a fresh HEAD (and possibly GET) request every time.
        next if exists $checked{$attr_value};
        $checked{$attr_value} = url_exists($attr_value);
        $dead{$attr_value}++ unless $checked{$attr_value};
    }
}

print "\nAll links checked.\n\n",
      "$base_url contains ", scalar @links, " link(s)";

unless ( scalar keys %dead ) {
    print "; all live.\n";
}
else {
    print "; sadly, the following links aren't working:\n\n";
    for ( sort keys %dead ) { print $_, "\n" }
    print "\n";
}

# url_exists( $url )
# Returns a true value if $url can be reached, false otherwise.
#
# Two tests are done because some sites do not appear to properly
# support HEAD requests.  Ex: www.borland.com
# As a result, we were getting some false positives; the extra GET
# prevents those.
sub url_exists {
    my $url = shift;
    return head($url) || get($url);
}