#!/usr/bin/perl
#
# Crawl the site rooted at $base, starting from /it/index.html, and print the
# server-side path of every same-site resource that the crawled pages link to.
use strict;
use warnings;
use HTML::LinkExtractor;
use LWP::Simple qw(get);

# Base for searching
my $base = "http://business.intra.company.com";

# List of all the links found: one URI object per distinct path, in the order
# the paths were first encountered.
my @allLinks;

# Paths already stored in @allLinks, so each resource is listed only once.
my %listed;

# Paths already handed to recursiveFollow, so pages that link to one another
# are fetched once instead of being followed forever.
my %visited;

# Start here on the recursive traversal
recursiveFollow("/it/index.html");

foreach my $resource (@allLinks) {
    # Print a list of resources to be used on apollo
    print "/wwwprod/docs/business/docs" . $resource->path, "\n";
}

# recursiveFollow($file)
#
# Fetch "$base$file", record every same-site link found in it in @allLinks,
# then follow each linked .htm/.html page in the same way.
#
#   $file - absolute URL path of the page to fetch, e.g. "/it/index.html"
#
# Returns nothing.  Warns and returns early when the page cannot be fetched.
# A path that has already been processed (or attempted) is skipped silently.
sub recursiveFollow {
    my $file = shift;

    # Process each page once; without this guard, pages that link back to
    # each other recurse without end.
    return if $visited{$file}++;

    my $html = get("$base$file");
    if (!defined $html) {
        warn "file not found: $base$file\n";
        return;
    }

    # DEBUG
    print "got $base$file\n";
    # /DEBUG

    # Relative links must be resolved against the URL of the document that
    # contains them, not against a fixed directory, so the extractor is
    # built per document with that document's URL as its base.
    my $extractor = HTML::LinkExtractor->new(undef, "$base$file");
    $extractor->parse(\$html);

    my @thisDocLinks;
    for my $link (@{ $extractor->links }) {
        my $href = $$link{href};
        next if !defined $href;

        # Stash the link only if it points at the intranet site over
        # http/https (this also rejects "file:///", mailto:, etc.)
        next if !_isSiteLink($href);

        my $path = $href->path;
        push @allLinks, $href if !$listed{$path}++;

        # Test the path, not the whole URL, so that a trailing "#fragment"
        # or "?query" does not hide an htm/html page.
        push @thisDocLinks, $path if $path =~ /\.html?\z/;
    }

    # Follow each link to an htm/html file found in this file recursively
    foreach my $path (@thisDocLinks) {
        recursiveFollow($path);
    }

    return;
}

# _isSiteLink($uri)
#
# Return 1 when $uri (an absolute URI object produced by the link extractor)
# uses the http or https scheme and its host contains "business.intra";
# return 0 otherwise.
sub _isSiteLink {
    my $uri = shift;

    my $scheme = $uri->scheme;
    return 0 if !defined $scheme;
    return 0 if $scheme !~ /\Ahttps?\z/;

    my $host = $uri->host;
    return 0 if !defined $host;

    return ($host =~ /business\.intra/) ? 1 : 0;
}

# Everything after __END__ is the recorded output of the earlier version of
# this script, which had no visited-page check and resolved every relative
# link against "$base/it/".  It is kept as the record of those two failures.
__END__
####
$ ./findorphans.pl
got http://business.intra.company.com/it/index.html
got http://business.intra.company.com/it/meetings/meeting_schedule.htm
file not found: http://business.intra.company.com/meetings/meeting_schedule.htm
file not found: http://business.intra.company.com/associates/associate_info.htm
file not found: http://business.intra.company.com/associates/index.html
got http://business.intra.company.com/it/don/spotlight_winners.html
file not found: http://business.intra.company.com/ask/index.html
file not found: http://business.intra.company.com/meetings/meeting_schedule.htm
file not found: http://business.intra.company.com/associates/associate_info.htm
file not found: http://business.intra.company.com/associates/index.html
got http://business.intra.company.com/it/associates/associate_info.htm
file not found: http://business.intra.company.com/../index.html
got http://business.intra.company.com/index.html
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
got http://business.intra.company.com/aboutbusiness/bus_history.htm
...