#!/usr/bin/perl # Web Site Mapper # Copyright (C) 2004 Timm Murray # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # use strict; use warnings; use WWW::Mechanize; use URI; use YAML; #use Data::Dumper 'Dumper'; sub SITES () { qw( www.perlmonks.org ) } sub DISALLOWED_SCHEMES () { qw( mailto ) } sub DEBUG () { 1 } sub get_page { my ($data, $uri) = @_; my $page_to_load = $uri->canonical; warn "Indexing $page_to_load\n" if DEBUG; # Don't process pages that have already been loaded if(exists $data->{$page_to_load}) { warn "\t$page_to_load already indexed\n" if DEBUG; return; } # Don't process pages that aren't listed in the sites above unless( grep { lc($uri->host) eq lc($_) } SITES) { warn "\t$page_to_load not in allowed sites\n" if DEBUG; return; } my $mech = WWW::Mechanize->new(); my $response = $mech->get( $page_to_load ); $data->{$page_to_load}{status} = $mech->status; if($mech->success) { warn "\tResponse successful\n" if DEBUG; $data->{$page_to_load}{content_type} = $mech->ct; warn "\tContent-type: " . $data->{$page_to_load}{content_type} . "\n" if DEBUG; $data->{$page_to_load}{title} = $mech->title; warn "\tTitle: " . $data->{$page_to_load}{title} . "\n" if DEBUG; my @links = map { $_->url_abs } $mech->links; $data->{$page_to_load}{links} = []; warn "\tLinks: " . join("\n", map "\t\t$_", @links) . "\n" if DEBUG; foreach my $link (@links) { my $uri = URI->new($link); next if grep { $uri->scheme eq $_ } DISALLOWED_SCHEMES; my $url = $uri->canonical->as_string; warn "\tFollowing $url\n" if DEBUG; push @{ $data->{$page_to_load}{links} }, $url; get_page( $data, $uri ); } } else { warn "\tResponse unsuccessful\n" if DEBUG; } } { my $data = { }; foreach my $site (SITES) { my $start_page = 'http://' . $site; my $uri = URI->new($start_page); get_page( $data, $uri ); } print Dump($data); #print Dumper($data); }