#!/usr/bin/perl # Web Site Mapper # Copyright (C) 2004 Timm Murray # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA # use strict; use warnings; use WWW::Mechanize; use URI; use YAML; #use Data::Dumper 'Dumper'; sub SITES () { qw( www.perlmonks.org ) } sub DISALLOWED_SCHEMES () { qw( mailto ) } sub DEBUG () { 1 } sub get_page { my ($data, $uri) = @_; my $page_to_load = $uri->canonical; warn "Indexing $page_to_load\n" if DEBUG; # Don't process pages that have already been loaded if(exists $data->{$page_to_load}) { warn "\t$page_to_load already indexed\n" if DEBUG; return; } # Don't process pages that aren't listed in the sites above unless( grep { lc($uri->host) eq lc($_) } SITES) { warn "\t$page_to_load not in allowed sites\n" if DEBUG +; return; } my $mech = WWW::Mechanize->new(); my $response = $mech->get( $page_to_load ); $data->{$page_to_load}{status} = $mech->status; if($mech->success) { warn "\tResponse successful\n" if DEBUG; $data->{$page_to_load}{content_type} = $mech->ct; warn "\tContent-type: " . $data->{$page_to_load}{content_type} . "\n" if DEBUG; $data->{$page_to_load}{title} = $mech->title; warn "\tTitle: " . $data->{$page_to_load}{title} . "\n" if DEBUG; my @links = map { $_->url_abs } $mech->links; $data->{$page_to_load}{links} = []; warn "\tLinks: " . join("\n", map "\t\t$_", @links) . +"\n" if DEBUG; foreach my $link (@links) { my $uri = URI->new($link); next if grep { $uri->scheme eq $_ } DISALLOWED +_SCHEMES; my $url = $uri->canonical->as_string; warn "\tFollowing $url\n" if DEBUG; push @{ $data->{$page_to_load}{links} }, $url; get_page( $data, $uri ); } } else { warn "\tResponse unsuccessful\n" if DEBUG; } } { my $data = { }; foreach my $site (SITES) { my $start_page = 'http://' . $site; my $uri = URI->new($start_page); get_page( $data, $uri ); } print Dump($data); #print Dumper($data); }

In reply to Web Site Mapper by hardburn

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.