#!/usr/bin/perl

package Metabot;

use strict;
use WWW::SimpleRobot;
use HTML::Entities;
require HTML::Parser;
use HTML::Summary;
use HTML::TreeBuilder;
use Lingua::EN::Keywords;

# Subclass HTML::Parser; constructed with no arguments, HTML::Parser runs
# in its v2-compatible mode and calls our text() method for each text chunk.
@Metabot::ISA = qw(HTML::Parser);

my $url = $ARGV[0] or die "Usage: $0 <url>\n";
my $parser = Metabot->new;

my $robot = WWW::SimpleRobot->new(
    URLS           => [ $url ],
    FOLLOW_REGEX   => "^$url",   # only follow links under the start URL
    DEPTH          => 2,
    TRAVERSAL      => 'depth',   # depth-first traversal
    VISIT_CALLBACK => sub {
        my ( $url, $depth, $html, $links ) = @_;
        print "$url - depth $depth\n";

        # Decode entities, then strip inline document.write() calls and
        # any numeric character references that survived decoding.
        $html = decode_entities($html);
        $html =~ s/document\.write\(.+?\)\;//g;
        $html =~ s/\&\#.+?\;//g;

        my $tree = HTML::TreeBuilder->new;
        $tree->parse($html);

        my $summarizer = HTML::Summary->new(
            LENGTH   => 250,
            USE_META => 1,   # prefer the page's meta description if present
        );
        my $summary = $summarizer->generate( $tree );
        $summary =~ s/\s+/ /gs;
        print "Summary: $summary\n";
        $tree->delete;   # HTML::TreeBuilder trees must be freed explicitly

        # Reset the accumulated text so the keywords reflect this page only,
        # and call eof() so the parser is ready for the next document.
        $parser->{TEXT} = '';
        $parser->parse($html);
        $parser->eof;
        my $text = $parser->{TEXT};

        my @keywords = keywords( $summary . $text );
        print "Keywords: " . join( ", ", @keywords ) . "\n\n";
    },
    BROKEN_LINK_CALLBACK => sub {
        my ( $url, $linked_from, $depth ) = @_;
        print STDERR "$url looks like a broken link on $linked_from\n";
        print STDERR "Depth = $depth\n";
    },
);

$robot->traverse;

# After traversal the robot exposes the visited URLs and per-page metadata
# (URL, depth, last-modification time) for any further processing.
my @urls  = @{ $robot->urls };
my @pages = @{ $robot->pages };
for my $page ( @pages ) {
    my $url               = $page->{url};
    my $depth             = $page->{depth};
    my $modification_time = $page->{modification_time};
}

# Called by HTML::Parser for each text chunk; accumulate it on the object.
sub text {
    my ( $self, $text ) = @_;
    $self->{TEXT} .= $text;
}
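
# Usage sketch: assuming the script is saved as metabot.pl (an illustrative
# filename) and the CPAN modules above (WWW::SimpleRobot, HTML::Summary,
# Lingua::EN::Keywords, etc.) are installed, it takes one starting URL:
#
#   perl metabot.pl http://www.example.com/
#
# The robot crawls pages under that URL to a depth of 2, printing a short
# summary and a keyword list for each page it visits; broken links are
# reported on STDERR.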