#!/usr/bin/perl package Metabot; use warnings; use strict; use WWW::SimpleRobot; use HTML::Entities; require HTML::Parser; use Lingua::EN::Summarize; use HTML::Summary; use HTML::TreeBuilder; use Lingua::EN::Keywords; use HTML::Tree; use LWP::Simple; @Metabot::ISA = qw(HTML::Parser); my $url = $ARGV[0]; my $parser = Metabot->new; my $robot = WWW::SimpleRobot->new( URLS => [ $url ], FOLLOW_REGEX => "^$url", DEPTH => 2, TRAVERSAL => 'depth', VISIT_CALLBACK => \&Botulism, BROKEN_LINK_CALLBACK => \&Snicklefritz, ); $robot->traverse; my @urls = @{$robot->urls}; my @pages = @{$robot->pages}; for my $page ( @pages ) { my $url = $page->{url}; my $depth = $page->{depth}; my $modification_time = $page->{modification_time}; } sub Botulism { my ( $url, $depth, $html, $links ) = @_; print "\nURL: $url - depth $depth\n"; $html = decode_entities($html); $html =~ s/document\.write\(.+?\)\;//g; $html =~ s/\&\#.+?\;//g; my $tree = HTML::TreeBuilder->new(); $tree->parse($html); no warnings 'uninitialized'; eval { my $Title = substr $tree->look_down( '_tag', 'title' )->as_text , 0, 65; print "Title exists and is: $Title.\n"; } or do { my $Title; for my $tag( qw' h1 h2 h3 h4 p ' ){ last if eval { $Title = substr $tree->look_down( '_tag', $tag )->as_text , 0, 65; if( length $Title ){ $html->push_content($Title); print "No title was found so the first $tag tag contents \n were written to the title field in the header.\n"; } } } unless($Title){ print "No title exists and no suitable \ntext was found by this bot to use as one.\n"; } }; my $filteredhtml = summarize( $html, filter => 'html' ); my $summary = summarize( $filteredhtml, maxlength => 500 ); $summary =~ s/\s+/ /gs; my $var = substr($summary, 0, 155); print "Using Lingua::EN::Summarize Summary: $var\n\n"; local $\ = $/; my $newmetadescription = HTML::Element->new('meta', 'name' => 'description', 'content' => "$var"); $tree->push_content("_content", "head", "text", "$newmetadescription"); $newmetadescription = $newmetadescription->delete; my $title = substr $tree->look_down( '_tag', 'title' )->as_text , 0, 65; my @keywords = keywords($title.$summary); print "Keywords: " . join(", ", @keywords) . "\n\n"; local $\ = $/; my $newmetakeywords = HTML::Element->new('meta', 'name' => 'keywords', 'content' => "@keywords"); $tree->push_content("_content", "head", "text", "$newmetakeywords"); $newmetakeywords = $newmetakeywords->delete; local $\ = $/; print $_->as_HTML for $tree->look_down( '_tag', 'img ', sub { not defined $_[0]->attr('alt') } ); print '---'; print $_->as_HTML for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } ); print '---'; $_->attr( alt => MAlt($_) ) for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } ); print $_->as_HTML for $tree->look_down(qw' _tag img '); print $tree->as_HTML; $tree = $tree->delete; } sub MAlt { my $imgscalar = $_[0]; my $imgsrc = $imgscalar->attr('src'); use File::Basename; my @suffixlist = qw(.gif .jpg .jpeg .png .bmp .php .ico .GIF .JPG .JPEG .PNG .BMP .PHP .ICO); my $imgfilenopathnoext = fileparse($imgsrc,@suffixlist); '!' . $imgfilenopathnoext; } sub Snicklefritz { my ( $url, $linked_from, $depth ) = @_; print "The link $url from the page $linked_from at depth $depth\n appears to be broken. please repair the link manually\n"; } sub Ebola { my( $html, $clip, $text ) = @_; if(defined $text and length $text ) { $text = substr $text, 0, $clip; $html->push_content( $text ); } } }