#!/usr/bin/perl
package Metabot;

# Metabot - crawl a site starting at the URL given as the first command-line
# argument and, for every page visited, report:
#   * its <title>, or a suggested one built from the first h1/h2/h3/h4/p;
#   * a meta description produced by Lingua::EN::Summarize;
#   * meta keywords produced by Lingua::EN::Keywords;
#   * <img> elements lacking alt text, with an alt derived from the file name.
#
# The suggestions are also inserted into the parsed HTML tree of the page.
#
# History: an earlier revision died with
#   Can't call method "push_content" without a package or object reference
# because push_content() was invoked on the raw HTML *string*.  Every
# insertion now targets an HTML::Element inside the parsed tree.

use warnings;
use strict;

use WWW::SimpleRobot;
use HTML::Entities;
require HTML::Parser;
use Lingua::EN::Summarize;
use HTML::Summary;
use HTML::TreeBuilder;
use HTML::Element;
use Lingua::EN::Keywords;
use HTML::Tree;
use LWP::Simple;
use File::Basename;

use constant TITLE_MAX_LENGTH       => 65;     # characters kept for a suggested title
use constant DESCRIPTION_MAX_LENGTH => 155;    # characters kept for the meta description
use constant SUMMARY_MAX_LENGTH     => 500;    # maxlength handed to summarize()

@Metabot::ISA = qw(HTML::Parser);

my $url = $ARGV[0];
die "Usage: $0 <start-url>\n" unless defined $url && length $url;

my $parser = Metabot->new;

my $robot = WWW::SimpleRobot->new(
    URLS                 => [$url],
    # \Q...\E: the start URL is literal text, not a pattern ('.' and '?' in a
    # URL must not act as regex metacharacters).
    FOLLOW_REGEX         => "^\Q$url\E",
    DEPTH                => 2,
    TRAVERSAL            => 'depth',
    VISIT_CALLBACK       => \&_visit_page,
    BROKEN_LINK_CALLBACK => \&_report_broken_link,
);

$robot->traverse;

# $robot->urls and $robot->pages hold the crawl results after traverse();
# the original script only copied their fields into unused locals, so nothing
# further is done with them here.

# HTML::Parser text handler: accumulates the text chunks handed to it in
# $self->{TEXT}.
sub text {
    my ( $self, $text ) = @_;
    $self->{TEXT} .= $text;
}

# VISIT_CALLBACK for WWW::SimpleRobot.
#   $url   - address of the page just fetched
#   $depth - crawl depth of that page
#   $html  - page content
#   $links - links found on the page (unused)
# Prints the title / description / keyword / alt-text report for the page.
sub _visit_page {
    my ( $url, $depth, $html, $links ) = @_;

    print "\nURL: $url - depth $depth\n";

    $html = _sanitize_html($html);

    # new_from_content() parses and calls eof(), so the tree is complete
    # before it is queried.
    my $tree = HTML::TreeBuilder->new_from_content($html);

    # --- title -----------------------------------------------------------
    my $title_element = $tree->look_down( _tag => 'title' );

    # A page without <title> yields no element; never call as_text on undef.
    my $title_text = $title_element ? $title_element->as_text : '';
    $title_text = '' unless defined $title_text;

    if ( $title_text =~ /\S/ ) {
        print "\nTitle: $title_text\n\n";
    }
    else {
        my ( $suggestion, $kind ) = _suggest_title($tree);
        if ( defined $suggestion ) {
            _set_title( $tree, $title_element, $suggestion );
            $title_text = $suggestion;
            print "\n$url does not have a title. We created one from\n"
              . ' the first '
              . TITLE_MAX_LENGTH
              . " characters your first $kind tag :\n"
              . " $suggestion.\n Please change if desired.\n\n";
        }
        else {
            print "$url does not have a title and we are unable to suggest any.\n";
        }
    }

    # --- meta description ------------------------------------------------
    my $summary = summarize(
        $html,
        filter    => 'html',
        maxlength => SUMMARY_MAX_LENGTH,
    );
    $summary = '' unless defined $summary;
    $summary =~ s/\s+/ /g;

    my $description = substr( $summary, 0, DESCRIPTION_MAX_LENGTH );
    print "Using Lingua::EN::Summarize Summary: $description\n\n";
    _add_meta( $tree, 'description', $description );

    # --- meta keywords ---------------------------------------------------
    # Feed the title *text* (not the element object, which would stringify
    # to "HTML::Element=HASH(...)") plus the summary to the extractor.
    my @keywords = keywords("$title_text $summary");
    print "Keywords: " . join( ", ", @keywords ) . "\n\n";
    _add_meta( $tree, 'keywords', join( ', ', @keywords ) );

    # --- image alt text --------------------------------------------------
    _report_and_fill_alts($tree);

    # Release the tree explicitly; one is built for every page crawled.
    $tree->delete;

    return;
}

# BROKEN_LINK_CALLBACK for WWW::SimpleRobot: reports the dead link on STDERR.
sub _report_broken_link {
    my ( $url, $linked_from, $depth ) = @_;
    print STDERR "$url looks like a broken link on $linked_from\n";
    print STDERR "Depth = $depth\n";
    return;
}

# Stand-alone entry point for the image report: prints the page header,
# parses $html and runs the missing-alt report on it.  Takes the same
# arguments as the visit callback.
sub Second {
    my ( $url, $depth, $html, $links ) = @_;

    print "\nURL: $url - depth $depth\n";

    my $tree = HTML::TreeBuilder->new_from_content( _sanitize_html($html) );
    _report_and_fill_alts($tree);
    $tree->delete;

    return;
}    ## end sub Second

# Builds the alt text for an <img> element: the src file name without
# directory, query string, fragment or extension, prefixed with '!' (the
# prefix marks the value as generated).  Returns just '!' when the element
# has no usable src.
sub MAlt {
    my ($image) = @_;

    my $src = $image->attr('src');
    return '!' unless defined $src && length $src;

    # Drop "?query" / "#fragment" so they do not end up in the name.
    $src =~ s/[?#].*\z//s;
    return '!' unless length $src;

    # The suffix argument is a pattern; this one strips the final ".ext"
    # including the dot.  (Bare words such as "gif" left "logo." behind.)
    my $name = fileparse( $src, qr/\.[^.]*/ );

    return '!' . $name;
}

# Normalises raw page content before parsing: decodes entities, then removes
# inline document.write(...) calls and any remaining numeric entities.
# NOTE(review): decoding entities before parsing turns "&lt;" into markup;
# kept because the original pipeline did it - confirm this is intended.
sub _sanitize_html {
    my ($html) = @_;

    $html = '' unless defined $html;
    $html = decode_entities($html);
    $html =~ s/document\.write\(.+?\)\;//g;
    $html =~ s/\&\#.+?\;//g;

    return $html;
}

# Finds a replacement title: the text of the first h1, else h2, h3, h4, else
# the first p, clipped to TITLE_MAX_LENGTH characters.
# Returns ( $text, $kind ) with $kind 'headline' or 'paragraph', or an empty
# list when none of those elements carries text.
sub _suggest_title {
    my ($tree) = @_;

    for my $tag (qw(h1 h2 h3 h4 p)) {
        my $element = $tree->look_down( _tag => $tag ) or next;

        my $text = $element->as_text;
        next unless defined $text && $text =~ /\S/;

        my $kind = $tag eq 'p' ? 'paragraph' : 'headline';
        return ( substr( $text, 0, TITLE_MAX_LENGTH ), $kind );
    }

    return;
}

# Stores $text as the document title.  Reuses $title_element when the page
# already has an (empty) <title>; otherwise creates one inside <head>.
sub _set_title {
    my ( $tree, $title_element, $text ) = @_;

    if ($title_element) {
        $title_element->delete_content;
    }
    else {
        $title_element = HTML::Element->new('title');
        _head_of($tree)->push_content($title_element);
    }
    $title_element->push_content($text);

    return $title_element;
}

# Appends <meta name="$name" content="$content"> to the document head and
# returns the new element.  The element is left in the tree (deleting it
# right after insertion, as before, removed it again).
sub _add_meta {
    my ( $tree, $name, $content ) = @_;

    my $meta = HTML::Element->new( 'meta', name => $name, content => $content );
    _head_of($tree)->push_content($meta);

    return $meta;
}

# Returns the <head> element of the tree, or the tree root if none is found.
sub _head_of {
    my ($tree) = @_;
    return $tree->find_by_tag_name('head') || $tree;
}

# Prints, separated by '---' lines: the images with no alt attribute, the
# images whose alt is missing or empty, and - after giving the latter an alt
# from MAlt() - every image on the page.
sub _report_and_fill_alts {
    my ($tree) = @_;

    my @images = $tree->look_down( _tag => 'img' );

    my @without_alt = grep { !defined $_->attr('alt') } @images;
    my @blank_alt   = grep {
        my $alt = $_->attr('alt');
        !( defined $alt && length $alt )
    } @images;

    print $_->as_HTML, "\n" for @without_alt;
    print "---\n";
    print $_->as_HTML, "\n" for @blank_alt;
    print "---\n";

    $_->attr( alt => MAlt($_) ) for @blank_alt;

    print $_->as_HTML, "\n" for @images;

    return;
}