http://qs1969.pair.com?node_id=897129


in reply to Re^2: Trying to Insert Alt Tags Programmatically
in thread Trying to Insert Alt Tags Programmatically

Hi. Thanks to the anonymous monk and toolic. I'm a little further on. Below is my full code now. What I want the Perl to do is spider a local folder recursively, maybe 2 level deep and then write titles, meta descriptions, meta keywords and alt attributes for images if they are missing.

My biggest issue might be that the Perl in the script only accepts http paths as an argument, not any configured local path as far as I can tell. So the following don't work:

This poses an issue because I don't want to even try to update the files online via FTP, I just want to do it in a local folder. So how do I change the spider to accept a local file name? Thanks in advance.

The nice initial spider and description + keyword suggester comes courtesy of Troy (davistv): http://www.perlmonks.org/index.pl?node_id=267758

#!/usr/bin/perl package Metabot; use warnings; use strict; use WWW::SimpleRobot; use HTML::Entities; require HTML::Parser; use Lingua::EN::Summarize; use HTML::Summary; use HTML::TreeBuilder; use Lingua::EN::Keywords; use HTML::Tree; use LWP::Simple; @Metabot::ISA = qw(HTML::Parser); my $url = $ARGV[0]; my $parser = Metabot->new; my $robot = WWW::SimpleRobot->new( URLS => [ $url ], FOLLOW_REGEX => "^$url", DEPTH => 2, TRAVERSAL => 'depth', VISIT_CALLBACK => sub { my ( $url, $depth, $html, $links) = @_; print "\nURL: $url - depth $depth\n"; $html = decode_entities($html); $html =~ s/document\.write\(.+?\)\;//g; $html =~ s/\&amp;\#.+?\;//g; my ($tree, $title, $titleastext, $newtitle, $newtitleh1, $ +newtitleastexth1, $newtitleastexth1clipped, $newtitleh2, $newtitleast +exth2, $newtitleastexth2clipped, $newtitleh3, $newtitleastexth3, $new +titleastexth3clipped, $newtitleh4, $newtitleastexth4, $newtitleastext +h4clipped, $newtitlep, $newtitleastextp, $newtitleastextpclipped, $su +mmary, $var, $newmetadescription, $newmetakeywords); $tree = HTML::Tree->new(); $tree->parse($html); $title = $tree->look_down( '_tag' , 'title' ); $titleastext = $title->as_text; use HTML::Element; if ($titleastext){ print "\nTitle: $titleastext\n\n"; } else { $newtitle = HTML::Element->new('title'); $newtitle = $newtitleh1; $newtitleh1 = $tree->look_down( '_tag' , 'h1' ); if ($newtitleh1){ $newtitleastexth1 = $newtitleh1->as_text; } } if ($newtitleastexth1){ $newtitleastexth1clipped = substr($newtitleastexth1, 0, 65 +); $html->push_content($newtitleastexth1clipped); print "\n$url does not have a title. We created one from\n + the first 66 characters your first headline tag \<h1\>:\n $newtitleastexth1clipped.\n Please change if desired.\n\n"} else { $newtitleh2 = $tree->look_down( '_tag' , 'h2' ); if ($newtitleh2){ $newtitleastexth2 = $newtitleh2->as_text; } } if ($newtitleastexth2){ $newtitleastexth2clipped = substr($newtitleastexth2, 0, 65 +); $html->push_content($newtitleastexth2clipped); print "\n$url does not have a title. We created one from\n + the first 66 characters your first headline tag \<h2\>:\n $newtitleastexth2clipped.\n Please change if desired.\n\n" } else { $newtitleh3 = $tree->look_down( '_tag' , 'h3' ); if ($newtitleh3){ $newtitleastexth3 = $newtitleh3->as_text; } } if ($newtitleastexth3){ $newtitleastexth3clipped = substr($newtitleastexth3, 0, 65 +); $html->push_content($newtitleastexth3clipped); print "\n$url does not have a title. We created one from\n + the first 66 characters your first headline tag \<h3\>:\n $newtitleastexth3clipped.\n Please change if desired.\n\n" } else { $newtitleh4 = $tree->look_down( '_tag' , 'h4' ); if ($newtitleh4){ $newtitleastexth4 = $newtitleh4->as_text; } } if ($newtitleastexth4){ $newtitleastexth4clipped = substr($newtitleastexth4, 0, 65 +); $html->push_content($newtitleastexth4clipped); print "\n$url does not have a title. We created one from\n + the first 66 characters your first headline tag \<h4\>:\n $newtitleastexth4clipped.\n Please change if desired.\n\n" } else { $newtitlep = $tree->look_down( '_tag' , 'p' ); if ($newtitlep){ $newtitleastexth3 = $newtitlep->as_text; } } if ($newtitleastextp){ $newtitleastextpclipped = substr($newtitleastextp, 0, 65); $html->push_content($newtitleastextpclipped); print "\n$url does not have a title. We created one from\n + the first 66 characters your first paragraph tag \<p\>:\n $newtitleastexth1clipped.\n Please change if desired.\n\n" } else { print "$url does not have a title and we are unable to sug +gest any."; } $tree = new HTML::TreeBuilder; $tree->parse($html); $summary = summarize( $html, filter => 'html', maxlength = +> 500 ); $summary =~ s/\s+/ /gs; $var = substr($summary, 0, 155); print "Using Lingua::EN::Summarize Summary: $var\n\n"; $newmetadescription = HTML::Element->new('meta', href => " +$html"); $newmetadescription->attr('name', 'description'); $newmetadescription->attr('content', "$var"); $html->push_content($newmetadescription); $newmetadescription = $newmetadescription->delete; my @keywords = keywords($title.$summary); print "Keywords: " . join(", ", @keywords) . "\n\n"; $newmetakeywords = HTML::Element->new('meta', href => "$ht +ml"); $newmetakeywords->attr('name', 'keywords'); $newmetakeywords->attr('content', "@keywords"); $html->push_content($newmetakeywords); $newmetakeywords = $newmetakeywords->delete; # $tree = HTML::TreeBuilder->new_from_content($html); # foreach my $img ($tree->look_down('_tag', 'img')) { # my $alatt = $img->attr('alt') # if (!$alatt) { # $imgsrcattribute = $img->attr('src'); # use File::Basename; # my @suffixlist = qw(gif jpg jpeg png bmp php ico GIF JPG JPE +G PNG BMP PHP ICO); # my $imgfilenopathnoext = fileparse($imgsrcattribute,@suffixl +ist); # my $newalttag = HTML::Element->new('img', alt=>""); # $newalttag->attr('alt', "$imgfilenopathnoext"); # $html->push_content($imgfilenopathnoext); # $imgfilenopathnoext = $imgfilenopathnoext->delete; # print "Alt tag not found for $imgsrcattribute \nInserted src + file name $imgfilenopathnoext for alt attribute"; # } # } Second(@ARGV); exit(0); sub Second { my ( $url, $depth, $html, $links) = @_; print "\nURL: $url - depth $depth\n"; $html = decode_entities($html); $html =~ s/document\.write\(.+?\)\;//g; $html =~ s/\&amp;\#.+?\;//g; my $tree = HTML::TreeBuilder->new(); $tree->parse($html); local $\ = $/; print $_->as_HTML for $tree->look_down( qw' _tag img ', sub { not defined $_[0]->attr('alt') } ); print '---'; print $_->as_HTML for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } ); print '---'; $_->attr( alt => MAlt($_) ) for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } ); print $_->as_HTML for $tree->look_down(qw' _tag img '); } ## end sub Main sub MAlt { my $imgscalar = $_[0]; my $imgsrc = $imgscalar->attr('src'); use File::Basename; my @suffixlist = qw(gif jpg jpeg png bmp php ico GIF JPG JPEG PNG BMP +PHP ICO); my $imgfilenopathnoext = fileparse($imgsrc,@suffixlist); '!' . $imgfilenopathnoext; } } , BROKEN_LINK_CALLBACK => sub { my ( $url, $linked_from, $depth ) = @_; print STDERR "$url looks like a broken link on $linked_fro +m\n"; print STDERR "Depth = $depth\n"; } ); $robot->traverse; my @urls = @{$robot->urls}; my @pages = @{$robot->pages}; for my $page ( @pages ) { my $url = $page->{url}; my $depth = $page->{depth}; my $modification_time = $page->{modification_time}; } sub text { my ($self,$text) = @_; $self->{TEXT} .= $text; }

The current error when crawling http://s2services.com is:

Can't call method "push_content" without a package or object reference + at seo_f xer.pl line 126.

The error for trying to crawl local files is similar to: C:%5CUsers%5Csocrtwo%5CDesktop%5Cwgts%5Cindex.html is not a web page

or

index.html is not a valid URL