my ( $tree, $title, $titleastext, $newtitle, $newtitleh1, $newtitleastexth1, $newtitleastexth1clipped, $newtitleh2, $newtitleastexth2, $newtitleastexth2clipped, $newtitleh3, $newtitleastexth3, $newtitleastexth3clipped, $newtitleh4, $newtitleastexth4, $newtitleastexth4clipped, $newtitlep, $newtitleastextp, $newtitleastextpclipped, $summary, $var, $newmetadescription, $newmetakeywords ); $tree = HTML::Tree->new(); $tree->parse($html); $title = $tree->look_down( '_tag', 'title' ); $titleastext = $title->as_text; use HTML::Element; if ($titleastext) { print "\nTitle: $titleastext\n\n"; } else { $newtitle = HTML::Element->new('title'); $newtitle = $newtitleh1; $newtitleh1 = $tree->look_down( '_tag', 'h1' ); if ($newtitleh1) { $newtitleastexth1 = $newtitleh1->as_text; } } if ($newtitleastexth1) { $newtitleastexth1clipped = substr( $newtitleastexth1, 0, 65 ); $html->push_content($newtitleastexth1clipped); print "\n$url does not have a title. We created one from\n the first 66 characters your first headline tag \:\n $newtitleastexth1clipped.\n Please change if desired.\n\n"; } #### $newtitleh1 ... $newtitleastexth1 ... $newtitleastexth1clipped ... $html->push_content ... ... $newtitleh2 ... $newtitleastexth2 ... $newtitleastexth2clipped ... $html->push_content ... #### #!/usr/bin/perl -- # Ebola.pl use strict; use warnings; use HTML::Tree; Main( @ARGV ); exit( 0 ); sub Main { my $t = HTML::TreeBuilder->new_from_content(join'','

',0..9,'

'); my $f = HTML::TreeBuilder->new_from_content('f'); my $B = $f->look_down( qw' _tag body ' ); print $t->as_HTML, "\n\n"; print '-'x33, "\n\n"; if ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h1')->as_text } ) ) { print "Using h1\n"; } elsif ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h2')->as_text } ) ) { print "Using h2\n"; } elsif ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h3')->as_text } ) ) { print "Using h3\n"; } elsif ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h4')->as_text } ) ) { print "Using h4\n\n"; } print '-' x 33, "\n\n"; print $f->as_HTML, "\n\n"; } ## end sub Main sub Ebola { my ( $html, $clip, $text ) = @_; if ( defined $text and length $text ) { $text = substr $text, 0, $clip; $html->push_content($text); } } ## end sub Ebola __END__ ####

0123456789

--------------------------------- Using h4 --------------------------------- f01234 ##
## sub Main { AsdfQwerty( '

0123456789

', '01234' ); AsdfQwerty( '

0123456789

', '01234' ); AsdfQwerty( '

0123456789

', '01234' ); AsdfQwerty( '

0123456789

', '01234' ); AsdfQwerty( '
0123456789
', '01234' ); AsdfQwerty( '

0123456789

', '01234' ); } ##
## local $\ = $/; print $_->as_HTML for $tree->look_down( '_tag', 'img ', sub { not defined $_[0]->attr('alt') } ); print '---'; print $_->as_HTML for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } ); print '---'; $_->attr( alt => MAlt($_) ) for $tree->look_down( qw' _tag img ', sub { not length $_[0]->attr('alt') } ); print $_->as_HTML for $tree->look_down(qw' _tag img '); print $tree->as_HTML; $tree = $tree->delete; #### #!/usr/bin/perl # HTML::TreeBuilder invokes HTML::Entities::decode on the contents of # HREF attributes. Some CGI-based sites use lang=en or such for # internationalization. When this parameter is after an ampersand, # the resulting &lang is decoded, breaking the link. "sub" is another # popular one. # Test provided by Rocco Caputo use warnings; use strict; use Test::More tests => 1; use HTML::TreeBuilder; my $tb = HTML::TreeBuilder->new(); $tb->parse( "Test" ); my @links = $tb->look_down( sub { $_[0]->tag eq "a" } ); my $href = $links[0]->attr("href"); ok($href =~ /lang/, "href should contain 'lang' (is: $href)"); exit;