my ( $tree, $title, $titleastext, $newtitle, $newtitleh1, $newtitleastexth1, $newtitleastexth1clipped, $newtitleh2, $newtitleastexth2, $newtitleastexth2clipped, $newtitleh3, $newtitleastexth3, $newtitleastexth3clipped, $newtitleh4, $newtitleastexth4, $newtitleastexth4clipped, $newtitlep, $newtitleastextp, $newtitleastextpclipped, $summary, $var, $newmetadescription, $newmetakeywords ); $tree = HTML::Tree->new(); $tree->parse($html); $title = $tree->look_down( '_tag', 'title' ); $titleastext = $title->as_text; use HTML::Element; if ($titleastext) { print "\nTitle: $titleastext\n\n"; } else { $newtitle = HTML::Element->new('title'); $newtitle = $newtitleh1; $newtitleh1 = $tree->look_down( '_tag', 'h1' ); if ($newtitleh1) { $newtitleastexth1 = $newtitleh1->as_text; } } if ($newtitleastexth1) { $newtitleastexth1clipped = substr( $newtitleastexth1, 0, 65 ); $html->push_content($newtitleastexth1clipped); print "\n$url does not have a title. We created one from\n the first 66 characters your first headline tag \
##
$newtitleh1 ...
$newtitleastexth1 ...
$newtitleastexth1clipped ...
$html->push_content ...
...
$newtitleh2 ...
$newtitleastexth2 ...
$newtitleastexth2clipped ...
$html->push_content ...
##
##
#!/usr/bin/perl --
# Ebola.pl
use strict;
use warnings;
use HTML::Tree;
Main( @ARGV );
exit( 0 );
sub Main {
my $t = HTML::TreeBuilder->new_from_content(join'','',0..9,'');
my $f = HTML::TreeBuilder->new_from_content('f ');
my $B = $f->look_down( qw' _tag body ' );
print $t->as_HTML, "\n\n";
print '-'x33, "\n\n";
if ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h1')->as_text } ) ) {
print "Using h1\n";
} elsif ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h2')->as_text } ) ) {
print "Using h2\n";
} elsif ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h3')->as_text } ) ) {
print "Using h3\n";
} elsif ( Ebola( $B, 5, eval{ $t->look_down(qw'_tag h4')->as_text } ) ) {
print "Using h4\n\n";
}
print '-' x 33, "\n\n";
print $f->as_HTML, "\n\n";
} ## end sub Main
sub Ebola {
my ( $html, $clip, $text ) = @_;
if ( defined $text and length $text ) {
$text = substr $text, 0, $clip;
$html->push_content($text);
}
} ## end sub Ebola
__END__
##
##
0123456789
---------------------------------
Using h4
---------------------------------
f 01234
##
##
sub Main {
AsdfQwerty( '0123456789
', '01234' );
AsdfQwerty( '0123456789', '01234' );
AsdfQwerty( '0123456789
', '01234' );
AsdfQwerty( '0123456789
', '01234' );
AsdfQwerty( '0123456789
', '01234' );
AsdfQwerty( '
0123456789
', '01234' );
}
##
##
local $\ = $/;
print $_->as_HTML
for $tree->look_down( '_tag', 'img ',
sub { not defined $_[0]->attr('alt') } );
print '---';
print $_->as_HTML
for $tree->look_down( qw' _tag img ',
sub { not length $_[0]->attr('alt') } );
print '---';
$_->attr( alt => MAlt($_) )
for $tree->look_down( qw' _tag img ',
sub { not length $_[0]->attr('alt') } );
print $_->as_HTML for $tree->look_down(qw' _tag img ');
print $tree->as_HTML;
$tree = $tree->delete;
##
##
#!/usr/bin/perl
# HTML::TreeBuilder invokes HTML::Entities::decode on the contents of
# HREF attributes. Some CGI-based sites use lang=en or such for
# internationalization. When this parameter is after an ampersand,
# the resulting &lang is decoded, breaking the link. "sub" is another
# popular one.
# Test provided by Rocco Caputo
use warnings;
use strict;
use Test::More tests => 1;
use HTML::TreeBuilder;
my $tb = HTML::TreeBuilder->new();
$tb->parse(
"Test"
);
my @links = $tb->look_down( sub { $_[0]->tag eq "a" } );
my $href = $links[0]->attr("href");
ok($href =~ /lang/, "href should contain 'lang' (is: $href)");
exit;