#!/usr/bin/perl
use strict;
use warnings;
use lib 'lib';
use MyParser;
# Demo: parse the HTML document embedded after __DATA__ and print its
# title, followed by the trimmed text of the first paragraph.

# Slurp the whole embedded document in a single read.
my $html = do { local $/; <DATA> };

# Parse the embedded fixture so the demo is self-contained. Previously
# $html was read but never used and an external test.html was required.
my $p = MyParser->new(string => $html)
    or die "can't parse embedded HTML\n";

# get_title returns nothing when no <title> is found, so fall back to an
# empty string instead of interpolating undef.
my $title = $p->get_title;
$title = '' unless defined $title;
print "$title\n";

# Move to the first <p>, then collect its text up to the closing p tag.
$p->get_tag('p');
my $txt = $p->get_txt('p');
$txt = '' unless defined $txt;
print "*$txt*\n";
__DATA__
<html>
<head>
<title>egrave: è : eacute : é : rsquo: ’ : lsquo: ‘</title>
</head>
<body>
<p>
one
<span class="second">two</span>
three
<br>
four
five six
</p>
</body>
</html>
MyParser.pm
package MyParser;

# Thin wrapper around HTML::TokeParser::Simple that adds two convenience
# methods: get_title and get_txt.
use strict;
use warnings;
use HTML::TokeParser::Simple;
use base qw(HTML::TokeParser::Simple);

# get_title
#   Looks for a 'title' tag with get_tag and, if one is found, returns
#   the trimmed text up to the closing title tag (see get_txt).
#   Returns nothing (undef in scalar context) when get_tag finds no title.
sub get_title {
    my ($self) = @_;

    $self->get_tag('title') or return;
    return $self->get_txt('title');
}

# get_txt($tag)
#   Collects the text tokens from the current parser position up to the
#   next closing $tag (or the end of input), ignoring any other tokens
#   met on the way. Every run of whitespace, newlines included, is
#   collapsed to a single space and leading/trailing whitespace is
#   removed.
#   Returns the cleaned-up text, or '' when no text was found.
sub get_txt {
    my ($self, $tag) = @_;

    # Start from '' so that a tag without text yields an empty string
    # rather than undef plus "uninitialized value" warnings below.
    my $txt = '';

    while (my $t = $self->get_token) {
        last if $t->is_end_tag($tag);

        # Only text tokens contribute; everything else is skipped.
        $txt .= $t->as_is if $t->is_text;
    }

    # Collapse whitespace first (this also turns newlines into spaces),
    # then drop the single space that may remain at either end.
    $txt =~ s/\s+/ /g;
    $txt =~ s/\A //;
    $txt =~ s/ \z//;

    return $txt;
}

1;
If, like me, you often need to get the title, you can add a method to the wrapper to do that, as I've done here. I've also tried to emulate HTML::TokeParser's get_trimmed_text with the get_txt method.
It skips any tags found before the required end tag (although this won't apply to titles).
I'd be interested in hearing comments from monks if any glaring faux pas have been committed. :-)