#! /usr/bin/perl
use strict;
use warnings;
use HTML::TreeBuilder;
use Data::Dumper;
# Extract title / words / <object> markup from every <p> in the sample
# document whose leading text looks like " --- TITLE --- WORDS", and dump
# the collected records with Data::Dumper.

# Select Data::Dumper's indentation style 2 for the final dump.
$Data::Dumper::Indent = 2;

# Slurp everything after __DATA__ in a single read.
my $html = do { local $/; <DATA> };

# $! is left out of the message: parsing an in-memory string involves no
# system call whose errno would describe the failure.
my $t = HTML::TreeBuilder->new_from_content($html)
    or die qq{cant parse html\n};

# Get a list of p tags.
my @paras = $t->look_down(_tag => q{p});
my @blocks;

PARA:
for my $para (@paras) {
    my $txt;

    # Walk the direct children of the p tag: skip element children,
    # remember the most recent text child, and stop at the first br tag.
    # $txt therefore ends up as the last text node seen before the br
    # (or before the end of the paragraph when there is no br).
    ITEM:
    for my $item ($para->content_refs_list) {
        if (ref $$item) {
            # We have a tag.
            last ITEM if $$item->tag eq q{br};
            next ITEM;
        }

        # We have text.
        $txt = $$item;
    }

    # A paragraph made only of tags (or starting with a br) has no text;
    # skip it instead of matching against undef and triggering a warning.
    next PARA unless defined $txt;

    # Is it the p tag we need?  The list assignment evaluates to the number
    # of captures in scalar context, so a failed match falls through to next.
    my ($title, $words) = $txt =~ /\s---\s(.*?)\s---\s(.*)/
        or next PARA;

    # Look down the p tag for the object; a matching paragraph without one
    # is treated as a fatal inconsistency in the input.
    my $object = $para->look_down(_tag => q{object})
        or die qq{look down didnt find object};

    # Stuff what we've found into a table.
    push @blocks, {
        title  => $title,
        words  => $words,
        object => $object->as_HTML(undef, q{ }, {}),
    };
}

print Dumper \@blocks;

# Release the parse tree explicitly now that the records have been
# serialised to plain strings.
$t->delete;
__DATA__
<html><head><title>six blocks</title></head><body>
<p>text we don't want</p>
<p>text we don't want</p>
<p id="block1"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE1 --- WORDS1 WORDS1 WORDS1 WORDS1 WORDS1 WORDS1<BR>
<object id="object1"><param>If you can read this you are too close.</object>
</p>
<p id="block2"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE2 --- WORDS2 WORDS2 WORDS2 WORDS2 WORDS2 WORDS2<BR>
<object id="object2"><param>If you can read this you are too close.</object>
</p>
<p id="block3"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE3 --- WORDS3 WORDS3 WORDS3 WORDS3 WORDS3 WORDS3<BR>
<object id="object3"><param>If you can read this you are too close.</object>
</p>
<p>text we don't want</p>
<p id="block4"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE4 --- WORDS4 WORDS4 WORDS4 WORDS4 WORDS4 WORDS4<BR>
<object id="object4"><param>If you can read this you are too close.</object>
</p>
<p id="block5"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE5 --- WORDS5 WORDS5 WORDS5 WORDS5 WORDS5 WORDS5<BR>
<object id="object5"><param>If you can read this you are too close.</object>
</p>
<p id="block6"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE6 --- WORDS6 WORDS6 WORDS6 WORDS6 WORDS6 WORDS6<BR>
<object id="object6"><param>If you can read this you are too close.</object>
</p>
<p>text we don't want</p>
<p>text we don't want</p>
</body></html>
$VAR1 = [
{
'object' => '<object id="object1">
<param />If you can read this you are too close.</object>
',
'title' => 'TITLE1',
'words' => 'WORDS1 WORDS1 WORDS1 WORDS1 WORDS1 WORDS1'
},
{
'object' => '<object id="object2">
<param />If you can read this you are too close.</object>
',
'title' => 'TITLE2',
'words' => 'WORDS2 WORDS2 WORDS2 WORDS2 WORDS2 WORDS2'
},
{
'object' => '<object id="object3">
<param />If you can read this you are too close.</object>
',
'title' => 'TITLE3',
'words' => 'WORDS3 WORDS3 WORDS3 WORDS3 WORDS3 WORDS3'
},
{
'object' => '<object id="object4">
<param />If you can read this you are too close.</object>
',
'title' => 'TITLE4',
'words' => 'WORDS4 WORDS4 WORDS4 WORDS4 WORDS4 WORDS4'
},
{
'object' => '<object id="object5">
<param />If you can read this you are too close.</object>
',
'title' => 'TITLE5',
'words' => 'WORDS5 WORDS5 WORDS5 WORDS5 WORDS5 WORDS5'
},
{
'object' => '<object id="object6">
<param />If you can read this you are too close.</object>
',
'title' => 'TITLE6',
'words' => 'WORDS6 WORDS6 WORDS6 WORDS6 WORDS6 WORDS6'
}
];
BTW, the pattern <object.*> is greedy, but that is only the start of your problems. :-)
Update: tweaked the HTML.
Update 2: removed a module that isn't used.