#! /usr/bin/perl
# Pull the title, the words and the <object> markup out of every "block"
# paragraph in the HTML held in the __DATA__ section, then dump the result.
#
# A paragraph counts as a block when the text that precedes its <br> looks
# like " --- TITLE --- WORDS". All other paragraphs are ignored.
use strict;
use warnings;
use HTML::TreeBuilder;
use Data::Dumper;

$Data::Dumper::Indent = 2;

# Slurp the whole sample document in one read.
my $html = do { local $/; <DATA> };

my $t = HTML::TreeBuilder->new_from_content($html)
    or die qq{cant parse html: $!\n};

# get a list of p tags
my @paras = $t->look_down(_tag => q{p});

my @blocks;
PARA:
for my $para (@paras) {
    my $txt;

    # Walk the direct children of the p tag: skip element nodes, remember
    # the most recent text node, and stop as soon as a br tag is reached.
    for my $item ($para->content_refs_list) {
        if (ref $$item) {
            # we have a tag
            last if $$item->tag eq q{br};
            next;
        }

        # we have text
        $txt = $$item;
    }

    # A paragraph with no text ahead of its br (or no text at all) cannot
    # be a block; skipping here also avoids matching against undef.
    next PARA unless defined $txt;

    # is it the p tag we need?
    # (the list assignment yields 0 in scalar context when the match fails)
    my ($title, $words) = $txt =~ /\s---\s(.*?)\s---\s(.*)/
        or next PARA;

    # look down the p tag for the object
    my $object = $para->look_down(_tag => q{object})
        or die qq{look down didnt find object};

    # stuff what we've found into a table
    push @blocks, {
        title  => $title,
        words  => $words,
        object => $object->as_HTML(undef, q{ }, {}),
    };
}

print Dumper \@blocks;

# Everything needed has been copied into plain strings, so the parse tree
# can be released explicitly.
$t->delete;

__DATA__
<html><head><title>six blocks</title></head><body>
<p>text we don‘t want</p>
<p>text we don‘t want</p>
<p id="block1"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE1 --- WORDS1 WORDS1 WORDS1 WORDS1 WORDS1 WORDS1<BR>
<object id="object1"><param>If you can read this you are too close.</object>
</p>
<p id="block2"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE2 --- WORDS2 WORDS2 WORDS2 WORDS2 WORDS2 WORDS2<BR>
<object id="object2"><param>If you can read this you are too close.</object>
</p>
<p id="block3"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE3 --- WORDS3 WORDS3 WORDS3 WORDS3 WORDS3 WORDS3<BR>
<object id="object3"><param>If you can read this you are too close.</object>
</p>
<p>text we don‘t want</p>
<p id="block4"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE4 --- WORDS4 WORDS4 WORDS4 WORDS4 WORDS4 WORDS4<BR>
<object id="object4"><param>If you can read this you are too close.</object>
</p>
<p id="block5"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE5 --- WORDS5 WORDS5 WORDS5 WORDS5 WORDS5 WORDS5<BR>
<object id="object5"><param>If you can read this you are too close.</object>
</p>
<p id="block6"><img src="pic.jpg"><a href="link.html">link</a> --- TITLE6 --- WORDS6 WORDS6 WORDS6 WORDS6 WORDS6 WORDS6<BR>
<object id="object6"><param>If you can read this you are too close.</object>
</p>
<p>text we don‘t want</p>
<p>text we don‘t want</p>
</body></html>
btw, <object.*> is greedy, but that is only the start of your problems. :-)

Output:

$VAR1 = [
  {
    'object' => '<object id="object1"> <param />If you can read this you are too close.</object> ',
    'title' => 'TITLE1',
    'words' => 'WORDS1 WORDS1 WORDS1 WORDS1 WORDS1 WORDS1'
  },
  {
    'object' => '<object id="object2"> <param />If you can read this you are too close.</object> ',
    'title' => 'TITLE2',
    'words' => 'WORDS2 WORDS2 WORDS2 WORDS2 WORDS2 WORDS2'
  },
  {
    'object' => '<object id="object3"> <param />If you can read this you are too close.</object> ',
    'title' => 'TITLE3',
    'words' => 'WORDS3 WORDS3 WORDS3 WORDS3 WORDS3 WORDS3'
  },
  {
    'object' => '<object id="object4"> <param />If you can read this you are too close.</object> ',
    'title' => 'TITLE4',
    'words' => 'WORDS4 WORDS4 WORDS4 WORDS4 WORDS4 WORDS4'
  },
  {
    'object' => '<object id="object5"> <param />If you can read this you are too close.</object> ',
    'title' => 'TITLE5',
    'words' => 'WORDS5 WORDS5 WORDS5 WORDS5 WORDS5 WORDS5'
  },
  {
    'object' => '<object id="object6"> <param />If you can read this you are too close.</object> ',
    'title' => 'TITLE6',
    'words' => 'WORDS6 WORDS6 WORDS6 WORDS6 WORDS6 WORDS6'
  }
];
update: tweaked the html
update2: removed module that isn't used
In reply to Re: Trying to parse html file
by wfsp
in thread Trying to parse html file
by annunaki10
| For: | Use: |
|------|------|
| & | &amp;amp; |
| < | &amp;lt; |
| > | &amp;gt; |
| [ | &amp;#91; |
| ] | &amp;#93; |