#! /usr/bin/perl
# Extract "link --- TITLE --- WORDS" paragraphs, together with the <object>
# element each one contains, from the HTML stored in the __DATA__ section,
# and dump the collected records with Data::Dumper.
use strict;
use warnings;

use HTML::TreeBuilder;
use Data::Dumper;
$Data::Dumper::Indent = 2;

# Slurp the whole __DATA__ section in one read: localising $/ leaves it
# undefined for the duration of the do-block, so <DATA> returns everything.
my $html = do { local $/; <DATA> };

my $t = HTML::TreeBuilder->new_from_content($html)
    or die qq{cant parse html: $!\n};

# get a list of p tags
my @paras = $t->look_down(_tag => q{p});

my @blocks;

PARA:
for my $para (@paras) {
    my $txt;

    # Walk the direct children of the p tag. Element children are skipped,
    # the walk stops at the first br tag, and every text child seen before
    # that overwrites $txt -- so $txt ends up holding the LAST text node
    # that precedes the first br.
    ITEM:
    for my $item ($para->content_refs_list) {
        if (ref $$item) {
            # we have a tag
            last ITEM if $$item->tag eq q{br};
            next ITEM;
        }

        # we have text
        $txt = $$item;
    }

    # A paragraph with no text before its br cannot be one we want; skipping
    # it here also avoids matching against an undefined value.
    next PARA unless defined $txt;

    # is it the p tag we need?
    my ($title, $words) = $txt =~ /\s---\s(.*?)\s---\s(.*)/;
    next PARA unless defined $title;

    # look down the p tag for the object
    my $object = $para->look_down(_tag => q{object})
        or die qq{look down didnt find object};

    # stuff what we've found into a table
    push @blocks, {
        title  => $title,
        words  => $words,
        # as_HTML arguments: default entity encoding, a single space as the
        # indent string, and an empty "optional end tags" set.
        object => $object->as_HTML(undef, q{ }, {}),
    };
}

print Dumper \@blocks;

# NOTE(review): the fixture below is expected to be HTML (p tags holding the
# text, a br and an object); it looks like the markup was lost -- confirm.
__DATA__
six blocks

text we don‘t want

text we don‘t want

link --- TITLE1 --- WORDS1 WORDS1 WORDS1 WORDS1 WORDS1 WORDS1
If you can read this you are too close.

link --- TITLE2 --- WORDS2 WORDS2 WORDS2 WORDS2 WORDS2 WORDS2
If you can read this you are too close.

link --- TITLE3 --- WORDS3 WORDS3 WORDS3 WORDS3 WORDS3 WORDS3
If you can read this you are too close.

text we don‘t want

link --- TITLE4 --- WORDS4 WORDS4 WORDS4 WORDS4 WORDS4 WORDS4
If you can read this you are too close.

link --- TITLE5 --- WORDS5 WORDS5 WORDS5 WORDS5 WORDS5 WORDS5
If you can read this you are too close.

link --- TITLE6 --- WORDS6 WORDS6 WORDS6 WORDS6 WORDS6 WORDS6
If you can read this you are too close.

text we don‘t want

text we don‘t want

#### $VAR1 = [ { 'object' => ' If you can read this you are too close. ', 'title' => 'TITLE1', 'words' => 'WORDS1 WORDS1 WORDS1 WORDS1 WORDS1 WORDS1' }, { 'object' => ' If you can read this you are too close. ', 'title' => 'TITLE2', 'words' => 'WORDS2 WORDS2 WORDS2 WORDS2 WORDS2 WORDS2' }, { 'object' => ' If you can read this you are too close. ', 'title' => 'TITLE3', 'words' => 'WORDS3 WORDS3 WORDS3 WORDS3 WORDS3 WORDS3' }, { 'object' => ' If you can read this you are too close. ', 'title' => 'TITLE4', 'words' => 'WORDS4 WORDS4 WORDS4 WORDS4 WORDS4 WORDS4' }, { 'object' => ' If you can read this you are too close. ', 'title' => 'TITLE5', 'words' => 'WORDS5 WORDS5 WORDS5 WORDS5 WORDS5 WORDS5' }, { 'object' => ' If you can read this you are too close. ', 'title' => 'TITLE6', 'words' => 'WORDS6 WORDS6 WORDS6 WORDS6 WORDS6 WORDS6' } ];