#!/usr/bin/perl
use warnings;
use strict;

use Data::Dumper;
$Data::Dumper::Indent = 1;
use HTML::TreeBuilder;

# Parse html/monk.html and dump the records found inside its
# <pre class="preElement"> element as a nested hash:
#
#   $table{SECTION}{RECORD_NUMBER}{FIELD} = text
#
# SECTION is the text of the most recent <div class="secTitle">.
# RECORD_NUMBER is a counter that runs across all sections.  FIELD is
# either the class attribute of a <span> inside the record's <div>, or
# the literal key 'family_data' for the text that follows the <div>.

my $t = HTML::TreeBuilder->new_from_file(q{html/monk.html})
    or die qq{cant build tree};

my $pre = $t->look_down( _tag => q{pre}, class => q{preElement} )
    or die q{no <pre class="preElement"> element found};

my @divs = $pre->look_down( _tag => q{div} );

my %table;

# Records that appear before any section title are filed under the
# empty-string key instead of using an undefined value as a hash key.
my $type   = q{};
my $record = 0;

for my $div (@divs) {

    # A div without a class attribute is an ordinary record div.
    my $div_class = $div->attr(q{class});
    $div_class = q{} unless defined $div_class;

    if ( $div_class eq q{secTitle} ) {
        $type = $div->as_text;
        next;
    }

    $record++;

    # Only spans whose class *starts* with 'x' or 'f' are fields.
    # The previous pattern, /^x|f/, parsed as (^x)|(f) and therefore
    # matched any class that merely contained an 'f'.
    my @spans = $div->look_down(
        _tag  => q{span},
        class => qr/^[xf]/,
    );

    for my $span (@spans) {
        my $class = $span->attr(q{class});
        $table{$type}{$record}{$class} = $span->as_text;
    }

    $table{$type}{$record}{family_data} = trailing_text($div);
}

print Dumper \%table;

# Release the tree explicitly; HTML::Tree releases without weak
# references need this to break parent/child reference cycles.
$t->delete;

# trailing_text($div)
#
# Collect the text that follows $div among its right-hand siblings, up
# to (not including) the next <div> sibling.  <br> elements and
# whitespace-only pieces are skipped.  Each remaining piece is passed
# through trim() and the pieces are joined with single spaces.
#
# Returns the joined string (empty if nothing qualifies).
sub trailing_text {
    my ($div) = @_;

    my @txt;
    for my $sibling ( $div->right ) {
        my $piece;
        if ( ref $sibling ) {

            # Element node: stop at the next record, ignore line breaks.
            last if $sibling->tag eq q{div};
            next if $sibling->tag eq q{br};
            $piece = $sibling->as_text;
        }
        else {

            # Plain text node.
            $piece = $sibling;
        }

        next unless $piece =~ /\S/;
        push @txt, scalar trim($piece);
    }

    return join( q{ }, @txt );
}

# trim(@strings)
#
# Strip leading and trailing whitespace from each argument and collapse
# every internal run of whitespace to a single space.
#
# Works on copies, so the caller's variables are left untouched and
# literal (read-only) arguments are accepted.  Undefined arguments are
# passed through unchanged.
#
# Returns the cleaned list in list context, or the first cleaned string
# in scalar context.
sub trim {
    my @strings = @_;

    for my $string (@strings) {
        next unless defined $string;
        $string =~ s/^\s+//;
        $string =~ s/\s+$//;
        $string =~ s/\s+/ /g;
    }

    return wantarray ? @strings : $strings[0];
}

# Sample output, as recorded alongside the original script:
#
# $VAR1 = {
#   'WIFE' => {
#     '2' => {
#       'fn n' => 'Marjoram Washburn',
#       'family_data' => 'Born: Died: Bef 21 Nov 1717 Father: Philip Washburn Mother: Elizabeth Irish'
#     }
#   },
#   'CHILDREN' => {
#     '6' => {
#       'fn n' => 'Mary Leonard',
#       'family_data' => 'Born: Bef 1710 Died: 1793 at Marlborough, MA Husband: Daniel Herrington'
#     },
#     '4' => {
#       'fn n' => 'Elizabeth Leonard',
#       'family_data' => 'Born: Abt 1702 Died: 14 Oct 1783 at Bridgewater, Plymouth, MA Husband: James Washburn'
#     },
#     '3' => {
#       'fn n' => 'John Leonard',
#       'family_data' => 'Born: Bet 1695 and 1723 Died: Bet 1748 and 1808 Wife: Anna Noble'
#     },
#     '7' => {
#       'fn n' => 'Margene Leonard',
#       'family_data' => 'Born: Abt 1710 Died: Husband: Nathaniel Pratt'
#     },
#     '5' => {
#       'fn n' => 'Josiah Leonard',
#       'family_data' => 'Born: Abt 1704 Died:'
#     }
#   },
#   'HUSBAND' => {
#     '1' => {
#       'x-marriage-date' => '1699-11-2',
#       'fn n' => 'Josiah Leonard',
#       'family_data' => 'Other Spouses: Abigail Washburn Father: John Leonard Mother: Sarah Leonard',
#       'x-gender' => 'Male',
#       'x-death-date' => '1745-1-1',
#       'x-death-location' => 'Bridgewater, Plymouth, MA',
#       'x-marriage-location' => 'Bridgewater, Plymouth, MA'
#     }
#   }
# };