#!/usr/bin/perl use warnings; use strict; use Data::Dumper; $Data::Dumper::Indent = 1; use HTML::TreeBuilder; my $t = HTML::TreeBuilder->new_from_file(q{html/monk.html}) or die qq{cant build tree}; my $pre = $t->look_down( _tag => q{pre}, class => q{preElement} ); my @divs = $pre->look_down( _tag => q{div} ); my %table; my ($type, $record); for my $div (@divs){ if ($div->attr(q{class}) eq q{secTitle}){ $type = $div->as_text; next; } $record++; my @spans = $div->look_down( _tag => q{span}, class => qr/^x|f/ ); for my $span (@spans){ my $class = $span->attr(q{class}); my $txt = $span->as_text; $table{$type}{$record}{$class} = $txt; } my $trailing_txt = trailing_text($div); $table{$type}{$record}{family_data} = $trailing_txt; } print Dumper \%table; sub trailing_text { my ($div) = @_; my @rights = $div->right; my @txt; for my $right (@rights){ if (ref $right){ last if $right->tag eq q{div}; next if $right->tag eq q{br}; my $t = $right->as_text; next unless $t =~ /\S/; push @txt, trim($t); } else{ next unless $right =~ /\S/; push @txt, trim($right); } } return join(q{ }, @txt); } sub trim{ for (@_){ s/^\s+//; s/\s+$//; s/\s+/ /g; } return wantarray?@_:$_[0]; }