use strict;
use warnings;

use HTML::TableExtract;
use Data::Dumper;

# Demonstration script: parse a snippet with HTML::TableExtract, split each
# cell of every extracted row on blank lines, label the first five resulting
# fields with the names in @datatypes, and dump one hash per row.

# NOTE(review): as it stands, the text below contains no <table> markup, so
# table_states would presumably yield nothing to iterate over. The sample
# output kept after __END__ suggests the original HTML tags were lost;
# restore them before relying on this script.
my $html = <<'EOHTML';
02:44 AM EDT
0:42 (est.)
Speech
U.S.-Japan Relations
Asia Society, Washington Center
Ryozo Kato , Japan
EOHTML

my $te = HTML::TableExtract->new();
$te->parse($html);

# Labels applied, in order, to the first five fields extracted from a row.
my @datatypes = qw( time length type title org );

foreach my $ts ( $te->table_states ) {
    foreach my $row ( $ts->rows ) {

        # A single cell may carry several values separated by a blank line;
        # flatten them into one list. Undefined cells are skipped so that
        # split() is never handed undef (which would only emit a warning and
        # contribute no fields anyway).
        my @extracted = map { split( /\n\n/, $_ ) } grep { defined } @$row;

        # Pair each label with the field at the same position. Fields beyond
        # the last label are ignored; missing fields come through as undef.
        my %data = map { $datatypes[$_] => clean_whitespace( $extracted[$_] ) }
            ( 0 .. $#datatypes );

        print Dumper( \%data );
    }
}

# clean_whitespace($text)
#
# Returns a copy of $text with leading and trailing whitespace removed and
# every internal run of whitespace collapsed to a single space. An undefined
# argument is returned unchanged (as undef) without triggering warnings.
sub clean_whitespace {
    my ($text) = @_;

    # Return the undef explicitly rather than using a bare "return;": the
    # caller invokes this in list context inside a key => value map, where an
    # empty list would break the key/value pairing.
    return $text unless defined $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;

    return $text;
}

__END__

__OUTPUT__
$VAR1 = {
          'org' => 'Asia Society, Washington Center Ryozo Kato , Japan',
          'title' => 'U.S.-Japan Relations',
          'length' => '0:42 (est.)',
          'type' => 'Speech',
          'time' => '02:44 AM EDT'
        };