use strict;
use warnings;
use HTML::TableExtract;
use Data::Dumper;
# Sample input that is handed to HTML::TableExtract further down. The
# single-quoted heredoc tag means the text is taken literally, with no
# variable interpolation.
# NOTE(review): the text below contains no <table> markup, so the parser will
# presumably find no tables and the row loop will print nothing -- it looks as
# if the original HTML tags were stripped from this sample; confirm against
# the page it was taken from.
my $html = <<'EOHTML';
02:44 AM
EDT
0:42
(est.)
|
Speech
U.S.-Japan
Relations
Asia Society, Washington Center Ryozo Kato
, Japan |
EOHTML
# Parse the sample markup and print one hash of labelled fields for every
# table row that HTML::TableExtract finds.
my $te = HTML::TableExtract->new();
$te->parse( $html );

# Field names, in the order the pieces are expected to appear within a row.
my @datatypes = qw( time length type title org );

foreach my $ts ( $te->table_states )
{
    foreach my $row ( $ts->rows )
    {
        # A single cell may carry several fields separated by a blank line, so
        # every cell is split and the pieces are flattened into one list.
        # HTML::TableExtract can hand back undef for a cell with no content;
        # those are skipped so split() does not warn about an uninitialized
        # value (an undef cell contributed no pieces before, either).
        my @extracted = map { split( /\n\n/, $_ ) } grep { defined } @$row;

        # Pair each label with the piece at the same position. A row with
        # fewer pieces than labels leaves the remaining labels mapped to
        # undef instead of passing undef into clean_whitespace().
        my %data = map {
            $datatypes[$_] => defined( $extracted[$_] )
                ? clean_whitespace( $extracted[$_] )
                : undef
        } ( 0 .. $#datatypes );

        print Dumper( \%data );
    }
}
# Normalise the whitespace in a string: strip it from both ends and squeeze
# every internal run of whitespace (newlines included) down to one space.
#
# Takes the text to tidy and returns the tidied copy; the caller's variable is
# left untouched. An undefined argument is handed straight back as undef, so
# no "uninitialized value" warnings are raised for it.
sub clean_whitespace
{
    my ( $text ) = @_;

    # Return the scalar itself rather than using a bare "return": this sub is
    # called in list context while building key/value pairs, where an empty
    # list would shift every later pair out of alignment.
    return $text unless defined $text;

    $text =~ s/^\s+//;     # leading whitespace
    $text =~ s/\s+$//;     # trailing whitespace
    $text =~ s/\s+/ /g;    # internal runs -> single space
    return $text;
}
__END__
__OUTPUT__
$VAR1 = {
'org' => 'Asia Society, Washington Center Ryozo Kato , Japan',
'title' => 'U.S.-Japan Relations',
'length' => '0:42 (est.)',
'type' => 'Speech',
'time' => '02:44 AM EDT'
};