Simple little module that allows you to access the text inside nested tables using a multidimensional array. The html can either be in a variable or from a file.
usage:
my $table = Table->parse_it(\$content); or
my $table = Table->parse_it($filename);
then:
print $table->[$table_index][$row][$col];   # table, row and column numbers all start at 1
package Table;

# Table - collect the text found inside (possibly nested) HTML tables.
#
# The text is stored in a three-level array indexed as
#     $table->[$table_number][$row_number][$column_number]
# Tables are numbered in the order their <table> tag is seen, rows by <tr>
# and columns by <td>; all three numbers start at 1.  Text that sits directly
# inside a table but before any <tr>/<td> lands in row 0 / column 0.
#
# Usage:
#     my $table = Table->parse_it(\$content);   # HTML held in a scalar
#     my $table = Table->parse_it($filename);   # HTML read from a file
#     print $table->[1][2][3];
#
# NOTE: the module keeps ONE shared result array.  Every object returned by
# new() or parse_it() refers to it, and tables from later parses are appended
# after the ones already collected.
#
# NOTE(review): only <td> advances the column, so the text of <th> cells is
# appended to whatever cell precedes them.

use strict;
use warnings;
use HTML::Parser;

## PRIVATE
my $table = [];      # shared result: [table][row][column] => text
my $tb_count;        # number of <table> tags seen so far (all parses)
my $tb_idx;          # number of the table currently being filled
my $row;             # current row within $tb_idx
my $column;          # current column within $row
my $table_status;    # nesting depth; true while inside at least one table
my @save;            # stack of [$tb_idx, $row, $column] for enclosing tables

# new() - return the shared table array blessed into the invoking class.
# Works both as a class method and when called on an existing instance.
sub new {
    my $type  = shift;
    my $class = ref($type) || $type;
    return bless $table, $class;
}

# parse_it($src) - parse HTML and record the text of every table cell.
#
#   $src  a reference to a scalar holding the HTML, or a file name.
#
# Returns the (blessed) table array on success, which is always a true
# value, so callers that only test the result keep working.  Returns
# nothing when $src is undefined or the file could not be parsed/opened.
sub parse_it {
    my $self = shift;
    my $src  = shift;

    return unless defined $src;

    # Start every parse from a clean position so a previous, truncated
    # document (unclosed <table>) cannot leak its nesting state into this one.
    # $tb_count and the collected data are deliberately left alone.
    @save         = ();
    $tb_idx       = undef;
    $row          = undef;
    $column       = undef;
    $table_status = 0;

    my $p = HTML::Parser->new(
        api_version => 3,
        handlers    => [
            start => [ \&_start, "tagname" ],
            end   => [ \&_end,   "tagname" ],
            text  => [ \&_text,  "dtext" ],
        ],
        marked_sections => 1,
    );

    if ( ref($src) ) {
        $p->parse($$src) or return;

        # parse() treats its argument as one chunk of a stream; eof() tells
        # the parser the document is complete so buffered text is flushed.
        $p->eof;
    }
    else {
        # parse_file() signals end-of-document itself.
        $p->parse_file($src) or return;
    }

    my $class = ref($self) || $self || __PACKAGE__;
    return bless $table, $class;
}

# Start-tag handler: tracks which table / row / column we are in.
sub _start {
    my $tag = shift;

    if ( $tag eq 'table' ) {
        # Remember where we were in the enclosing table (if any) and start
        # counting afresh for the new one.
        push @save, [ $tb_idx, $row, $column ];
        $row = $column = 0;
        ++$tb_count;
        $tb_idx = $tb_count;
        ++$table_status;
    }
    elsif ( $tag eq 'tr' ) {
        $row++;

        # Reset here as well as on </tr>: the end tag is often omitted, and
        # without this the column number would keep growing across rows.
        $column = 0;
    }
    elsif ( $tag eq 'td' ) {
        $column++;
    }

    return;
}

# End-tag handler: leaves a table or finishes a row.
sub _end {
    my $tag = shift;

    if ( $tag eq 'table' ) {
        # A stray </table> with no matching <table> has nothing to restore;
        # ignore it instead of dereferencing an empty stack.
        return unless @save;

        ( $tb_idx, $row, $column ) = @{ pop @save };
        --$table_status;
    }
    elsif ( $tag eq 'tr' ) {
        $column = 0;
    }

    return;
}

# Text handler: appends decoded text to the current cell.
sub _text {
    my $text = shift;

    return unless $table_status;    # not inside any table
    return unless defined $text;

    $text =~ s/\xa0//g;             # drop every non-breaking space (&nbsp;)

    # Skip empty / whitespace-only chunks.  Testing for a non-space character
    # (rather than plain truth) keeps a cell whose content is "0".
    return unless $text =~ /\S/;

    $table->[$tb_idx][$row][$column] .= $text;

    return;
}

1;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Table.pm: Extract text from html tables
by merlyn (Sage) on Jan 18, 2001 at 19:58 UTC | |
by zzspectrez (Hermit) on Jan 19, 2001 at 05:58 UTC | |
by extremely (Priest) on Jan 19, 2001 at 05:38 UTC | |
|
Re: Table.pm: Extract text from html tables
by Anonymous Monk on Feb 10, 2005 at 06:33 UTC | |
by Anonymous Monk on Aug 13, 2012 at 21:04 UTC | |
by runrig (Abbot) on Aug 13, 2012 at 21:12 UTC | |
|
Re: Table.pm: Extract text from html tables
by Anonymous Monk on Apr 30, 2010 at 23:56 UTC | |
by Anonymous Monk on May 01, 2010 at 03:57 UTC |