...
if ($grid_row->[$c]) {
#orig: bless $cell, 'HTML::ElementTable::DataElement';
bless $cell, 'HTML::ElementTable::DataElement' if defined $cell; #patch
next;
}
...
####
...
use HTML::TableExtract qw(tree);
...
my @headers = ("Solicitation #", qw(Types Agency));
my $te = HTML::TableExtract->new( slice_columns=> 1,
keep_html => 0,
# comment next line for "bless-error"
headers => \@headers
);
$te->parse($HTML);
foreach my $ts ($te->tables) {
print "======= Table (", join(',', $ts->coords), ") =======\n";
print join("\t", @headers), "\n";
foreach my $row ($ts->rows) {
print join("\t", map { $_->as_trimmed_text() } @$row), "\n";
}
}
####
...
Size: 1084487
======= Table (4,3) =======
*ROW: IFB, State Police
*ROW: IFB, Corrections
*ROW: IFB, Public Welfare
*ROW: IFB, Military and Veterans Affairs
*ROW: RFP, General Services
*ROW: IFB, Fish and Boat Commission
...
####
version_patchlevel_string='version 14 subversion 1';
archname='i686-linux-thread-multi';
Carp : 1.20
Config : n/a
Encode : 2.42
File::Slurp : 9999.19
HTML::Element : 4.2
HTML::Encoding : 0.61
HTML::TableExtract : 2.11
HTML::TreeBuilder : 4.2
LWP::UserAgent : 6.02
Readonly : 1.03
WWW::Mechanize : 1.70
strict : 1.04
warnings : 1.12
####
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use WWW::Mechanize;
use Readonly;
use HTML::TreeBuilder;
use HTML::Element qw(Table);
#use HTML::TableExtract qw(tree);
use HTML::TableExtract;
use HTML::Encoding 'encoding_from_http_message';
use Encode;
use File::Slurp;
sub versions { # print version of modules used
use Config qw(config_vars);
print config_vars(qw(version_patchlevel_string archname));
my @modules = sort grep { defined } map { /\A\s*use\s+(\S+)\b/i; $1 } read_file($0);
printf("%-20s: %s\n", $_, eval "\$$_\:\:VERSION // 'n/a'") for @modules;
}
versions; # just to see module versions involved
Readonly::Scalar my $url => 'http://www.emarketplace.state.pa.us/Search.aspx';
if (! -e "cache.tmp" ) { # FOR TESTING
my $mech = WWW::Mechanize->new( agent => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0.2) Gecko/20100101 Firefox/6.0.2' );
$mech->get($url);
#There is only one form on the page and they start at 1 in WWW:Mechanize
my $form = $mech->form_number(1);
# 'wucSearch$btnSearch' is the NAME of the button we want to press; 'wucSearch_btnSearch' is the id
# 'wucSearchResults$ddlRows' is the NAME of the input item we want to set to 'ALL'
$mech->select('wucSearchResults$ddlRows','ALL');
my $response = $mech->click_button(name => 'wucSearch$btnSearch');
if ($response->is_success) {
# print $response->decoded_content; # or whatever
my $HTML = $response->decoded_content;
open my $cfh, '>', 'cache.tmp' or die "cannot cache.tmp - $!";
print $cfh $HTML;
close $cfh;
} else {
die $response->status_line;
}
}
#---- overwriting the original new_from_tree() method
# use HTML::TableExtract qw(tree); <-- patch works for this variant
{
package HTML::ElementTable;
my $DEBUG=0;
use Carp;
sub new_from_tree_PATCH {
# takes a regular HTML::Element table tree structure and reblesses and
# configures it into an HTML::ElementTable structure.
#
# Dealing with row and column span issues properly is a real PITA, so
# we cheat here a little bit by creating a new table structure with
# fully rendered spans and use that as a template for normalizing the
# old table.
my($class, $tree) = @_;
ref $tree or croak "Ref to element tree required.\n";
$tree->tag eq 'table' or croak "element tree should represent a table.\n";
# First get rid of non elements -- note this WILL zap comments within
# the html of the table structure (i.e. in between adjacent tr tags or
# td/th tags). While we're at it, determine dimensions.
my($maxrow, $maxcol) = (-1, -1);
my @rows;
my @content = reverse $tree->detach_content;
while (@content) {
my $row = pop @content;
next unless UNIVERSAL::isa($row, 'HTML::Element');
my $tag = $row->tag;
# hack around tbody, thead, tfoot - yes, this means they get
# stripped out of the resulting table tree
if ($tag eq 'tbody' || $tag eq 'thead' || $tag eq 'tfoot') {
push(@content, reverse $row->detach_content);
next;
}
if ($tag eq 'tr') {
++$maxrow;
my @cells;
foreach my $cell ($row->detach_content) {
if (UNIVERSAL::isa($cell, 'HTML::Element') &&
($cell->tag eq 'td' || $cell->tag eq 'th')) {
push(@cells, $cell);
}
}
$maxcol = $#cells if $#cells > $maxcol;
$row->push_content(@cells);
push(@rows, $row);
}
}
$tree->push_content(@rows);
# Rasterize the tree table into a grid template -- use that as a guide
# to flesh out our new H::ET
eval "use HTML::TableExtract 2.08 qw(tree)";
croak "Problem loading HTML::TableExtract : $@\n" if $@;
my $rasterizer = HTML::TableExtract::Rasterize->make_rasterizer;
@rows = $tree->content_list;
foreach my $r (0 .. $#rows) {
my $row = $rows[$r];
foreach my $cell ($row->content_list) {
my $rowspan = $cell->attr('rowspan') || 1;
my $colspan = $cell->attr('colspan') || 1;
$rasterizer->($r, $rowspan, $colspan);
}
}
my $grid = $rasterizer->();
# Flesh out the tree structure, inserting masked cells where
# appropriate
foreach my $r (0 .. $#$grid) {
my $row = $rows[$r];
my $grid_row = $grid->[$r];
my $content = $row->content_array_ref;
print STDERR "Flesh row $r ($#$content) to $#$grid_row\n" if $DEBUG;
foreach my $c (0 .. $#$grid_row) {
my $cell = $content->[$c];
print STDERR $grid_row->[$c] ? '1' : '0' if $DEBUG;
if ($grid_row->[$c]) {
## print "BLESS: $cell\n" unless ref $cell;
bless $cell, 'HTML::ElementTable::DataElement' if defined $cell; #patch
next;
}
else {
my $masked = HTML::ElementTable::DataElement->new;
$masked->mask(1);
$row->splice_content($c, 0, $masked);
}
}
print STDERR "\n" if $DEBUG;
croak "row $r splice mismatch: $#$content vs $#$grid_row\n"
unless $#$content == $#$grid_row;
## print "BLESS2: $row\n" unless ref $row;
bless $row, 'HTML::ElementTable::RowElement';
}
## print "BLESS3: $tree\n" unless ref $tree;
bless $tree, 'HTML::ElementTable';
$tree->_initialize_table;
$tree->refresh;
print $tree->as_HTML, "\n" if $DEBUG > 1;
return $tree;
}
*new_from_tree = *new_from_tree_PATCH;
}
package main;
#---- end of method patch
my $HTML = read_file("cache.tmp");
print "Size: ", length( $HTML ), "\n";
my $te = HTML::TableExtract->new(slice_columns=> 1,
keep_html => 0,
headers => ["Types", "Agency"] #TODO!
);
$te->parse($HTML);
foreach my $ts ($te->tables) {
print "======= Table (", join(',', $ts->coords), ") =======\n";
foreach my $row ($ts->rows) {
s/^\s+//,s/\s+$// for @$row; # trim
print "*ROW: ", join(", ", @$row), "\n";
}
}