#!/usr/bin/perl --
use strict;
use warnings;
use HTML::TreeBuilder;
Main(@ARGV);
exit(0);
sub Main {
if (@_) {
PumpDump(@_);
#~ PumpDump('', qw/look_down criteria/ );
#~ PumpDump('file', qw/look_down criteria/ );
} else {
print "Usage: $0 file _tag div\n\n";
print "Demo1\n";
Demo1();
print "Demo1\n";
Demo3();
} ## end else [ if (@_) ]
} ## end sub Main
sub Demo1 {
my $html = <<'__HTML__';
key1 val1
key2 val2
key3 val3
key4 val4
key5 val5
key6 val6
key7 val7
key8 val8
key9 val9
key10 val10
key11 val11
__HTML__
PumpDump( $html, _tag => qr/table|strong/i );
} ## end sub Demo1
sub Demo3 {
my $html = <<'__HTML__';
educa.ch
Altes Schulhaus Ossingen
Guntibachstrasse 10
8475 Ossingen
Tel:
052 317 15 45
Fax:
052 317 04 42
__HTML__
PumpDump( $html, _tag => qr/div/i );
} ## end sub Demo3
sub HTML::Element::addressx {
return join(
'/',
'', # // ROOT
reverse( # so it starts at the top
map {
my $count = 0;
my $t = $_->tag;
## LEFT CAN BE A STRING
my @left = $_->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
)
);
} ## end sub HTML::Element::addressx
sub HTML::Element::addressxx {
my (@stuff) = (
map {
my $count = 0;
my $t = $_->tag;
## LEFT CAN BE A STRING
my @left = $_->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( my $attid = $_->attr('id') ) {
$count = "[\@id='$attid']";
} elsif ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
);
#~ use DDS; print Dump(\@stuff),"\n";
use List::MoreUtils qw[ before_incl ];
my $stuff = @stuff;
@stuff = before_incl { /\[\@id/i } @stuff;
return join(
'/',
( $stuff > @stuff ? '/' : '' ),
reverse( # so it starts at the top
@stuff
)
);
} ## end sub HTML::Element::addressxx
sub HTML::Element::addressxX {
my (@stuff) = (
map {
my $e = $_;
my $count = 0;
my $t = $e->tag;
my @left = $e->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( my $attid = $e->id ) {
$count = "[\@id='$attid']";
} elsif ( my @att = grep !/^id$/, $e->all_external_attr_names ) {
$count = '['
. join( ' and ',
map { sprintf q!@%s='%s'!, $_, $e->attr($_) } @att )
. ']';
} elsif ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
);
#~ use DDS; print Dump(\@stuff),"\n";
my $stuff = @stuff;
use List::MoreUtils qw[ before_incl ];
@stuff = before_incl { /\[\@id/i } @stuff;
return join(
'/',
( $stuff > @stuff ? '/' : '' ),
reverse( # so it starts at the top
@stuff
)
);
} ## end sub HTML::Element::addressxX
sub PumpDump {
my ( $html, @lookdown ) = @_;
my $tree = HTML::TreeBuilder->new();
if ( $html =~ / ) {
$tree->parse($html);
} else {
$tree->parse_file($html);
}
$tree->eof;
for my $td ( $tree->look_down(@lookdown) ) {
my $text = $td->as_trimmed_text;
next if $text =~ /^\p{Zs}*$/; ## ysth, nbsp isn't \s
print $td, "\t", $td->address, "\n";
print $text, "\n";
print $td->addressx, "\n";
print $td->addressxx, "\n";
print $td->addressxX, "\n";
print '-' x 66, "\n";
} ## end for my $td ( $tree->look_down...)
$tree->delete;
undef $tree;
print '#' x 66, "\n\n";
} ## end sub PumpDump
__END__
####
$ perl htmltreexpather.pl select.html _tag option
HTML::Element=HASH(0xb139ec) 0.1.1.0.0
Chose Some aaa
/html/body/form/select/option
/html/body/form/select/option
/html/body[@bgcolor='red']/form[@action='/foo.cgi' and @name='queryfoo']/select[@name='singlelist']/option[@value='aaa']
------------------------------------------------------------------
##################################################################
##
##
$ perl htmltreexpather.pl
Usage: htmltreexpather.pl file _tag div
Demo1
HTML::Element=HASH(0xb163f4) 0.1.1.1.0.1.1.1.0
key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9val9key10val10key11val11
/html/body/div/div/div/div/div/div/table
//table[@id='wrappedcontent']
//table[@id='wrappedcontent']
------------------------------------------------------------------
HTML::Element=HASH(0xb16574) 0.1.1.1.0.1.1.1.0.0.0.0.0
key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9val9key10val10key11val11
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table
//table[@id='wrappedcontent']/tbody/tr/td/table
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table
------------------------------------------------------------------
HTML::Element=HASH(0xb166c4) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.0.0.0
key1
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb16874) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.1.0.0
key2
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6b9ac) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.2.0.0
key3
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[2]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[2]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[2]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6bb5c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.3.0.0
key4
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[3]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[3]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[3]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6bd0c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.4.0.0
key5
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[4]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[4]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[4]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6bebc) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.5.0.0
key6
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[5]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[5]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[5]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6c06c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.6.0.0
key7
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[6]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[6]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[6]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6c21c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.7.0.0
key8
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[7]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[7]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[7]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6c3cc) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.8.0.0
key9
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[8]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[8]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[8]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6c57c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.9.0.0
key10
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[9]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[9]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[9]/td/strong
------------------------------------------------------------------
HTML::Element=HASH(0xb6c72c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.10.0.0
key11
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[10]/td/strong
//table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[10]/td/strong
//table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[10]/td/strong
------------------------------------------------------------------
##################################################################
Demo1
HTML::Element=HASH(0xb6c44c) 0.1.2
Altes Schulhaus Ossingen
/html/body/div
/html/body/div
/html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[@class='leerzeile']
------------------------------------------------------------------
HTML::Element=HASH(0xb6c13c) 0.1.4
Guntibachstrasse 10
/html/body/div[3]
/html/body/div[3]
/html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[3]
------------------------------------------------------------------
HTML::Element=HASH(0xb6c2cc) 0.1.6
8475 áOssingen
/html/body/div[5]
/html/body/div[5]
/html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[5]
------------------------------------------------------------------
HTML::Element=HASH(0xb6bdec) 0.1.9
sekretariat.psossingen@bluewin.ch
/html/body/div[8]
/html/body/div[8]
/html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[8]
------------------------------------------------------------------
HTML::Element=HASH(0xb6da44) 0.1.11
Tel:052 317 15 45
/html/body/div[10]
/html/body/div[10]
/html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[10]
------------------------------------------------------------------
HTML::Element=HASH(0xb6bd1c) 0.1.12
Fax:052 317 04 42
/html/body/div[11]
/html/body/div[11]
/html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[11]
------------------------------------------------------------------
##################################################################