#!/usr/bin/perl --
use strict;
use warnings;
our $VERSION = 20120112; # 2012-01-12
use HTML::TreeBuilder;
Main(@ARGV);
exit(0);
sub Main {
if (@_) {
PumpDump(@_);
#~ PumpDump('<html>', qw/look_down criteria/ );
#~ PumpDump('file', qw/look_down criteria/ );
} else {
print "Usage: $0 file _tag div\n\n";
for my $ix ( 1 .. 3 ){
my $demo = "Demo$ix";
print "$demo\n";
__PACKAGE__->can($demo)->();
}
print "Usage: $0 file _tag div\n\n";
} ## end else [ if (@_) ]
} ## end sub Main
sub Demo1 {
my $html = <<'__HTML__';
<html>
<body>
<div></div>
<div id="wrapper">
<div></div>
<div id="outer">
<div id="inner">
<div></div>
<div id="center">
<div></div>
<div id="main">
<div></div>
<div>
<table id="wrappedcontent">
<tbody class="shnitzel" bgcolor='red'>
<tr>
<td>
<table>
<tbody>
<tr> <td><strong>key1</strong></td> <td>val1</td> </tr>
<tr> <td><strong>key2</strong></td> <td>val2</td> </tr>
<tr> <td><strong>key3</strong></td> <td>val3</td> </tr>
<tr> <td><strong>key4</strong></td> <td>val4</td> </tr>
<tr> <td><strong>key5</strong></td> <td>val5</td> </tr>
<tr> <td><strong>key6</strong></td> <td>val6</td> </tr>
<tr> <td><strong>key7</strong></td> <td>val7</td> </tr>
<tr> <td><strong>key8</strong></td> <td>val8</td> </tr>
<tr> <td><strong>key9</strong></td> <td>val9</td> </tr>
<tr> <td><strong>key10</strong></td> <td>val10</td> </tr>
<tr> <td><strong>key11</strong></td> <td>val11</td> </tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>
__HTML__
PumpDump( $html, _tag => qr/table|strong/i );
} ## end sub Demo1
sub Demo2 {
my $html = <<'__HTML__';
<html>
<body>
<div id="yo">
<div q="don't">
don't
<div></div>
<div q='"quote"'>
"quote"
<div q="it's"> it's </div>
<div q="it's"> nonsense </div>
</div>
</div>
</div>
</body>
</html>
__HTML__
PumpDump( $html, _tag => qr/div/i );
} ## end sub Demo2
sub Demo3 {
my $html = <<'__HTML__';
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://
+www.w3.org/TR/html4/loose.dtd"><html><head><meta name="generator" con
+tent="DigiOnline GmbH - WebWeaver 3.4 CMS - http://www.webweaver.de">
+<title>educa.ch</title><meta http-equiv="Content-Type" content="text/
+html; charset=iso-8859-1"><link rel="stylesheet" href="101.htm"><scri
+pt src="102.htm"></script><script language="JavaScript"><!--
var did='d79376';
var root=new Array('d200','d205','d73137','d1566','d79376','d');
var usefocus = 1;
function check() {
if ((self.focus) && (usefocus)) {
self.focus();
}
}
// --></script></head><body bgcolor="#FFFFFF" leftmargin="0" topmargin
+="0" marginwidth="0" marginheight="0" onload="check();"><table cellsp
+acing="0" cellpadding="0" border="0" width="100%"><tr><td width="15"
+class="popuphead"><img src="/0.gif" alt="" width="15" height="16"></t
+d><td width="99%" class="popuphead">Adresse - Schulen in der Schweiz<
+/td><td width="20" class="popuphead" valign="middle"><a href="#" titl
+e="Print" onclick="window.print(); return false;"><img src="../pics/p
+rint16x13.gif" alt="Drucken" width="16" height="13"></a></td><td widt
+h="20" class="popuphead" valign="middle"><a href="#" title="close" on
+click="window.close(); return false;"><img src="../pics/close21x13.gi
+f" alt="Schliessen" width="21" height="13"></a></td></tr>
<tr bgcolor="#B2B2B2"><td colspan="4"><img src="/0.gif" alt="" width="
+1" height="1"></td></tr></table><div class="leerzeile"> </div><d
+iv class="leerzeile"><img src="/0.gif" alt="" width="15"height="8">Al
+tes Schulhaus Ossingen </div><div class="leerzeile"> </div><d
+iv><img src="/0.gif" alt="" width="15" height="8">Guntibachstrasse 10
+</div><div><img src="/0.gif" alt="" width="15" height="8"></div><div>
+<img src="/0.gif" alt="" width="15" height="8">8475  Ossingen</d
+iv><div class="leerzeile"> </div><div><img src="/0.gif" alt="" w
+idth="15" height="8"><a href="" target="_blank"></a></div><div><img s
+rc="/0.gif" alt="" width="15" height="8"><a href="mailto: sekretariat
+.psossingen@bluewin.ch">sekretariat.psossingen@bluewin.ch</a></div><d
+iv class="leerzeile"> </div><div><img src="/0.gif" alt="" width=
+"15" height="8">Tel:<img src="/0.gif" alt="" width="6" height="8">052
+ 317 15 45 </div><div><img src="/0.gif" alt="" width="15" height="8">
+Fax:<img src="/0.gif" alt="" width="4" height="8">052 317 04 42 </div
+><div> </div></body></html>
__HTML__
PumpDump( $html, _tag => qr/div/i );
} ## end sub Demo3
sub HTML::Element::addressx {
return join(
'/',
'', # // ROOT
reverse( # so it starts at the top
map {
my $count = 1; # 2011-03-02-01:26:06 duh, off byone er
+ror, in xpath, start counting at 1? xpather, xpath checker agree
my $t = $_->tag;
## LEFT CAN BE A STRING
my @left = $_->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
)
);
} ## end sub HTML::Element::addressx
sub HTML::Element::addressxx {
my (@stuff) = (
map {
my $count = 1; # 2011-03-02-01:26:06 duh, off byone error,
+ in xpath, start counting at 1? xpather, xpath checker agree
my $t = $_->tag;
## LEFT CAN BE A STRING
my @left = $_->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( my $attid = $_->attr('id') ) {
$attid = xpath_attr_escape( $attid );
$count = "[\@id='$attid']";
} elsif ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
);
#~ use DDS; print Dump(\@stuff),"\n";
use List::MoreUtils qw[ before_incl ];
my $stuff = @stuff;
@stuff = before_incl { /\[\@id/i } @stuff;
return join(
'/',
( $stuff > @stuff ? '/' : '' ),
reverse( # so it starts at the top
@stuff
)
);
} ## end sub HTML::Element::addressxx
sub HTML::Element::addressxX {
my (@stuff) = (
map {
my $e = $_;
my $count = 1; # 2011-03-02-01:26:06 duh, off byone error,
+ in xpath, start counting at 1? xpather, xpath checker agree
my $t = $e->tag;
my @left = $e->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( my $attid = $e->id ) {
$attid = xpath_attr_escape( $attid );
$count = "[\@id='$attid']";
} elsif ( my @att = grep !/^id$/, $e->all_external_attr_na
+mes ) {
$count = '['
. join( ' and ',
map { sprintf q!@%s='%s'!, $_, xpath_attr_escape($
+e->attr($_)) } @att )
. ']';
} elsif ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
);
#~ use DDS; print Dump(\@stuff),"\n";
my $stuff = @stuff;
use List::MoreUtils qw[ before_incl ];
@stuff = before_incl { /\[\@id/i } @stuff;
return join(
'/',
( $stuff > @stuff ? '/' : '' ),
reverse( # so it starts at the top
@stuff
)
);
} ## end sub HTML::Element::addressxX
sub PumpDump {
my ( $html, @lookdown ) = @_;
my $tree = HTML::TreeBuilder->new();
if ( $html =~ /</ ) {
$tree->parse($html);
} else {
if( $html =~ /\.xml$/ ){
$tree->implicit_tags(0);
$tree->no_expand_entities(1);
$tree->ignore_unknown(0);
$tree->ignore_ignorable_whitespace(0);
$tree->no_space_compacting(1);
$tree->store_comments(1);
$tree->store_pis(1);
}
$tree->parse_file($html);
}
$tree->eof;
warn $tree->as_HTML, " " if $html =~ /\.xml$/; # because it just does
+n't work for xml
@lookdown = sub{1} unless @lookdown; # every tag
for my $td ( $tree->look_down(@lookdown) ) {
my $text = $td->as_trimmed_text;
next if $text =~ /^\p{Zs}*$/; ## ysth, nbsp isn't \s
print $td, "\t", $td->address, "\n";
print $text, "\n";
print $td->addressx, "\n";
print $td->addressxx, "\n";
print $td->addressxX, "\n";
print '-' x 66, "\n";
} ## end for my $td ( $tree->look_down...)
$tree->delete;
undef $tree;
print '#' x 66, "\n\n";
} ## end sub PumpDump
BEGIN {
my %rep = qw{ " " ' ' } ;
sub xpath_attr_escape {
my( $t ) = @_;
$t =~ s/(['"])/ $rep{$1} /ge;
$t;
}
}
__END__