#!/usr/bin/perl --
# Print, for every element matched by a look_down() query, its text and three
# XPath-like addresses of increasing detail (see addressx, addressxx and
# addressxX below).
#
# Usage: script file-or-markup look_down-criteria...
# With no arguments the usage line is printed and the built-in demos run.
use strict;
use warnings;
use HTML::TreeBuilder;
use Scalar::Util qw(blessed);

Main(@ARGV);
exit(0);

# Entry point: dump the document named on the command line, or run the demos.
sub Main {
    if (@_) {
        PumpDump(@_);

        #~ PumpDump('', qw/look_down criteria/ );
        #~ PumpDump('file', qw/look_down criteria/ );
    }
    else {
        print "Usage: $0 file _tag div\n\n";
        print "Demo1\n";
        Demo1();
        print "Demo3\n";
        Demo3();
    }
    return;
} ## end sub Main

# Demo: report every table and strong element of an embedded sample.
# NOTE(review): the sample below contains no tags at all -- the markup looks
# like it was lost before this file was saved; confirm against the original.
sub Demo1 {
    my $html = <<'__HTML__';
key1 val1
key2 val2
key3 val3
key4 val4
key5 val5
key6 val6
key7 val7
key8 val8
key9 val9
key10 val10
key11 val11
__HTML__
    PumpDump( $html, _tag => qr/table|strong/i );
    return;
} ## end sub Demo1

# Demo: report every div element of an embedded sample.
# NOTE(review): as in Demo1, the sample text carries no tags; confirm against
# the original document.
sub Demo3 {
    my $html = <<'__HTML__';
educa.ch
Adresse - Schulen in der SchweizDruckenSchliessen
 
Altes Schulhaus Ossingen
 
Guntibachstrasse 10
8475  Ossingen
 
sekretariat.psossingen@bluewin.ch
 
Tel:052 317 15 45
Fax:052 317 04 42
 
__HTML__
    PumpDump( $html, _tag => qr/div/i );
    return;
} ## end sub Demo3

# Return the 1-based position of $element among the siblings that share its
# tag, which is the number an XPath positional predicate expects.
sub _sibling_position {
    my ($element) = @_;
    my $tag = $element->tag;

    # left() returns text nodes as plain strings; only objects have a tag.
    my $preceding = grep { blessed($_) && $_->tag eq $tag } $element->left;
    return $preceding + 1;
}

# Return "[n]" for the n-th same-tag sibling, or '' for the first one.
sub _position_predicate {
    my ($element) = @_;
    my $position = _sibling_position($element);
    return $position > 1 ? "[$position]" : '';
}

# Assemble a path for $element from per-node steps.
# $step_for is called with each node, starting at $element and walking up the
# lineage, and returns ( $step_text, $is_anchor ).  The walk stops at the
# first anchor; when that leaves ancestors out, the path starts with '//'
# instead of '/'.
sub _xpath_from_steps {
    my ( $element, $step_for ) = @_;
    my @nodes = ( $element, $element->lineage );    # innermost first
    my @steps;
    for my $node (@nodes) {
        my ( $step, $is_anchor ) = $step_for->($node);
        push @steps, $step;
        last if $is_anchor;
    }
    my $prefix = @steps < @nodes ? '/' : '';
    return join '/', $prefix, reverse @steps;       # root-most step first
}

# Absolute path built from tag names and sibling positions only,
# e.g. /html/body/div[3].
sub HTML::Element::addressx {
    my ($self) = @_;
    return _xpath_from_steps(
        $self,
        sub {
            my ($node) = @_;
            return ( $node->tag . _position_predicate($node), 0 );
        }
    );
} ## end sub HTML::Element::addressx

# Like addressx, but the path is cut at the nearest node that has an id
# attribute, e.g. //table[@id='x']/tbody/tr.
sub HTML::Element::addressxx {
    my ($self) = @_;
    return _xpath_from_steps(
        $self,
        sub {
            my ($node) = @_;
            my $id = $node->attr('id');
            if ( defined $id && length $id ) {
                return ( $node->tag . "[\@id='$id']", 1 );
            }
            return ( $node->tag . _position_predicate($node), 0 );
        }
    );
} ## end sub HTML::Element::addressxx

# Like addressxx, but a node without an id is qualified by all of its other
# external attributes when it has any; the sibling position is used only for
# nodes that have no attributes at all.
sub HTML::Element::addressxX {
    my ($self) = @_;
    return _xpath_from_steps(
        $self,
        sub {
            my ($node) = @_;
            my $tag    = $node->tag;
            my $id     = $node->id;
            if ( defined $id && length $id ) {
                return ( $tag . "[\@id='$id']", 1 );
            }
            my @names = grep { $_ ne 'id' } $node->all_external_attr_names;
            if (@names) {
                my $predicate = join ' and ',
                  map { sprintf q!@%s='%s'!, $_, $node->attr($_) } @names;
                return ( $tag . '[' . $predicate . ']', 0 );
            }
            return ( $tag . _position_predicate($node), 0 );
        }
    );
} ## end sub HTML::Element::addressxX

# Parse $html and print one record per element matched by look_down(@lookdown).
# $html is parsed as markup when it contains '<' or a line break; any other
# value is taken to be a file name.  Dies when that file cannot be read.
# Elements whose trimmed text is empty or only space separators are skipped.
sub PumpDump {
    my ( $html, @lookdown ) = @_;
    my $tree = HTML::TreeBuilder->new();
    if ( $html =~ /[<\n]/ ) {
        $tree->parse($html);
    }
    else {
        # parse_file() reports an unreadable file by returning undef.
        $tree->parse_file($html)
          or die "Cannot read '$html': $!\n";
    }
    $tree->eof;

    for my $element ( $tree->look_down(@lookdown) ) {
        my $text = $element->as_trimmed_text;
        next if $text =~ /^\p{Zs}*$/;    ## ysth, nbsp isn't \s
        print $element, "\t", $element->address, "\n";
        print $text, "\n";
        print $element->addressx,  "\n";
        print $element->addressxx, "\n";
        print $element->addressxX, "\n";
        print '-' x 66, "\n";
    }

    $tree->delete;
    print '#' x 66, "\n\n";
    return;
} ## end sub PumpDump

# NOTE: the transcript after __END__ was recorded before sibling positions
# became 1-based: a second same-tag sibling is shown there without an index
# and later ones with an index one lower than the script now prints.  The
# second demo is also labelled "Demo1" there.
__END__
#### $ perl htmltreexpather.pl select.html _tag option HTML::Element=HASH(0xb139ec) 0.1.1.0.0 Chose Some aaa /html/body/form/select/option /html/body/form/select/option /html/body[@bgcolor='red']/form[@action='/foo.cgi' and @name='queryfoo']/select[@name='singlelist']/option[@value='aaa'] ------------------------------------------------------------------ ################################################################## #### $ perl htmltreexpather.pl Usage: htmltreexpather.pl file _tag div Demo1 HTML::Element=HASH(0xb163f4) 0.1.1.1.0.1.1.1.0 key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9val9key10val10key11val11 /html/body/div/div/div/div/div/div/table //table[@id='wrappedcontent'] //table[@id='wrappedcontent'] ------------------------------------------------------------------ HTML::Element=HASH(0xb16574) 0.1.1.1.0.1.1.1.0.0.0.0.0 key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9val9key10val10key11val11 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table //table[@id='wrappedcontent']/tbody/tr/td/table //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table ------------------------------------------------------------------ HTML::Element=HASH(0xb166c4) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.0.0.0 key1 
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb16874) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.1.0.0 key2 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6b9ac) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.2.0.0 key3 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[2]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[2]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[2]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bb5c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.3.0.0 key4 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[3]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[3]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[3]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bd0c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.4.0.0 key5 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[4]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[4]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[4]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bebc) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.5.0.0 key6 
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[5]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[5]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[5]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c06c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.6.0.0 key7 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[6]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[6]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[6]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c21c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.7.0.0 key8 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[7]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[7]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[7]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c3cc) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.8.0.0 key9 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[8]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[8]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[8]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c57c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.9.0.0 key10 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[9]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[9]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[9]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c72c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.10.0.0 key11 
/html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[10]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[10]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitzel']/tr/td/table/tbody/tr[10]/td/strong ------------------------------------------------------------------ ################################################################## Demo1 HTML::Element=HASH(0xb6c44c) 0.1.2 Altes Schulhaus Ossingen /html/body/div /html/body/div /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[@class='leerzeile'] ------------------------------------------------------------------ HTML::Element=HASH(0xb6c13c) 0.1.4 Guntibachstrasse 10 /html/body/div[3] /html/body/div[3] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[3] ------------------------------------------------------------------ HTML::Element=HASH(0xb6c2cc) 0.1.6 8475 áOssingen /html/body/div[5] /html/body/div[5] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[5] ------------------------------------------------------------------ HTML::Element=HASH(0xb6bdec) 0.1.9 sekretariat.psossingen@bluewin.ch /html/body/div[8] /html/body/div[8] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[8] ------------------------------------------------------------------ HTML::Element=HASH(0xb6da44) 0.1.11 Tel:052 317 15 45 /html/body/div[10] /html/body/div[10] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[10] ------------------------------------------------------------------ HTML::Element=HASH(0xb6bd1c) 0.1.12 Fax:052 317 04 42 
/html/body/div[11] /html/body/div[11] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div[11] ------------------------------------------------------------------ ##################################################################