Beefy Boxes and Bandwidth Generously Provided by pair Networks
more useful options
 
PerlMonks  

Re: htmltreexpather.pl - xpath helper, creates xpath search strings from html ($VERSION = 20120112 )

by Anonymous Monk
on Jan 12, 2012 at 13:00 UTC ( [id://947532]=note: print w/replies, xml ) Need Help??


in reply to htmltreexpather.pl - xpath helper, creates xpath search strings from html

Here is  $VERSION = 20120112 , incorporates OBO and properly escapes quotes/apostrophe in attributes

#!/usr/bin/perl -- use strict; use warnings; our $VERSION = 20120112; # 2012-01-12 use HTML::TreeBuilder; Main(@ARGV); exit(0); sub Main { if (@_) { PumpDump(@_); #~ PumpDump('<html>', qw/look_down criteria/ ); #~ PumpDump('file', qw/look_down criteria/ ); } else { print "Usage: $0 file _tag div\n\n"; for my $ix ( 1 .. 3 ){ my $demo = "Demo$ix"; print "$demo\n"; __PACKAGE__->can($demo)->(); } print "Usage: $0 file _tag div\n\n"; } ## end else [ if (@_) ] } ## end sub Main sub Demo1 { my $html = <<'__HTML__'; <html> <body> <div></div> <div id="wrapper"> <div></div> <div id="outer"> <div id="inner"> <div></div> <div id="center"> <div></div> <div id="main"> <div></div> <div> <table id="wrappedcontent"> <tbody class="shnitzel" bgcolor='red'> <tr> <td> <table> <tbody> <tr> <td><strong>key1</strong></td> <td>val1</td> </tr> <tr> <td><strong>key2</strong></td> <td>val2</td> </tr> <tr> <td><strong>key3</strong></td> <td>val3</td> </tr> <tr> <td><strong>key4</strong></td> <td>val4</td> </tr> <tr> <td><strong>key5</strong></td> <td>val5</td> </tr> <tr> <td><strong>key6</strong></td> <td>val6</td> </tr> <tr> <td><strong>key7</strong></td> <td>val7</td> </tr> <tr> <td><strong>key8</strong></td> <td>val8</td> </tr> <tr> <td><strong>key9</strong></td> <td>val9</td> </tr> <tr> <td><strong>key10</strong></td> <td>val10</td> </tr> <tr> <td><strong>key11</strong></td> <td>val11</td> </tr> </tbody> </table> </td> </tr> </tbody> </table> </div> </div> </div> </div> </div> </div> </body> </html> __HTML__ PumpDump( $html, _tag => qr/table|strong/i ); } ## end sub Demo1 sub Demo2 { my $html = <<'__HTML__'; <html> <body> <div id="yo"> <div q="don't"> don't <div></div> <div q='"quote"'> "quote" <div q="it's"> it's </div> <div q="it's"> nonsense </div> </div> </div> </div> </body> </html> __HTML__ PumpDump( $html, _tag => qr/div/i ); } ## end sub Demo2 sub Demo3 { my $html = <<'__HTML__'; <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http:// +www.w3.org/TR/html4/loose.dtd"><html><head><meta name="generator" con +tent="DigiOnline GmbH - WebWeaver 3.4 CMS - http://www.webweaver.de"> +<title>educa.ch</title><meta http-equiv="Content-Type" content="text/ +html; charset=iso-8859-1"><link rel="stylesheet" href="101.htm"><scri +pt src="102.htm"></script><script language="JavaScript"><!-- var did='d79376'; var root=new Array('d200','d205','d73137','d1566','d79376','d'); var usefocus = 1; function check() { if ((self.focus) && (usefocus)) { self.focus(); } } // --></script></head><body bgcolor="#FFFFFF" leftmargin="0" topmargin +="0" marginwidth="0" marginheight="0" onload="check();"><table cellsp +acing="0" cellpadding="0" border="0" width="100%"><tr><td width="15" +class="popuphead"><img src="/0.gif" alt="" width="15" height="16"></t +d><td width="99%" class="popuphead">Adresse - Schulen in der Schweiz< +/td><td width="20" class="popuphead" valign="middle"><a href="#" titl +e="Print" onclick="window.print(); return false;"><img src="../pics/p +rint16x13.gif" alt="Drucken" width="16" height="13"></a></td><td widt +h="20" class="popuphead" valign="middle"><a href="#" title="close" on +click="window.close(); return false;"><img src="../pics/close21x13.gi +f" alt="Schliessen" width="21" height="13"></a></td></tr> <tr bgcolor="#B2B2B2"><td colspan="4"><img src="/0.gif" alt="" width=" +1" height="1"></td></tr></table><div class="leerzeile">&#160;</div><d +iv class="leerzeile"><img src="/0.gif" alt="" width="15"height="8">Al +tes Schulhaus Ossingen </div><div class="leerzeile">&#160;</div><d +iv><img src="/0.gif" alt="" width="15" height="8">Guntibachstrasse 10 +</div><div><img src="/0.gif" alt="" width="15" height="8"></div><div> +<img src="/0.gif" alt="" width="15" height="8">8475 &#160;Ossingen</d +iv><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" w +idth="15" height="8"><a href="" target="_blank"></a></div><div><img s +rc="/0.gif" alt="" width="15" height="8"><a href="mailto: sekretariat +.psossingen@bluewin.ch">sekretariat.psossingen@bluewin.ch</a></div><d +iv class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width= +"15" height="8">Tel:<img src="/0.gif" alt="" width="6" height="8">052 + 317 15 45 </div><div><img src="/0.gif" alt="" width="15" height="8"> +Fax:<img src="/0.gif" alt="" width="4" height="8">052 317 04 42 </div +><div>&#160;</div></body></html> __HTML__ PumpDump( $html, _tag => qr/div/i ); } ## end sub Demo3 sub HTML::Element::addressx { return join( '/', '', # // ROOT reverse( # so it starts at the top map { my $count = 1; # 2011-03-02-01:26:06 duh, off byone er +ror, in xpath, start counting at 1? xpather, xpath checker agree my $t = $_->tag; ## LEFT CAN BE A STRING my @left = $_->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ) ); } ## end sub HTML::Element::addressx sub HTML::Element::addressxx { my (@stuff) = ( map { my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, + in xpath, start counting at 1? xpather, xpath checker agree my $t = $_->tag; ## LEFT CAN BE A STRING my @left = $_->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( my $attid = $_->attr('id') ) { $attid = xpath_attr_escape( $attid ); $count = "[\@id='$attid']"; } elsif ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ); #~ use DDS; print Dump(\@stuff),"\n"; use List::MoreUtils qw[ before_incl ]; my $stuff = @stuff; @stuff = before_incl { /\[\@id/i } @stuff; return join( '/', ( $stuff > @stuff ? '/' : '' ), reverse( # so it starts at the top @stuff ) ); } ## end sub HTML::Element::addressxx sub HTML::Element::addressxX { my (@stuff) = ( map { my $e = $_; my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, + in xpath, start counting at 1? xpather, xpath checker agree my $t = $e->tag; my @left = $e->left; for my $left (@left) { eval { $count++ if $left->tag eq $t }; } if ( my $attid = $e->id ) { $attid = xpath_attr_escape( $attid ); $count = "[\@id='$attid']"; } elsif ( my @att = grep !/^id$/, $e->all_external_attr_na +mes ) { $count = '[' . join( ' and ', map { sprintf q!@%s='%s'!, $_, xpath_attr_escape($ +e->attr($_)) } @att ) . ']'; } elsif ( $count > 1 ) { $count = "[$count]"; } else { $count = ''; } $t . $count } $_[0], # self and... $_[0]->lineage ); #~ use DDS; print Dump(\@stuff),"\n"; my $stuff = @stuff; use List::MoreUtils qw[ before_incl ]; @stuff = before_incl { /\[\@id/i } @stuff; return join( '/', ( $stuff > @stuff ? '/' : '' ), reverse( # so it starts at the top @stuff ) ); } ## end sub HTML::Element::addressxX sub PumpDump { my ( $html, @lookdown ) = @_; my $tree = HTML::TreeBuilder->new(); if ( $html =~ /</ ) { $tree->parse($html); } else { if( $html =~ /\.xml$/ ){ $tree->implicit_tags(0); $tree->no_expand_entities(1); $tree->ignore_unknown(0); $tree->ignore_ignorable_whitespace(0); $tree->no_space_compacting(1); $tree->store_comments(1); $tree->store_pis(1); } $tree->parse_file($html); } $tree->eof; warn $tree->as_HTML, " " if $html =~ /\.xml$/; # because it just does +n't work for xml @lookdown = sub{1} unless @lookdown; # every tag for my $td ( $tree->look_down(@lookdown) ) { my $text = $td->as_trimmed_text; next if $text =~ /^\p{Zs}*$/; ## ysth, nbsp isn't \s print $td, "\t", $td->address, "\n"; print $text, "\n"; print $td->addressx, "\n"; print $td->addressxx, "\n"; print $td->addressxX, "\n"; print '-' x 66, "\n"; } ## end for my $td ( $tree->look_down...) $tree->delete; undef $tree; print '#' x 66, "\n\n"; } ## end sub PumpDump BEGIN { my %rep = qw{ " &quot; ' &apos; } ; sub xpath_attr_escape { my( $t ) = @_; $t =~ s/(['"])/ $rep{$1} /ge; $t; } } __END__

Demo2 HTML::Element=HASH(0xb33a1c) 0.1.0 don't "quote" it's nonsense /html/body/div //div[@id='yo'] //div[@id='yo'] ------------------------------------------------------------------ HTML::Element=HASH(0xb33bdc) 0.1.0.0.2.2 nonsense /html/body/div/div/div[2]/div[2] //div[@id='yo']/div/div[2]/div[2] //div[@id='yo']/div[@q='don&apos;t']/div[@q='&quot;quote&quot;']/div[@ +q='it&apos;s'] ------------------------------------------------------------------ ##################################################################

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://947532]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others avoiding work at the Monastery: (5)
As of 2024-03-28 15:00 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found