#!/usr/bin/perl
# Demo: clean an HTML document with HTML::Tidy, parse the resulting XML with
# XML::XPath, and print + dump every <a> element that is a direct child of
# a <p>. The HTML input is read from the __DATA__ section at the end of this
# file (everything after the __DATA__ token is treated as input).

use strict;
use warnings;

use HTML::Tidy;
use XML::XPath;
use XML::XPath::XMLParser;
use Data::Dumper;

# Slurp the whole embedded document in a single read.
my $body = do { local $/; <DATA> };

# output_xml asks Tidy for well-formed XML; numeric_entities makes it write
# entities such as &nbsp; as numeric references, which an XML parser accepts
# without needing the HTML DTD.
my $tidy  = HTML::Tidy->new({ output_xml => 1, numeric_entities => 1 });
my $clean = $tidy->clean($body);

my $parser = XML::XPath->new(xml => $clean);

# Location path: every <a> whose parent is a <p>, anywhere in the document.
my $set   = '//p/a';
my $nodes = $parser->find($set);

foreach my $node ($nodes->get_nodelist) {
    print "\n";
    print "FOUND\n\n", XML::XPath::XMLParser::as_string($node), "\n";
    print Dumper($node);

    # Each node is a blessed reference to the underlying array; in the dumps
    # this script produces, slot 5 of an element holds the tag name.
    # NOTE(review): XML::XPath::Node::Element documents getName/getLocalName
    # accessors - prefer those over indexing into the internals.
    # print ${$node}->[5],"\n"; # element name
}

exit(0);

__DATA__
<html class="no-focus-outline no-js" lang="en-US" data-modal-active="true">
<head>
<title>test</title>
</head>
<body>
<h1>test&nbsp;heading</h1>
<div>
<p>paragraph one <a href="https://example.com/one/two.html">one</a> example.</p>
<p>paragraph two <a href="https://example.com/two/three.html">another</a> example.</p>
</div>
</body>
</html>

#### ... Element Node [ $parent, # node_parent , # node_pos 'xxx', # node_prefix - namespace prefix on this element [ ... ], # node_children 'yyy', # node_name - element tag name [ ... ], # node_attribs - attributes on this element [ ... ], # node_namespaces - namespaces currently in scope ] Attribute Node [ $parent, # node_parent - the element node , # node_pos 'xxx', # node_prefix - namespace prefix on this element 'href', # node_key - attribute name 'ftp://ftp.com/', # node_value - value in the node ] ... #### FOUND one $VAR1 = bless( do{\(my $o = bless( [ bless( [ bless( [ bless( [ bless( [ bless( [ undef, undef, 5, undef, [ bless( do{\(my $o = ${$VAR1}->[0]->[0]->[0]->[0])}, 'XML::XPath::Node::Element' ) ], undef, [], [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0]->[0], 0, 10, 'xml', 'http://www.w3.org/XML/1998/namespace' ], 'XML::XPath::Node::NamespaceImpl' ))}, 'XML::XPath::Node::Namespace' ) ] ], 'XML::XPath::Node::ElementImpl' ), 0, 15, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 0, 35, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 1, 40, '', [ bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 0, 45, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 1, 50, '', [], 'meta', [ bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}->[4]->[1]}, 0, 55, '', 'name', 'generator' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ), bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}->[4]->[1]}, 1, 60, '', 'content', 'HTML Tidy for HTML5 for Linux version 5.6.0' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 2, 
65, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 3, 70, '', [ bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}->[4]->[3]}, 0, 75, 'test' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'title', [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 4, 80, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'head', [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 2, 85, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = ${$VAR1}->[0]->[0]->[0])}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 4, 200, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'html', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 0, 20, '', 'class', 'no-focus-outline no-js' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 1, 25, '', 'lang', 'en-US' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 2, 30, '', 'data-modal-active', 'true' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ] ], 'XML::XPath::Node::ElementImpl' ), 3, 90, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 0, 95, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 1, 100, '', [ bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[4]->[1]}, 0, 105, "test\x{a0}heading" ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'h1', [] ], 'XML::XPath::Node::ElementImpl' ))}, 
'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 2, 110, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = ${$VAR1}->[0]->[0])}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 4, 195, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'body', [] ], 'XML::XPath::Node::ElementImpl' ), 3, 115, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 0, 120, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = ${$VAR1}->[0])}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 2, 155, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 3, 160, '', [ bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[4]->[3]}, 0, 165, 'paragraph two ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[4]->[3]}, 1, 170, '', [ bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[4]->[3]}->[4]->[1]}, 0, 180, 'another' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'a', [ bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[4]->[3]}->[4]->[1]}, 0, 175, '', 'href', 'https://example.com/two/three.html' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[4]->[3]}, 2, 185, ' example.' 
], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'p', [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 4, 190, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'div', [] ], 'XML::XPath::Node::ElementImpl' ), 1, 125, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0], 0, 130, 'paragraph one ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), $VAR1, bless( do{\(my $o = bless( [ ${$VAR1}->[0], 2, 150, ' example.' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'p', [] ], 'XML::XPath::Node::ElementImpl' ), 1, 135, '', [ bless( do{\(my $o = bless( [ ${$VAR1}, 0, 145, 'one' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'a', [ bless( do{\(my $o = bless( [ ${$VAR1}, 0, 140, '', 'href', 'https://example.com/one/two.html' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ], [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ); FOUND another $VAR1 = bless( do{\(my $o = bless( [ bless( [ bless( [ bless( [ bless( [ bless( [ undef, undef, 5, undef, [ bless( do{\(my $o = ${$VAR1}->[0]->[0]->[0]->[0])}, 'XML::XPath::Node::Element' ) ], undef, [], [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0]->[0], 0, 10, 'xml', 'http://www.w3.org/XML/1998/namespace' ], 'XML::XPath::Node::NamespaceImpl' ))}, 'XML::XPath::Node::Namespace' ) ] ], 'XML::XPath::Node::ElementImpl' ), 0, 15, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 0, 35, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 1, 40, '', [ bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 0, 45, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 1, 50, '', [], 'meta', [ bless( do{\(my $o = bless( [ 
${${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}->[4]->[1]}, 0, 55, '', 'name', 'generator' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ), bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}->[4]->[1]}, 1, 60, '', 'content', 'HTML Tidy for HTML5 for Linux version 5.6.0' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 2, 65, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 3, 70, '', [ bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}->[4]->[3]}, 0, 75, 'test' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'title', [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->[1]}, 4, 80, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'head', [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 2, 85, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = ${$VAR1}->[0]->[0]->[0])}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 4, 200, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'html', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 0, 20, '', 'class', 'no-focus-outline no-js' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 1, 25, '', 'lang', 'en-US' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0]->[0], 2, 30, '', 'data-modal-active', 'true' ], 
'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ] ], 'XML::XPath::Node::ElementImpl' ), 3, 90, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 0, 95, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 1, 100, '', [ bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[0]->[4]->[1]}, 0, 105, "test\x{a0}heading" ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'h1', [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 2, 110, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = ${$VAR1}->[0]->[0])}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0]->[0], 4, 195, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'body', [] ], 'XML::XPath::Node::ElementImpl' ), 3, 115, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 0, 120, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 1, 125, '', [ bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[4]->[1]}, 0, 130, 'paragraph one ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[4]->[1]}, 1, 135, '', [ bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[4]->[1]}->[4]->[1]}, 0, 145, 'one' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'a', [ bless( do{\(my $o = bless( [ ${${${$VAR1}->[0]->[0]->[4]->[1]}->[4]->[1]}, 0, 140, '', 'href', 'https://example.com/one/two.html' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ], [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${${$VAR1}->[0]->[0]->[4]->[1]}, 2, 150, ' example.' 
], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'p', [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 2, 155, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), bless( do{\(my $o = ${$VAR1}->[0])}, 'XML::XPath::Node::Element' ), bless( do{\(my $o = bless( [ ${$VAR1}->[0]->[0], 4, 190, ' ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'div', [] ], 'XML::XPath::Node::ElementImpl' ), 3, 160, '', [ bless( do{\(my $o = bless( [ ${$VAR1}->[0], 0, 165, 'paragraph two ' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ), $VAR1, bless( do{\(my $o = bless( [ ${$VAR1}->[0], 2, 185, ' example.' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'p', [] ], 'XML::XPath::Node::ElementImpl' ), 1, 170, '', [ bless( do{\(my $o = bless( [ ${$VAR1}, 0, 180, 'another' ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ) ], 'a', [ bless( do{\(my $o = bless( [ ${$VAR1}, 0, 175, '', 'href', 'https://example.com/two/three.html' ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ) ], [] ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' );