What am I not getting?
Now that I have looked: you also need to read https://metacpan.org/module/HTML::TreeBuilder#parse_file, because HTML::TreeBuilder is interpreting those UTF-8-encoded bytes as Latin-1.
This works:
#!/usr/bin/perl --
use strict;
use warnings;
use autodie;
use WWW::Mechanize 1.72;
#~ use HTML::TreeBuilder::XPath;
#~ use HTML::TreeBuilder::LibXML;
# Script entry point: pass the command-line arguments to Main, then
# terminate with a success status once it returns.
Main(@ARGV);
exit 0;
# xtree() - build an empty XPath-capable HTML tree object.
#
# Tries HTML::TreeBuilder::LibXML first and falls back to
# HTML::TreeBuilder::XPath; the first class that loads is instantiated
# with ->new and returned.
#
# Returns: a new instance of whichever class could be loaded.
# Dies:    when neither module can be loaded; the message lists the load
#          error of every candidate followed by an installation hint.
sub xtree {
    my @failures;

    for my $class ( qw( HTML::TreeBuilder::LibXML HTML::TreeBuilder::XPath ) ) {
        # require with a string needs a file path, not a package name.
        ( my $file = "$class.pm" ) =~ s{::}{/}g;

        my $error;
        {
            # Keep the caller's $@ intact; it is localized only around the
            # eval so that the die below runs outside the localized scope.
            local $@;
            return $class->new if eval { require $file; 1 };
            $error = $@;
        }

        # Remember every failure, not just the last one, so a broken
        # installation of the first candidate is still visible.
        push @failures, "$class: $error";
    }

    die @failures,
        "\nyou need to install HTML::TreeBuilder::LibXML\n",
        "or HTML::TreeBuilder::XPath\n\n";
}
# Main( $url ) - fetch a page and print every <span itemprop="reviewBody">.
#
# Arguments: $url - address of the page to fetch (first element of @_).
# Output:    for each matching node, three renderings on STDOUT, each
#            followed by a blank line: as_HTML with default arguments,
#            as_text, and as_HTML( q{<&>} ).
# Dies:      with a usage message when no (true) URL is given; other
#            failures propagate from WWW::Mechanize (created with
#            autocheck => 1), from xtree(), or from autodie.
sub Main {
    my $url = shift or die "\n\nUsage: $0 http....\n\n";

    # The output contains non-ASCII characters; encode them as UTF-8.
    binmode STDOUT, ':encoding(UTF-8)'; ## grr

    my $mech = WWW::Mechanize->new( autocheck => 1 );
    my $tree = xtree();

    $mech->get( $url );

    # Feed the tree the string returned by $mech->content (presumably
    # already decoded to characters by WWW::Mechanize - confirm) so the
    # parser does not have to guess the encoding of raw bytes.
    $tree->parse( $mech->content );

    # parse() only feeds a chunk; eof() must be called before the tree is
    # inspected so the parser can flush and finalize the document.
    $tree->eof;

    my $xpath = q{ //span[ @itemprop="reviewBody" ] };
    for my $node ( $tree->findnodes( $xpath ) ) {
        print $node->as_HTML, "\n\n";
        print $node->as_text, "\n\n";
        print $node->as_HTML( q{<&>} ), "\n\n";
    }

    return;
}
__END__