#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper qw( Dumper );
use Encode qw( decode encode );
use XML::Simple qw( :strict );
my @KNOWN_SAX_PARSERS = qw(
XML::LibXML::SAX
XML::LibXML::SAX::Parser
XML::SAX::ExpatXS
XML::SAX::PurePerl
);
use constant TEST_IDX_ENC_USED => 0;
use constant TEST_IDX_ENC_SPEC => 1;
use constant TEST_IDX_BOM => 2;
use constant TEST_IDX_XML => 3;
use constant TEST_IDX_MUST_PASS => 4;
use constant TEST_IDX_MUST_FAIL => 5;
use constant TEST_IDX_EXPECT => 6;
use constant RESU_IDX_TEST => 0;
use constant RESU_IDX_PARSE_ERR => 1;
use constant RESU_IDX_MATCH_ERR => 2;
use constant RESU_IDX_GOT_STR => 3;
sub xml_from_text {
my $s = @_ ? $_[0] : $_;
$s =~ s/&/&/g;
$s =~ s/</g;
$s =~ s/>/>/g;
return $s;
}
sub xmlval_from_text {
my $s = @_ ? $_[0] : $_;
$s =~ s/&/&/g;
$s =~ s/</g;
$s =~ s/>/>/g;
$s =~ s/"/"/g;
$s =~ s/'/'/g;
return $s;
}
sub format_str {
my $s = @_ ? $_[0] : $_;
local $Data::Dumper::Useqq = 1;
local $Data::Dumper::Terse = 1;
local $Data::Dumper::Indent = 0;
return Dumper($s);
}
sub has_bom {
my ($enc) = @_;
# encode's second arg can't be read-only for some reason.
return eval { encode($enc, my $bom=chr(0xFEFF), Encode::FB_CROAK); 1 };
}
sub build_tests {
my @tests;
my $payload = "abcd\x82\x85\x{2660}\x{2661}\x{2662}\x{2663}";
for my $specify_enc (0,1) {
for my $use_bom (0,1) {
my $bom = $use_bom ? chr(0xFEFF) : '';
push @tests, [];
for my $enc (qw( iso-8859-1 UTF-8 UTF-16 UTF-16be UTF-16le )) {
my @enc_with = ($enc eq 'UTF-16'
? qw( UTF-16be UTF-16le )
: $enc
);
my $has_bom = has_bom($enc);
next if $use_bom && !$has_bom;
for my $enc_with (@enc_with) {
next if !$specify_enc && $enc ne $enc_with;
my $xml_decl = ($specify_enc
? qq{}
: qq{}
);
my $xml = encode($enc_with, <<"__EOI__");
$bom$xml_decl
${\ xml_from_text($payload) }
__EOI__
my $must_pass = 1;
my $must_fail = 0;
if (!$use_bom && !$specify_enc && $enc eq 'iso-8859-1' ) {
$must_pass = 0;
$must_fail = 1;
}
elsif ( (!$use_bom && $enc_with =~ /^UTF-16/ )
|| ( !$specify_enc && $enc_with ne 'UTF-8' )
) {
$must_pass = 0;
$must_fail = 0;
}
my @test;
$test[TEST_IDX_ENC_USED ] = $enc_with;
$test[TEST_IDX_ENC_SPEC ] = $specify_enc ? $enc : undef;
$test[TEST_IDX_BOM ] = $use_bom;
$test[TEST_IDX_XML ] = $xml;
$test[TEST_IDX_MUST_PASS ] = $must_pass;
$test[TEST_IDX_MUST_FAIL ] = $must_fail;
$test[TEST_IDX_EXPECT ] = decode($enc_with, encode($enc_with, $payload));
push @{ $tests[-1] }, \@test;
}
}
}
}
return \@tests;
}
sub load_module {
my ($mod) = @_;
$mod =~ s{::}{/}g;
$mod .= '.pm';
return eval { require $mod };
}
sub get_parser_desc_name {
my ($name) = @_;
my $ver = $name->VERSION();
return $name . ( defined($ver) ? " $ver" : '' );
}
my $xml_simple_desc_name = get_parser_desc_name('XML::Simple');
sub xml_simple_parser {
my ($name) = @_;
my $parser = XML::Simple->new( ForceArray => 1, KeyAttr => {} );
return [
$name,
sprintf('%s (via %s)',
get_parser_desc_name($name),
$xml_simple_desc_name,
),
sub {
local $XML::Simple::PREFERRED_PARSER = $name;
return $parser->XMLin($_[0]);
},
sub {
my ($tree) = @_;
ref($tree ) eq 'HASH' or return undef;
ref($tree->{text}) eq 'ARRAY' or return undef;
return $tree->{text}[0];
},
];
}
sub find_parsers {
my @parsers;
if (!load_module('XML::LibXML')) {
warn("warn: XML::LibXML not available\n");
} else {
my $parser = XML::LibXML->new();
push @parsers, [
'XML::LibXML',
get_parser_desc_name('XML::LibXML'),
sub { $parser->parse_string($_[0]) },
sub {
my ($tree) = @_;
my @roots = $tree->childNodes()
or return undef;
my @texts = grep { $_->nodeType() == 1
&& $_->nodeName() eq 'text' }
$roots[0]->childNodes()
or return undef;
return $texts[0]->textContent();
},
];
}
if (!load_module('XML::Bare')) {
warn("warn: XML::Bare not available\n");
} else {
push @parsers, [
'XML::Bare',
get_parser_desc_name('XML::Bare'),
sub {
my $parser = XML::Bare->new( text => $_[0] );
return $parser->parse();
},
sub {
my ($tree) = @_;
ref($tree ) eq 'HASH' or return undef;
ref($tree->{root} ) eq 'HASH' or return undef;
ref($tree->{root}{text}) eq 'HASH' or return undef;
return $tree->{root}{text}{value};
},
];
}
if (!load_module('XML::Parser')) {
warn("warn: XML::Parser not available\n");
} else {
push @parsers, xml_simple_parser('XML::Parser');
}
if (!load_module('XML::SAX')) {
warn("warn: XML::SAX not available\n");
} else {
my @sax = sort map { $_->{Name} } @{ XML::SAX->parsers() };
my %sax = map { $_ => 1 } @sax;
my %known = map { $_ => 1 } @KNOWN_SAX_PARSERS;
for my $sax (@sax) {
warn("info: Discovered new SAX parser $sax\n")
if !delete($known{$sax});
load_module($sax)
or do { warn("error: Can't load SAX parser $sax\n");
next;
};
push @parsers, xml_simple_parser($sax);
}
for my $known (keys %known) {
load_module($known)
or do { warn("warn: Known parser $known not installed\n");
next;
};
warn("warn: XML::SAX unaware of installed parser $known\n");
push @parsers, xml_simple_parser($known);
}
}
return \@parsers;
}
sub run_tests {
my ($name, $desc_name, $parser, $result_getter, $tests) = @_;
my @results;
for my $test_grp (@$tests) {
push @results, [];
for my $test (@$test_grp) {
my $xml = $test->[TEST_IDX_XML];
my $expect = $test->[TEST_IDX_EXPECT];
my $parse_err;
my $match_err;
my $got;
my $tree = eval { $parser->($xml) };
if (!defined($tree)) {
$parse_err = $@ || '[Unknown]';
} else {
$got = $result_getter->($tree);
if (!defined($got)) {
$parse_err = 'Bad tree';
}
elsif ((defined($got) xor defined($expect)) || $got ne $expect ) {
$match_err = "got= ".format_str($got )
. "\n" . "expect=".format_str($expect);
}
}
my @result;
$result[RESU_IDX_TEST ] = $test;
$result[RESU_IDX_PARSE_ERR] = $parse_err;
$result[RESU_IDX_MATCH_ERR] = $match_err;
$result[RESU_IDX_GOT_STR ] = $got;
push @{ $results[-1] }, \@result;
}
}
return \@results;
}
sub html_from_text {
my $s = @_ ? $_[0] : $_;
$s =~ s/&/&/g;
$s =~ s/</g;
$s =~ s/>/>/g;
return $s;
}
sub htmlval_from_text {
my $s = @_ ? $_[0] : $_;
$s =~ s/&/&/g;
$s =~ s/</g;
$s =~ s/>/>/g;
$s =~ s/"/"/g;
$s =~ s/'/'/g;
return $s;
}
sub html_pre_from_text {
my ($text) = @_;
}
sub output_header {
print(<<'__EOI__');
Encodings and XML Parsers
__EOI__
}
sub output_parser_header {
my ($parser_name) = @_;
print(<<"__EOI__");
$parser_name
|
Encoding Used
| Encoding Specified
| Used BOM
| Result
__EOI__
}
sub output_group_header {
print(<<'__EOI__');
|
__EOI__
}
sub output_result {
my ($result) = @_;
my $test = $result->[RESU_IDX_TEST];
my $h_enc_used = html_from_text($test->[TEST_IDX_ENC_USED]);
my $h_enc_spec = $test->[TEST_IDX_ENC_SPEC] || '[none]';
my $h_used_bom = $test->[TEST_IDX_BOM] ? 'Yes' : 'No';
my $hv_status;
my $h_result;
if ($result->[RESU_IDX_PARSE_ERR] || $result->[RESU_IDX_MATCH_ERR]) {
if ($test->[TEST_IDX_MUST_FAIL]) {
$hv_status = 'desired_fail';
$h_result = 'Desired Fail = OK
';
} elsif ($test->[TEST_IDX_MUST_PASS]) {
$hv_status = 'undesired_fail';
$h_result = 'Undesired Fail = ERR
';
} else {
$hv_status = 'acceptable_fail';
$h_result = 'Acceptable Fail = OK
';
}
if ($result->[RESU_IDX_PARSE_ERR]) {
my $err = $result->[RESU_IDX_PARSE_ERR];
$err = html_from_text($err); # Must leave newlines intact.
$err =~ s/\n/
/g;
$h_result .= qq{Parse error:$err
};
}
elsif ($result->[RESU_IDX_MATCH_ERR]) {
my $err = $result->[RESU_IDX_MATCH_ERR];
$err = html_from_text($err); # Must leave newlines intact.
$err =~ s/\n/
/g;
$h_result .= qq{Match error:$err
};
}
}
else
{
if ($test->[TEST_IDX_MUST_PASS]) {
$hv_status = 'desired_pass';
$h_result = 'Desired Pass = OK
';
} elsif ($test->[TEST_IDX_MUST_FAIL]) {
$hv_status = 'undesired_pass';
$h_result = 'Undesired Pass = ERR
';
} else {
$hv_status = 'unexpected_pass';
$h_result = 'Unexpected Pass = OK
';
}
}
print(<<"__EOI__");
$h_enc_used
| $h_enc_spec
| $h_used_bom
| $h_result
__EOI__
}
sub output_group_footer {
my ($last) = @_;
# I wish I could style TBODY instead of doing this.
print(<<'__EOI__') if !$last;
|
__EOI__
}
sub output_parser_footer {
my ($parser_name) = @_;
print(<<'__EOI__');
|
__EOI__
}
sub output_footer {
print(<<'__EOI__');
__EOI__
}
{
my $parsers = find_parsers();
my $tests = build_tests();
output_header();
for (@$parsers) {
my ($name, $desc_name, $parser, $result_getter) = @$_;
output_parser_header($desc_name);
my $results = run_tests($name, $desc_name, $parser, $result_getter, $tests);
for my $result_grp (@$results) {
output_group_header();
for my $result (@$result_grp) {
output_result($result);
}
my $last = $result_grp == $results->[-1];
output_group_footer($last);
}
output_parser_footer();
}
output_footer();
}