#!/usr/bin/perl use strict; use warnings; use Data::Dumper qw( Dumper ); use Encode qw( decode encode ); use XML::Simple qw( :strict ); my @KNOWN_SAX_PARSERS = qw( XML::LibXML::SAX XML::LibXML::SAX::Parser XML::SAX::ExpatXS XML::SAX::PurePerl ); use constant TEST_IDX_ENC_USED => 0; use constant TEST_IDX_ENC_SPEC => 1; use constant TEST_IDX_BOM => 2; use constant TEST_IDX_XML => 3; use constant TEST_IDX_MUST_PASS => 4; use constant TEST_IDX_MUST_FAIL => 5; use constant TEST_IDX_EXPECT => 6; use constant RESU_IDX_TEST => 0; use constant RESU_IDX_PARSE_ERR => 1; use constant RESU_IDX_MATCH_ERR => 2; use constant RESU_IDX_GOT_STR => 3; sub xml_from_text { my $s = @_ ? $_[0] : $_; $s =~ s/&/&/g; $s =~ s//>/g; return $s; } sub xmlval_from_text { my $s = @_ ? $_[0] : $_; $s =~ s/&/&/g; $s =~ s//>/g; $s =~ s/"/"/g; $s =~ s/'/'/g; return $s; } sub format_str { my $s = @_ ? $_[0] : $_; local $Data::Dumper::Useqq = 1; local $Data::Dumper::Terse = 1; local $Data::Dumper::Indent = 0; return Dumper($s); } sub has_bom { my ($enc) = @_; # encode's second arg can't be read-only for some reason. return eval { encode($enc, my $bom=chr(0xFEFF), Encode::FB_CROAK); 1 }; } sub build_tests { my @tests; my $payload = "abcd\x82\x85\x{2660}\x{2661}\x{2662}\x{2663}"; for my $specify_enc (0,1) { for my $use_bom (0,1) { my $bom = $use_bom ? chr(0xFEFF) : ''; push @tests, []; for my $enc (qw( iso-8859-1 UTF-8 UTF-16 UTF-16be UTF-16le )) { my @enc_with = ($enc eq 'UTF-16' ? qw( UTF-16be UTF-16le ) : $enc ); my $has_bom = has_bom($enc); next if $use_bom && !$has_bom; for my $enc_with (@enc_with) { next if !$specify_enc && $enc ne $enc_with; my $xml_decl = ($specify_enc ? qq{} : qq{} ); my $xml = encode($enc_with, <<"__EOI__"); $bom$xml_decl ${\ xml_from_text($payload) } __EOI__ my $must_pass = 1; my $must_fail = 0; if (!$use_bom && !$specify_enc && $enc eq 'iso-8859-1' ) { $must_pass = 0; $must_fail = 1; } elsif ( (!$use_bom && $enc_with =~ /^UTF-16/ ) || ( !$specify_enc && $enc_with ne 'UTF-8' ) ) { $must_pass = 0; $must_fail = 0; } my @test; $test[TEST_IDX_ENC_USED ] = $enc_with; $test[TEST_IDX_ENC_SPEC ] = $specify_enc ? $enc : undef; $test[TEST_IDX_BOM ] = $use_bom; $test[TEST_IDX_XML ] = $xml; $test[TEST_IDX_MUST_PASS ] = $must_pass; $test[TEST_IDX_MUST_FAIL ] = $must_fail; $test[TEST_IDX_EXPECT ] = decode($enc_with, encode($enc_with, $payload)); push @{ $tests[-1] }, \@test; } } } } return \@tests; } sub load_module { my ($mod) = @_; $mod =~ s{::}{/}g; $mod .= '.pm'; return eval { require $mod }; } sub get_parser_desc_name { my ($name) = @_; my $ver = $name->VERSION(); return $name . ( defined($ver) ? " $ver" : '' ); } my $xml_simple_desc_name = get_parser_desc_name('XML::Simple'); sub xml_simple_parser { my ($name) = @_; my $parser = XML::Simple->new( ForceArray => 1, KeyAttr => {} ); return [ $name, sprintf('%s (via %s)', get_parser_desc_name($name), $xml_simple_desc_name, ), sub { local $XML::Simple::PREFERRED_PARSER = $name; return $parser->XMLin($_[0]); }, sub { my ($tree) = @_; ref($tree ) eq 'HASH' or return undef; ref($tree->{text}) eq 'ARRAY' or return undef; return $tree->{text}[0]; }, ]; } sub find_parsers { my @parsers; if (!load_module('XML::LibXML')) { warn("warn: XML::LibXML not available\n"); } else { my $parser = XML::LibXML->new(); push @parsers, [ 'XML::LibXML', get_parser_desc_name('XML::LibXML'), sub { $parser->parse_string($_[0]) }, sub { my ($tree) = @_; my @roots = $tree->childNodes() or return undef; my @texts = grep { $_->nodeType() == 1 && $_->nodeName() eq 'text' } $roots[0]->childNodes() or return undef; return $texts[0]->textContent(); }, ]; } if (!load_module('XML::Bare')) { warn("warn: XML::Bare not available\n"); } else { push @parsers, [ 'XML::Bare', get_parser_desc_name('XML::Bare'), sub { my $parser = XML::Bare->new( text => $_[0] ); return $parser->parse(); }, sub { my ($tree) = @_; ref($tree ) eq 'HASH' or return undef; ref($tree->{root} ) eq 'HASH' or return undef; ref($tree->{root}{text}) eq 'HASH' or return undef; return $tree->{root}{text}{value}; }, ]; } if (!load_module('XML::Parser')) { warn("warn: XML::Parser not available\n"); } else { push @parsers, xml_simple_parser('XML::Parser'); } if (!load_module('XML::SAX')) { warn("warn: XML::SAX not available\n"); } else { my @sax = sort map { $_->{Name} } @{ XML::SAX->parsers() }; my %sax = map { $_ => 1 } @sax; my %known = map { $_ => 1 } @KNOWN_SAX_PARSERS; for my $sax (@sax) { warn("info: Discovered new SAX parser $sax\n") if !delete($known{$sax}); load_module($sax) or do { warn("error: Can't load SAX parser $sax\n"); next; }; push @parsers, xml_simple_parser($sax); } for my $known (keys %known) { load_module($known) or do { warn("warn: Known parser $known not installed\n"); next; }; warn("warn: XML::SAX unaware of installed parser $known\n"); push @parsers, xml_simple_parser($known); } } return \@parsers; } sub run_tests { my ($name, $desc_name, $parser, $result_getter, $tests) = @_; my @results; for my $test_grp (@$tests) { push @results, []; for my $test (@$test_grp) { my $xml = $test->[TEST_IDX_XML]; my $expect = $test->[TEST_IDX_EXPECT]; my $parse_err; my $match_err; my $got; my $tree = eval { $parser->($xml) }; if (!defined($tree)) { $parse_err = $@ || '[Unknown]'; } else { $got = $result_getter->($tree); if (!defined($got)) { $parse_err = 'Bad tree'; } elsif ((defined($got) xor defined($expect)) || $got ne $expect ) { $match_err = "got= ".format_str($got ) . "\n" . "expect=".format_str($expect); } } my @result; $result[RESU_IDX_TEST ] = $test; $result[RESU_IDX_PARSE_ERR] = $parse_err; $result[RESU_IDX_MATCH_ERR] = $match_err; $result[RESU_IDX_GOT_STR ] = $got; push @{ $results[-1] }, \@result; } } return \@results; } sub html_from_text { my $s = @_ ? $_[0] : $_; $s =~ s/&/&/g; $s =~ s//>/g; return $s; } sub htmlval_from_text { my $s = @_ ? $_[0] : $_; $s =~ s/&/&/g; $s =~ s//>/g; $s =~ s/"/"/g; $s =~ s/'/'/g; return $s; } sub html_pre_from_text { my ($text) = @_; } sub output_header { print(<<'__EOI__'); Encodings and XML Parsers __EOI__ } sub output_parser_header { my ($parser_name) = @_; print(<<"__EOI__"); __EOI__ } sub output_result { my ($result) = @_; my $test = $result->[RESU_IDX_TEST]; my $h_enc_used = html_from_text($test->[TEST_IDX_ENC_USED]); my $h_enc_spec = $test->[TEST_IDX_ENC_SPEC] || '[none]'; my $h_used_bom = $test->[TEST_IDX_BOM] ? 'Yes' : 'No'; my $hv_status; my $h_result; if ($result->[RESU_IDX_PARSE_ERR] || $result->[RESU_IDX_MATCH_ERR]) { if ($test->[TEST_IDX_MUST_FAIL]) { $hv_status = 'desired_fail'; $h_result = 'Desired Fail = OK
'; } elsif ($test->[TEST_IDX_MUST_PASS]) { $hv_status = 'undesired_fail'; $h_result = 'Undesired Fail = ERR
'; } else { $hv_status = 'acceptable_fail'; $h_result = 'Acceptable Fail = OK
'; } if ($result->[RESU_IDX_PARSE_ERR]) { my $err = $result->[RESU_IDX_PARSE_ERR]; $err = html_from_text($err); # Must leave newlines intact. $err =~ s/\n/
/g; $h_result .= qq{Parse error:
$err
}; } elsif ($result->[RESU_IDX_MATCH_ERR]) { my $err = $result->[RESU_IDX_MATCH_ERR]; $err = html_from_text($err); # Must leave newlines intact. $err =~ s/\n/
/g; $h_result .= qq{Match error:
$err
}; } } else { if ($test->[TEST_IDX_MUST_PASS]) { $hv_status = 'desired_pass'; $h_result = 'Desired Pass = OK
'; } elsif ($test->[TEST_IDX_MUST_FAIL]) { $hv_status = 'undesired_pass'; $h_result = 'Undesired Pass = ERR
'; } else { $hv_status = 'unexpected_pass'; $h_result = 'Unexpected Pass = OK
'; } } print(<<"__EOI__");
$parser_name
Encoding Used Encoding Specified Used BOM Result __EOI__ } sub output_group_header { print(<<'__EOI__');
$h_enc_used $h_enc_spec $h_used_bom $h_result __EOI__ } sub output_group_footer { my ($last) = @_; # I wish I could style TBODY instead of doing this. print(<<'__EOI__') if !$last;
__EOI__ } sub output_parser_footer { my ($parser_name) = @_; print(<<'__EOI__');
__EOI__ } sub output_footer { print(<<'__EOI__'); __EOI__ } { my $parsers = find_parsers(); my $tests = build_tests(); output_header(); for (@$parsers) { my ($name, $desc_name, $parser, $result_getter) = @$_; output_parser_header($desc_name); my $results = run_tests($name, $desc_name, $parser, $result_getter, $tests); for my $result_grp (@$results) { output_group_header(); for my $result (@$result_grp) { output_result($result); } my $last = $result_grp == $results->[-1]; output_group_footer($last); } output_parser_footer(); } output_footer(); }