chcp
Active code page: 1252
perl -e "print chr 199"
Ç
perl -e "print join ' ', map {sprintf '%02x', $_} unpack 'C*', chr 199"
c7
####
perl -MEncode -e "print Encode::encode_utf8 chr 199"
Ç
perl -MEncode -e "print join ' ', map {sprintf '%02x', $_} unpack 'C*', Encode::encode_utf8 chr 199"
c3 87
####
perl -MEncode -e "print Encode::decode_utf8 \"\xc3\x87\""
Ç
####
Ç foo
####
use strict;
use warnings;
use feature 'say';
#~ use utf8;
use XML::SAX::ParserFactory;
# Turn on autoflush for the currently selected output handle (STDOUT here).
$| = 1;

# XML::SAX::ParserFactory->parser() consults $XML::SAX::ParserPackage, so
# assigning it forces one specific parser implementation. Un-comment exactly
# one of the alternatives below to try a different backend.
#~ $XML::SAX::ParserPackage = "XML::SAX::PurePerl";
#~ $XML::SAX::ParserPackage = "XML::SAX::Expat"; #no xml_decl
#~ $XML::SAX::ParserPackage = "XML::SAX::ExpatXS";
#~ $XML::SAX::ParserPackage = "XML::LibXML::SAX";
$XML::SAX::ParserPackage = 'XML::LibXML::SAX::Parser';
{
    # Minimal SAX handler used for diagnostics: for every chunk of character
    # data it receives, it prints the text, a hex listing of the values
    # unpack 'C*' yields for it, and a Devel::Peek dump of the scalar.
    package MySax;
    use feature 'say';
    use Devel::Peek;

    # Constructor. The handler keeps no state, so the object is an empty
    # hash blessed into the requested class.
    sub new {
        my ($class) = @_;
        my $self = bless {}, $class;
        return $self;
    }

    # Returns $data rendered as space-separated upper-case hex numbers, each
    # padded to at least two digits, one per value produced by unpack 'C*'.
    # An empty string yields an empty string.
    sub hexprint {
        my ($self, $data) = @_;
        my @values = unpack 'C*', $data;
        return join ' ', map { sprintf '%02X', $_ } @values;
    }

    # SAX "characters" callback. $data is the event hash; its Data entry
    # holds the text of the event.
    sub characters {
        my ($self, $data) = @_;

        # Copy into a lexical so Dump() inspects this variable itself.
        my $text = $data->{Data};

        say "characters for elt: $text";
        say 'bytes for elt: ' . $self->hexprint($text);
        Dump($text);
    }
}
# Driver: build the handler, obtain a parser from the factory (which honours
# $XML::SAX::ParserPackage), report which parser class was chosen, then parse.
# Direct method call instead of the indirect-object form "new MySax", which
# perlobj discourages because it can be mis-parsed.
my $handler = MySax->new;
my $parser  = XML::SAX::ParserFactory->parser(Handler => $handler);
say 'parser is ' . ref $parser;

# The first command-line argument, when present, names the XML file to parse.
# Use the same defined-ness test for the announcement and for the parse so
# that a file literally named "0" is reported as well as parsed.
my $xml_file = $ARGV[0];
say "file: $xml_file" if defined $xml_file;

# With no argument, fall back to the script's own DATA handle.
$parser->parse_file($xml_file // *DATA);
__DATA__
####
perl sax_utf.pl utf8-1.xml
parser is XML::LibXML::SAX::Parser
file: utf8-1.xml
characters for elt: Ç foo
bytes for elt: C7 20 66 6F 6F
SV = PV(0x288c658) at 0x233d2e8
REFCNT = 1
FLAGS = (PADMY,POK,IsCOW,pPOK,UTF8)
PV = 0x2b28228 "\303\207 foo"\0 [UTF8 "\x{c7} foo"]
CUR = 6
LEN = 10
COW_REFCNT = 1
####
# Variant of the "characters" SAX callback: the event text is first encoded
# to UTF-8 with Encode::encode_utf8, and the encoded result is what gets
# printed, hex-listed and dumped.
sub characters {
    use Encode;
    my ($self, $data) = @_;

    # Keep the encoded result in a lexical so Dump() inspects this variable.
    my $octets = Encode::encode_utf8($data->{Data});

    say "characters for elt: $octets";
    say 'bytes for elt: ' . $self->hexprint($octets);
    Dump($octets);
}
####
characters for elt: Ç foo
bytes for elt: C3 87 20 66 6F 6F
SV = PV(0x28ba328) at 0x236d2b8
REFCNT = 1
FLAGS = (PADMY,POK,IsCOW,pPOK)
PV = 0x2b548b8 "\303\207 foo"\0
CUR = 6
LEN = 10
COW_REFCNT = 1
####
parser is XML::LibXML::SAX::Parser
file: utf8-2.xml
Wide character in say at sax_utf.pl line 36.
characters for elt: Ç foo €
bytes for elt: C7 20 66 6F 6F 20 20AC
SV = PV(0x2a61748) at 0x250ade8
REFCNT = 1
FLAGS = (PADMY,POK,IsCOW,pPOK,UTF8)
PV = 0x2cefc98 "\303\207 foo \342\202\254"\0 [UTF8 "\x{c7} foo \x{20ac}"]
CUR = 10
LEN = 12
COW_REFCNT = 1
####
parser is XML::LibXML::SAX::Parser
file: utf8-2.xml
characters for elt: Ç foo €
bytes for elt: C3 87 20 66 6F 6F 20 E2 82 AC
SV = PV(0x2991768) at 0x243ade8
REFCNT = 1
FLAGS = (PADMY,POK,IsCOW,pPOK)
PV = 0x2c1fc98 "\303\207 foo \342\202\254"\0
CUR = 10
LEN = 12
COW_REFCNT = 1