If you're using a recent version of DBD::Pg, you shouldn't need to set the parameter at all. The only way I was able to get the wrong string back was to set it to 0 without setting the binmode of STDOUT. Try experimenting with the following script:
#!/usr/bin/perl
# Round-trips a Cyrillic string through PostgreSQL once for each value of
# pg_enable_utf8 (1, -1, 0) and prints whatever comes back, so the effect
# of the setting on the returned string can be compared.
# The string literals below are only correct if this file is saved as UTF-8.
use warnings;
use strict;
use utf8;
use feature qw{ say };
use DBI;
use Encode;

for my $utf8 (1, -1, 0) {
    # For a true setting (1 or -1) insert a character string; for 0 insert
    # the same text already encoded to UTF-8 bytes.
    my $string = $utf8 ? 'Кирилл цагаан толгой'
                       : encode('UTF-8', 'Кирилл цагаан толгой');

    # connect() returns undef on failure. Without this check the hash
    # assignment below would autovivify $db and the script would only die
    # later, on ->do, with an error that hides the real cause.
    my $db = 'DBI'->connect('dbi:Pg:dbname=postgres', "", "")
        or die "Can't connect to database: $DBI::errstr";

    # Apply the setting under test and echo it as a heading for this pass.
    say $db->{pg_enable_utf8} = $utf8;

    # Start every pass from a table that holds exactly one row.
    $db->do('CREATE TABLE IF NOT EXISTS cyr (t TEXT)');
    $db->do('DELETE FROM cyr');

    my $insert = $db->prepare('INSERT INTO cyr (t) VALUES (?)');
    $insert->execute($string);

    my $select = $db->prepare('SELECT t FROM cyr');
    $select->execute;

    # Match the output layer to the setting: encode on output when the
    # setting is true, pass bytes through untouched when it is 0.
    binmode *STDOUT, $utf8 ? ':encoding(UTF-8)' : ':raw';
    while (my @row = $select->fetchrow_array) {
        say @row;
    }
}
The script needs to be saved as UTF-8.
map{substr$_->[0],$_->[1]||0,1}[\*||{},3],[[]],[ref qr-1,-,-1],[{}],[sub{}^*ARGV,3]
| [reply] [d/l] |
Output is three lines of Cyrillic (I don't know how to get this website to accept Cyrillic chars).
Same thing when I write to a file instead of STDOUT. I think the database transaction is not the problem. When I run Lingua::Identify on the items in the DB and then compare the detected language to the item in the DB, it's getting the languages correct. It couldn't do that if the DB was returning non-Cyrillic characters, right? This suggests the problem is in the output. But I am setting binmode on the output file with the correct encoding, just like in your example. I also tried declaring utf-8 encoding in the open statement, but am still getting the ANSI output. This is perplexing.
| [reply] |
| [reply] [d/l] |
#!/usr/bin/perl --
# Shows what JSON::XS and JSON::PP produce for a string containing wide
# characters under the default, utf8(1) and ascii(1) settings, and what
# ends up on disk / comes back when that output is written and read with
# Path::Tiny's raw versus utf8 helpers.  Every experiment is run twice,
# first with JSON::XS and then with JSON::PP.
# The output of one run is recorded after __END__.
use strict;
use warnings;
use Data::Dump qw/ dd /;
use Path::Tiny qw/ path /;
use JSON::XS();
use JSON::PP();

# One-element list: BOM, a camel, some ASCII text, another camel.
my @humps = "\x{FEFF}\x{1F42A} one hump two humps \x{1F42B}";

# 1. Dump the encoder output directly: default (dumped as characters in the
#    recorded run), utf8(1) (dumped as UTF-8 bytes), ascii(1) (\uXXXX escapes).
dd( JSON::XS->new->pretty(1)->encode( \@humps ) );
dd( JSON::PP->new->pretty(1)->encode( \@humps ) );
dd( JSON::XS->new->utf8(1)->pretty(1)->encode( \@humps ) );
dd( JSON::PP->new->utf8(1)->pretty(1)->encode( \@humps ) );
dd( JSON::XS->new->ascii(1)->pretty(1)->encode( \@humps ) );
dd( JSON::PP->new->ascii(1)->pretty(1)->encode( \@humps ) );

# 2. Default output written raw, read back raw.  In the recorded run this
#    triggers "Wide character in print" and the file holds UTF-8 bytes.
print "#" x 6, "\n";
path( 'deleteme.txt')->spew_raw( JSON::XS->new->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_raw );
path( 'deleteme.txt')->spew_raw( JSON::PP->new->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_raw );

# 3. Default output written with spew_utf8, read back both ways.  Recorded
#    run: slurp_raw shows UTF-8 bytes, slurp_utf8 shows the characters.
print "#" x 6, "\n";
path( 'deleteme.txt')->spew_utf8( JSON::XS->new->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_raw );
dd( path( 'deleteme.txt')->slurp_utf8 );
path( 'deleteme.txt')->spew_utf8( JSON::PP->new->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_raw );
dd( path( 'deleteme.txt')->slurp_utf8 );

# 4. utf8(1) output written with spew_utf8.  Recorded run: slurp_utf8
#    returns UTF-8 bytes rather than the original characters, i.e. the
#    text was encoded twice on the way out.
print "#" x 6, "\n";
path( 'deleteme.txt')->spew_utf8( JSON::XS->new->utf8(1)->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_utf8 );
path( 'deleteme.txt')->spew_utf8( JSON::PP->new->utf8(1)->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_utf8 );

# 5. ascii(1) output written with spew_utf8.  Recorded run: the \uXXXX
#    escapes come back unchanged.
print "#" x 6, "\n";
path( 'deleteme.txt')->spew_utf8( JSON::XS->new->ascii(1)->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_utf8 );
path( 'deleteme.txt')->spew_utf8( JSON::PP->new->ascii(1)->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_utf8 );

# 6. utf8(1) output written raw.  Recorded run: slurp_utf8 returns the
#    original characters, so the file was encoded exactly once.
print "#" x 6, "\n";
path( 'deleteme.txt')->spew_raw( JSON::XS->new->utf8(1)->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_utf8 );
path( 'deleteme.txt')->spew_raw( JSON::PP->new->utf8(1)->pretty(1)->encode( \@humps ) );
dd( path( 'deleteme.txt')->slurp_utf8 );

# Scratch file is left in place for inspection; uncomment to clean up.
# path( 'deleteme.txt')->remove;
__END__
"[\n \"\x{FEFF}\x{1F42A} one hump two humps \x{1F42B}\"\n]\n"
"[\n \"\x{FEFF}\x{1F42A} one hump two humps \x{1F42B}\"\n]\n"
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
"[\n \"\\ufeff\\ud83d\\udc2a one hump two humps \\ud83d\\udc2b\"\n]\
+n"
"[\n \"\\ufeff\\ud83d\\udc2a one hump two humps \\ud83d\\udc2b\"\n]\
+n"
######
Wide character in print at C:/perl/site/lib/Path/Tiny.pm line 1848.
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
Wide character in print at C:/perl/site/lib/Path/Tiny.pm line 1848.
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
######
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
"[\n \"\x{FEFF}\x{1F42A} one hump two humps \x{1F42B}\"\n]\n"
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
"[\n \"\x{FEFF}\x{1F42A} one hump two humps \x{1F42B}\"\n]\n"
######
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
"[\n \"\xEF\xBB\xBF\xF0\x9F\x90\xAA one hump two humps \xF0\x9F\x90\
+xAB\"\n]\n"
######
"[\n \"\\ufeff\\ud83d\\udc2a one hump two humps \\ud83d\\udc2b\"\n]\
+n"
"[\n \"\\ufeff\\ud83d\\udc2a one hump two humps \\ud83d\\udc2b\"\n]\
+n"
######
"[\n \"\x{FEFF}\x{1F42A} one hump two humps \x{1F42B}\"\n]\n"
"[\n \"\x{FEFF}\x{1F42A} one hump two humps \x{1F42B}\"\n]\n"
"🐪 one hump two humps 🐫"
"🐪 one hump two humps 🐫" | [reply] [d/l] |
After some more doc-diving, I discovered the problem is that if you use encode_json (which is equivalent to $json_text = JSON::XS->new->utf8->encode ($perl_scalar)) AND set the file encoding to utf8, then the text gets double-encoded. When I do this:
# Write $sample to disk as JSON.  encode_json already returns UTF-8 encoded
# bytes, so the handle must NOT get an :encoding(UTF-8) layer as well --
# that would encode the text a second time.
open(my $out, '>', 'twitter-non-en.json') or die "Can't open output: $!";
#binmode $out, ':encoding(UTF-8)';
print {$out} encode_json($sample);
# Buffered write errors only surface when the handle is closed.
close($out) or die "Can't close output: $!";
The JSON contains the expected Cyrillic text. Thanks for all the input. | [reply] [d/l] [select] |