in reply to Re^2: CGI hidden params vs. character encoding
in thread CGI hidden params vs. character encoding

This problem is unrelated to the use of "decode()" shown in the OP script here

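You're right. I thought binmode($untrusted_fh, ':utf8') was the same as decode('utf8', $untrusted), but it actually behaves like _utf8_on($untrusted): apart from emitting a warning, it leaves the malformed bytes in place and simply turns on the UTF8 flag, as the test below shows.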

# Compares five ways of turning the same malformed UTF-8 byte string into a
# Perl string, dumping the internal representation of each result to STDERR:
#
#   1. reading raw bytes and calling Encode::_utf8_on on them
#   2. reading through the ':utf8' PerlIO layer
#   3. reading through the ':encoding(utf8)' PerlIO layer
#   4. Encode::decode('utf8', ...)
#   5. Encode::decode('utf-8', ...)
#
# A scratch file is created in the current directory and removed at the end.
use strict;
use warnings;

use Encode qw( decode _utf8_on );
use Devel::Peek;

# 0xC2 is a UTF-8 start byte; "A" (0x41) is not a continuation byte, so this
# two-byte sequence is deliberately malformed UTF-8.
my $bytes = "\x{C2}A";

# Scratch file shared by the three file-based demonstrations.
my $temp_file = 'temp';

# Write the malformed bytes verbatim. The close is checked explicitly because
# buffered write errors only surface when the handle is flushed.
{
    open(my $fh, '>:raw', $temp_file)
        or die "Can't open '$temp_file' for writing: $!";
    print {$fh} $bytes
        or die "Can't write to '$temp_file': $!";
    close($fh)
        or die "Can't close '$temp_file' after writing: $!";
}

# 1. Raw read, then force the UTF8 flag on without any decoding.
{
    print STDERR "_utf8_on\n";
    print STDERR "--------\n";

    open(my $fh, '<:raw', $temp_file)
        or die "Can't open '$temp_file' for reading: $!";
    _utf8_on(my $chars = <$fh>);
    Dump($chars);
}

print STDERR "\n";

# 2. Read through the ':utf8' layer.
{
    print STDERR "binmode ':utf8'\n";
    print STDERR "---------------\n";

    open(my $fh, '<:utf8', $temp_file)
        or die "Can't open '$temp_file' for reading: $!";
    my $chars = <$fh>;
    Dump($chars);
}

print STDERR "\n";

# 3. Read through the ':encoding(utf8)' layer.
{
    print STDERR "binmode ':encoding(utf8)'\n";
    print STDERR "-------------------------\n";

    open(my $fh, '<:encoding(utf8)', $temp_file)
        or die "Can't open '$temp_file' for reading: $!";
    my $chars = <$fh>;
    Dump($chars);
}

print STDERR "\n";

# 4. Decode the in-memory bytes using the encoding name 'utf8'.
{
    print STDERR "decode 'utf8'\n";
    print STDERR "-------------\n";

    my $chars = decode('utf8', $bytes);
    Dump($chars);
}

print STDERR "\n";

# 5. Decode the in-memory bytes using the encoding name 'utf-8'.
{
    print STDERR "decode 'utf-8'\n";
    print STDERR "--------------\n";

    my $chars = decode('utf-8', $bytes);
    Dump($chars);
}

# Clean up the scratch file; a failure here is reported but is not fatal.
unlink($temp_file)
    or warn "Can't remove '$temp_file': $!";
_utf8_on
--------
SV = PV(0x226154) at 0x1853e20
  REFCNT = 1
  FLAGS = (PADBUSY,PADMY,POK,pPOK,UTF8)
  PV = 0x18307d4 "\302A"\0Malformed UTF-8 character (unexpected non-continuation byte 0x41, immediately after start byte 0xc2) in subroutine entry at 688773.pl line 19, <$fh> line 1.
 [UTF8 "\x{0}"]
  CUR = 2
  LEN = 80

binmode ':utf8'
---------------
utf8 "\xC2" does not map to Unicode at 688773.pl line 28, <$fh> line 1.
SV = PV(0x226154) at 0x18dcdc4
  REFCNT = 1
  FLAGS = (PADBUSY,PADMY,POK,pPOK,UTF8)
  PV = 0x183083c "\302A"\0Malformed UTF-8 character (unexpected non-continuation byte 0x41, immediately after start byte 0xc2) in subroutine entry at 688773.pl line 29, <$fh> line 1.
 [UTF8 "\x{0}"]
  CUR = 2
  LEN = 80

binmode ':encoding(utf8)'
-------------------------
utf8 "\xC2" does not map to Unicode at 688773.pl line 38.
SV = PV(0x22616c) at 0x18dce6c
  REFCNT = 1
  FLAGS = (PADBUSY,PADMY,POK,pPOK,UTF8)
  PV = 0x18f2964 "\\xC2A"\0 [UTF8 "\\xC2A"]
  CUR = 5
  LEN = 80

decode 'utf8'
-------------
SV = PV(0x22616c) at 0x18dce9c
  REFCNT = 1
  FLAGS = (PADBUSY,PADMY,POK,pPOK,UTF8)
  PV = 0x182d0cc "\357\277\275A"\0 [UTF8 "\x{fffd}A"]
  CUR = 4
  LEN = 8

decode 'utf-8'
--------------
SV = PV(0x18d1688) at 0x18dcefc
  REFCNT = 1
  FLAGS = (PADBUSY,PADMY,POK,pPOK,UTF8)
  PV = 0x182ce5c "\357\277\275A"\0 [UTF8 "\x{fffd}A"]
  CUR = 4
  LEN = 8