The hidden form variable sounds like a great strategy, thank you for the tip. I've been playing with this concept today, but I'm not getting useful results so far.
If I use the Unicode smiley character (\x{263a}) as my hidden form field value, I get back a character sequence like "☺", or \xe2\x98\xba in Latin-1. I can detect that with the regex /^\xe2\x98\xba$/. However, even when the default encoding for Apache is UTF-8, the content-type charset of the resulting page is utf-8, the script itself is saved as UTF-8 (with no BOM, or perl gives an error like "(8)Exec format error: exec of '/var/www/cgi-bin/char2.cgi' failed"), my browser sets itself to UTF-8 encoding as it should, and I copy and paste text from a document known to be in UTF-8, the input is still detected as Latin-1 and not UTF-8.
So can I ask what string you use as a detection mechanism? And what are you using to match the mis-converted string in other encodings? I'm interested in Win1252 and Latin 1, if that makes any difference. My current source is below.
Thank You,
Troy
#!/usr/bin/perl
#
# char2.cgi - form-encoding test page.
#
# Serves a UTF-8 page containing a form with a hidden "enc_sniffer" field
# whose value is U+263A (WHITE SMILING FACE).  When the form is posted back,
# the octets received for that field show which encoding the browser used to
# submit the form, and the text area content is converted accordingly and
# shown both as submitted and as HTML entities.
#
# NOTE: this file must be saved as UTF-8 *without* a BOM, otherwise the
# "#!" line is not recognised and the server cannot exec the script.
use utf8;
use strict;
use warnings;
use Unicode::String qw(utf8 latin1 utf16);
use Encode;
use CGI;
use HTML::Entities;
require Unicode::Map8;

# Probe character placed in the hidden form field.  It is not representable
# in Latin-1 or Windows-1252, so its submitted form is unambiguous.
my $smiley = "\x{263a}";

my $cgiq = CGI->new;

# Missing on the first (GET) request, so default to the empty string rather
# than matching and printing undef.
my $qtext = $cgiq->param('textInput');
$qtext = '' unless defined $qtext;
my $sniffer = $cgiq->param('enc_sniffer');
$sniffer = '' unless defined $sniffer;

binmode(STDOUT, ":utf8");
print $cgiq->header(-charset => 'utf-8');
print <<'END_HEAD';
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Character conversion test</title>
</head>
<body bgcolor="#ffffff">
END_HEAD

my $encoded = '';
if ($sniffer =~ /\A\x{263a}\z/) {
    # The parameter already arrived as a decoded character string, so the
    # text area content needs no further decoding.
    print "<p>Unicode encoding detected.</p>\n";
    $encoded = encode_entities($qtext);
}
elsif ($sniffer =~ /\A\xe2\x98\xba\z/) {
    # E2 98 BA is the UTF-8 encoding of U+263A: the browser submitted UTF-8
    # and the parameters arrived as raw octets.  (Reading these octets as
    # Latin-1 is what produced the earlier "Latin1" misdetection.)
    print "<p>UTF-8 encoding detected.</p>\n";
    $qtext   = decode('UTF-8', $qtext);
    $encoded = encode_entities($qtext);
}
elsif ($sniffer =~ /\A(?:&\#9786;|\?)\z/) {
    # NOTE(review): a browser submitting in a single-byte charset cannot
    # encode U+263A; it is expected to send the numeric character reference
    # "&#9786;" (or, in older browsers, "?") instead -- confirm against the
    # browsers you need to support.  Browsers treat ISO-8859-1 as
    # Windows-1252, so the two are handled as Windows-1252 here.
    print "<p>Windows 1252 encoding detected.</p>\n";
    $qtext   = TransWin1252($qtext);
    $encoded = encode_entities($qtext);
}

print ' enc_sniffer: ' . $smiley;
print "\n<p></p>\n";

# The submitted text is untrusted: escape the HTML metacharacters before
# echoing it, but leave other characters alone so they display as received.
print ' Text submitted:<br>' . encode_entities($qtext, q{<>&"'}) . "\n<p></p>\n";
print ' Encoded:<br>' . $encoded . "\n<p></p>\n";

print <<"END_FORM";
<form action="/cgi-bin/char2.cgi" method="post" enctype="multipart/form-data">
<input type="hidden" name="enc_sniffer" value="$smiley">
<textarea name="textInput" rows="25" cols="72"></textarea><p>
<input type="submit">
</form>
</p>
<p></p>
</body>
</html>
END_FORM
exit;

# TransWin1252($bytes)
#
# Converts a string of Windows-1252 octets into the corresponding Unicode
# character string and returns it; the argument is not modified.
#
# Only 0x80-0x9F need translating: for 0x00-0x7F and 0xA0-0xFF the
# Windows-1252 byte value equals the Unicode code point.  The five byte
# values Windows-1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) are
# replaced with a space.  Returns undef if given undef.
sub TransWin1252 {
    my ($s) = @_;
    return $s unless defined $s;

    # Built inside the sub because the file-level code above ends with
    # "exit", so a file-scoped table placed down here would never be
    # initialised at run time.
    my %code_point_for = (
        0x80 => 0x20AC, 0x82 => 0x201A, 0x83 => 0x0192, 0x84 => 0x201E,
        0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02C6,
        0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039, 0x8C => 0x0152,
        0x8E => 0x017D, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
        0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
        0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
        0x9C => 0x0153, 0x9E => 0x017E, 0x9F => 0x0178,
    );

    # A single pass, so a replacement can never be re-matched by a later
    # substitution.
    $s =~ s{([\x80-\x9F])}{
        my $code_point = $code_point_for{ ord $1 };
        defined $code_point ? chr $code_point : ' ';
    }ge;

    return $s;
}
In reply to Re^2: Encoding confusion with CGI forms
by davistv
in thread Encoding confusion with CGI forms
by davistv
| For: | Use: | ||
| & | &amp; | ||
| < | &lt; | ||
| > | &gt; | ||
| [ | &#91; | ||
| ] | &#93; |