# ciconv.pl - example script on detecting encoding set wrong
#
# This is a simple iconv replacement that tries to detect when the user
# accidentally sets the input encoding wrong, ie. sets a byte encoding
# when the input is actually utf-8 or the other way.  This can't be done
# completely reliably, but this may still be useful to catch simple errors.
# This should work in perl 5.8 or newer.
#
# Usage: perl ciconv.pl -f inputencoding -t outputencoding files
#

use warnings;
use strict;
use Encode;
use Getopt::Long;

# Encodings used when -f / -t are not given on the command line.
our $default_input_encoding  = "iso-8859-1";
our $default_output_encoding = "iso-8859-1";

# Shared input state:
#   $INFH             - handle currently being read (undef before first read)
#   $input_encoding   - encoding name as given by the user (for messages)
#   $input_postdecobj - Encode object used to decode each line after reading,
#                       or undef when decoding is done by an I/O layer
#   $input_declayer   - ":encoding(...)" layer pushed on the handle, or undef
#                       when lines are decoded after reading
#   $input_isutf8     - true when $input_postdecobj decodes utf-8
#   $input_decwarn    - nonzero once a mismatch warning was issued for the
#                       current input file
our ($INFH, $input_encoding, $input_postdecobj, $input_declayer,
    $input_isutf8, $input_decwarn);

# _probe_decodes_to($obj, $bytes, $expected)
#
# Returns true iff decoding $bytes with the Encode object $obj yields exactly
# $expected.  The decode runs inside eval so that a decoder which throws on
# the probe counts as "does not match" instead of aborting the script.
sub _probe_decodes_to {
    my ($obj, $bytes, $expected) = @_;
    my $decoded = eval { $obj->decode($bytes) };
    return defined($decoded) && $decoded eq $expected;
}

# set_input_encoding($enc)
#
# Looks up the encoding named $enc and configures the input globals.
# Dies if the encoding is unknown.
#
# Encodings that decode a plain ascii probe string to itself are decoded
# line by line after reading, which lets getinputline() inspect the raw
# bytes for signs of a wrongly chosen encoding.  Everything else is decoded
# by an :encoding() layer on the handle and gets no such check.
sub set_input_encoding {
    my ($enc) = @_;
    my $obj = Encode::find_encoding($input_encoding = $enc)
        or die qq(error: unknown input encoding: "$enc");

    my $ascii_probe = "Egyenest oda fog folyamodni\n";
    if (_probe_decodes_to($obj, $ascii_probe, $ascii_probe)) {
        # most ascii-based encodings, eg. 8859-*, cp-125*, utf-8, etc
        $input_declayer   = undef;
        $input_postdecobj = $obj;

        # The encoding counts as utf-8 when it round-trips a string with
        # non-latin-1 characters that was encoded as utf-8.
        my $wide_probe = "Ali gy\x{151}zelem-\x{fc}nnepe van ma!\x{201d}\n";
        $input_isutf8 =
            _probe_decodes_to($obj, encode_utf8($wide_probe), $wide_probe);
    }
    else {
        # wide character encodings like utf-16, also ebcdic or other
        # encodings not related to ascii at all
        $input_declayer   = ":encoding(" . $obj->name . ")";
        $input_postdecobj = undef;
    }
}

# getinputline()
#
# Returns the next decoded input line, taken from the files named in @ARGV
# in order, or from stdin when no file was named.  Returns undef when all
# input is exhausted or on a read error (after a warning).  $ARGV holds the
# name of the current input ("-" for stdin).
#
# When lines are decoded after reading, the raw bytes are checked with a
# heuristic and a warning is printed (at most once per file from @ARGV) if
# the data does not look like the selected encoding.
sub getinputline {
    if (!$INFH && !@ARGV) {
        # No file arguments at all: read from a dup of stdin.
        $ARGV = "-";
        open $INFH, "<&=", *STDIN or die "error fduping stdin: $!";
        if ($input_declayer) {
            binmode $INFH, $input_declayer
                or die "error: cannot set input encoding io layer $input_declayer to stdin fdup";
        }
    }

    # Advance to the next input file whenever the current one is exhausted.
    while (!$INFH || eof($INFH)) {
        if (@ARGV) {
            open $INFH, "<", ($ARGV = shift @ARGV)
                or die "error opening input file \"$ARGV\": $!";
            if ($input_declayer) {
                binmode $INFH, $input_declayer
                    or die "error: cannot set input encoding io layer $input_declayer";
            }
            $input_decwarn = 0;    # allow one warning per file
        }
        else {
            return undef;
        }
    }

    my $line = <$INFH>;
    if (!defined($line)) {
        # eof was ruled out above, so an undefined read is an error
        warn "read error reading input file $ARGV: $!";
        return undef;
    }

    if ($input_postdecobj) {
        if ($input_isutf8) {
            # A single high byte with ascii (or line start) before it and
            # ascii after it cannot be part of a valid utf-8 sequence.
            if ($line =~ /(?:[\x00-\x7f]|\A)[\x80-\xff][\x00-\x7f]/
                && !$input_decwarn++)
            {
                warn "warning: input is not utf-8 encoded near $ARGV:$., currently decoding using character encoding $input_encoding, make sure you set the correct input encoding with the -f switch";
            }
        }
        else {
            # A utf-8 lead byte followed by a continuation byte is an
            # unlikely pair in text written in a single-byte encoding.
            if ($line =~ /[\xc2-\xdf\xe2][\x80-\xbf]/ && !$input_decwarn++) {
                warn "warning: input seems to be probably utf-8 encoded near $ARGV:$., currently decoding using character encoding $input_encoding, make sure you set the correct input encoding with the -f switch";
            }
        }
        $line = $input_postdecobj->decode($line, Encode::FB_DEFAULT());
    }
    $line;
}

# cimain()
#
# Entry point: parses -f / -t (and their long forms) from @ARGV, sets up
# the input decoding and an :encoding() layer on STDOUT, then copies every
# input line to STDOUT.  Dies on bad command line switches or unknown
# encodings.
sub cimain {
    my ($inputenc, $outputenc);
    Getopt::Long::Configure qw"gnu_getopt prefix_pattern=(--|-)";
    GetOptions(
        "inputencoding|input-encoding|from-code|f=s", \$inputenc,
        "outputencoding|output-encoding|to-code|t=s",  \$outputenc,
    ) or die "usage: perl ciconv.pl -f inputencoding -t outputencoding files\n";

    $inputenc ||= $default_input_encoding;
    set_input_encoding($inputenc);

    $outputenc ||= $default_output_encoding;
    my $oobj = Encode::find_encoding($outputenc)
        or die qq(error: unknown output encoding: "$outputenc");
    my $olayer = ":encoding(" . $oobj->name . ")";
    binmode STDOUT, $olayer
        or die "error: cannot set output encoding layer $olayer: $!";

    # Test definedness, not truth: a final line consisting of "0" without a
    # trailing newline is false and must not end the copy early.
    while (defined(my $line = getinputline())) {
        print $line;
    }
}

cimain();

__END__