# example 1: see what perl produces for "UTF-16" output: $ perl -e 'binmode STDOUT, ":encoding(UTF-16)"; print "abc\n"' | od -txC -a 0000000 fe ff 00 61 00 62 00 63 00 0a fe ff nul a nul b nul c nul nl 0000012 # perl is well-behaved: it attaches a BOM when writing UTF-16 to a file handle # example 1 was done on a mac powerbook (big-endian) # ... on an intel box the output would have been: # ff fe 61 00 62 00 63 00 0a 00 # ff fe a nul b nul c nul nl nul # example 2: see how perl reads "UTF-16" input: $ perl -e 'binmode STDOUT, ":encoding(UTF-16)"; print "abc\n"' | perl -e 'binmode STDIN, ":encoding(UTF-16)"; $_ = <>; print' | od -txC -a 0000000 61 62 63 0a a b c nl 0000004 # when reading UTF-16, perl removes the BOM from input and # converts data internally to utf8 (in this example, the # result is just ascii, because there were no wide characters) # example 3: what perl does when byte order is specified in the encoding: $ perl -e 'binmode STDOUT, ":encoding(UTF-16BE)"; print "abc\n"' | od -txC -a 0000000 00 61 00 62 00 63 00 0a nul a nul b nul c nul nl 0000010 # when byte order is specified, perl does not write a BOM # example 4: what perl does when reading data with no BOM: $ perl -e 'binmode STDOUT, ":encoding(UTF-16BE)"; print "abc\n"' | perl -e 'binmode STDIN, ":encoding(UTF-16)"; $_ = <>; print' UTF-16:Unrecognised BOM 61 at -e line 1. # if the reading script set "UTF-16BE" on STDIN, to match how # it was written, it would work correctly. 
####
#!/usr/bin/perl
#
# Open a UTF-16 text file for reading through the correct PerlIO
# encoding layer.
#
# Usage: $0 [-BE|-LE] file.u16
#
#   -BE / -LE   the file has no byte-order mark (BOM); decode it as
#               UTF-16BE / UTF-16LE respectively.
#   (neither)   the file must start with a BOM; the script verifies this
#               up front and dies with a usage hint when the BOM is
#               missing, instead of failing later inside the decoder.
#
# On success the file is open for reading on the filehandle F with an
# ":encoding(...)" layer applied, positioned at the start of the file.

use strict;
use warnings;

my $encoding = 'UTF-16';
my $Usage    = "Usage: $0 [-BE|-LE] file.u16\n";

# An optional leading -BE / -LE switch selects an explicit byte order.
if ( @ARGV and $ARGV[0] =~ /^-([BL]E)$/ ) {
    $encoding .= $1;
    shift;
}

die $Usage unless ( @ARGV == 1 and -f $ARGV[0] );
my $filename = pop @ARGV;

# if user didn't specify byte order, let's check the input file
if ( $encoding eq 'UTF-16' ) {

    # Probe through a separate raw (undecoded) lexical handle, so the real
    # open below starts again at offset 0 and the decoder sees the BOM.
    open( my $probe, '<:raw', $filename ) or die "$filename: $!";
    my $first_short = '';
    my $n = sysread( $probe, $first_short, 2 );
    close $probe;

    # sysread returns undef on error and fewer than 2 for a too-short file.
    die "sysread failed on $filename" unless ( defined $n and $n == 2 );

    # Compare the raw bytes as strings ("eq", not "=="): numeric comparison
    # of two binary strings is always 0 == 0.  Literal byte sequences keep
    # the check independent of the host's native byte order.
    my $has_bom = ( $first_short eq "\xFE\xFF"      # big-endian BOM
                 or $first_short eq "\xFF\xFE" );   # little-endian BOM

    die "$filename has no BOM; please specify byte order\n$Usage"
        unless $has_bom;

    # it's a BOM, and using ":encoding(UTF-16)" is fine
}

# The layer has to be spelled ":encoding(NAME)"; a bare ":UTF-16" is not a
# PerlIO layer.  The handle stays the bareword F because the code that
# follows this point reads from it.
open( F, "<:encoding($encoding)", $filename ) or die "$filename: $!";

# ... and go to work...