So apparently the decoder object created by Encode::Guess remembers that it's UTF-16 but forgets immediately whether it was big-endian or little-endian? That seems broken.
Looking at the code, it "forgets" if it finds a BOM, it remembers otherwise. That's reasonable since it would cause $enc->encode to add the BOM if and only if there was one present in the sample.
You should probably incorporate into your code the couple of lines of E::G you actually do use. Then , you can change this behaviour if you don't like it.
| [reply] [Watch: Dir/Any] [d/l] |
I modified my figure_out_encoding() to recreate the decoder if Encode::Guess said the file was UTF-16 or UTF-32. I haven't exactly taken your advice to extract the bits of code I need from Encode::Guess(), though I yet may -- and it started working for big-endian files. For little-endian files I had to make another change in my add_paras() function to rejoin the lines I read with the <> operator in :raw mode and then re-split them with an appropriate 16-bit or 32-bit newline character. The modified functions are as follows; the whole script they come from is at the link cited in my earlier messages. Thanks for your help.
# pick a random file, snag a random subset of paragraphs from it
sub add_paras {
my $legalese = 0;
my $filename = $filenames[ int rand scalar @filenames ];
if ( not open FILE, $filename ) {
print STDERR qq(can't open "$filename" for reading\n);
if ( ++$err_count >= $max_errs ) {
die "too many file open errors\n";
}
return;
}
binmode( FILE, ':raw' );
my ( $dev, $inode, $mode, $nlink, $uid, $gid, $rdev,
$size,
$atime, $mtime, $ctime, $blocksize, $blocks )
= stat FILE;
if ( $size == 0 ) {
return;
}
print "getting paragraphs from $filename\n" if $debug;
# we can't know how to decode the file until we examine it.
# slurp the file first:
my @lines = <FILE>;
my $decoder = &figure_out_encoding( \@lines );
if ( not $decoder ) {
# figure_out_encoding() should already have logged err msg if nece
+ssary
return;
}
if ( $decoder->name =~ /UTF-(16|32)LE/ ) {
# split was wrong in raw mode, redo it now that we know what
# a little-endian newline really looks like
my $whole_file = join "", @lines;
if ( $decoder->name eq "UTF-16LE" ) {
@lines = split /\n\0/, $whole_file;
} else {
@lines = split /\n\0\0\0/, $whole_file;
}
print "rejoined and resplit because of littleendian format\n" if $
+debug;
}
# we want the number of pararaphs taken from each file to be rough
+ly
# the same, whether it's a short one or a long one.
my $prange = $size / SAVE_RATE;
my $paras_added = 0;
my $saved_indices = 0;
my $in_para = 0;
my $saving = 0;
my $current_para = "";
for ( my $i = 0; $i < scalar @lines; $i++ ) {
#my $line = $decoder->decode( $lines[$i] );
# don't die if decode misbehaves:
my $line = eval { $decoder->decode( $lines[$i] ) };
if ( $@ ) {
my $eval_err = chomp $@;
print qq(Encode->decode() failed horribly: "$eval_err"\n) i
+f $debug;
pop @indices;
return;
}
my $linenum = $i+1;
# this used to happen with LE charsets before I added the rejoin/r
+eplit
# code above. keep it in case something else occasionally goes wr
+ong
# with decode.
if ( not utf8::valid($line) ) {
if ( $debug ) {
print "have invalid utf8 after decoding at $filename line $lin
+enum:\n";
print $line;
print "\n";
}
pop @indices;
return;
}
$line =~ s/\r//g; # leave newlines alone but strip CR
if ( $line !~ m/^\s*$/ ) { # non-blank
if ( $line =~ m/(the\s+)*Project\s+Gutenberg.*e-?(text|book)/i
or $line =~ m/START.*SMALL\s+PRINT/i
or $line =~ m/END OF.*PROJECT\s+GUTENBERG E-?(TEXT|BOOK)/
or $line =~ m/START.*FULL LICENSE/) {
$legalese = 1;
print "start skipping legalese on line $linenum \n" if $debu
+g >= 3;
next;
}
if ( $in_para == 0 ) {
if ( (not $legalese) and (rand $prange <= 1) ) {
push @indices, "lines $linenum+ of $filename\n\n";
$saved_indices++;
$saving = 1;
}
}
$in_para = 1;
if ( $line =~ m/\END.*SMALL\s+PRINT/
or $line =~ m/START OF THE PROJECT\s+GUTENBERG.*E-?(BOOK|TEXT)
+/
or $line =~ m/END.*FULL LICENSE/) {
print "stop skipping legalese on line $linenum \n" if $debug
+ >= 3;
$legalese = 0;
}
} else { # blank line signaling end of paragraph
if ( $saving ) {
if ( $handle_html && $filename =~ m/\.html?$|\.php$/ ) {
if ( $current_para = &strip_html_tags( $current_para ) ) {
push @paras, $current_para . "\n\n";
$paras_added++;
} else {
# there's nothing left after stripping HTML tags
# so get rid of the filename/line number we saved for this
+ para
pop @indices;
}
} else {
push @paras, $current_para . "\n\n";
$paras_added++;
}
}
$current_para = "";
$in_para = 0;
$saving = 0;
}
if ( $saving ) {
$current_para .= $line;
}
}
if ( $saving ) {
if ( $handle_html && $filename =~ m/\.html?$/ ) {
$current_para = &strip_html_tags( $current_para );
}
$current_para .= "\n\n";
push @paras, $current_para;
$current_para = "";
$paras_added++;
}
close FILE;
if ( $debug ) {
push @paras_got_counts, $paras_added;
print "added $paras_added paragraphs and $saved_indices indices fr
+om $filename of $size bytes\n";
print "now have " . scalar @paras . " paragraphs in memory\n";
print "\tand " . scalar @indices . " filename/line number indices\
+n";
}
if ( $paras_added != $saved_indices ) {
print "file $filename charset "
. ( defined $decoder
? ( ref $decoder ? $decoder->name : $decoder )
: "(undefined)" )
. "\$saved_indices = $saved_indices, \$paras_added = $paras_a
+dded\n";
die "assertion failed: $paras_added must be equal to $saved_indice
+s\n";
}
}
#....
sub figure_out_encoding {
my $arr_ref = shift;
if ( not ref $arr_ref eq "ARRAY" or scalar @$arr_ref == 0 ) {
die "arg to figure_out_encoding() should be ref to non-empty arra
+y";
}
# do an Encode::Guess on the join of all the lines
# then if its results are ambiguous, do further heuristics
# first save the possible byte order mark because Encode::Guess is
# too lame to specify UTF-16BE or UTF16LE, it just says UTF-16 so
# the resulting decoder obj then fails if actually used on a
# random string form the middle of the file
# haven't actually had a problem yet but I suspect UTF-32 could
# have the same prob
my $first_two_bytes = substr( $$arr_ref[0], 0, 2 );
my $first_four_bytes = substr( $$arr_ref[0], 0, 4 );
my $content = join '', @$arr_ref;
my $cp437_letters = 0;
my $latin1_letters = 0;
my $decoder = eval { Encode::Guess->guess( $content ); };
if ( $@ ) {
my $eval_err = chomp $@;
print qq($_: Encode::Guess->guess() failed horribly: "$eval_err"\n
+);
return undef;
}
if ( ref $decoder ) {
print "appears to be " . $decoder->name . "\n" if $debug;
if ( $decoder->name eq "UTF-16" ) {
# examine byte order mark and recreate decoder as UTF-16BE or
+UTF16LE
if ( $first_two_bytes eq "\xFF\xFE" ) {
# little-endian
$decoder = find_encoding( "UTF-16LE" );
if ( $debug ) {
if ( ref $decoder ) {
print "Encode::Guess thinks the file is UTF-16 and the byt
+e order mark is FFFE so recreated decoder as UTF-16LE\n";
} else {
print "can't find encoding for 'UTF-16LE'\n";
}
}
} elsif ( $first_two_bytes eq "\xFE\xFF" ) {
# big-endian
$decoder = find_encoding( "UTF-16BE" );
if ( $debug ) {
if ( ref $decoder ) {
print "Encode::Guess thinks the file is UTF-16 and the byt
+e order mark is FEFF so recreated decoder as UTF-16BE\n";
} else {
print "can't find encoding for 'UTF-16LE'\n";
}
}
} else {
print "Encode::Guess thinks this file is UTF-16 but first two
+bytes is neither FFFE nor FEFF so aborting decode attempt\n" if $
+debug;
return undef;
}
return ( ref $decoder ? $decoder : undef );
} elsif ( $decoder->name eq "UTF-32" ) {
# examine byte order mark and recreate decoder as UTF-32BE or
+UTF32LE
if ( $first_four_bytes eq "\xFF\xFE\x00\x00" ) {
# little-endian
$decoder = find_encoding( "UTF-32LE" );
if ( $debug ) {
if ( ref $decoder ) {
print "Encode::Guess thinks the file is UTF-32 and the byt
+e order mark is FFFE0000 so recreated decoder as UTF-32LE\n";
} else {
print "can't find encoding for 'UTF-32LE'\n";
}
}
} elsif ( $first_four_bytes eq "\x00\x00\xFE\xFF" ) {
# big-endian
$decoder = find_encoding( "UTF-32BE" );
if ( $debug ) {
if ( ref $decoder ) {
print "Encode::Guess thinks the file is UTF-32 and the byt
+e order mark is 0000FEFF so recreated decoder as UTF-32BE\n";
} else {
print "can't find encoding for 'UTF-32LE'\n";
}
}
} else {
print "Encode::Guess thinks this file is UTF-32 but first four
+ bytes is not a valid byte order mark so aborting decode attempt\n"
+ if $debug;
return undef;
}
return ( ref $decoder ? $decoder : undef );
} elsif ( $decoder->name eq "iso-8859-1" ) {
my $lines_to_check = ( scalar @$arr_ref > 1000 ) ? 1000 : sca
+lar @$arr_ref;
for ( my $i = 0; $i < $lines_to_check; $i++ ) {
while ( $$arr_ref[$i] =~ m/[\x80-\xA5]/g ) {
$cp437_letters++;
}
while ( $$arr_ref[$i] =~ m/[\xC0-\xFF]/g ) {
$latin1_letters++;
}
}
if ( $cp437_letters > $latin1_letters ) {
$decoder = find_encoding( "cp437" );
if ( $debug ) {
if ( ref $decoder ) {
print "more chars in cp437 range ($cp437_letters) than in
+Latin-1 letters range ($latin1_letters) so prob cp437\n";
} else {
print "can't find encoding for 'cp437'\n";
}
}
return ( ref $decoder ? $decoder : undef );
} else {
print "at least as many chars in Latin-1 letters range ($latin
+1_letters) as in cp437 range ($cp437_letters) so prob Latin-1\n" i
+f $debug;
return $decoder;
}
}
return $decoder;
} else {
if ( $debug ) {
print qq(bad decoder returned by Encode::Guess->guess() );
print ( ( defined $decoder ) ? qq("$decoder") : "(undefined)"
+);
print qq(\n);
}
if ( defined $decoder && $decoder =~ m/utf8/ ) {
my $lines_to_check = ( scalar @$arr_ref > 1000 ) ? 1000 : sca
+lar @$arr_ref;
for ( my $i = 0; $i < $lines_to_check; $i++ ) {
if ( $$arr_ref[$i] =~ m/[\x00-\x7F][\x80-\xFF][\x00-\x7F]/
or $$arr_ref[$i] =~ m/[\x00-\x7F][\x80-\xFF]$/
or $$arr_ref[$i] =~ m/^[\x80-\xFF][\x00-\x7F]$/ )
{
# this can't be utf8 if it has a high-bit char by its lone
+some
# with low-bit chars or begin/end of line on either side
$decoder = find_encoding( "iso-8859-1" );
if ( $debug ) {
if ( ref $decoder ) {
print "looks like iso-8859-1, using that encoding\n";
} else {
print "can't find encoding for 'iso-8859-1'\n";
}
}
return ( ref $decoder ? $decoder : undef );
}
} # end for each line of array
$decoder = find_encoding( "utf8" );
if ( $debug ) {
if ( ref $decoder ) {
print "looks like utf8, using that encoding\n";
} else {
print "can't find encoding for 'utf8'\n";
}
}
return ( ref $decoder ? $decoder : undef );
} else {
print "bad decoder and not possible match for utf8\n" if $d
+ebug;
return undef;
}# end if possible utf8
} # end if/else we have no good decoder ref yet
}
| [reply] [Watch: Dir/Any] [d/l] |