UTF-16:Unrecognised BOM 6e at /home/jim/Documents/scripts/textual-slideshow.pl line 549, line 10. #### #! /usr/bin/perl -w # stripped-down version of textual-slideshow.pl # for reproducing bad byte-order-mark defect use Term::ReadKey; use strict; use warnings; use Encode; use Encode::Guess qw( iso-8859-1 ); #use open qw( :std :encoding(UTF-8) ); # proportion between file size and probability that a given # paragraph will be saved for later display. the smaller this number, # the fewer paragraphs from each file are saved. use constant SAVE_RATE => 2000; my $debug = 0; my $max_paras = 100000; my $err_count = 0; my $max_errs = 3; my @paras; my @indices; my $n_paras; srand; while ( my $filename = shift ) { &add_paras($filename); } #================================================== sub add_paras { my $legalese = 0; my $filename = shift; if ( not open FILE, $filename ) { print STDERR qq(can't open "$filename" for reading\n); if ( ++$err_count >= $max_errs ) { die "too many file open errors\n"; } return; } binmode( FILE, ':raw' ); my ( $dev, $inode, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blocksize, $blocks ) = stat FILE; if ( $size == 0 ) { return; } print "getting paragraphs from $filename\n" if $debug; # we can't know how to decode the file until we examine it. # slurp the file first: my @lines = ; close FILE; my $decoder = &figure_out_encoding( \@lines ); if ( not $decoder ) { # figure_out_encoding should already have logged err msg if necessary return; } # we want the number of pararaphs taken from a file to be roughly # the same, whether it's a short one or a long one. my $prange = $size / SAVE_RATE; my $paras_added = 0; my $saved_indices = 0; my $in_para = 0; my $saving = 0; my $current_para = ""; for ( my $i = 0; $i < scalar @lines; $i++ ) { my $line = $decoder->decode( $lines[$i] ); my $linenum = $i+1; $line =~ s/\r//g; # leave newlines alone but strip CR if ( $line !~ m/^\s*$/ ) { # non-blank if ( $line =~ m/(the\s+)*Project\s+Gutenberg.*e-?(text|book)/i or $line =~ m/START.*SMALL\s+PRINT/i or $line =~ m/END OF.*PROJECT\s+GUTENBERG E-?(TEXT|BOOK)/ or $line =~ m/START.*FULL LICENSE/) { $legalese = 1; print "start skipping legalese on line $linenum \n" if $debug >= 3; next; } if ( $in_para == 0 ) { if ( (not $legalese) and (rand $prange <= 1) ) { push @indices, "lines $linenum+ of $filename\n\n"; $saved_indices++; $saving = 1; } } $in_para = 1; if ( $line =~ m/\END.*SMALL\s+PRINT/ or $line =~ m/START OF THE PROJECT\s+GUTENBERG.*E-?(BOOK|TEXT)/ or $line =~ m/END.*FULL LICENSE/) { print "stop skipping legalese on line $. \n" if $debug >= 3; $legalese = 0; } } else { # blank line if ( $saving ) { push @paras, $current_para . "\n\n"; $paras_added++; } $current_para = ""; $in_para = 0; $saving = 0; } my $a = 1; $a++; if ( $saving ) { $a--; $current_para .= $line; } } if ( $saving ) { $current_para .= "\n\n"; push @paras, $current_para; $current_para = ""; $paras_added++; } } =pod Use Encode::Guess plus some heuristics to figure out the probable encoding of a file, based on an array of lines from the file (passed by reference because it could be very big and we don't need to modify it). Return a decoder object. =cut sub figure_out_encoding { my $arr_ref = shift; if ( not ref $arr_ref eq "ARRAY" or scalar @$arr_ref == 0 ) { die "arg to figure_out_encoding() should be ref to non-empty array"; } # do an Encode::Guess on the join of all the lines # then if its results are ambiguous, do further heuristics my $content = join '', @$arr_ref; my $cp437_letters = 0; my $latin1_letters = 0; my $decoder = eval { Encode::Guess->guess( $content ); }; if ( $@ ) { my $eval_err = chomp $@; print qq($_: Encode::Guess->guess() failed horribly: "$eval_err"\n); return undef; } if ( ref $decoder ) { print "appears to be " . $decoder->name . "\n" if $debug; if ( $decoder->name eq "iso-8859-1" ) { my $lines_to_check = ( scalar @$arr_ref > 1000 ) ? 1000 : scalar @$arr_ref; for ( my $i = 0; $i < $lines_to_check; $i++ ) { while ( $$arr_ref[$i] =~ m/[\x80-\xA5]/g ) { $cp437_letters++; } while ( $$arr_ref[$i] =~ m/[\xC0-\xFF]/g ) { $latin1_letters++; } } if ( $cp437_letters > $latin1_letters ) { $decoder = find_encoding( "cp437" ); if ( $debug ) { if ( ref $decoder ) { print "more chars in cp437 range ($cp437_letters) than in Latin-1 letters range ($latin1_letters) so prob cp437\n"; } else { print "can't find encoding for 'cp437'\n"; } } return ( ref $decoder ? $decoder : undef ); } else { print "more chars in Latin-1 letters range ($latin1_letters) than in cp437 range ($cp437_letters) so prob Latin-1\n" if $debug; return $decoder; } } return $decoder; } else { print qq(bad decoder returned by Encode::Guess->guess() ); print ( ( defined $decoder ) ? qq("$decoder") : "(undefined)" ); print qq(\n); if ( defined $decoder && $decoder =~ m/utf8/ ) { my $lines_to_check = ( scalar @$arr_ref > 1000 ) ? 1000 : scalar @$arr_ref; for ( my $i = 0; $i < $lines_to_check; $i++ ) { if ( $$arr_ref[$i] =~ m/[\x00-\x7F][\x80-\xFF][\x00-\x7F]/ or $$arr_ref[$i] =~ m/[\x00-\x7F][\x80-\xFF]$/ or $$arr_ref[$i] =~ m/^[\x80-\xFF][\x00-\x7F]$/ ) { # this can't be utf8 if it has a high-bit char by its lonesome # with low-bit chars or begin/end of line on either side $decoder = find_encoding( "iso-8859-1" ); if ( $debug ) { if ( ref $decoder ) { print "looks like iso-8859-1, using that encoding\n"; } else { print "can't find encoding for 'iso-8859-1'\n"; } } return ( ref $decoder ? $decoder : undef ); } } # end for each line of array $decoder = find_encoding( "utf8" ); if ( $debug ) { if ( ref $decoder ) { print "looks like utf8, using that encoding\n"; } else { print "can't find encoding for 'utf8'\n"; } } return ( ref $decoder ? $decoder : undef ); } else { print "bad decoder and not possible match for utf8\n" if $debug; return undef; }# end if possible utf8 } # end if/else we have no good decoder ref yet }