# pick a random file, snag a random subset of paragraphs from it sub add_paras { my $legalese = 0; my $filename = $filenames[ int rand scalar @filenames ]; if ( not open FILE, $filename ) { print STDERR qq(can't open "$filename" for reading\n); if ( ++$err_count >= $max_errs ) { die "too many file open errors\n"; } return; } binmode( FILE, ':raw' ); my ( $dev, $inode, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blocksize, $blocks ) = stat FILE; if ( $size == 0 ) { return; } print "getting paragraphs from $filename\n" if $debug; # we can't know how to decode the file until we examine it. # slurp the file first: my @lines = ; my $decoder = &figure_out_encoding( \@lines ); if ( not $decoder ) { # figure_out_encoding() should already have logged err msg if necessary return; } if ( $decoder->name =~ /UTF-(16|32)LE/ ) { # split was wrong in raw mode, redo it now that we know what # a little-endian newline really looks like my $whole_file = join "", @lines; if ( $decoder->name eq "UTF-16LE" ) { @lines = split /\n\0/, $whole_file; } else { @lines = split /\n\0\0\0/, $whole_file; } print "rejoined and resplit because of littleendian format\n" if $debug; } # we want the number of pararaphs taken from each file to be roughly # the same, whether it's a short one or a long one. my $prange = $size / SAVE_RATE; my $paras_added = 0; my $saved_indices = 0; my $in_para = 0; my $saving = 0; my $current_para = ""; for ( my $i = 0; $i < scalar @lines; $i++ ) { #my $line = $decoder->decode( $lines[$i] ); # don't die if decode misbehaves: my $line = eval { $decoder->decode( $lines[$i] ) }; if ( $@ ) { my $eval_err = chomp $@; print qq(Encode->decode() failed horribly: "$eval_err"\n) if $debug; pop @indices; return; } my $linenum = $i+1; # this used to happen with LE charsets before I added the rejoin/replit # code above. keep it in case something else occasionally goes wrong # with decode. if ( not utf8::valid($line) ) { if ( $debug ) { print "have invalid utf8 after decoding at $filename line $linenum:\n"; print $line; print "\n"; } pop @indices; return; } $line =~ s/\r//g; # leave newlines alone but strip CR if ( $line !~ m/^\s*$/ ) { # non-blank if ( $line =~ m/(the\s+)*Project\s+Gutenberg.*e-?(text|book)/i or $line =~ m/START.*SMALL\s+PRINT/i or $line =~ m/END OF.*PROJECT\s+GUTENBERG E-?(TEXT|BOOK)/ or $line =~ m/START.*FULL LICENSE/) { $legalese = 1; print "start skipping legalese on line $linenum \n" if $debug >= 3; next; } if ( $in_para == 0 ) { if ( (not $legalese) and (rand $prange <= 1) ) { push @indices, "lines $linenum+ of $filename\n\n"; $saved_indices++; $saving = 1; } } $in_para = 1; if ( $line =~ m/\END.*SMALL\s+PRINT/ or $line =~ m/START OF THE PROJECT\s+GUTENBERG.*E-?(BOOK|TEXT)/ or $line =~ m/END.*FULL LICENSE/) { print "stop skipping legalese on line $linenum \n" if $debug >= 3; $legalese = 0; } } else { # blank line signaling end of paragraph if ( $saving ) { if ( $handle_html && $filename =~ m/\.html?$|\.php$/ ) { if ( $current_para = &strip_html_tags( $current_para ) ) { push @paras, $current_para . "\n\n"; $paras_added++; } else { # there's nothing left after stripping HTML tags # so get rid of the filename/line number we saved for this para pop @indices; } } else { push @paras, $current_para . "\n\n"; $paras_added++; } } $current_para = ""; $in_para = 0; $saving = 0; } if ( $saving ) { $current_para .= $line; } } if ( $saving ) { if ( $handle_html && $filename =~ m/\.html?$/ ) { $current_para = &strip_html_tags( $current_para ); } $current_para .= "\n\n"; push @paras, $current_para; $current_para = ""; $paras_added++; } close FILE; if ( $debug ) { push @paras_got_counts, $paras_added; print "added $paras_added paragraphs and $saved_indices indices from $filename of $size bytes\n"; print "now have " . scalar @paras . " paragraphs in memory\n"; print "\tand " . scalar @indices . " filename/line number indices\n"; } if ( $paras_added != $saved_indices ) { print "file $filename charset " . ( defined $decoder ? ( ref $decoder ? $decoder->name : $decoder ) : "(undefined)" ) . "\$saved_indices = $saved_indices, \$paras_added = $paras_added\n"; die "assertion failed: $paras_added must be equal to $saved_indices\n"; } } #.... sub figure_out_encoding { my $arr_ref = shift; if ( not ref $arr_ref eq "ARRAY" or scalar @$arr_ref == 0 ) { die "arg to figure_out_encoding() should be ref to non-empty array"; } # do an Encode::Guess on the join of all the lines # then if its results are ambiguous, do further heuristics # first save the possible byte order mark because Encode::Guess is # too lame to specify UTF-16BE or UTF16LE, it just says UTF-16 so # the resulting decoder obj then fails if actually used on a # random string form the middle of the file # haven't actually had a problem yet but I suspect UTF-32 could # have the same prob my $first_two_bytes = substr( $$arr_ref[0], 0, 2 ); my $first_four_bytes = substr( $$arr_ref[0], 0, 4 ); my $content = join '', @$arr_ref; my $cp437_letters = 0; my $latin1_letters = 0; my $decoder = eval { Encode::Guess->guess( $content ); }; if ( $@ ) { my $eval_err = chomp $@; print qq($_: Encode::Guess->guess() failed horribly: "$eval_err"\n); return undef; } if ( ref $decoder ) { print "appears to be " . $decoder->name . "\n" if $debug; if ( $decoder->name eq "UTF-16" ) { # examine byte order mark and recreate decoder as UTF-16BE or UTF16LE if ( $first_two_bytes eq "\xFF\xFE" ) { # little-endian $decoder = find_encoding( "UTF-16LE" ); if ( $debug ) { if ( ref $decoder ) { print "Encode::Guess thinks the file is UTF-16 and the byte order mark is FFFE so recreated decoder as UTF-16LE\n"; } else { print "can't find encoding for 'UTF-16LE'\n"; } } } elsif ( $first_two_bytes eq "\xFE\xFF" ) { # big-endian $decoder = find_encoding( "UTF-16BE" ); if ( $debug ) { if ( ref $decoder ) { print "Encode::Guess thinks the file is UTF-16 and the byte order mark is FEFF so recreated decoder as UTF-16BE\n"; } else { print "can't find encoding for 'UTF-16LE'\n"; } } } else { print "Encode::Guess thinks this file is UTF-16 but first two bytes is neither FFFE nor FEFF so aborting decode attempt\n" if $debug; return undef; } return ( ref $decoder ? $decoder : undef ); } elsif ( $decoder->name eq "UTF-32" ) { # examine byte order mark and recreate decoder as UTF-32BE or UTF32LE if ( $first_four_bytes eq "\xFF\xFE\x00\x00" ) { # little-endian $decoder = find_encoding( "UTF-32LE" ); if ( $debug ) { if ( ref $decoder ) { print "Encode::Guess thinks the file is UTF-32 and the byte order mark is FFFE0000 so recreated decoder as UTF-32LE\n"; } else { print "can't find encoding for 'UTF-32LE'\n"; } } } elsif ( $first_four_bytes eq "\x00\x00\xFE\xFF" ) { # big-endian $decoder = find_encoding( "UTF-32BE" ); if ( $debug ) { if ( ref $decoder ) { print "Encode::Guess thinks the file is UTF-32 and the byte order mark is 0000FEFF so recreated decoder as UTF-32BE\n"; } else { print "can't find encoding for 'UTF-32LE'\n"; } } } else { print "Encode::Guess thinks this file is UTF-32 but first four bytes is not a valid byte order mark so aborting decode attempt\n" if $debug; return undef; } return ( ref $decoder ? $decoder : undef ); } elsif ( $decoder->name eq "iso-8859-1" ) { my $lines_to_check = ( scalar @$arr_ref > 1000 ) ? 1000 : scalar @$arr_ref; for ( my $i = 0; $i < $lines_to_check; $i++ ) { while ( $$arr_ref[$i] =~ m/[\x80-\xA5]/g ) { $cp437_letters++; } while ( $$arr_ref[$i] =~ m/[\xC0-\xFF]/g ) { $latin1_letters++; } } if ( $cp437_letters > $latin1_letters ) { $decoder = find_encoding( "cp437" ); if ( $debug ) { if ( ref $decoder ) { print "more chars in cp437 range ($cp437_letters) than in Latin-1 letters range ($latin1_letters) so prob cp437\n"; } else { print "can't find encoding for 'cp437'\n"; } } return ( ref $decoder ? $decoder : undef ); } else { print "at least as many chars in Latin-1 letters range ($latin1_letters) as in cp437 range ($cp437_letters) so prob Latin-1\n" if $debug; return $decoder; } } return $decoder; } else { if ( $debug ) { print qq(bad decoder returned by Encode::Guess->guess() ); print ( ( defined $decoder ) ? qq("$decoder") : "(undefined)" ); print qq(\n); } if ( defined $decoder && $decoder =~ m/utf8/ ) { my $lines_to_check = ( scalar @$arr_ref > 1000 ) ? 1000 : scalar @$arr_ref; for ( my $i = 0; $i < $lines_to_check; $i++ ) { if ( $$arr_ref[$i] =~ m/[\x00-\x7F][\x80-\xFF][\x00-\x7F]/ or $$arr_ref[$i] =~ m/[\x00-\x7F][\x80-\xFF]$/ or $$arr_ref[$i] =~ m/^[\x80-\xFF][\x00-\x7F]$/ ) { # this can't be utf8 if it has a high-bit char by its lonesome # with low-bit chars or begin/end of line on either side $decoder = find_encoding( "iso-8859-1" ); if ( $debug ) { if ( ref $decoder ) { print "looks like iso-8859-1, using that encoding\n"; } else { print "can't find encoding for 'iso-8859-1'\n"; } } return ( ref $decoder ? $decoder : undef ); } } # end for each line of array $decoder = find_encoding( "utf8" ); if ( $debug ) { if ( ref $decoder ) { print "looks like utf8, using that encoding\n"; } else { print "can't find encoding for 'utf8'\n"; } } return ( ref $decoder ? $decoder : undef ); } else { print "bad decoder and not possible match for utf8\n" if $debug; return undef; }# end if possible utf8 } # end if/else we have no good decoder ref yet }