#!/usr/bin/perl # usage: # fix_surrogates.pl < infile > outfile # Hi Surrogate: D800-DBFF # Lo Surrogate: DC00-DFFF use strict; use warnings; binmode STDIN; # Disable :crlf binmode STDOUT; # Disable :crlf my $read_size = 16*1024; my $valid_pat = qr/ .[^\xD8-\xDF] | .[\xD8-\xDB].[\xDC-\xDF] /xs; my $invalid_pat = qr/ .[\xDC-\xDF] | .[\xD8-\xDB](?=.[^\xDC-\xDF]) /xs; my $ibuf = ''; my $obuf = ''; for (;;) { my $rv = read(STDIN, $ibuf, $read_size, length($ibuf)); die("$!\n") if !defined($rv); last if !$rv; for ($ibuf) { /\G ($valid_pat+) /xgc && do { $obuf .= $1; }; /\G $invalid_pat /xgc && do { $obuf .= "\xFD\xFF"; redo }; } print($obuf); $ibuf = substr($ibuf, pos($ibuf)||0); $obuf = ''; } $ibuf =~ s/..?/\xFD\xFF/sg; print($ibuf); #### >type testdata.pl binmode STDOUT; my $hi = "\xF4\xDB"; my $lo = "\xE2\xDE"; print "a\0" . $hi . $lo . "b\0" . "\n\0", "c\0" . $hi . "c\0" . "d\0" . "\n\0", "e\0" . $lo . "f\0" . "g\0" . "\n\0"; >perl testdata.pl | perl fix_surrogates.pl | perl -0777 -pe"BEGIN { binmode STDIN, ':encoding(UTF-16le)'; binmode STDOUT, ':encoding(US-ASCII)' }" "\x{10d2e2}" does not map to ascii, <> chunk 1. "\x{fffd}" does not map to ascii, <> chunk 1. "\x{fffd}" does not map to ascii, <> chunk 1. a\x{10d2e2}b c\x{fffd}cd e\x{fffd}fg