# utf8_escape($octets)
#
# This does what Unicode::Escape::escape does, except a lot less: every
# non-ASCII character in a UTF-8 encoded byte string is rewritten as a
# \uXXXX escape, and plain ASCII bytes are passed through untouched.
#
# Assumptions:
#   * The input is defined.
#   * The input is a string of UTF-8 encoded octets (not decoded characters).
#
# Behavior:
#   * U+0080 .. U+FFFF (2 and 3 byte sequences) become \uXXXX (lowercase hex).
#   * Valid characters above U+FFFF (4 byte sequences) do not fit in a single
#     \uXXXX escape and are replaced by \ufffd (REPLACEMENT CHARACTER).
#   * Invalid UTF-8 is deleted silently.  That includes stray lead or
#     continuation bytes, truncated sequences, overlong encodings, UTF-16
#     surrogates (U+D800 .. U+DFFF) and values above U+10FFFF.
#
# Returns the escaped string, which contains only seven bit ASCII.
sub utf8_escape {

    # Short circuit if it's all seven bit ASCII.
    return $_[0] unless $_[0] =~ /[\x80-\xff]/;

    my $s = shift;

    # The three passes below cannot interfere with each other: every match
    # starts on a lead byte, the lead byte ranges are disjoint from each other
    # and from the continuation range, and each replacement is pure ASCII.

    # Two byte sequences, U+0080 .. U+07FF.  The lead byte range starts at
    # 0xC2 because 0xC0 and 0xC1 can only start overlong encodings.
    $s =~ s{
        ([\xc2-\xdf]) ([\x80-\xbf])
    }{
        sprintf '\\u%04x', ( ( 0x1f & ord $1 ) << 6 ) | ( 0x3f & ord $2 )
    }exmsg;

    # Three byte sequences, U+0800 .. U+FFFF.  The second byte is restricted
    # after 0xE0 to reject overlong encodings, and after 0xED to reject the
    # UTF-16 surrogate range, neither of which is valid UTF-8.
    $s =~ s{
        (
            \xe0        [\xa0-\xbf] [\x80-\xbf]
          | [\xe1-\xec] [\x80-\xbf] [\x80-\xbf]
          | \xed        [\x80-\x9f] [\x80-\xbf]
          | [\xee\xef]  [\x80-\xbf] [\x80-\xbf]
        )
    }{
        my ( $lead, $mid, $tail ) = unpack 'C3', $1;
        sprintf '\\u%04x',
            ( ( 0x0f & $lead ) << 12 ) | ( ( 0x3f & $mid ) << 6 ) | ( 0x3f & $tail )
    }exmsg;

    # Valid utf8 that can't be encoded in a single \uXXXX: U+10000 .. U+10FFFF.
    # The second byte is restricted after 0xF0 to reject overlong encodings
    # and after 0xF4 to reject values above U+10FFFF.
    $s =~ s{
          \xf0        [\x90-\xbf] [\x80-\xbf]{2}
        | [\xf1-\xf3] [\x80-\xbf]{3}
        | \xf4        [\x80-\x8f] [\x80-\xbf]{2}
    }{\\ufffd}xmsg;

    # Whatever high bit bytes are left did not belong to a well formed
    # sequence: invalid utf8, delete it.
    $s =~ tr/\x80-\xff//d;

    return $s;
}