# Cheap pre-check on $_: the character class below matches anything outside
# the 7-bit range 0x00-0x7f.  If it does not match, $_ is pure ASCII, which is
# always well-formed UTF-8.
if ( /[^\x00-\x7f]/ ) # true if $_ contains any non-ascii character
####
use Encode;
... # load the string into $_, then:
# Validate $_ as UTF-8 by attempting a strict decode.
#   On success: $test holds the decoded character string and $_ is unchanged.
#   On failure: $decode_ok is false and $@ carries Encode's error message.
my $test;
my $decode_ok = eval {
    # Block eval: nothing is compiled from a string at run time.
    # 'UTF-8' (with the hyphen) is Encode's strict encoding; the lax 'utf8'
    # also accepts surrogates and code points beyond U+10FFFF.
    # LEAVE_SRC: without it, a decode() that is given a CHECK value overwrites
    # its source argument in place, so merely testing $_ would clobber it.
    $test = decode( 'UTF-8', $_, Encode::FB_CROAK | Encode::LEAVE_SRC );
    1;    # explicit true value: success does not depend on what decode() returned
};
if ( !$decode_ok ) {
    # Encode would fail/die if $_ was not a valid utf8 string
}
####
# Die with a short diagnostic unless the given octet string is well-formed
# UTF-8; return 1 otherwise.
#
# Accepts 1- to 4-byte sequences (U+0000 .. U+10FFFF).  Rejects overlong
# encodings, encoded UTF-16 surrogates (U+D800 .. U+DFFF) and anything above
# U+10FFFF.
#
# Dies with one of:
#   "Bad byte value\n"            - a byte that can never occur in UTF-8
#   "Bad byte sequence\n"         - a lead/continuation byte in the wrong place,
#                                   or a continuation byte outside the range
#                                   its lead byte allows
#   "Incomplete multibyte char\n" - input ended in the middle of a character
sub _assert_valid_utf8 {
    my ($octets) = @_;

    my @bytes   = unpack( "C*", $octets );    # break string into bytes
    my $pending = 0;       # continuation bytes still owed to the current char
    my $min     = 0x80;    # allowed range for the NEXT continuation byte;
    my $max     = 0xbf;    # narrowed only right after certain lead bytes

    for my $byte ( @bytes ) {
        if ( $byte == 0xc0 or $byte == 0xc1 or $byte >= 0xf5 ) {
            # 0xc0/0xc1 could only start an overlong 2-byte form, and
            # 0xf5..0xff would encode values beyond U+10FFFF
            die "Bad byte value\n";
        }
        elsif ( $pending ) {
            # inside a multibyte-char: only a continuation byte in the
            # currently allowed range may follow
            die "Bad byte sequence\n" if ( $byte < $min or $byte > $max );
            ( $min, $max ) = ( 0x80, 0xbf );   # later bytes use the full range
            $pending--;
        }
        elsif ( $byte < 0x80 ) {
            # plain ascii byte, complete on its own
        }
        elsif ( ( $byte & 0xc0 ) == 0x80 ) {    # high 2 bits are 10
            # continuation byte with no multibyte-char in progress
            die "Bad byte sequence\n";
        }
        elsif ( ( $byte & 0xe0 ) == 0xc0 ) {    # high 3 bits are 110
            $pending = 1;
        }
        elsif ( ( $byte & 0xf0 ) == 0xe0 ) {    # high 4 bits are 1110
            $pending = 2;
            $min = 0xa0 if ( $byte == 0xe0 );   # below this is overlong
            $max = 0x9f if ( $byte == 0xed );   # above this is a surrogate
        }
        else {                                  # 0xf0 .. 0xf4
            $pending = 3;
            $min = 0x90 if ( $byte == 0xf0 );   # below this is overlong
            $max = 0x8f if ( $byte == 0xf4 );   # above this is > U+10FFFF
        }
    }
    die "Incomplete multibyte char\n" if ( $pending );
    return 1;
}

_assert_valid_utf8( $_ );
# get here if the string was valid utf8