# check_utf8_bytes(@byte_values)
#
# Validate that a list of byte values (integers 0..255) forms well-formed
# UTF-8 as defined by RFC 3629 / the Unicode Standard:
#
#   * 1-byte (ASCII), 2-byte, 3-byte and 4-byte sequences are accepted;
#   * overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code
#     points above U+10FFFF are rejected.
#
# Returns 1 when the whole list is valid (an empty list is valid).
# Dies with one of these newline-terminated messages otherwise:
#
#   "Bad byte value\n"            - a byte that can never occur in UTF-8
#                                   (0xC0, 0xC1, 0xF5..0xFF)
#   "Bad byte sequence\n"         - a byte that is legal in UTF-8 but not
#                                   at this position
#   "Incomplete multibyte char\n" - input ended inside a multibyte char
sub check_utf8_bytes {
    my @byte_values = @_;

    my $pending = 0;                  # continuation bytes still expected
    my ( $min, $max ) = ( 0x80, 0xbf );    # allowed range of the next one

    for my $byte (@byte_values) {

        # These values are never part of well-formed UTF-8, wherever
        # they appear: 0xC0/0xC1 could only start an overlong 2-byte
        # form, and 0xF5..0xFF would encode beyond U+10FFFF.
        die "Bad byte value\n"
            if $byte == 0xc0 or $byte == 0xc1 or $byte > 0xf4;

        if ($pending) {

            # Inside a multibyte char: only a continuation byte within
            # the currently allowed range may follow.
            die "Bad byte sequence\n" if $byte < $min or $byte > $max;

            # Only the first continuation byte has a restricted range.
            ( $min, $max ) = ( 0x80, 0xbf );
            $pending--;
            next;
        }

        next if $byte < 0x80;         # plain ASCII, complete by itself

        # High bits 10: a continuation byte with nothing to continue.
        die "Bad byte sequence\n" if $byte < 0xc0;

        if ( $byte < 0xe0 ) {         # 110xxxxx: 2-byte char
            $pending = 1;
        }
        elsif ( $byte < 0xf0 ) {      # 1110xxxx: 3-byte char
            $pending = 2;
            $min = 0xa0 if $byte == 0xe0;    # below this is overlong
            $max = 0x9f if $byte == 0xed;    # above this is a surrogate
        }
        else {                        # 11110xxx: 4-byte char
            $pending = 3;
            $min = 0x90 if $byte == 0xf0;    # below this is overlong
            $max = 0x8f if $byte == 0xf4;    # above this is > U+10FFFF
        }
    }

    die "Incomplete multibyte char\n" if $pending;

    return 1;
}

# Break the current string ($_) into byte values and validate them.
# NOTE(review): this assumes $_ holds raw bytes (octets), not a decoded
# character string -- confirm against the code that populates $_.
my @bytes = unpack( "C*", $_ );
check_utf8_bytes(@bytes);

# get here if the string was valid utf8