use strict;
use warnings;
use Encode;

# Three ways of examining the string held in $_ (treated as raw octets).
# Load the string into $_ before running any of them.

#### 1. Quick test for non-ASCII content

if ( /[^\x00-\x7f]/ ) {
    # true if $_ contains any non-ascii character
}

#### 2. Validation by letting Encode do the decoding

# load the string into $_, then:
my $test;
my $decoded_ok = eval {
    # Strict 'UTF-8' is used instead of Perl's lax 'utf8' encoding.
    # LEAVE_SRC matters: when a CHECK value is given without it, decode()
    # overwrites its source argument ($_ here) in place.
    $test = decode( 'UTF-8', $_,
        Encode::FB_CROAK() | Encode::LEAVE_SRC() );
    1;
};
if ( !$decoded_ok ) {
    # Encode would fail/die if $_ was not a valid utf8 string
}

#### 3. Validation by walking the bytes by hand

my $sequence_error = _utf8_sequence_error($_);
die $sequence_error if defined $sequence_error;

# get here if the string was valid utf8

# Check that a byte string is structurally well-formed UTF-8.
#
# Takes the octets to examine. Returns nothing (undef in scalar context)
# when every byte belongs to a complete 1-, 2-, 3- or 4-byte sequence;
# otherwise returns one of these messages:
#   "Bad byte sequence\n"          lead, continuation or ascii byte out of place
#   "Bad byte value\n"             byte with its five high bits set (0xF8-0xFF)
#   "Incomplete multibyte char\n"  input ended in the middle of a character
#
# This is a structural check only: it looks at the high bits of each byte
# and does not reject overlong encodings, surrogates or code points above
# U+10FFFF. Use the Encode-based check when strict validation is needed.
sub _utf8_sequence_error {
    my ($octets) = @_;

    my $expected = 0;    # total bytes in the char being assembled (0 = none)
    my $seen     = 0;    # bytes of that char consumed so far

    for my $byte ( unpack 'C*', $octets ) {
        if ( ( $byte & 0x80 ) == 0 ) {

            # 0xxxxxxx: an ascii byte, which cannot occur while a
            # multibyte char is being assembled
            return "Bad byte sequence\n" if $expected;
            next;
        }

        if ( ( $byte & 0xc0 ) == 0x80 ) {

            # 10xxxxxx: continuation byte, only legal after a lead byte
            return "Bad byte sequence\n" if !$expected;
            $seen++;
        }
        else {
            # 11xxxxxx: must be the lead byte of a 2-, 3- or 4-byte char
            my $length
                = ( $byte & 0xe0 ) == 0xc0 ? 2     # 110xxxxx
                : ( $byte & 0xf0 ) == 0xe0 ? 3     # 1110xxxx
                : ( $byte & 0xf8 ) == 0xf0 ? 4     # 11110xxx
                :                            0;    # 11111xxx is never valid
            return "Bad byte value\n"    if !$length;
            return "Bad byte sequence\n" if $expected;
            $expected = $length;
            $seen     = 1;
        }

        # A complete character has been consumed; start over.
        if ( $seen == $expected ) {
            $expected = 0;
            $seen     = 0;
        }
    }

    return "Incomplete multibyte char\n" if $expected;
    return;
}