# Short form of the recipe: decompose $str with NFKD (compatibility
# decomposition), then delete every nonspacing combining mark, leaving
# only the base characters behind.
($str = Unicode::Normalize::NFKD($str)) =~ s/\p{NonspacingMark}//g;
####
## Demonstrate stripping of diacritical marks from Unicode strings
## April 2010, Bryce Nesbitt, Berkeley Electronic Press
## See also http://unicodelookup.com/
## See also http://en.wikipedia.org/wiki/Diacritic
## Keywords: perl, diacritic, diacritical
## accent, iso-8859-1, normalization.
use utf8; # Tell perl source code is utf-8
use 5.10.0;
use Encode qw(decode);
use Unicode::Normalize;
# Sample: "latin small letter e with circumflex and tilde" ễ
# "latin small ligature ff" (will be expanded)
# "latin small ligature oe" (won't be expanded)

# Take the text from the first command-line argument; fall back to the
# built-in sample when no argument (or an empty one) is supplied.  Testing
# definedness and length, rather than truth, lets a literal "0" through.
my $str = shift @ARGV;
if (defined $str && length $str) {
    # Command-line arguments arrive as raw bytes.  Without decoding, each
    # byte of a multi-byte character would be normalized as a separate
    # Latin-1 character.  Skip the decode when perl has already produced a
    # character string (e.g. when run with -CA).
    # NOTE(review): assumes the shell passes arguments encoded as UTF-8.
    $str = decode('UTF-8', $str) unless utf8::is_utf8($str);
}
else {
    $str = "\x{1ec5} märks \x{fb00} \x{153}";
}
say "Input: ".debug_chatty_string($str);

# Decompose into letter and combining marks, in "Kompatibility" mode
$str = NFKD($str);
say "NFKD : ".debug_chatty_string($str);

# Remove combining marks, then fold to lower case
$str =~ s/\p{NonspacingMark}//g;
$str = lc($str);
say "Out : ".debug_chatty_string($str);
# debug_chatty_string($string)
#
# Return a copy of $string that is safe to print on any terminal:
# printable ASCII characters (code points 32..126) and newline (10) are
# passed through unchanged, and every other character is replaced by its
# code point written as "<0xHEX>" with lowercase hex digits.
#
# Always returns a defined string; an empty or missing argument yields ''.
sub debug_chatty_string
{
    # A missing argument is treated as the empty string.
    my $text = shift // '';

    # Start from '' so that empty input returns '' rather than undef.
    my $outstring = '';

    # The loop variable is lexical so no package global is touched.
    foreach my $char (split //, $text) {
        my $ord = ord($char);
        if (($ord >= 32 && $ord < 127) || $ord == 10) {
            $outstring .= $char;
        }
        else {
            $outstring .= sprintf("<0x%x>", $ord);
        }
    }

    return $outstring;
}
##
## Sample output when run with no arguments:
##
## Input: <0x1ec5> m<0xe4>rks <0xfb00> <0x153>
## NFKD : e<0x302><0x303> ma<0x308>rks ff <0x153>
## Out : e marks ff <0x153>