use strict;
use warnings;

# Decode the embedded data as UTF-8 and encode what we print the same way,
# so non-ASCII characters round-trip instead of being emitted as Latin-1
# bytes or triggering "Wide character in print" warnings.
binmode(DATA,   ":encoding(UTF-8)");
binmode(STDOUT, ":encoding(UTF-8)");

# Maps the differing middle part of each original string to the
# corresponding differing part of its normalized form.
my %normalize;

# Each data line holds two whitespace-separated columns:
# the original string and its normalized form.
while (my $line = <DATA>) {
    my ($string, $normalized) = split ' ', $line;

    # Skip blank or incomplete lines rather than comparing against undef.
    next unless defined $normalized;

    # Split into extended grapheme clusters (\X) so that a base character
    # and its combining marks are compared as a single unit.
    my @string     = $string     =~ m/\X/g;
    my @normalized = $normalized =~ m/\X/g;

    # Drop the common prefix.
    while (@string and @normalized and $string[0] eq $normalized[0]) {
        shift @string;
        shift @normalized;
    }

    # Drop the common suffix.
    while (@string and @normalized and $string[-1] eq $normalized[-1]) {
        pop @string;
        pop @normalized;
    }

    # Whatever remains is the part that differs between the two strings.
    my $key = join("", @string);
    $normalize{$key} = join("", @normalized);
    print "'$key' => '$normalize{$key}'\n";
}

__DATA__
ABCÅD ABCD
ABCÄD ABCëëD
ABCááD ABCèD