# create a list of "named" characters, as defined by "unicore/Name.pl": # (this leaves out the "unified CJK ideographs" and code points > U+FFFF, # but they're not relevant) $ perl -CS -le 'for (split /^/, do "unicore/Name.pl"){ ($c)=(/^(\S+)/); last if length($c)>4; print chr(hex($c))}' > /tmp/test.chr $ wc -l /tmp/test.chr 13069 /tmp/test.chr # NB: actually it's 13068: "\n" gets counted twice $ grepp -r utf8 '^[[:alpha:]]' /tmp/test.chr > /tmp/posix-alpha.chr $ grepp -r utf8 '^[^[:alpha:]]' /tmp/test.chr > /tmp/posix-not-alpha.chr $ grepp -r utf8 '^[\p{IsAlpha}]' /tmp/test.chr > /tmp/unicode-alpha.chr $ grepp -r utf8 '^[^\p{IsAlpha}]' /tmp/test.chr > /tmp/unicode-not-alpha.chr $ grepp -r utf8 '^[[:^alpha:]]' /tmp/test.chr > /tmp/posix-not-alpha-2.chr $ wc -l /tmp/*.chr 8666 /tmp/posix-alpha.chr 4468 /tmp/posix-not-alpha-2.chr # some "[[:^alpha:]]" are also [[:alpha:]] ! 4403 /tmp/posix-not-alpha.chr 13069 /tmp/test.chr 8666 /tmp/unicode-alpha.chr 4403 /tmp/unicode-not-alpha.chr # equivalent to "[^[:alpha:]]" 43675 total # what are the "funny" characters (both "[[:alpha:]]" and "[[:^alpha:]]") ? 
$ cmpcol -i /tmp/posix-alpha.chr /tmp/posix-not-alpha-2.chr | tlu -o uf | grep -v LINE.FEED 00AA ª FEMININE ORDINAL INDICATOR 00B5 µ MICRO SIGN 00BA º MASCULINE ORDINAL INDICATOR 00C0 À LATIN CAPITAL LETTER A WITH GRAVE 00C1 Á LATIN CAPITAL LETTER A WITH ACUTE 00C2 Â LATIN CAPITAL LETTER A WITH CIRCUMFLEX 00C3 Ã LATIN CAPITAL LETTER A WITH TILDE 00C4 Ä LATIN CAPITAL LETTER A WITH DIAERESIS 00C5 Å LATIN CAPITAL LETTER A WITH RING ABOVE 00C6 Æ LATIN CAPITAL LETTER AE 00C7 Ç LATIN CAPITAL LETTER C WITH CEDILLA 00C8 È LATIN CAPITAL LETTER E WITH GRAVE 00C9 É LATIN CAPITAL LETTER E WITH ACUTE 00CA Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX 00CB Ë LATIN CAPITAL LETTER E WITH DIAERESIS 00CC Ì LATIN CAPITAL LETTER I WITH GRAVE 00CD Í LATIN CAPITAL LETTER I WITH ACUTE 00CE Î LATIN CAPITAL LETTER I WITH CIRCUMFLEX 00CF Ï LATIN CAPITAL LETTER I WITH DIAERESIS 00D0 Ð LATIN CAPITAL LETTER ETH 00D1 Ñ LATIN CAPITAL LETTER N WITH TILDE 00D2 Ò LATIN CAPITAL LETTER O WITH GRAVE 00D3 Ó LATIN CAPITAL LETTER O WITH ACUTE 00D4 Ô LATIN CAPITAL LETTER O WITH CIRCUMFLEX 00D5 Õ LATIN CAPITAL LETTER O WITH TILDE 00D6 Ö LATIN CAPITAL LETTER O WITH DIAERESIS 00D8 Ø LATIN CAPITAL LETTER O WITH STROKE 00D9 Ù LATIN CAPITAL LETTER U WITH GRAVE 00DA Ú LATIN CAPITAL LETTER U WITH ACUTE 00DB Û LATIN CAPITAL LETTER U WITH CIRCUMFLEX 00DC Ü LATIN CAPITAL LETTER U WITH DIAERESIS 00DD Ý LATIN CAPITAL LETTER Y WITH ACUTE 00DE Þ LATIN CAPITAL LETTER THORN 00DF ß LATIN SMALL LETTER SHARP S 00E0 à LATIN SMALL LETTER A WITH GRAVE 00E1 á LATIN SMALL LETTER A WITH ACUTE 00E2 â LATIN SMALL LETTER A WITH CIRCUMFLEX 00E3 ã LATIN SMALL LETTER A WITH TILDE 00E4 ä LATIN SMALL LETTER A WITH DIAERESIS 00E5 å LATIN SMALL LETTER A WITH RING ABOVE 00E6 æ LATIN SMALL LETTER AE 00E7 ç LATIN SMALL LETTER C WITH CEDILLA 00E8 è LATIN SMALL LETTER E WITH GRAVE 00E9 é LATIN SMALL LETTER E WITH ACUTE 00EA ê LATIN SMALL LETTER E WITH CIRCUMFLEX 00EB ë LATIN SMALL LETTER E WITH DIAERESIS 00EC ì LATIN SMALL LETTER I WITH GRAVE 00ED í 
LATIN SMALL LETTER I WITH ACUTE 00EE î LATIN SMALL LETTER I WITH CIRCUMFLEX 00EF ï LATIN SMALL LETTER I WITH DIAERESIS 00F0 ð LATIN SMALL LETTER ETH 00F1 ñ LATIN SMALL LETTER N WITH TILDE 00F2 ò LATIN SMALL LETTER O WITH GRAVE 00F3 ó LATIN SMALL LETTER O WITH ACUTE 00F4 ô LATIN SMALL LETTER O WITH CIRCUMFLEX 00F5 õ LATIN SMALL LETTER O WITH TILDE 00F6 ö LATIN SMALL LETTER O WITH DIAERESIS 00F8 ø LATIN SMALL LETTER O WITH STROKE 00F9 ù LATIN SMALL LETTER U WITH GRAVE 00FA ú LATIN SMALL LETTER U WITH ACUTE 00FB û LATIN SMALL LETTER U WITH CIRCUMFLEX 00FC ü LATIN SMALL LETTER U WITH DIAERESIS 00FD ý LATIN SMALL LETTER Y WITH ACUTE 00FE þ LATIN SMALL LETTER THORN 00FF ÿ LATIN SMALL LETTER Y WITH DIAERESIS