Re: unicode normalization

Seekers of Perl Wisdom

Replies are listed 'Best First'.
Re: unicode normalization by mscudder (Initiate) on May 08, 2006 at 00:34 UTC
Many thanks to wfsp and graff for your very helpful suggestions, code, and referral to The Björk Situation. My solution below, designed for thoroughness Regards, Michael +alents my $_dashes=join '', map { 0x2010..0x2015, 0x2053, 0x207B, my $_squots=join '', map { my $_dquots=join '', map { my $_spaces=join '', map { 0x205F, 0x3000); my $_dots =join '', map { sub scrub { my $text = shift; return "" if !$text; # remove HTML phrasal level tags foreach my $markup (keys %HTML::Tagset: $text=~s/<\s?\/?$markup\s?>/ } #decode html entities for (1..3) { # assume no more HTML::Entities::decode_entities($t if ($text=~/&#?[a-zA-Z } # replace character escapes $text=~s/%([0-9A-Fa-f]{2})/chr( # replace 'wide character' # ascii-compatible whitespace $text=~s/\s/ /g; $text=~s/[$_spaces]/ /g; # transliterate 'wide character' # ascii-compatible equivalents # with thanks to graff of Perl $text=~s/[$_dashes]/-/g; $text=~s/[$_squots]/\'/g; $text=~s/[$_dquots]/"/g; $text=~s/[$_dots]/\x{00B7}/g; # replace remaining 'wide' # (my preferred) ascii-compatible $text=~s/(.)/$_char2equiv{$1}?$_char2eq # unidecode any remaining characters if ($text=~/[\x{100}-\x{ffff}]/) my @chars=split //, $text; # "Text::Unidecode is # transliterator-of-last resort,... 
foreach my $char (@chars) { $char=unidecode($char) if $char=~/[\x{0100}-\x } $text=join '', @chars; # strip out remaining 'wide' $text=~s/[\x{0100}-\x{ffff}] } # trim leading, trailing, and $text=~s/^\s+//; $text=~s/\s{2,}/ return $text; } ####################################### # initialization # ####################################### BEGIN { foreach my $entity (@_html_entities) { $entity->[0]=chr($entity $_entity2char{$entity->[2 $_entity2char{$entity->[3 $_char2equiv{$entity->[0] if $entity->[1]; } } my @_html_entities=( # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- ['"', '', +PL quote', ['&', '', + ['<', '', + ['>', '', +, ['', '...', + ['', '-', + ['', '-', + ['Ą', '', +on mark', ['˘', '', + ['Ł', '', + ['¤', '', + ['Ľ', '', +gn', ['Ś', '', +n vertical bar', ['§', '', + ['¨', '', +g diaeresis', ['Š', '', + ['Ş', '', +ndicator', ['Ť', '', +le ['Ź', '', + ['', '', +retionary hyphen', ['Ž', '', +registered ['Ż', '', +acron ['°', '', + ['ą', '', +plus-or-minus ['˛', '', +superscript ['ł', '', += ['´', '', +cing acute', ['ľ', '', + ['ś', '', +agraph sign', ['ˇ', '', +ian ['¸', '', +cedilla', ['š', '', +superscript ['ş', '', +indicator', ['ť', '', +ble ['ź', '', +e ['˝', '', +e ['ž', '', +ree ['ż', '', +mark= # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- ['Ŕ', '', +er ['Á', '', +er A with acute', ['Â', '', +er ['Ă', '', +er A with tilde', ['Ä', '', +er ['Ĺ', '', +er ['Ć', '', +er ['Ç', '', +er ['Č', '', +er E with grave', ['É', '', +er E with acute', ['Ę', '', +er ['Ë', '', +er ['Ě', '', +er I with grave', ['Í', '', +er I with acute', ['Î', '', +er ['Ď', '', +er ['Đ', '', +er ETH', ['Ń', '', +er N with tilde', ['Ň', '', +er O with grave', ['Ó', '', +er O with acute', ['Ô', '', +er ['Ő', '', +er O with tilde', ['Ö', '', +er ['×', '', +n', ['Ř', '', +er ['Ů', '', +er U 
with grave', ['Ú', '', +er U with acute', ['Ű', '', +er ['Ü', '', +er ['Ý', '', +er Y with acute', ['Ţ', '', +er THORN', ['ß', '', + ['ŕ', '', + ['á', '', + a with acute', ['â', '', + ['ă', '', + a with tilde', ['ä', '', + # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- ['ĺ', '', + ['ć', '', + ['ç', '', + c with cedilla', ['č', '', + e with grave', ['é', '', + e with acute', ['ę', '', + ['ë', '', + ['ě', '', + i with grave', ['í', '', + i with acute', ['î', '', + ['ď', '', + ['đ', '', + eth', ['ń', '', + n with tilde', ['ň', '', + o with grave', ['ó', '', + o with acute', ['ô', '', + ['ő', '', + o with tilde', ['ö', '', + ['÷', '', + ['ů', '', + u with grave', ['ú', '', + u with acute', ['ű', '', + ['ü', '', + ['ý', '', + y with acute', ['ţ', '', + thorn', ['˙', '', + ['', 'OE', +ture OE', ['', 'oe', +re oe', ['', 'S', +er S with caron', ['', 's', + s with caron', ['', 'Y', +er ['', 'f', + ['', '', +rcumflex accent', ['', '', + ['Γ', ' Gamma ', + letter gamma', ['Δ', ' Delta ', + letter delta', ['Θ', ' Theta ', + letter theta', # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- ['Λ', ' Lambda ', + letter lambda', ['Ξ', ' Xi ', + letter xi', ['Π', ' Pi ', + letter pi', ['Σ', ' Sigma ', + letter sigma', ['Υ', ' + letter upsilon', ['Φ', ' Phi ', + letter phi', ['Ψ', ' Psi ', + letter psi', ['Ω', ' Omega ', + letter omega', ['α', ' alpha ', +etter alpha', ['β', ' beta ', +etter beta', ['γ', ' gamma ', +etter gamma', ['δ', ' delta ', +etter delta', ['ε', ' +etter epsilon', ['η', ' eta ', +etter eta', ['θ', ' theta ', +etter theta', ['ι', ' iota ', +etter iota', ['κ', ' kappa ', +etter kappa', ['λ', ' lambda ', +etter lambda', ['μ', ' mu ', +etter mu', ['ν', ' nu ', +etter nu', ['ξ', ' xi ', +etter xi', ['ο', ' +etter omicron', ['π', ' pi ', +etter pi', ['ρ', ' rho ', +etter 
rho', ['ς', ' sigma ', +etter ['σ', ' sigma ', +etter sigma', ['τ', ' tau ', +etter tau', ['υ', ' +etter upsilon', ['φ', ' phi ', +etter phi', ['χ', ' chi ', +etter chi', ['ψ', ' psi ', +etter psi', ['ω', ' omega ', +etter omega', ['ϑ', ' theta ', +etter ['ϒ', ' + ['ϖ', ' pi ', +ol', [' ', ' ', + [' ', ' ', + # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- [' ', ' ', + ['‌', '', +on-joiner', ['‍', '', +oiner', ['‎', '->', +t mark', ['‏', '<-', +t mark', ['', '-', + ['', '-', + ['', '\'', +ion mark', ['', '\'', +tion mark', ['', '\'', +tion mark', ['', '\"', +ion mark', ['', '\"', +tion mark', ['', '\"', +tion mark', ['', '+', + ['', '++', + ['', chr(183), 'bull', +ll circle', ['', '...', +s ['', '%%', + ['′', '\'', +tes = feet', ['‹', '<', +ng ['›', '>', +ing ['‾', '', +pacing overscore', ['⁄', '/', +sh', ['', ' euro ', + ['ℑ', 'I', +capital ['℘', 'P', +al ['ℜ', 'R', +capital ['', '(tm)', + ['ℵ', '', += ['←', '<-', +row', ['↑', '', +w', ['→', '->', +rrow', ['↓', '', +row', ['↔', '', +rrow', ['↵', '<-', +row ['⇐', '<=', +uble arrow', ['⇑', '', +le arrow', # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- ['⇒', '=>', +ouble arrow', ['⇓', '', +uble arrow', ['∀', ' + ['∂', '', +erential', ['∃', '', +', ['∅', '', +null ['∇', '', +ward difference', ['∈', '', + ['∉', '', +nt of', ['∋', '', +member', ['∏', '', +t = product sign', ['∑', '', +on', ['−', '-', + ['∗', '*', +rator', ['√', '', += radical sign', ['∝', '', + to', ['∞', '', + ['∠', '', + ['∧', ' AND ', += wedge', ['∨', ' OR ', + vee', ['∩', '', + = cap', ['∪', '', +, ['∴', '', + ['∼', '~', +or ['≅', '~', +y equal to', ['≈', '', + ['≠', '<>', +', ['≡', '', +', ['≤', '<=', + equal to', ['≥', '>=', + or equal to', ['⊂', '', + ['⊃', '', +, ['⊄', '', + of', ['⊆', '', + equal to', ['⊇', '', +or equal to', ['⊕', '', + = direct 
sum', ['⊗', '', +s # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- ['⊥', '', +thogonal ['⋅', chr(177), +', ['⌈', '', + = apl upstile', ['⌉', '', +g', ['⌊', '', + apl downstile', ['⌋', '', +, ['〈', '<', +g ['◊', '', + ['♠', '', +suit', ['♣', '', +uit = shamrock', ['♥', '', +suit = valentine', ['♦', '', +d suit', # --------------------------------------- +---------- # 0 1 2 3 4 # char equiv entity entity # --------------------------------------- +---------- );	In Section Seekers rather than speed. Guaranteed at least 150% effective! class='codetext'> ---------- 5 description ---------- 34, 'U+0022', 'quotation mark=APL quote', ], 38, 'U+0026', 'ampersand', ], 60, 'U+003C', 'less-than sign',], 62, 'U+003E', 'greater-than sign',], 133, 'U+0085', '',], 150, '', '',], 151, '', '',], 'iexcl', 161, 'U+00A1', 'inverted exclamation mark',], 162, 'U+00A2', 'cent sign', ], 163, 'U+00A3', 'pound sign', ], 'curren', 164, 'U+00A4', 'currency sign', ], 165, 'U+00A5', 'yen sign = yuan sign', ], 'brvbar', 166, 'U+00A6', 'broken vertical bar',], 167, 'U+00A7', 'section sign', ], 168, 'U+00A8', 'diaeresis', ], 169, 'U+00A9', 'copyright sign', ], 170, 'U+00AA', 'feminine ordinal indicator',], character equivalents 'ASCII-compatible' character equiv chr() } ( 0x096, 0x097, 0x058A, 0x1806, 0x208B, 0x2212, 0xFE63, 0xFF0D); chr() } ( 0x02BC, 0x2018..0x201A, 0x2032 ); chr() } ( 0x02EE, 0x201C..0x201E ); chr() } ( 0x2000..0x200B, 0x202F, chr() } ( 0x2022, 0x22C5); :isPhraseMarkup) { /gi; than triple nested html entities ext) 0-9]+;/); hex($1))/eg; whitespace with punctuation to Monks for this code characters with equivalents uiv{$1}:$1/eg; greater than 0xff { meant to be a " {ffff}]/; characters ;//g; excess whitespace /g; $text=~s/\s+$//; ->[3]); 3;}=$entity->[0]; 3;}=$entity->[0]; ;}=$entity->[1] --------------------------- -------------------- 5 codepoint description 
--------------------------- -------------------- 'quot', 34, 'U+0022', 'quotation mark = A ], 'amp', 38, 'U+0026', 'ampersand', ], 'lt', 60, 'U+003C', 'less-than sign', ], 'gt', 62, 'U+003E', 'greater-than sign' ], '', 133, 'U+0085', '', ], '', 150, '', '', ], '', 151, '', '', ], 'iexcl', 161, 'U+00A1', 'inverted exclamati ], 'cent', 162, 'U+00A2', 'cent sign', ], 'pound', 163, 'U+00A3', 'pound sign', ], 'curren', 164, 'U+00A4', 'currency sign', ], 'yen', 165, 'U+00A5', 'yen sign = yuan si ], 'brvbar', 166, 'U+00A6', 'broken bar = broke ], 'sect', 167, 'U+00A7', 'section sign', ], 'uml', 168, 'U+00A8', 'diaeresis = spacin ], 'copy', 169, 'U+00A9', 'copyright sign', ], 'ordf', 170, 'U+00AA', 'feminine ordinal i ], 'laquo', 171, 'U+00AB', 'left-pointing doub angle quotation mark= left pointing guillemet', ], 'not', 172, 'U+00AC', 'not sign', ], 'shy', 173, 'U+00AD', 'soft hyphen = disc ], 'reg', 174, 'U+00AE', 'registered sign = trade mark sign', ], 'macr', 175, 'U+00AF', 'macron = spacing m = overline= APL overbar', ], 'deg', 176, 'U+00B0', 'degree sign', ], 'plusmn', 177, 'U+00B1', 'plus-minus sign = sign', ], 'sup2', 178, 'U+00B2', 'superscript two = digit two= squared', ], 'sup3', 179, 'U+00B3', 'superscript three superscript digit three= cubed', ], 'acute', 180, 'U+00B4', 'acute accent = spa ], 'micro', 181, 'U+00B5', 'micro sign', ], 'para', 182, 'U+00B6', 'pilcrow sign = par ], 'middot', 183, 'U+00B7', 'middle dot = Georg comma= Greek middle dot', ], 'cedil', 184, 'U+00B8', 'cedilla = spacing ], 'sup1', 185, 'U+00B9', 'superscript one = digit one', ], 'ordm', 186, 'U+00BA', 'masculine ordinal ], 'raquo', 187, 'U+00BB', 'right-pointing dou angle quotation mark= right pointing guillemet',], 'frac14', 188, 'U+00BC', 'vulgar fraction on quarter= fraction one quarter', ], 'frac12', 189, 'U+00BD', 'vulgar fraction on half= fraction one half', ], 'frac34', 190, 'U+00BE', 'vulgar fraction th quarters= fraction three quarters', ], 'iquest', 191, 'U+00BF', 'inverted 
question turned question mark', ], --------------------------- -------------------- 5 codepoint description --------------------------- -------------------- 'Agrave', 192, 'U+00C0', 'latin capital lett A with grave= latin capital letter A grave', ], 'Aacute', 193, 'U+00C1', 'latin capital lett ], 'Acirc', 194, 'U+00C2', 'latin capital lett A with circumflex', ], 'Atilde', 195, 'U+00C3', 'latin capital lett ], 'Auml', 196, 'U+00C4', 'latin capital lett A with diaeresis', ], 'Aring', 197, 'U+00C5', 'latin capital lett A with ring above= latin capital letter A ring', ], 'AElig', 198, 'U+00C6', 'latin capital lett AE= latin capital ligature AE', ], 'Ccedil', 199, 'U+00C7', 'latin capital lett C with cedilla', ], 'Egrave', 200, 'U+00C8', 'latin capital lett ], 'Eacute', 201, 'U+00C9', 'latin capital lett ], 'Ecirc', 202, 'U+00CA', 'latin capital lett E with circumflex', ], 'Euml', 203, 'U+00CB', 'latin capital lett E with diaeresis', ], 'Igrave', 204, 'U+00CC', 'latin capital lett ], 'Iacute', 205, 'U+00CD', 'latin capital lett ], 'Icirc', 206, 'U+00CE', 'latin capital lett I with circumflex', ], 'Iuml', 207, 'U+00CF', 'latin capital lett I with diaeresis', ], 'ETH', 208, 'U+00D0', 'latin capital lett ], 'Ntilde', 209, 'U+00D1', 'latin capital lett ], 'Ograve', 210, 'U+00D2', 'latin capital lett ], 'Oacute', 211, 'U+00D3', 'latin capital lett ], 'Ocirc', 212, 'U+00D4', 'latin capital lett O with circumflex', ], 'Otilde', 213, 'U+00D5', 'latin capital lett ], 'Ouml', 214, 'U+00D6', 'latin capital lett O with diaeresis', ], 'times', 215, 'U+00D7', 'multiplication sig ], 'Oslash', 216, 'U+00D8', 'latin capital lett O with stroke= latin capital letter O slash', ], 'Ugrave', 217, 'U+00D9', 'latin capital lett ], 'Uacute', 218, 'U+00DA', 'latin capital lett ], 'Ucirc', 219, 'U+00DB', 'latin capital lett U with circumflex', ], 'Uuml', 220, 'U+00DC', 'latin capital lett U with diaeresis', ], 'Yacute', 221, 'U+00DD', 'latin capital lett ], 'THORN', 222, 'U+00DE', 'latin capital 
lett ], 'szlig', 223, 'U+00DF', 'latin small letter sharp s = ess-zed', ], 'agrave', 224, 'U+00E0', 'latin small letter a with grave= latin small letter a grave', ], 'aacute', 225, 'U+00E1', 'latin small letter ], 'acirc', 226, 'U+00E2', 'latin small letter a with circumflex', ], 'atilde', 227, 'U+00E3', 'latin small letter ], 'auml', 228, 'U+00E4', 'latin small letter a with diaeresis', ], --------------------------- -------------------- 5 codepoint description --------------------------- -------------------- 'aring', 229, 'U+00E5', 'latin small letter a with ring above= latin small letter a ring', ], 'aelig', 230, 'U+00E6', 'latin small letter ae= latin small ligature ae', ], 'ccedil', 231, 'U+00E7', 'latin small letter ], 'egrave', 232, 'U+00E8', 'latin small letter ], 'eacute', 233, 'U+00E9', 'latin small letter ], 'ecirc', 234, 'U+00EA', 'latin small letter e with circumflex', ], 'euml', 235, 'U+00EB', 'latin small letter e with diaeresis', ], 'igrave', 236, 'U+00EC', 'latin small letter ], 'iacute', 237, 'U+00ED', 'latin small letter ], 'icirc', 238, 'U+00EE', 'latin small letter i with circumflex', ], 'iuml', 239, 'U+00EF', 'latin small letter i with diaeresis', ], 'eth', 240, 'U+00F0', 'latin small letter ], 'ntilde', 241, 'U+00F1', 'latin small letter ], 'ograve', 242, 'U+00F2', 'latin small letter ], 'oacute', 243, 'U+00F3', 'latin small letter ], 'ocirc', 244, 'U+00F4', 'latin small letter o with circumflex', ], 'otilde', 245, 'U+00F5', 'latin small letter ], 'ouml', 246, 'U+00F6', 'latin small letter o with diaeresis', ], 'divide', 247, 'U+00F7', 'division sign', ], 'ugrave', 249, 'U+00F9', 'latin small letter ], 'uacute', 250, 'U+00FA', 'latin small letter ], 'ucirc', 251, 'U+00FB', 'latin small letter u with circumflex', ], 'uuml', 252, 'U+00FC', 'latin small letter u with diaeresis', ], 'yacute', 253, 'U+00FD', 'latin small letter ], 'thorn', 254, 'U+00FE', 'latin small letter ], 'yuml', 255, 'U+00FF', 'latin small letter y with diaeresis', ], 
'OElig', 338, 'U+0152', 'latin capital liga ], 'oelig', 339, 'U+0153', 'latin small ligatu ], 'Scaron', 352, 'U+0160', 'latin capital lett ], 'scaron', 353, 'U+0161', 'latin small letter ], 'Yuml', 376, 'U+0178', 'latin capital lett Y with diaeresis', ], 'fnof', 402, 'U+0192', 'latin small f with hook = function= florin', ], 'circ', 710, 'U+02C6', 'modifier letter ci ], 'tilde', 732, 'U+02DC', 'small tilde', ], 'Gamma', 915, 'U+0393', 'greek capital ], 'Delta', 916, 'U+0394', 'greek capital ], 'Theta', 920, 'U+0398', 'greek capital ], --------------------------- -------------------- 5 codepoint description --------------------------- -------------------- 'Lambda', 923, 'U+039B', 'greek capital ], 'Xi', 926, 'U+039E', 'greek capital ], 'Pi', 928, 'U+03A0', 'greek capital ], 'Sigma', 931, 'U+03A3', 'greek capital ], Upsilon ', 'Upsilon', 933, 'U+03A5', 'greek capital ], 'Phi', 934, 'U+03A6', 'greek capital ], 'Psi', 936, 'U+03A8', 'greek capital ], 'Omega', 937, 'U+03A9', 'greek capital ], 'alpha', 945, 'U+03B1', 'greek small l ], 'beta', 946, 'U+03B2', 'greek small l ], 'gamma', 947, 'U+03B3', 'greek small l ], 'delta', 948, 'U+03B4', 'greek small l ], epsilon ', 'epsilon', 949, 'U+03B5', 'greek small l ], 'eta', 951, 'U+03B7', 'greek small l ], 'theta', 952, 'U+03B8', 'greek small l ], 'iota', 953, 'U+03B9', 'greek small l ], 'kappa', 954, 'U+03BA', 'greek small l ], 'lambda', 955, 'U+03BB', 'greek small l ], 'mu', 956, 'U+03BC', 'greek small l ], 'nu', 957, 'U+03BD', 'greek small l ], 'xi', 958, 'U+03BE', 'greek small l ], omicron ', 'omicron', 959, 'U+03BF', 'greek small l ], 'pi', 960, 'U+03C0', 'greek small l ], 'rho', 961, 'U+03C1', 'greek small l ], 'sigmaf', 962, 'U+03C2', 'greek small l final sigma', ], 'sigma', 963, 'U+03C3', 'greek small l ], 'tau', 964, 'U+03C4', 'greek small l ], upsilon ', 'upsilon', 965, 'U+03C5', 'greek small l ], 'phi', 966, 'U+03C6', 'greek small l ], 'chi', 967, 'U+03C7', 'greek small l ], 'psi', 968, 'U+03C8', 'greek small l ], 
'omega', 969, 'U+03C9', 'greek small l ], 'thetasym', 977, 'U+03D1', 'greek small l theta symbol', ], upsilon ', 'upsih', 978, 'U+03D2', 'greek upsilon with hook symbol', ], 'piv', 982, 'U+03D6', 'greek pi symb ], 'ensp', 8194, 'U+2002', 'en space', ], 'emsp', 8195, 'U+2003', 'em space', ], --------------------------- -------------------- 5 codepoint description --------------------------- -------------------- 'thinsp', 8201, 'U+2009', 'thin space', ], 'zwnj', 8204, 'U+200C', 'zero width n ], 'zwj', 8205, 'U+200D', 'zero width j ], 'lrm', 8206, 'U+200E', 'left-to-righ ], 'rlm', 8207, 'U+200F', 'right-to-lef ], 'ndash', 8211, 'U+2013', 'en dash', ], 'mdash', 8212, 'U+2014', 'em dash', ], 'lsquo', 8216, 'U+2018', 'left single quotat ], 'rsquo', 8217, 'U+2019', 'right single quota ], 'sbquo', 8218, 'U+201A', 'single low-9 quota ], 'ldquo', 8220, 'U+201C', 'left double quotat ], 'rdquo', 8221, 'U+201D', 'right double quota ], 'bdquo', 8222, 'U+201E', 'double low-9 quota ], 'dagger', 8224, 'U+2020', 'dagger', ], 'Dagger', 8225, 'U+2021', 'double dagger', ], 8226, 'U+2022', 'bullet = black sma ], 'hellip', 8230, 'U+2026', 'horizontal ellipsi = three dot leader', ], 'permil', 8240, 'U+2030', 'per mille sign', ], 'prime', 8242, 'U+2032', 'prime = minu ], 'lsaquo', 8249, 'U+2039', 'single left-pointi angle quotation mark', ], 'rsaquo', 8250, 'U+203A', 'single right-point angle quotation mark', ], 'oline', 8254, 'U+203E', 'overline = s ], 'frasl', 8260, 'U+2044', 'fraction sla ], 'euro', 8364, 'U+20AC', 'euro sign', ], 'image', 8465, 'U+2111', 'blackletter I = imaginary part', ], 'weierp', 8472, 'U+2118', 'script capit P = power set= Weierstrass p', ], 'real', 8476, 'U+211C', 'blackletter R = real part symbol', ], 'trade', 8482, 'U+2122', 'trade mark sign', ], 'alefsym', 8501, 'U+2135', 'alef symbol first transfinite cardinal', ], 'larr', 8592, 'U+2190', 'leftwards ar ], 'uarr', 8593, 'U+2191', 'upwards arro ], 'rarr', 8594, 'U+2192', 'rightwards a ], 'darr', 8595, 'U+2193', 
'downwards ar ], 'harr', 8596, 'U+2194', 'left right a ], 'crarr', 8629, 'U+21B5', 'downwards ar with corner leftwards= carriage return', ], 'lArr', 8656, 'U+21D0', 'leftwards do ], 'uArr', 8657, 'U+21D1', 'upwards doub ], --------------------------- -------------------- 5 codepoint description --------------------------- -------------------- 'rArr', 8658, 'U+21D2', 'rightwards d ], 'dArr', 8659, 'U+21D3', 'downwards do ], foreach ', 'forall', 8704, 'U+2200', 'for all', ], 'part', 8706, 'U+2202', 'partial diff ], 'exist', 8707, 'U+2203', 'there exists ], 'empty', 8709, 'U+2205', 'empty set = set = diameter', ], 'nabla', 8711, 'U+2207', 'nabla = back ], 'isin', 8712, 'U+2208', 'element of', ], 'notin', 8713, 'U+2209', 'not an eleme ], 'ni', 8715, 'U+220B', 'contains as ], 'prod', 8719, 'U+220F', 'n-ary produc ], 'sum', 8721, 'U+2211', 'n-ary sumati ], 'minus', 8722, 'U+2212', 'minus sign', ], 'lowast', 8727, 'U+2217', 'asterisk ope ], 'radic', 8730, 'U+221A', 'square root ], 'prop', 8733, 'U+221D', 'proportional ], 'infin', 8734, 'U+221E', 'infinity', ], 'ang', 8736, 'U+2220', 'angle', ], 'and', 8743, 'U+2227', 'logical and ], 'or', 8744, 'U+2228', 'logical or = ], 'cap', 8745, 'U+2229', 'intersection ], 'cup', 8746, 'U+222A', 'union = cup' ], 'there4', 8756, 'U+2234', 'therefore', ], 'sim', 8764, 'U+223C', 'tilde operat = varies with = similar to', ], 'cong', 8773, 'U+2245', 'approximatel ], 'asymp', 8776, 'U+2248', 'almost equal to = asymptotic to', ], 'ne', 8800, 'U+2260', 'not equal to ], 'equiv', 8801, 'U+2261', 'identical to ], 'le', 8804, 'U+2264', 'less-than or ], 'ge', 8805, 'U+2265', 'greater-than ], 'sub', 8834, 'U+2282', 'subset of', ], 'sup', 8835, 'U+2283', 'superset of' ], 'nsub', 8836, 'U+2284', 'not a subset ], 'sube', 8838, 'U+2286', 'subset of or ], 'supe', 8839, 'U+2287', 'superset of ], 'oplus', 8853, 'U+2295', 'circled plus ], 'otimes', 8855, 'U+2297', 'circled time = vector product', ], --------------------------- -------------------- 5 
codepoint description --------------------------- -------------------- 'perp', 8869, 'U+22A5', 'up tack = or to = perpendicular', ], 'sdot', 8901, 'U+22C5', 'dot operator ], 'lceil', 8968, 'U+2308', 'left ceiling ], 'rceil', 8969, 'U+2309', 'right ceilin ], 'lfloor', 8970, 'U+230A', 'left floor = ], 'rfloor', 8971, 'U+230B', 'right floor' ], 'lang', 9001, 'U+2329', 'left-pointin angle bracket = bra', ], 'loz', 9674, 'U+25CA', 'lozenge', ], 'spades', 9824, 'U+2660', 'black spade ], 'clubs', 9827, 'U+2663', 'black club s ], 'hearts', 9829, 'U+2665', 'black heart ], 'diams', 9830, 'U+2666', 'black diamon ], --------------------------- -------------------- 5 codepoint description --------------------------- -------------------- href="?node_id=547944;part=1;displaytype=displaycode;abspart=1">[download] align='left' valign='bottom'>[reply] [d/l]

Replies are listed 'Best First'.

Re: unicode normalization
by mscudder (Initiate) on May 08, 2006 at 00:34 UTC

My solution is below, designed for thoroughness rather than speed. Guaranteed at least 150% effective!

Regards,
Michael

+alents

my $_dashes=join '', map { 0x2010..0x2015, 0x2053, 0x207B, my $_squots=join '', map { my $_dquots=join '', map { my $_spaces=join '', map { 0x205F, 0x3000); my $_dots =join '', map {

sub scrub { my $text = shift; return "" if !$text;

# remove HTML phrasal level tags foreach my $markup (keys %HTML::Tagset: $text=~s/<\s?\/?$markup\s?>/ }

#decode html entities for (1..3) { # assume no more HTML::Entities::decode_entities($t if ($text=~/&#?[a-zA-Z }

# replace character escapes $text=~s/%([0-9A-Fa-f]{2})/chr(

# replace 'wide character' # ascii-compatible whitespace $text=~s/\s/ /g; $text=~s/[$_spaces]/ /g;

# transliterate 'wide character' # ascii-compatible equivalents # with thanks to graff of Perl $text=~s/[$_dashes]/-/g; $text=~s/[$_squots]/\'/g; $text=~s/[$_dquots]/"/g; $text=~s/[$_dots]/\x{00B7}/g;

# replace remaining 'wide' # (my preferred) ascii-compatible $text=~s/(.)/$_char2equiv{$1}?$_char2eq

# unidecode any remaining characters if ($text=~/[\x{100}-\x{ffff}]/) my @chars=split //, $text; # "Text::Unidecode is # transliterator-of-last resort,... foreach my $char (@chars) { $char=unidecode($char) if $char=~/[\x{0100}-\x }

$text=join '', @chars;

# strip out remaining 'wide' $text=~s/[\x{0100}-\x{ffff}] }

# trim leading, trailing, and $text=~s/^\s+//; $text=~s/\s{2,}/

return $text; }

####################################### # initialization                      # ####################################### BEGIN { foreach my $entity (@_html_entities) { $entity->[0]=chr($entity $_entity2char{$entity->[2	 $_entity2char{$entity->[3	 $_char2equiv{$entity->[0] if $entity->[1]; } }

my @_html_entities=( #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   --------------------------------------- +---------- ['"', '', +PL quote', ['&', '', + ['<', '', + ['>', '', +, ['',  '...', + ['',  '-', + ['',  '-', + ['Ą', '', +on mark', ['˘', '', + ['Ł', '', + ['¤', '', + ['Ľ', '', +gn', ['Ś', '', +n vertical bar', ['§', '', + ['¨', '', +g diaeresis', ['Š', '', + ['Ş', '', +ndicator', ['Ť', '', +le ['Ź', '', + ['', '', +retionary hyphen', ['Ž', '', +registered ['Ż', '', +acron ['°', '', + ['ą', '', +plus-or-minus ['˛', '', +superscript ['ł', '', += ['´', '', +cing acute', ['ľ', '', + ['ś', '', +agraph sign', ['ˇ', '', +ian ['¸', '', +cedilla', ['š', '', +superscript ['ş', '', +indicator', ['ť', '', +ble ['ź', '', +e ['˝', '', +e ['ž', '', +ree ['ż', '', +mark= #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   --------------------------------------- +---------- ['Ŕ', '', +er ['Á', '', +er A with acute', ['Â', '', +er ['Ă', '', +er A with tilde', ['Ä', '', +er ['Ĺ', '', +er ['Ć', '', +er ['Ç', '', +er ['Č', '', +er E with grave', ['É', '', +er E with acute', ['Ę', '', +er ['Ë', '', +er ['Ě', '', +er I with grave', ['Í', '', +er I with acute', ['Î', '', +er ['Ď', '', +er ['Đ', '', +er ETH', ['Ń', '', +er N with tilde', ['Ň', '', +er O with grave', ['Ó', '', +er O with acute', ['Ô', '', +er ['Ő', '', +er O with tilde', ['Ö', '', +er ['×', '', +n', ['Ř', '', +er ['Ů', '', +er U with grave', ['Ú', '', +er U with acute', ['Ű', '', +er ['Ü', '', +er ['Ý', '', +er Y with acute', ['Ţ', '', +er THORN', ['ß', '', + ['ŕ', '', + ['á', '', + a with acute', ['â', '', + ['ă', '', + a with tilde', ['ä', '', + #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   
--------------------------------------- +---------- ['ĺ', '', + ['ć', '', + ['ç', '', + c with cedilla', ['č', '', + e with grave', ['é', '', + e with acute', ['ę', '', + ['ë', '', + ['ě', '', + i with grave', ['í', '', + i with acute', ['î', '', + ['ď', '', + ['đ', '', + eth', ['ń', '', + n with tilde', ['ň', '', + o with grave', ['ó', '', + o with acute', ['ô', '', + ['ő', '', + o with tilde', ['ö', '', + ['÷', '', + ['ů', '', + u with grave', ['ú', '', + u with acute', ['ű', '', + ['ü', '', + ['ý', '', + y with acute', ['ţ', '', + thorn', ['˙', '', + ['', 'OE', +ture OE', ['', 'oe', +re oe', ['', 'S', +er S with caron', ['', 's', + s with caron', ['', 'Y', +er ['', 'f', + ['', '', +rcumflex accent', ['', '', + ['&#915;', ' Gamma ', + letter gamma', ['&#916;', ' Delta ', + letter delta', ['&#920;', ' Theta ', + letter theta', #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   --------------------------------------- +---------- ['&#923;', ' Lambda ', + letter lambda', ['&#926;', ' Xi ', + letter xi', ['&#928;', ' Pi ', + letter pi', ['&#931;', ' Sigma ', + letter sigma', ['&#933;', ' + letter upsilon', ['&#934;', ' Phi ', + letter phi', ['&#936;', ' Psi ', + letter psi', ['&#937;', ' Omega ', + letter omega', ['&#945;', ' alpha ', +etter alpha', ['&#946;', ' beta ', +etter beta', ['&#947;', ' gamma ', +etter gamma', ['&#948;', ' delta ', +etter delta', ['&#949;', ' +etter epsilon', ['&#951;', ' eta ', +etter eta', ['&#952;', ' theta ', +etter theta', ['&#953;', ' iota ', +etter iota', ['&#954;', ' kappa ', +etter kappa', ['&#955;', ' lambda ', +etter lambda', ['&#956;', ' mu ', +etter mu', ['&#957;', ' nu ', +etter nu', ['&#958;', ' xi ', +etter xi', ['&#959;', ' +etter omicron', ['&#960;', ' pi ', +etter pi', ['&#961;', ' rho ', +etter rho', ['&#962;', ' sigma ', +etter ['&#963;', ' sigma ', +etter sigma', ['&#964;', ' tau ', +etter tau', ['&#965;', ' +etter upsilon', 
['&#966;', ' phi ', +etter phi', ['&#967;', ' chi ', +etter chi', ['&#968;', ' psi ', +etter psi', ['&#969;', ' omega ', +etter omega', ['&#977;', ' theta ', +etter ['&#978;', ' + ['&#982;', ' pi ', +ol', ['&#8194;', ' ', + ['&#8195;', ' ', + #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   --------------------------------------- +---------- ['&#8201;', ' ', + ['&#8204;', '', +on-joiner', ['&#8205;', '', +oiner', ['&#8206;', '->', +t mark', ['&#8207;', '<-', +t mark', ['', '-', + ['', '-', + ['', '\'', +ion mark', ['', '\'', +tion mark', ['', '\'', +tion mark', ['', '\"', +ion mark', ['', '\"', +tion mark', ['', '\"', +tion mark', ['', '+', + ['', '++', + ['', chr(183),    'bull', +ll circle', ['', '...', +s ['', '%%', + ['&#8242;', '\'', +tes = feet', ['‹', '<', +ng ['›', '>', +ing ['&#8254;', '', +pacing overscore', ['&#8260;', '/', +sh', ['', ' euro ', + ['&#8465;', 'I', +capital ['&#8472;', 'P', +al ['&#8476;', 'R', +capital ['', '(tm)', + ['&#8501;', '', += ['&#8592;', '<-', +row', ['&#8593;', '', +w', ['&#8594;', '->', +rrow', ['&#8595;', '', +row', ['&#8596;', '', +rrow', ['&#8629;', '<-', +row ['&#8656;', '<=', +uble arrow', ['&#8657;', '', +le arrow', #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   --------------------------------------- +---------- ['&#8658;', '=>', +ouble arrow', ['&#8659;', '', +uble arrow', ['&#8704;', ' + ['&#8706;', '', +erential', ['&#8707;', '', +', ['&#8709;', '', +null ['&#8711;', '', +ward difference', ['&#8712;', '', + ['&#8713;', '', +nt of', ['&#8715;', '', +member', ['&#8719;', '', +t = product sign', ['&#8721;', '', +on', ['&#8722;', '-', + ['&#8727;', '*', +rator', ['&#8730;', '', += radical sign', ['&#8733;', '', + to', ['&#8734;', '', + ['&#8736;', '', + ['&#8743;', ' AND ', += wedge', ['&#8744;', ' OR ', + vee', ['&#8745;', 
'', + = cap', ['&#8746;', '', +, ['&#8756;', '', + ['&#8764;', '~', +or ['&#8773;', '~', +y equal to', ['&#8776;', '', + ['&#8800;', '<>', +', ['&#8801;', '', +', ['&#8804;', '<=', + equal to', ['&#8805;', '>=', + or equal to', ['&#8834;', '', + ['&#8835;', '', +, ['&#8836;', '', + of', ['&#8838;', '', + equal to', ['&#8839;', '', +or equal to', ['&#8853;', '', + = direct sum', ['&#8855;', '', +s #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   --------------------------------------- +---------- ['&#8869;', '', +thogonal ['&#8901;', chr(177), +', ['&#8968;', '', + = apl upstile', ['&#8969;', '', +g', ['&#8970;', '', + apl downstile', ['&#8971;', '', +, ['&#9001;', '<', +g ['&#9674;', '', + ['&#9824;', '', +suit', ['&#9827;', '', +uit = shamrock', ['&#9829;', '', +suit = valentine', ['&#9830;', '', +d suit', #   --------------------------------------- +---------- #     0    1             2           3      4 #   char  equiv         entity    entity #   --------------------------------------- +---------- );

In Section: Seekers of Perl Wisdom