Many thanks to wfsp and graff for your very helpful suggestions, code, and referral to The Björk Situation.
My solution is below; it is designed for thoroughness rather than speed. Guaranteed at least 150% effective!
Regards,
Michael
use HTML::Entities;
use HTML::Tagset;
# Abbreviated sample of the entity table (the complete table appears further
# down in this file).  Every row is an array ref with six slots:
#
#   [0] char        the literal character
#   [1] equiv       preferred ASCII-compatible replacement ('' = none)
#   [2] entity      HTML entity name ('' = none)
#   [3] entity      HTML entity number
#   [4] codepoint   'U+XXXX' notation
#   [5] description
my @_html_entities = (
    [ '"', '',    'quot',   34,  'U+0022', 'quotation mark=APL quote'   ],
    [ '&', '',    'amp',    38,  'U+0026', 'ampersand'                  ],
    [ '<', '',    'lt',     60,  'U+003C', 'less-than sign'             ],
    [ '>', '',    'gt',     62,  'U+003E', 'greater-than sign'          ],
    [ '',  '...', '',       133, 'U+0085', ''                           ],
    [ '',  '-',   '',       150, '',       ''                           ],
    [ '',  '-',   '',       151, '',       ''                           ],
    [ '¡', '',    'iexcl',  161, 'U+00A1', 'inverted exclamation mark'  ],
    [ '¢', '',    'cent',   162, 'U+00A2', 'cent sign'                  ],
    [ '£', '',    'pound',  163, 'U+00A3', 'pound sign'                 ],
    [ '¤', '',    'curren', 164, 'U+00A4', 'currency sign'              ],
    [ '¥', '',    'yen',    165, 'U+00A5', 'yen sign = yuan sign'       ],
    [ '¦', '',    'brvbar', 166, 'U+00A6', 'broken vertical bar'        ],
    [ '§', '',    'sect',   167, 'U+00A7', 'section sign'               ],
    [ '¨', '',    'uml',    168, 'U+00A8', 'diaeresis'                  ],
    [ '©', '',    'copy',   169, 'U+00A9', 'copyright sign'             ],
    [ 'ª', '',    'ordf',   170, 'U+00AA', 'feminine ordinal indicator' ],
    # etc. (complete table below)
);
# Lookup tables.  Both start out empty here; they are filled from
# @_html_entities by the initialization code further down.
my %_entity2char = ();    # HTML entity name / number => character
my %_char2equiv  = ();    # character => my preferred 'ASCII-compatible' equivalent

# Strings of characters that scrub() interpolates into [...] character
# classes.  None of these code points is a regex metacharacter, so they can
# be interpolated without escaping.
my $_dashes = join '', map { chr($_) } (
    0x096, 0x097, 0x058A, 0x1806,
    0x2010 .. 0x2015, 0x2053, 0x207B, 0x208B, 0x2212, 0xFE63, 0xFF0D,
);
my $_squots = join '', map { chr($_) } ( 0x02BC, 0x2018 .. 0x201A, 0x2032 );
my $_dquots = join '', map { chr($_) } ( 0x02EE, 0x201C .. 0x201E );
my $_spaces = join '', map { chr($_) } (
    0x2000 .. 0x200B, 0x202F, 0x205F, 0x3000,
);
my $_dots   = join '', map { chr($_) } ( 0x2022, 0x22C5 );
# scrub($text)
#
# Reduce a piece of HTML-ish text to plain text containing no character
# above 0xff.  In order, it:
#   1. replaces bare phrase-level tags (<b>, </em>, ...) with a space,
#   2. decodes HTML entities (up to three nested levels),
#   3. decodes %XX escapes,
#   4. maps exotic whitespace, dashes, quotes and dots to simple equivalents,
#   5. applies the per-character replacements in %_char2equiv,
#   6. transliterates whatever is still above 0xff with Text::Unidecode and
#      strips anything that survives,
#   7. trims and collapses whitespace.
#
# Returns the cleaned string; returns "" for undefined or empty input.
sub scrub {
    my $text = shift;

    # Test definedness/length rather than truth so the string "0" is kept.
    return "" unless defined $text && length $text;

    # remove HTML phrasal level tags
    # (one alternation instead of one substitution pass per tag name; the
    # length guard keeps an empty tag list from matching "<>")
    my $phrase_tags = join '|',
        map { quotemeta } keys %HTML::Tagset::isPhraseMarkup;
    $text =~ s/<\s?\/?(?:$phrase_tags)\s?>/ /gi if length $phrase_tags;

    # decode html entities
    for my $pass (1 .. 3) {    # assume no more than triple nested html entities
        HTML::Entities::decode_entities($text)
            if $text =~ /&#?[a-zA-Z0-9]+;/;
    }

    # replace character escapes
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    # replace 'wide character' whitespace with
    # ascii-compatible whitespace
    $text =~ s/\s/ /g;
    $text =~ s/[$_spaces]/ /g;

    # transliterate 'wide character' punctuation to
    # ascii-compatible equivalents
    # with thanks to graff of Perl Monks for this code
    $text =~ s/[$_dashes]/-/g;
    $text =~ s/[$_squots]/\'/g;
    $text =~ s/[$_dquots]/"/g;
    $text =~ s/[$_dots]/\x{00B7}/g;

    # replace remaining 'wide' characters with
    # (my preferred) ascii-compatible equivalents
    $text =~ s/(.)/$_char2equiv{$1} ? $_char2equiv{$1} : $1/eg;

    # unidecode any remaining characters greater than 0xff
    # (the negated class also covers code points above 0xffff)
    if ($text =~ /[^\x{00}-\x{ff}]/) {
        # "Text::Unidecode is meant to be a
        # transliterator-of-last resort,..."
        # Loaded on demand and called fully qualified, because nothing at
        # the top of this file imports unidecode().
        require Text::Unidecode;
        $text =~ s/([^\x{00}-\x{ff}])/Text::Unidecode::unidecode($1)/eg;

        # strip out remaining 'wide' characters
        $text =~ s/[^\x{00}-\x{ff}]//g;
    }

    # trim leading, trailing, and excess whitespace
    $text =~ s/^\s+//;
    $text =~ s/\s{2,}/ /g;
    $text =~ s/\s+$//;

    return $text;
}
#######################################
# initialization #
#######################################
# Fill %_entity2char and %_char2equiv from @_html_entities.
#
# This is deliberately an ordinary run-time block rather than BEGIN: a BEGIN
# block runs while the file is still being compiled, before the run-time
# assignment that puts the rows into @_html_entities, so it would loop over
# an empty array (and the later run-time "= ()" on the two hashes would wipe
# anything it stored).
{
    foreach my $entity (@_html_entities) {
        my ($equiv, $name, $number) = @{$entity}[ 1, 2, 3 ];

        # Derive the character from the entity number so the table does not
        # depend on how the literal in column 0 was encoded in the source.
        my $char = chr($number);
        $entity->[0] = $char;

        # Rows without an entity name are indexed by number only.
        $_entity2char{$name}   = $char if length $name;
        $_entity2char{$number} = $char;

        $_char2equiv{$char} = $equiv if $equiv;
    }
}
# Complete entity table (the full version of the abbreviated sample near the
# top of this file).  Every row is an array ref with six slots:
#
#   [0] char        the literal character, for the reader's benefit only; the
#                   initialization loop overwrites it with chr(entity number).
#                   Left as '' for invisible and whitespace characters.
#   [1] equiv       preferred ASCII-compatible replacement ('' = none)
#   [2] entity      HTML entity name ('' = none)
#   [3] entity      HTML entity number
#   [4] codepoint   'U+XXXX' notation
#   [5] description
#
# NOTE(review): a few HTML 4 entities have no row here (e.g. nbsp 160,
# oslash 248, rang 9002, several Greek letters) - confirm whether that is
# intentional.
my @_html_entities=(
    # ---------------------------------------------------------------------
    # 0     1            2           3     4         5
    # char  equiv        entity      entity codepoint description
    # ---------------------------------------------------------------------
    [ '"',  '',          'quot',     34,   'U+0022', 'quotation mark = APL quote' ],
    [ '&',  '',          'amp',      38,   'U+0026', 'ampersand' ],
    [ '<',  '',          'lt',       60,   'U+003C', 'less-than sign' ],
    [ '>',  '',          'gt',       62,   'U+003E', 'greater-than sign' ],
    [ '',   '...',       '',         133,  'U+0085', '' ],
    [ '',   '-',         '',         150,  '',       '' ],
    [ '',   '-',         '',         151,  '',       '' ],
    [ '¡',  '',          'iexcl',    161,  'U+00A1', 'inverted exclamation mark' ],
    [ '¢',  '',          'cent',     162,  'U+00A2', 'cent sign' ],
    [ '£',  '',          'pound',    163,  'U+00A3', 'pound sign' ],
    [ '¤',  '',          'curren',   164,  'U+00A4', 'currency sign' ],
    [ '¥',  '',          'yen',      165,  'U+00A5', 'yen sign = yuan sign' ],
    [ '¦',  '',          'brvbar',   166,  'U+00A6', 'broken bar = broken vertical bar' ],
    [ '§',  '',          'sect',     167,  'U+00A7', 'section sign' ],
    [ '¨',  '',          'uml',      168,  'U+00A8', 'diaeresis = spacing diaeresis' ],
    [ '©',  '',          'copy',     169,  'U+00A9', 'copyright sign' ],
    [ 'ª',  '',          'ordf',     170,  'U+00AA', 'feminine ordinal indicator' ],
    [ '«',  '',          'laquo',    171,  'U+00AB', 'left-pointing double angle quotation mark= left pointing guillemet' ],
    [ '¬',  '',          'not',      172,  'U+00AC', 'not sign' ],
    [ '',   '',          'shy',      173,  'U+00AD', 'soft hyphen = discretionary hyphen' ],
    [ '®',  '',          'reg',      174,  'U+00AE', 'registered sign = registered trade mark sign' ],
    [ '¯',  '',          'macr',     175,  'U+00AF', 'macron = spacing macron = overline= APL overbar' ],
    [ '°',  '',          'deg',      176,  'U+00B0', 'degree sign' ],
    [ '±',  '',          'plusmn',   177,  'U+00B1', 'plus-minus sign = plus-or-minus sign' ],
    [ '²',  '',          'sup2',     178,  'U+00B2', 'superscript two = superscript digit two= squared' ],
    [ '³',  '',          'sup3',     179,  'U+00B3', 'superscript three = superscript digit three= cubed' ],
    [ '´',  '',          'acute',    180,  'U+00B4', 'acute accent = spacing acute' ],
    [ 'µ',  '',          'micro',    181,  'U+00B5', 'micro sign' ],
    [ '¶',  '',          'para',     182,  'U+00B6', 'pilcrow sign = paragraph sign' ],
    [ '·',  '',          'middot',   183,  'U+00B7', 'middle dot = Georgian comma= Greek middle dot' ],
    [ '¸',  '',          'cedil',    184,  'U+00B8', 'cedilla = spacing cedilla' ],
    [ '¹',  '',          'sup1',     185,  'U+00B9', 'superscript one = superscript digit one' ],
    [ 'º',  '',          'ordm',     186,  'U+00BA', 'masculine ordinal indicator' ],
    [ '»',  '',          'raquo',    187,  'U+00BB', 'right-pointing double angle quotation mark= right pointing guillemet' ],
    [ '¼',  '',          'frac14',   188,  'U+00BC', 'vulgar fraction one quarter= fraction one quarter' ],
    [ '½',  '',          'frac12',   189,  'U+00BD', 'vulgar fraction one half= fraction one half' ],
    [ '¾',  '',          'frac34',   190,  'U+00BE', 'vulgar fraction three quarters= fraction three quarters' ],
    [ '¿',  '',          'iquest',   191,  'U+00BF', 'inverted question mark= turned question mark' ],
    [ 'À',  '',          'Agrave',   192,  'U+00C0', 'latin capital letter A with grave= latin capital letter A grave' ],
    [ 'Á',  '',          'Aacute',   193,  'U+00C1', 'latin capital letter A with acute' ],
    [ 'Â',  '',          'Acirc',    194,  'U+00C2', 'latin capital letter A with circumflex' ],
    [ 'Ã',  '',          'Atilde',   195,  'U+00C3', 'latin capital letter A with tilde' ],
    [ 'Ä',  '',          'Auml',     196,  'U+00C4', 'latin capital letter A with diaeresis' ],
    [ 'Å',  '',          'Aring',    197,  'U+00C5', 'latin capital letter A with ring above= latin capital letter A ring' ],
    [ 'Æ',  '',          'AElig',    198,  'U+00C6', 'latin capital letter AE= latin capital ligature AE' ],
    [ 'Ç',  '',          'Ccedil',   199,  'U+00C7', 'latin capital letter C with cedilla' ],
    [ 'È',  '',          'Egrave',   200,  'U+00C8', 'latin capital letter E with grave' ],
    [ 'É',  '',          'Eacute',   201,  'U+00C9', 'latin capital letter E with acute' ],
    [ 'Ê',  '',          'Ecirc',    202,  'U+00CA', 'latin capital letter E with circumflex' ],
    [ 'Ë',  '',          'Euml',     203,  'U+00CB', 'latin capital letter E with diaeresis' ],
    [ 'Ì',  '',          'Igrave',   204,  'U+00CC', 'latin capital letter I with grave' ],
    [ 'Í',  '',          'Iacute',   205,  'U+00CD', 'latin capital letter I with acute' ],
    [ 'Î',  '',          'Icirc',    206,  'U+00CE', 'latin capital letter I with circumflex' ],
    [ 'Ï',  '',          'Iuml',     207,  'U+00CF', 'latin capital letter I with diaeresis' ],
    [ 'Ð',  '',          'ETH',      208,  'U+00D0', 'latin capital letter ETH' ],
    [ 'Ñ',  '',          'Ntilde',   209,  'U+00D1', 'latin capital letter N with tilde' ],
    [ 'Ò',  '',          'Ograve',   210,  'U+00D2', 'latin capital letter O with grave' ],
    [ 'Ó',  '',          'Oacute',   211,  'U+00D3', 'latin capital letter O with acute' ],
    [ 'Ô',  '',          'Ocirc',    212,  'U+00D4', 'latin capital letter O with circumflex' ],
    [ 'Õ',  '',          'Otilde',   213,  'U+00D5', 'latin capital letter O with tilde' ],
    [ 'Ö',  '',          'Ouml',     214,  'U+00D6', 'latin capital letter O with diaeresis' ],
    [ '×',  '',          'times',    215,  'U+00D7', 'multiplication sign' ],
    [ 'Ø',  '',          'Oslash',   216,  'U+00D8', 'latin capital letter O with stroke= latin capital letter O slash' ],
    [ 'Ù',  '',          'Ugrave',   217,  'U+00D9', 'latin capital letter U with grave' ],
    [ 'Ú',  '',          'Uacute',   218,  'U+00DA', 'latin capital letter U with acute' ],
    [ 'Û',  '',          'Ucirc',    219,  'U+00DB', 'latin capital letter U with circumflex' ],
    [ 'Ü',  '',          'Uuml',     220,  'U+00DC', 'latin capital letter U with diaeresis' ],
    [ 'Ý',  '',          'Yacute',   221,  'U+00DD', 'latin capital letter Y with acute' ],
    [ 'Þ',  '',          'THORN',    222,  'U+00DE', 'latin capital letter THORN' ],
    [ 'ß',  '',          'szlig',    223,  'U+00DF', 'latin small letter sharp s = ess-zed' ],
    [ 'à',  '',          'agrave',   224,  'U+00E0', 'latin small letter a with grave= latin small letter a grave' ],
    [ 'á',  '',          'aacute',   225,  'U+00E1', 'latin small letter a with acute' ],
    [ 'â',  '',          'acirc',    226,  'U+00E2', 'latin small letter a with circumflex' ],
    [ 'ã',  '',          'atilde',   227,  'U+00E3', 'latin small letter a with tilde' ],
    [ 'ä',  '',          'auml',     228,  'U+00E4', 'latin small letter a with diaeresis' ],
    [ 'å',  '',          'aring',    229,  'U+00E5', 'latin small letter a with ring above= latin small letter a ring' ],
    [ 'æ',  '',          'aelig',    230,  'U+00E6', 'latin small letter ae= latin small ligature ae' ],
    [ 'ç',  '',          'ccedil',   231,  'U+00E7', 'latin small letter c with cedilla' ],
    [ 'è',  '',          'egrave',   232,  'U+00E8', 'latin small letter e with grave' ],
    [ 'é',  '',          'eacute',   233,  'U+00E9', 'latin small letter e with acute' ],
    [ 'ê',  '',          'ecirc',    234,  'U+00EA', 'latin small letter e with circumflex' ],
    [ 'ë',  '',          'euml',     235,  'U+00EB', 'latin small letter e with diaeresis' ],
    [ 'ì',  '',          'igrave',   236,  'U+00EC', 'latin small letter i with grave' ],
    [ 'í',  '',          'iacute',   237,  'U+00ED', 'latin small letter i with acute' ],
    [ 'î',  '',          'icirc',    238,  'U+00EE', 'latin small letter i with circumflex' ],
    [ 'ï',  '',          'iuml',     239,  'U+00EF', 'latin small letter i with diaeresis' ],
    [ 'ð',  '',          'eth',      240,  'U+00F0', 'latin small letter eth' ],
    [ 'ñ',  '',          'ntilde',   241,  'U+00F1', 'latin small letter n with tilde' ],
    [ 'ò',  '',          'ograve',   242,  'U+00F2', 'latin small letter o with grave' ],
    [ 'ó',  '',          'oacute',   243,  'U+00F3', 'latin small letter o with acute' ],
    [ 'ô',  '',          'ocirc',    244,  'U+00F4', 'latin small letter o with circumflex' ],
    [ 'õ',  '',          'otilde',   245,  'U+00F5', 'latin small letter o with tilde' ],
    [ 'ö',  '',          'ouml',     246,  'U+00F6', 'latin small letter o with diaeresis' ],
    [ '÷',  '',          'divide',   247,  'U+00F7', 'division sign' ],
    [ 'ù',  '',          'ugrave',   249,  'U+00F9', 'latin small letter u with grave' ],
    [ 'ú',  '',          'uacute',   250,  'U+00FA', 'latin small letter u with acute' ],
    [ 'û',  '',          'ucirc',    251,  'U+00FB', 'latin small letter u with circumflex' ],
    [ 'ü',  '',          'uuml',     252,  'U+00FC', 'latin small letter u with diaeresis' ],
    [ 'ý',  '',          'yacute',   253,  'U+00FD', 'latin small letter y with acute' ],
    [ 'þ',  '',          'thorn',    254,  'U+00FE', 'latin small letter thorn' ],
    [ 'ÿ',  '',          'yuml',     255,  'U+00FF', 'latin small letter y with diaeresis' ],
    [ 'Œ',  'OE',        'OElig',    338,  'U+0152', 'latin capital ligature OE' ],
    [ 'œ',  'oe',        'oelig',    339,  'U+0153', 'latin small ligature oe' ],
    [ 'Š',  'S',         'Scaron',   352,  'U+0160', 'latin capital letter S with caron' ],
    [ 'š',  's',         'scaron',   353,  'U+0161', 'latin small letter s with caron' ],
    [ 'Ÿ',  'Y',         'Yuml',     376,  'U+0178', 'latin capital letter Y with diaeresis' ],
    [ 'ƒ',  'f',         'fnof',     402,  'U+0192', 'latin small f with hook = function= florin' ],
    [ 'ˆ',  '',          'circ',     710,  'U+02C6', 'modifier letter circumflex accent' ],
    [ '˜',  '',          'tilde',    732,  'U+02DC', 'small tilde' ],
    [ 'Γ',  ' Gamma ',   'Gamma',    915,  'U+0393', 'greek capital letter gamma' ],
    [ 'Δ',  ' Delta ',   'Delta',    916,  'U+0394', 'greek capital letter delta' ],
    [ 'Θ',  ' Theta ',   'Theta',    920,  'U+0398', 'greek capital letter theta' ],
    [ 'Λ',  ' Lambda ',  'Lambda',   923,  'U+039B', 'greek capital letter lambda' ],
    [ 'Ξ',  ' Xi ',      'Xi',       926,  'U+039E', 'greek capital letter xi' ],
    [ 'Π',  ' Pi ',      'Pi',       928,  'U+03A0', 'greek capital letter pi' ],
    [ 'Σ',  ' Sigma ',   'Sigma',    931,  'U+03A3', 'greek capital letter sigma' ],
    [ 'Υ',  ' Upsilon ', 'Upsilon',  933,  'U+03A5', 'greek capital letter upsilon' ],
    [ 'Φ',  ' Phi ',     'Phi',      934,  'U+03A6', 'greek capital letter phi' ],
    [ 'Ψ',  ' Psi ',     'Psi',      936,  'U+03A8', 'greek capital letter psi' ],
    [ 'Ω',  ' Omega ',   'Omega',    937,  'U+03A9', 'greek capital letter omega' ],
    [ 'α',  ' alpha ',   'alpha',    945,  'U+03B1', 'greek small letter alpha' ],
    [ 'β',  ' beta ',    'beta',     946,  'U+03B2', 'greek small letter beta' ],
    [ 'γ',  ' gamma ',   'gamma',    947,  'U+03B3', 'greek small letter gamma' ],
    [ 'δ',  ' delta ',   'delta',    948,  'U+03B4', 'greek small letter delta' ],
    [ 'ε',  ' epsilon ', 'epsilon',  949,  'U+03B5', 'greek small letter epsilon' ],
    [ 'η',  ' eta ',     'eta',      951,  'U+03B7', 'greek small letter eta' ],
    [ 'θ',  ' theta ',   'theta',    952,  'U+03B8', 'greek small letter theta' ],
    [ 'ι',  ' iota ',    'iota',     953,  'U+03B9', 'greek small letter iota' ],
    [ 'κ',  ' kappa ',   'kappa',    954,  'U+03BA', 'greek small letter kappa' ],
    [ 'λ',  ' lambda ',  'lambda',   955,  'U+03BB', 'greek small letter lambda' ],
    [ 'μ',  ' mu ',      'mu',       956,  'U+03BC', 'greek small letter mu' ],
    [ 'ν',  ' nu ',      'nu',       957,  'U+03BD', 'greek small letter nu' ],
    [ 'ξ',  ' xi ',      'xi',       958,  'U+03BE', 'greek small letter xi' ],
    [ 'ο',  ' omicron ', 'omicron',  959,  'U+03BF', 'greek small letter omicron' ],
    [ 'π',  ' pi ',      'pi',       960,  'U+03C0', 'greek small letter pi' ],
    [ 'ρ',  ' rho ',     'rho',      961,  'U+03C1', 'greek small letter rho' ],
    [ 'ς',  ' sigma ',   'sigmaf',   962,  'U+03C2', 'greek small letter final sigma' ],
    [ 'σ',  ' sigma ',   'sigma',    963,  'U+03C3', 'greek small letter sigma' ],
    [ 'τ',  ' tau ',     'tau',      964,  'U+03C4', 'greek small letter tau' ],
    [ 'υ',  ' upsilon ', 'upsilon',  965,  'U+03C5', 'greek small letter upsilon' ],
    [ 'φ',  ' phi ',     'phi',      966,  'U+03C6', 'greek small letter phi' ],
    [ 'χ',  ' chi ',     'chi',      967,  'U+03C7', 'greek small letter chi' ],
    [ 'ψ',  ' psi ',     'psi',      968,  'U+03C8', 'greek small letter psi' ],
    [ 'ω',  ' omega ',   'omega',    969,  'U+03C9', 'greek small letter omega' ],
    [ 'ϑ',  ' theta ',   'thetasym', 977,  'U+03D1', 'greek small letter theta symbol' ],
    [ 'ϒ',  ' upsilon ', 'upsih',    978,  'U+03D2', 'greek upsilon with hook symbol' ],
    [ 'ϖ',  ' pi ',      'piv',      982,  'U+03D6', 'greek pi symbol' ],
    [ '',   ' ',         'ensp',     8194, 'U+2002', 'en space' ],
    [ '',   ' ',         'emsp',     8195, 'U+2003', 'em space' ],
    [ '',   ' ',         'thinsp',   8201, 'U+2009', 'thin space' ],
    [ '',   '',          'zwnj',     8204, 'U+200C', 'zero width non-joiner' ],
    [ '',   '',          'zwj',      8205, 'U+200D', 'zero width joiner' ],
    [ '',   '->',        'lrm',      8206, 'U+200E', 'left-to-right mark' ],
    [ '',   '<-',        'rlm',      8207, 'U+200F', 'right-to-left mark' ],
    [ '–',  '-',         'ndash',    8211, 'U+2013', 'en dash' ],
    [ '—',  '-',         'mdash',    8212, 'U+2014', 'em dash' ],
    [ '‘',  '\'',        'lsquo',    8216, 'U+2018', 'left single quotation mark' ],
    [ '’',  '\'',        'rsquo',    8217, 'U+2019', 'right single quotation mark' ],
    [ '‚',  '\'',        'sbquo',    8218, 'U+201A', 'single low-9 quotation mark' ],
    # A plain '"' here: inside single quotes '\"' would be two characters
    # (backslash + quote), not a double quote.
    [ '“',  '"',         'ldquo',    8220, 'U+201C', 'left double quotation mark' ],
    [ '”',  '"',         'rdquo',    8221, 'U+201D', 'right double quotation mark' ],
    [ '„',  '"',         'bdquo',    8222, 'U+201E', 'double low-9 quotation mark' ],
    [ '†',  '+',         'dagger',   8224, 'U+2020', 'dagger' ],
    [ '‡',  '++',        'Dagger',   8225, 'U+2021', 'double dagger' ],
    [ '•',  chr(183),    'bull',     8226, 'U+2022', 'bullet = black small circle' ],
    [ '…',  '...',       'hellip',   8230, 'U+2026', 'horizontal ellipsis = three dot leader' ],
    [ '‰',  '%%',        'permil',   8240, 'U+2030', 'per mille sign' ],
    [ '′',  '\'',        'prime',    8242, 'U+2032', 'prime = minutes = feet' ],
    [ '‹',  '<',         'lsaquo',   8249, 'U+2039', 'single left-pointing angle quotation mark' ],
    [ '›',  '>',         'rsaquo',   8250, 'U+203A', 'single right-pointing angle quotation mark' ],
    [ '‾',  '',          'oline',    8254, 'U+203E', 'overline = spacing overscore' ],
    [ '⁄',  '/',         'frasl',    8260, 'U+2044', 'fraction slash' ],
    [ '€',  ' euro ',    'euro',     8364, 'U+20AC', 'euro sign' ],
    [ 'ℑ',  'I',         'image',    8465, 'U+2111', 'blackletter capital I = imaginary part' ],
    [ '℘',  'P',         'weierp',   8472, 'U+2118', 'script capital P = power set= Weierstrass p' ],
    [ 'ℜ',  'R',         'real',     8476, 'U+211C', 'blackletter capital R = real part symbol' ],
    [ '™',  '(tm)',      'trade',    8482, 'U+2122', 'trade mark sign' ],
    [ 'ℵ',  '',          'alefsym',  8501, 'U+2135', 'alef symbol = first transfinite cardinal' ],
    [ '←',  '<-',        'larr',     8592, 'U+2190', 'leftwards arrow' ],
    [ '↑',  '',          'uarr',     8593, 'U+2191', 'upwards arrow' ],
    [ '→',  '->',        'rarr',     8594, 'U+2192', 'rightwards arrow' ],
    [ '↓',  '',          'darr',     8595, 'U+2193', 'downwards arrow' ],
    [ '↔',  '',          'harr',     8596, 'U+2194', 'left right arrow' ],
    [ '↵',  '<-',        'crarr',    8629, 'U+21B5', 'downwards arrow with corner leftwards= carriage return' ],
    [ '⇐',  '<=',        'lArr',     8656, 'U+21D0', 'leftwards double arrow' ],
    [ '⇑',  '',          'uArr',     8657, 'U+21D1', 'upwards double arrow' ],
    [ '⇒',  '=>',        'rArr',     8658, 'U+21D2', 'rightwards double arrow' ],
    [ '⇓',  '',          'dArr',     8659, 'U+21D3', 'downwards double arrow' ],
    [ '∀',  ' foreach ', 'forall',   8704, 'U+2200', 'for all' ],
    [ '∂',  '',          'part',     8706, 'U+2202', 'partial differential' ],
    [ '∃',  '',          'exist',    8707, 'U+2203', 'there exists' ],
    [ '∅',  '',          'empty',    8709, 'U+2205', 'empty set = null set = diameter' ],
    [ '∇',  '',          'nabla',    8711, 'U+2207', 'nabla = backward difference' ],
    [ '∈',  '',          'isin',     8712, 'U+2208', 'element of' ],
    [ '∉',  '',          'notin',    8713, 'U+2209', 'not an element of' ],
    [ '∋',  '',          'ni',       8715, 'U+220B', 'contains as member' ],
    [ '∏',  '',          'prod',     8719, 'U+220F', 'n-ary product = product sign' ],
    [ '∑',  '',          'sum',      8721, 'U+2211', 'n-ary sumation' ],
    [ '−',  '-',         'minus',    8722, 'U+2212', 'minus sign' ],
    [ '∗',  '*',         'lowast',   8727, 'U+2217', 'asterisk operator' ],
    [ '√',  '',          'radic',    8730, 'U+221A', 'square root = radical sign' ],
    [ '∝',  '',          'prop',     8733, 'U+221D', 'proportional to' ],
    [ '∞',  '',          'infin',    8734, 'U+221E', 'infinity' ],
    [ '∠',  '',          'ang',      8736, 'U+2220', 'angle' ],
    [ '∧',  ' AND ',     'and',      8743, 'U+2227', 'logical and = wedge' ],
    [ '∨',  ' OR ',      'or',       8744, 'U+2228', 'logical or = vee' ],
    [ '∩',  '',          'cap',      8745, 'U+2229', 'intersection = cap' ],
    [ '∪',  '',          'cup',      8746, 'U+222A', 'union = cup' ],
    [ '∴',  '',          'there4',   8756, 'U+2234', 'therefore' ],
    [ '∼',  '~',         'sim',      8764, 'U+223C', 'tilde operator = varies with = similar to' ],
    [ '≅',  '~',         'cong',     8773, 'U+2245', 'approximately equal to' ],
    [ '≈',  '',          'asymp',    8776, 'U+2248', 'almost equal to = asymptotic to' ],
    [ '≠',  '<>',        'ne',       8800, 'U+2260', 'not equal to' ],
    [ '≡',  '',          'equiv',    8801, 'U+2261', 'identical to' ],
    [ '≤',  '<=',        'le',       8804, 'U+2264', 'less-than or equal to' ],
    [ '≥',  '>=',        'ge',       8805, 'U+2265', 'greater-than or equal to' ],
    [ '⊂',  '',          'sub',      8834, 'U+2282', 'subset of' ],
    [ '⊃',  '',          'sup',      8835, 'U+2283', 'superset of' ],
    [ '⊄',  '',          'nsub',     8836, 'U+2284', 'not a subset of' ],
    [ '⊆',  '',          'sube',     8838, 'U+2286', 'subset of or equal to' ],
    [ '⊇',  '',          'supe',     8839, 'U+2287', 'superset of or equal to' ],
    [ '⊕',  '',          'oplus',    8853, 'U+2295', 'circled plus = direct sum' ],
    [ '⊗',  '',          'otimes',   8855, 'U+2297', 'circled times = vector product' ],
    [ '⊥',  '',          'perp',     8869, 'U+22A5', 'up tack = orthogonal to = perpendicular' ],
    # chr(183) (middle dot), matching 'bull' above and the replacement that
    # scrub() uses for dot-like characters; chr(177) would be plus-minus.
    [ '⋅',  chr(183),    'sdot',     8901, 'U+22C5', 'dot operator' ],
    [ '⌈',  '',          'lceil',    8968, 'U+2308', 'left ceiling = apl upstile' ],
    [ '⌉',  '',          'rceil',    8969, 'U+2309', 'right ceiling' ],
    [ '⌊',  '',          'lfloor',   8970, 'U+230A', 'left floor = apl downstile' ],
    [ '⌋',  '',          'rfloor',   8971, 'U+230B', 'right floor' ],
    [ '〈', '<',         'lang',     9001, 'U+2329', 'left-pointing angle bracket = bra' ],
    [ '◊',  '',          'loz',      9674, 'U+25CA', 'lozenge' ],
    [ '♠',  '',          'spades',   9824, 'U+2660', 'black spade suit' ],
    [ '♣',  '',          'clubs',    9827, 'U+2663', 'black club suit = shamrock' ],
    [ '♥',  '',          'hearts',   9829, 'U+2665', 'black heart suit = valentine' ],
    [ '♦',  '',          'diams',    9830, 'U+2666', 'black diamond suit' ],
);