use HTML::Entities;
use HTML::Tagset;
# Excerpt of the HTML entity table; each row is an array ref with six
# positional fields:
#
#   [0] char         the character itself
#   [1] equiv        ASCII-compatible replacement text ('' = none)
#   [2] entity       named entity, without '&' and ';' ('' = none)
#   [3] entity       numeric entity (decimal code)
#   [4] codepoint    'U+XXXX' notation
#   [5] description  human-readable name
my @_html_entities=(
    [ '"', '',    'quot',   34,  'U+0022', 'quotation mark=APL quote'   ],
    [ '&', '',    'amp',    38,  'U+0026', 'ampersand'                  ],
    [ '<', '',    'lt',     60,  'U+003C', 'less-than sign'             ],
    [ '>', '',    'gt',     62,  'U+003E', 'greater-than sign'          ],
    [ '',  '...', '',       133, 'U+0085', ''                           ],
    [ '',  '-',   '',       150, '',       ''                           ],
    [ '',  '-',   '',       151, '',       ''                           ],
    [ '¡', '',    'iexcl',  161, 'U+00A1', 'inverted exclamation mark'  ],
    [ '¢', '',    'cent',   162, 'U+00A2', 'cent sign'                  ],
    [ '£', '',    'pound',  163, 'U+00A3', 'pound sign'                 ],
    [ '¤', '',    'curren', 164, 'U+00A4', 'currency sign'              ],
    [ '¥', '',    'yen',    165, 'U+00A5', 'yen sign = yuan sign'       ],
    [ '¦', '',    'brvbar', 166, 'U+00A6', 'broken vertical bar'        ],
    [ '§', '',    'sect',   167, 'U+00A7', 'section sign'               ],
    [ '¨', '',    'uml',    168, 'U+00A8', 'diaeresis'                  ],
    [ '©', '',    'copy',   169, 'U+00A9', 'copyright sign'             ],
    [ 'ª', '',    'ordf',   170, 'U+00AA', 'feminine ordinal indicator' ],
    # etc. (complete table below)
);
# Lookup maps filled from @_html_entities when the file is loaded.
my %_entity2char;   # HTML entity (name or decimal number) => character
my %_char2equiv;    # character => my preferred 'ASCII-compatible' equivalent

# Strings of 'wide' punctuation/space characters. Each string is
# interpolated into a [...] character class by scrub(), so it must only
# ever contain characters that are not special inside a bracketed class.
my $_dashes = join '', map { chr($_) } (
    0x096, 0x097, 0x058A, 0x1806, 0x2010..0x2015,
    0x2053, 0x207B, 0x208B, 0x2212, 0xFE63, 0xFF0D,
);
my $_squots = join '', map { chr($_) } ( 0x02BC, 0x2018..0x201A, 0x2032 );
my $_dquots = join '', map { chr($_) } ( 0x02EE, 0x201C..0x201E );
my $_spaces = join '', map { chr($_) } (
    0x2000..0x200B, 0x202F, 0x205F, 0x3000,
);
my $_dots   = join '', map { chr($_) } ( 0x2022, 0x22C5 );
# scrub($text)
#
# Reduce a piece of HTML-flavoured text to plain text that fits in the
# 0x00-0xFF range:
#
#   1. phrase-level tags listed in %HTML::Tagset::isPhraseMarkup are
#      replaced by a space (only bare tags such as <b> or </em>; tags
#      carrying attributes are left alone);
#   2. HTML entities are decoded, up to three levels of nesting;
#   3. %XX escapes are replaced by the character with that code;
#   4. whitespace, dashes, quotes and dots listed in the file-level
#      character-class strings are mapped to ASCII-compatible forms;
#   5. characters with an entry in %_char2equiv are replaced by it;
#   6. anything still above 0xFF is transliterated with
#      Text::Unidecode when that module is available, and whatever is
#      left above 0xFF after that is removed;
#   7. leading/trailing whitespace is trimmed and runs are collapsed.
#
# Parameters: $text - the string to clean (may be undef).
# Returns:    the cleaned string; "" for undef or empty input.
sub scrub {
    my $text = shift;

    # Only undef and the empty string mean "nothing to do"; a plain
    # truth test would also throw away the legitimate input "0".
    return "" unless defined $text && length $text;

    # remove HTML phrasal level tags (one combined pattern instead of
    # one regex compilation per tag name)
    my $phrase_tags = join '|',
        map { quotemeta } keys %HTML::Tagset::isPhraseMarkup;
    $text =~ s/<\s?\/?(?:$phrase_tags)\s?>/ /gi if length $phrase_tags;

    # decode html entities; assume no more than triple nested entities
    for my $pass (1 .. 3) {
        last unless $text =~ /&#?[a-zA-Z0-9]+;/;
        HTML::Entities::decode_entities($text);
    }

    # replace character escapes
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    # replace 'wide character' whitespace with ascii-compatible whitespace
    $text =~ s/\s/ /g;
    $text =~ s/[$_spaces]/ /g;

    # transliterate 'wide character' punctuation to ascii-compatible
    # equivalents (with thanks to graff of Perl Monks for this code)
    $text =~ s/[$_dashes]/-/g;
    $text =~ s/[$_squots]/\'/g;
    $text =~ s/[$_dquots]/"/g;
    $text =~ s/[$_dots]/\x{00B7}/g;

    # replace remaining 'wide' characters with (my preferred)
    # ascii-compatible equivalents
    $text =~ s/(.)/$_char2equiv{$1} || $1/seg;

    # Handle everything still above 0xFF. The negated class also covers
    # code points beyond U+FFFF, which a \x{100}-\x{ffff} range misses.
    if ($text =~ /[^\x{00}-\x{ff}]/) {
        # "Text::Unidecode is meant to be a transliterator-of-last
        # resort,..." -- use an already-imported unidecode() if there is
        # one, otherwise try to load the module; if neither works the
        # characters are simply stripped below.
        my $unidecode = __PACKAGE__->can('unidecode');
        if (!$unidecode) {
            local $@;
            $unidecode = \&Text::Unidecode::unidecode
                if eval { require Text::Unidecode; 1 };
        }

        # Transliterate character by character so that 0x80-0xFF
        # characters are left exactly as they are.
        $text =~ s/([^\x{00}-\x{ff}])/$unidecode->($1)/ge if $unidecode;

        # strip out remaining 'wide' characters
        $text =~ s/[^\x{00}-\x{ff}]//g;
    }

    # trim leading, trailing, and excess whitespace
    $text =~ s/^\s+//;
    $text =~ s/\s{2,}/ /g;
    $text =~ s/\s+$//;

    return $text;
}
#######################################
# initialization #
#######################################
# Complete entity table. It is assigned to the @_html_entities array
# declared near the top of the file (replacing the short excerpt stored
# there) rather than to a second, masking 'my' variable, so that the map
# construction below and the rest of the file all see the same data.
#
# Each row is an array ref with six positional fields:
#
#   [0] char         the character (recomputed from [3] below, so
#                    invisible characters are simply left as '')
#   [1] equiv        ASCII-compatible replacement text ('' = none)
#   [2] entity       named entity, without '&' and ';' ('' = none)
#   [3] entity       numeric entity (decimal code)
#   [4] codepoint    'U+XXXX' notation
#   [5] description  human-readable name
@_html_entities=(
    ['"', '', 'quot', 34, 'U+0022', 'quotation mark = APL quote'],
    ['&', '', 'amp', 38, 'U+0026', 'ampersand'],
    ['<', '', 'lt', 60, 'U+003C', 'less-than sign'],
    ['>', '', 'gt', 62, 'U+003E', 'greater-than sign'],
    ['', '...', '', 133, 'U+0085', ''],
    ['', '-', '', 150, '', ''],
    ['', '-', '', 151, '', ''],
    ['¡', '', 'iexcl', 161, 'U+00A1', 'inverted exclamation mark'],
    ['¢', '', 'cent', 162, 'U+00A2', 'cent sign'],
    ['£', '', 'pound', 163, 'U+00A3', 'pound sign'],
    ['¤', '', 'curren', 164, 'U+00A4', 'currency sign'],
    ['¥', '', 'yen', 165, 'U+00A5', 'yen sign = yuan sign'],
    ['¦', '', 'brvbar', 166, 'U+00A6', 'broken bar = broken vertical bar'],
    ['§', '', 'sect', 167, 'U+00A7', 'section sign'],
    ['¨', '', 'uml', 168, 'U+00A8', 'diaeresis = spacing diaeresis'],
    ['©', '', 'copy', 169, 'U+00A9', 'copyright sign'],
    ['ª', '', 'ordf', 170, 'U+00AA', 'feminine ordinal indicator'],
    ['«', '', 'laquo', 171, 'U+00AB', 'left-pointing double angle quotation mark= left pointing guillemet'],
    ['¬', '', 'not', 172, 'U+00AC', 'not sign'],
    ['', '', 'shy', 173, 'U+00AD', 'soft hyphen = discretionary hyphen'],
    ['®', '', 'reg', 174, 'U+00AE', 'registered sign = registered trade mark sign'],
    ['¯', '', 'macr', 175, 'U+00AF', 'macron = spacing macron = overline= APL overbar'],
    ['°', '', 'deg', 176, 'U+00B0', 'degree sign'],
    ['±', '', 'plusmn', 177, 'U+00B1', 'plus-minus sign = plus-or-minus sign'],
    ['²', '', 'sup2', 178, 'U+00B2', 'superscript two = superscript digit two= squared'],
    ['³', '', 'sup3', 179, 'U+00B3', 'superscript three = superscript digit three= cubed'],
    ['´', '', 'acute', 180, 'U+00B4', 'acute accent = spacing acute'],
    ['µ', '', 'micro', 181, 'U+00B5', 'micro sign'],
    ['¶', '', 'para', 182, 'U+00B6', 'pilcrow sign = paragraph sign'],
    ['·', '', 'middot', 183, 'U+00B7', 'middle dot = Georgian comma= Greek middle dot'],
    ['¸', '', 'cedil', 184, 'U+00B8', 'cedilla = spacing cedilla'],
    ['¹', '', 'sup1', 185, 'U+00B9', 'superscript one = superscript digit one'],
    ['º', '', 'ordm', 186, 'U+00BA', 'masculine ordinal indicator'],
    ['»', '', 'raquo', 187, 'U+00BB', 'right-pointing double angle quotation mark= right pointing guillemet'],
    ['¼', '', 'frac14', 188, 'U+00BC', 'vulgar fraction one quarter= fraction one quarter'],
    ['½', '', 'frac12', 189, 'U+00BD', 'vulgar fraction one half= fraction one half'],
    ['¾', '', 'frac34', 190, 'U+00BE', 'vulgar fraction three quarters= fraction three quarters'],
    ['¿', '', 'iquest', 191, 'U+00BF', 'inverted question mark= turned question mark'],
    ['À', '', 'Agrave', 192, 'U+00C0', 'latin capital letter A with grave= latin capital letter A grave'],
    ['Á', '', 'Aacute', 193, 'U+00C1', 'latin capital letter A with acute'],
    ['Â', '', 'Acirc', 194, 'U+00C2', 'latin capital letter A with circumflex'],
    ['Ã', '', 'Atilde', 195, 'U+00C3', 'latin capital letter A with tilde'],
    ['Ä', '', 'Auml', 196, 'U+00C4', 'latin capital letter A with diaeresis'],
    ['Å', '', 'Aring', 197, 'U+00C5', 'latin capital letter A with ring above= latin capital letter A ring'],
    ['Æ', '', 'AElig', 198, 'U+00C6', 'latin capital letter AE= latin capital ligature AE'],
    ['Ç', '', 'Ccedil', 199, 'U+00C7', 'latin capital letter C with cedilla'],
    ['È', '', 'Egrave', 200, 'U+00C8', 'latin capital letter E with grave'],
    ['É', '', 'Eacute', 201, 'U+00C9', 'latin capital letter E with acute'],
    ['Ê', '', 'Ecirc', 202, 'U+00CA', 'latin capital letter E with circumflex'],
    ['Ë', '', 'Euml', 203, 'U+00CB', 'latin capital letter E with diaeresis'],
    ['Ì', '', 'Igrave', 204, 'U+00CC', 'latin capital letter I with grave'],
    ['Í', '', 'Iacute', 205, 'U+00CD', 'latin capital letter I with acute'],
    ['Î', '', 'Icirc', 206, 'U+00CE', 'latin capital letter I with circumflex'],
    ['Ï', '', 'Iuml', 207, 'U+00CF', 'latin capital letter I with diaeresis'],
    ['Ð', '', 'ETH', 208, 'U+00D0', 'latin capital letter ETH'],
    ['Ñ', '', 'Ntilde', 209, 'U+00D1', 'latin capital letter N with tilde'],
    ['Ò', '', 'Ograve', 210, 'U+00D2', 'latin capital letter O with grave'],
    ['Ó', '', 'Oacute', 211, 'U+00D3', 'latin capital letter O with acute'],
    ['Ô', '', 'Ocirc', 212, 'U+00D4', 'latin capital letter O with circumflex'],
    ['Õ', '', 'Otilde', 213, 'U+00D5', 'latin capital letter O with tilde'],
    ['Ö', '', 'Ouml', 214, 'U+00D6', 'latin capital letter O with diaeresis'],
    ['×', '', 'times', 215, 'U+00D7', 'multiplication sign'],
    ['Ø', '', 'Oslash', 216, 'U+00D8', 'latin capital letter O with stroke= latin capital letter O slash'],
    ['Ù', '', 'Ugrave', 217, 'U+00D9', 'latin capital letter U with grave'],
    ['Ú', '', 'Uacute', 218, 'U+00DA', 'latin capital letter U with acute'],
    ['Û', '', 'Ucirc', 219, 'U+00DB', 'latin capital letter U with circumflex'],
    ['Ü', '', 'Uuml', 220, 'U+00DC', 'latin capital letter U with diaeresis'],
    ['Ý', '', 'Yacute', 221, 'U+00DD', 'latin capital letter Y with acute'],
    ['Þ', '', 'THORN', 222, 'U+00DE', 'latin capital letter THORN'],
    ['ß', '', 'szlig', 223, 'U+00DF', 'latin small letter sharp s = ess-zed'],
    ['à', '', 'agrave', 224, 'U+00E0', 'latin small letter a with grave= latin small letter a grave'],
    ['á', '', 'aacute', 225, 'U+00E1', 'latin small letter a with acute'],
    ['â', '', 'acirc', 226, 'U+00E2', 'latin small letter a with circumflex'],
    ['ã', '', 'atilde', 227, 'U+00E3', 'latin small letter a with tilde'],
    ['ä', '', 'auml', 228, 'U+00E4', 'latin small letter a with diaeresis'],
    ['å', '', 'aring', 229, 'U+00E5', 'latin small letter a with ring above= latin small letter a ring'],
    ['æ', '', 'aelig', 230, 'U+00E6', 'latin small letter ae= latin small ligature ae'],
    ['ç', '', 'ccedil', 231, 'U+00E7', 'latin small letter c with cedilla'],
    ['è', '', 'egrave', 232, 'U+00E8', 'latin small letter e with grave'],
    ['é', '', 'eacute', 233, 'U+00E9', 'latin small letter e with acute'],
    ['ê', '', 'ecirc', 234, 'U+00EA', 'latin small letter e with circumflex'],
    ['ë', '', 'euml', 235, 'U+00EB', 'latin small letter e with diaeresis'],
    ['ì', '', 'igrave', 236, 'U+00EC', 'latin small letter i with grave'],
    ['í', '', 'iacute', 237, 'U+00ED', 'latin small letter i with acute'],
    ['î', '', 'icirc', 238, 'U+00EE', 'latin small letter i with circumflex'],
    ['ï', '', 'iuml', 239, 'U+00EF', 'latin small letter i with diaeresis'],
    ['ð', '', 'eth', 240, 'U+00F0', 'latin small letter eth'],
    ['ñ', '', 'ntilde', 241, 'U+00F1', 'latin small letter n with tilde'],
    ['ò', '', 'ograve', 242, 'U+00F2', 'latin small letter o with grave'],
    ['ó', '', 'oacute', 243, 'U+00F3', 'latin small letter o with acute'],
    ['ô', '', 'ocirc', 244, 'U+00F4', 'latin small letter o with circumflex'],
    ['õ', '', 'otilde', 245, 'U+00F5', 'latin small letter o with tilde'],
    ['ö', '', 'ouml', 246, 'U+00F6', 'latin small letter o with diaeresis'],
    ['÷', '', 'divide', 247, 'U+00F7', 'division sign'],
    ['ù', '', 'ugrave', 249, 'U+00F9', 'latin small letter u with grave'],
    ['ú', '', 'uacute', 250, 'U+00FA', 'latin small letter u with acute'],
    ['û', '', 'ucirc', 251, 'U+00FB', 'latin small letter u with circumflex'],
    ['ü', '', 'uuml', 252, 'U+00FC', 'latin small letter u with diaeresis'],
    ['ý', '', 'yacute', 253, 'U+00FD', 'latin small letter y with acute'],
    ['þ', '', 'thorn', 254, 'U+00FE', 'latin small letter thorn'],
    ['ÿ', '', 'yuml', 255, 'U+00FF', 'latin small letter y with diaeresis'],
    ['Œ', 'OE', 'OElig', 338, 'U+0152', 'latin capital ligature OE'],
    ['œ', 'oe', 'oelig', 339, 'U+0153', 'latin small ligature oe'],
    ['Š', 'S', 'Scaron', 352, 'U+0160', 'latin capital letter S with caron'],
    ['š', 's', 'scaron', 353, 'U+0161', 'latin small letter s with caron'],
    ['Ÿ', 'Y', 'Yuml', 376, 'U+0178', 'latin capital letter Y with diaeresis'],
    ['ƒ', 'f', 'fnof', 402, 'U+0192', 'latin small f with hook = function= florin'],
    ['ˆ', '', 'circ', 710, 'U+02C6', 'modifier letter circumflex accent'],
    ['˜', '', 'tilde', 732, 'U+02DC', 'small tilde'],
    ['Γ', ' Gamma ', 'Gamma', 915, 'U+0393', 'greek capital letter gamma'],
    ['Δ', ' Delta ', 'Delta', 916, 'U+0394', 'greek capital letter delta'],
    ['Θ', ' Theta ', 'Theta', 920, 'U+0398', 'greek capital letter theta'],
    ['Λ', ' Lambda ', 'Lambda', 923, 'U+039B', 'greek capital letter lambda'],
    ['Ξ', ' Xi ', 'Xi', 926, 'U+039E', 'greek capital letter xi'],
    ['Π', ' Pi ', 'Pi', 928, 'U+03A0', 'greek capital letter pi'],
    ['Σ', ' Sigma ', 'Sigma', 931, 'U+03A3', 'greek capital letter sigma'],
    ['Υ', ' Upsilon ', 'Upsilon', 933, 'U+03A5', 'greek capital letter upsilon'],
    ['Φ', ' Phi ', 'Phi', 934, 'U+03A6', 'greek capital letter phi'],
    ['Ψ', ' Psi ', 'Psi', 936, 'U+03A8', 'greek capital letter psi'],
    ['Ω', ' Omega ', 'Omega', 937, 'U+03A9', 'greek capital letter omega'],
    ['α', ' alpha ', 'alpha', 945, 'U+03B1', 'greek small letter alpha'],
    ['β', ' beta ', 'beta', 946, 'U+03B2', 'greek small letter beta'],
    ['γ', ' gamma ', 'gamma', 947, 'U+03B3', 'greek small letter gamma'],
    ['δ', ' delta ', 'delta', 948, 'U+03B4', 'greek small letter delta'],
    ['ε', ' epsilon ', 'epsilon', 949, 'U+03B5', 'greek small letter epsilon'],
    ['η', ' eta ', 'eta', 951, 'U+03B7', 'greek small letter eta'],
    ['θ', ' theta ', 'theta', 952, 'U+03B8', 'greek small letter theta'],
    ['ι', ' iota ', 'iota', 953, 'U+03B9', 'greek small letter iota'],
    ['κ', ' kappa ', 'kappa', 954, 'U+03BA', 'greek small letter kappa'],
    ['λ', ' lambda ', 'lambda', 955, 'U+03BB', 'greek small letter lambda'],
    ['μ', ' mu ', 'mu', 956, 'U+03BC', 'greek small letter mu'],
    ['ν', ' nu ', 'nu', 957, 'U+03BD', 'greek small letter nu'],
    ['ξ', ' xi ', 'xi', 958, 'U+03BE', 'greek small letter xi'],
    ['ο', ' omicron ', 'omicron', 959, 'U+03BF', 'greek small letter omicron'],
    ['π', ' pi ', 'pi', 960, 'U+03C0', 'greek small letter pi'],
    ['ρ', ' rho ', 'rho', 961, 'U+03C1', 'greek small letter rho'],
    ['ς', ' sigma ', 'sigmaf', 962, 'U+03C2', 'greek small letter final sigma'],
    ['σ', ' sigma ', 'sigma', 963, 'U+03C3', 'greek small letter sigma'],
    ['τ', ' tau ', 'tau', 964, 'U+03C4', 'greek small letter tau'],
    ['υ', ' upsilon ', 'upsilon', 965, 'U+03C5', 'greek small letter upsilon'],
    ['φ', ' phi ', 'phi', 966, 'U+03C6', 'greek small letter phi'],
    ['χ', ' chi ', 'chi', 967, 'U+03C7', 'greek small letter chi'],
    ['ψ', ' psi ', 'psi', 968, 'U+03C8', 'greek small letter psi'],
    ['ω', ' omega ', 'omega', 969, 'U+03C9', 'greek small letter omega'],
    ['ϑ', ' theta ', 'thetasym', 977, 'U+03D1', 'greek small letter theta symbol'],
    ['ϒ', ' upsilon ', 'upsih', 978, 'U+03D2', 'greek upsilon with hook symbol'],
    ['ϖ', ' pi ', 'piv', 982, 'U+03D6', 'greek pi symbol'],
    [' ', ' ', 'ensp', 8194, 'U+2002', 'en space'],
    [' ', ' ', 'emsp', 8195, 'U+2003', 'em space'],
    [' ', ' ', 'thinsp', 8201, 'U+2009', 'thin space'],
    ['', '', 'zwnj', 8204, 'U+200C', 'zero width non-joiner'],
    ['', '', 'zwj', 8205, 'U+200D', 'zero width joiner'],
    ['', '->', 'lrm', 8206, 'U+200E', 'left-to-right mark'],
    ['', '<-', 'rlm', 8207, 'U+200F', 'right-to-left mark'],
    ['–', '-', 'ndash', 8211, 'U+2013', 'en dash'],
    ['—', '-', 'mdash', 8212, 'U+2014', 'em dash'],
    ['‘', '\'', 'lsquo', 8216, 'U+2018', 'left single quotation mark'],
    ['’', '\'', 'rsquo', 8217, 'U+2019', 'right single quotation mark'],
    ['‚', '\'', 'sbquo', 8218, 'U+201A', 'single low-9 quotation mark'],
    # NB: a plain '"' here -- '\"' inside single quotes is the two
    # characters backslash + double quote, not a double quote.
    ['“', '"', 'ldquo', 8220, 'U+201C', 'left double quotation mark'],
    ['”', '"', 'rdquo', 8221, 'U+201D', 'right double quotation mark'],
    ['„', '"', 'bdquo', 8222, 'U+201E', 'double low-9 quotation mark'],
    ['†', '+', 'dagger', 8224, 'U+2020', 'dagger'],
    ['‡', '++', 'Dagger', 8225, 'U+2021', 'double dagger'],
    ['•', chr(183), 'bull', 8226, 'U+2022', 'bullet = black small circle'],
    ['…', '...', 'hellip', 8230, 'U+2026', 'horizontal ellipsis = three dot leader'],
    ['‰', '%%', 'permil', 8240, 'U+2030', 'per mille sign'],
    ['′', '\'', 'prime', 8242, 'U+2032', 'prime = minutes = feet'],
    ['‹', '<', 'lsaquo', 8249, 'U+2039', 'single left-pointing angle quotation mark'],
    ['›', '>', 'rsaquo', 8250, 'U+203A', 'single right-pointing angle quotation mark'],
    ['‾', '', 'oline', 8254, 'U+203E', 'overline = spacing overscore'],
    ['⁄', '/', 'frasl', 8260, 'U+2044', 'fraction slash'],
    ['€', ' euro ', 'euro', 8364, 'U+20AC', 'euro sign'],
    ['ℑ', 'I', 'image', 8465, 'U+2111', 'blackletter capital I = imaginary part'],
    ['℘', 'P', 'weierp', 8472, 'U+2118', 'script capital P = power set= Weierstrass p'],
    ['ℜ', 'R', 'real', 8476, 'U+211C', 'blackletter capital R = real part symbol'],
    ['™', '(tm)', 'trade', 8482, 'U+2122', 'trade mark sign'],
    ['ℵ', '', 'alefsym', 8501, 'U+2135', 'alef symbol = first transfinite cardinal'],
    ['←', '<-', 'larr', 8592, 'U+2190', 'leftwards arrow'],
    ['↑', '', 'uarr', 8593, 'U+2191', 'upwards arrow'],
    ['→', '->', 'rarr', 8594, 'U+2192', 'rightwards arrow'],
    ['↓', '', 'darr', 8595, 'U+2193', 'downwards arrow'],
    ['↔', '', 'harr', 8596, 'U+2194', 'left right arrow'],
    ['↵', '<-', 'crarr', 8629, 'U+21B5', 'downwards arrow with corner leftwards= carriage return'],
    ['⇐', '<=', 'lArr', 8656, 'U+21D0', 'leftwards double arrow'],
    ['⇑', '', 'uArr', 8657, 'U+21D1', 'upwards double arrow'],
    ['⇒', '=>', 'rArr', 8658, 'U+21D2', 'rightwards double arrow'],
    ['⇓', '', 'dArr', 8659, 'U+21D3', 'downwards double arrow'],
    ['∀', ' foreach ', 'forall', 8704, 'U+2200', 'for all'],
    ['∂', '', 'part', 8706, 'U+2202', 'partial differential'],
    ['∃', '', 'exist', 8707, 'U+2203', 'there exists'],
    ['∅', '', 'empty', 8709, 'U+2205', 'empty set = null set = diameter'],
    ['∇', '', 'nabla', 8711, 'U+2207', 'nabla = backward difference'],
    ['∈', '', 'isin', 8712, 'U+2208', 'element of'],
    ['∉', '', 'notin', 8713, 'U+2209', 'not an element of'],
    ['∋', '', 'ni', 8715, 'U+220B', 'contains as member'],
    ['∏', '', 'prod', 8719, 'U+220F', 'n-ary product = product sign'],
    ['∑', '', 'sum', 8721, 'U+2211', 'n-ary sumation'],
    ['−', '-', 'minus', 8722, 'U+2212', 'minus sign'],
    ['∗', '*', 'lowast', 8727, 'U+2217', 'asterisk operator'],
    ['√', '', 'radic', 8730, 'U+221A', 'square root = radical sign'],
    ['∝', '', 'prop', 8733, 'U+221D', 'proportional to'],
    ['∞', '', 'infin', 8734, 'U+221E', 'infinity'],
    ['∠', '', 'ang', 8736, 'U+2220', 'angle'],
    ['∧', ' AND ', 'and', 8743, 'U+2227', 'logical and = wedge'],
    ['∨', ' OR ', 'or', 8744, 'U+2228', 'logical or = vee'],
    ['∩', '', 'cap', 8745, 'U+2229', 'intersection = cap'],
    ['∪', '', 'cup', 8746, 'U+222A', 'union = cup'],
    ['∴', '', 'there4', 8756, 'U+2234', 'therefore'],
    ['∼', '~', 'sim', 8764, 'U+223C', 'tilde operator = varies with = similar to'],
    ['≅', '~', 'cong', 8773, 'U+2245', 'approximately equal to'],
    ['≈', '', 'asymp', 8776, 'U+2248', 'almost equal to = asymptotic to'],
    ['≠', '<>', 'ne', 8800, 'U+2260', 'not equal to'],
    ['≡', '', 'equiv', 8801, 'U+2261', 'identical to'],
    ['≤', '<=', 'le', 8804, 'U+2264', 'less-than or equal to'],
    ['≥', '>=', 'ge', 8805, 'U+2265', 'greater-than or equal to'],
    ['⊂', '', 'sub', 8834, 'U+2282', 'subset of'],
    ['⊃', '', 'sup', 8835, 'U+2283', 'superset of'],
    ['⊄', '', 'nsub', 8836, 'U+2284', 'not a subset of'],
    ['⊆', '', 'sube', 8838, 'U+2286', 'subset of or equal to'],
    ['⊇', '', 'supe', 8839, 'U+2287', 'superset of or equal to'],
    ['⊕', '', 'oplus', 8853, 'U+2295', 'circled plus = direct sum'],
    ['⊗', '', 'otimes', 8855, 'U+2297', 'circled times = vector product'],
    ['⊥', '', 'perp', 8869, 'U+22A5', 'up tack = orthogonal to = perpendicular'],
    # Middle dot (183), matching 'bull' above and the replacement that
    # scrub() uses for U+22C5; 177 is the plus-minus sign.
    ['⋅', chr(183), 'sdot', 8901, 'U+22C5', 'dot operator'],
    ['⌈', '', 'lceil', 8968, 'U+2308', 'left ceiling = apl upstile'],
    ['⌉', '', 'rceil', 8969, 'U+2309', 'right ceiling'],
    ['⌊', '', 'lfloor', 8970, 'U+230A', 'left floor = apl downstile'],
    ['⌋', '', 'rfloor', 8971, 'U+230B', 'right floor'],
    ['〈', '<', 'lang', 9001, 'U+2329', 'left-pointing angle bracket = bra'],
    ['◊', '', 'loz', 9674, 'U+25CA', 'lozenge'],
    ['♠', '', 'spades', 9824, 'U+2660', 'black spade suit'],
    ['♣', '', 'clubs', 9827, 'U+2663', 'black club suit = shamrock'],
    ['♥', '', 'hearts', 9829, 'U+2665', 'black heart suit = valentine'],
    ['♦', '', 'diams', 9830, 'U+2666', 'black diamond suit'],
);

# Build the lookup maps from the table. This has to run as ordinary
# run-time code placed after the table assignment: a BEGIN block executes
# at compile time, before any of the file's assignments have run, and
# would therefore loop over an empty array and leave both maps empty.
foreach my $entity (@_html_entities) {
    my ($equiv, $name, $number) = @{$entity}[1, 2, 3];

    # Derive the character from its code so column 0 never depends on
    # how the source file itself happens to be encoded.
    my $char = chr($number);
    $entity->[0] = $char;

    # Entities can be looked up by name (when the row has one) or by
    # decimal number.
    $_entity2char{$name}   = $char if defined $name && length $name;
    $_entity2char{$number} = $char;

    # Only characters with a non-empty replacement get an equivalent.
    $_char2equiv{$char} = $equiv if defined $equiv && length $equiv;
}

# Keep the file's final statement true in case it is loaded with
# require/use (a trailing foreach loop does not yield a true value).
1;