use HTML::Entities;
use HTML::Tagset;
# Abbreviated excerpt of the HTML entity table.  Every row is an array
# reference with six fields:
#
#   [0] char         the literal character
#   [1] equiv        ASCII-compatible replacement ('' when there is none)
#   [2] entity       entity name ('' when there is none)
#   [3] entity       decimal entity number
#   [4] codepoint    'U+XXXX' notation
#   [5] description  human readable description
my @_html_entities = (
    [ '"', '',    'quot',   34,  'U+0022', 'quotation mark=APL quote'   ],
    [ '&', '',    'amp',    38,  'U+0026', 'ampersand'                  ],
    [ '<', '',    'lt',     60,  'U+003C', 'less-than sign'             ],
    [ '>', '',    'gt',     62,  'U+003E', 'greater-than sign'          ],
    [ '',  '...', '',       133, 'U+0085', ''                           ],
    [ '',  '-',   '',       150, '',       ''                           ],
    [ '',  '-',   '',       151, '',       ''                           ],
    [ '¡', '',    'iexcl',  161, 'U+00A1', 'inverted exclamation mark'  ],
    [ '¢', '',    'cent',   162, 'U+00A2', 'cent sign'                  ],
    [ '£', '',    'pound',  163, 'U+00A3', 'pound sign'                 ],
    [ '¤', '',    'curren', 164, 'U+00A4', 'currency sign'              ],
    [ '¥', '',    'yen',    165, 'U+00A5', 'yen sign = yuan sign'       ],
    [ '¦', '',    'brvbar', 166, 'U+00A6', 'broken vertical bar'        ],
    [ '§', '',    'sect',   167, 'U+00A7', 'section sign'               ],
    [ '¨', '',    'uml',    168, 'U+00A8', 'diaeresis'                  ],
    [ '©', '',    'copy',   169, 'U+00A9', 'copyright sign'             ],
    [ 'ª', '',    'ordf',   170, 'U+00AA', 'feminine ordinal indicator' ],
    # etc. (complete table below)
);
# Lookup tables, filled in by the initialization code further down.
my %_entity2char = ();    # HTML entity character equivalents
my %_char2equiv  = ();    # my preferred 'ASCII-compatible' character equivalents

# Strings of characters, one per listed code point.  scrub() interpolates
# each of them into a bracketed character class.
my $_dashes = join(
    '',
    map { chr($_) } (
        0x096, 0x097, 0x058A, 0x1806,
        0x2010 .. 0x2015,
        0x2053, 0x207B, 0x208B, 0x2212, 0xFE63, 0xFF0D,
    )
);
my $_squots = join( '', map { chr($_) } ( 0x02BC, 0x2018 .. 0x201A, 0x2032 ) );
my $_dquots = join( '', map { chr($_) } ( 0x02EE, 0x201C .. 0x201E ) );
my $_spaces = join(
    '',
    map { chr($_) } ( 0x2000 .. 0x200B, 0x202F, 0x205F, 0x3000 )
);
my $_dots = join( '', map { chr($_) } ( 0x2022, 0x22C5 ) );
# scrub($text)
#
# Reduce a piece of marked-up text to plain text in the 0x00-0xFF range.
#
# Steps, in order:
#   1. strip bare HTML phrase-level tags (names from %HTML::Tagset::isPhraseMarkup)
#   2. decode HTML entities, up to three nesting levels
#   3. decode %XX escapes
#   4. map whitespace, dashes, quotes and dots to plain equivalents
#   5. apply the per-character replacements in %_char2equiv
#   6. transliterate whatever is still above 0xFF with unidecode(), then
#      delete anything unidecode() left behind
#   7. trim and collapse whitespace
#
# Parameter: $text - the string to clean.
# Returns:   the cleaned string; "" when $text is undef or empty.
sub scrub {
    my $text = shift;

    # Only undef and the empty string mean "nothing to do"; a plain truth
    # test would also throw away the legitimate input "0".
    return "" if !defined $text || $text eq '';

    # remove HTML phrasal level tags
    # (only bare opening/closing tags are matched: at most one whitespace
    # character on either side of the name, and no attributes)
    foreach my $markup (keys %HTML::Tagset::isPhraseMarkup) {
        $text =~ s/<\s?\/?\Q$markup\E\s?>/ /gi;
    }

    # decode html entities
    # assume no more than triple nested html entities; stop early once
    # nothing resembling an entity (&name; &#123; &#x7B;) is left
    for (1..3) {
        last if $text !~ /&#?[a-zA-Z0-9]+;/;
        HTML::Entities::decode_entities($text);    # void context: in place
    }

    # replace character escapes (%XX, two hex digits)
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    # replace 'wide character' whitespace with
    # ascii-compatible whitespace
    $text =~ s/\s/ /g;
    $text =~ s/[$_spaces]/ /g;

    # transliterate 'wide character' punctuation to
    # ascii-compatible equivalents
    # with thanks to graff of Perl Monks for this code
    $text =~ s/[$_dashes]/-/g;
    $text =~ s/[$_squots]/\'/g;
    $text =~ s/[$_dquots]/"/g;
    $text =~ s/[$_dots]/\x{00B7}/g;

    # replace remaining 'wide' characters with
    # (my preferred) ascii-compatible equivalents
    $text =~ s/(.)/$_char2equiv{$1} || $1/eg;

    # unidecode any remaining characters greater than 0xff
    # (the class is open-ended so that code points above 0xFFFF are
    # covered as well)
    if ($text =~ /[^\x{00}-\x{ff}]/) {
        # unidecode() is provided by Text::Unidecode; load and import it
        # on first use unless a sub of that name is already available.
        unless (defined &unidecode) {
            require Text::Unidecode;
            Text::Unidecode->import();
        }

        # "Text::Unidecode is meant to be a
        # transliterator-of-last resort,..."
        # One call per character; the copy keeps the read-only $1 away
        # from the callee.
        $text =~ s/([^\x{00}-\x{ff}])/my $wide = $1; unidecode($wide)/ge;

        # strip out remaining 'wide' characters
        $text =~ s/[^\x{00}-\x{ff}]//g;
    }

    # trim leading, trailing, and excess whitespace
    $text =~ s/^\s+//;
    $text =~ s/\s{2,}/ /g;
    $text =~ s/\s+$//;

    return $text;
}
#######################################
# initialization #
#######################################
# Full HTML entity table followed by the code that loads it into
# %_entity2char and %_char2equiv.
#
# The loading loop runs at run time, after the table.  A BEGIN block runs
# at compile time, before the list assignment to @_html_entities has
# executed, so it would iterate over an empty array; the "=()" initialisers
# of the two hashes also only execute at run time and would clear anything
# a BEGIN block had stored.
#
# Row layout:
#   [0] char         literal character (overwritten below from column 3)
#   [1] equiv        ASCII-compatible replacement ('' when there is none)
#   [2] entity       entity name ('' when there is none)
#   [3] entity       decimal entity number
#   [4] codepoint    'U+XXXX' notation
#   [5] description  human readable description
#
# NOTE(review): lrm/rlm are direction marks yet map to '->' / '<-';
# confirm that replacement is intended.
my @_html_entities=(
    # --- markup-significant characters and Windows-1252 leftovers ---
    ['"', '', 'quot', 34, 'U+0022', 'quotation mark = APL quote'],
    ['&', '', 'amp', 38, 'U+0026', 'ampersand'],
    ['<', '', 'lt', 60, 'U+003C', 'less-than sign'],
    ['>', '', 'gt', 62, 'U+003E', 'greater-than sign'],
    ['', '...', '', 133, 'U+0085', ''],
    ['', '-', '', 150, '', ''],
    ['', '-', '', 151, '', ''],
    # --- Latin-1 symbols ---
    ['¡', '', 'iexcl', 161, 'U+00A1', 'inverted exclamation mark'],
    ['¢', '', 'cent', 162, 'U+00A2', 'cent sign'],
    ['£', '', 'pound', 163, 'U+00A3', 'pound sign'],
    ['¤', '', 'curren', 164, 'U+00A4', 'currency sign'],
    ['¥', '', 'yen', 165, 'U+00A5', 'yen sign = yuan sign'],
    ['¦', '', 'brvbar', 166, 'U+00A6', 'broken bar = broken vertical bar'],
    ['§', '', 'sect', 167, 'U+00A7', 'section sign'],
    ['¨', '', 'uml', 168, 'U+00A8', 'diaeresis = spacing diaeresis'],
    ['©', '', 'copy', 169, 'U+00A9', 'copyright sign'],
    ['ª', '', 'ordf', 170, 'U+00AA', 'feminine ordinal indicator'],
    ['«', '', 'laquo', 171, 'U+00AB', 'left-pointing double angle quotation mark= left pointing guillemet'],
    ['¬', '', 'not', 172, 'U+00AC', 'not sign'],
    ['', '', 'shy', 173, 'U+00AD', 'soft hyphen = discretionary hyphen'],
    ['®', '', 'reg', 174, 'U+00AE', 'registered sign = registered trade mark sign'],
    ['¯', '', 'macr', 175, 'U+00AF', 'macron = spacing macron = overline= APL overbar'],
    ['°', '', 'deg', 176, 'U+00B0', 'degree sign'],
    ['±', '', 'plusmn', 177, 'U+00B1', 'plus-minus sign = plus-or-minus sign'],
    ['²', '', 'sup2', 178, 'U+00B2', 'superscript two = superscript digit two= squared'],
    ['³', '', 'sup3', 179, 'U+00B3', 'superscript three = superscript digit three= cubed'],
    ['´', '', 'acute', 180, 'U+00B4', 'acute accent = spacing acute'],
    ['µ', '', 'micro', 181, 'U+00B5', 'micro sign'],
    ['¶', '', 'para', 182, 'U+00B6', 'pilcrow sign = paragraph sign'],
    ['·', '', 'middot', 183, 'U+00B7', 'middle dot = Georgian comma= Greek middle dot'],
    ['¸', '', 'cedil', 184, 'U+00B8', 'cedilla = spacing cedilla'],
    ['¹', '', 'sup1', 185, 'U+00B9', 'superscript one = superscript digit one'],
    ['º', '', 'ordm', 186, 'U+00BA', 'masculine ordinal indicator'],
    ['»', '', 'raquo', 187, 'U+00BB', 'right-pointing double angle quotation mark= right pointing guillemet'],
    ['¼', '', 'frac14', 188, 'U+00BC', 'vulgar fraction one quarter= fraction one quarter'],
    ['½', '', 'frac12', 189, 'U+00BD', 'vulgar fraction one half= fraction one half'],
    ['¾', '', 'frac34', 190, 'U+00BE', 'vulgar fraction three quarters= fraction three quarters'],
    ['¿', '', 'iquest', 191, 'U+00BF', 'inverted question mark= turned question mark'],
    # --- Latin-1 letters ---
    ['À', '', 'Agrave', 192, 'U+00C0', 'latin capital letter A with grave= latin capital letter A grave'],
    ['Á', '', 'Aacute', 193, 'U+00C1', 'latin capital letter A with acute'],
    ['Â', '', 'Acirc', 194, 'U+00C2', 'latin capital letter A with circumflex'],
    ['Ã', '', 'Atilde', 195, 'U+00C3', 'latin capital letter A with tilde'],
    ['Ä', '', 'Auml', 196, 'U+00C4', 'latin capital letter A with diaeresis'],
    ['Å', '', 'Aring', 197, 'U+00C5', 'latin capital letter A with ring above= latin capital letter A ring'],
    ['Æ', '', 'AElig', 198, 'U+00C6', 'latin capital letter AE= latin capital ligature AE'],
    ['Ç', '', 'Ccedil', 199, 'U+00C7', 'latin capital letter C with cedilla'],
    ['È', '', 'Egrave', 200, 'U+00C8', 'latin capital letter E with grave'],
    ['É', '', 'Eacute', 201, 'U+00C9', 'latin capital letter E with acute'],
    ['Ê', '', 'Ecirc', 202, 'U+00CA', 'latin capital letter E with circumflex'],
    ['Ë', '', 'Euml', 203, 'U+00CB', 'latin capital letter E with diaeresis'],
    ['Ì', '', 'Igrave', 204, 'U+00CC', 'latin capital letter I with grave'],
    ['Í', '', 'Iacute', 205, 'U+00CD', 'latin capital letter I with acute'],
    ['Î', '', 'Icirc', 206, 'U+00CE', 'latin capital letter I with circumflex'],
    ['Ï', '', 'Iuml', 207, 'U+00CF', 'latin capital letter I with diaeresis'],
    ['Ð', '', 'ETH', 208, 'U+00D0', 'latin capital letter ETH'],
    ['Ñ', '', 'Ntilde', 209, 'U+00D1', 'latin capital letter N with tilde'],
    ['Ò', '', 'Ograve', 210, 'U+00D2', 'latin capital letter O with grave'],
    ['Ó', '', 'Oacute', 211, 'U+00D3', 'latin capital letter O with acute'],
    ['Ô', '', 'Ocirc', 212, 'U+00D4', 'latin capital letter O with circumflex'],
    ['Õ', '', 'Otilde', 213, 'U+00D5', 'latin capital letter O with tilde'],
    ['Ö', '', 'Ouml', 214, 'U+00D6', 'latin capital letter O with diaeresis'],
    ['×', '', 'times', 215, 'U+00D7', 'multiplication sign'],
    ['Ø', '', 'Oslash', 216, 'U+00D8', 'latin capital letter O with stroke= latin capital letter O slash'],
    ['Ù', '', 'Ugrave', 217, 'U+00D9', 'latin capital letter U with grave'],
    ['Ú', '', 'Uacute', 218, 'U+00DA', 'latin capital letter U with acute'],
    ['Û', '', 'Ucirc', 219, 'U+00DB', 'latin capital letter U with circumflex'],
    ['Ü', '', 'Uuml', 220, 'U+00DC', 'latin capital letter U with diaeresis'],
    ['Ý', '', 'Yacute', 221, 'U+00DD', 'latin capital letter Y with acute'],
    ['Þ', '', 'THORN', 222, 'U+00DE', 'latin capital letter THORN'],
    ['ß', '', 'szlig', 223, 'U+00DF', 'latin small letter sharp s = ess-zed'],
    ['à', '', 'agrave', 224, 'U+00E0', 'latin small letter a with grave= latin small letter a grave'],
    ['á', '', 'aacute', 225, 'U+00E1', 'latin small letter a with acute'],
    ['â', '', 'acirc', 226, 'U+00E2', 'latin small letter a with circumflex'],
    ['ã', '', 'atilde', 227, 'U+00E3', 'latin small letter a with tilde'],
    ['ä', '', 'auml', 228, 'U+00E4', 'latin small letter a with diaeresis'],
    ['å', '', 'aring', 229, 'U+00E5', 'latin small letter a with ring above= latin small letter a ring'],
    ['æ', '', 'aelig', 230, 'U+00E6', 'latin small letter ae= latin small ligature ae'],
    ['ç', '', 'ccedil', 231, 'U+00E7', 'latin small letter c with cedilla'],
    ['è', '', 'egrave', 232, 'U+00E8', 'latin small letter e with grave'],
    ['é', '', 'eacute', 233, 'U+00E9', 'latin small letter e with acute'],
    ['ê', '', 'ecirc', 234, 'U+00EA', 'latin small letter e with circumflex'],
    ['ë', '', 'euml', 235, 'U+00EB', 'latin small letter e with diaeresis'],
    ['ì', '', 'igrave', 236, 'U+00EC', 'latin small letter i with grave'],
    ['í', '', 'iacute', 237, 'U+00ED', 'latin small letter i with acute'],
    ['î', '', 'icirc', 238, 'U+00EE', 'latin small letter i with circumflex'],
    ['ï', '', 'iuml', 239, 'U+00EF', 'latin small letter i with diaeresis'],
    ['ð', '', 'eth', 240, 'U+00F0', 'latin small letter eth'],
    ['ñ', '', 'ntilde', 241, 'U+00F1', 'latin small letter n with tilde'],
    ['ò', '', 'ograve', 242, 'U+00F2', 'latin small letter o with grave'],
    ['ó', '', 'oacute', 243, 'U+00F3', 'latin small letter o with acute'],
    ['ô', '', 'ocirc', 244, 'U+00F4', 'latin small letter o with circumflex'],
    ['õ', '', 'otilde', 245, 'U+00F5', 'latin small letter o with tilde'],
    ['ö', '', 'ouml', 246, 'U+00F6', 'latin small letter o with diaeresis'],
    ['÷', '', 'divide', 247, 'U+00F7', 'division sign'],
    ['ù', '', 'ugrave', 249, 'U+00F9', 'latin small letter u with grave'],
    ['ú', '', 'uacute', 250, 'U+00FA', 'latin small letter u with acute'],
    ['û', '', 'ucirc', 251, 'U+00FB', 'latin small letter u with circumflex'],
    ['ü', '', 'uuml', 252, 'U+00FC', 'latin small letter u with diaeresis'],
    ['ý', '', 'yacute', 253, 'U+00FD', 'latin small letter y with acute'],
    ['þ', '', 'thorn', 254, 'U+00FE', 'latin small letter thorn'],
    ['ÿ', '', 'yuml', 255, 'U+00FF', 'latin small letter y with diaeresis'],
    # --- Latin extended and spacing modifiers ---
    ['Œ', 'OE', 'OElig', 338, 'U+0152', 'latin capital ligature OE'],
    ['œ', 'oe', 'oelig', 339, 'U+0153', 'latin small ligature oe'],
    ['Š', 'S', 'Scaron', 352, 'U+0160', 'latin capital letter S with caron'],
    ['š', 's', 'scaron', 353, 'U+0161', 'latin small letter s with caron'],
    ['Ÿ', 'Y', 'Yuml', 376, 'U+0178', 'latin capital letter Y with diaeresis'],
    ['ƒ', 'f', 'fnof', 402, 'U+0192', 'latin small f with hook = function= florin'],
    ['ˆ', '', 'circ', 710, 'U+02C6', 'modifier letter circumflex accent'],
    ['˜', '', 'tilde', 732, 'U+02DC', 'small tilde'],
    # --- Greek ---
    ['Γ', ' Gamma ', 'Gamma', 915, 'U+0393', 'greek capital letter gamma'],
    ['Δ', ' Delta ', 'Delta', 916, 'U+0394', 'greek capital letter delta'],
    ['Θ', ' Theta ', 'Theta', 920, 'U+0398', 'greek capital letter theta'],
    ['Λ', ' Lambda ', 'Lambda', 923, 'U+039B', 'greek capital letter lambda'],
    ['Ξ', ' Xi ', 'Xi', 926, 'U+039E', 'greek capital letter xi'],
    ['Π', ' Pi ', 'Pi', 928, 'U+03A0', 'greek capital letter pi'],
    ['Σ', ' Sigma ', 'Sigma', 931, 'U+03A3', 'greek capital letter sigma'],
    ['Υ', ' Upsilon ', 'Upsilon', 933, 'U+03A5', 'greek capital letter upsilon'],
    ['Φ', ' Phi ', 'Phi', 934, 'U+03A6', 'greek capital letter phi'],
    ['Ψ', ' Psi ', 'Psi', 936, 'U+03A8', 'greek capital letter psi'],
    ['Ω', ' Omega ', 'Omega', 937, 'U+03A9', 'greek capital letter omega'],
    ['α', ' alpha ', 'alpha', 945, 'U+03B1', 'greek small letter alpha'],
    ['β', ' beta ', 'beta', 946, 'U+03B2', 'greek small letter beta'],
    ['γ', ' gamma ', 'gamma', 947, 'U+03B3', 'greek small letter gamma'],
    ['δ', ' delta ', 'delta', 948, 'U+03B4', 'greek small letter delta'],
    ['ε', ' epsilon ', 'epsilon', 949, 'U+03B5', 'greek small letter epsilon'],
    ['η', ' eta ', 'eta', 951, 'U+03B7', 'greek small letter eta'],
    ['θ', ' theta ', 'theta', 952, 'U+03B8', 'greek small letter theta'],
    ['ι', ' iota ', 'iota', 953, 'U+03B9', 'greek small letter iota'],
    ['κ', ' kappa ', 'kappa', 954, 'U+03BA', 'greek small letter kappa'],
    ['λ', ' lambda ', 'lambda', 955, 'U+03BB', 'greek small letter lambda'],
    ['μ', ' mu ', 'mu', 956, 'U+03BC', 'greek small letter mu'],
    ['ν', ' nu ', 'nu', 957, 'U+03BD', 'greek small letter nu'],
    ['ξ', ' xi ', 'xi', 958, 'U+03BE', 'greek small letter xi'],
    ['ο', ' omicron ', 'omicron', 959, 'U+03BF', 'greek small letter omicron'],
    ['π', ' pi ', 'pi', 960, 'U+03C0', 'greek small letter pi'],
    ['ρ', ' rho ', 'rho', 961, 'U+03C1', 'greek small letter rho'],
    ['ς', ' sigma ', 'sigmaf', 962, 'U+03C2', 'greek small letter final sigma'],
    ['σ', ' sigma ', 'sigma', 963, 'U+03C3', 'greek small letter sigma'],
    ['τ', ' tau ', 'tau', 964, 'U+03C4', 'greek small letter tau'],
    ['υ', ' upsilon ', 'upsilon', 965, 'U+03C5', 'greek small letter upsilon'],
    ['φ', ' phi ', 'phi', 966, 'U+03C6', 'greek small letter phi'],
    ['χ', ' chi ', 'chi', 967, 'U+03C7', 'greek small letter chi'],
    ['ψ', ' psi ', 'psi', 968, 'U+03C8', 'greek small letter psi'],
    ['ω', ' omega ', 'omega', 969, 'U+03C9', 'greek small letter omega'],
    ['ϑ', ' theta ', 'thetasym', 977, 'U+03D1', 'greek small letter theta symbol'],
    ['ϒ', ' upsilon ', 'upsih', 978, 'U+03D2', 'greek upsilon with hook symbol'],
    ['ϖ', ' pi ', 'piv', 982, 'U+03D6', 'greek pi symbol'],
    # --- general punctuation ---
    [' ', ' ', 'ensp', 8194, 'U+2002', 'en space'],
    [' ', ' ', 'emsp', 8195, 'U+2003', 'em space'],
    [' ', ' ', 'thinsp', 8201, 'U+2009', 'thin space'],
    ['', '', 'zwnj', 8204, 'U+200C', 'zero width non-joiner'],
    ['', '', 'zwj', 8205, 'U+200D', 'zero width joiner'],
    ['', '->', 'lrm', 8206, 'U+200E', 'left-to-right mark'],
    ['', '<-', 'rlm', 8207, 'U+200F', 'right-to-left mark'],
    ['–', '-', 'ndash', 8211, 'U+2013', 'en dash'],
    ['—', '-', 'mdash', 8212, 'U+2014', 'em dash'],
    ['‘', '\'', 'lsquo', 8216, 'U+2018', 'left single quotation mark'],
    ['’', '\'', 'rsquo', 8217, 'U+2019', 'right single quotation mark'],
    ['‚', '\'', 'sbquo', 8218, 'U+201A', 'single low-9 quotation mark'],
    # equiv is a bare double quote: inside single quotes '\"' would be
    # two characters, a backslash followed by the quote
    ['“', '"', 'ldquo', 8220, 'U+201C', 'left double quotation mark'],
    ['”', '"', 'rdquo', 8221, 'U+201D', 'right double quotation mark'],
    ['„', '"', 'bdquo', 8222, 'U+201E', 'double low-9 quotation mark'],
    ['†', '+', 'dagger', 8224, 'U+2020', 'dagger'],
    ['‡', '++', 'Dagger', 8225, 'U+2021', 'double dagger'],
    ['•', chr(183), 'bull', 8226, 'U+2022', 'bullet = black small circle'],
    ['…', '...', 'hellip', 8230, 'U+2026', 'horizontal ellipsis = three dot leader'],
    ['‰', '%%', 'permil', 8240, 'U+2030', 'per mille sign'],
    ['′', '\'', 'prime', 8242, 'U+2032', 'prime = minutes = feet'],
    ['‹', '<', 'lsaquo', 8249, 'U+2039', 'single left-pointing angle quotation mark'],
    ['›', '>', 'rsaquo', 8250, 'U+203A', 'single right-pointing angle quotation mark'],
    ['‾', '', 'oline', 8254, 'U+203E', 'overline = spacing overscore'],
    ['⁄', '/', 'frasl', 8260, 'U+2044', 'fraction slash'],
    ['€', ' euro ', 'euro', 8364, 'U+20AC', 'euro sign'],
    # --- letterlike symbols and arrows ---
    ['ℑ', 'I', 'image', 8465, 'U+2111', 'blackletter capital I = imaginary part'],
    ['℘', 'P', 'weierp', 8472, 'U+2118', 'script capital P = power set= Weierstrass p'],
    ['ℜ', 'R', 'real', 8476, 'U+211C', 'blackletter capital R = real part symbol'],
    ['™', '(tm)', 'trade', 8482, 'U+2122', 'trade mark sign'],
    ['ℵ', '', 'alefsym', 8501, 'U+2135', 'alef symbol = first transfinite cardinal'],
    ['←', '<-', 'larr', 8592, 'U+2190', 'leftwards arrow'],
    ['↑', '', 'uarr', 8593, 'U+2191', 'upwards arrow'],
    ['→', '->', 'rarr', 8594, 'U+2192', 'rightwards arrow'],
    ['↓', '', 'darr', 8595, 'U+2193', 'downwards arrow'],
    ['↔', '', 'harr', 8596, 'U+2194', 'left right arrow'],
    ['↵', '<-', 'crarr', 8629, 'U+21B5', 'downwards arrow with corner leftwards= carriage return'],
    ['⇐', '<=', 'lArr', 8656, 'U+21D0', 'leftwards double arrow'],
    ['⇑', '', 'uArr', 8657, 'U+21D1', 'upwards double arrow'],
    ['⇒', '=>', 'rArr', 8658, 'U+21D2', 'rightwards double arrow'],
    ['⇓', '', 'dArr', 8659, 'U+21D3', 'downwards double arrow'],
    # --- mathematical operators ---
    ['∀', ' foreach ', 'forall', 8704, 'U+2200', 'for all'],
    ['∂', '', 'part', 8706, 'U+2202', 'partial differential'],
    ['∃', '', 'exist', 8707, 'U+2203', 'there exists'],
    ['∅', '', 'empty', 8709, 'U+2205', 'empty set = null set = diameter'],
    ['∇', '', 'nabla', 8711, 'U+2207', 'nabla = backward difference'],
    ['∈', '', 'isin', 8712, 'U+2208', 'element of'],
    ['∉', '', 'notin', 8713, 'U+2209', 'not an element of'],
    ['∋', '', 'ni', 8715, 'U+220B', 'contains as member'],
    ['∏', '', 'prod', 8719, 'U+220F', 'n-ary product = product sign'],
    ['∑', '', 'sum', 8721, 'U+2211', 'n-ary sumation'],
    ['−', '-', 'minus', 8722, 'U+2212', 'minus sign'],
    ['∗', '*', 'lowast', 8727, 'U+2217', 'asterisk operator'],
    ['√', '', 'radic', 8730, 'U+221A', 'square root = radical sign'],
    ['∝', '', 'prop', 8733, 'U+221D', 'proportional to'],
    ['∞', '', 'infin', 8734, 'U+221E', 'infinity'],
    ['∠', '', 'ang', 8736, 'U+2220', 'angle'],
    ['∧', ' AND ', 'and', 8743, 'U+2227', 'logical and = wedge'],
    ['∨', ' OR ', 'or', 8744, 'U+2228', 'logical or = vee'],
    ['∩', '', 'cap', 8745, 'U+2229', 'intersection = cap'],
    ['∪', '', 'cup', 8746, 'U+222A', 'union = cup'],
    ['∴', '', 'there4', 8756, 'U+2234', 'therefore'],
    ['∼', '~', 'sim', 8764, 'U+223C', 'tilde operator = varies with = similar to'],
    ['≅', '~', 'cong', 8773, 'U+2245', 'approximately equal to'],
    ['≈', '', 'asymp', 8776, 'U+2248', 'almost equal to = asymptotic to'],
    ['≠', '<>', 'ne', 8800, 'U+2260', 'not equal to'],
    ['≡', '', 'equiv', 8801, 'U+2261', 'identical to'],
    ['≤', '<=', 'le', 8804, 'U+2264', 'less-than or equal to'],
    ['≥', '>=', 'ge', 8805, 'U+2265', 'greater-than or equal to'],
    ['⊂', '', 'sub', 8834, 'U+2282', 'subset of'],
    ['⊃', '', 'sup', 8835, 'U+2283', 'superset of'],
    ['⊄', '', 'nsub', 8836, 'U+2284', 'not a subset of'],
    ['⊆', '', 'sube', 8838, 'U+2286', 'subset of or equal to'],
    ['⊇', '', 'supe', 8839, 'U+2287', 'superset of or equal to'],
    ['⊕', '', 'oplus', 8853, 'U+2295', 'circled plus = direct sum'],
    ['⊗', '', 'otimes', 8855, 'U+2297', 'circled times = vector product'],
    ['⊥', '', 'perp', 8869, 'U+22A5', 'up tack = orthogonal to = perpendicular'],
    # equiv is the middle dot chr(183), as for 'bull'; chr(177) is the
    # plus-minus sign
    ['⋅', chr(183), 'sdot', 8901, 'U+22C5', 'dot operator'],
    # --- miscellaneous technical and symbols ---
    ['⌈', '', 'lceil', 8968, 'U+2308', 'left ceiling = apl upstile'],
    ['⌉', '', 'rceil', 8969, 'U+2309', 'right ceiling'],
    ['⌊', '', 'lfloor', 8970, 'U+230A', 'left floor = apl downstile'],
    ['⌋', '', 'rfloor', 8971, 'U+230B', 'right floor'],
    ['〈', '<', 'lang', 9001, 'U+2329', 'left-pointing angle bracket = bra'],
    ['◊', '', 'loz', 9674, 'U+25CA', 'lozenge'],
    ['♠', '', 'spades', 9824, 'U+2660', 'black spade suit'],
    ['♣', '', 'clubs', 9827, 'U+2663', 'black club suit = shamrock'],
    ['♥', '', 'hearts', 9829, 'U+2665', 'black heart suit = valentine'],
    ['♦', '', 'diams', 9830, 'U+2666', 'black diamond suit'],
);

# Load the table into the lookup hashes.
foreach my $entity (@_html_entities) {
    my ($equiv, $name, $number) = @{$entity}[1 .. 3];

    # Derive the character from its number instead of trusting the literal
    # in column 0.
    my $char = chr($number);
    $entity->[0] = $char;

    # Rows without an entity name are reachable by number only; do not
    # create an empty-string key for them.
    $_entity2char{$name}   = $char if length $name;
    $_entity2char{$number} = $char;

    $_char2equiv{$char} = $equiv if $equiv;
}