use strict;
use warnings;

use HTML::Entities;
use HTML::Tagset;

#######################################
# lookup tables                       #
#######################################

# Table of HTML entities; the rows are assigned near the end of this file and
# the two hashes below are derived from them immediately afterwards.
my @_html_entities;
my %_entity2char = ();    # HTML entity name / number => character
my %_char2equiv  = ();    # character => my preferred 'ASCII-compatible' equivalent

# Members of the character classes used by scrub().  Each string is
# interpolated inside [...]; none of the members is special in a class.
my $_dashes = join '', map { chr($_) } (
    0x0096, 0x0097, 0x058A, 0x1806, 0x2010 .. 0x2015,
    0x2053, 0x207B, 0x208B, 0x2212, 0xFE63, 0xFF0D,
);
my $_squots = join '', map { chr($_) } (0x02BC, 0x2018 .. 0x201A, 0x2032);
my $_dquots = join '', map { chr($_) } (0x02EE, 0x201C .. 0x201E);
my $_spaces = join '', map { chr($_) } (0x2000 .. 0x200B, 0x202F, 0x205F, 0x3000);
my $_dots   = join '', map { chr($_) } (0x2022, 0x22C5);

# Compile the classes once instead of on every call.
my $_dash_re  = qr/[$_dashes]/;
my $_squot_re = qr/[$_squots]/;
my $_dquot_re = qr/[$_dquots]/;
my $_space_re = qr/[$_spaces]/;
my $_dot_re   = qr/[$_dots]/;

# One pattern matching any bare phrasal-level tag (<b>, </em>, < i >, ...).
# Tags that carry attributes are deliberately left alone, as before.
my $_phrase_tag_re = do {
    my $names = join '|',
        map { quotemeta($_) } sort keys %HTML::Tagset::isPhraseMarkup;
    length $names ? qr/<\s?\/?(?:$names)\s?>/i : qr/(?!)/;
};

# Code ref used to transliterate characters above U+00FF; resolved lazily by
# _transliterator().
my $_unidecode;

# Return the transliteration routine for 'wide' characters.
#
# Prefers an unidecode() already imported into this package, then tries to
# load Text::Unidecode on demand.  If neither is available a stub returning
# '' is used, so such characters are simply dropped instead of the call dying
# with "Undefined subroutine".
sub _transliterator {
    return $_unidecode if defined $_unidecode;

    if (defined &unidecode) {
        $_unidecode = \&unidecode;
    }
    elsif (eval { require Text::Unidecode; 1 }) {
        $_unidecode = \&Text::Unidecode::unidecode;
    }
    else {
        $_unidecode = sub { return '' };
    }
    return $_unidecode;
}

# scrub($text)
#
# Reduce a fragment of HTML-ish text to plain single-byte text:
#   1. bare phrasal-level tags are replaced by a space;
#   2. HTML entities are decoded (up to three levels of nesting);
#   3. %XX escapes are decoded;
#   4. all whitespace becomes a plain space;
#   5. dashes, quotes and bullets become '-', "'", '"' and U+00B7;
#   6. characters listed in the entity table become their ASCII equivalents;
#   7. any other character above U+00FF is transliterated when possible and
#      removed otherwise;
#   8. leading/trailing whitespace is trimmed and runs are collapsed.
#
# Returns the cleaned string, or "" when $text is undef or empty.
sub scrub {
    my $text = shift;

    # Test for "no text" explicitly: a plain truth test would discard "0".
    return "" unless defined $text && length $text;

    # remove HTML phrasal level tags
    $text =~ s/$_phrase_tag_re/ /g;

    # decode html entities; assume no more than triple nested html entities
    for my $pass (1 .. 3) {
        last unless $text =~ /&#?[a-zA-Z0-9]+;/;
        HTML::Entities::decode_entities($text);
    }

    # replace character escapes
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    # replace 'wide character' whitespace with ascii-compatible whitespace
    $text =~ s/\s/ /g;
    $text =~ s/$_space_re/ /g;

    # transliterate 'wide character' punctuation to ascii-compatible
    # equivalents (with thanks to graff of Perl Monks for this code)
    $text =~ s/$_dash_re/-/g;
    $text =~ s/$_squot_re/'/g;
    $text =~ s/$_dquot_re/"/g;
    $text =~ s/$_dot_re/\x{00B7}/g;

    # replace remaining 'wide' characters with (my preferred)
    # ascii-compatible equivalents
    $text =~ s/(.)/exists $_char2equiv{$1} ? $_char2equiv{$1} : $1/eg;

    # Anything still above U+00FF (including characters beyond U+FFFF):
    # "Text::Unidecode is meant to be a transliterator-of-last resort,..."
    if ($text =~ /[^\x{00}-\x{FF}]/) {
        my $translit = _transliterator();
        $text =~ s{([^\x{00}-\x{FF}])}{
            my $ascii = $translit->($1);
            defined $ascii ? $ascii : '';
        }eg;

        # strip out remaining 'wide' characters
        $text =~ s/[^\x{00}-\x{FF}]//g;
    }

    # trim leading, trailing, and excess whitespace
    $text =~ s/^\s+//;
    $text =~ s/\s{2,}/ /g;
    $text =~ s/\s+$//;

    return $text;
}

#######################################
# entity table                        #
#######################################

# Columns:
#   0 char        informational only; overwritten with chr(column 3) below
#   1 equiv       ASCII-compatible replacement ('' = leave the character alone)
#   2 entity      entity name
#   3 entity      entity number (code point, decimal)
#   4 codepoint   code point in U+XXXX notation
#   5 description
@_html_entities = (
    ['"', '', 'quot', 34, 'U+0022', 'quotation mark = APL quote'],
    ['&', '', 'amp', 38, 'U+0026', 'ampersand'],
    ['<', '', 'lt', 60, 'U+003C', 'less-than sign'],
    ['>', '', 'gt', 62, 'U+003E', 'greater-than sign'],
    ['', '...', '', 133, 'U+0085', ''],
    ['', '-', '', 150, '', ''],
    ['', '-', '', 151, '', ''],
    ['¡', '', 'iexcl', 161, 'U+00A1', 'inverted exclamation mark'],
    ['¢', '', 'cent', 162, 'U+00A2', 'cent sign'],
    ['£', '', 'pound', 163, 'U+00A3', 'pound sign'],
    ['¤', '', 'curren', 164, 'U+00A4', 'currency sign'],
    ['¥', '', 'yen', 165, 'U+00A5', 'yen sign = yuan sign'],
    ['¦', '', 'brvbar', 166, 'U+00A6', 'broken bar = broken vertical bar'],
    ['§', '', 'sect', 167, 'U+00A7', 'section sign'],
    ['¨', '', 'uml', 168, 'U+00A8', 'diaeresis = spacing diaeresis'],
    ['©', '', 'copy', 169, 'U+00A9', 'copyright sign'],
    ['ª', '', 'ordf', 170, 'U+00AA', 'feminine ordinal indicator'],
    ['«', '', 'laquo', 171, 'U+00AB', 'left-pointing double angle quotation mark= left pointing guillemet'],
    ['¬', '', 'not', 172, 'U+00AC', 'not sign'],
    ["\x{00AD}", '', 'shy', 173, 'U+00AD', 'soft hyphen = discretionary hyphen'],
    ['®', '', 'reg', 174, 'U+00AE', 'registered sign = registered trade mark sign'],
    ['¯', '', 'macr', 175, 'U+00AF', 'macron = spacing macron = overline= APL overbar'],
    ['°', '', 'deg', 176, 'U+00B0', 'degree sign'],
    ['±', '', 'plusmn', 177, 'U+00B1', 'plus-minus sign = plus-or-minus sign'],
    ['²', '', 'sup2', 178, 'U+00B2', 'superscript two = superscript digit two= squared'],
    ['³', '', 'sup3', 179, 'U+00B3', 'superscript three = superscript digit three= cubed'],
    ['´', '', 'acute', 180, 'U+00B4', 'acute accent = spacing acute'],
    ['µ', '', 'micro', 181, 'U+00B5', 'micro sign'],
    ['¶', '', 'para', 182, 'U+00B6', 'pilcrow sign = paragraph sign'],
    ['·', '', 'middot', 183, 'U+00B7', 'middle dot = Georgian comma= Greek middle dot'],
    ['¸', '', 'cedil', 184, 'U+00B8', 'cedilla = spacing cedilla'],
    ['¹', '', 'sup1', 185, 'U+00B9', 'superscript one = superscript digit one'],
    ['º', '', 'ordm', 186, 'U+00BA', 'masculine ordinal indicator'],
    ['»', '', 'raquo', 187, 'U+00BB', 'right-pointing double angle quotation mark= right pointing guillemet'],
    ['¼', '', 'frac14', 188, 'U+00BC', 'vulgar fraction one quarter= fraction one quarter'],
    ['½', '', 'frac12', 189, 'U+00BD', 'vulgar fraction one half= fraction one half'],
    ['¾', '', 'frac34', 190, 'U+00BE', 'vulgar fraction three quarters= fraction three quarters'],
    ['¿', '', 'iquest', 191, 'U+00BF', 'inverted question mark= turned question mark'],
    ['À', '', 'Agrave', 192, 'U+00C0', 'latin capital letter A with grave= latin capital letter A grave'],
    ['Á', '', 'Aacute', 193, 'U+00C1', 'latin capital letter A with acute'],
    ['Â', '', 'Acirc', 194, 'U+00C2', 'latin capital letter A with circumflex'],
    ['Ã', '', 'Atilde', 195, 'U+00C3', 'latin capital letter A with tilde'],
    ['Ä', '', 'Auml', 196, 'U+00C4', 'latin capital letter A with diaeresis'],
    ['Å', '', 'Aring', 197, 'U+00C5', 'latin capital letter A with ring above= latin capital letter A ring'],
    ['Æ', '', 'AElig', 198, 'U+00C6', 'latin capital letter AE= latin capital ligature AE'],
    ['Ç', '', 'Ccedil', 199, 'U+00C7', 'latin capital letter C with cedilla'],
    ['È', '', 'Egrave', 200, 'U+00C8', 'latin capital letter E with grave'],
    ['É', '', 'Eacute', 201, 'U+00C9', 'latin capital letter E with acute'],
    ['Ê', '', 'Ecirc', 202, 'U+00CA', 'latin capital letter E with circumflex'],
    ['Ë', '', 'Euml', 203, 'U+00CB', 'latin capital letter E with diaeresis'],
    ['Ì', '', 'Igrave', 204, 'U+00CC', 'latin capital letter I with grave'],
    ['Í', '', 'Iacute', 205, 'U+00CD', 'latin capital letter I with acute'],
    ['Î', '', 'Icirc', 206, 'U+00CE', 'latin capital letter I with circumflex'],
    ['Ï', '', 'Iuml', 207, 'U+00CF', 'latin capital letter I with diaeresis'],
    ['Ð', '', 'ETH', 208, 'U+00D0', 'latin capital letter ETH'],
    ['Ñ', '', 'Ntilde', 209, 'U+00D1', 'latin capital letter N with tilde'],
    ['Ò', '', 'Ograve', 210, 'U+00D2', 'latin capital letter O with grave'],
    ['Ó', '', 'Oacute', 211, 'U+00D3', 'latin capital letter O with acute'],
    ['Ô', '', 'Ocirc', 212, 'U+00D4', 'latin capital letter O with circumflex'],
    ['Õ', '', 'Otilde', 213, 'U+00D5', 'latin capital letter O with tilde'],
    ['Ö', '', 'Ouml', 214, 'U+00D6', 'latin capital letter O with diaeresis'],
    ['×', '', 'times', 215, 'U+00D7', 'multiplication sign'],
    ['Ø', '', 'Oslash', 216, 'U+00D8', 'latin capital letter O with stroke= latin capital letter O slash'],
    ['Ù', '', 'Ugrave', 217, 'U+00D9', 'latin capital letter U with grave'],
    ['Ú', '', 'Uacute', 218, 'U+00DA', 'latin capital letter U with acute'],
    ['Û', '', 'Ucirc', 219, 'U+00DB', 'latin capital letter U with circumflex'],
    ['Ü', '', 'Uuml', 220, 'U+00DC', 'latin capital letter U with diaeresis'],
    ['Ý', '', 'Yacute', 221, 'U+00DD', 'latin capital letter Y with acute'],
    ['Þ', '', 'THORN', 222, 'U+00DE', 'latin capital letter THORN'],
    ['ß', '', 'szlig', 223, 'U+00DF', 'latin small letter sharp s = ess-zed'],
    ['à', '', 'agrave', 224, 'U+00E0', 'latin small letter a with grave= latin small letter a grave'],
    ['á', '', 'aacute', 225, 'U+00E1', 'latin small letter a with acute'],
    ['â', '', 'acirc', 226, 'U+00E2', 'latin small letter a with circumflex'],
    ['ã', '', 'atilde', 227, 'U+00E3', 'latin small letter a with tilde'],
    ['ä', '', 'auml', 228, 'U+00E4', 'latin small letter a with diaeresis'],
    ['å', '', 'aring', 229, 'U+00E5', 'latin small letter a with ring above= latin small letter a ring'],
    ['æ', '', 'aelig', 230, 'U+00E6', 'latin small letter ae= latin small ligature ae'],
    ['ç', '', 'ccedil', 231, 'U+00E7', 'latin small letter c with cedilla'],
    ['è', '', 'egrave', 232, 'U+00E8', 'latin small letter e with grave'],
    ['é', '', 'eacute', 233, 'U+00E9', 'latin small letter e with acute'],
    ['ê', '', 'ecirc', 234, 'U+00EA', 'latin small letter e with circumflex'],
    ['ë', '', 'euml', 235, 'U+00EB', 'latin small letter e with diaeresis'],
    ['ì', '', 'igrave', 236, 'U+00EC', 'latin small letter i with grave'],
    ['í', '', 'iacute', 237, 'U+00ED', 'latin small letter i with acute'],
    ['î', '', 'icirc', 238, 'U+00EE', 'latin small letter i with circumflex'],
    ['ï', '', 'iuml', 239, 'U+00EF', 'latin small letter i with diaeresis'],
    ['ð', '', 'eth', 240, 'U+00F0', 'latin small letter eth'],
    ['ñ', '', 'ntilde', 241, 'U+00F1', 'latin small letter n with tilde'],
    ['ò', '', 'ograve', 242, 'U+00F2', 'latin small letter o with grave'],
    ['ó', '', 'oacute', 243, 'U+00F3', 'latin small letter o with acute'],
    ['ô', '', 'ocirc', 244, 'U+00F4', 'latin small letter o with circumflex'],
    ['õ', '', 'otilde', 245, 'U+00F5', 'latin small letter o with tilde'],
    ['ö', '', 'ouml', 246, 'U+00F6', 'latin small letter o with diaeresis'],
    ['÷', '', 'divide', 247, 'U+00F7', 'division sign'],
    ['ù', '', 'ugrave', 249, 'U+00F9', 'latin small letter u with grave'],
    ['ú', '', 'uacute', 250, 'U+00FA', 'latin small letter u with acute'],
    ['û', '', 'ucirc', 251, 'U+00FB', 'latin small letter u with circumflex'],
    ['ü', '', 'uuml', 252, 'U+00FC', 'latin small letter u with diaeresis'],
    ['ý', '', 'yacute', 253, 'U+00FD', 'latin small letter y with acute'],
    ['þ', '', 'thorn', 254, 'U+00FE', 'latin small letter thorn'],
    ['ÿ', '', 'yuml', 255, 'U+00FF', 'latin small letter y with diaeresis'],
    ['Œ', 'OE', 'OElig', 338, 'U+0152', 'latin capital ligature OE'],
    ['œ', 'oe', 'oelig', 339, 'U+0153', 'latin small ligature oe'],
    ['Š', 'S', 'Scaron', 352, 'U+0160', 'latin capital letter S with caron'],
    ['š', 's', 'scaron', 353, 'U+0161', 'latin small letter s with caron'],
    ['Ÿ', 'Y', 'Yuml', 376, 'U+0178', 'latin capital letter Y with diaeresis'],
    ['ƒ', 'f', 'fnof', 402, 'U+0192', 'latin small f with hook = function= florin'],
    ['ˆ', '', 'circ', 710, 'U+02C6', 'modifier letter circumflex accent'],
    ['˜', '', 'tilde', 732, 'U+02DC', 'small tilde'],
    ['Γ', ' Gamma ', 'Gamma', 915, 'U+0393', 'greek capital letter gamma'],
    ['Δ', ' Delta ', 'Delta', 916, 'U+0394', 'greek capital letter delta'],
    ['Θ', ' Theta ', 'Theta', 920, 'U+0398', 'greek capital letter theta'],
    ['Λ', ' Lambda ', 'Lambda', 923, 'U+039B', 'greek capital letter lambda'],
    ['Ξ', ' Xi ', 'Xi', 926, 'U+039E', 'greek capital letter xi'],
    ['Π', ' Pi ', 'Pi', 928, 'U+03A0', 'greek capital letter pi'],
    ['Σ', ' Sigma ', 'Sigma', 931, 'U+03A3', 'greek capital letter sigma'],
    ['Υ', ' Upsilon ', 'Upsilon', 933, 'U+03A5', 'greek capital letter upsilon'],
    ['Φ', ' Phi ', 'Phi', 934, 'U+03A6', 'greek capital letter phi'],
    ['Ψ', ' Psi ', 'Psi', 936, 'U+03A8', 'greek capital letter psi'],
    ['Ω', ' Omega ', 'Omega', 937, 'U+03A9', 'greek capital letter omega'],
    ['α', ' alpha ', 'alpha', 945, 'U+03B1', 'greek small letter alpha'],
    ['β', ' beta ', 'beta', 946, 'U+03B2', 'greek small letter beta'],
    ['γ', ' gamma ', 'gamma', 947, 'U+03B3', 'greek small letter gamma'],
    ['δ', ' delta ', 'delta', 948, 'U+03B4', 'greek small letter delta'],
    ['ε', ' epsilon ', 'epsilon', 949, 'U+03B5', 'greek small letter epsilon'],
    ['η', ' eta ', 'eta', 951, 'U+03B7', 'greek small letter eta'],
    ['θ', ' theta ', 'theta', 952, 'U+03B8', 'greek small letter theta'],
    ['ι', ' iota ', 'iota', 953, 'U+03B9', 'greek small letter iota'],
    ['κ', ' kappa ', 'kappa', 954, 'U+03BA', 'greek small letter kappa'],
    ['λ', ' lambda ', 'lambda', 955, 'U+03BB', 'greek small letter lambda'],
    ['μ', ' mu ', 'mu', 956, 'U+03BC', 'greek small letter mu'],
    ['ν', ' nu ', 'nu', 957, 'U+03BD', 'greek small letter nu'],
    ['ξ', ' xi ', 'xi', 958, 'U+03BE', 'greek small letter xi'],
    ['ο', ' omicron ', 'omicron', 959, 'U+03BF', 'greek small letter omicron'],
    ['π', ' pi ', 'pi', 960, 'U+03C0', 'greek small letter pi'],
    ['ρ', ' rho ', 'rho', 961, 'U+03C1', 'greek small letter rho'],
    ['ς', ' sigma ', 'sigmaf', 962, 'U+03C2', 'greek small letter final sigma'],
    ['σ', ' sigma ', 'sigma', 963, 'U+03C3', 'greek small letter sigma'],
    ['τ', ' tau ', 'tau', 964, 'U+03C4', 'greek small letter tau'],
    ['υ', ' upsilon ', 'upsilon', 965, 'U+03C5', 'greek small letter upsilon'],
    ['φ', ' phi ', 'phi', 966, 'U+03C6', 'greek small letter phi'],
    ['χ', ' chi ', 'chi', 967, 'U+03C7', 'greek small letter chi'],
    ['ψ', ' psi ', 'psi', 968, 'U+03C8', 'greek small letter psi'],
    ['ω', ' omega ', 'omega', 969, 'U+03C9', 'greek small letter omega'],
    ['ϑ', ' theta ', 'thetasym', 977, 'U+03D1', 'greek small letter theta symbol'],
    ['ϒ', ' upsilon ', 'upsih', 978, 'U+03D2', 'greek upsilon with hook symbol'],
    ['ϖ', ' pi ', 'piv', 982, 'U+03D6', 'greek pi symbol'],
    ["\x{2002}", ' ', 'ensp', 8194, 'U+2002', 'en space'],
    ["\x{2003}", ' ', 'emsp', 8195, 'U+2003', 'em space'],
    ["\x{2009}", ' ', 'thinsp', 8201, 'U+2009', 'thin space'],
    ["\x{200C}", '', 'zwnj', 8204, 'U+200C', 'zero width non-joiner'],
    ["\x{200D}", '', 'zwj', 8205, 'U+200D', 'zero width joiner'],
    ["\x{200E}", '->', 'lrm', 8206, 'U+200E', 'left-to-right mark'],
    ["\x{200F}", '<-', 'rlm', 8207, 'U+200F', 'right-to-left mark'],
    ['–', '-', 'ndash', 8211, 'U+2013', 'en dash'],
    ['—', '-', 'mdash', 8212, 'U+2014', 'em dash'],
    ['‘', '\'', 'lsquo', 8216, 'U+2018', 'left single quotation mark'],
    ['’', '\'', 'rsquo', 8217, 'U+2019', 'right single quotation mark'],
    ['‚', '\'', 'sbquo', 8218, 'U+201A', 'single low-9 quotation mark'],
    # '"' rather than '\"': inside single quotes the backslash would be kept.
    ['“', '"', 'ldquo', 8220, 'U+201C', 'left double quotation mark'],
    ['”', '"', 'rdquo', 8221, 'U+201D', 'right double quotation mark'],
    ['„', '"', 'bdquo', 8222, 'U+201E', 'double low-9 quotation mark'],
    ['†', '+', 'dagger', 8224, 'U+2020', 'dagger'],
    ['‡', '++', 'Dagger', 8225, 'U+2021', 'double dagger'],
    ['•', chr(183), 'bull', 8226, 'U+2022', 'bullet = black small circle'],
    ['…', '...', 'hellip', 8230, 'U+2026', 'horizontal ellipsis = three dot leader'],
    ['‰', '%%', 'permil', 8240, 'U+2030', 'per mille sign'],
    ['′', '\'', 'prime', 8242, 'U+2032', 'prime = minutes = feet'],
    ['‹', '<', 'lsaquo', 8249, 'U+2039', 'single left-pointing angle quotation mark'],
    ['›', '>', 'rsaquo', 8250, 'U+203A', 'single right-pointing angle quotation mark'],
    ['‾', '', 'oline', 8254, 'U+203E', 'overline = spacing overscore'],
    ['⁄', '/', 'frasl', 8260, 'U+2044', 'fraction slash'],
    ['€', ' euro ', 'euro', 8364, 'U+20AC', 'euro sign'],
    ['ℑ', 'I', 'image', 8465, 'U+2111', 'blackletter capital I = imaginary part'],
    ['℘', 'P', 'weierp', 8472, 'U+2118', 'script capital P = power set= Weierstrass p'],
    ['ℜ', 'R', 'real', 8476, 'U+211C', 'blackletter capital R = real part symbol'],
    ['™', '(tm)', 'trade', 8482, 'U+2122', 'trade mark sign'],
    ['ℵ', '', 'alefsym', 8501, 'U+2135', 'alef symbol = first transfinite cardinal'],
    ['←', '<-', 'larr', 8592, 'U+2190', 'leftwards arrow'],
    ['↑', '', 'uarr', 8593, 'U+2191', 'upwards arrow'],
    ['→', '->', 'rarr', 8594, 'U+2192', 'rightwards arrow'],
    ['↓', '', 'darr', 8595, 'U+2193', 'downwards arrow'],
    ['↔', '', 'harr', 8596, 'U+2194', 'left right arrow'],
    ['↵', '<-', 'crarr', 8629, 'U+21B5', 'downwards arrow with corner leftwards= carriage return'],
    ['⇐', '<=', 'lArr', 8656, 'U+21D0', 'leftwards double arrow'],
    ['⇑', '', 'uArr', 8657, 'U+21D1', 'upwards double arrow'],
    ['⇒', '=>', 'rArr', 8658, 'U+21D2', 'rightwards double arrow'],
    ['⇓', '', 'dArr', 8659, 'U+21D3', 'downwards double arrow'],
    ['∀', ' foreach ', 'forall', 8704, 'U+2200', 'for all'],
    ['∂', '', 'part', 8706, 'U+2202', 'partial differential'],
    ['∃', '', 'exist', 8707, 'U+2203', 'there exists'],
    ['∅', '', 'empty', 8709, 'U+2205', 'empty set = null set = diameter'],
    ['∇', '', 'nabla', 8711, 'U+2207', 'nabla = backward difference'],
    ['∈', '', 'isin', 8712, 'U+2208', 'element of'],
    ['∉', '', 'notin', 8713, 'U+2209', 'not an element of'],
    ['∋', '', 'ni', 8715, 'U+220B', 'contains as member'],
    ['∏', '', 'prod', 8719, 'U+220F', 'n-ary product = product sign'],
    ['∑', '', 'sum', 8721, 'U+2211', 'n-ary sumation'],
    ['−', '-', 'minus', 8722, 'U+2212', 'minus sign'],
    ['∗', '*', 'lowast', 8727, 'U+2217', 'asterisk operator'],
    ['√', '', 'radic', 8730, 'U+221A', 'square root = radical sign'],
    ['∝', '', 'prop', 8733, 'U+221D', 'proportional to'],
    ['∞', '', 'infin', 8734, 'U+221E', 'infinity'],
    ['∠', '', 'ang', 8736, 'U+2220', 'angle'],
    ['∧', ' AND ', 'and', 8743, 'U+2227', 'logical and = wedge'],
    ['∨', ' OR ', 'or', 8744, 'U+2228', 'logical or = vee'],
    ['∩', '', 'cap', 8745, 'U+2229', 'intersection = cap'],
    ['∪', '', 'cup', 8746, 'U+222A', 'union = cup'],
    ['∴', '', 'there4', 8756, 'U+2234', 'therefore'],
    ['∼', '~', 'sim', 8764, 'U+223C', 'tilde operator = varies with = similar to'],
    ['≅', '~', 'cong', 8773, 'U+2245', 'approximately equal to'],
    ['≈', '', 'asymp', 8776, 'U+2248', 'almost equal to = asymptotic to'],
    ['≠', '<>', 'ne', 8800, 'U+2260', 'not equal to'],
    ['≡', '', 'equiv', 8801, 'U+2261', 'identical to'],
    ['≤', '<=', 'le', 8804, 'U+2264', 'less-than or equal to'],
    ['≥', '>=', 'ge', 8805, 'U+2265', 'greater-than or equal to'],
    ['⊂', '', 'sub', 8834, 'U+2282', 'subset of'],
    ['⊃', '', 'sup', 8835, 'U+2283', 'superset of'],
    ['⊄', '', 'nsub', 8836, 'U+2284', 'not a subset of'],
    ['⊆', '', 'sube', 8838, 'U+2286', 'subset of or equal to'],
    ['⊇', '', 'supe', 8839, 'U+2287', 'superset of or equal to'],
    ['⊕', '', 'oplus', 8853, 'U+2295', 'circled plus = direct sum'],
    ['⊗', '', 'otimes', 8855, 'U+2297', 'circled times = vector product'],
    ['⊥', '', 'perp', 8869, 'U+22A5', 'up tack = orthogonal to = perpendicular'],
    # Middle dot (183), matching 'bull' above and the $_dots rule in scrub();
    # this row previously said chr(177), which is the plus-minus sign.
    ['⋅', chr(183), 'sdot', 8901, 'U+22C5', 'dot operator'],
    ['⌈', '', 'lceil', 8968, 'U+2308', 'left ceiling = apl upstile'],
    ['⌉', '', 'rceil', 8969, 'U+2309', 'right ceiling'],
    ['⌊', '', 'lfloor', 8970, 'U+230A', 'left floor = apl downstile'],
    ['⌋', '', 'rfloor', 8971, 'U+230B', 'right floor'],
    ['〈', '<', 'lang', 9001, 'U+2329', 'left-pointing angle bracket = bra'],
    ['◊', '', 'loz', 9674, 'U+25CA', 'lozenge'],
    ['♠', '', 'spades', 9824, 'U+2660', 'black spade suit'],
    ['♣', '', 'clubs', 9827, 'U+2663', 'black club suit = shamrock'],
    ['♥', '', 'hearts', 9829, 'U+2665', 'black heart suit = valentine'],
    ['♦', '', 'diams', 9830, 'U+2666', 'black diamond suit'],
);

#######################################
# initialization                      #
#######################################

# Build the lookup hashes from the table.  This runs as ordinary file-level
# code directly after the table is assigned: a BEGIN block would run at
# compile time, before the assignment above has executed, and would see an
# empty table.
foreach my $entity (@_html_entities) {
    my ($equiv, $name, $number) = @{$entity}[1, 2, 3];
    my $char = chr($number);

    $entity->[0]           = $char;    # normalise column 0 from the code point
    $_entity2char{$name}   = $char if length $name;
    $_entity2char{$number} = $char;
    $_char2equiv{$char}    = $equiv if $equiv;
}