panda.doc.html.HTMLEntities Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of panda-core Show documentation
Show all versions of panda-core Show documentation
Panda Core is the core module of the Panda Framework; it contains commonly used utility classes similar to those in Apache Commons.
package panda.doc.html;
import java.util.HashMap;
import java.util.Map;
import panda.lang.Arrays;
import panda.lang.Collections;
/**
* HTML entities
*/
@SuppressWarnings("unchecked")
public class HTMLEntities {
public static final String NBSP = " ";
public static final String APOS = "'";
public static final String QUOT = """;
public static final String AMP = "&";
public static final String LT = "<";
public static final String GT = ">";
/**
* Mapping to escape the basic XML and HTML character entities. Namely: {@code " & < >}
*/
public static final Map BASIC_ESCAPE = Arrays.toMap(
"\"", QUOT, // " - double-quote
"&", AMP, // & - ampersand
"<", LT, // < - less-than
">", GT // > - greater-than
);
/**
* Reverse of {@link #BASIC_ESCAPE} for unescaping purposes.
*/
public static final Map BASIC_UNESCAPE = invert(BASIC_ESCAPE);
/**
* Mapping to escape the apostrophe character to its XML character entity.
*/
public static final Map APOS_ESCAPE = Arrays.toMap("'", APOS);
/**
* Reverse of {@link #APOS_ESCAPE} for unescaping purposes.
*/
public static final Map APOS_UNESCAPE = invert(APOS_ESCAPE);
/**
* XML escape map
*/
public static final Map XML_ESCAPE = merge(BASIC_ESCAPE, APOS_ESCAPE);
/**
* Reverse of {@link #XML_ESCAPE} for unescaping purposes.
*/
public static final Map XML_UNESCAPE = invert(XML_ESCAPE);
/**
* Mapping to escape ISO-8859-1
* characters to their named HTML 3.x equivalents.
*/
public static final Map ISO8859_1_ESCAPE = Arrays.toMap(
"\u00A0", NBSP, // non-breaking space
"\u00A1", "¡", // inverted exclamation mark
"\u00A2", "¢", // cent sign
"\u00A3", "£", // pound sign
"\u00A4", "¤", // currency sign
"\u00A5", "¥", // yen sign = yuan sign
"\u00A6", "¦", // broken bar = broken vertical bar
"\u00A7", "§", // section sign
"\u00A8", "¨", // diaeresis = spacing diaeresis
"\u00A9", "©", // © - copyright sign
"\u00AA", "ª", // feminine ordinal indicator
"\u00AB", "«", // left-pointing double angle quotation mark = left pointing guillemet
"\u00AC", "¬", // not sign
"\u00AD", "", // soft hyphen = discretionary hyphen
"\u00AE", "®", // ® - registered trademark sign
"\u00AF", "¯", // macron = spacing macron = overline = APL overbar
"\u00B0", "°", // degree sign
"\u00B1", "±", // plus-minus sign = plus-or-minus sign
"\u00B2", "²", // superscript two = superscript digit two = squared
"\u00B3", "³", // superscript three = superscript digit three = cubed
"\u00B4", "´", // acute accent = spacing acute
"\u00B5", "µ", // micro sign
"\u00B6", "¶", // pilcrow sign = paragraph sign
"\u00B7", "·", // middle dot = Georgian comma = Greek middle dot
"\u00B8", "¸", // cedilla = spacing cedilla
"\u00B9", "¹", // superscript one = superscript digit one
"\u00BA", "º", // masculine ordinal indicator
"\u00BB", "»", // right-pointing double angle quotation mark = right pointing
// guillemet
"\u00BC", "¼", // vulgar fraction one quarter = fraction one quarter
"\u00BD", "½", // vulgar fraction one half = fraction one half
"\u00BE", "¾", // vulgar fraction three quarters = fraction three quarters
"\u00BF", "¿", // inverted question mark = turned question mark
"\u00C0", "À", // À - uppercase A, grave accent
"\u00C1", "Á", // Á - uppercase A, acute accent
"\u00C2", "Â", // Â - uppercase A, circumflex accent
"\u00C3", "Ã", // Ã - uppercase A, tilde
"\u00C4", "Ä", // Ä - uppercase A, umlaut
"\u00C5", "Å", // Å - uppercase A, ring
"\u00C6", "Æ", // Æ - uppercase AE
"\u00C7", "Ç", // Ç - uppercase C, cedilla
"\u00C8", "È", // È - uppercase E, grave accent
"\u00C9", "É", // É - uppercase E, acute accent
"\u00CA", "Ê", // Ê - uppercase E, circumflex accent
"\u00CB", "Ë", // Ë - uppercase E, umlaut
"\u00CC", "Ì", // Ì - uppercase I, grave accent
"\u00CD", "Í", // Í - uppercase I, acute accent
"\u00CE", "Î", // Î - uppercase I, circumflex accent
"\u00CF", "Ï", // Ï - uppercase I, umlaut
"\u00D0", "Ð", // Ð - uppercase Eth, Icelandic
"\u00D1", "Ñ", // Ñ - uppercase N, tilde
"\u00D2", "Ò", // Ò - uppercase O, grave accent
"\u00D3", "Ó", // Ó - uppercase O, acute accent
"\u00D4", "Ô", // Ô - uppercase O, circumflex accent
"\u00D5", "Õ", // Õ - uppercase O, tilde
"\u00D6", "Ö", // Ö - uppercase O, umlaut
"\u00D7", "×", // multiplication sign
"\u00D8", "Ø", // Ø - uppercase O, slash
"\u00D9", "Ù", // Ù - uppercase U, grave accent
"\u00DA", "Ú", // Ú - uppercase U, acute accent
"\u00DB", "Û", // Û - uppercase U, circumflex accent
"\u00DC", "Ü", // Ü - uppercase U, umlaut
"\u00DD", "Ý", // Ý - uppercase Y, acute accent
"\u00DE", "Þ", // Þ - uppercase THORN, Icelandic
"\u00DF", "ß", // ß - lowercase sharps, German
"\u00E0", "à", // à - lowercase a, grave accent
"\u00E1", "á", // á - lowercase a, acute accent
"\u00E2", "â", // â - lowercase a, circumflex accent
"\u00E3", "ã", // ã - lowercase a, tilde
"\u00E4", "ä", // ä - lowercase a, umlaut
"\u00E5", "å", // å - lowercase a, ring
"\u00E6", "æ", // æ - lowercase ae
"\u00E7", "ç", // ç - lowercase c, cedilla
"\u00E8", "è", // è - lowercase e, grave accent
"\u00E9", "é", // é - lowercase e, acute accent
"\u00EA", "ê", // ê - lowercase e, circumflex accent
"\u00EB", "ë", // ë - lowercase e, umlaut
"\u00EC", "ì", // ì - lowercase i, grave accent
"\u00ED", "í", // í - lowercase i, acute accent
"\u00EE", "î", // î - lowercase i, circumflex accent
"\u00EF", "ï", // ï - lowercase i, umlaut
"\u00F0", "ð", // ð - lowercase eth, Icelandic
"\u00F1", "ñ", // ñ - lowercase n, tilde
"\u00F2", "ò", // ò - lowercase o, grave accent
"\u00F3", "ó", // ó - lowercase o, acute accent
"\u00F4", "ô", // ô - lowercase o, circumflex accent
"\u00F5", "õ", // õ - lowercase o, tilde
"\u00F6", "ö", // ö - lowercase o, umlaut
"\u00F7", "÷", // division sign
"\u00F8", "ø", // ø - lowercase o, slash
"\u00F9", "ù", // ù - lowercase u, grave accent
"\u00FA", "ú", // ú - lowercase u, acute accent
"\u00FB", "û", // û - lowercase u, circumflex accent
"\u00FC", "ü", // ü - lowercase u, umlaut
"\u00FD", "ý", // ý - lowercase y, acute accent
"\u00FE", "þ", // þ - lowercase thorn, Icelandic
"\u00FF", "ÿ" // ÿ - lowercase y, umlaut
);
/**
* Reverse of {@link #ISO8859_1_ESCAPE} for unescaping purposes.
*/
public static final Map ISO8859_1_UNESCAPE = invert(ISO8859_1_ESCAPE);
/**
* Mapping to escape ISO-8859-1
* characters to their named HTML 3.x equivalents.
*/
public static final Map HTML3_ESCAPE = merge(XML_ESCAPE, ISO8859_1_ESCAPE);
/**
* Reverse of {@link #HTML3_ESCAPE} for unescaping purposes.
*/
public static final Map HTML3_UNESCAPE = invert(HTML3_ESCAPE);
/**
* Mapping to escape additional character entity references.
* Note that this must be used with {@link #ISO8859_1_ESCAPE} to get the full list of HTML 4.0
* character entities.
*/
public static final Map HTML40_EXTENDED_ESCAPE = Arrays.toMap(
//
"\u0192", "ƒ", // latin small f with hook = function= florin, U+0192 ISOtech
// -->
//
"\u0391", "Α", // greek capital letter alpha, U+0391 -->
"\u0392", "Β", // greek capital letter beta, U+0392 -->
"\u0393", "Γ", // greek capital letter gamma,U+0393 ISOgrk3 -->
"\u0394", "Δ", // greek capital letter delta,U+0394 ISOgrk3 -->
"\u0395", "Ε", // greek capital letter epsilon, U+0395 -->
"\u0396", "Ζ", // greek capital letter zeta, U+0396 -->
"\u0397", "Η", // greek capital letter eta, U+0397 -->
"\u0398", "Θ", // greek capital letter theta,U+0398 ISOgrk3 -->
"\u0399", "Ι", // greek capital letter iota, U+0399 -->
"\u039A", "Κ", // greek capital letter kappa, U+039A -->
"\u039B", "Λ", // greek capital letter lambda,U+039B ISOgrk3 -->
"\u039C", "Μ", // greek capital letter mu, U+039C -->
"\u039D", "Ν", // greek capital letter nu, U+039D -->
"\u039E", "Ξ", // greek capital letter xi, U+039E ISOgrk3 -->
"\u039F", "Ο", // greek capital letter omicron, U+039F -->
"\u03A0", "Π", // greek capital letter pi, U+03A0 ISOgrk3 -->
"\u03A1", "Ρ", // greek capital letter rho, U+03A1 -->
//
"\u03A3", "Σ", // greek capital letter sigma,U+03A3 ISOgrk3 -->
"\u03A4", "Τ", // greek capital letter tau, U+03A4 -->
"\u03A5", "Υ", // greek capital letter upsilon,U+03A5 ISOgrk3 -->
"\u03A6", "Φ", // greek capital letter phi,U+03A6 ISOgrk3 -->
"\u03A7", "Χ", // greek capital letter chi, U+03A7 -->
"\u03A8", "Ψ", // greek capital letter psi,U+03A8 ISOgrk3 -->
"\u03A9", "Ω", // greek capital letter omega,U+03A9 ISOgrk3 -->
"\u03B1", "α", // greek small letter alpha,U+03B1 ISOgrk3 -->
"\u03B2", "β", // greek small letter beta, U+03B2 ISOgrk3 -->
"\u03B3", "γ", // greek small letter gamma,U+03B3 ISOgrk3 -->
"\u03B4", "δ", // greek small letter delta,U+03B4 ISOgrk3 -->
"\u03B5", "ε", // greek small letter epsilon,U+03B5 ISOgrk3 -->
"\u03B6", "ζ", // greek small letter zeta, U+03B6 ISOgrk3 -->
"\u03B7", "η", // greek small letter eta, U+03B7 ISOgrk3 -->
"\u03B8", "θ", // greek small letter theta,U+03B8 ISOgrk3 -->
"\u03B9", "ι", // greek small letter iota, U+03B9 ISOgrk3 -->
"\u03BA", "κ", // greek small letter kappa,U+03BA ISOgrk3 -->
"\u03BB", "λ", // greek small letter lambda,U+03BB ISOgrk3 -->
"\u03BC", "μ", // greek small letter mu, U+03BC ISOgrk3 -->
"\u03BD", "ν", // greek small letter nu, U+03BD ISOgrk3 -->
"\u03BE", "ξ", // greek small letter xi, U+03BE ISOgrk3 -->
"\u03BF", "ο", // greek small letter omicron, U+03BF NEW -->
"\u03C0", "π", // greek small letter pi, U+03C0 ISOgrk3 -->
"\u03C1", "ρ", // greek small letter rho, U+03C1 ISOgrk3 -->
"\u03C2", "ς", // greek small letter final sigma,U+03C2 ISOgrk3 -->
"\u03C3", "σ", // greek small letter sigma,U+03C3 ISOgrk3 -->
"\u03C4", "τ", // greek small letter tau, U+03C4 ISOgrk3 -->
"\u03C5", "υ", // greek small letter upsilon,U+03C5 ISOgrk3 -->
"\u03C6", "φ", // greek small letter phi, U+03C6 ISOgrk3 -->
"\u03C7", "χ", // greek small letter chi, U+03C7 ISOgrk3 -->
"\u03C8", "ψ", // greek small letter psi, U+03C8 ISOgrk3 -->
"\u03C9", "ω", // greek small letter omega,U+03C9 ISOgrk3 -->
"\u03D1", "ϑ", // greek small letter theta symbol,U+03D1 NEW -->
"\u03D2", "ϒ", // greek upsilon with hook symbol,U+03D2 NEW -->
"\u03D6", "ϖ", // greek pi symbol, U+03D6 ISOgrk3 -->
//
"\u2022", "•", // bullet = black small circle,U+2022 ISOpub -->
//
"\u2026", "…", // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
"\u2032", "′", // prime = minutes = feet, U+2032 ISOtech -->
"\u2033", "″", // double prime = seconds = inches,U+2033 ISOtech -->
"\u203E", "‾", // overline = spacing overscore,U+203E NEW -->
"\u2044", "⁄", // fraction slash, U+2044 NEW -->
//
"\u2118", "℘", // script capital P = power set= Weierstrass p, U+2118 ISOamso
// -->
"\u2111", "ℑ", // blackletter capital I = imaginary part,U+2111 ISOamso -->
"\u211C", "ℜ", // blackletter capital R = real part symbol,U+211C ISOamso -->
"\u2122", "™", // trade mark sign, U+2122 ISOnum -->
"\u2135", "ℵ", // alef symbol = first transfinite cardinal,U+2135 NEW -->
//
//
"\u2190", "←", // leftwards arrow, U+2190 ISOnum -->
"\u2191", "↑", // upwards arrow, U+2191 ISOnum-->
"\u2192", "→", // rightwards arrow, U+2192 ISOnum -->
"\u2193", "↓", // downwards arrow, U+2193 ISOnum -->
"\u2194", "↔", // left right arrow, U+2194 ISOamsa -->
"\u21B5", "↵", // downwards arrow with corner leftwards= carriage return,
// U+21B5 NEW -->
"\u21D0", "⇐", // leftwards double arrow, U+21D0 ISOtech -->
//
"\u21D1", "⇑", // upwards double arrow, U+21D1 ISOamsa -->
"\u21D2", "⇒", // rightwards double arrow,U+21D2 ISOtech -->
//
"\u21D3", "⇓", // downwards double arrow, U+21D3 ISOamsa -->
"\u21D4", "⇔", // left right double arrow,U+21D4 ISOamsa -->
//
"\u2200", "∀", // for all, U+2200 ISOtech -->
"\u2202", "∂", // partial differential, U+2202 ISOtech -->
"\u2203", "∃", // there exists, U+2203 ISOtech -->
"\u2205", "∅", // empty set = null set = diameter,U+2205 ISOamso -->
"\u2207", "∇", // nabla = backward difference,U+2207 ISOtech -->
"\u2208", "∈", // element of, U+2208 ISOtech -->
"\u2209", "∉", // not an element of, U+2209 ISOtech -->
"\u220B", "∋", // contains as member, U+220B ISOtech -->
//
"\u220F", "∏", // n-ary product = product sign,U+220F ISOamsb -->
//
"\u2211", "∑", // n-ary summation, U+2211 ISOamsb -->
//
"\u2212", "−", // minus sign, U+2212 ISOtech -->
"\u2217", "∗", // asterisk operator, U+2217 ISOtech -->
"\u221A", "√", // square root = radical sign,U+221A ISOtech -->
"\u221D", "∝", // proportional to, U+221D ISOtech -->
"\u221E", "∞", // infinity, U+221E ISOtech -->
"\u2220", "∠", // angle, U+2220 ISOamso -->
"\u2227", "∧", // logical and = wedge, U+2227 ISOtech -->
"\u2228", "∨", // logical or = vee, U+2228 ISOtech -->
"\u2229", "∩", // intersection = cap, U+2229 ISOtech -->
"\u222A", "∪", // union = cup, U+222A ISOtech -->
"\u222B", "∫", // integral, U+222B ISOtech -->
"\u2234", "∴", // therefore, U+2234 ISOtech -->
"\u223C", "∼", // tilde operator = varies with = similar to,U+223C ISOtech -->
//
"\u2245", "≅", // approximately equal to, U+2245 ISOtech -->
"\u2248", "≈", // almost equal to = asymptotic to,U+2248 ISOamsr -->
"\u2260", "≠", // not equal to, U+2260 ISOtech -->
"\u2261", "≡", // identical to, U+2261 ISOtech -->
"\u2264", "≤", // less-than or equal to, U+2264 ISOtech -->
"\u2265", "≥", // greater-than or equal to,U+2265 ISOtech -->
"\u2282", "⊂", // subset of, U+2282 ISOtech -->
"\u2283", "⊃", // superset of, U+2283 ISOtech -->
//
"\u2284", "⊅", // not a subset of, U+2284 ISOamsn -->
"\u2286", "⊆", // subset of or equal to, U+2286 ISOtech -->
"\u2287", "⊇", // superset of or equal to,U+2287 ISOtech -->
"\u2295", "⊕", // circled plus = direct sum,U+2295 ISOamsb -->
"\u2297", "⊗", // circled times = vector product,U+2297 ISOamsb -->
"\u22A5", "⊥", // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
"\u22C5", "⋅", // dot operator, U+22C5 ISOamsb -->
//
//
"\u2308", "⌈", // left ceiling = apl upstile,U+2308 ISOamsc -->
"\u2309", "⌉", // right ceiling, U+2309 ISOamsc -->
"\u230A", "⌊", // left floor = apl downstile,U+230A ISOamsc -->
"\u230B", "⌋", // right floor, U+230B ISOamsc -->
"\u2329", "〈", // left-pointing angle bracket = bra,U+2329 ISOtech -->
//
"\u232A", "〉", // right-pointing angle bracket = ket,U+232A ISOtech -->
//
//
"\u25CA", "◊", // lozenge, U+25CA ISOpub -->
//
"\u2660", "♠", // black spade suit, U+2660 ISOpub -->
//
"\u2663", "♣", // black club suit = shamrock,U+2663 ISOpub -->
"\u2665", "♥", // black heart suit = valentine,U+2665 ISOpub -->
"\u2666", "♦", // black diamond suit, U+2666 ISOpub -->
//
"\u0152", "Œ", // -- latin capital ligature OE,U+0152 ISOlat2 -->
"\u0153", "œ", // -- latin small ligature oe, U+0153 ISOlat2 -->
//
"\u0160", "Š", // -- latin capital letter S with caron,U+0160 ISOlat2 -->
"\u0161", "š", // -- latin small letter s with caron,U+0161 ISOlat2 -->
"\u0178", "Ÿ", // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
//
"\u02C6", "ˆ", // -- modifier letter circumflex accent,U+02C6 ISOpub -->
"\u02DC", "˜", // small tilde, U+02DC ISOdia -->
//
"\u2002", " ", // en space, U+2002 ISOpub -->
"\u2003", " ", // em space, U+2003 ISOpub -->
"\u2009", " ", // thin space, U+2009 ISOpub -->
"\u200C", "", // zero width non-joiner,U+200C NEW RFC 2070 -->
"\u200D", "", // zero width joiner, U+200D NEW RFC 2070 -->
"\u200E", "", // left-to-right mark, U+200E NEW RFC 2070 -->
"\u200F", "", // right-to-left mark, U+200F NEW RFC 2070 -->
"\u2013", "–", // en dash, U+2013 ISOpub -->
"\u2014", "—", // em dash, U+2014 ISOpub -->
"\u2018", "‘", // left single quotation mark,U+2018 ISOnum -->
"\u2019", "’", // right single quotation mark,U+2019 ISOnum -->
"\u201A", "‚", // single low-9 quotation mark, U+201A NEW -->
"\u201C", "“", // left double quotation mark,U+201C ISOnum -->
"\u201D", "”", // right double quotation mark,U+201D ISOnum -->
"\u201E", "„", // double low-9 quotation mark, U+201E NEW -->
"\u2020", "†", // dagger, U+2020 ISOpub -->
"\u2021", "‡", // double dagger, U+2021 ISOpub -->
"\u2030", "‰", // per mille sign, U+2030 ISOtech -->
"\u2039", "‹", // single left-pointing angle quotation mark,U+2039 ISO
// proposed -->
//
"\u203A", "›", // single right-pointing angle quotation mark,U+203A ISO
// proposed -->
//
"\u20AC", "€" // -- euro sign, U+20AC NEW -->
);
/**
* Reverse of {@link #HTML40_EXTENDED_ESCAPE} for unescaping purposes.
*/
public static final Map HTML40_EXTENDED_UNESCAPE = invert(HTML40_EXTENDED_ESCAPE);
/**
* HTML4 escape map
*/
public static final Map HTML4_ESCAPE = merge(HTML3_ESCAPE, HTML40_EXTENDED_ESCAPE);
/**
* Reverse of {@link #HTML4_ESCAPE} for unescaping purposes.
*/
public static final Map HTML4_UNESCAPE = invert(HTML4_ESCAPE);
/**
* Used to invert an escape array into an unescape array
*
* @param src source map
* @return inverted map
*/
private static Map invert(final Map src) {
Map des = new HashMap();
Collections.invertAddAll(des, src);
return des;
}
/**
* merge map
*
* @param src source maps
* @return merged map
*/
private static Map merge(final Map ... src) {
Map des = new HashMap();
Collections.addAll(des, src);
return des;
}
}