All downloads are free. Search and download functionality uses the official Maven repository.

panda.doc.html.HTMLEntities Maven / Gradle / Ivy

Go to download

Panda Core is the core module of the Panda Framework; it contains commonly used utility classes similar to Apache Commons.

There is a newer version: 1.8.0
Show newest version
package panda.doc.html;

import java.util.HashMap;
import java.util.Map;

import panda.lang.Arrays;
import panda.lang.Collections;

/**
 * HTML entities
 */
@SuppressWarnings("unchecked")
public class HTMLEntities {
	public static final String NBSP = " ";
	public static final String APOS = "'";
	public static final String QUOT = """;
	public static final String AMP = "&";
	public static final String LT = "<";
	public static final String GT = ">";
	
	/**
	 * Mapping to escape the basic XML and HTML character entities. Namely: {@code " & < >}
	 */
	public static final Map BASIC_ESCAPE = Arrays.toMap(
			"\"", QUOT,	// " - double-quote
			"&", AMP,	// & - ampersand
			"<", LT,	// < - less-than
			">", GT		// > - greater-than
		);

	/**
	 * Reverse of {@link #BASIC_ESCAPE} for unescaping purposes.
	 */
	public static final Map BASIC_UNESCAPE = invert(BASIC_ESCAPE);

	/**
	 * Mapping to escape the apostrophe character to its XML character entity.
	 */
	public static final Map APOS_ESCAPE = Arrays.toMap("'", APOS);

	/**
	 * Reverse of {@link #APOS_ESCAPE} for unescaping purposes.
	 */
	public static final Map APOS_UNESCAPE = invert(APOS_ESCAPE);

	/**
	 * XML escape map
	 */
	public static final Map XML_ESCAPE = merge(BASIC_ESCAPE, APOS_ESCAPE);

	/**
	 * Reverse of {@link #XML_ESCAPE} for unescaping purposes.
	 */
	public static final Map XML_UNESCAPE = invert(XML_ESCAPE);

	/**
	 * Mapping to escape ISO-8859-1
	 * characters to their named HTML 3.x equivalents.
	 */
	public static final Map ISO8859_1_ESCAPE = Arrays.toMap(
			"\u00A0", NBSP, // non-breaking space
			"\u00A1", "¡", // inverted exclamation mark
			"\u00A2", "¢", // cent sign
			"\u00A3", "£", // pound sign
			"\u00A4", "¤", // currency sign
			"\u00A5", "¥", // yen sign = yuan sign
			"\u00A6", "¦", // broken bar = broken vertical bar
			"\u00A7", "§", // section sign
			"\u00A8", "¨", // diaeresis = spacing diaeresis
			"\u00A9", "©", // © - copyright sign
			"\u00AA", "ª", // feminine ordinal indicator
			"\u00AB", "«", // left-pointing double angle quotation mark = left pointing guillemet
			"\u00AC", "¬", // not sign
			"\u00AD", "­", // soft hyphen = discretionary hyphen
			"\u00AE", "®", // ® - registered trademark sign
			"\u00AF", "¯", // macron = spacing macron = overline = APL overbar
			"\u00B0", "°", // degree sign
			"\u00B1", "±", // plus-minus sign = plus-or-minus sign
			"\u00B2", "²", // superscript two = superscript digit two = squared
			"\u00B3", "³", // superscript three = superscript digit three = cubed
			"\u00B4", "´", // acute accent = spacing acute
			"\u00B5", "µ", // micro sign
			"\u00B6", "¶", // pilcrow sign = paragraph sign
			"\u00B7", "·", // middle dot = Georgian comma = Greek middle dot
			"\u00B8", "¸", // cedilla = spacing cedilla
			"\u00B9", "¹", // superscript one = superscript digit one
			"\u00BA", "º", // masculine ordinal indicator
			"\u00BB", "»", // right-pointing double angle quotation mark = right pointing
										// guillemet
			"\u00BC", "¼", // vulgar fraction one quarter = fraction one quarter
			"\u00BD", "½", // vulgar fraction one half = fraction one half
			"\u00BE", "¾", // vulgar fraction three quarters = fraction three quarters
			"\u00BF", "¿", // inverted question mark = turned question mark
			"\u00C0", "À", // À - uppercase A, grave accent
			"\u00C1", "Á", // Á - uppercase A, acute accent
			"\u00C2", "Â", // Â - uppercase A, circumflex accent
			"\u00C3", "Ã", // Ã - uppercase A, tilde
			"\u00C4", "Ä", // Ä - uppercase A, umlaut
			"\u00C5", "Å", // Å - uppercase A, ring
			"\u00C6", "Æ", // Æ - uppercase AE
			"\u00C7", "Ç", // Ç - uppercase C, cedilla
			"\u00C8", "È", // È - uppercase E, grave accent
			"\u00C9", "É", // É - uppercase E, acute accent
			"\u00CA", "Ê", // Ê - uppercase E, circumflex accent
			"\u00CB", "Ë", // Ë - uppercase E, umlaut
			"\u00CC", "Ì", // Ì - uppercase I, grave accent
			"\u00CD", "Í", // Í - uppercase I, acute accent
			"\u00CE", "Î", // Î - uppercase I, circumflex accent
			"\u00CF", "Ï", // Ï - uppercase I, umlaut
			"\u00D0", "Ð", // Ð - uppercase Eth, Icelandic
			"\u00D1", "Ñ", // Ñ - uppercase N, tilde
			"\u00D2", "Ò", // Ò - uppercase O, grave accent
			"\u00D3", "Ó", // Ó - uppercase O, acute accent
			"\u00D4", "Ô", // Ô - uppercase O, circumflex accent
			"\u00D5", "Õ", // Õ - uppercase O, tilde
			"\u00D6", "Ö", // Ö - uppercase O, umlaut
			"\u00D7", "×", // multiplication sign
			"\u00D8", "Ø", // Ø - uppercase O, slash
			"\u00D9", "Ù", // Ù - uppercase U, grave accent
			"\u00DA", "Ú", // Ú - uppercase U, acute accent
			"\u00DB", "Û", // Û - uppercase U, circumflex accent
			"\u00DC", "Ü", // Ü - uppercase U, umlaut
			"\u00DD", "Ý", // Ý - uppercase Y, acute accent
			"\u00DE", "Þ", // Þ - uppercase THORN, Icelandic
			"\u00DF", "ß", // ß - lowercase sharps, German
			"\u00E0", "à", // à - lowercase a, grave accent
			"\u00E1", "á", // á - lowercase a, acute accent
			"\u00E2", "â", // â - lowercase a, circumflex accent
			"\u00E3", "ã", // ã - lowercase a, tilde
			"\u00E4", "ä", // ä - lowercase a, umlaut
			"\u00E5", "å", // å - lowercase a, ring
			"\u00E6", "æ", // æ - lowercase ae
			"\u00E7", "ç", // ç - lowercase c, cedilla
			"\u00E8", "è", // è - lowercase e, grave accent
			"\u00E9", "é", // é - lowercase e, acute accent
			"\u00EA", "ê", // ê - lowercase e, circumflex accent
			"\u00EB", "ë", // ë - lowercase e, umlaut
			"\u00EC", "ì", // ì - lowercase i, grave accent
			"\u00ED", "í", // í - lowercase i, acute accent
			"\u00EE", "î", // î - lowercase i, circumflex accent
			"\u00EF", "ï", // ï - lowercase i, umlaut
			"\u00F0", "ð", // ð - lowercase eth, Icelandic
			"\u00F1", "ñ", // ñ - lowercase n, tilde
			"\u00F2", "ò", // ò - lowercase o, grave accent
			"\u00F3", "ó", // ó - lowercase o, acute accent
			"\u00F4", "ô", // ô - lowercase o, circumflex accent
			"\u00F5", "õ", // õ - lowercase o, tilde
			"\u00F6", "ö", // ö - lowercase o, umlaut
			"\u00F7", "÷", // division sign
			"\u00F8", "ø", // ø - lowercase o, slash
			"\u00F9", "ù", // ù - lowercase u, grave accent
			"\u00FA", "ú", // ú - lowercase u, acute accent
			"\u00FB", "û", // û - lowercase u, circumflex accent
			"\u00FC", "ü", // ü - lowercase u, umlaut
			"\u00FD", "ý", // ý - lowercase y, acute accent
			"\u00FE", "þ", // þ - lowercase thorn, Icelandic
			"\u00FF", "ÿ" // ÿ - lowercase y, umlaut
	);

	/**
	 * Reverse of {@link #ISO8859_1_ESCAPE} for unescaping purposes.
	 */
	public static final Map ISO8859_1_UNESCAPE = invert(ISO8859_1_ESCAPE);

	/**
	 * Mapping to escape ISO-8859-1
	 * characters to their named HTML 3.x equivalents.
	 */
	public static final Map HTML3_ESCAPE = merge(XML_ESCAPE, ISO8859_1_ESCAPE);

	/**
	 * Reverse of {@link #HTML3_ESCAPE} for unescaping purposes.
	 */
	public static final Map HTML3_UNESCAPE = invert(HTML3_ESCAPE);

	/**
	 * Mapping to escape additional character entity references.
	 * Note that this must be used with {@link #ISO8859_1_ESCAPE} to get the full list of HTML 4.0
	 * character entities.
	 */
	public static final Map HTML40_EXTENDED_ESCAPE = Arrays.toMap(
			// 
			"\u0192", "ƒ", // latin small f with hook = function= florin, U+0192 ISOtech
			// -->
			// 
			"\u0391", "Α", // greek capital letter alpha, U+0391 -->
			"\u0392", "Β", // greek capital letter beta, U+0392 -->
			"\u0393", "Γ", // greek capital letter gamma,U+0393 ISOgrk3 -->
			"\u0394", "Δ", // greek capital letter delta,U+0394 ISOgrk3 -->
			"\u0395", "Ε", // greek capital letter epsilon, U+0395 -->
			"\u0396", "Ζ", // greek capital letter zeta, U+0396 -->
			"\u0397", "Η", // greek capital letter eta, U+0397 -->
			"\u0398", "Θ", // greek capital letter theta,U+0398 ISOgrk3 -->
			"\u0399", "Ι", // greek capital letter iota, U+0399 -->
			"\u039A", "Κ", // greek capital letter kappa, U+039A -->
			"\u039B", "Λ", // greek capital letter lambda,U+039B ISOgrk3 -->
			"\u039C", "Μ", // greek capital letter mu, U+039C -->
			"\u039D", "Ν", // greek capital letter nu, U+039D -->
			"\u039E", "Ξ", // greek capital letter xi, U+039E ISOgrk3 -->
			"\u039F", "Ο", // greek capital letter omicron, U+039F -->
			"\u03A0", "Π", // greek capital letter pi, U+03A0 ISOgrk3 -->
			"\u03A1", "Ρ", // greek capital letter rho, U+03A1 -->
			// 
			"\u03A3", "Σ", // greek capital letter sigma,U+03A3 ISOgrk3 -->
			"\u03A4", "Τ", // greek capital letter tau, U+03A4 -->
			"\u03A5", "Υ", // greek capital letter upsilon,U+03A5 ISOgrk3 -->
			"\u03A6", "Φ", // greek capital letter phi,U+03A6 ISOgrk3 -->
			"\u03A7", "Χ", // greek capital letter chi, U+03A7 -->
			"\u03A8", "Ψ", // greek capital letter psi,U+03A8 ISOgrk3 -->
			"\u03A9", "Ω", // greek capital letter omega,U+03A9 ISOgrk3 -->
			"\u03B1", "α", // greek small letter alpha,U+03B1 ISOgrk3 -->
			"\u03B2", "β", // greek small letter beta, U+03B2 ISOgrk3 -->
			"\u03B3", "γ", // greek small letter gamma,U+03B3 ISOgrk3 -->
			"\u03B4", "δ", // greek small letter delta,U+03B4 ISOgrk3 -->
			"\u03B5", "ε", // greek small letter epsilon,U+03B5 ISOgrk3 -->
			"\u03B6", "ζ", // greek small letter zeta, U+03B6 ISOgrk3 -->
			"\u03B7", "η", // greek small letter eta, U+03B7 ISOgrk3 -->
			"\u03B8", "θ", // greek small letter theta,U+03B8 ISOgrk3 -->
			"\u03B9", "ι", // greek small letter iota, U+03B9 ISOgrk3 -->
			"\u03BA", "κ", // greek small letter kappa,U+03BA ISOgrk3 -->
			"\u03BB", "λ", // greek small letter lambda,U+03BB ISOgrk3 -->
			"\u03BC", "μ", // greek small letter mu, U+03BC ISOgrk3 -->
			"\u03BD", "ν", // greek small letter nu, U+03BD ISOgrk3 -->
			"\u03BE", "ξ", // greek small letter xi, U+03BE ISOgrk3 -->
			"\u03BF", "ο", // greek small letter omicron, U+03BF NEW -->
			"\u03C0", "π", // greek small letter pi, U+03C0 ISOgrk3 -->
			"\u03C1", "ρ", // greek small letter rho, U+03C1 ISOgrk3 -->
			"\u03C2", "ς", // greek small letter final sigma,U+03C2 ISOgrk3 -->
			"\u03C3", "σ", // greek small letter sigma,U+03C3 ISOgrk3 -->
			"\u03C4", "τ", // greek small letter tau, U+03C4 ISOgrk3 -->
			"\u03C5", "υ", // greek small letter upsilon,U+03C5 ISOgrk3 -->
			"\u03C6", "φ", // greek small letter phi, U+03C6 ISOgrk3 -->
			"\u03C7", "χ", // greek small letter chi, U+03C7 ISOgrk3 -->
			"\u03C8", "ψ", // greek small letter psi, U+03C8 ISOgrk3 -->
			"\u03C9", "ω", // greek small letter omega,U+03C9 ISOgrk3 -->
			"\u03D1", "ϑ", // greek small letter theta symbol,U+03D1 NEW -->
			"\u03D2", "ϒ", // greek upsilon with hook symbol,U+03D2 NEW -->
			"\u03D6", "ϖ", // greek pi symbol, U+03D6 ISOgrk3 -->
			// 
			"\u2022", "•", // bullet = black small circle,U+2022 ISOpub -->
			// 
			"\u2026", "…", // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
			"\u2032", "′", // prime = minutes = feet, U+2032 ISOtech -->
			"\u2033", "″", // double prime = seconds = inches,U+2033 ISOtech -->
			"\u203E", "‾", // overline = spacing overscore,U+203E NEW -->
			"\u2044", "⁄", // fraction slash, U+2044 NEW -->
			// 
			"\u2118", "℘", // script capital P = power set= Weierstrass p, U+2118 ISOamso
						// -->
			"\u2111", "ℑ", // blackletter capital I = imaginary part,U+2111 ISOamso -->
			"\u211C", "ℜ", // blackletter capital R = real part symbol,U+211C ISOamso -->
			"\u2122", "™", // trade mark sign, U+2122 ISOnum -->
			"\u2135", "ℵ", // alef symbol = first transfinite cardinal,U+2135 NEW -->
			// 
			// 
			"\u2190", "←", // leftwards arrow, U+2190 ISOnum -->
			"\u2191", "↑", // upwards arrow, U+2191 ISOnum-->
			"\u2192", "→", // rightwards arrow, U+2192 ISOnum -->
			"\u2193", "↓", // downwards arrow, U+2193 ISOnum -->
			"\u2194", "↔", // left right arrow, U+2194 ISOamsa -->
			"\u21B5", "↵", // downwards arrow with corner leftwards= carriage return,
						// U+21B5 NEW -->
			"\u21D0", "⇐", // leftwards double arrow, U+21D0 ISOtech -->
			// 
			"\u21D1", "⇑", // upwards double arrow, U+21D1 ISOamsa -->
			"\u21D2", "⇒", // rightwards double arrow,U+21D2 ISOtech -->
			// 
			"\u21D3", "⇓", // downwards double arrow, U+21D3 ISOamsa -->
			"\u21D4", "⇔", // left right double arrow,U+21D4 ISOamsa -->
			// 
			"\u2200", "∀", // for all, U+2200 ISOtech -->
			"\u2202", "∂", // partial differential, U+2202 ISOtech -->
			"\u2203", "∃", // there exists, U+2203 ISOtech -->
			"\u2205", "∅", // empty set = null set = diameter,U+2205 ISOamso -->
			"\u2207", "∇", // nabla = backward difference,U+2207 ISOtech -->
			"\u2208", "∈", // element of, U+2208 ISOtech -->
			"\u2209", "∉", // not an element of, U+2209 ISOtech -->
			"\u220B", "∋", // contains as member, U+220B ISOtech -->
			// 
			"\u220F", "∏", // n-ary product = product sign,U+220F ISOamsb -->
			// 
			"\u2211", "∑", // n-ary summation, U+2211 ISOamsb -->
			// 
			"\u2212", "−", // minus sign, U+2212 ISOtech -->
			"\u2217", "∗", // asterisk operator, U+2217 ISOtech -->
			"\u221A", "√", // square root = radical sign,U+221A ISOtech -->
			"\u221D", "∝", // proportional to, U+221D ISOtech -->
			"\u221E", "∞", // infinity, U+221E ISOtech -->
			"\u2220", "∠", // angle, U+2220 ISOamso -->
			"\u2227", "∧", // logical and = wedge, U+2227 ISOtech -->
			"\u2228", "∨", // logical or = vee, U+2228 ISOtech -->
			"\u2229", "∩", // intersection = cap, U+2229 ISOtech -->
			"\u222A", "∪", // union = cup, U+222A ISOtech -->
			"\u222B", "∫", // integral, U+222B ISOtech -->
			"\u2234", "∴", // therefore, U+2234 ISOtech -->
			"\u223C", "∼", // tilde operator = varies with = similar to,U+223C ISOtech -->
			// 
			"\u2245", "≅", // approximately equal to, U+2245 ISOtech -->
			"\u2248", "≈", // almost equal to = asymptotic to,U+2248 ISOamsr -->
			"\u2260", "≠", // not equal to, U+2260 ISOtech -->
			"\u2261", "≡", // identical to, U+2261 ISOtech -->
			"\u2264", "≤", // less-than or equal to, U+2264 ISOtech -->
			"\u2265", "≥", // greater-than or equal to,U+2265 ISOtech -->
			"\u2282", "⊂", // subset of, U+2282 ISOtech -->
			"\u2283", "⊃", // superset of, U+2283 ISOtech -->
			//  
			"\u2284", "⊅", // not a subset of, U+2284 ISOamsn -->
			"\u2286", "⊆", // subset of or equal to, U+2286 ISOtech -->
			"\u2287", "⊇", // superset of or equal to,U+2287 ISOtech -->
			"\u2295", "⊕", // circled plus = direct sum,U+2295 ISOamsb -->
			"\u2297", "⊗", // circled times = vector product,U+2297 ISOamsb -->
			"\u22A5", "⊥", // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
			"\u22C5", "⋅", // dot operator, U+22C5 ISOamsb -->
			// 
			// 
			"\u2308", "⌈", // left ceiling = apl upstile,U+2308 ISOamsc -->
			"\u2309", "⌉", // right ceiling, U+2309 ISOamsc -->
			"\u230A", "⌊", // left floor = apl downstile,U+230A ISOamsc -->
			"\u230B", "⌋", // right floor, U+230B ISOamsc -->
			"\u2329", "⟨", // left-pointing angle bracket = bra,U+2329 ISOtech -->
			// 
			"\u232A", "⟩", // right-pointing angle bracket = ket,U+232A ISOtech -->
			// 
			// 
			"\u25CA", "◊", // lozenge, U+25CA ISOpub -->
			// 
			"\u2660", "♠", // black spade suit, U+2660 ISOpub -->
			// 
			"\u2663", "♣", // black club suit = shamrock,U+2663 ISOpub -->
			"\u2665", "♥", // black heart suit = valentine,U+2665 ISOpub -->
			"\u2666", "♦", // black diamond suit, U+2666 ISOpub -->
			
			// 
			"\u0152", "Œ", // -- latin capital ligature OE,U+0152 ISOlat2 -->
			"\u0153", "œ", // -- latin small ligature oe, U+0153 ISOlat2 -->
			// 
			"\u0160", "Š", // -- latin capital letter S with caron,U+0160 ISOlat2 -->
			"\u0161", "š", // -- latin small letter s with caron,U+0161 ISOlat2 -->
			"\u0178", "Ÿ", // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
			// 
			"\u02C6", "ˆ", // -- modifier letter circumflex accent,U+02C6 ISOpub -->
			"\u02DC", "˜", // small tilde, U+02DC ISOdia -->
			// 
			"\u2002", " ", // en space, U+2002 ISOpub -->
			"\u2003", " ", // em space, U+2003 ISOpub -->
			"\u2009", " ", // thin space, U+2009 ISOpub -->
			"\u200C", "‌", // zero width non-joiner,U+200C NEW RFC 2070 -->
			"\u200D", "‍", // zero width joiner, U+200D NEW RFC 2070 -->
			"\u200E", "‎", // left-to-right mark, U+200E NEW RFC 2070 -->
			"\u200F", "‏", // right-to-left mark, U+200F NEW RFC 2070 -->
			"\u2013", "–", // en dash, U+2013 ISOpub -->
			"\u2014", "—", // em dash, U+2014 ISOpub -->
			"\u2018", "‘", // left single quotation mark,U+2018 ISOnum -->
			"\u2019", "’", // right single quotation mark,U+2019 ISOnum -->
			"\u201A", "‚", // single low-9 quotation mark, U+201A NEW -->
			"\u201C", "“", // left double quotation mark,U+201C ISOnum -->
			"\u201D", "”", // right double quotation mark,U+201D ISOnum -->
			"\u201E", "„", // double low-9 quotation mark, U+201E NEW -->
			"\u2020", "†", // dagger, U+2020 ISOpub -->
			"\u2021", "‡", // double dagger, U+2021 ISOpub -->
			"\u2030", "‰", // per mille sign, U+2030 ISOtech -->
			"\u2039", "‹", // single left-pointing angle quotation mark,U+2039 ISO
						// proposed -->
			// 
			"\u203A", "›", // single right-pointing angle quotation mark,U+203A ISO
						// proposed -->
			// 
			"\u20AC", "€" // -- euro sign, U+20AC NEW -->
		);

	/**
	 * Reverse of {@link #HTML40_EXTENDED_ESCAPE} for unescaping purposes.
	 */
	public static final Map HTML40_EXTENDED_UNESCAPE = invert(HTML40_EXTENDED_ESCAPE);

	/**
	 * HTML4 escape map
	 */
	public static final Map HTML4_ESCAPE = merge(HTML3_ESCAPE, HTML40_EXTENDED_ESCAPE);

	/**
	 * Reverse of {@link #HTML4_ESCAPE} for unescaping purposes.
	 */
	public static final Map HTML4_UNESCAPE = invert(HTML4_ESCAPE);

	/**
	 * Used to invert an escape array into an unescape array
	 * 
	 * @param src source map
	 * @return inverted map
	 */
	private static Map invert(final Map src) {
		Map des = new HashMap();
		Collections.invertAddAll(des, src);
		return des;
	}

	/**
	 * merge map
	 * 
	 * @param src source maps
	 * @return merged map
	 */
	private static Map merge(final Map ... src) {
		Map des = new HashMap();
		Collections.addAll(des, src);
		return des;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy