// com.greenpepper.shaded.org.jsoup.nodes.Entities (Maven / Gradle / Ivy artifact listing)
package com.greenpepper.shaded.org.jsoup.nodes;
import com.greenpepper.shaded.org.jsoup.helper.StringUtil;
import com.greenpepper.shaded.org.jsoup.parser.Parser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.CharsetEncoder;
import java.util.*;
/**
* HTML entities, and escape routines.
* Source: W3C HTML
* named character references.
*/
public class Entities {
public enum EscapeMode {
/** Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. */
xhtml(xhtmlByVal),
/** Default HTML output entities. */
base(baseByVal),
/** Complete HTML entities. */
extended(fullByVal);
private Map map;
EscapeMode(Map map) {
this.map = map;
}
public Map getMap() {
return map;
}
}
private static final Map full;
private static final Map xhtmlByVal;
private static final Map base;
private static final Map baseByVal;
private static final Map fullByVal;
private Entities() {}
/**
* Check if the input is a known named entity
* @param name the possible entity name (e.g. "lt" or "amp")
* @return true if a known named entity
*/
public static boolean isNamedEntity(String name) {
return full.containsKey(name);
}
/**
* Check if the input is a known named entity in the base entity set.
* @param name the possible entity name (e.g. "lt" or "amp")
* @return true if a known named entity in the base set
* @see #isNamedEntity(String)
*/
public static boolean isBaseNamedEntity(String name) {
return base.containsKey(name);
}
/**
* Get the Character value of the named entity
* @param name named entity (e.g. "lt" or "amp")
* @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}')
*/
public static Character getCharacterByName(String name) {
return full.get(name);
}
static String escape(String string, Document.OutputSettings out) {
StringBuilder accum = new StringBuilder(string.length() * 2);
escape(accum, string, out, false, false, false);
return accum.toString();
}
// this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
static void escape(StringBuilder accum, String string, Document.OutputSettings out,
boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) {
boolean lastWasWhite = false;
boolean reachedNonWhite = false;
final EscapeMode escapeMode = out.escapeMode();
final CharsetEncoder encoder = out.encoder();
final CoreCharset coreCharset = CoreCharset.byName(encoder.charset().name());
final Map map = escapeMode.getMap();
final int length = string.length();
int codePoint;
for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
codePoint = string.codePointAt(offset);
if (normaliseWhite) {
if (StringUtil.isWhitespace(codePoint)) {
if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
continue;
accum.append(' ');
lastWasWhite = true;
continue;
} else {
lastWasWhite = false;
reachedNonWhite = true;
}
}
// surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
final char c = (char) codePoint;
// html specific and required escapes:
switch (c) {
case '&':
accum.append("&");
break;
case 0xA0:
if (escapeMode != EscapeMode.xhtml)
accum.append(" ");
else
accum.append(" ");
break;
case '<':
// escape when in character data or when in a xml attribue val; not needed in html attr val
if (!inAttribute || escapeMode == EscapeMode.xhtml)
accum.append("<");
else
accum.append(c);
break;
case '>':
if (!inAttribute)
accum.append(">");
else
accum.append(c);
break;
case '"':
if (inAttribute)
accum.append(""");
else
accum.append(c);
break;
default:
if (canEncode(coreCharset, c, encoder))
accum.append(c);
else if (map.containsKey(c))
accum.append('&').append(map.get(c)).append(';');
else
accum.append("").append(Integer.toHexString(codePoint)).append(';');
}
} else {
final String c = new String(Character.toChars(codePoint));
if (encoder.canEncode(c)) // uses fallback encoder for simplicity
accum.append(c);
else
accum.append("").append(Integer.toHexString(codePoint)).append(';');
}
}
}
static String unescape(String string) {
return unescape(string, false);
}
/**
* Unescape the input string.
* @param string to un-HTML-escape
* @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
* @return unescaped string
*/
static String unescape(String string, boolean strict) {
return Parser.unescapeEntities(string, strict);
}
/*
* Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
* After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
* performance may be bad. We can add more encoders for common character sets that are impacted by performance
* issues on Android if required.
*
* Benchmarks: *
* OLD toHtml() impl v New (fastpath) in millis
* Wiki: 1895, 16
* CNN: 6378, 55
* Alterslash: 3013, 28
* Jsoup: 167, 2
*/
private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
// todo add more charset tests if impacted by Android's bad perf in canEncode
switch (charset) {
case ascii:
return c < 0x80;
case utf:
return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
default:
return fallback.canEncode(c);
}
}
private enum CoreCharset {
ascii, utf, fallback;
private static CoreCharset byName(String name) {
if (name.equals("US-ASCII"))
return ascii;
if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
return utf;
return fallback;
}
}
// xhtml has restricted entities
private static final Object[][] xhtmlArray = {
{"quot", 0x00022},
{"amp", 0x00026},
{"lt", 0x0003C},
{"gt", 0x0003E}
};
static {
xhtmlByVal = new HashMap();
base = loadEntities("entities-base.properties"); // most common / default
baseByVal = toCharacterKey(base);
full = loadEntities("entities-full.properties"); // extended and overblown.
fullByVal = toCharacterKey(full);
for (Object[] entity : xhtmlArray) {
Character c = Character.valueOf((char) ((Integer) entity[1]).intValue());
xhtmlByVal.put(c, ((String) entity[0]));
}
}
private static Map loadEntities(String filename) {
Properties properties = new Properties();
Map entities = new HashMap();
try {
InputStream in = Entities.class.getResourceAsStream(filename);
properties.load(in);
in.close();
} catch (IOException e) {
throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename);
}
for (Map.Entry entry: properties.entrySet()) {
Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16));
String name = (String) entry.getKey();
entities.put(name, val);
}
return entities;
}
private static Map toCharacterKey(Map inMap) {
Map outMap = new HashMap();
for (Map.Entry entry: inMap.entrySet()) {
Character character = entry.getValue();
String name = entry.getKey();
if (outMap.containsKey(character)) {
// dupe, prefer the lower case version
if (name.toLowerCase().equals(name))
outMap.put(character, name);
} else {
outMap.put(character, name);
}
}
return outMap;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy