org.jsoup.nodes.Entities Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of html2pdf Show documentation
Show all versions of html2pdf Show documentation
pdfHTML is an iText add-on that lets you parse (X)HTML snippets and the associated CSS and convert
them to PDF.
package org.jsoup.nodes;
import org.jsoup.SerializationException;
import org.jsoup.helper.StringUtil;
import org.jsoup.parser.Parser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.CharsetEncoder;
import java.util.*;
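/**
 * HTML entities, and escape routines.
 * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML
 * named character references</a>.
 */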
public class Entities {
/**
 * Selects which characters are written as named entities when escaping. Each mode pairs a
 * character-to-entity-name map with a lookup name; the three shared instances below are
 * registered under that name for {@link #valueOf(String)}.
 */
public static class EscapeMode {
    /** Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. */
    public static EscapeMode xhtml = new EscapeMode(xhtmlByVal, "xhtml");
    /** Default HTML output entities. */
    public static EscapeMode base = new EscapeMode(baseByVal, "base");
    /** Complete HTML entities. */
    public static EscapeMode extended = new EscapeMode(fullByVal, "extended");

    // Declared after the three modes and filled in the static block: static initialisers run in
    // textual order, so the modes already exist when they are registered.
    private static final Map<String, EscapeMode> nameValueMap = new HashMap<String, EscapeMode>();

    static {
        nameValueMap.put(xhtml.name, xhtml);
        nameValueMap.put(base.name, base);
        nameValueMap.put(extended.name, extended);
    }

    private final Map<Character, String> map;
    private final String name;

    private EscapeMode(Map<Character, String> map, String name) {
        this.map = map;
        this.name = name;
    }

    /**
     * Looks up a mode by its registered name.
     * @param name one of "xhtml", "base" or "extended"
     * @return the matching mode, or {@code null} if no mode is registered under that name
     */
    public static EscapeMode valueOf(String name) {
        return nameValueMap.get(name);
    }

    /**
     * @return the character-to-entity-name map backing this mode (the internal map itself, not a copy)
     */
    public Map<Character, String> getMap() {
        return map;
    }

    /**
     * @return the name this mode is registered under
     */
    public String name() {
        return name;
    }
}
/** Entity name to character, complete entity set. */
private static final Map<String, Character> full;
/** Character to entity name, restricted XHTML set. */
private static final Map<Character, String> xhtmlByVal;
/** Entity name to character, base entity set. */
private static final Map<String, Character> base;
/** Character to entity name, base entity set. */
private static final Map<Character, String> baseByVal;
/** Character to entity name, complete entity set. */
private static final Map<Character, String> fullByVal;

/** Private constructor: the class is used through its static members only. */
private Entities() {}
/**
 * Tests whether a name belongs to the complete set of named entities.
 * @param name the candidate entity name (e.g. "lt" or "amp")
 * @return true if the complete entity table contains the name
 */
public static boolean isNamedEntity(String name) {
    final boolean known = full.containsKey(name);
    return known;
}
/**
 * Tests whether a name belongs to the base set of named entities.
 * @param name the candidate entity name (e.g. "lt" or "amp")
 * @return true if the base entity table contains the name
 * @see #isNamedEntity(String)
 */
public static boolean isBaseNamedEntity(String name) {
    final boolean knownInBase = base.containsKey(name);
    return knownInBase;
}
/**
 * Get the Character value of the named entity, looked up in the complete entity set.
 * @param name named entity (e.g. "lt" or "amp")
 * @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}'),
 *         or {@code null} if the name is not a known entity
 */
public static Character getCharacterByName(String name) {
    return (Character) full.get(name);
}
/**
 * Escapes a string as character data (not an attribute value), without whitespace normalisation.
 * @param string the text to escape
 * @param out output settings passed through to the appending escape routine
 * @return the escaped text
 */
static String escape(String string, Document.OutputSettings out) {
    final StringBuilder buffer = new StringBuilder(string.length() * 2);
    try {
        escape(buffer, string, out, false, false, false);
        return buffer.toString();
    } catch (IOException e) {
        // doesn't happen: the target is an in-memory StringBuilder
        throw new SerializationException(e);
    }
}
// this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
static void escape(Appendable accum, String str, Document.OutputSettings outputSettings,
boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
boolean lastWasWhite = false;
boolean reachedNonWhite = false;
final EscapeMode escapeMode = outputSettings.escapeMode();
final CharsetEncoder encoder = outputSettings.encoder();
final CoreCharset coreCharset = getCoreCharsetByName(outputSettings.charset().name());
final Map map = escapeMode.getMap();
final int length = str.length();
int codePoint;
for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
codePoint = str.codePointAt(offset);
if (normaliseWhite) {
if (StringUtil.isWhitespace(codePoint)) {
if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
continue;
accum.append(' ');
lastWasWhite = true;
continue;
} else {
lastWasWhite = false;
reachedNonWhite = true;
}
}
// surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
final char c = (char) codePoint;
// html specific and required escapes:
switch (c) {
case '&':
accum.append("&");
break;
case (char) 0xA0:
if (escapeMode != EscapeMode.xhtml)
accum.append(" ");
else
accum.append(" ");
break;
case '<':
// escape when in character data or when in a xml attribue val; not needed in html attr val
if (!inAttribute || escapeMode == EscapeMode.xhtml)
accum.append("<");
else
accum.append(c);
break;
case '>':
if (!inAttribute)
accum.append(">");
else
accum.append(c);
break;
case '"':
if (inAttribute)
accum.append(""");
else
accum.append(c);
break;
default:
if (canEncode(coreCharset, c, encoder))
accum.append(c);
else if (map.containsKey(c))
accum.append('&').append(map.get(c)).append(';');
else
accum.append("").append(Integer.toHexString(codePoint)).append(';');
}
} else {
final String c = new String(Character.toChars(codePoint));
if (encoder.canEncode(c)) // uses fallback encoder for simplicity
accum.append(c);
else
accum.append("").append(Integer.toHexString(codePoint)).append(';');
}
}
}
/**
 * Unescape the input string in non-strict mode.
 * @param string to un-HTML-escape
 * @return unescaped string
 */
static String unescape(String string) {
    final boolean strict = false;
    return unescape(string, strict);
}
/**
 * Unescape the input string by delegating to {@code Parser.unescapeEntities}.
 * @param string to un-HTML-escape
 * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
 * @return unescaped string
 */
static String unescape(String string, boolean strict) {
    final String unescaped = Parser.unescapeEntities(string, strict);
    return unescaped;
}
/*
 * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
 * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
 * performance may be bad. We can add more encoders for common character sets that are impacted by performance
 * issues on Android if required.
 *
 * Benchmarks:
 * OLD toHtml() impl v New (fastpath) in millis
 * Wiki: 1895, 16
 * CNN: 6378, 55
 * Alterslash: 3013, 28
 * Jsoup: 167, 2
 */
private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
    // todo add more charset tests if impacted by Android's bad perf in canEncode
    final boolean encodable;
    switch (charset) {
        case ascii:
            // 7-bit range only
            encodable = c < 0x80;
            break;
        case utf:
            // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
            encodable = true;
            break;
        default:
            // no fast path for this charset: ask the real encoder
            encodable = fallback.canEncode(c);
            break;
    }
    return encodable;
}
/**
 * Charset families distinguished by the canEncode fast path: ascii and utf are answered
 * directly, fallback defers to the CharsetEncoder.
 */
private enum CoreCharset {
    ascii,
    utf,
    fallback
}
/**
 * Classifies a charset name into the family used by the canEncode fast path.
 * @param name charset name, e.g. "US-ASCII" or "UTF-8"
 * @return ascii for "US-ASCII", utf for any name starting with "UTF-", otherwise fallback
 */
private static CoreCharset getCoreCharsetByName(String name) {
    final CoreCharset family;
    if (name.equals("US-ASCII")) {
        family = CoreCharset.ascii;
    } else if (name.startsWith("UTF-")) { // covers UTF-8, UTF-16, et al
        family = CoreCharset.utf;
    } else {
        family = CoreCharset.fallback;
    }
    return family;
}
// xhtml has restricted entities: each row is {entity name, character code}
private static final Object[][] xhtmlArray = {
    {"quot", 0x22},
    {"amp", 0x26},
    {"lt", 0x3C},
    {"gt", 0x3E}
};
static {
xhtmlByVal = new HashMap();
base = loadEntities("entities-base.properties"); // most common / default
baseByVal = toCharacterKey(base);
full = loadEntities("entities-full.properties"); // extended and overblown.
fullByVal = toCharacterKey(full);
for (Object[] entity : xhtmlArray) {
char c = (char) ((Integer) entity[1]).intValue();
xhtmlByVal.put(c, ((String) entity[0]));
}
}
/**
 * Loads an entity table from a properties resource located relative to this class. Each property
 * maps an entity name to its character code written in hexadecimal.
 * @param filename resource name, resolved via {@code Entities.class.getResourceAsStream}
 * @return map of entity name to character
 * @throws MissingResourceException if the resource cannot be found or cannot be read
 */
private static Map<String, Character> loadEntities(String filename) {
    Properties properties = new Properties();
    InputStream in = Entities.class.getResourceAsStream(filename);
    if (in == null) {
        // getResourceAsStream signals "not found" with null; fail with a clear message instead of an NPE
        throw new MissingResourceException("Error loading entities resource: " + filename + " not found",
                "Entities", filename);
    }
    try {
        try {
            properties.load(in);
        } finally {
            in.close(); // close even when load fails
        }
    } catch (IOException e) {
        MissingResourceException failure = new MissingResourceException(
                "Error loading entities resource: " + e.getMessage(), "Entities", filename);
        failure.initCause(e); // MissingResourceException has no cause-taking constructor
        throw failure;
    }

    Map<String, Character> entities = new HashMap<String, Character>();
    for (Object key : properties.keySet()) {
        String name = (String) key;
        // NOTE(review): the cast to char keeps only the low 16 bits, so a value above 0xFFFF would be truncated
        Character val = (char) Integer.parseInt(properties.getProperty(name), 16);
        entities.put(name, val);
    }
    return entities;
}
/**
 * Inverts a name-to-character map into a character-to-name map. When several names share a
 * character, an all-lower-case name replaces whatever was stored before it; a name containing
 * upper-case letters never replaces an existing entry.
 * @param inMap entity name to character
 * @return a new map of character to entity name
 */
private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) {
    Map<Character, String> outMap = new HashMap<Character, String>();
    for (Map.Entry<String, Character> entry : inMap.entrySet()) {
        Character character = entry.getValue();
        String name = entry.getKey();
        if (outMap.containsKey(character)) {
            // dupe, prefer the lower case version (Locale.ROOT keeps the check locale-independent)
            if (name.toLowerCase(Locale.ROOT).equals(name))
                outMap.put(character, name);
        } else {
            outMap.put(character, name);
        }
    }
    return outMap;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy