
xmlparser.utils.Constants Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of simplexml Show documentation
Show all versions of simplexml Show documentation
A clean and simple XML parser, serializer, and deserializer.
The newest version!
package xmlparser.utils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HexFormat;
import java.util.LinkedHashMap;
import java.util.Map;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Map.entry;
import static java.util.Map.ofEntries;
/**
 * Shared character and string constants, plus the named-entity lookup tables that are read from
 * the classpath resource {@code /named-entities-map} when this type is initialised.
 *
 * <p>Declared as an enum without values so it cannot be instantiated; every member is static.
 */
public enum Constants {;

    /** Single characters that are significant in XML markup. */
    public static final char
        CHAR_FORWARD_SLASH = '/',
        CHAR_SPACE = ' ',
        CHAR_EQUALS = '=',
        CHAR_LESS_THAN = '<',
        CHAR_GREATER_THAN = '>',
        CHAR_QUESTION_MARK = '?',
        CHAR_SINGLE_QUOTE = '\'',
        CHAR_DOUBLE_QUOTE = '"',
        CHAR_AMPERSAND = '&';

    /** Markup roles, expressed as aliases of the plain characters above. */
    public static final char
        XML_TAG_START = CHAR_LESS_THAN,
        XML_TAG_END = CHAR_GREATER_THAN,
        XML_SELF_CLOSING = CHAR_FORWARD_SLASH,
        XML_PROLOG = CHAR_QUESTION_MARK;

    /** Comment delimiters, without the surrounding tag start/end characters. */
    public static final String
        XML_START_COMMENT = "!--",
        XML_END_COMMENT = "--";

    /** String forms of commonly used characters and whitespace. */
    // NOTE(review): INDENT is currently identical to SPACE; if a wider indent was intended the extra
    // spaces may have been lost - confirm against the upstream project before relying on it.
    public static final String
        EMPTY = "",
        SPACE = " ",
        INDENT = " ",
        LESS_THAN = "<",
        GREATER_THAN = ">",
        AMPERSAND = "&",
        EQUALS = "=",
        HASH = "#",
        SEMICOLON = ";",
        DOUBLE_QUOTE = "\"",
        FORWARD_SLASH = "/",
        CARRIAGE_RETURN = "\r",
        LINE_FEED = "\n";

    /** Entity references for the characters that cannot appear literally in XML text. */
    // NOTE(review): ENCODED_UTF8 is restored as the hexadecimal character-reference prefix, to be
    // followed by hex digits and SEMICOLON - confirm against the upstream project.
    public static final String
        ENCODED_LESS_THAN = "&lt;",
        ENCODED_GREATER_THAN = "&gt;",
        ENCODED_SINGLE_QUOTE = "&apos;",
        ENCODED_DOUBLE_QUOTE = "&quot;",
        ENCODED_AMPERSAND = "&amp;",
        ENCODED_UTF8 = "&#x";

    /** Symbols that make up path and predicate expressions. */
    public static final String
        PREDICATE_START_SYMBOL = "[",
        PREDICATE_END_SYMBOL = "]",
        PREDICATE_EQUAL_SYMBOL = "=",
        SEGMENT_EXPRESSION = "text()",
        EXPRESSION_PATH_SEPARATOR = "/";

    /** Error messages for invalid predicates. */
    public static final String
        ERROR_EQUALITY_WITHOUT_TWO_COMPONENTS = "Equality predicate must have exactly two members",
        ERROR_EQUALITY_WITH_EMPTY_PARTS = "Equality predicate must have two non-empty members",
        ERROR_PREDICATE_WRONG_START = "Predicate does not start with [",
        ERROR_PREDICATE_WRONG_END = "Predicate does not end with ]",
        ERROR_PREDICATE_WRONG_NAME = "Element name contains ]";

    // This list contains named entities from HTML 2.0, HTML 3.2, HTML 4.0, HTML 5.0, XML 1.0, MathML 2.0,
    // and MathML 3.0.
    //
    // https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    // https://www.freeformatter.com/html-entities.html
    //
    // Warning: The entities &DownBreve;, &tdot;, &TripleDot;, and &DotDot; had a leading space in MathML 2.0.
    // In MathML 3.0 and HTML 5.0 that leading space was removed. This map uses the newer version without the
    // leading space.
    //
    // Warning: It appears that HTML 3.2 allows named entities that do not end with a semicolon. Our parsing
    // does not support this.
    //
    // Warning: Our parsing uses Java chars to detect characters that need to be encoded. However, a Java char
    // contains only 16 bits, while some entities map onto characters that don't fit in 16 bits. It is therefore
    // not possible to encode those characters with the current implementation. HTML entities that fit this
    // definition are (shown here as the characters they produce):
    //
    // ∾̳, 𝔞, 𝔄, 𝔸, 𝕒, 𝒶, 𝒜, 𝔟, 𝔅, =⃥, ≡⃥, 𝔹,
    // 𝕓, 𝒷, ∩︀, 𝔠, 𝕔, 𝒸, 𝒞, ∪︀, 𝔇, 𝔡, 𝔻, 𝕕,
    // ⃜, ̑, 𝒹, 𝒟, 𝔢, 𝔈, 𝕖, 𝔼, 𝔣, 𝔉, fj,
    // 𝕗, 𝔽, 𝒻, ⋛︀, 𝔊, 𝔤, 𝕘, 𝔾, 𝒢, ≩︀, ≩︀,
    // 𝔥, 𝕙, 𝒽, 𝔦, 𝕀, 𝕚, 𝒾, 𝔧, 𝔍, 𝕁, 𝕛, 𝒿,
    // 𝒥, 𝔎, 𝔨, 𝕂, 𝕜, 𝓀, 𝒦, ⪭︀, ⋚︀, 𝔩, 𝔏, 𝕃,
    // 𝕝, 𝓁, ≨︀, ≨︀, 𝔪, 𝔐, 𝕞, 𝕄, 𝓂, ∠⃒, ⩰̸,
    // ≋̸, ≎̸, ≏̸, ⩭̸, ≐̸, ≂̸, 𝔫, 𝔑, ≧̸, ≧̸,
    // ⩾̸, ⩾̸, ⋙̸, ≫⃒, ≫̸, ≦̸, ≦̸, ⩽̸, ⩽̸, ⋘̸, ≪⃒,
    // ≪̸, 𝕟, ≧̸, ≫̸, ⩾̸,
    // ≎̸, ≏̸, ⋵̸, ⋹̸, ⧏̸, ≪̸,
    // ⩽̸, ⪢̸, ⪡̸, ⪯̸,
    // ⧐̸, ⊏̸, ⊐̸, ⊂⃒, ⪰̸,
    // ≿̸, ⊃⃒, ⫽⃥, ∂̸, ⪯̸, ⪯̸, ⤳̸, ↝̸,
    // ⪰̸, 𝒩, 𝓃, ⫅̸, ⊂⃒, ⫅̸, ⪰̸, ⫆̸, ⊃⃒,
    // ⫆̸, ≍⃒, ≥⃒, >⃒, ≤⃒, <⃒, ⊴⃒, ⊵⃒, ∼⃒, 𝔒,
    // 𝔬, 𝕠, 𝕆, 𝒪, 𝔭, 𝔓, 𝕡, 𝒫, 𝓅, 𝔮, 𝔔, 𝕢,
    // 𝓆, 𝒬, ∽̱, 𝔯, 𝕣, 𝓇, 𝔰, 𝔖, ⪬︀, 𝕊, 𝕤, ⊓︀,
    // ⊔︀, 𝓈, 𝒮, ⃛, 𝔱, 𝔗,   , 𝕋, 𝕥, ⃛,
    // 𝓉, 𝒯, 𝔲, 𝔘, 𝕦, 𝕌, 𝒰, 𝓊, ⊊︀, ⫋︀,
    // ⊋︀, ⫌︀, 𝔙, 𝔳, ⊂⃒, ⊃⃒, 𝕧, 𝕍, 𝒱, 𝓋,
    // ⫋︀, ⊋︀, ⫌︀, 𝔴, 𝔚, 𝕨, 𝕎, 𝒲, 𝓌, 𝔵, 𝔛, 𝕩,
    // 𝕏, 𝒳, 𝓍, 𝔶, 𝔜, 𝕐, 𝕪, 𝓎, 𝒴, 𝔷, 𝕫, 𝓏, 𝒵
    //

    /**
     * Entity name (everything up to and including the first {@code ;} of a resource line) mapped to
     * the text it stands for, in resource-file order.
     */
    public static final Map<String, String> NAMED_ENTITIES = loadNamedEntitiesMap();

    /** Single-{@code char} text mapped to the shortest entity name in {@link #NAMED_ENTITIES} that yields it. */
    public static final Map<Character, String> REVERSE_ENTITIES = loadReverseEntities(NAMED_ENTITIES);

    /**
     * Reads {@code /named-entities-map} from the classpath.
     *
     * <p>Each non-empty line is split at its first {@code ;}: the part up to and including the
     * semicolon is the key, the remainder is hex that is decoded to bytes and then to a UTF-8 string.
     *
     * @return the entities in file order
     * @throws IllegalStateException if the resource is missing, cannot be read, or a line has no {@code ;}
     * @throws IllegalArgumentException if the hex part of a line is not valid hex
     */
    private static Map<String, String> loadNamedEntitiesMap() {
        final var in = Constants.class.getResourceAsStream("/named-entities-map");
        if (in == null) throw new IllegalStateException("Missing named entities map");

        final var hexFormat = HexFormat.of();
        final var map = new LinkedHashMap<String, String>();
        // Explicit charset: the platform default must not influence how the resource is read.
        try (final var reader = new BufferedReader(new InputStreamReader(in, UTF_8))) {
            int lineNumber = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.isEmpty()) continue;

                final int offset = line.indexOf(';');
                // Without a semicolon the key would be empty and could later win as the "shortest
                // name" in the reverse map, so reject the line instead of storing it.
                if (offset < 0) {
                    throw new IllegalStateException(
                        "Malformed named entities map: no ';' on line " + lineNumber);
                }
                final var key = line.substring(0, offset + 1);
                final var value = new String(hexFormat.parseHex(line.substring(offset + 1)), UTF_8);
                map.put(key, value);
            }
        } catch (final IOException e) {
            throw new IllegalStateException("Failure during named entities map parsing", e);
        }
        return map;
    }

    /**
     * Builds the char-to-name lookup from the name-to-text map.
     *
     * <p>Only entities whose text is exactly one {@code char} are included. When several names map
     * to the same char the shortest wins; among equally short names the one iterated first is kept.
     *
     * @param entities entity name to replacement text
     * @return a new mutable map from char to entity name
     */
    private static Map<Character, String> loadReverseEntities(final Map<String, String> entities) {
        final var map = new HashMap<Character, String>();
        for (final var entry : entities.entrySet()) {
            final var entityName = entry.getKey();
            final var entityChar = entry.getValue();
            // String.length() counts UTF-16 code units, exactly like toCharArray().length, so this
            // skips both multi-char values and empty ones (which have no charAt(0)).
            if (entityChar.length() != 1) continue;
            // Grab the smallest entityName that is available, no need to make big data
            map.merge(entityChar.charAt(0), entityName,
                (storedName, candidate) -> candidate.length() < storedName.length() ? candidate : storedName);
        }
        return map;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy