xmlparser.utils.Constants Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of simplexml Show documentation
A clean and simple XML parser, serializer, and deserializer.
The newest version!
package xmlparser.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HexFormat;
import java.util.LinkedHashMap;
import java.util.Map;

import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Map.entry;
import static java.util.Map.ofEntries;

/**
 * Shared character, string and entity-table constants used by the XML parser and serializer.
 *
 * <p>Declared as an enum without values so that it can never be instantiated; every member is static.
 */
public enum Constants {;

    // Single characters that have a syntactic meaning in XML.
    public static final char
        CHAR_FORWARD_SLASH = '/',
        CHAR_SPACE = ' ',
        CHAR_EQUALS = '=',
        CHAR_LESS_THAN = '<',
        CHAR_GREATER_THAN = '>',
        CHAR_QUESTION_MARK = '?',
        CHAR_SINGLE_QUOTE = '\'',
        CHAR_DOUBLE_QUOTE = '"',
        CHAR_AMPERSAND = '&';

    // Role-based aliases for the characters above.
    public static final char
        XML_TAG_START = CHAR_LESS_THAN,
        XML_TAG_END = CHAR_GREATER_THAN,
        XML_SELF_CLOSING = CHAR_FORWARD_SLASH,
        XML_PROLOG = CHAR_QUESTION_MARK;

    // Comment delimiters, without the surrounding '<' and '>'.
    public static final String
        XML_START_COMMENT = "!--",
        XML_END_COMMENT = "--";

    // String counterparts of frequently used characters.
    public static final String
        EMPTY = "",
        SPACE = " ",
        INDENT = "  ",
        LESS_THAN = "<",
        GREATER_THAN = ">",
        AMPERSAND = "&",
        EQUALS = "=",
        HASH = "#",
        SEMICOLON = ";",
        DOUBLE_QUOTE = "\"",
        FORWARD_SLASH = "/",
        CARRIAGE_RETURN = "\r",
        LINE_FEED = "\n";

    // Escaped forms of the five predefined XML entities. These must be the escape sequences themselves,
    // not the characters they stand for. ENCODED_UTF8 is the prefix of a numeric character reference.
    public static final String
        ENCODED_LESS_THAN = "&lt;",
        ENCODED_GREATER_THAN = "&gt;",
        ENCODED_SINGLE_QUOTE = "&apos;",
        ENCODED_DOUBLE_QUOTE = "&quot;",
        ENCODED_AMPERSAND = "&amp;",
        ENCODED_UTF8 = "&#";

    // Tokens of the path/predicate expression syntax.
    public static final String
        PREDICATE_START_SYMBOL = "[",
        PREDICATE_END_SYMBOL = "]",
        PREDICATE_EQUAL_SYMBOL = "=",
        SEGMENT_EXPRESSION = "text()",
        EXPRESSION_PATH_SEPARATOR = "/";

    // Error messages for invalid predicate expressions.
    public static final String
        ERROR_EQUALITY_WITHOUT_TWO_COMPONENTS = "Equality predicate must have exactly two members",
        ERROR_EQUALITY_WITH_EMPTY_PARTS = "Equality predicate must have two non-empty members",
        ERROR_PREDICATE_WRONG_START = "Predicate does not start with [",
        ERROR_PREDICATE_WRONG_END = "Predicate does not end with ]",
        ERROR_PREDICATE_WRONG_NAME = "Element name contains ]";

    // This list contains named entities from HTML 2.0, HTML 3.2, HTML 4.0, HTML 5.0, XML 1.0, MathML 2.0,
    // and MathML 3.0.
    //
    // https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    // https://www.freeformatter.com/html-entities.html
    //
    // Warning: The entities &DownBreve;, &tdot;, &TripleDot;, and &DotDot; had a leading space in MathML 2.0.
    // In MathML 3.0 and HTML 5.0 that leading space was removed. This map uses the newer version without the
    // leading space.
    //
    // Warning: It appears that HTML 3.2 allows named entities that do not end with a semicolon. Our parsing
    // does not support this.
    //
    // Warning: Our parsing uses Java chars to detect characters that need to be encoded. However, a Java char
    // contains only 16 bits, while some entities map onto characters that don't fit in 16 bits. It is therefore
    // not possible to encode those characters with the current implementation. HTML entities that fit this
    // definition are:
    //
    //    ∾̳, 𝔞, 𝔄, 𝔸, 𝕒, 𝒶, 𝒜, 𝔟, 𝔅, =⃥, ≡⃥, 𝔹,
    //    𝕓, 𝒷, ∩︀, 𝔠, 𝕔, 𝒸, 𝒞, ∪︀, 𝔇, 𝔡, 𝔻, 𝕕,
    //    ⃜, ̑, 𝒹, 𝒟, 𝔢, 𝔈, 𝕖, 𝔼, 𝔣, 𝔉, fj,
    //    𝕗, 𝔽, 𝒻, ⋛︀, 𝔊, 𝔤, 𝕘, 𝔾, 𝒢, ≩︀, ≩︀,
    //    𝔥, 𝕙, 𝒽, 𝔦, 𝕀, 𝕚, 𝒾, 𝔧, 𝔍, 𝕁, 𝕛, 𝒿,
    //    𝒥, 𝔎, 𝔨, 𝕂, 𝕜, 𝓀, 𝒦, ⪭︀, ⋚︀, 𝔩, 𝔏, 𝕃,
    //    𝕝, 𝓁, ≨︀, ≨︀, 𝔪, 𝔐, 𝕞, 𝕄, 𝓂, ∠⃒, ⩰̸,
    //    ≋̸, ≎̸, ≏̸, ⩭̸, ≐̸, ≂̸, 𝔫, 𝔑, ≧̸, ≧̸,
    //    ⩾̸, ⩾̸, ⋙̸, ≫⃒, ≫̸, ≦̸, ≦̸, ⩽̸, ⩽̸, ⋘̸, ≪⃒,
    //    ≪̸, 𝕟, ≧̸, ≫̸, ⩾̸,
    //    ≎̸, ≏̸, ⋵̸, ⋹̸, ⧏̸, ≪̸,
    //    ⩽̸, ⪢̸, ⪡̸, ⪯̸,
    //    ⧐̸, ⊏̸, ⊐̸, ⊂⃒, ⪰̸,
    //    ≿̸, ⊃⃒, ⫽⃥, ∂̸, ⪯̸, ⪯̸, ⤳̸, ↝̸,
    //    ⪰̸, 𝒩, 𝓃, ⫅̸, ⊂⃒, ⫅̸, ⪰̸, ⫆̸, ⊃⃒,
    //    ⫆̸, ≍⃒, ≥⃒, >⃒, ≤⃒, <⃒, ⊴⃒, ⊵⃒, ∼⃒, 𝔒,
    //    𝔬, 𝕠, 𝕆, 𝒪, 𝔭, 𝔓, 𝕡, 𝒫, 𝓅, 𝔮, 𝔔, 𝕢,
    //    𝓆, 𝒬, ∽̱, 𝔯, 𝕣, 𝓇, 𝔰, 𝔖, ⪬︀, 𝕊, 𝕤, ⊓︀,
    //    ⊔︀, 𝓈, 𝒮, ⃛, 𝔱, 𝔗,   , 𝕋, 𝕥, ⃛,
    //    𝓉, 𝒯, 𝔲, 𝔘, 𝕦, 𝕌, 𝒰, 𝓊, ⊊︀, ⫋︀,
    //    ⊋︀, ⫌︀, 𝔙, 𝔳, ⊂⃒, ⊃⃒, 𝕧, 𝕍, 𝒱, 𝓋,
    //    ⫋︀, ⊋︀, ⫌︀, 𝔴, 𝔚, 𝕨, 𝕎, 𝒲, 𝓌, 𝔵, 𝔛, 𝕩,
    //    𝕏, 𝒳, 𝓍, 𝔶, 𝔜, 𝕐, 𝕪, 𝓎, 𝒴, 𝔷, 𝕫, 𝓏, 𝒵
    //

    /**
     * Entity name (the text of a resource line up to and including its first {@code ';'}) mapped to the
     * string it decodes to, in the order the entries appear in the bundled resource.
     */
    public static final Map<String, String> NAMED_ENTITIES = loadNamedEntitiesMap();

    /**
     * Single Java char mapped to the shortest entity name in {@link #NAMED_ENTITIES} that decodes to it.
     * Entities whose decoded value is not exactly one char are not present.
     */
    public static final Map<Character, String> REVERSE_ENTITIES = loadReverseEntities(NAMED_ENTITIES);

    /**
     * Reads the classpath resource {@code /named-entities-map} into an insertion-ordered map.
     *
     * <p>Each non-empty line is split at its first {@code ';'}: everything up to and including the semicolon
     * is the key, and the remainder is a hex string whose bytes are decoded as UTF-8 to form the value.
     *
     * @return the entity table in resource order
     * @throws IllegalStateException if the resource is missing, cannot be read, or contains a line
     *                               without a {@code ';'}
     * @throws IllegalArgumentException if the part after the semicolon is not valid hexadecimal
     */
    private static Map<String, String> loadNamedEntitiesMap() {
        final var hexFormat = HexFormat.of();
        final var map = new LinkedHashMap<String, String>();

        final var in = Constants.class.getResourceAsStream("/named-entities-map");
        if (in == null) throw new IllegalStateException("Missing named entities map");

        // Decode the resource explicitly as UTF-8 so that loading does not depend on the platform charset.
        try (final var reader = new BufferedReader(new InputStreamReader(in, UTF_8))) {
            int lineNumber = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.isEmpty()) continue;

                final int offset = line.indexOf(';');
                // Without a semicolon there is no entity name; storing the entry under an empty key would
                // silently corrupt the table, so fail with the offending line number instead.
                if (offset < 0) {
                    throw new IllegalStateException(
                        "Malformed named entities map: no ';' on line " + lineNumber);
                }

                final var key = line.substring(0, offset + 1);
                final var value = new String(hexFormat.parseHex(line.substring(offset + 1)), UTF_8);
                map.put(key, value);
            }
        } catch (final IOException e) {
            throw new IllegalStateException("Failure during named entities map parsing", e);
        }

        return map;
    }

    /**
     * Builds the char-to-entity-name lookup from the given entity table.
     *
     * <p>Only entries whose value is exactly one Java char are considered. When several names decode to the
     * same char the shortest name wins; among names of equal length the one encountered first is kept.
     *
     * @param entities entity name to decoded value, iterated in its own order
     * @return a new map from single char to the preferred entity name
     */
    private static Map<Character, String> loadReverseEntities(final Map<String, String> entities) {
        final var map = new HashMap<Character, String>();

        for (final var entry : entities.entrySet()) {
            final var entityName = entry.getKey();
            final var entityChar = entry.getValue();

            // String.length() counts UTF-16 code units, exactly what toCharArray().length reports, so it
            // matches the parser's char-based splitting. Anything that is not exactly one char is skipped,
            // which also protects charAt(0) from an empty value.
            if (entityChar.length() != 1) continue;

            // Grab the smallest entityName that is available, no need to make big data
            map.merge(entityChar.charAt(0), entityName, (storedName, candidate)
                    -> candidate.length() < storedName.length() ? candidate : storedName);
        }

        return map;
    }

}