net.pwall.xml.XML Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of xmlutil Show documentation
XML Utilities
There is a newer version: 2.2
/*
 * @(#) XML.java
 */

package net.pwall.xml;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import net.pwall.util.CharMapper;
import net.pwall.util.CharMapperEntry;
import net.pwall.util.CharUnmapper;
import net.pwall.util.Strings;
import net.pwall.util.Strings.SpaceTest;

/**
 * Static methods for working with XML.
 *
 * <p>The descriptions of several methods in this class mention the definition of white space in
 * the XML specification.  This is a reference to the definition in Section 2.3 of
 * <a href="https://www.w3.org/TR/xml/#sec-common-syn">Extensible Markup Language (XML) 1.0
 * (Fifth Edition)</a>.</p>
 *
 * @author  Peter Wall
 *
 */
public class XML {

    /** Identifier (URI) of the SAX {@code namespaces} feature. */
    public static final String NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces";
    /** Identifier (URI) of the SAX {@code validation} feature. */
    public static final String VALIDATION_FEATURE = "http://xml.org/sax/features/validation";
    /** Identifier (URI) of the SAX {@code resolve-dtd-uris} feature. */
    public static final String RESOLVE_DTD_URIS_FEATURE =
            "http://xml.org/sax/features/resolve-dtd-uris";
    /** Identifier (URI) of the SAX {@code external-general-entities} feature. */
    public static final String EXTERNAL_GENERAL_ENTITIES_FEATURE =
            "http://xml.org/sax/features/external-general-entities";
    /** Identifier (URI) of the SAX {@code external-parameter-entities} feature. */
    public static final String EXTERNAL_PARAMETER_ENTITIES_FEATURE =
            "http://xml.org/sax/features/external-parameter-entities";
    /** Identifier (URI) of the SAX {@code lexical-handler} property. */
    public static final String LEXICAL_HANDLER_PROPERTY =
            "http://xml.org/sax/properties/lexical-handler";

    // Lazily-created shared factories; populated on first use by getDocumentBuilderFactory()
    // and getDocumentBuilderFactoryNS() respectively (both of which are synchronized).
    private static DocumentBuilderFactory docBuilderFactory = null;
    private static DocumentBuilderFactory docBuilderFactoryNS = null;

    /**
     * Adapter that exposes {@link #isWhiteSpace(int)} as a {@link SpaceTest}, so that the
     * {@link Strings} helpers called from this class (trim, split) use the XML definition of
     * white space.
     */
    private static final SpaceTest spaceTest = new SpaceTest() {
        @Override
        public boolean isSpace(int ch) {
            return isWhiteSpace(ch);
        }
    };

    private static final CharMapperEntry[] predefinedEntityMappings = new CharMapperEntry[] {
        new CharMapperEntry('&', "&"),
        new CharMapperEntry('\'', "'"),
        new CharMapperEntry('>', ">"),
        new CharMapperEntry('<', "<"),
        new CharMapperEntry('"', """)
    };

    private static final CharMapper defaultCharMapper = new CharMapper() {
        @Override
        public String map(int codePoint) {
            if (codePoint == '<')
                return "<";
            if (codePoint == '>')
                return ">";
            if (codePoint == '&')
                return "&";
            if (codePoint == '"')
                return """;
            if (codePoint < ' ' && !isWhiteSpace(codePoint) || codePoint >= 0x7F) {
                StringBuilder sb = new StringBuilder(10);
                sb.append("&#");
                sb.append(codePoint);
                sb.append(';');
                return sb.toString();
            }
            return null;
        }
    };

    private static final CharMapper allCharMapper = new CharMapper() {
        @Override
        public String map(int codePoint) {
            if (codePoint == '<')
                return "<";
            if (codePoint == '>')
                return ">";
            if (codePoint == '&')
                return "&";
            if (codePoint == '"')
                return """;
            if (codePoint == '\'')
                return "'";
            if (codePoint < ' ' && !isWhiteSpace(codePoint) || codePoint >= 0x7F) {
                StringBuilder sb = new StringBuilder(10);
                sb.append("&#");
                sb.append(codePoint);
                sb.append(';');
                return sb.toString();
            }
            return null;
        }
    };

    /**
     * Mapper used by the {@code escapeData} methods.  It replaces {@code <}, {@code >} and
     * {@code &} with their predefined entity references, and replaces any code point below
     * 0x20 that is not XML white space, or at or above 0x7F, with a decimal character
     * reference.  Quote characters are left alone.
     */
    private static final CharMapper dataCharMapper = new CharMapper() {
        /**
         * @param   codePoint   the code point to examine
         * @return              the replacement text, or {@code null} when this mapper has no
         *                      replacement for the code point
         */
        @Override
        public String map(int codePoint) {
            if (codePoint == '<')
                return "&lt;";
            if (codePoint == '>')
                return "&gt;";
            if (codePoint == '&')
                return "&amp;";
            // control characters (other than tab, LF, CR) and everything outside printable ASCII
            if ((codePoint < ' ' && !isWhiteSpace(codePoint)) || codePoint >= 0x7F)
                return "&#" + codePoint + ';';
            return null;
        }
    };

    /**
     * Unmapper that decodes XML character references ({@code &#...;}) and the predefined
     * entity references listed in {@code predefinedEntityMappings}.
     */
    private static final CharUnmapper unmapper = new CharUnmapper() {

        /**
         * An escape sequence starts wherever an ampersand is found.
         */
        @Override
        public boolean isEscape(CharSequence s, int offset) {
            return '&' == s.charAt(offset);
        }

        /**
         * Decode the reference that starts at {@code offset}, append the decoded character to
         * {@code sb} and return the number of input characters consumed.
         *
         * @throws  IllegalArgumentException if the reference is unterminated, contains an
         *          illegal digit, denotes an illegal code point or names an unknown entity
         */
        @Override
        public int unmap(StringBuilder sb, CharSequence s, int offset) {
            int length = s.length();
            int digits = offset + 1;

            if (digits >= length || s.charAt(digits) != '#') {
                // not a character reference, so it must be one of the predefined entities
                for (CharMapperEntry entry : predefinedEntityMappings) {
                    String mapping = entry.getString();
                    if (entriesEqual(s, offset, mapping)) {
                        sb.append((char)entry.getCodePoint()); // guaranteed to be in BMP
                        return mapping.length();
                    }
                }
                throw new IllegalArgumentException("Illegal entity reference");
            }

            // character reference: step over the '#' and locate the terminating semicolon
            digits++;
            int end = digits;
            for (;;) {
                if (end >= length)
                    throw new IllegalArgumentException("Unclosed character reference");
                if (s.charAt(end++) == ';')
                    break;
            }
            // 'end' is now the index just past the semicolon

            int codePoint;
            try {
                // a lower-case 'x' selects the hexadecimal conversion helper
                codePoint = s.charAt(digits) == 'x' ?
                        Strings.convertHexToInt(s, digits + 1, end - 1) :
                        Strings.convertToInt(s, digits, end - 1);
            }
            catch (NumberFormatException nfe) {
                throw new IllegalArgumentException("Illegal digit in character reference");
            }

            // accept supplementary code points and non-surrogate BMP code points only
            boolean acceptable = Character.isSupplementaryCodePoint(codePoint) ||
                    Character.isBmpCodePoint(codePoint) &&
                    !Character.isSurrogate((char)codePoint);
            if (!acceptable)
                throw new IllegalArgumentException("Illegal character reference");
            sb.appendCodePoint(codePoint); // one char for BMP, a surrogate pair otherwise
            return end - offset;
        }

    };

    /**
     * Test whether {@code target} occurs in {@code source} at index {@code start}.
     *
     * @param   source  the sequence to look in
     * @param   start   the index in {@code source} at which the comparison begins
     * @param   target  the sequence expected at that position
     * @return          {@code true} if every character of {@code target} matches;
     *                  {@code false} on a mismatch or if {@code source} is too short
     */
    private static boolean entriesEqual(CharSequence source, int start, CharSequence target) {
        int targetLength = target.length();
        if (start + targetLength > source.length())
            return false;
        int offset = 0;
        while (offset < targetLength) {
            if (target.charAt(offset) != source.charAt(start + offset))
                return false;
            offset++;
        }
        return true;
    }

    /**
     * Private constructor - the members of this class are all static, and the class is not to
     * be instantiated.
     */
    private XML() {
    }

    /**
     * Parse an XML document read from an {@link InputStream}.  The parser is obtained from
     * {@link #getDocumentBuilder()}, so the caller does not need to deal with
     * {@link DocumentBuilderFactory} or {@link DocumentBuilder} directly.
     *
     * @param   is      the {@link InputStream} supplying the document
     * @return          the resulting DOM {@link Document}
     * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not
     *                                       create the {@link DocumentBuilder}
     * @throws  SAXException if any parse errors occur
     * @throws  IOException if any I/O errors occur
     */
    public static Document parse(InputStream is)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilder builder = getDocumentBuilder();
        return builder.parse(is);
    }

    /**
     * Parse an XML document read from an {@link InputStream}, supplying a systemId for the
     * document.  The parser is obtained from {@link #getDocumentBuilder()}, so the caller does
     * not need to deal with {@link DocumentBuilderFactory} or {@link DocumentBuilder} directly.
     *
     * @param   is          the {@link InputStream} supplying the document
     * @param   systemId    the systemId passed to the parser along with the stream
     * @return              the resulting DOM {@link Document}
     * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not
     *                                       create the {@link DocumentBuilder}
     * @throws  SAXException if any parse errors occur
     * @throws  IOException if any I/O errors occur
     */
    public static Document parse(InputStream is, String systemId)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilder builder = getDocumentBuilder();
        return builder.parse(is, systemId);
    }

    /**
     * Parse the XML document located at a URI.  The parser is obtained from
     * {@link #getDocumentBuilder()}, so the caller does not need to deal with
     * {@link DocumentBuilderFactory} or {@link DocumentBuilder} directly.
     *
     * @param   uri     the URI of the document
     * @return          the resulting DOM {@link Document}
     * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not
     *                                       create the {@link DocumentBuilder}
     * @throws  SAXException if any parse errors occur
     * @throws  IOException if any I/O errors occur
     */
    public static Document parse(String uri)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilder builder = getDocumentBuilder();
        return builder.parse(uri);
    }

    /**
     * Parse the XML document held in a {@link File}.  The parser is obtained from
     * {@link #getDocumentBuilder()}, so the caller does not need to deal with
     * {@link DocumentBuilderFactory} or {@link DocumentBuilder} directly.
     *
     * @param   f       the {@link File} containing the document
     * @return          the resulting DOM {@link Document}
     * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not
     *                                       create the {@link DocumentBuilder}
     * @throws  SAXException if any parse errors occur
     * @throws  IOException if any I/O errors occur
     */
    public static Document parse(File f)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilder builder = getDocumentBuilder();
        return builder.parse(f);
    }

    /**
     * Parse an XML document supplied as an {@link InputSource}.  The parser is obtained from
     * {@link #getDocumentBuilder()}, so the caller does not need to deal with
     * {@link DocumentBuilderFactory} or {@link DocumentBuilder} directly.
     *
     * @param   is      the {@link InputSource} supplying the document
     * @return          the resulting DOM {@link Document}
     * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not
     *                                       create the {@link DocumentBuilder}
     * @throws  SAXException if any parse errors occur
     * @throws  IOException if any I/O errors occur
     */
    public static Document parse(InputSource is)
            throws ParserConfigurationException, SAXException, IOException {
        DocumentBuilder builder = getDocumentBuilder();
        return builder.parse(is);
    }

    /**
     * Create a {@link DocumentBuilder} from the shared {@link DocumentBuilderFactory} returned
     * by {@link #getDocumentBuilderFactory()}.
     *
     * @return  a new {@link DocumentBuilder}
     * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not
     *                                       create the {@link DocumentBuilder}
     */
    public static DocumentBuilder getDocumentBuilder() throws ParserConfigurationException {
        DocumentBuilderFactory factory = getDocumentBuilderFactory();
        return factory.newDocumentBuilder();
    }

    /**
     * Create a namespace-aware {@link DocumentBuilder} from the shared namespace-aware
     * {@link DocumentBuilderFactory} returned by {@link #getDocumentBuilderFactoryNS()}.
     *
     * @return  a new namespace-aware {@link DocumentBuilder}
     * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not
     *                                       create the {@link DocumentBuilder}
     */
    public static DocumentBuilder getDocumentBuilderNS() throws ParserConfigurationException {
        DocumentBuilderFactory factory = getDocumentBuilderFactoryNS();
        return factory.newDocumentBuilder();
    }

    /**
     * Get the shared {@link DocumentBuilderFactory}, creating it on the first call.  The
     * factory comes from {@link DocumentBuilderFactory#newInstance()} and is not configured
     * any further here.
     *
     * <p>NOTE(review): the factory is left at its default settings; if documents from untrusted
     * sources are parsed, consider whether DTD / external entity processing should be
     * restricted by the caller.</p>
     *
     * @return  the shared {@link DocumentBuilderFactory}
     */
    public static synchronized DocumentBuilderFactory getDocumentBuilderFactory() {
        DocumentBuilderFactory factory = docBuilderFactory;
        if (factory == null) {
            factory = DocumentBuilderFactory.newInstance();
            docBuilderFactory = factory;
        }
        return factory;
    }

    /**
     * Get the shared namespace-aware {@link DocumentBuilderFactory}, creating it on the first
     * call.  The factory comes from {@link DocumentBuilderFactory#newInstance()} and has
     * namespace awareness switched on; it is not configured any further here.
     *
     * <p>NOTE(review): apart from namespace awareness the factory is left at its default
     * settings; if documents from untrusted sources are parsed, consider whether DTD /
     * external entity processing should be restricted by the caller.</p>
     *
     * @return  the shared namespace-aware {@link DocumentBuilderFactory}
     */
    public static synchronized DocumentBuilderFactory getDocumentBuilderFactoryNS() {
        if (docBuilderFactoryNS != null)
            return docBuilderFactoryNS;
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        docBuilderFactoryNS = factory;
        factory.setNamespaceAware(true);
        return factory;
    }

    /**
     * Create a new, empty {@link Document} using a builder from {@link #getDocumentBuilder()}.
     * A {@link ParserConfigurationException} from the builder lookup is rethrown wrapped in a
     * {@link RuntimeException}, so callers need not handle a checked exception.
     *
     * @return  the new {@link Document}
     * @throws  RuntimeException on parser configuration errors
     */
    public static Document newDocument() {
        final DocumentBuilder builder;
        try {
            builder = getDocumentBuilder();
        }
        catch (ParserConfigurationException pce) {
            throw new RuntimeException("Parser configuration error", pce);
        }
        return builder.newDocument();
    }

    /**
     * Escape a string for use in XML by passing it to {@code Strings.escapeUTF16} with the
     * default mapper (the same mapper used by {@link #escape(String)}).
     * NOTE(review): presumably this variant treats the input as UTF-16 and maps surrogate
     * pairs as single code points - confirm against {@code Strings.escapeUTF16}.
     *
     * @param   s       the string to be escaped
     * @return          the escaped string
     */
    public static String escapeUTF16(String s) {
        final String escaped = Strings.escapeUTF16(s, defaultCharMapper);
        return escaped;
    }

    /**
     * Escape a string for use in XML.  Specifically, this method converts:
     * <dl>
     *   <dt>{@code <} (less than)</dt>
     *   <dd>{@code &lt;}</dd>
     *   <dt>{@code >} (greater than)</dt>
     *   <dd>{@code &gt;}</dd>
     *   <dt>{@code &} (ampersand)</dt>
     *   <dd>{@code &amp;}</dd>
     *   <dt>{@code "} (double quote)</dt>
     *   <dd>{@code &quot;}</dd>
     *   <dt>Characters less than 0x20 (except for 0x09, 0x0A, 0x0D) or greater than 0x7E</dt>
     *   <dd>{@code &#nnn;} (where nnn is the code position in decimal)</dd>
     * </dl>
     *
     * @param   s       the string to be escaped
     * @return          the escaped string
     * @throws          NullPointerException if the input string is null
     */
    public static String escape(String s) {
        final String escaped = Strings.escape(s, defaultCharMapper);
        return escaped;
    }

    /**
     * Escape a {@link CharSequence} for use in XML, using the default mapper.  The characters
     * affected are {@code <}, {@code >}, {@code &}, {@code "} and any character less than 0x20
     * (except for 0x09, 0x0A, 0x0D) or greater than 0x7E.
     *
     * @param   cs      the {@link CharSequence} to be escaped
     * @return          the escaped {@link CharSequence}
     */
    public static CharSequence escape(CharSequence cs) {
        final CharSequence escaped = Strings.escape(cs, defaultCharMapper);
        return escaped;
    }

    /**
     * Escape a string for use in XML, using the mapper that also handles the apostrophe.  The
     * characters affected are {@code <}, {@code >}, {@code &}, {@code "}, {@code '} and any
     * character less than 0x20 (except for 0x09, 0x0A, 0x0D) or greater than 0x7E.
     *
     * @param   s       the string to be escaped
     * @return          the escaped string
     */
    public static String escapeAll(String s) {
        final String escaped = Strings.escape(s, allCharMapper);
        return escaped;
    }

    /**
     * Escape a {@link CharSequence} for use in XML, using the mapper that also handles the
     * apostrophe.  The characters affected are {@code <}, {@code >}, {@code &}, {@code "},
     * {@code '} and any character less than 0x20 (except for 0x09, 0x0A, 0x0D) or greater than
     * 0x7E.
     *
     * @param   cs      the {@link CharSequence} to be escaped
     * @return          the escaped {@link CharSequence}
     */
    public static CharSequence escapeAll(CharSequence cs) {
        final CharSequence escaped = Strings.escape(cs, allCharMapper);
        return escaped;
    }

    /**
     * Escape a string for use as XML character data, using the mapper that leaves quote
     * characters alone.  The characters affected are {@code <}, {@code >}, {@code &} and any
     * character less than 0x20 (except for 0x09, 0x0A, 0x0D) or greater than 0x7E.
     *
     * @param   s       the string to be escaped
     * @return          the escaped string
     */
    public static String escapeData(String s) {
        final String escaped = Strings.escape(s, dataCharMapper);
        return escaped;
    }

    /**
     * Escape a {@link CharSequence} for use as XML character data, using the mapper that
     * leaves quote characters alone.  The characters affected are {@code <}, {@code >},
     * {@code &} and any character less than 0x20 (except for 0x09, 0x0A, 0x0D) or greater than
     * 0x7E.
     *
     * @param   cs      the {@link CharSequence} to be escaped
     * @return          the escaped {@link CharSequence}
     */
    public static CharSequence escapeData(CharSequence cs) {
        final CharSequence escaped = Strings.escape(cs, dataCharMapper);
        return escaped;
    }

    /**
     * Append a {@link CharSequence} to an {@link Appendable}, escaping it with the default
     * mapper (the one used by {@link #escape(String)}).
     *
     * @param   a       the {@link Appendable} to receive the output
     * @param   cs      the {@link CharSequence} to be escaped
     * @throws  IOException if thrown by the {@link Appendable}
     */
    public static void appendEscaped(Appendable a, CharSequence cs) throws IOException {
        final CharMapper mapper = defaultCharMapper;
        Strings.appendEscaped(a, cs, mapper);
    }

    /**
     * Append a {@link CharSequence} to an {@link Appendable}, escaping it with the mapper that
     * also handles the apostrophe (the one used by {@link #escapeAll(String)}).
     *
     * @param   a       the {@link Appendable} to receive the output
     * @param   cs      the {@link CharSequence} to be escaped
     * @throws  IOException if thrown by the {@link Appendable}
     */
    public static void appendEscapedAll(Appendable a, CharSequence cs) throws IOException {
        final CharMapper mapper = allCharMapper;
        Strings.appendEscaped(a, cs, mapper);
    }

    /**
     * Append a {@link CharSequence} to an {@link Appendable}, escaping it with the mapper that
     * leaves quote characters alone (the one used by {@link #escapeData(String)}).
     *
     * @param   a       the {@link Appendable} to receive the output
     * @param   cs      the {@link CharSequence} to be escaped
     * @throws  IOException if thrown by the {@link Appendable}
     */
    public static void appendEscapedData(Appendable a, CharSequence cs) throws IOException {
        final CharMapper mapper = dataCharMapper;
        Strings.appendEscaped(a, cs, mapper);
    }

    /**
     * Reverse XML escaping: replace character references and predefined entity references in
     * a string with the characters they denote.
     *
     * @param   s       the string to be unescaped
     * @return          the unescaped string
     */
    public static String unescape(String s) {
        final String unescaped = Strings.unescape(s, unmapper);
        return unescaped;
    }

    /**
     * Remove leading and trailing white space from a {@link String}, where white space is as
     * defined by the XML specification (see {@link #isWhiteSpace(int)}).
     *
     * @param   s       the {@link String} to be trimmed
     * @return          the trimmed {@link String}
     * @throws          NullPointerException if the input string is {@code null}
     */
    public static String trim(String s) {
        final String trimmed = Strings.trim(s, spaceTest);
        return trimmed;
    }

    /**
     * Remove leading and trailing white space from a {@link CharSequence}, where white space
     * is as defined by the XML specification (see {@link #isWhiteSpace(int)}).
     *
     * @param   cs      the {@link CharSequence} to be trimmed
     * @return          the trimmed {@link CharSequence}
     * @throws          NullPointerException if the input {@link CharSequence} is {@code null}
     */
    public static CharSequence trim(CharSequence cs) {
        final CharSequence trimmed = Strings.trim(cs, spaceTest);
        return trimmed;
    }

    /**
     * Tests whether a {@link CharSequence} (a {@link String}, {@link StringBuilder} etc.)
     * consists solely of white space characters, as defined by the XML specification.  An
     * empty sequence yields {@code true}.
     *
     * @param   cs      the {@link CharSequence}
     * @return          {@code true} if the contents are all space characters
     * @throws          NullPointerException if the input {@link CharSequence} is {@code null}
     */
    public static boolean isAllWhiteSpace(CharSequence cs) {
        int length = cs.length();
        int index = 0;
        // advance past white space; stopping early means a non-space character was found
        while (index < length && isWhiteSpace(cs.charAt(index)))
            index++;
        return index == length;
    }

    /**
     * Tests whether a Unicode code point is a white space character, using the definition of
     * white space in the XML specification: space (0x20), tab (0x09), line feed (0x0A) and
     * carriage return (0x0D).
     *
     * @param   cp      the code point to be tested
     * @return          {@code true} if the code point is a white space character
     */
    public static boolean isWhiteSpace(int cp) {
        switch (cp) {
        case ' ':
        case '\n':
        case '\t':
        case '\r':
            return true;
        default:
            return false;
        }
    }

    /**
     * Split a whole string into tokens delimited by white space, as defined by the XML
     * specification.  Equivalent to {@code split(s, 0, s.length())}.
     *
     * @param   s       the string to be split
     * @return          an array of tokens
     * @throws          NullPointerException if the input string is {@code null}
     */
    public static String[] split(String s) {
        int end = s.length();
        return split(s, 0, end);
    }

    /**
     * Split part of a string into tokens delimited by white space, as defined by the XML
     * specification.
     *
     * @param   s       the string to be split
     * @param   start   the start index of the portion to be examined
     * @param   end     the end index (exclusive) of the portion to be examined
     * @return          an array of tokens
     * @throws          NullPointerException if the input string is {@code null}
     * @throws          StringIndexOutOfBoundsException if {@code start} or {@code end} is
     *                  invalid
     */
    public static String[] split(String s, int start, int end) {
        final String[] tokens = Strings.split(s, start, end, spaceTest);
        return tokens;
    }

    /**
     * Create an {@link ElementIterator} for a parent node.
     * NOTE(review): presumably the iterator visits the child elements of {@code parent} -
     * confirm against {@link ElementIterator}.
     *
     * @param   parent  the parent {@link Node}
     * @return          a new {@link ElementIterator} constructed with {@code parent}
     */
    public static ElementIterator elementIterator(Node parent) {
        final ElementIterator iterator = new ElementIterator(parent);
        return iterator;
    }

    /**
     * Test whether the tag name of an {@link Element} equals a specified name.  The
     * comparison is null-safe: two {@code null} names are considered equal.
     *
     * @param   elem    the {@link Element}
     * @param   tagName the tag name
     * @return          {@code true} if the names match
     */
    public static boolean match(Element elem, String tagName) {
        String actual = elem.getTagName();
        return actual == null ? tagName == null : actual.equals(tagName);
    }

    /**
     * Test whether an {@link Element} has a specified local name and namespace URI.  Both
     * comparisons are null-safe; the namespace URI is only examined when the local names
     * agree.
     *
     * @param   elem            the {@link Element}
     * @param   localName       the local portion of the tag name
     * @param   namespaceURI    the namespace URI
     * @return                  {@code true} if the names match
     */
    public static boolean matchNS(Element elem, String localName, String namespaceURI) {
        if (!Objects.equals(elem.getLocalName(), localName))
            return false;
        return Objects.equals(elem.getNamespaceURI(), namespaceURI);
    }

    /**
     * Get the root element of a document, verifying that its local name and namespace URI are
     * the ones expected.
     *
     * @param   document    the document
     * @param   name        the expected root element name
     * @param   nsuri       the namespace URI
     * @return              the root element
     * @throws  XMLException if the document has no root element, or the root element name is
     *                       incorrect
     */
    public static Element getDocumentElement(Document document, String name, String nsuri) {
        Element root = document.getDocumentElement();
        if (root != null) {
            if (matchNS(root, name, nsuri))
                return root;
            throw new XMLException("Incorrect document element: <" + root.getTagName() + '>');
        }
        throw new XMLException("Missing document element");
    }

    /**
     * Get the root element of a document, verifying that its tag name is the one expected.
     *
     * @param   document    the document
     * @param   name        the expected root element name
     * @return              the root element
     * @throws  XMLException if the document has no root element, or the root element name is
     *                       incorrect
     */
    public static Element getDocumentElement(Document document, String name) {
        Element root = document.getDocumentElement();
        if (root != null) {
            if (match(root, name))
                return root;
            throw new XMLException("Incorrect document element: <" + root.getTagName() + '>');
        }
        throw new XMLException("Missing document element");
    }

    /**
     * Build (but do not throw) an "unrecognized element" exception.  The message names the
     * element and, when its parent is also an element, the parent as well.
     *
     * @param   elem    the element which was not recognized
     * @return          the exception
     */
    public static XMLException unrecognizedElement(Element elem) {
        String message = "Unrecognized element <" + elem.getTagName() + '>';
        Node parent = elem.getParentNode();
        // instanceof is false for null, so no separate null check is needed
        if (parent instanceof Element)
            message += " in <" + ((Element)parent).getTagName() + '>';
        return new XMLException(message);
    }

    /**
     * Check that a node has no significant content, by applying {@link #checkNode(Node)} to
     * each of its child nodes in turn.  Every child must therefore be either a comment node or
     * a whitespace-only text node.
     *
     * @param   node    the node
     * @throws  XMLException if the node has child nodes that are not comment
     *                       nodes or empty text nodes
     */
    public static void checkChildNodes(Node node) {
        NodeList children = node.getChildNodes();
        int count = children.getLength();
        int index = 0;
        while (index < count)
            checkNode(children.item(index++));
    }

    /**
     * Check that a node is a comment node or a whitespace-only text node.  This is intended
     * for code that walks a list of nodes looking for elements and wants to confirm that the
     * nodes it skips carry no content: comments are accepted, text nodes are passed to
     * {@link #checkText(Text)}, and any other node type is rejected.
     *
     * @param   node    the node
     * @throws  XMLException if the node is not a comment or an empty text node
     */
    public static void checkNode(Node node) {
        if (node instanceof Comment)
            return;
        if (!(node instanceof Text)) {
            StringBuilder message = new StringBuilder("Incorrect node type");
            Node parent = node.getParentNode();
            // name the enclosing element, when there is one, to help locate the problem
            if (parent instanceof Element)
                message.append(" in <").append(((Element)parent).getTagName()).append('>');
            throw new XMLException(message.toString());
        }
        checkText((Text)node);
    }

    /**
     * Check that a Text node contains nothing but white space, as defined by the XML
     * specification.
     *
     * @param   text    the Text node
     * @throws  XMLException if the node is not empty
     */
    public static void checkText(Text text) {
        if (isAllWhiteSpace(text.getData()))
            return;
        StringBuilder message = new StringBuilder("No text allowed");
        Node parent = text.getParentNode();
        // name the enclosing element, when there is one, to help locate the problem
        if (parent instanceof Element)
            message.append(" in <").append(((Element)parent).getTagName()).append('>');
        throw new XMLException(message.toString());
    }

}