// net.pwall.xml.XML (Maven / Gradle / Ivy)
/*
* @(#) XML.java
*/
package net.pwall.xml;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import net.pwall.util.CharMapper;
import net.pwall.util.CharMapperEntry;
import net.pwall.util.CharUnmapper;
import net.pwall.util.Strings;
import net.pwall.util.Strings.SpaceTest;
/**
 * Static methods for working with XML.
 *
 * <p>The descriptions of several methods in this class mention the definition of white space in
 * the XML specification.  This is a reference to the definition in Section 2.3 of
 * <a href="https://www.w3.org/TR/REC-xml/#sec-common-syn">Extensible Markup Language (XML) 1.0
 * (Fifth Edition)</a>.</p>
 *
 * @author Peter Wall
 */
public class XML {
/** SAX feature identifier for namespace processing. */
public static final String NAMESPACES_FEATURE = "http://xml.org/sax/features/namespaces";
/** SAX feature identifier for validation. */
public static final String VALIDATION_FEATURE = "http://xml.org/sax/features/validation";
/** SAX feature identifier for resolving DTD URIs. */
public static final String RESOLVE_DTD_URIS_FEATURE =
"http://xml.org/sax/features/resolve-dtd-uris";
/** SAX feature identifier for external general entities. */
public static final String EXTERNAL_GENERAL_ENTITIES_FEATURE =
"http://xml.org/sax/features/external-general-entities";
/** SAX feature identifier for external parameter entities. */
public static final String EXTERNAL_PARAMETER_ENTITIES_FEATURE =
"http://xml.org/sax/features/external-parameter-entities";
/** SAX property identifier for the lexical handler. */
public static final String LEXICAL_HANDLER_PROPERTY =
"http://xml.org/sax/properties/lexical-handler";
/** Shared factory handed out by {@link #getDocumentBuilderFactory()}; created on first use. */
private static DocumentBuilderFactory docBuilderFactory = null;
/**
 * Shared namespace-aware factory handed out by {@link #getDocumentBuilderFactoryNS()}; created
 * on first use.
 */
private static DocumentBuilderFactory docBuilderFactoryNS = null;
/**
 * A {@link SpaceTest} that delegates to {@link #isWhiteSpace(int)}, so that the {@link Strings}
 * helpers used by this class (trim, split) apply the XML definition of white space.
 */
private static final SpaceTest spaceTest = new SpaceTest() {
@Override
public boolean isSpace(int ch) {
return isWhiteSpace(ch);
}
};
private static final CharMapperEntry[] predefinedEntityMappings = new CharMapperEntry[] {
new CharMapperEntry('&', "&"),
new CharMapperEntry('\'', "'"),
new CharMapperEntry('>', ">"),
new CharMapperEntry('<', "<"),
new CharMapperEntry('"', """)
};
private static final CharMapper defaultCharMapper = new CharMapper() {
@Override
public String map(int codePoint) {
if (codePoint == '<')
return "<";
if (codePoint == '>')
return ">";
if (codePoint == '&')
return "&";
if (codePoint == '"')
return """;
if (codePoint < ' ' && !isWhiteSpace(codePoint) || codePoint >= 0x7F) {
StringBuilder sb = new StringBuilder(10);
sb.append("");
sb.append(codePoint);
sb.append(';');
return sb.toString();
}
return null;
}
};
private static final CharMapper allCharMapper = new CharMapper() {
@Override
public String map(int codePoint) {
if (codePoint == '<')
return "<";
if (codePoint == '>')
return ">";
if (codePoint == '&')
return "&";
if (codePoint == '"')
return """;
if (codePoint == '\'')
return "'";
if (codePoint < ' ' && !isWhiteSpace(codePoint) || codePoint >= 0x7F) {
StringBuilder sb = new StringBuilder(10);
sb.append("");
sb.append(codePoint);
sb.append(';');
return sb.toString();
}
return null;
}
};
/**
 * The mapper used by {@link #escapeData(String)} and related methods.  It maps {@code <},
 * {@code >} and {@code &} to their predefined entity references, and maps control characters
 * (other than XML white space) and everything from 0x7F upwards to a decimal character
 * reference of the form {@code &#nnn;}.  Neither quote character is escaped by this mapper.
 */
private static final CharMapper dataCharMapper = new CharMapper() {
    /**
     * @param   codePoint   the code point to be mapped
     * @return  the replacement text, or {@code null} when this mapper has no replacement for
     *          the code point
     */
    @Override
    public String map(int codePoint) {
        if (codePoint == '<')
            return "&lt;";
        if (codePoint == '>')
            return "&gt;";
        if (codePoint == '&')
            return "&amp;";
        // tab, newline and carriage return pass through unchanged; other control characters
        // and anything outside printable ASCII become decimal character references
        if ((codePoint < ' ' && !isWhiteSpace(codePoint)) || codePoint >= 0x7F)
            return "&#" + codePoint + ';';
        return null;
    }
};
/**
 * The unmapper used by {@link #unescape(String)}.  It recognises decimal character references
 * ({@code &#nnn;}), hexadecimal character references ({@code &#xhhh;}) and the entity
 * references listed in {@code predefinedEntityMappings}.
 */
private static final CharUnmapper unmapper = new CharUnmapper() {
    /**
     * Every reference starts with an ampersand.
     */
    @Override
    public boolean isEscape(CharSequence s, int offset) {
        return s.charAt(offset) == '&';
    }
    /**
     * Decode the reference that starts at {@code offset} and append the resulting character
     * to {@code sb}.
     *
     * @param   sb      the output buffer
     * @param   s       the escaped input
     * @param   offset  the index of the {@code &} that starts the reference
     * @return  the length of the reference in the input, including the {@code &} and the
     *          closing {@code ;}
     * @throws  IllegalArgumentException if the reference is unclosed, contains an illegal
     *          digit, denotes an illegal code point, or is not a known entity
     */
    @Override
    public int unmap(StringBuilder sb, CharSequence s, int offset) {
        int start = offset + 1;
        if (start < s.length() && s.charAt(start) == '#') {
            // numeric character reference: skip the '#', then scan to the closing ';'
            int i = ++start;
            do {
                if (i >= s.length())
                    throw new IllegalArgumentException("Unclosed character reference");
            } while (s.charAt(i++) != ';');
            // i is now the index just past the ';', so the digits occupy [start, i - 1)
            int codePoint;
            try {
                if (s.charAt(start) == 'x')
                    codePoint = Strings.convertHexToInt(s, start + 1, i - 1);
                else
                    codePoint = Strings.convertToInt(s, start, i - 1);
            }
            catch (NumberFormatException nfe) {
                // keep the original exception as the cause so the failing input can be traced
                throw new IllegalArgumentException("Illegal digit in character reference",
                        nfe);
            }
            if (Character.isSupplementaryCodePoint(codePoint)) {
                sb.append(Character.highSurrogate(codePoint));
                sb.append(Character.lowSurrogate(codePoint));
            }
            else if (Character.isBmpCodePoint(codePoint) &&
                    !Character.isSurrogate((char)codePoint))
                sb.append((char)codePoint);
            else
                throw new IllegalArgumentException("Illegal character reference");
            return i - offset;
        }
        else {
            // named reference: must be one of the predefined entities
            for (CharMapperEntry entry : predefinedEntityMappings) {
                String mapping = entry.getString();
                if (entriesEqual(s, offset, mapping)) {
                    sb.append((char)entry.getCodePoint()); // guaranteed to be in BMP
                    return mapping.length();
                }
            }
            throw new IllegalArgumentException("Illegal entity reference");
        }
    }
};
/**
 * Test whether the characters of {@code source}, beginning at index {@code start}, match the
 * whole of {@code target}.
 *
 * @param   source  the sequence to be examined
 * @param   start   the index in {@code source} at which the comparison begins
 * @param   target  the sequence to look for
 * @return  {@code true} if {@code target} occurs in {@code source} at {@code start};
 *          {@code false} if it does not, or if too few characters remain in {@code source}
 */
private static boolean entriesEqual(CharSequence source, int start, CharSequence target) {
    int targetLength = target.length();
    if (start + targetLength > source.length())
        return false;
    int targetIndex = 0;
    int sourceIndex = start;
    while (targetIndex < targetLength) {
        if (target.charAt(targetIndex++) != source.charAt(sourceIndex++))
            return false;
    }
    return true;
}
/**
 * Private constructor - the class contains only static members and is not to be instantiated.
 */
private XML() {
}
/**
 * Read an XML document from an {@link InputStream} and build a DOM from it.  The
 * {@link DocumentBuilder} is obtained from {@link #getDocumentBuilder()}, so the caller does not
 * need to deal with {@link DocumentBuilderFactory} directly.
 *
 * @param   is      the {@link InputStream} to read
 * @return  the parsed {@link Document}
 * @throws  ParserConfigurationException if a {@link DocumentBuilder} can not be created
 * @throws  SAXException if any parse errors occur
 * @throws  IOException if any I/O errors occur
 */
public static Document parse(InputStream is)
        throws ParserConfigurationException, SAXException, IOException {
    DocumentBuilder builder = getDocumentBuilder();
    return builder.parse(is);
}
/**
 * Read an XML document from an {@link InputStream} and build a DOM from it, supplying a
 * systemId for the document.  The {@link DocumentBuilder} is obtained from
 * {@link #getDocumentBuilder()}.
 *
 * @param   is          the {@link InputStream} to read
 * @param   systemId    the systemId
 * @return  the parsed {@link Document}
 * @throws  ParserConfigurationException if a {@link DocumentBuilder} can not be created
 * @throws  SAXException if any parse errors occur
 * @throws  IOException if any I/O errors occur
 */
public static Document parse(InputStream is, String systemId)
        throws ParserConfigurationException, SAXException, IOException {
    DocumentBuilder builder = getDocumentBuilder();
    return builder.parse(is, systemId);
}
/**
 * Read an XML document from the location named by a URI and build a DOM from it.  The
 * {@link DocumentBuilder} is obtained from {@link #getDocumentBuilder()}.
 *
 * @param   uri     the URI of the document
 * @return  the parsed {@link Document}
 * @throws  ParserConfigurationException if a {@link DocumentBuilder} can not be created
 * @throws  SAXException if any parse errors occur
 * @throws  IOException if any I/O errors occur
 */
public static Document parse(String uri)
        throws ParserConfigurationException, SAXException, IOException {
    DocumentBuilder builder = getDocumentBuilder();
    return builder.parse(uri);
}
/**
 * Read an XML document from a {@link File} and build a DOM from it.  The
 * {@link DocumentBuilder} is obtained from {@link #getDocumentBuilder()}.
 *
 * @param   f       the {@link File} to read
 * @return  the parsed {@link Document}
 * @throws  ParserConfigurationException if a {@link DocumentBuilder} can not be created
 * @throws  SAXException if any parse errors occur
 * @throws  IOException if any I/O errors occur
 */
public static Document parse(File f)
        throws ParserConfigurationException, SAXException, IOException {
    DocumentBuilder builder = getDocumentBuilder();
    return builder.parse(f);
}
/**
 * Read an XML document from an {@link InputSource} and build a DOM from it.  The
 * {@link DocumentBuilder} is obtained from {@link #getDocumentBuilder()}.
 *
 * @param   is      the {@link InputSource} to read
 * @return  the parsed {@link Document}
 * @throws  ParserConfigurationException if a {@link DocumentBuilder} can not be created
 * @throws  SAXException if any parse errors occur
 * @throws  IOException if any I/O errors occur
 */
public static Document parse(InputSource is)
        throws ParserConfigurationException, SAXException, IOException {
    DocumentBuilder builder = getDocumentBuilder();
    return builder.parse(is);
}
/**
 * Create a {@link DocumentBuilder} using the shared factory returned by
 * {@link #getDocumentBuilderFactory()}.
 *
 * @return  a new {@link DocumentBuilder}
 * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not create
 *          the {@link DocumentBuilder}
 */
public static DocumentBuilder getDocumentBuilder() throws ParserConfigurationException {
    DocumentBuilderFactory factory = getDocumentBuilderFactory();
    return factory.newDocumentBuilder();
}
/**
 * Create a namespace-aware {@link DocumentBuilder} using the shared factory returned by
 * {@link #getDocumentBuilderFactoryNS()}.
 *
 * @return  a new namespace-aware {@link DocumentBuilder}
 * @throws  ParserConfigurationException if the {@link DocumentBuilderFactory} can not create
 *          the {@link DocumentBuilder}
 */
public static DocumentBuilder getDocumentBuilderNS() throws ParserConfigurationException {
    DocumentBuilderFactory factory = getDocumentBuilderFactoryNS();
    return factory.newDocumentBuilder();
}
/**
 * Get the shared {@link DocumentBuilderFactory}, creating it with
 * {@link DocumentBuilderFactory#newInstance()} on the first call.  The same instance is
 * returned to every caller, so any configuration applied to it affects all later users.
 *
 * @return  the shared {@link DocumentBuilderFactory}
 */
public static synchronized DocumentBuilderFactory getDocumentBuilderFactory() {
    DocumentBuilderFactory factory = docBuilderFactory;
    if (factory == null) {
        factory = DocumentBuilderFactory.newInstance();
        docBuilderFactory = factory;
    }
    return factory;
}
/**
 * Get the shared namespace-aware {@link DocumentBuilderFactory}, creating and configuring it
 * on the first call.  The same instance is returned to every caller, so any configuration
 * applied to it affects all later users.
 *
 * @return  the shared namespace-aware {@link DocumentBuilderFactory}
 */
public static synchronized DocumentBuilderFactory getDocumentBuilderFactoryNS() {
    DocumentBuilderFactory factory = docBuilderFactoryNS;
    if (factory == null) {
        factory = DocumentBuilderFactory.newInstance();
        docBuilderFactoryNS = factory;
        factory.setNamespaceAware(true);
    }
    return factory;
}
/**
 * Create a new, empty {@link Document} using a builder from {@link #getDocumentBuilder()}.
 * A failure to create the builder is reported as an unchecked exception.
 *
 * @return  the new {@link Document}
 * @throws  RuntimeException on parser configuration errors (the
 *          {@link ParserConfigurationException} is attached as the cause)
 */
public static Document newDocument() {
    DocumentBuilder builder;
    try {
        builder = getDocumentBuilder();
    }
    catch (ParserConfigurationException pce) {
        throw new RuntimeException("Parser configuration error", pce);
    }
    return builder.newDocument();
}
/**
 * Escape a string for use in XML by passing it to {@code Strings.escapeUTF16} with the same
 * character mapper that {@link #escape(String)} uses.
 *
 * NOTE(review): how this differs from {@link #escape(String)} is decided by
 * {@code Strings.escapeUTF16}, which is not visible here - presumably it combines UTF-16
 * surrogate pairs into single code points before mapping; confirm against that method.
 *
 * @param   s       the string to be escaped
 * @return  the escaped string
 */
public static String escapeUTF16(String s) {
return Strings.escapeUTF16(s, defaultCharMapper);
}
/**
 * Escape a string for use in XML.  Specifically, this method converts:
 * <dl>
 *   <dt>{@code <} (less than)</dt>
 *   <dd>{@code &lt;}</dd>
 *   <dt>{@code >} (greater than)</dt>
 *   <dd>{@code &gt;}</dd>
 *   <dt>{@code &} (ampersand)</dt>
 *   <dd>{@code &amp;}</dd>
 *   <dt>{@code "} (double quote)</dt>
 *   <dd>{@code &quot;}</dd>
 *   <dt>Characters less than 0x20 (except for 0x09, 0x0A, 0x0D) or greater than 0x7E</dt>
 *   <dd>{@code &#nnn;} (where nnn is the code position in decimal)</dd>
 * </dl>
 *
 * @param   s       the string to be escaped
 * @return  the escaped string
 * @throws  NullPointerException if the input string is {@code null}
 */
public static String escape(String s) {
return Strings.escape(s, defaultCharMapper);
}
/**
 * Escape a {@link CharSequence} for use in XML, applying the same character mapper as
 * {@link #escape(String)}.
 *
 * @param   cs      the {@link CharSequence} to be escaped
 * @return  the escaped {@link CharSequence}
 */
public static CharSequence escape(CharSequence cs) {
return Strings.escape(cs, defaultCharMapper);
}
/**
 * Escape a string for use in XML using {@code allCharMapper}, which handles every character
 * handled by {@link #escape(String)} and, in addition, the apostrophe.
 *
 * @param   s       the string to be escaped
 * @return  the escaped string
 */
public static String escapeAll(String s) {
return Strings.escape(s, allCharMapper);
}
/**
 * Escape a {@link CharSequence} for use in XML using {@code allCharMapper}, which handles
 * every character handled by {@link #escape(CharSequence)} and, in addition, the apostrophe.
 *
 * @param   cs      the {@link CharSequence} to be escaped
 * @return  the escaped {@link CharSequence}
 */
public static CharSequence escapeAll(CharSequence cs) {
return Strings.escape(cs, allCharMapper);
}
/**
 * Escape a string for use in XML using {@code dataCharMapper}, which handles the same
 * characters as {@link #escape(String)} except that the double quote is left unchanged.
 *
 * @param   s       the string to be escaped
 * @return  the escaped string
 */
public static String escapeData(String s) {
return Strings.escape(s, dataCharMapper);
}
/**
 * Escape a {@link CharSequence} for use in XML using {@code dataCharMapper}, which handles
 * the same characters as {@link #escape(CharSequence)} except that the double quote is left
 * unchanged.
 *
 * @param   cs      the {@link CharSequence} to be escaped
 * @return  the escaped {@link CharSequence}
 */
public static CharSequence escapeData(CharSequence cs) {
return Strings.escape(cs, dataCharMapper);
}
/**
 * Append a {@link CharSequence} to an {@link Appendable}, escaping it with the same character
 * mapper as {@link #escape(CharSequence)}.
 *
 * @param   a       the {@link Appendable} to receive the output
 * @param   cs      the {@link CharSequence} to be escaped
 * @throws  IOException if thrown by the {@link Appendable}
 */
public static void appendEscaped(Appendable a, CharSequence cs) throws IOException {
Strings.appendEscaped(a, cs, defaultCharMapper);
}
/**
 * Append a {@link CharSequence} to an {@link Appendable}, escaping it with the same character
 * mapper as {@link #escapeAll(CharSequence)}.
 *
 * @param   a       the {@link Appendable} to receive the output
 * @param   cs      the {@link CharSequence} to be escaped
 * @throws  IOException if thrown by the {@link Appendable}
 */
public static void appendEscapedAll(Appendable a, CharSequence cs) throws IOException {
Strings.appendEscaped(a, cs, allCharMapper);
}
/**
 * Append a {@link CharSequence} to an {@link Appendable}, escaping it with the same character
 * mapper as {@link #escapeData(CharSequence)}.
 *
 * @param   a       the {@link Appendable} to receive the output
 * @param   cs      the {@link CharSequence} to be escaped
 * @throws  IOException if thrown by the {@link Appendable}
 */
public static void appendEscapedData(Appendable a, CharSequence cs) throws IOException {
Strings.appendEscaped(a, cs, dataCharMapper);
}
/**
 * Unescape a string escaped with XML character or entity references.  Decoding is performed
 * by this class's unmapper, which accepts decimal and hexadecimal character references and
 * the predefined entity references.
 *
 * @param   s       the string to be unescaped
 * @return  the unescaped string
 * @throws  IllegalArgumentException if a reference is unclosed, malformed or not recognised
 *          (raised by the unmapper; assumes {@code Strings.unescape} lets it propagate)
 */
public static String unescape(String s) {
return Strings.unescape(s, unmapper);
}
/**
 * Trim leading and trailing white space characters from a {@link String}, using the
 * definition of white space in the XML specification (space, tab, line feed and carriage
 * return, as tested by {@link #isWhiteSpace(int)}).
 *
 * @param   s       the {@link String} to be trimmed
 * @return  the trimmed {@link String}
 * @throws  NullPointerException if the input string is {@code null}
 */
public static String trim(String s) {
return Strings.trim(s, spaceTest);
}
/**
 * Trim leading and trailing white space characters from a {@link CharSequence}, using the
 * definition of white space in the XML specification (space, tab, line feed and carriage
 * return, as tested by {@link #isWhiteSpace(int)}).
 *
 * @param   cs      the {@link CharSequence} to be trimmed
 * @return  the trimmed {@link CharSequence}
 * @throws  NullPointerException if the input {@link CharSequence} is {@code null}
 */
public static CharSequence trim(CharSequence cs) {
return Strings.trim(cs, spaceTest);
}
/**
 * Test whether every character of a {@link CharSequence} (a {@link String},
 * {@link StringBuilder} etc.) is a white space character, using the definition of white space
 * in the XML specification.  An empty sequence yields {@code true}.
 *
 * @param   cs      the {@link CharSequence}
 * @return  {@code true} if the contents are all white space characters
 * @throws  NullPointerException if the input {@link CharSequence} is {@code null}
 */
public static boolean isAllWhiteSpace(CharSequence cs) {
    int length = cs.length();
    int index = 0;
    while (index < length) {
        if (!isWhiteSpace(cs.charAt(index)))
            return false;
        index++;
    }
    return true;
}
/**
 * Test whether a Unicode code point is a white space character, using the definition of white
 * space in the XML specification: space (0x20), tab (0x09), line feed (0x0A) and carriage
 * return (0x0D).
 *
 * @param   cp      the code point to be tested
 * @return  {@code true} if the code point is a white space character
 */
public static boolean isWhiteSpace(int cp) {
    switch (cp) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
/**
 * Split a string into white space delimited tokens, using the definition of white space in
 * the XML specification.  The whole string is examined; this is equivalent to
 * {@code split(s, 0, s.length())}.
 *
 * @param   s       the string to be split
 * @return  an array of tokens
 * @throws  NullPointerException if the input string is {@code null}
 */
public static String[] split(String s) {
return split(s, 0, s.length());
}
/**
 * Split a portion of a string into white space delimited tokens, using the definition of
 * white space in the XML specification (space, tab, line feed and carriage return, as tested
 * by {@link #isWhiteSpace(int)}).
 *
 * @param   s       the string to be split
 * @param   start   the start index of the portion to be examined
 * @param   end     the end index (exclusive) of the portion to be examined
 * @return  an array of tokens
 * @throws  NullPointerException if the input string is {@code null}
 * @throws  StringIndexOutOfBoundsException if {@code start} or {@code end} is invalid
 */
public static String[] split(String s, int start, int end) {
return Strings.split(s, start, end, spaceTest);
}
/**
 * Create an {@link ElementIterator} for the given node.
 *
 * NOTE(review): presumably the iterator walks the child elements of {@code parent}; the
 * {@link ElementIterator} class is not visible here, so confirm against its documentation.
 *
 * @param   parent  the node passed to the {@link ElementIterator} constructor
 * @return  a new {@link ElementIterator}
 */
public static ElementIterator elementIterator(Node parent) {
return new ElementIterator(parent);
}
/**
 * Test whether the tag name of an {@link Element} is equal to a specified name.  The
 * comparison is null-safe with respect to the names, but the element itself must not be
 * {@code null}.
 *
 * @param   elem    the {@link Element}
 * @param   tagName the tag name
 * @return  {@code true} if the names match
 */
public static boolean match(Element elem, String tagName) {
    String actualName = elem.getTagName();
    return Objects.equals(actualName, tagName);
}
/**
 * Test whether the local name and namespace URI of an {@link Element} are equal to the
 * specified values.  Both comparisons are null-safe with respect to the names, but the element
 * itself must not be {@code null}.
 *
 * @param   elem            the {@link Element}
 * @param   localName       the local portion of the tag name
 * @param   namespaceURI    the namespace URI
 * @return  {@code true} if both the local name and the namespace URI match
 */
public static boolean matchNS(Element elem, String localName, String namespaceURI) {
    if (!Objects.equals(elem.getLocalName(), localName))
        return false;
    return Objects.equals(elem.getNamespaceURI(), namespaceURI);
}
/**
 * Get the root element of a document, checking that its local name and namespace URI are
 * as expected.
 *
 * @param   document    the document
 * @param   name        the expected local name of the root element
 * @param   nsuri       the expected namespace URI
 * @return  the root element
 * @throws  XMLException if the document has no root element, or the root element does not
 *          have the expected name and namespace
 */
public static Element getDocumentElement(Document document, String name, String nsuri) {
    Element documentElement = document.getDocumentElement();
    if (documentElement == null)
        throw new XMLException("Missing document element");
    if (matchNS(documentElement, name, nsuri))
        return documentElement;
    throw new XMLException("Incorrect document element: <" + documentElement.getTagName() +
            '>');
}
/**
 * Get the root element of a document, checking that its tag name is as expected.
 *
 * @param   document    the document
 * @param   name        the expected tag name of the root element
 * @return  the root element
 * @throws  XMLException if the document has no root element, or the root element does not
 *          have the expected name
 */
public static Element getDocumentElement(Document document, String name) {
    Element documentElement = document.getDocumentElement();
    if (documentElement == null)
        throw new XMLException("Missing document element");
    if (match(documentElement, name))
        return documentElement;
    throw new XMLException("Incorrect document element: <" + documentElement.getTagName() +
            '>');
}
/**
 * Build (but do not throw) an "unrecognized element" exception.  The message names the
 * element and, when its parent is also an element, the parent as well.
 *
 * @param   elem    the element which was not recognized
 * @return  the exception
 */
public static XMLException unrecognizedElement(Element elem) {
    String message = "Unrecognized element <" + elem.getTagName() + '>';
    Node parent = elem.getParentNode();
    // instanceof is false for null, so no separate null check is needed
    if (parent instanceof Element)
        message += " in <" + ((Element)parent).getTagName() + '>';
    return new XMLException(message);
}
/**
 * Check that a node has no significant content, by applying {@link #checkNode(Node)} to each
 * of its child nodes in turn.  Every child must be either a comment node or a text node
 * containing only white space.
 *
 * @param   node    the node
 * @throws  XMLException if the node has child nodes that are not comment nodes or empty text
 *          nodes
 */
public static void checkChildNodes(Node node) {
    NodeList childNodes = node.getChildNodes();
    int count = childNodes.getLength();
    int index = 0;
    while (index < count) {
        checkNode(childNodes.item(index));
        ++index;
    }
}
/**
 * Check that a node is a comment node or a text node containing only white space.  This is
 * intended for use while iterating over a list of nodes looking for elements, to confirm that
 * the non-element nodes carry no content.  Text nodes are checked by
 * {@link #checkText(Text)}.
 *
 * @param   node    the node
 * @throws  XMLException if the node is not a comment or an empty text node
 */
public static void checkNode(Node node) {
    if (node instanceof Comment)
        return;
    if (node instanceof Text) {
        checkText((Text)node);
        return;
    }
    StringBuilder message = new StringBuilder("Incorrect node type");
    Node parent = node.getParentNode();
    // instanceof is false for null, so no separate null check is needed
    if (parent instanceof Element)
        message.append(" in <").append(((Element)parent).getTagName()).append('>');
    throw new XMLException(message.toString());
}
/**
 * Check that a Text node contains only white space, using the definition of white space in
 * the XML specification.
 *
 * @param   text    the Text node
 * @throws  XMLException if the node contains any character that is not white space
 */
public static void checkText(Text text) {
    String content = text.getData();
    int length = content.length();
    int index = 0;
    // skip over leading white space; reaching the end means the node is empty
    while (index < length && isWhiteSpace(content.charAt(index)))
        index++;
    if (index == length)
        return;
    StringBuilder message = new StringBuilder("No text allowed");
    Node parent = text.getParentNode();
    // instanceof is false for null, so no separate null check is needed
    if (parent instanceof Element)
        message.append(" in <").append(((Element)parent).getTagName()).append('>');
    throw new XMLException(message.toString());
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy