// org.daisy.dotify.common.xml.XMLTools (Maven / Gradle / Ivy)
package org.daisy.dotify.common.xml;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.text.MessageFormat;
import java.util.Arrays;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXSource;
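/**
 * Provides static XML utilities: detection of the character encoding of XML data,
 * XSLT transformation and well-formedness checks.
 *
 * @author Joel Håkansson
 */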
public class XMLTools {
/** Logger for this class. */
private static final Logger logger = Logger.getLogger(XMLTools.class.getCanonicalName());
/**
 * Matches an XML declaration at the very start of the input, optionally preceded by a
 * BOM (U+FEFF) and whitespace. The value of the encoding pseudo-attribute is captured
 * in the named group {@code ENCODING}.
 */
static final Pattern XML_DECL = Pattern.compile(
    "\\A\uFEFF?\\s*<\\?xml[^>]*?encoding\\s*=\\s*[\"'](?<ENCODING>[^'\"]*)[\"'].*\\?>"
);
// Charsets that are not part of StandardCharsets; empty if the JVM doesn't provide them
private static final Optional<Charset> UTF_32_BE = forName("UTF-32BE");
private static final Optional<Charset> UTF_32_LE = forName("UTF-32LE");
private static final Optional<Charset> IBM_500 = forName("IBM500");
// With bom: the four byte BOM signatures of UCS-4 in byte orders 1234, 4321, 2143 and 3412
// (the "USC" spelling is kept because other members refer to these names)
private static final byte[] USC_4_BE = new byte[]{0x00, 0x00, (byte) 0xFE, (byte) 0xFF};
private static final byte[] USC_4_LE = new byte[]{(byte) 0xFF, (byte) 0xFE, 0x00, 0x00};
private static final byte[] USC_4_2143 = new byte[]{0x00, 0x00, (byte) 0xFF, (byte) 0xFE};
private static final byte[] USC_4_3412 = new byte[]{(byte) 0xFE, (byte) 0xFF, 0x00, 0x00};
// NO bom: the first four bytes of an XML declaration ("<?" or "<?xm") in each encoding
private static final byte[] UTF_16_BE = new byte[]{0x00, (byte) 0x3C, 0x00, (byte) 0x3F};
private static final byte[] UTF_16_LE = new byte[]{(byte) 0x3C, 0x00, (byte) 0x3F, 0x00};
private static final byte[] UTF_8 = new byte[]{(byte) 0x3C, (byte) 0x3F, 0x78, 0x6D};
private static final byte[] EBCDIC = new byte[]{(byte) 0x4C, (byte) 0x6F, (byte) 0xA7, (byte) 0x94};
/**
 * Private constructor; every member of this class is static.
 */
private XMLTools() {
}
static Optional getDeclaredEncoding(
byte[] data,
Charset preliminaryEncoding
) throws XmlEncodingDetectionException {
try (Reader r = new InputStreamReader(new ByteArrayInputStream(data), preliminaryEncoding)) {
StringBuilder sb = new StringBuilder();
int c = r.read();
if (c == '\uFEFF') {
c = -2;
}
// Append BOM or any whitespace characters
while (c == -2 || Character.isWhitespace((int) c)) {
sb.append((char) c);
c = r.read();
}
// Read the next 5 characters to determine if an XML declaration is present
for (int i = 0; i < 5 && c != -1; i++) {
sb.append((char) c);
c = r.read();
}
boolean closing = false;
if (sb.length() >= 5 && ".
if (c == '?') {
closing = true;
} else if (c == '>' && closing) {
sb.append((char) c);
break;
} else {
closing = false;
}
}
return getDeclaredEncoding(sb.toString());
} else {
return Optional.empty();
}
} catch (IOException e) {
throw new XmlEncodingDetectionException("Failed to read.", e);
}
}
/**
 * Gets the declared encoding from the given string. If the string
 * doesn't start with an XML declaration that specifies an encoding,
 * an empty optional is returned.
 *
 * @param text the xml
 * @return the declared encoding, or an empty optional if none is declared
 */
public static Optional<String> getDeclaredEncoding(String text) {
    Matcher m = XML_DECL.matcher(text);
    if (m.find()) {
        return Optional.ofNullable(m.group("ENCODING"));
    }
    return Optional.empty();
}
/**
 * Detects XML encoding based on this algorithm:
 * https://www.w3.org/TR/xml/#sec-guessing.
 * In accordance with this specification, it is assumed that the XML declaration
 * is not preceded by whitespace (if present).
 * Note that some encodings mentioned in the specification are not supported
 * because they are not supported by the JVM.
 *
 * <p>If the first bytes identify the encoding exactly, that encoding is returned and a
 * declared encoding is only used for verification. Otherwise the declared encoding is
 * returned.</p>
 *
 * @param data the input bytes
 * @return returns the name of the detected charset
 * @throws IllegalArgumentException if the length of the data is less than 4 bytes
 * @throws XmlEncodingMismatchException if the declared encoding doesn't match the detected encoding and
 *         the detected encoding is an exact match
 * @throws XmlEncodingDetectionException if detection fails
 */
public static String detectXmlEncoding(byte[] data) throws XmlEncodingDetectionException {
    if (data.length < 4) {
        throw new IllegalArgumentException("At least 4 bytes are required, got " + data.length + ".");
    }
    PreliminaryCharset preliminary = guessCharset(data);
    if (preliminary == null) {
        throw new XmlEncodingDetectionException("Could not detect encoding.");
    }
    String detected = preliminary.getCharset().name();
    Optional<String> declared = getDeclaredEncoding(data, preliminary.getCharset());
    if (!declared.isPresent()) {
        if (preliminary.isExactMatch()) {
            return detected;
        }
        throw new XmlEncodingDetectionException("Could not detect encoding.");
    }
    String declaredName = declared.get();
    if (!preliminary.isExactMatch()) {
        // The signature only identified a family of encodings; trust the declaration
        return declaredName;
    }
    // Case-insensitive prefix test that doesn't depend on the default locale.
    // A prefix is enough, e.g. a declared "UTF-16" matches a detected "UTF-16LE".
    if (!detected.regionMatches(true, 0, declaredName, 0, declaredName.length())) {
        String msg = MessageFormat.format(
            "The specified encoding ({0}) doesn''t match detected encoding ({1}).",
            declaredName,
            detected
        );
        throw new XmlEncodingMismatchException(msg, detected, declaredName);
    }
    return detected;
}
/**
 * Tries to detect a Unicode encoding from the supplied data based
 * on the presence of a BOM. If the data doesn't start with a BOM,
 * an empty optional is returned.
 *
 * @param data the data to detect encoding on; at most the first four bytes are examined
 * @return returns the encoding detected from the BOM
 * @throws UnsupportedCharsetException if the charset could be detected but not created
 */
public static Optional<Charset> detectBomEncoding(byte[] data) {
    byte[] signature = data.length > 4 ? Arrays.copyOf(data, 4) : data;
    return Optional.ofNullable(guessCharsetFromBom(signature));
}
/**
 * Finds group of encodings that can be used to decode the declaration (if any).
 *
 * @param data the data
 * @return returns a preliminary charset, based on the first bytes of the file, or null
 *         if the signature was recognized but the JVM has no charset for it
 * @throws IllegalArgumentException if the length of the data is less than 4 bytes
 */
private static PreliminaryCharset guessCharset(byte[] data) {
    // Based on https://www.w3.org/TR/xml/#sec-guessing
    if (data.length < 4) {
        throw new IllegalArgumentException("At least 4 bytes are required, got " + data.length + ".");
    }
    byte[] signature = Arrays.copyOf(data, 4);
    // With BOM
    Charset charsetFromBom;
    try {
        charsetFromBom = guessCharsetFromBom(signature);
    } catch (UnsupportedCharsetException e) {
        // A BOM was recognized, but the charset isn't available
        return null;
    }
    if (charsetFromBom != null) {
        return new PreliminaryCharset.Builder(charsetFromBom).bom(true).exactMatch(true).build();
    } else if (detectUcs4WithoutBom(signature) > -1) { // No BOM
        // The byte order follows from the position of '<' (0x3C) in the signature:
        // 00 00 00 3C is big endian (1234) and 3C 00 00 00 is little endian (4321)
        if (signature[3] == 0x3C) {
            //BE
            return UTF_32_BE.map(XMLTools::inexactMatch).orElse(null);
        } else if (signature[0] == 0x3C) {
            //LE
            return UTF_32_LE.map(XMLTools::inexactMatch).orElse(null);
        } else {
            // unusual byte orders (2143 and 3412) are not supported
            return null;
        }
    } else if (Arrays.equals(signature, UTF_16_BE)) {
        // UTF-16, big endian
        return inexactMatch(StandardCharsets.UTF_16BE);
    } else if (Arrays.equals(signature, UTF_16_LE)) {
        // UTF-16, little endian
        return inexactMatch(StandardCharsets.UTF_16LE);
    } else if (Arrays.equals(signature, UTF_8)) {
        // UTF-8 no BOM
        return inexactMatch(StandardCharsets.UTF_8);
    } else if (Arrays.equals(signature, EBCDIC)) {
        return IBM_500.map(XMLTools::inexactMatch).orElse(null);
    }
    // UTF-8 without encoding declaration or corrupt
    return new PreliminaryCharset.Builder(StandardCharsets.UTF_8).bom(false).exactMatch(true).build();
}

/**
 * Creates a preliminary charset for a signature without a BOM that isn't an exact match.
 *
 * @param charset the charset
 * @return the preliminary charset
 */
private static PreliminaryCharset inexactMatch(Charset charset) {
    return new PreliminaryCharset.Builder(charset).bom(false).exactMatch(false).build();
}
/**
 * Determines the charset indicated by a byte order mark at the start of a signature.
 *
 * @param signature a byte signature, 0-4 bytes long
 * @return the charset indicated by the BOM, or null if the signature doesn't start with
 *         a known BOM
 * @throws UnsupportedCharsetException if the charset could be detected but not created
 */
private static Charset guessCharsetFromBom(byte[] signature) throws UnsupportedCharsetException {
    // No Unicode encoding has a byte order mark < 2 bytes
    if (signature.length < 2) {
        return null;
    }
    // The four byte signatures must be tested before UTF-16, because two of them
    // start with the same bytes as a UTF-16 BOM.
    if (Arrays.equals(signature, USC_4_BE)) {
        return UTF_32_BE.orElseThrow(() -> new UnsupportedCharsetException("UTF-32BE"));
    }
    if (Arrays.equals(signature, USC_4_LE)) {
        return UTF_32_LE.orElseThrow(() -> new UnsupportedCharsetException("UTF-32LE"));
    }
    if (Arrays.equals(signature, USC_4_2143)) {
        // Not supported by the JVM
        throw new UnsupportedCharsetException("USC-4-2143");
    }
    if (Arrays.equals(signature, USC_4_3412)) {
        // Not supported by the JVM
        throw new UnsupportedCharsetException("USC-4-3412");
    }
    final byte first = signature[0];
    final byte second = signature[1];
    if (first == (byte) 0xFE && second == (byte) 0xFF) {
        // UTF-16, big endian
        return StandardCharsets.UTF_16BE;
    }
    if (first == (byte) 0xFF && second == (byte) 0xFE) {
        // UTF-16, little endian
        return StandardCharsets.UTF_16LE;
    }
    // UTF-8 with BOM
    boolean utf8Bom = signature.length > 2
        && first == (byte) 0xEF
        && second == (byte) 0xBB
        && signature[2] == (byte) 0xBF;
    return utf8Bom ? StandardCharsets.UTF_8 : null;
}
/**
 * Detects if the supplied data is XML encoded with UCS4 without BOM, that is
 * if it consists of exactly one 0x3C byte ('&lt;') and three zero bytes.
 * Returns the index of the non-zero byte, or -1 if the data isn't
 * a match for UCS4 encoded XML.
 *
 * @param data the input data, exactly 4 bytes
 * @return returns the index of the non-zero byte, or -1 if the data isn't a match
 * @throws IllegalArgumentException if the length of the data isn't 4 bytes
 */
private static int detectUcs4WithoutBom(byte[] data) {
    if (data.length != 4) {
        throw new IllegalArgumentException("Expected 4 bytes");
    }
    int seen = -1;
    for (int i = 0; i < data.length; i++) {
        if (data[i] == 0x3C) {
            if (seen != -1) {
                // More than one '<'
                return -1;
            }
            seen = i;
        } else if (data[i] != 0x00) {
            return -1;
        }
    }
    // The position of '<', not the loop counter (which always equals the length here)
    return seen;
}
/**
 * Looks up a charset by name without failing if it is unavailable.
 *
 * @param charset the name of the charset
 * @return the charset, or an empty optional if the name is illegal or the charset
 *         isn't supported by the JVM
 */
private static Optional<Charset> forName(String charset) {
    try {
        return Optional.of(Charset.forName(charset));
    } catch (IllegalArgumentException e) {
        // Covers IllegalCharsetNameException and UnsupportedCharsetException, as well
        // as a null name
        return Optional.empty();
    }
}
/**
 * Transforms the xml with the specified parameters, using a transformer factory
 * created by this class.
 *
 * <p>The source, result and xslt objects are first converted to Source and Result
 * objects by {@link TransformerTools}. The work is then delegated to
 * {@link #transform(Source, Result, Source, Map)}.</p>
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param params xslt parameters
 * @throws XMLToolsException if the transformation is unsuccessful
 */
public static void transform(
    Object source,
    Object result,
    Object xslt,
    Map params
) throws XMLToolsException {
    Source xmlSource = TransformerTools.toSource(source);
    Result xmlResult = TransformerTools.toResult(result);
    Source stylesheet = TransformerTools.toSource(xslt);
    transform(xmlSource, xmlResult, stylesheet, params);
}
/**
 * Transforms the xml with the specified parameters, using the supplied transformer
 * factory.
 *
 * <p>The source, result and xslt objects are first converted to Source and Result
 * objects by {@link TransformerTools}. The work is then delegated to
 * {@link #transform(Source, Result, Source, Map, TransformerFactory)}.</p>
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param params xslt parameters
 * @param factory the transformer factory
 * @throws XMLToolsException if the transformation is unsuccessful
 */
public static void transform(
    Object source,
    Object result,
    Object xslt,
    Map params,
    TransformerFactory factory
) throws XMLToolsException {
    Source xmlSource = TransformerTools.toSource(source);
    Result xmlResult = TransformerTools.toResult(result);
    Source stylesheet = TransformerTools.toSource(xslt);
    transform(xmlSource, xmlResult, stylesheet, params, factory);
}
/**
 * Transforms the xml with the specified parameters, using a new transformer factory
 * obtained from {@link TransformerFactory#newInstance()}.
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param params xslt parameters
 * @throws XMLToolsException if the transformation is unsuccessful
 */
public static void transform(
    Source source,
    Result result,
    Source xslt,
    Map params
) throws XMLToolsException {
    TransformerFactory defaultFactory = TransformerFactory.newInstance();
    transform(source, result, xslt, params, defaultFactory);
}
/**
 * Transforms the xml with the specified parameters. The supplied factory and parameters
 * are wrapped in a transformer environment, which is then passed on to the
 * environment-based transform method.
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param params xslt parameters
 * @param factory the transformer factory
 * @throws XMLToolsException if the transformation is unsuccessful
 */
public static void transform(
Source source,
Result result,
Source xslt,
Map params,
TransformerFactory factory
) throws XMLToolsException {
transform(
source,
result,
xslt,
// The environment carries both the factory and the xslt parameters
TransformerEnvironment.builder().transformerFactory(factory).parameters(params).build()
);
}
/**
 * Transforms the xml using the supplied transformer environment. The source, result
 * and xslt objects are converted to Source and Result objects by the environment
 * before the transformation is run.
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param env the transformer environment
 * @param <T> the type of exception thrown
 * @throws T if the transformation is unsuccessful
 */
public static <T extends Throwable> void transform(
    Object source,
    Object result,
    Object xslt,
    TransformerEnvironment<T> env
) throws T {
    transform(
        env.asSource(source),
        env.asResult(result),
        env.asSource(xslt),
        env
    );
}
/**
 * Transforms the xml using the supplied transformer environment. A caching URI resolver
 * is set on the transformer, and a caching entity resolver is set on the XML reader of
 * the source unless it already has an entity resolver.
 *
 * <p>The source must have a system id. Unless the source is a SAXSource, its contents
 * are read from the location given by the system id.</p>
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param env the transformer environment
 * @param <T> the type of exception thrown
 * @throws T if the transformation is unsuccessful
 */
public static <T extends Throwable> void transform(
    Source source,
    Result result,
    Source xslt,
    TransformerEnvironment<T> env
) throws T {
    Transformer transformer = env.newTransformer(xslt);
    for (String name : env.getParameters().keySet()) {
        transformer.setParameter(name, env.getParameters().get(name));
    }
    SAXParserFactory parserFactory = SAXParserFactory.newInstance();
    try {
        transformer.setURIResolver(new CachingURIResolver(parserFactory));
    } catch (XMLToolsException e) {
        // The converted exception must be thrown, not discarded
        throw env.toThrowable(e);
    }
    String systemId = source.getSystemId();
    if (systemId == null || systemId.isEmpty()) {
        throw env.toThrowable(new XMLToolsException(
            "No system id on source, see https://github.com/brailleapps/dotify.common/issues/4."
        ));
    }
    //Create a SAXSource, hook up an entityresolver
    try {
        if (source instanceof SAXSource) {
            transformer.transform(setEntityResolver((SAXSource) source), result);
        } else {
            SAXParser parser = parserFactory.newSAXParser();
            parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", false);
            try (InputStream is = new URLCache().openStream(new URI(systemId).toURL())) {
                InputSource isource = new InputSource(is);
                isource.setSystemId(systemId);
                SAXSource saxSource = new SAXSource(parser.getXMLReader(), isource);
                saxSource.setSystemId(systemId);
                transformer.transform(setEntityResolver(saxSource), result);
            }
        }
    } catch (TransformerException e) {
        throw env.toThrowable(e);
    } catch (Exception e) {
        // Parser set up, URI and I/O failures also mean that no result was produced,
        // so they are reported to the caller instead of only being logged.
        logger.throwing(XMLTools.class.getCanonicalName(), "transform", e);
        throw env.toThrowable(new XMLToolsException("Failed to transform " + systemId, e));
    }
}
/**
 * Sets a caching entity resolver on the XML reader of the supplied source, unless
 * the reader already has an entity resolver.
 *
 * @param source the source to configure
 * @return the supplied source
 */
private static SAXSource setEntityResolver(SAXSource source) {
    XMLReader reader = source.getXMLReader();
    boolean hasResolver = reader.getEntityResolver() != null;
    if (!hasResolver) {
        reader.setEntityResolver(new EntityResolverCache());
    }
    return source;
}
/**
 * Tests if the specified file is well formed XML.
 *
 * @param f the file
 * @return returns true if parsing the file yields root node information, false otherwise
 * @throws XMLToolsException if {@link #parseXML(File)} throws it
 */
public static final boolean isWellformedXML(File f) throws XMLToolsException {
    XMLInfo info = parseXML(f);
    return info != null;
}
/**
 * Tests if the contents at the specified URI is well formed XML.
 *
 * @param uri the URI
 * @return returns true if parsing the contents yields root node information, false otherwise
 * @throws XMLToolsException if {@link #parseXML(URI)} throws it
 */
public static final boolean isWellformedXML(URI uri) throws XMLToolsException {
    XMLInfo info = parseXML(uri);
    return info != null;
}
/**
 * Tests if the specified source is well formed XML.
 *
 * @param source the source
 * @return returns true if parsing the source yields root node information, false otherwise
 * @throws XMLToolsException if {@link #parseXML(InputSource)} throws it
 */
public static final boolean isWellformedXML(InputSource source) throws XMLToolsException {
    XMLInfo info = parseXML(source);
    return info != null;
}
/**
 * Parses the whole file and returns some root node information.
 *
 * @param f the file
 * @return returns the root node, or null if file is not well formed
 * @throws XMLToolsException if {@link #parseXML(File, boolean)} throws it
 */
public static final XMLInfo parseXML(File f) throws XMLToolsException {
    // Never stop at the root element
    final boolean peek = false;
    return parseXML(f, peek);
}
/**
 * Parses all of the contents at the specified URI and returns some root node information.
 *
 * @param uri the URI
 * @return returns the root node, or null if the contents at the specified URI is not well formed
 * @throws XMLToolsException if {@link #parseXML(URI, boolean)} throws it
 */
public static final XMLInfo parseXML(URI uri) throws XMLToolsException {
    // Never stop at the root element
    final boolean peek = false;
    return parseXML(uri, peek);
}
/**
 * Parses the whole source and returns some root node information.
 *
 * @param source the source
 * @return returns the root node, or null if the source is not well formed
 * @throws XMLToolsException if {@link #parseXML(InputSource, boolean)} throws it
 */
public static final XMLInfo parseXML(InputSource source) throws XMLToolsException {
    // Never stop at the root element
    final boolean peek = false;
    return parseXML(source, peek);
}
/**
 * Returns some root node information for the specified file. The file is converted
 * to a URI and handed over to {@link #parseXML(URI, boolean)}.
 *
 * @param f the file
 * @param peek true if the parsing should stop after reading the root element. If true,
 *        the file may or may not be well formed beyond the first start tag.
 * @return returns the root node, or null if file is not well formed
 * @throws XMLToolsException if {@link #parseXML(URI, boolean)} throws it
 */
public static final XMLInfo parseXML(File f, boolean peek) throws XMLToolsException {
    URI location = f.toURI();
    return parseXML(location, peek);
}
/**
 * Returns some root node information for the contents at the specified URI. A stream
 * is opened on the URI, parsed with the URI as system id and closed again before
 * this method returns.
 *
 * @param uri the URI
 * @param peek true if the parsing should stop after reading the root element. If true,
 *        the contents at the specified URI may or may not be well formed beyond the first start tag.
 * @return returns the root node, or null if the contents is not well formed
 * @throws XMLToolsException if the contents cannot be opened or read, or if
 *         {@link #parseXML(InputSource, boolean)} throws it
 */
public static final XMLInfo parseXML(URI uri, boolean peek) throws XMLToolsException {
    try (InputStream stream = uri.toURL().openStream()) {
        InputSource input = new InputSource(stream);
        input.setSystemId(uri.toASCIIString());
        return parseXML(input, peek);
    } catch (IOException ioe) {
        throw new XMLToolsException(ioe);
    }
}
/**
 * Returns some root node information and optionally asserts that the contents at the
 * specified source is well formed. The source is parsed with a namespace aware SAX parser.
 *
 * @param source the source
 * @param peek true if the parsing should stop after reading the root element. If true,
 *        the source may or may not be well formed beyond the first start tag.
 * @return returns the root node, or null if a SAX error occurs while parsing
 * @throws XMLToolsException if a parser cannot be configured or if the source cannot be read
 */
public static final XMLInfo parseXML(InputSource source, boolean peek) throws XMLToolsException {
    SAXParserFactory factory = SAXParserFactory.newInstance();
    factory.setNamespaceAware(true);
    final SAXParser saxParser;
    try {
        saxParser = factory.newSAXParser();
    } catch (ParserConfigurationException | SAXException e) {
        throw new XMLToolsException("Failed to set up XML parser.", e);
    }
    XMLHandler dh = new XMLHandler(peek);
    try {
        XMLReader reader = saxParser.getXMLReader();
        reader.setContentHandler(dh);
        reader.setEntityResolver(dh);
        //since we sometimes have loadDTD turned off,
        //we use lexical handler to get the pub and sys id of prolog
        reader.setProperty("http://xml.org/sax/properties/lexical-handler", dh);
        reader.setErrorHandler(dh);
        reader.setDTDHandler(dh);
        // Parse with the reader that was just configured
        reader.parse(source);
    } catch (StopParsing ignored) {
        //thrown if peek is true; the root information has been collected at this point
    } catch (SAXException e) {
        return null;
    } catch (IOException e) {
        throw new XMLToolsException(e);
    }
    return dh.root;
}
/**
 * SAX handler that collects information about the first start element and about the
 * public and system ids reported before it. Entities are resolved through an
 * {@link EntityResolverCache}.
 */
private static class XMLHandler extends DefaultHandler implements LexicalHandler {
    private final EntityResolver resolver = new EntityResolverCache();
    private final XMLInfo.Builder builder = new XMLInfo.Builder();
    private final boolean peek;
    // Set when the first start element is encountered
    private XMLInfo root;

    /**
     * Creates a new handler.
     *
     * @param peek true if parsing should be aborted once the root element has been read
     */
    XMLHandler(boolean peek) {
        this.peek = peek;
    }

    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        if (root != null) {
            // Only the first start element is of interest
            return;
        }
        root = builder.uri(uri).localName(localName).qName(qName).attributes(attributes).build();
        if (peek) {
            throw new StopParsing();
        }
    }

    @Override
    public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
        final boolean beforeRoot = root == null;
        if (beforeRoot) {
            //set prolog entity in builder
            builder.publicId(publicId);
            builder.systemId(systemId);
        }
        return resolver.resolveEntity(publicId, systemId);
    }

    @Override
    public void startDTD(String name, String publicId, String systemId) throws SAXException {
        builder.publicId(publicId);
        builder.systemId(systemId);
    }

    @Override
    public void endDTD() throws SAXException {
        // nothing to do
    }

    @Override
    public void startEntity(String name) throws SAXException {
        // nothing to do
    }

    @Override
    public void endEntity(String name) throws SAXException {
        // nothing to do
    }

    @Override
    public void startCDATA() throws SAXException {
        // nothing to do
    }

    @Override
    public void endCDATA() throws SAXException {
        // nothing to do
    }

    @Override
    public void comment(char[] ch, int start, int length)
            throws SAXException {
        // nothing to do
    }
}
/**
 * Thrown by the handler to abort parsing once the root element has been read
 * (when peek is true). Caught in parseXML, where it isn't treated as an error.
 */
private static class StopParsing extends SAXException {
private static final long serialVersionUID = -4335028194855324300L;
}
}