// org.daisy.dotify.common.xml.XMLTools (Maven / Gradle / Ivy)
package org.daisy.dotify.common.xml;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.text.MessageFormat;
import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXSource;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Provides some XML tools.
 *
 * @author Joel Håkansson
 */
public class XMLTools {
/**
 * Matches an XML declaration at the very start of the input, optionally preceded by
 * a byte order mark and whitespace. The declared encoding name is captured in the
 * named group {@code ENCODING}.
 */
static final Pattern XML_DECL = Pattern.compile("\\A\uFEFF?\\s*<\\?xml[^>]*?encoding\\s*=\\s*[\"'](?<ENCODING>[^'\"]*)[\"'].*\\?>");

// Charsets looked up by name; each is empty when the lookup fails on the running JVM.
private static final Optional<Charset> UTF_32_BE = forName("UTF-32BE");
private static final Optional<Charset> UTF_32_LE = forName("UTF-32LE");
private static final Optional<Charset> IBM_500 = forName("IBM500");

// With bom: four-byte signatures that start with a byte order mark
private static final byte[] USC_4_BE = new byte[]{0x00, 0x00, (byte)0xFE, (byte)0xFF};
private static final byte[] USC_4_LE = new byte[]{(byte)0xFF, (byte)0xFE, 0x00, 0x00};
private static final byte[] USC_4_2143 = new byte[]{0x00, 0x00, (byte)0xFF, (byte)0xFE};
private static final byte[] USC_4_3412 = new byte[]{(byte)0xFE, (byte)0xFF, 0x00, 0x00};

// NO bom: the first four bytes of input that begins with "<?" (0x3C 0x3F) as
// 16-bit units, or with "<?xm" as 8-bit units
private static final byte[] UTF_16_BE = new byte[] {0x00, (byte)0x3C, 0x00, (byte)0x3F};
private static final byte[] UTF_16_LE = new byte[] {(byte)0x3C, 0x00, (byte)0x3F, 0x00};
private static final byte[] UTF_8 = new byte[] {(byte)0x3C, (byte)0x3F, 0x78, 0x6D};
// NOTE(review): presumably "<?xm" in EBCDIC; this signature is mapped to IBM500 by guessCharset
private static final byte[] EBCDIC = new byte[] {(byte)0x4C, (byte)0x6F, (byte)0xA7, (byte)0x94};

// Only static members are offered; instances are never created.
private XMLTools() {}
static Optional getDeclaredEncoding(byte[] data, Charset preliminaryEncoding) throws XmlEncodingDetectionException {
try (Reader r = new InputStreamReader(new ByteArrayInputStream(data), preliminaryEncoding)) {
StringBuilder sb = new StringBuilder();
int c =;
if (c=='\uFEFF') {
c = -2;
// Append BOM or any whitespace characters
while (c==-2 || Character.isWhitespace((int)c)) {
c =;
// Read the next 5 characters to determine if an XML declaration is present
for (int i=0; i<5 && c!=-1; i++) {
c =;
boolean closing = false;
if (sb.length()>=5 && ".
if (c=='?') {
closing = true;
} else if (c=='>' && closing) {
} else {
closing = false;
return getDeclaredEncoding(sb.toString());
} else {
return Optional.empty();
} catch (IOException e) {
throw new XmlEncodingDetectionException("Failed to read.", e);
* Gets the declared encoding from the given string. If the string
* doesn't start with an XML declaration, an empty optional is returned.
* @param text the xml
* @return returns a string with the declared encoding
public static Optional getDeclaredEncoding(String text) {
Matcher m = XML_DECL.matcher(text);
String enc;
if (m.find() && ("ENCODING"))!=null) {
return Optional.of(enc);
return Optional.empty();
* Detects XML encoding based on this algorithm:
* In accordance with this specification, it is assumed that the XML declaration
* is not preceded by whitespace (if present).
* Note that some encodings mentioned in the specification are not supported
* because they are not supported by the JVM.
* @param data the input bytes
* @return returns the name of the detected charset
* @throws IllegalArgumentException if the length of the data is less than 4 bytes
* @throws XmlEncodingMismatchException if the declared encoding doesn't match the detected encoding and
* the detected encoding is an exact match
* @throws XmlEncodingDetectionException if detection fails
public static String detectXmlEncoding(byte[] data) throws XmlEncodingDetectionException {
if (data.length<4) {
throw new IllegalArgumentException();
PreliminaryCharset preliminary = guessCharset(data);
if (preliminary==null) {
throw new XmlEncodingDetectionException("Could not detect encoding.");
Optional specifiedEncoding = getDeclaredEncoding(data, preliminary.getCharset());
if (specifiedEncoding.isPresent()) {
String returnEncoding = specifiedEncoding.get();
if (preliminary.isExactMatch()) {
if (!preliminary.getCharset().name().toUpperCase().startsWith(returnEncoding.toUpperCase())) {
String msg = MessageFormat.format("The specified encoding ({0}) doesn''t match detected encoding ({1}).", returnEncoding, preliminary.getCharset().name());
throw new XmlEncodingMismatchException(msg, preliminary.getCharset().name(), returnEncoding);
return preliminary.getCharset().name();
} else {
return returnEncoding;
} else if (preliminary.isExactMatch()) {
return preliminary.getCharset().name();
} else {
throw new XmlEncodingDetectionException("Could not detect encoding.");
* Tries to detect a Unicode encoding from the supplied data based
* on the presence of a BOM. If the file doesn't start with a BOM,
* an empty optional is returned.
* @param data the data to detect encoding on
* @return returns the encoding detected from the BOM
* @throws UnsupportedCharsetException if the charset could be detected but not created
public static Optional detectBomEncoding(byte[] data) {
return Optional.ofNullable(guessCharsetFromBom(data.length>4?Arrays.copyOf(data, 4):data));
/**
 * Finds the group of encodings that can be used to decode the declaration (if any).
 *
 * @param data the data
 * @return returns a preliminary charset, based on the first bytes of the file
 * @throws IllegalArgumentException if the length of the data is less than 4 bytes
 */
// Chooses a preliminary charset from the first four bytes: a BOM is tried first,
// then the BOM-less signatures, and finally UTF-8 is assumed.
// NOTE(review): the lines holding only closing braces are missing from this copy,
// so the block does not compile as shown.
private static PreliminaryCharset guessCharset(byte[] data) {
// Based on
// NOTE(review): the reference that followed "Based on" is missing; presumably the
// encoding autodetection appendix of the XML 1.0 specification. Confirm.
if (data.length<4) {
throw new IllegalArgumentException();
byte[] signature = Arrays.copyOf(data, 4);
int i;
// With BOM
Charset charsetFromBom;
try {
charsetFromBom = guessCharsetFromBom(signature);
} catch (UnsupportedCharsetException e) {
// A BOM was recognized but the JVM cannot provide the charset
return null;
if (charsetFromBom!=null) {
return new PreliminaryCharset.Builder(charsetFromBom).bom(true).exactMatch(true).build();
// No BOM
else if ((i = detectUcs4WithoutBom(signature))>-1) {
// i is the value returned by detectUcs4WithoutBom for this signature
if (i==1) {
// NOTE(review): the expression before ">" is missing here; the lambda parameter v
// suggests a map(...) call on one of the Optional charset constants. Restore it
// from the upstream file; which constant was used cannot be told from this copy.
return>new PreliminaryCharset.Builder(v).bom(false).exactMatch(false).build()).orElse(null);
} else if (i==3) {
// NOTE(review): truncated in the same way as the branch above
return>new PreliminaryCharset.Builder(v).bom(false).exactMatch(false).build()).orElse(null);
} else {
// not supported
return null;
} else if (Arrays.equals(signature, UTF_16_BE)) {
// UTF-16, big endian
return new PreliminaryCharset.Builder(StandardCharsets.UTF_16BE).bom(false).exactMatch(false).build();
} else if (Arrays.equals(signature, UTF_16_LE)) {
// UTF-16, little endian
return new PreliminaryCharset.Builder(StandardCharsets.UTF_16LE).bom(false).exactMatch(false).build();
} else if (Arrays.equals(signature, UTF_8)) {
// UTF-8 no BOM
return new PreliminaryCharset.Builder(StandardCharsets.UTF_8).bom(false).exactMatch(false).build();
} else if (Arrays.equals(signature, EBCDIC)) {
// NOTE(review): truncated like the UCS-4 branches; presumably based on IBM_500, confirm
return>new PreliminaryCharset.Builder(v).bom(false).exactMatch(false).build()).orElse(null);
// UTF-8 without encoding declaration or corrupt
return new PreliminaryCharset.Builder(StandardCharsets.UTF_8).bom(false).exactMatch(true).build();
* Guess the charset from a BOM.
* @param signature a byte signature, 0-4 bytes long
* @return returns the charset if detected or null if the charset could not be detected.
* @throws UnsupportedCharsetException if the charset could be detected but not created
private static Charset guessCharsetFromBom(byte[] signature) throws UnsupportedCharsetException {
if (signature.length<2) {
// No Unicode encoding has a byte order mark < 2 bytes
return null;
} else if (Arrays.equals(signature, USC_4_BE)) {
return UTF_32_BE.orElseThrow(()->new UnsupportedCharsetException("UTF-32BE"));
} else if (Arrays.equals(signature, USC_4_LE)) {
// Note that this test must come before UTF-16 below
return UTF_32_LE.orElseThrow(()->new UnsupportedCharsetException("UTF-32LE"));
} else if (Arrays.equals(signature, USC_4_2143)) {
// Not supported by the JVM
throw new UnsupportedCharsetException("USC-4-2143");
} else if (Arrays.equals(signature, USC_4_3412)) {
// Note that this test must come before UTF-16 below
// Not supported by the JVM
throw new UnsupportedCharsetException("USC-4-3412");
} else if (signature[0]==(byte)0xFE && signature[1]==(byte)0xFF) {
// UTF-16, big endian
return StandardCharsets.UTF_16BE;
} else if (signature[0]==(byte)0xFF && signature[1]==(byte)0xFE) {
// UTF-16, little endian
return StandardCharsets.UTF_16LE;
} else if (signature.length>2 && signature[0]==(byte)0xEF && signature[1]==(byte)0xBB && signature[2]==(byte)0xBF) {
// UTF-8 with BOM
return StandardCharsets.UTF_8;
} else {
return null;
/**
 * Detects if the supplied data is XML encoded with UCS4 without BOM.
 * Returns the index of the non-zero byte, or -1 if the data isn't
 * a match for UCS4 encoded XML.
 *
 * @param data the input data
 * @return returns the index of the non-zero byte, or -1 if there is no match
 */
// Inspects a four-byte signature for UCS-4 encoded XML without a BOM.
// NOTE(review): this copy is damaged. The loop condition, the loop body and the
// return statement of detectUcs4WithoutBom are missing, and the "for" line runs
// straight into the header of the next method, forName. Restore both methods from
// the upstream file before relying on them.
private static int detectUcs4WithoutBom(byte[] data) {
if (data.length!=4) {
throw new IllegalArgumentException("Expected 4 bytes");
int seen = -1;
int i;
for (i=0; i forName(String charset) {
// From here on the statements belong to forName: it looks up the charset by name
// and yields an empty optional if the lookup throws.
try {
return Optional.of(Charset.forName(charset));
} catch (Exception e) {
return Optional.empty();
* Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which
* will reduce the amount of fetching of dtd's from the Internet.
* This method will attempt to create Source and Result objects from the supplied source, result and xslt objects.
* This process supports several types of objects from which Sources and Results are typically created, such as files,
* strings and URLs.
* This method will create its own instance of a transformer factory.
* @param source the source xml
* @param result the result xml
* @param xslt the xslt
* @param params xslt parameters
* @throws XMLToolsException if the transformation is unsuccessful
public static void transform(Object source, Object result, Object xslt, Map params) throws XMLToolsException {
transform(TransformerTools.toSource(source), TransformerTools.toResult(result), TransformerTools.toSource(xslt), params);
* Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which
* will reduce the amount of fetching of dtd's from the Internet.
* This method will attempt to create Source and Result objects from the supplied source, result and xslt objects.
* This process supports several types of objects from which Sources and Results are typically created, such as files,
* strings and URLs.
* @param source the source xml
* @param result the result xml
* @param xslt the xslt
* @param params xslt parameters
* @param factory the transformer factory
* @throws XMLToolsException if the transformation is unsuccessful
public static void transform(Object source, Object result, Object xslt, Map params, TransformerFactory factory) throws XMLToolsException {
transform(TransformerTools.toSource(source), TransformerTools.toResult(result), TransformerTools.toSource(xslt), params, factory);
* Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which
* will reduce the amount of fetching of dtd's from the Internet.
* This method will create its own instance of a transformer factory.
* @param source the source xml
* @param result the result xml
* @param xslt the xslt
* @param params xslt parameters
* @throws XMLToolsException if the transformation is unsuccessful
public static void transform(Source source, Result result, Source xslt, Map params) throws XMLToolsException {
transform(source, result, xslt, params, TransformerFactory.newInstance());
* Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which
* will reduce the amount of fetching of dtd's from the Internet.
* @param source the source xml
* @param result the result xml
* @param xslt the xslt
* @param params xslt parameters
* @param factory the transformer factory
* @throws XMLToolsException if the transformation is unsuccessful
public static void transform(Source source, Result result, Source xslt, Map params, TransformerFactory factory) throws XMLToolsException {
transform(source, result, xslt, TransformerEnvironment.builder().transformerFactory(factory).parameters(params).build());
/**
 * Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which
 * will reduce the amount of fetching of dtd's from the Internet.
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param env the transformer environment
 * @param <T> the type of exception thrown
 * @throws T if the transformation is unsuccessful
 */
// Converts the arguments with the environment's asSource/asResult and delegates.
// NOTE(review): the method throws T, but the declaration of the type parameter T
// (and the type argument of TransformerEnvironment) is missing from this copy, as
// is the closing brace. Restore them from the upstream file.
public static void transform(Object source, Object result, Object xslt, TransformerEnvironment env) throws T {
transform(env.asSource(source), env.asResult(result), env.asSource(xslt), env);
/**
 * Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which
 * will reduce the amount of fetching of dtd's from the Internet.
 *
 * @param source the source xml
 * @param result the result xml
 * @param xslt the xslt
 * @param env the transformer environment
 * @param <T> the type of exception thrown
 * @throws T if the transformation is unsuccessful
 */
// Runs the transformation: creates a transformer from the xslt, copies the
// environment's parameters onto it, installs a caching URI resolver and transforms
// a SAX source that has an entity resolver attached.
// NOTE(review): this copy is damaged. The declaration of the type parameter T is
// missing, and so are all lines holding only closing braces.
public static void transform(Source source, Result result, Source xslt, TransformerEnvironment env) throws T {
Transformer transformer = env.newTransformer(xslt);
// Copy every environment parameter onto the transformer
for (String name : env.getParameters().keySet()) {
transformer.setParameter(name, env.getParameters().get(name));
// The same parser factory is used for the URI resolver and for the SAX parser below
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
try {
transformer.setURIResolver(new CachingURIResolver(parserFactory));
} catch (XMLToolsException e) {
// NOTE(review): no handler body is visible for this catch; presumably the exception
// was rethrown through env.toThrowable. Confirm against the upstream file.
//Create a SAXSource, hook up an entityresolver
if(source.getSystemId()!=null && source.getSystemId().length()>0) {
try {
if (source instanceof SAXSource) {
transformer.transform(setEntityResolver((SAXSource) source), result);
} else {
// Any other kind of source is re-opened from its system id through URLCache
SAXParser parser = parserFactory.newSAXParser();
// NOTE(review): the feature name is an empty string in this copy; the feature
// URI appears to be missing and must be restored.
parser.getXMLReader().setFeature("", false);
try (InputStream is = new URLCache().openStream(new URI(source.getSystemId()).toURL())) {
InputSource isource = new InputSource(is);
SAXSource saxSource = new SAXSource(parser.getXMLReader(), isource);
transformer.transform(setEntityResolver(saxSource), result);
} catch (TransformerException e) {
throw env.toThrowable(e);
} catch (Exception e) {
//TODO: really catch everything?
// NOTE(review): no handler body is visible; as shown the exception would be swallowed.
} else {
// NOTE(review): the message ends with "see"; the reference that followed is missing
throw env.toThrowable(new XMLToolsException("No system id on source, see"));
private static SAXSource setEntityResolver(SAXSource source) {
if(source.getXMLReader().getEntityResolver()==null) {
source.getXMLReader().setEntityResolver(new EntityResolverCache());
return source;
* Returns true if the specified file is well formed XML.
* @param f the file
* @return returns true if the file is well formed XML, false otherwise
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final boolean isWellformedXML(File f) throws XMLToolsException {
return parseXML(f)!=null;
* Returns true if the contents at the specified URI is well formed XML.
* @param uri the URI
* @return returns true if the contents at the specified URI is well formed XML, false otherwise
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final boolean isWellformedXML(URI uri) throws XMLToolsException {
return parseXML(uri)!=null;
* Returns true if the specified source is well formed XML.
* @param source the source
* @return returns true if the source is well formed XML, false otherwise
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final boolean isWellformedXML(InputSource source) throws XMLToolsException {
return parseXML(source)!=null;
* Asserts that the specified file is well formed and returns some root node information.
* @param f the file
* @return returns the root node, or null if file is not well formed
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final XMLInfo parseXML(File f) throws XMLToolsException {
return parseXML(f, false);
* Asserts that the contents at the specified URI is well formed and returns some root node information.
* @param uri the URI
* @return returns the root node, or null if the contents at the specified URI is not well formed
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final XMLInfo parseXML(URI uri) throws XMLToolsException {
return parseXML(uri, false);
* Asserts that the source is well formed and returns some root node information.
* @param source the source
* @return returns the root node, or null if the source is not well formed
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final XMLInfo parseXML(InputSource source) throws XMLToolsException {
return parseXML(source, false);
* Returns some root node information and optionally asserts that the specified
* file is well formed.
* @param f the file
* @param peek true if the parsing should stop after reading the root element. If true,
* the file may or may not be well formed beyond the first start tag.
* @return returns the root node, or null if file is not well formed
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final XMLInfo parseXML(File f, boolean peek) throws XMLToolsException {
return parseXML(f.toURI(), peek);
* Returns some root node information and optionally asserts that the contents at the
* specified URI is well formed.
* @param uri the URI
* @param peek true if the parsing should stop after reading the root element. If true,
* the contents at the specified URI may or may not be well formed beyond the first start tag.
* @return returns the root node, or null if file is not well formed
* @throws XMLToolsException if a parser cannot be configured or if parsing fails
public static final XMLInfo parseXML(URI uri, boolean peek) throws XMLToolsException {
try (InputStream is = uri.toURL().openStream()) {
InputSource source = new InputSource(is);
return parseXML(source, peek);
} catch (IOException e) {
throw new XMLToolsException(e);
/**
 * Returns some root node information and optionally asserts that the contents of the
 * specified source is well formed.
 *
 * @param source the source
 * @param peek true if the parsing should stop after reading the root element. If true,
 *        the source may or may not be well formed beyond the first start tag.
 * @return returns the root node, or null if the source is not well formed
 * @throws XMLToolsException if a parser cannot be configured or if parsing fails
 */
// Parses the source with a SAX parser and returns the root element information
// collected by an XMLHandler.
// NOTE(review): this copy is damaged. The lines holding only closing braces are
// missing, and no call that hands the handler to the reader or starts the parse is
// visible between setProperty and the first catch. Restore from the upstream file.
// NOTE(review): no features restricting DTDs or external entities are set on the
// factory in the visible code; consider whether untrusted input can reach this method.
public static final XMLInfo parseXML(InputSource source, boolean peek) throws XMLToolsException {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = null;
try {
saxParser = factory.newSAXParser();
} catch (ParserConfigurationException e) {
throw new XMLToolsException("Failed to set up XML parser.", e);
} catch (SAXException e) {
throw new XMLToolsException("Failed to set up XML parser.", e);
XMLHandler dh = new XMLHandler(peek);
try {
XMLReader reader = saxParser.getXMLReader();
// dh was assigned from "new" just above, so this test is always true
if (dh != null) {
//since we sometimes have loadDTD turned off,
//we use lexical handler to get the pub and sys id of prolog
// NOTE(review): the property name is an empty string in this copy; presumably it
// was the SAX lexical-handler property URI. Confirm.
reader.setProperty("", dh);
} catch (StopParsing e) {
//thrown if peek is true
} catch (SAXException e) {
// Any other SAX error is reported to the caller as a null result
return null;
} catch (IOException e) {
throw new XMLToolsException(e);
return dh.root;
// SAX handler that records information about the first start element it sees and,
// when peek is set, stops the parse right after it by throwing StopParsing.
// NOTE(review): this copy is damaged. The lines holding only closing braces are
// missing, several method bodies are absent, and it cannot be told from here
// whether StopParsing is nested in XMLHandler or is a sibling class.
private static class XMLHandler extends DefaultHandler implements LexicalHandler {
private final EntityResolver resolver;
private final boolean peek;
private final XMLInfo.Builder builder;
// Stays null until the first start element has been seen
private XMLInfo root = null;
XMLHandler(boolean peek) {
this.resolver = new EntityResolverCache();
this.peek = peek;
this.builder = new XMLInfo.Builder();
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
// Only the first element (the root) is recorded
if (this.root == null) {
this.root = builder.uri(uri).localName(localName).qName(qName).attributes(attributes).build();
if (peek) {
throw new StopParsing();
public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
if (root == null) {
//set prolog entity in builder
// NOTE(review): the statements announced by the comment above are not visible
return resolver.resolveEntity(publicId, systemId);
// NOTE(review): no body is visible for startDTD
public void startDTD(String name, String publicId, String systemId) throws SAXException {
public void endDTD() throws SAXException {
// no-op
public void startEntity(String name) throws SAXException {
// no-op
public void endEntity(String name) throws SAXException {
// no-op
public void startCDATA() throws SAXException {
// no-op
public void endCDATA() throws SAXException {
// no-op
public void comment(char[] ch, int start, int length)
throws SAXException {
// no-op
// Thrown by startElement to stop the parse once the root element has been read
private static class StopParsing extends SAXException {
private static final long serialVersionUID = -4335028194855324300L;