All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.daisy.dotify.common.xml.XMLTools Maven / Gradle / Ivy

The newest version!
package org.daisy.dotify.common.xml;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.text.MessageFormat;
import java.util.Arrays;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXSource;

import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Provides some xml tools.
 * 
 * @author Joel Håkansson
 */
public class XMLTools {

	/**
	 * Matches an XML declaration at the very start of a text, optionally preceded by a
	 * byte order mark (U+FEFF) and whitespace. The value of the encoding pseudo-attribute
	 * is captured in the named group ENCODING.
	 */
	static final Pattern XML_DECL = Pattern.compile("\\A\uFEFF?\\s*<\\?xml[^>]*?encoding\\s*=\\s*[\"'](?<ENCODING>[^'\"]*)[\"'].*\\?>");
	// Charsets without a constant in StandardCharsets; empty if the lookup by name fails
	private static final Optional<Charset> UTF_32_BE = forName("UTF-32BE");
	private static final Optional<Charset> UTF_32_LE = forName("UTF-32LE");
	private static final Optional<Charset> IBM_500 = forName("IBM500");

	// With BOM: the four byte orders of a UCS-4 encoded byte order mark,
	// see https://www.w3.org/TR/xml/#sec-guessing
	private static final byte[] USC_4_BE = new byte[]{0x00, 0x00, (byte)0xFE, (byte)0xFF};
	private static final byte[] USC_4_LE = new byte[]{(byte)0xFF, (byte)0xFE, 0x00, 0x00};
	private static final byte[] USC_4_2143 = new byte[]{0x00, 0x00, (byte)0xFF, (byte)0xFE};
	private static final byte[] USC_4_3412 = new byte[]{(byte)0xFE, (byte)0xFF, 0x00, 0x00};

	// No BOM: the first four bytes of an XML declaration in different encoding families
	// "<?" in UTF-16, big endian
	private static final byte[] UTF_16_BE = new byte[] {0x00, (byte)0x3C, 0x00, (byte)0x3F};
	// "<?" in UTF-16, little endian
	private static final byte[] UTF_16_LE = new byte[] {(byte)0x3C, 0x00, (byte)0x3F, 0x00};
	// "<?xm" in UTF-8 and other ASCII compatible encodings
	private static final byte[] UTF_8 = new byte[] {(byte)0x3C, (byte)0x3F, 0x78, 0x6D};
	// "<?xm" in EBCDIC
	private static final byte[] EBCDIC = new byte[] {(byte)0x4C, (byte)0x6F, (byte)0xA7, (byte)0x94};

	// Static utility class, not to be instantiated
	private XMLTools() {}
	
	static Optional getDeclaredEncoding(byte[] data, Charset preliminaryEncoding) throws XmlEncodingDetectionException {
		try (Reader r = new InputStreamReader(new ByteArrayInputStream(data), preliminaryEncoding)) {
			StringBuilder sb = new StringBuilder();
			int c = r.read();
			if (c=='\uFEFF') {
				c = -2;
			}
			// Append BOM or any whitespace characters
			while (c==-2 || Character.isWhitespace((int)c)) {
				sb.append((char)c);
				c = r.read();
			}
			// Read the next 5 characters to determine if an XML declaration is present
			for (int i=0; i<5 && c!=-1; i++) {
				sb.append((char)c);
				c = r.read();
			}
			boolean closing = false;
			if (sb.length()>=5 && ".
					if (c=='?') {
						closing = true;
					} else if (c=='>' && closing) {
						sb.append((char)c);
						break;
					} else {
						closing = false;
					}
				}
				return getDeclaredEncoding(sb.toString());
			} else {
				return Optional.empty();
			}
		} catch (IOException e) {
			throw new XmlEncodingDetectionException("Failed to read.", e);
		}
	}

	/**
	 * Gets the declared encoding from the given string. If the string
	 * doesn't start with an XML declaration, or if the declaration doesn't
	 * specify an encoding, an empty optional is returned.
	 * @param text the xml
	 * @return returns the declared encoding, if present
	 */
	public static Optional<String> getDeclaredEncoding(String text) {
		Matcher m = XML_DECL.matcher(text);
		if (!m.find()) {
			return Optional.empty();
		}
		return Optional.ofNullable(m.group("ENCODING"));
	}
	
	/**
	 * Detects XML encoding based on this algorithm: https://www.w3.org/TR/xml/#sec-guessing.
	 * In accordance with this specification, it is assumed that the XML declaration 
	 * is not preceded by whitespace (if present).
	 * Note that some encodings mentioned in the specification are not supported 
	 * because they are not supported by the JVM.
	 * 
	 * @param data the input bytes
	 * @return returns the name of the detected charset
	 * @throws IllegalArgumentException if the length of the data is less than 4 bytes
	 * @throws XmlEncodingMismatchException if the declared encoding doesn't match the detected encoding and
	 * 			the detected encoding is an exact match
	 * @throws XmlEncodingDetectionException if detection fails
	 */
	public static String detectXmlEncoding(byte[] data) throws XmlEncodingDetectionException {
		if (data.length<4) {
			throw new IllegalArgumentException("At least 4 bytes are required, got " + data.length + ".");
		}
		PreliminaryCharset preliminary = guessCharset(data);
		if (preliminary==null) {
			throw new XmlEncodingDetectionException("Could not detect encoding.");
		}
		Optional<String> specifiedEncoding = getDeclaredEncoding(data, preliminary.getCharset());
		if (!preliminary.isExactMatch()) {
			// The preliminary charset only identifies a family of encodings, so the declaration must decide
			if (!specifiedEncoding.isPresent()) {
				throw new XmlEncodingDetectionException("Could not detect encoding.");
			}
			return specifiedEncoding.get();
		}
		String detected = preliminary.getCharset().name();
		if (specifiedEncoding.isPresent()) {
			String declared = specifiedEncoding.get();
			// The declaration may be less specific than the detected charset (e.g. UTF-16 vs. UTF-16BE),
			// so a case insensitive (and locale independent) prefix match is sufficient
			if (!detected.regionMatches(true, 0, declared, 0, declared.length())) {
				String msg = MessageFormat.format("The specified encoding ({0}) doesn''t match detected encoding ({1}).", declared, detected);
				throw new XmlEncodingMismatchException(msg, detected, declared);
			}
		}
		return detected;
	}
	
	/**
	 * Tries to detect a Unicode encoding from the supplied data based
	 * on the presence of a BOM. If the file doesn't start with a BOM, 
	 * an empty optional is returned.
	 * @param data the data to detect encoding on
	 * @return returns the encoding detected from the BOM
	 * @throws UnsupportedCharsetException if the charset could be detected but not created
	 */
	public static Optional<Charset> detectBomEncoding(byte[] data) {
		// Only the first four bytes (or less, if the data is shorter) can be part of a BOM
		byte[] signature = data.length>4 ? Arrays.copyOf(data, 4) : data;
		return Optional.ofNullable(guessCharsetFromBom(signature));
	}

	/**
	 * Finds group of encodings that can be used to decode the declaration (if any).
	 * @param data the data
	 * @return returns a preliminary charset, based on the first bytes of the file, or null
	 * 			if the signature indicates an encoding that isn't supported
	 * @throws IllegalArgumentException if the length of the data is less than 4 bytes
	 */
	private static PreliminaryCharset guessCharset(byte[] data) {
		// Based on https://www.w3.org/TR/xml/#sec-guessing
		if (data.length<4) {
			throw new IllegalArgumentException("At least 4 bytes are required, got " + data.length + ".");
		}
		byte[] signature = Arrays.copyOf(data, 4);
		// With BOM
		Charset charsetFromBom;
		try {
			charsetFromBom = guessCharsetFromBom(signature);
		} catch (UnsupportedCharsetException e) {
			return null;
		}
		if (charsetFromBom!=null) {
			return new PreliminaryCharset.Builder(charsetFromBom).bom(true).exactMatch(true).build();
		}
		// No BOM
		// UCS-4: a single '<' (0x3C) padded with three zero bytes. The index of the
		// non-zero byte, as documented for detectUcs4WithoutBom, reveals the byte order.
		int nonZeroIndex = detectUcs4WithoutBom(signature);
		if (nonZeroIndex>-1) {
			switch (nonZeroIndex) {
				case 3:
					// 00 00 00 3C: BE
					return UTF_32_BE.map(v->preliminaryWithoutBom(v)).orElse(null);
				case 0:
					// 3C 00 00 00: LE
					return UTF_32_LE.map(v->preliminaryWithoutBom(v)).orElse(null);
				default:
					// unusual byte orders (2143 and 3412) are not supported
					return null;
			}
		}
		if (Arrays.equals(signature, UTF_16_BE)) {
			// UTF-16, big endian
			return preliminaryWithoutBom(StandardCharsets.UTF_16BE);
		}
		if (Arrays.equals(signature, UTF_16_LE)) {
			// UTF-16, little endian
			return preliminaryWithoutBom(StandardCharsets.UTF_16LE);
		}
		if (Arrays.equals(signature, UTF_8)) {
			// UTF-8 no BOM
			return preliminaryWithoutBom(StandardCharsets.UTF_8);
		}
		if (Arrays.equals(signature, EBCDIC)) {
			return IBM_500.map(v->preliminaryWithoutBom(v)).orElse(null);
		}
		// UTF-8 without encoding declaration or corrupt
		return new PreliminaryCharset.Builder(StandardCharsets.UTF_8).bom(false).exactMatch(true).build();
	}

	/**
	 * Creates a preliminary charset for data without a BOM, where the charset is
	 * only good enough to read the XML declaration (not an exact match).
	 * @param charset the charset
	 * @return returns a new preliminary charset
	 */
	private static PreliminaryCharset preliminaryWithoutBom(Charset charset) {
		return new PreliminaryCharset.Builder(charset).bom(false).exactMatch(false).build();
	}
	
	/**
	 * Maps a byte order mark at the start of the signature to a charset.
	 * @param signature the first bytes of the data, 0-4 bytes long
	 * @return returns the charset indicated by the BOM, or null if the signature doesn't
	 * 			start with a recognized BOM
	 * @throws UnsupportedCharsetException if a BOM was recognized, but the charset isn't available
	 */
	private static Charset guessCharsetFromBom(byte[] signature) throws UnsupportedCharsetException {
		if (signature.length<2) {
			// Too short to hold the BOM of any Unicode encoding
			return null;
		}
		// The four byte signatures are tested first, because two of them
		// (FF FE 00 00 and FE FF 00 00) begin with a UTF-16 BOM
		if (Arrays.equals(signature, USC_4_BE)) {
			return UTF_32_BE.orElseThrow(()->new UnsupportedCharsetException("UTF-32BE"));
		}
		if (Arrays.equals(signature, USC_4_LE)) {
			return UTF_32_LE.orElseThrow(()->new UnsupportedCharsetException("UTF-32LE"));
		}
		if (Arrays.equals(signature, USC_4_2143)) {
			// The JVM has no charset for this byte order
			throw new UnsupportedCharsetException("USC-4-2143");
		}
		if (Arrays.equals(signature, USC_4_3412)) {
			// The JVM has no charset for this byte order
			throw new UnsupportedCharsetException("USC-4-3412");
		}
		final byte first = signature[0];
		final byte second = signature[1];
		if (first==(byte)0xFE && second==(byte)0xFF) {
			// BOM for UTF-16, big endian
			return StandardCharsets.UTF_16BE;
		}
		if (first==(byte)0xFF && second==(byte)0xFE) {
			// BOM for UTF-16, little endian
			return StandardCharsets.UTF_16LE;
		}
		boolean utf8Bom = signature.length>2
				&& first==(byte)0xEF
				&& second==(byte)0xBB
				&& signature[2]==(byte)0xBF;
		// BOM for UTF-8, or nothing recognized
		return utf8Bom ? StandardCharsets.UTF_8 : null;
	}
	
	/**
	 * Detects if the supplied data is XML encoded with UCS4 without BOM.
	 * Returns the index of the non-zero byte, or -1 if the data isn't
	 * a match for UCS4 encoded XML.
	 * 
	 * @param data the input data
	 * @return returns the non-zero byte
	 */
	private static int detectUcs4WithoutBom(byte[] data) {
		if (data.length!=4) {
			throw new IllegalArgumentException("Expected 4 bytes");
		}
		int seen = -1;
		int i;
		for (i=0; i forName(String charset) {
		try {
			return Optional.of(Charset.forName(charset));
		} catch (Exception e) {
			return Optional.empty();
		}
	}

	/**
	 * 

Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which * will reduce the amount of fetching of dtd's from the Internet.

* *

This method will attempt to create Source and Result objects from the supplied source, result and xslt objects. * This process supports several types of objects from which Sources and Results are typically created, such as files, * strings and URLs.

* *

This method will create its own instance of a transformer factory.

* * @param source the source xml * @param result the result xml * @param xslt the xslt * @param params xslt parameters * @throws XMLToolsException if the transformation is unsuccessful */ public static void transform(Object source, Object result, Object xslt, Map params) throws XMLToolsException { transform(TransformerTools.toSource(source), TransformerTools.toResult(result), TransformerTools.toSource(xslt), params); } /** *

Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which * will reduce the amount of fetching of dtd's from the Internet.

* *

This method will attempt to create Source and Result objects from the supplied source, result and xslt objects. * This process supports several types of objects from which Sources and Results are typically created, such as files, * strings and URLs.

* * @param source the source xml * @param result the result xml * @param xslt the xslt * @param params xslt parameters * @param factory the transformer factory * @throws XMLToolsException if the transformation is unsuccessful */ public static void transform(Object source, Object result, Object xslt, Map params, TransformerFactory factory) throws XMLToolsException { transform(TransformerTools.toSource(source), TransformerTools.toResult(result), TransformerTools.toSource(xslt), params, factory); } /** * Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which * will reduce the amount of fetching of dtd's from the Internet. * *

This method will create its own instance of a transformer factory.

* @param source the source xml * @param result the result xml * @param xslt the xslt * @param params xslt parameters * @throws XMLToolsException if the transformation is unsuccessful */ public static void transform(Source source, Result result, Source xslt, Map params) throws XMLToolsException { transform(source, result, xslt, params, TransformerFactory.newInstance()); } /** *

Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which * will reduce the amount of fetching of dtd's from the Internet.

* @param source the source xml * @param result the result xml * @param xslt the xslt * @param params xslt parameters * @param factory the transformer factory * @throws XMLToolsException if the transformation is unsuccessful */ public static void transform(Source source, Result result, Source xslt, Map params, TransformerFactory factory) throws XMLToolsException { transform(source, result, xslt, TransformerEnvironment.builder().transformerFactory(factory).parameters(params).build()); } /** *

Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which * will reduce the amount of fetching of dtd's from the Internet.

* @param source the source xml * @param result the result xml * @param xslt the xslt * @param env the transformer environment * @param the type of exception thrown * @throws T if the transformation is unsuccessful */ public static void transform(Object source, Object result, Object xslt, TransformerEnvironment env) throws T { transform(env.asSource(source), env.asResult(result), env.asSource(xslt), env); } /** *

Transforms the xml with the specified parameters. By default, this method will set up a caching entity resolver, which * will reduce the amount of fetching of dtd's from the Internet.

* @param source the source xml * @param result the result xml * @param xslt the xslt * @param env the transformer environment * @param the type of exception thrown * @throws T if the transformation is unsuccessful */ public static void transform(Source source, Result result, Source xslt, TransformerEnvironment env) throws T { Transformer transformer = env.newTransformer(xslt); for (String name : env.getParameters().keySet()) { transformer.setParameter(name, env.getParameters().get(name)); } SAXParserFactory parserFactory = SAXParserFactory.newInstance(); try { transformer.setURIResolver(new CachingURIResolver(parserFactory)); } catch (XMLToolsException e) { env.toThrowable(e); } //Create a SAXSource, hook up an entityresolver if(source.getSystemId()!=null && source.getSystemId().length()>0) { try { if (source instanceof SAXSource) { transformer.transform(setEntityResolver((SAXSource) source), result); } else { SAXParser parser = parserFactory.newSAXParser(); parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", false); try (InputStream is = new URLCache().openStream(new URI(source.getSystemId()).toURL())) { InputSource isource = new InputSource(is); isource.setSystemId(source.getSystemId()); SAXSource saxSource = new SAXSource(parser.getXMLReader(), isource); saxSource.setSystemId(source.getSystemId()); transformer.transform(setEntityResolver(saxSource), result); } } } catch (TransformerException e) { throw env.toThrowable(e); } catch (Exception e) { //TODO: really catch everything? e.printStackTrace(); } } else { throw env.toThrowable(new XMLToolsException("No system id on source, see https://github.com/brailleapps/dotify.common/issues/4.")); } } private static SAXSource setEntityResolver(SAXSource source) { if(source.getXMLReader().getEntityResolver()==null) { source.getXMLReader().setEntityResolver(new EntityResolverCache()); } return source; } /** * Returns true if the specified file is well formed XML. 
* @param f the file * @return returns true if the file is well formed XML, false otherwise * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final boolean isWellformedXML(File f) throws XMLToolsException { return parseXML(f)!=null; } /** * Returns true if the contents at the specified URI is well formed XML. * @param uri the URI * @return returns true if the contents at the specified URI is well formed XML, false otherwise * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final boolean isWellformedXML(URI uri) throws XMLToolsException { return parseXML(uri)!=null; } /** * Returns true if the specified source is well formed XML. * @param source the source * @return returns true if the source is well formed XML, false otherwise * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final boolean isWellformedXML(InputSource source) throws XMLToolsException { return parseXML(source)!=null; } /** * Asserts that the specified file is well formed and returns some root node information. * @param f the file * @return returns the root node, or null if file is not well formed * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final XMLInfo parseXML(File f) throws XMLToolsException { return parseXML(f, false); } /** * Asserts that the contents at the specified URI is well formed and returns some root node information. * @param uri the URI * @return returns the root node, or null if the contents at the specified URI is not well formed * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final XMLInfo parseXML(URI uri) throws XMLToolsException { return parseXML(uri, false); } /** * Asserts that the source is well formed and returns some root node information. 
* @param source the source * @return returns the root node, or null if the source is not well formed * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final XMLInfo parseXML(InputSource source) throws XMLToolsException { return parseXML(source, false); } /** * Returns some root node information and optionally asserts that the specified * file is well formed. * @param f the file * @param peek true if the parsing should stop after reading the root element. If true, * the file may or may not be well formed beyond the first start tag. * @return returns the root node, or null if file is not well formed * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final XMLInfo parseXML(File f, boolean peek) throws XMLToolsException { return parseXML(f.toURI(), peek); } /** * Returns some root node information and optionally asserts that the contents at the * specified URI is well formed. * @param uri the URI * @param peek true if the parsing should stop after reading the root element. If true, * the contents at the specified URI may or may not be well formed beyond the first start tag. * @return returns the root node, or null if file is not well formed * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final XMLInfo parseXML(URI uri, boolean peek) throws XMLToolsException { try (InputStream is = uri.toURL().openStream()) { InputSource source = new InputSource(is); source.setSystemId(uri.toASCIIString()); return parseXML(source, peek); } catch (IOException e) { throw new XMLToolsException(e); } } /** * Returns some root node information and optionally asserts that the contents at the * specified source is well formed. * @param source the source * @param peek true if the parsing should stop after reading the root element. If true, * the source may or may not be well formed beyond the first start tag. 
* @return returns the root node, or null if file is not well formed * @throws XMLToolsException if a parser cannot be configured or if parsing fails */ public static final XMLInfo parseXML(InputSource source, boolean peek) throws XMLToolsException { SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setNamespaceAware(true); SAXParser saxParser = null; try { saxParser = factory.newSAXParser(); } catch (ParserConfigurationException e) { throw new XMLToolsException("Failed to set up XML parser.", e); } catch (SAXException e) { throw new XMLToolsException("Failed to set up XML parser.", e); } XMLHandler dh = new XMLHandler(peek); try { XMLReader reader = saxParser.getXMLReader(); if (dh != null) { reader.setContentHandler(dh); reader.setEntityResolver(dh); //since we sometimes have loadDTD turned off, //we use lexical handler to get the pub and sys id of prolog reader.setProperty("http://xml.org/sax/properties/lexical-handler", dh); reader.setErrorHandler(dh); reader.setDTDHandler(dh); } saxParser.getXMLReader().parse(source); } catch (StopParsing e) { //thrown if peek is true } catch (SAXException e) { return null; } catch (IOException e) { throw new XMLToolsException(e); } return dh.root; } private static class XMLHandler extends DefaultHandler implements LexicalHandler { private final EntityResolver resolver; private final boolean peek; private final XMLInfo.Builder builder; private XMLInfo root = null; XMLHandler(boolean peek) { this.resolver = new EntityResolverCache(); this.peek = peek; this.builder = new XMLInfo.Builder(); } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { if (this.root == null) { this.root = builder.uri(uri).localName(localName).qName(qName).attributes(attributes).build(); if (peek) { throw new StopParsing(); } } } @Override public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { if (root == null) { //set 
prolog entity in builder builder.publicId(publicId); builder.systemId(systemId); } return resolver.resolveEntity(publicId, systemId); } @Override public void startDTD(String name, String publicId, String systemId) throws SAXException { builder.publicId(publicId); builder.systemId(systemId); } @Override public void endDTD() throws SAXException { // no-op } @Override public void startEntity(String name) throws SAXException { // no-op } @Override public void endEntity(String name) throws SAXException { // no-op } @Override public void startCDATA() throws SAXException { // no-op } @Override public void endCDATA() throws SAXException { // no-op } @Override public void comment(char[] ch, int start, int length) throws SAXException { // no-op } } private static class StopParsing extends SAXException { private static final long serialVersionUID = -4335028194855324300L; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy