All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.conqat.lib.commons.xml.XMLUtils Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) CQSE GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.conqat.lib.commons.xml;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.filesystem.FileSystemUtils;
import org.conqat.lib.commons.string.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Collection of utility methods for XML.
 * 

* The parsers created in this class are configured to avoid Xml Entity Expansion (XEE) attacks * using {@link XMLConstants#ACCESS_EXTERNAL_DTD} to disable external doc type processing, * {@link XMLConstants#ACCESS_EXTERNAL_SCHEMA} to disable external schemas and * {@link XMLConstants#ACCESS_EXTERNAL_STYLESHEET} for to disable external stylesheets. See * https://rules.sonarsource.com/java/RSPEC-2755 for examples and * https://jira.cqse.eu/browse/TS-23501 for an audit report explaining two attack methods. Without * these means data from the server, e.g. file content, can be exposed to arbitrary servers just by * deserializing XML. In addition parsers are configured to disable DTD loading and grammer * completely using {@link #FEATURE_LOAD_DTD_GRAMMAR} {@link #FEATURE_LOAD_EXTERNAL_DTD}. */ public class XMLUtils { private static final Logger LOGGER = LogManager.getLogger(); /** Identifier for schema source. */ private static final String ATTRIBUTE_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; private static final String FEATURE_LOAD_DTD_GRAMMAR = "http://apache.org/xml/features/nonvalidating/load-dtd-grammar"; private static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd"; /** Schema URL */ private static final String SCHEMA_URL = "http://www.w3.org/2001/XMLSchema"; /** Identifier for schema language. */ private static final String ATTRIBUTE_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; /** Creates a new {@link XMLWriter} that writes to the given output file. */ public static , AttributesEnum extends Enum> XMLWriter createUtf8Writer( File outputFile, Class attributesClass) throws FileNotFoundException, UnsupportedEncodingException { return new XMLWriter<>(new PrintStream(outputFile, FileSystemUtils.UTF8_ENCODING), new XMLResolver(attributesClass)); } /** * Parse a file without validation. * * @param file * the file to parse. * @return the DOM document. * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed. * @throws IOException * if an IO exception occurs. */ public static Document parse(File file) throws SAXException, IOException { return createSchemaUnawareParser().parse(file); } /** * Parse an input source without validation. * * @param input * the input source to parse * @return the DOM document. * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed. * @throws IOException * if an IO exception occurs. */ public static Document parse(InputSource input) throws SAXException, IOException { return createSchemaUnawareParser().parse(input); } /** * Parse an input source using SAX without validation. * * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed. * @throws IOException * if an IO exception occurs. */ public static void parseSAX(File file, DefaultHandler handler) throws SAXException, IOException { createSchemaUnawareSAXParser().parse(file, handler); } /** * Creates an {@link XMLEventReader} for the given input source using StAX without validation. In * contrast to SAX StAX is pull-based rather than push-based. * * @throws XMLStreamException * if a parsing exception occurs, i.e. if the file is not well-formed. */ public static XMLEventReader parseStAX(String source, javax.xml.stream.XMLResolver resolver) throws XMLStreamException { XMLInputFactory inputFactory = XMLInputFactory.newFactory(); inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false); inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); inputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); inputFactory.setXMLResolver(resolver); return inputFactory.createXMLEventReader(StringUtils.toInputStream(source)); } /** * Parse an input source using SAX without validation. * * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed. * @throws IOException * if an IO exception occurs. */ public static void parseSAX(InputSource input, DefaultHandler handler) throws SAXException, IOException { createSchemaUnawareSAXParser().parse(input, handler); } /** * Parse a string that contains XML without validation. * * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed. * @throws IOException * if an IO exception occurs. */ public static void parseSAX(String content, DefaultHandler handler) throws SAXException, IOException { parseSAX(new InputSource(new StringReader(content)), handler); } /** * Parse and validate file using schema. This implements a custom error handler to avoid different * behaviour between the JAXP implementations shipping with Java 1.5 and Java 1.6. * * @param file * the file to parse. * @param schemaURL * URL point to schema, may not be null * @return the DOM document. * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed or not valid * @throws IOException * if an IO exception occurs. */ public static Document parse(File file, URL schemaURL) throws SAXException, IOException { try (FileInputStream stream = new FileInputStream(file)) { return parse(new InputSource(stream), schemaURL); } } /** * Parse and validate file using schema. This implements a custom error handler to avoid different * behaviour between the JAXP implementations shipping with Java 1.5 and Java 1.6. * * @param input * the input to parse. * @param schemaURL * URL point to schema, may not be null * @return the DOM document. * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed or not valid * @throws IOException * if an IO exception occurs. */ public static Document parse(InputSource input, URL schemaURL) throws SAXException, IOException { CCSMAssert.isTrue(schemaURL != null, "Schema URL may not be null!"); DocumentBuilder parser = createSchemaAwareParser(schemaURL); XMLErrorHandler errorHandler = new XMLErrorHandler(); parser.setErrorHandler(errorHandler); Document document = parser.parse(input); if (errorHandler.exception != null) { throw errorHandler.exception; } return document; } /** * Parse and validate file using SAX and schema. * * @param file * the file to parse. * @param schemaURL * URL point to schema, may not be null * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed or not valid * @throws IOException * if an IO exception occurs. */ public static void parseSAX(File file, URL schemaURL, DefaultHandler handler) throws SAXException, IOException { try (FileInputStream stream = new FileInputStream(file)) { parseSAX(new InputSource(stream), schemaURL, handler); } } /** * Parse and validate file using SAX and schema. * * @param input * the input to parse. * @param schemaURL * URL point to schema, may not be null * @throws SAXException * if a parsing exception occurs, i.e. if the file is not well-formed or not valid * @throws IOException * if an IO exception occurs. */ public static void parseSAX(InputSource input, URL schemaURL, DefaultHandler handler) throws SAXException, IOException { CCSMAssert.isTrue(schemaURL != null, "Schema URL may not be null!"); createSchemaAwareSAXParser(schemaURL).parse(input, handler); } /** Creates a schema-unaware XML parser */ private static DocumentBuilder createSchemaUnawareParser() { try { return createNamespaceAwareDocumentBuilderFactory().newDocumentBuilder(); } catch (ParserConfigurationException e) { throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e); } } /** Creates a schema-unaware SAX parser */ private static SAXParser createSchemaUnawareSAXParser() throws SAXException { try { return createSAXParser(false); } catch (ParserConfigurationException e) { throw new IllegalStateException("No SAX parser found, probably Java is misconfigured!", e); } } /** Creates a schema-aware XML parser */ private static DocumentBuilder createSchemaAwareParser(URL schemaURL) { try { DocumentBuilderFactory dbf = createNamespaceAwareDocumentBuilderFactory(); dbf.setValidating(true); dbf.setAttribute(ATTRIBUTE_SCHEMA_LANGUAGE, SCHEMA_URL); dbf.setAttribute(ATTRIBUTE_SCHEMA_SOURCE, schemaURL.toString()); return dbf.newDocumentBuilder(); } catch (ParserConfigurationException e) { throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e); } } /** Creates a schema-aware SAX parser */ private static SAXParser createSchemaAwareSAXParser(URL schemaURL) throws SAXException { try { SAXParser parser = createSAXParser(true); parser.setProperty(ATTRIBUTE_SCHEMA_LANGUAGE, SCHEMA_URL); parser.setProperty(ATTRIBUTE_SCHEMA_SOURCE, schemaURL.toString()); return parser; } catch (ParserConfigurationException e) { throw new IllegalStateException("No SAX parser found, probably Java is misconfigured!", e); } } /** Creates a namespace-aware {@link DocumentBuilderFactory} */ private static DocumentBuilderFactory createNamespaceAwareDocumentBuilderFactory() throws ParserConfigurationException { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, ""); dbf.setAttribute(XMLConstants.ACCESS_EXTERNAL_SCHEMA, ""); dbf.setFeature(FEATURE_LOAD_DTD_GRAMMAR, false); dbf.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false); dbf.setNamespaceAware(true); return dbf; } private static SAXParser createSAXParser(boolean validating) throws ParserConfigurationException, SAXException { SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setNamespaceAware(true); spf.setValidating(validating); spf.setFeature(FEATURE_LOAD_DTD_GRAMMAR, false); spf.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false); SAXParser parser = spf.newSAXParser(); parser.setProperty(XMLConstants.ACCESS_EXTERNAL_DTD, ""); parser.setProperty(XMLConstants.ACCESS_EXTERNAL_SCHEMA, ""); return parser; } /** * Same as {@link #parse(File, URL)} but with schema file. * * @throws IllegalArgumentException * if the schema file could not be converted to an URL */ public static Document parse(File file, File schema) throws SAXException, IOException { try { return parse(file, schema.toURI().toURL()); } catch (MalformedURLException e) { throw new IllegalArgumentException("Schema file could not be converted to URL: ", e); } } /** * Returns a string representation of the given XML document, which is "pretty printed", i.e. the * tags are indented. */ public static String prettyPrint(Document doc) throws TransformerException { URL url = XMLUtils.class.getResource("pretty.xsl"); StreamSource xslSource = new StreamSource(url.toExternalForm()); Transformer transformer = createTransformerFactory().newTransformer(xslSource); return StringUtils.normalizeLineSeparatorsPlatformSpecific(transformDocumentToString(doc, transformer)); } /** * Transforms the document to an xml string (flat, with no line breaks). Use * {@link #prettyPrint(Document)} for readable xml output. */ public static String print(Document document) throws TransformerException { Transformer transformer = createTransformerFactory().newTransformer(); return transformDocumentToString(document, transformer); } private static TransformerFactory createTransformerFactory() { TransformerFactory transformerFactory = TransformerFactory.newInstance(); transformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, ""); transformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); return transformerFactory; } /** Transform an XML document to a string using the given transformer */ private static String transformDocumentToString(Document document, Transformer transformer) throws TransformerException { DOMSource source = new DOMSource(document); StringWriter stringWriter = new StringWriter(); StreamResult resultStream = new StreamResult(stringWriter); transformer.transform(source, resultStream); return stringWriter.toString(); } /** * Returns all children of the given element which are element named as specified. */ public static List getNamedChildren(Element element, String elementNames) { List result = new ArrayList<>(); NodeList children = element.getChildNodes(); for (int i = 0; i < children.getLength(); ++i) { Node node = children.item(i); if (node.getNodeType() == Node.ELEMENT_NODE && node.getNodeName().equals(elementNames)) { result.add((Element) node); } } return result; } /** * Returns the first child of the given element which is an element named as specified. Returns null * if none are found. */ public static Element getNamedChild(Element element, String name) { List children = XMLUtils.getNamedChildren(element, name); if (!children.isEmpty()) { return children.get(0); } return null; } /** * Get the text content of the given element's first child that is an element named as specified. If * none is found, the empty string is returned. */ public static String getNamedChildContent(Element parent, String name) { Element element = XMLUtils.getNamedChild(parent, name); if (element == null) { return StringUtils.EMPTY_STRING; } return element.getTextContent(); } /** * Extracts all ElementNodes from a NodeList and returns the result as a list. * * @param nodeList * the NodeList to be searched for ElementNodes. * @return an array containing all ElementNodes stored in the given node list or null if the input * has been null. */ public static List elementNodes(NodeList nodeList) { if (nodeList == null) { return null; } List result = new ArrayList<>(); int len = nodeList.getLength(); for (int i = 0; i < len; ++i) { Node node = nodeList.item(i); if (node.getNodeType() == Node.ELEMENT_NODE) { result.add((Element) node); } } return result; } /** * Appends a child element with the given tag name to the given element and returns the new element. */ public static Element appendChild(Element element, String tagName) { Element newElement = CCSMAssert.checkedCast(element.getOwnerDocument().createElement(tagName), Element.class); element.appendChild(newElement); return newElement; } /** * Appends a child element with the given tag name and the given text content to the given element * and returns the new element. */ public static Element appendChild(Element parent, String tagName, String textContent) { Element newElement = appendChild(parent, tagName); newElement.setTextContent(textContent); return newElement; } /** * Get all leaf elements of an XML tree rooted at an element * * @param root * The root element * @return List of all leaf elements */ public static List leafElementNodes(Element root) { List leafElementNodes = new ArrayList<>(); leafElementNodes(root, leafElementNodes); return leafElementNodes; } /** * Add all leaf element nodes of an XML tree rooted at an element to a list */ private static void leafElementNodes(Element root, List leafElementNodes) { List children = XMLUtils.elementNodes(root.getChildNodes()); if (children.isEmpty()) { leafElementNodes.add(root); } else { for (Element child : children) { leafElementNodes(child, leafElementNodes); } } } /** Converts the given {@link String} to a SAX {@link InputSource}. */ public static InputSource toInputSource(String string) { return new InputSource(new StringReader(string)); } /** * Simple error handler for handling validation errors. This handler stores the first problem raised * during parsing. */ private static class XMLErrorHandler implements ErrorHandler { /** * The stored exception. Value unequal null signals a validation problem. */ private SAXParseException exception; /** {@inheritDoc} */ @Override public void error(SAXParseException exception) { if (this.exception == null) { this.exception = exception; } } /** {@inheritDoc} */ @Override public void fatalError(SAXParseException exception) { error(exception); } /** {@inheritDoc} */ @Override public void warning(SAXParseException exception) { LOGGER.info(getClass().getSimpleName() + " received a warning: " + exception.getMessage()); } } /** * Fixes chars which are not allowed in XML content. The following replacements are allowed: *

    *
  • All '&' which are not part of an XML escape char sequence are replaced by '&'. *
  • All low ASCII control chars are removed, besides TAB, LF, CR *
  • Escaped ASCII control chars are removed (e.g. � or �) with variable zero padding in * hex and decimal format. *
*/ public static String fixIllegalXmlChars(String content) { String replacedContent = content.replaceAll("(?i)&(?!(lt|gt|amp|apos|quot|#x[0-9a-f]+|#\\d+);)", "&"); replacedContent = replacedContent.replaceAll("([\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f])", StringUtils.EMPTY_STRING); replacedContent = replacedContent.replaceAll("(?i)�*([0-8bcef]|1[0-9a-f]|7f);", StringUtils.EMPTY_STRING); replacedContent = replacedContent.replaceAll("(?i)�*([0-8]|1[124-9]|2[0-9]|3[01]|127);", StringUtils.EMPTY_STRING); return replacedContent; } /** * Takes DTD file contents and adds them to the XML. If there already is a DTD or DTD reference it * is overwritten. Uses a SAX parser to parse the XML with the internal DTD. Returns whether the * parser ran into an exception. * * @param xml * the XML document that is validated against the DTD * @param dtd * the DTD as file contents: */ public static boolean validateAgainstDTD(String xml, String dtd) throws ParserConfigurationException, IOException, TransformerException, SAXException { String xmlWithDtd = addInternalDtd(xml, dtd); SAXParser parser = createSAXParser(true); XMLReader reader = parser.getXMLReader(); XMLErrorHandler errorHandler = new XMLErrorHandler(); reader.setErrorHandler(errorHandler); reader.parse(toInputSource(xmlWithDtd)); return errorHandler.exception == null; } /** * Adds DTD as internal DTD to the XML. An already existing internal DTD is replaced. * * @param xml * the XML document the DTD is supposed to be added to * @param dtd * the DTD as file contents: * @return the XML with internal DTD */ private static String addInternalDtd(String xml, String dtd) throws IOException, SAXException, TransformerException { Document doc = parse(toInputSource(xml)); TransformerFactory transformerFactory = TransformerFactory.newInstance(); transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); transformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, ""); transformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); Transformer transformer = transformerFactory.newTransformer(); transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, ""); String xmlWithDoctypeTag = transformDocumentToString(doc, transformer); return xmlWithDoctypeTag.replaceFirst("", dtd); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy