// Source listing of org.apache.oodt.commons.util.XML from the oodt-commons artifact
// (Apache OODT Common Utilities Project), as published for Maven / Gradle / Ivy.
// Note: there is a newer version of this artifact: 1.9.1.
//
// Licensed to the Apache Software Foundation (ASF) under one or more contributor
// license agreements.  See the NOTICE.txt file distributed with this work for
// additional information regarding copyright ownership.  The ASF licenses this
// file to you under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License.  You may obtain a copy of
// the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
// License for the specific language governing permissions and limitations under
// the License.
//
// Portions of this code come from the Apache XML Project's Xerces 1.0.3 XML Parser
// (specifically, the functions escape and getEntityRef).  Apache license applies:
//
// The Apache Software License, Version 1.1
//
// Copyright (c) 1999 The Apache Software Foundation.  All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are
// permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of
//     conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, this list
//    of conditions and the following disclaimer in the documentation and/or other materials
//    provided with the distribution.
//
// 3. The end-user documentation included with the redistribution, if any, must include
//    the following acknowledgment:
//
//    "This product includes software developed by the Apache Software Foundation (http://www.apache.org/)."
//
//    Alternately, this acknowledgment may appear in the software itself,
//    if and wherever such third-party acknowledgments normally appear.
//
// 4. The names "Xerces" and "Apache Software Foundation" must not be used to endorse or
//    promote products derived from this software without prior written permission. For
//    written permission, please contact apache@apache.org.
//
// 5. Products derived from this software may not be called "Apache", nor may "Apache"
//    appear in their name, without prior written permission of the Apache Software
//    Foundation.
//
// THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING,
// BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
// ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// $Id: XML.java,v 1.2 2005-05-01 22:49:55 cmattmann Exp $

package org.apache.oodt.commons.util;

import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.logging.Logger;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

/** XML services.
 *
 * <p>This class provides several XML convenience services and encapsulates the underlying
 * XML implementation, allowing it to vary without impacting developers.  All services are
 * exposed as static methods.
 *
 * @author Kelly
 */
public class XML {
	private static final Logger LOG = Logger.getLogger(XML.class.getName());

	/** The resolver for entities for the JPL enterprise. */
	public static final EntityResolver ENTERPRISE_ENTITY_RESOLVER = new EnterpriseEntityResolver();

	/** An empty XML DOM document.  This is handy for some basic operations, and for
	 * fetching the DOM implementation.
	 */
	public static final Document EMPTY_DOCUMENT = createDocument();

	/** Identifies the last printable character in the Unicode range that is supported
	 * by the encoding used with this serializer.  For 8-bit encodings this will be either
	 * 0x7E or 0xFF.  For 16-bit encodings this will be 0xFFFF. Characters that are not
	 * printable will be escaped using character references.
	 *
	 * <p>Taken from Xerces 1.0.3.  Apache license applies; see source code for
	 * license.
	 */
	private static final int LAST_PRINTABLE = 0x7E;

	/** Create a document builder with this class's standard configuration.
	 *
	 * <p>The builder is namespace aware and validating, keeps comments, does not
	 * coalesce CDATA sections, does not expand entity references, and drops
	 * ignorable element-content whitespace.
	 *
	 * @return A freshly configured document builder.
	 * @throws IllegalStateException If the platform's XML implementation rejects the configuration.
	 */
	private static DocumentBuilder getStandardDocumentBuilder() {
		try {
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
			factory.setCoalescing(false);
			factory.setExpandEntityReferences(false);
			factory.setIgnoringComments(false);
			factory.setIgnoringElementContentWhitespace(true);
			factory.setNamespaceAware(true);
			factory.setValidating(true);
			return factory.newDocumentBuilder();
		} catch (ParserConfigurationException ex) {
			// Keep the original exception as the cause so the real failure stays diagnosable.
			throw new IllegalStateException("Unexpected ParserConfigurationException: " + ex.getMessage(), ex);
		}
	}

	/** Get the DOM implementation.
	 *
	 * @return The DOM implementation.
	 */
	public static DOMImplementation getDOMImplementation() {
		return getStandardDocumentBuilder().getDOMImplementation();
	}

	/** Create a DOM document.
	 *
	 * @return A new DOM document.
	 */
	public static Document createDocument() {
		return getStandardDocumentBuilder().newDocument();
	}

	/** Create a DOM parser.
	 *
	 * <p>This method creates a new DOM parser that has validation turned on and
	 * ignorable whitespace not included, and has a default error handler that prints
	 * error messages and warnings to the standard error stream.
	 *
	 * @return A new DOM parser.
	 */
	public static DOMParser createDOMParser() {
		DocumentBuilder builder = getStandardDocumentBuilder();
		builder.setEntityResolver(ENTERPRISE_ENTITY_RESOLVER);
		// The DOM flavor also prints stack traces for errors and fatal errors.
		builder.setErrorHandler(new StderrErrorHandler(/*printStackTraces*/true));
		return new DOMParser(builder);
	}

	/** Create a SAX parser.
	 *
	 * <p>This method creates a new, default SAX parser.  It's set up with a default
	 * error handler that just prints messages to the standard error stream.
	 *
	 * @return A new SAX parser.
	 * @throws IllegalStateException If the underlying SAX parser cannot be created or configured.
	 */
	public static SAXParser createSAXParser() {
		try {
			SAXParserFactory factory = SAXParserFactory.newInstance();
			factory.setNamespaceAware(false);
			factory.setValidating(false);
			javax.xml.parsers.SAXParser saxParser = factory.newSAXParser();
			saxParser.getXMLReader().setEntityResolver(ENTERPRISE_ENTITY_RESOLVER);
			saxParser.getXMLReader().setErrorHandler(new StderrErrorHandler(/*printStackTraces*/false));
			return new SAXParser(saxParser);
		} catch (ParserConfigurationException ex) {
			throw new IllegalStateException("Unexpected ParserConfigurationException: " + ex.getMessage(), ex);
		} catch (SAXException ex) {
			throw new IllegalStateException("Unexpected SAXException: " + ex.getMessage(), ex);
		}
	}

	/** Serialize an XML DOM document into a String.
	 *
	 * <p>This method takes a DOM document and pretty-prints (or pretty-serializes, in
	 * XML parlance) it into a string.
	 *
	 * @param doc The document.
	 * @param omitXMLDeclaration True if we should omit the XML declaration, false to keep the XML declaration.
	 * @return The pretty-serialized, stringified, document.
	 */
	public static String serialize(Document doc, boolean omitXMLDeclaration) {
		StringWriter writer = new StringWriter();
		serialize(doc, writer, omitXMLDeclaration);
		return writer.toString();
	}

	/** Serialize an XML DOM document into a String, keeping the XML declaration.
	 *
	 * <p>This method takes a DOM document and pretty-prints (or pretty-serializes, in
	 * XML parlance) it into a string.
	 *
	 * @param doc The document.
	 * @return The pretty-serialized, stringified, document.
	 */
	public static String serialize(Document doc) {
		return serialize(doc, /*omitXMLDeclaration*/false);
	}

	/** Serialize an XML DOM document into a writer.
	 *
	 * <p>This method takes a DOM document and pretty-prints (or pretty-serializes, in
	 * XML parlance) it into a writer.  If the document has a document type, its
	 * public and system identifiers (when present) are carried over to the output.
	 *
	 * @param doc The document.
	 * @param writer Where to write it.
	 * @param omitXMLDeclaration True if we should omit the XML declaration, false to keep the XML declaration.
	 * @throws IllegalStateException If the transformer cannot be created or the transformation fails.
	 */
	public static void serialize(Document doc, Writer writer, boolean omitXMLDeclaration) {
		try {
			Transformer transformer = TransformerFactory.newInstance().newTransformer();

			// A document need not have a DOCTYPE at all (getDoctype() is then null), and
			// either identifier may be missing; only set the properties that exist,
			// since the transformer rejects null property values.
			if (doc.getDoctype() != null) {
				String publicId = doc.getDoctype().getPublicId();
				if (publicId != null) {
					transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, publicId);
				}
				String systemId = doc.getDoctype().getSystemId();
				if (systemId != null) {
					transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, systemId);
				}
			}

			transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXMLDeclaration ? "yes" : "no");
			transformer.setOutputProperty(OutputKeys.INDENT, "yes");

			transformer.transform(new DOMSource(doc), new StreamResult(writer));
		} catch (TransformerConfigurationException ex) {
			throw new IllegalStateException("Unexpected TransformerConfigurationException: " + ex.getMessage(), ex);
		} catch (TransformerException ex) {
			throw new IllegalStateException("Unexpected TransformerException: " + ex.getMessage(), ex);
		}
	}

	/** Serialize an XML DOM document into a writer, keeping the XML declaration.
	 *
	 * <p>This method takes a DOM document and pretty-prints (or pretty-serializes, in
	 * XML parlance) it into a writer.
	 *
	 * @param doc The document.
	 * @param writer Where to write it.
	 * @throws IllegalStateException If the transformer cannot be created or the transformation fails.
	 */
	public static void serialize(Document doc, Writer writer) {
		serialize(doc, writer, /*omitXMLDeclaration*/false);
	}

	/** Parse the given XML document into a DOM tree.
	 *
	 * @param inputSource The XML document to parse.
	 * @return A DOM tree for the given XML document.
	 * @throws SAXException If a parse error occurs.
	 * @throws IOException If an I/O error occurs.
	 */
	public static Document parse(InputSource inputSource) throws SAXException, IOException {
		DOMParser parser = XML.createDOMParser();
		parser.parse(inputSource);
		return parser.getDocument();
	}

	/** Parse the given XML document into a DOM tree.
	 *
	 * @param string The XML document to parse.
	 * @return A DOM tree for the given XML document.
	 * @throws SAXException If a parse error occurs.
	 */
	public static Document parse(String string) throws SAXException {
		StringReader reader = new StringReader(string);
		try {
			return parse(new InputSource(reader));
		} catch (IOException cantHappen) {
			// Reading from an in-memory string should never fail; chain the cause in case it does.
			throw new RuntimeException("I/O exception " + cantHappen.getClass().getName()
				+ " can NOT have happened, yet it did!  Message: " + cantHappen.getMessage(), cantHappen);
		} finally {
			reader.close();
		}
	}

	/** Parse the given XML document into a DOM tree.
	 *
	 * @param reader The XML document to parse.
	 * @return A DOM tree for the given XML document.
	 * @throws SAXException If a parse error occurs.
	 * @throws IOException If an I/O error occurs.
	 */
	public static Document parse(Reader reader) throws SAXException, IOException {
		return parse(new InputSource(reader));
	}

	/** Parse the given XML document into a DOM tree.
	 *
	 * <p>The stream is handed to the parser as a byte stream so that the parser can
	 * determine the character encoding itself (byte-order mark or XML declaration)
	 * instead of decoding with the platform's default charset.
	 *
	 * @param inputStream The XML document to parse.
	 * @return A DOM tree for the given XML document.
	 * @throws SAXException If a parse error occurs.
	 * @throws IOException If an I/O error occurs.
	 */
	public static Document parse(InputStream inputStream) throws SAXException, IOException {
		return parse(new InputSource(inputStream));
	}

	/** Add a repeating child element with text from the given collection to the given node.
	 *
	 * <p>For example, if {@code values} is a {@link java.util.List} with string items
	 * "a", "b", and "c", and {@code name} is "value", then the XML document will
	 * have
	 * <pre>{@code
	 * <value>a</value>
	 * <value>b</value>
	 * <value>c</value>
	 * }</pre>
	 * appended to {@code node}.
	 *
	 * @param node Node to which to add child elements.
	 * @param name Name to give each child element.
	 * @param values Collection of values to assign to each child element, in iterator order.
	 * @throws DOMException If a DOM error occurs.
	 */
	public static void add(Node node, String name, Collection<?> values) throws DOMException {
		for (Object value : values) {
			add(node, name, value);
		}
	}

	/** Add a child element with the given text to the given element, unless the text is null.
	 *
	 * <p>This method modifies your DOM tree so that
	 * <pre>{@code
	 * <node>
	 *   ...
	 * </node>
	 * }</pre>
	 * becomes
	 * <pre>{@code
	 * <node>
	 *   ...
	 *   <name>text</name>
	 * </node>
	 * }</pre>
	 *
	 * <p>Adding a null {@code name} does nothing.  Adding null {@code text}
	 * won't add the element.
	 *
	 * @param node Node to which to add a child element.
	 * @param name Name of the child element to add to {@code node}.
	 * @param text What text the text-node child the child element named {@code name} should have.  If null,
	 * nothing happens.
	 * @throws DOMException If a DOM error occurs.
	 */
	public static void addNonNull(Node node, String name, String text) throws DOMException {
		if (text != null) {
			add(node, name, text);
		}
	}

	/** Add a child element with the given text to the given element.
	 *
	 * <p>This method modifies your DOM tree so that
	 * <pre>{@code
	 * <node>
	 *   ...
	 * </node>
	 * }</pre>
	 * becomes
	 * <pre>{@code
	 * <node>
	 *   ...
	 *   <name>text</name>
	 * </node>
	 * }</pre>
	 *
	 * <p>Adding a null {@code name} does nothing.  Adding null {@code text}
	 * results in an empty {@code name} tag.
	 *
	 * @param node Node to which to add a child element; may itself be a {@link Document}.
	 * @param name Name of the child element to add to {@code node}.
	 * @param text What text the text-node child the child element named {@code name} should have.
	 * @throws IllegalArgumentException If {@code node} is null (and {@code name} is not).
	 * @throws DOMException If a DOM error occurs.
	 */
	public static void add(Node node, String name, String text) throws DOMException {
		if (name == null) {
			return;
		}
		if (node == null) {
			throw new IllegalArgumentException("Can't add to a null node");
		}
		// getOwnerDocument() is null for a Document node, which is its own factory.
		Document doc = node instanceof Document ? (Document) node : node.getOwnerDocument();
		Element element = doc.createElement(name);
		if (text != null) {
			element.appendChild(doc.createTextNode(text));
		}
		node.appendChild(element);
	}

	/** Add a child element with the string representation of the given
	 * object to the given node.
	 *
	 * <p>This method modifies your DOM tree so that
	 * <pre>{@code
	 * <node>
	 *   ...
	 * </node>
	 * }</pre>
	 * becomes
	 * <pre>{@code
	 * <node>
	 *   ...
	 *   <name>string-rep</name>
	 * </node>
	 * }</pre>
	 *
	 * <p>Adding a null {@code name} does nothing.  Adding null {@code object}
	 * results in an empty {@code name} tag.
	 *
	 * @param node Node to which to add a child element.
	 * @param name Name of the child element to add to {@code node}.
	 * @param object The string representation of the object to have as the text-node
	 * child the child element named {@code name}.
	 * @throws DOMException If a DOM error occurs.
	 */
	public static void add(Node node, String name, Object object) throws DOMException {
		add(node, name, object == null ? null : object.toString());
	}

	/** Get unwrapped text from the given DOM node.
	 *
	 * <p>This method unwraps any wrapped text.  For example, if the document contains
	 * <pre>{@code
	 * <node>Hello, world.  This is
	 *    my first document.
	 * </node>
	 * }</pre>
	 * then the node's unwrapped text is
	 * <pre>
	 * Hello, world.  This is my first document.
	 * </pre>
	 * while the {@link #text} method would return the wrapped value
	 * <pre>
	 * Hello, world.  This is
	 *    my first document.
	 * </pre>
	 *
	 * <p>In other words, it collects the text nodes under the given node and replaces
	 * each newline, together with any whitespace that follows it, with a single space;
	 * the result is then trimmed.  Whitespace that does not follow a newline is kept
	 * as is.  Unwrapping a null node returns a null string.
	 *
	 * @param node The node.
	 * @return The text in its children, unwrapped.
	 */
	public static String unwrappedText(Node node) {
		if (node == null) {
			return null;
		}
		String wrapped = text(node);
		StringBuilder unwrapped = new StringBuilder(wrapped.length());
		boolean afterNewline = false;
		for (int i = 0; i < wrapped.length(); ++i) {
			char c = wrapped.charAt(i);
			if (afterNewline) {
				if (Character.isWhitespace(c)) {
					continue;               // Swallow the indentation that follows a line break
				}
				afterNewline = false;
				unwrapped.append(c);
			} else if (c == '\n') {
				afterNewline = true;
				unwrapped.append(' ');          // The line break itself collapses to one space
			} else {
				unwrapped.append(c);
			}
		}
		return unwrapped.toString().trim();
	}

	/** Get the text from the given DOM node.
	 *
	 * <p>The text is the concatenation, in document order, of the values of all text
	 * nodes found under the node, descending through elements and entity references.
	 * Nodes of other types (CDATA sections included) contribute nothing.
	 * Getting text from a null node gives you a null string.
	 *
	 * @param node The node.
	 * @return The text in its children.
	 */
	public static String text(Node node) {
		if (node == null) {
			return null;
		}
		StringBuilder buffer = new StringBuilder();
		collectText(node, buffer);
		return buffer.toString();
	}

	/** Dump the structure of the DOM tree rooted at the given node to the given writer.
	 *
	 * <p>This outputs the tree structure including the type of each node, its name, and
	 * its value.  Note that for many nodes, the name isn't useful (the name of text
	 * nodes, for example, is {@code #text}), and for many nodes, the value is
	 * null.
	 *
	 * @param writer The writer to which write the tree structure.
	 * @param node The tree to output.
	 */
	public static void dump(PrintWriter writer, Node node) {
		dump(writer, node, 0);
	}

	/** Remove all comments from the given document node's subtree.
	 *
	 * @param node Node from which to search for comments to nuke.
	 */
	public static void removeComments(Node node) {
		// Collect first, remove afterwards, so the tree isn't modified while it is being walked.
		List<Node> commentNodes = new ArrayList<Node>();
		findCommentNodes(commentNodes, node);
		for (Node commentNode : commentNodes) {
			Node parent = commentNode.getParentNode();
			if (parent != null) {           // A detached comment has nothing to be removed from
				parent.removeChild(commentNode);
			}
		}
	}

	/** Escapes a string so it may be printed as text content or attribute value. Non
	 * printable characters are escaped using character references.  Where the format
	 * specifies a default entity reference, that reference is used
	 * (e.g. {@code &lt;}).
	 *
	 * <p>Characters outside the Basic Multilingual Plane (surrogate pairs) are written
	 * as a single character reference to their code point.
	 *
	 * <p>Taken from Xerces 1.0.3.  Apache license applies; see source code for
	 * license.
	 *
	 * @param source The string to escape; must not be null.
	 * @return The escaped string
	 */
	public static String escape(String source) {
		StringBuilder result = new StringBuilder(source.length());
		int i = 0;
		while (i < source.length()) {
			// Work on code points so a surrogate pair becomes one reference, not two invalid ones.
			int codePoint = source.codePointAt(i);
			i += Character.charCount(codePoint);

			// Non printables are below ASCII space but not tab or line terminator,
			// ASCII delete, or above a certain Unicode threshold.
			boolean control = codePoint < ' ' && codePoint != '\t' && codePoint != '\n' && codePoint != '\r';
			if (control || codePoint > LAST_PRINTABLE) {
				result.append("&#").append(codePoint).append(';');
			} else {
				// If there is a suitable entity reference for this character, print
				// it; otherwise the character goes out unchanged.
				char ch = (char) codePoint;
				String charRef = getEntityRef(ch);
				if (charRef == null) {
					result.append(ch);
				} else {
					result.append('&').append(charRef).append(';');
				}
			}
		}
		return result.toString();
	}

	/** Find all comment nodes under the given node and add them to the given list.
	 *
	 * @param list List to add to.
	 * @param node Node to search.
	 */
	private static void findCommentNodes(List<Node> list, Node node) {
		if (node.getNodeType() == Node.COMMENT_NODE) {
			list.add(node);
			return;
		}
		NodeList children = node.getChildNodes();
		for (int i = 0; i < children.getLength(); ++i) {
			findCommentNodes(list, children.item(i));
		}
	}

	/** Get the entity reference for the given character.
	 *
	 * <p>Taken from Xerces 1.0.3; see source code for license.
	 *
	 * @param ch The character to look up.
	 * @return The name of the predefined XML entity for the character (without the
	 * leading {@code &} and trailing {@code ;}), or null if it has none.
	 */
	private static String getEntityRef(char ch) {
		// Encode special XML characters into the equivalent character references.
		// These five are defined by default for all XML documents.
		switch (ch) {
			case '<':  return "lt";
			case '>':  return "gt";
			case '"':  return "quot";
			case '\'': return "apos";
			case '&':  return "amp";
			default:   return null;
		}
	}

	/** Append the text found under the given node to the given buffer.
	 *
	 * <p>Elements and entity references are descended into; text nodes contribute
	 * their value; every other node type is skipped.
	 *
	 * @param node The node whose children are examined; must not be null.
	 * @param buffer The buffer to append to.
	 */
	private static void collectText(Node node, StringBuilder buffer) {
		for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
			short type = child.getNodeType();
			if (type == Node.ELEMENT_NODE || type == Node.ENTITY_REFERENCE_NODE) {
				collectText(child, buffer);
			} else if (type == Node.TEXT_NODE) {
				buffer.append(child.getNodeValue());
			}
		}
	}

	/** Dump the structure of the DOM tree rooted at the given node to the given writer,
	 * indenting the contents.
	 *
	 * @param writer The writer to which write the tree structure.
	 * @param node The tree to output.
	 * @param indentAmt The number of spaces to indent the output of this node;
	 * children are indented two more than this amount.
	 */
	private static void dump(PrintWriter writer, Node node, int indentAmt) {
		for (int i = 0; i < indentAmt; ++i) {
			writer.print(' ');
		}
		writer.println(typeOf(node) + "(" + node.getNodeName() + ", " + node.getNodeValue() + ")");
		NodeList children = node.getChildNodes();
		for (int i = 0; i < children.getLength(); ++i) {
			dump(writer, children.item(i), indentAmt + 2);
		}
	}

	/** Return a human-readable representation of the type of the given node.
	 *
	 * <p>For example, an attribute node returns {@code Attribute}, while an element
	 * node returns {@code Element}.
	 *
	 * @param node The node.
	 * @return The name of the node's type.
	 */
	private static String typeOf(Node node) {
		switch (node.getNodeType()) {
			case Node.ATTRIBUTE_NODE:              return "Attribute";
			case Node.CDATA_SECTION_NODE:          return "CDATA-Section";
			case Node.COMMENT_NODE:                return "Comment";
			case Node.DOCUMENT_FRAGMENT_NODE:      return "Document-Fragment";
			case Node.DOCUMENT_NODE:               return "Document";
			case Node.DOCUMENT_TYPE_NODE:          return "Document-Type";
			case Node.ELEMENT_NODE:                return "Element";
			case Node.ENTITY_NODE:                 return "Entity";
			case Node.ENTITY_REFERENCE_NODE:       return "Entity-Ref";
			case Node.NOTATION_NODE:               return "Notation";
			case Node.PROCESSING_INSTRUCTION_NODE: return "Proc-Instr";
			case Node.TEXT_NODE:                   return "Text";
			default:                               return "Unknown!";
		}
	}

	/** Error handler that reports parse problems on the standard error stream and
	 * otherwise lets parsing continue.
	 */
	private static final class StderrErrorHandler implements ErrorHandler {
		/** Whether errors and fatal errors also get their stack trace printed. */
		private final boolean printStackTraces;

		StderrErrorHandler(boolean printStackTraces) {
			this.printStackTraces = printStackTraces;
		}

		@Override
		public void error(SAXParseException ex) {
			System.err.println("Parse error: " + ex.getMessage());
			if (printStackTraces) {
				ex.printStackTrace();
			}
		}

		@Override
		public void warning(SAXParseException ex) {
			System.err.println("Parse warning: " + ex.getMessage());
		}

		@Override
		public void fatalError(SAXParseException ex) {
			System.err.println("Fatal parse error: " + ex.getMessage());
			if (printStackTraces) {
				ex.printStackTrace();
			}
		}
	}
}