All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.randomnoun.common.XmlUtil Maven / Gradle / Ivy

There is a newer version: 1.0.28
Show newest version
package com.randomnoun.common;

/* (c) 2013 randomnoun. All Rights Reserved. This work is licensed under a
 * BSD Simplified License. (http://www.randomnoun.com/bsd-simplified.html)
 */

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.ccil.cowan.tagsoup.*;
import org.ccil.cowan.tagsoup.Parser;

import org.w3c.dom.*;
import org.w3c.dom.Element;
import org.xml.sax.*;

import org.apache.log4j.Logger;

/** XML utility functions
 *
 * @author knoxg
 * @blog http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/
 * @version $Id: XmlUtil.java,v 1.5 2013-09-24 02:37:09 knoxg Exp $
 */
public class XmlUtil {
	
    /** A revision marker to be used in exception stack traces. */
    public static final String _revision = "$Id: XmlUtil.java,v 1.5 2013-09-24 02:37:09 knoxg Exp $";


	/** Clean some HTML text through the tagsoup filter. The returned string is guaranteed to be 
	 * well-formed XML (and can therefore be used by other tools that expect valid XML). 
	 * 
	 * @param inputXml input XML document
	 * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method
	 * 
	 * @throws SAXException if the tagsoup library could not parse the input string
	 * @throws IllegalStateException if an error occurred reading from a string (should never occur)
	 */ 
	public static String getCleanXml(String inputXml, boolean isHtml) throws SAXException {
		return getCleanXml(new ByteArrayInputStream(inputXml.getBytes()), isHtml);
	}
	
	/** Clean a HTML inputStream through the tagsoup filter. The returned string is guaranteed to be 
	 * well-formed XML (and can therefore be used by other tools that expect valid XML). 
	 * 
	 * @param is input XML stream
	 * @param isHtml if true, uses the HTML schema, omits the XML declaration, and uses the html method
	 * 
	 * @throws SAXException if the tagsoup library could not parse the input string
	 * @throws IllegalStateException if an error occurred reading from a string (should never occur)
	 */ 
	public static String getCleanXml(InputStream inputStream, boolean isHtml) throws SAXException {
		try {
			ByteArrayOutputStream baos = new ByteArrayOutputStream();
			InputSource is = new InputSource();
			is.setByteStream(inputStream); // could use raw inputstream here later

			XMLReader xmlReader = new Parser();
			Writer w = new OutputStreamWriter(baos);
			XMLWriter tagsoupXMLWriter = new XMLWriter(w);
			tagsoupXMLWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
			if (isHtml) {
				HTMLSchema theSchema = new HTMLSchema();
				xmlReader.setProperty(Parser.schemaProperty, theSchema);
	
				tagsoupXMLWriter.setOutputProperty(XMLWriter.METHOD, "html");
				tagsoupXMLWriter.setPrefix(theSchema.getURI(), "");
			}
			
			xmlReader.setContentHandler(tagsoupXMLWriter);
			xmlReader.parse(is);
			return baos.toString();
		} catch (IOException ioe) {
			throw (IllegalStateException) new IllegalStateException("IO Exception reading from string").initCause(ioe);		
		}
	}


	/**
	 * Iterates through the child nodes of the specified element, and returns the contents
	 * of all Text and CDATA elements among those nodes, concatenated into a string.
	 *
	 * 

Elements are recursed into. * * @param element the element that contains, as child nodes, the text to be returned. * @return the contents of all the CDATA children of the specified element. */ public static String getText(Element element) { if (element == null) { throw new NullPointerException("null element"); } StringBuffer buf = new StringBuffer(); NodeList children = element.getChildNodes(); for (int i = 0; i < children.getLength(); ++i) { org.w3c.dom.Node child = children.item(i); short nodeType = child.getNodeType(); if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) { buf.append(((org.w3c.dom.Text) child).getData()); } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) { buf.append(((org.w3c.dom.Text) child).getData()); } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) { buf.append(getText((Element) child)); } } return buf.toString(); } /** * Iterates through the child nodes of the specified element, and returns the contents * of all Text and CDATA elements among those nodes, concatenated into a string. * Any elements with tagNames that are included in the tagNames parameter of this * method are also included. * *

Attributes of these tags are also included in the result, but may be reordered. * *

Self-closing elements (e.g. <br/>) * are expanded into opening and closing elements (e.g. <br></br>) * *

Elements are recursed into. * * @param element the element that contains, as child nodes, the text to be returned. * @return the contents of all the CDATA children of the specified element. */ public static String getTextPreserveElements(Element element, String[] tagNames) { if (element == null) { throw new NullPointerException("null element"); } Set tagNamesSet = new HashSet(Arrays.asList(tagNames)); StringBuffer buf = new StringBuffer(); NodeList children = element.getChildNodes(); for (int i = 0; i < children.getLength(); ++i) { org.w3c.dom.Node child = children.item(i); short nodeType = child.getNodeType(); if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) { buf.append(((org.w3c.dom.Text) child).getData()); } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) { buf.append(((org.w3c.dom.Text) child).getData()); } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) { String tagName = ((Element) child).getTagName(); boolean includeEl = tagNamesSet.contains(tagName); if (includeEl) { buf.append('<'); buf.append(tagName); NamedNodeMap nnm = ((Element) child).getAttributes(); for (int j = 0; j < nnm.getLength(); j++) { Attr attr = (Attr) nnm.item(j); buf.append(" " + attr.getName()); if (attr.getValue()!=null) { buf.append("=\"" + attr.getValue() + "\""); } } buf.append('>'); } buf.append(getTextPreserveElements((Element) child, tagNames)); if (includeEl) { buf.append(""); } } } return buf.toString(); } /** * Iterates through the child nodes of the specified element, and returns the contents * of all Text and CDATA elements among those nodes, concatenated into a string. * *

Elements are not recursed into. * * @param element the element that contains, as child nodes, the text to be returned. * @return the contents of all the CDATA children of the specified element. */ public static String getTextNonRecursive(Element element) { if (element == null) { throw new NullPointerException("null element"); } StringBuffer buf = new StringBuffer(); NodeList children = element.getChildNodes(); for (int i = 0; i < children.getLength(); ++i) { org.w3c.dom.Node child = children.item(i); short nodeType = child.getNodeType(); if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE) { buf.append(((org.w3c.dom.Text) child).getData()); } else if (nodeType == org.w3c.dom.Node.TEXT_NODE) { buf.append(((org.w3c.dom.Text) child).getData()); } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) { // ignore child elements } } return buf.toString(); } /** Return a DOM document object from an XML string * * @param text the string representation of the XML to parse */ public static Document toDocument(String text) throws SAXException { return toDocument(new ByteArrayInputStream(text.getBytes())); } /** Return a DOM document object from an InputStream * * @param is the InputStream containing the XML to parse */ public static Document toDocument(InputStream is) throws SAXException { try { DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); Document doc = docBuilder.parse(is); doc.getDocumentElement().normalize(); // Collapses adjacent text nodes into one node. 
return doc; } catch (ParserConfigurationException pce) { // this can never happen throw (IllegalStateException) new IllegalStateException("Error creating DOM parser").initCause(pce); } catch (IOException ioe) { // this can also never happen throw (IllegalStateException) new IllegalStateException("Error retrieving information").initCause(ioe); } } /** Converts a document node subtree back into an XML string * * @param node a DOM node * @param omitXmlDeclaration if true, omits the XML declaration from the returned result * * @return the XML for this node * * @throws TransformerException if the transformation to XML failed * @throws IllegalStateException if the transformer could not be initialised */ public static String getXmlString(Node node, boolean omitXmlDeclaration) throws TransformerException { try { ByteArrayOutputStream baos = new ByteArrayOutputStream(); TransformerFactory transformerFactory = TransformerFactory.newInstance(); Transformer transformer = transformerFactory.newTransformer(); DOMSource source = new DOMSource(node); StreamResult result = new StreamResult(baos); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDeclaration ? "yes": "no"); transformer.transform(source, result); return baos.toString(); } catch (TransformerConfigurationException tce) { throw (IllegalStateException) new IllegalStateException("Could not initialise transfoermer").initCause(tce); } } /** Remove leading/trailing whitespace from all text nodes in this nodeList. * Will iterate through subnodes recursively. 
* * @param nodeList */ public static void compact(Node node) { if (node.getNodeType()==Node.TEXT_NODE) { org.w3c.dom.Text el = (org.w3c.dom.Text) node; if (el.getNodeValue()!=null) { el.setNodeValue(el.getNodeValue().trim()); } } else if (node.getNodeType()==Node.ELEMENT_NODE) { NodeList childNodes = node.getChildNodes(); if (childNodes != null && childNodes.getLength() > 0) { int len = childNodes.getLength(); for (int i=0; iThis class will probably not work when tables are embedded within other tables */ public static class SimpleTableContentHandler implements ContentHandler { /** Logger instance for this class */ public static final Logger logger = Logger.getLogger(SimpleTableContentHandler.class); /** Current table */ List> thisTable = null; /** Current row in table */ List thisRow = null; /** Current cell in row */ String thisCell = ""; /** The state of this parser */ private enum State { /** start of doc, expecting 'table' */ START, /** in table element, expecting 'tr' */ IN_TABLE, /** in tr element, expecting 'td' (or other ignored elements) */ IN_TR, /** in td element, capturing to closing tag */ IN_TD } State state = State.START; // unused interface methods public void setDocumentLocator(Locator locator) { } public void startDocument() throws SAXException { } public void endDocument() throws SAXException { } public void startPrefixMapping(String prefix, String uri) throws SAXException { } public void endPrefixMapping(String prefix) throws SAXException { } public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void skippedEntity(String name) throws SAXException { } public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { switch (state) { case START: if (qName.equals("table")) { thisTable = new ArrayList>(); state = State.IN_TABLE; } else { logger.warn("Warning: top-level element '" 
+ qName + "' found (expected 'table')"); } break; case IN_TABLE: if (qName.equals("tr")) { thisRow = new ArrayList(); thisTable.add(thisRow); state = State.IN_TR; } break; case IN_TR: if (qName.equals("td")) { thisCell = ""; state = State.IN_TD; } break; case IN_TD: break; default: throw new IllegalStateException("Illegal state " + state + " in SimpleTableContentHandler"); } } public void characters(char[] ch, int start, int length) throws SAXException { if (state==State.IN_TD) { thisCell += new String(ch, start, length); } } public void endElement(String uri, String localName, String qName) throws SAXException { if (state == State.IN_TD && qName.equals("td")) { thisRow.add(thisCell); state = State.IN_TR; } else if (state == State.IN_TR && qName.equals("tr")) { state = State.IN_TABLE; } } public List> getTable() { return thisTable; } } /** An abstract stack-based XML parser. Similar to the apache digester, but without * the dozen or so dependent JARs. * *

Only element text is captured *

Element attributes are not parsed by this class. *

Mixed text/element nodes are not parsed by this class. * */ public abstract static class AbstractStackContentHandler implements ContentHandler { /** Logger instance for this class */ public static final Logger logger = Logger.getLogger(AbstractStackContentHandler.class); /** Location in stack */ private String stack = ""; private String text = null; // text captured so far // unused interface methods public void setDocumentLocator(Locator locator) { } public void startDocument() throws SAXException { } public void endDocument() throws SAXException { } public void startPrefixMapping(String prefix, String uri) throws SAXException { } public void endPrefixMapping(String prefix) throws SAXException { } public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void skippedEntity(String name) throws SAXException { } public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { stack = stack.equals("") ? qName : stack + "/" + qName; text = ""; element(stack); } public void characters(char[] ch, int start, int length) throws SAXException { text += new String(ch, start, length); } public void endElement(String uri, String localName, String qName) throws SAXException { elementText(stack, text); text = ""; // probably not necessary stack = stack.contains("/") ? stack.substring(0, stack.lastIndexOf("/")) : ""; } public abstract void element(String path) throws SAXException; public abstract void elementText(String path, String content) throws SAXException; } /** Convert a NodeList into something that Java1.5 can treat as Iterable, * so that it can be used in for (Node node : nodeList) { ... } style * constructs. * *

(org.w3c.dom.traversal.NodeListIterator doesn't currently implement Iterable) * */ public static class NodeListIterator implements Iterable { private final NodeList nodeList; public NodeListIterator(NodeList nodeList) { this.nodeList = nodeList; } public Iterator iterator() { return new Iterator() { private int index = 0; public boolean hasNext() { return index < nodeList.getLength(); } public org.w3c.dom.Node next() { return nodeList.item(index++); } public void remove() { throw new UnsupportedOperationException("remove() not allowed in NodeList"); } }; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy