
// com.randomnoun.common.XmlUtil (Maven / Gradle / Ivy)
package com.randomnoun.common;
/* (c) 2013 randomnoun. All Rights Reserved. This work is licensed under a
* BSD Simplified License. (http://www.randomnoun.com/bsd-simplified.html)
*/
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.ccil.cowan.tagsoup.*;
import org.ccil.cowan.tagsoup.Parser;
import org.w3c.dom.*;
import org.w3c.dom.Element;
import org.xml.sax.*;
import org.apache.log4j.Logger;
/** XML utility functions
*
* @author knoxg
* @blog http://www.randomnoun.com/wp/2013/01/25/exciting-things-with-xml/
* @version $Id: XmlUtil.java,v 1.5 2013-09-24 02:37:09 knoxg Exp $
*/
public class XmlUtil {
/** A revision marker to be used in exception stack traces.
 * Holds a version-control <code>$Id$</code> keyword string (file name, revision, timestamp, author).
 */
public static final String _revision = "$Id: XmlUtil.java,v 1.5 2013-09-24 02:37:09 knoxg Exp $";
/** Cleans a string of HTML/XML text by passing it through the tagsoup filter.
 *
 * <p>The string is converted to bytes using the platform default charset and then
 * handed to the InputStream-based overload of this method, which does the real work.
 *
 * @param inputXml input XML document; must not be null
 * @param isHtml passed unchanged to the InputStream-based overload
 *
 * @return whatever the InputStream-based overload returns for the encoded input
 *
 * @throws SAXException if the tagsoup library could not parse the input string
 * @throws IllegalStateException if an error occurred reading from a string (should never occur)
 */
public static String getCleanXml(String inputXml, boolean isHtml) throws SAXException {
    byte[] rawBytes = inputXml.getBytes();
    InputStream byteStream = new ByteArrayInputStream(rawBytes);
    return getCleanXml(byteStream, isHtml);
}
/** Cleans an HTML input stream by passing it through the tagsoup filter and returns
 * the text written by the tagsoup XMLWriter.
 *
 * <p>The XML declaration is always omitted from the output, regardless of
 * <code>isHtml</code>.
 *
 * @param inputStream input XML stream
 * @param isHtml if true, installs an HTMLSchema on the parser, switches the writer to
 *   the "html" output method, and maps the schema's namespace URI to the empty prefix
 *
 * @return the serialised output, decoded from the intermediate byte buffer using the
 *   platform default charset
 *
 * @throws SAXException if the tagsoup library could not parse the input
 * @throws IllegalStateException if an IOException occurred while parsing
 */
public static String getCleanXml(InputStream inputStream, boolean isHtml) throws SAXException {
    // serialiser side: tagsoup XMLWriter -> OutputStreamWriter -> in-memory byte buffer
    ByteArrayOutputStream cleanedBytes = new ByteArrayOutputStream();
    XMLWriter serializer = new XMLWriter(new OutputStreamWriter(cleanedBytes));
    serializer.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");

    // parser side
    XMLReader tagsoupParser = new Parser();
    if (isHtml) {
        HTMLSchema htmlSchema = new HTMLSchema();
        tagsoupParser.setProperty(Parser.schemaProperty, htmlSchema);
        serializer.setOutputProperty(XMLWriter.METHOD, "html");
        serializer.setPrefix(htmlSchema.getURI(), "");
    }
    tagsoupParser.setContentHandler(serializer);

    InputSource source = new InputSource(inputStream); // could use raw inputstream here later
    try {
        tagsoupParser.parse(source);
    } catch (IOException ioe) {
        throw new IllegalStateException("IO Exception reading from string", ioe);
    }
    return cleanedBytes.toString();
}
/**
 * Collects the character data found beneath an element.
 *
 * <p>Every Text and CDATA child contributes its data, in document order. Child
 * elements are recursed into, so their text is included as well. All other
 * node types (comments, processing instructions, ...) are skipped.
 *
 * @param element the element whose text is to be returned
 * @return the concatenated Text/CDATA content of the element and its descendant elements
 * @throws NullPointerException if element is null
 */
public static String getText(Element element)
{
    if (element == null) { throw new NullPointerException("null element"); }
    StringBuffer collected = new StringBuffer();
    // walk the sibling chain rather than indexing into the child NodeList
    for (org.w3c.dom.Node kid = element.getFirstChild(); kid != null; kid = kid.getNextSibling()) {
        switch (kid.getNodeType()) {
            case org.w3c.dom.Node.TEXT_NODE:
            case org.w3c.dom.Node.CDATA_SECTION_NODE:
                collected.append(((org.w3c.dom.Text) kid).getData());
                break;
            case org.w3c.dom.Node.ELEMENT_NODE:
                collected.append(getText((Element) kid));
                break;
            default:
                // not text-bearing; skip
                break;
        }
    }
    return collected.toString();
}
/**
 * Iterates through the child nodes of the specified element, and returns the contents
 * of all Text and CDATA nodes among those nodes, concatenated into a string.
 * Any elements with tagNames that are included in the tagNames parameter of this
 * method are also included, as markup wrapped around their own content.
 *
 * <p>Attributes of these tags are also included in the result, but may be reordered.
 * Text and attribute values are copied verbatim; no XML escaping is applied.
 *
 * <p>Self-closing elements (e.g. <code>&lt;br/&gt;</code>)
 * are expanded into opening and closing elements (e.g. <code>&lt;br&gt;&lt;/br&gt;</code>)
 *
 * <p>Elements are recursed into, whether or not their tags are preserved.
 *
 * @param element the element that contains, as child nodes, the text to be returned.
 * @param tagNames the tag names of the elements whose markup is to be kept in the result;
 *   must not be null
 * @return the text content of the element, with the selected descendant tags preserved
 * @throws NullPointerException if element or tagNames is null
 */
public static String getTextPreserveElements(Element element, String[] tagNames) {
    if (element == null) { throw new NullPointerException("null element"); }
    // build the lookup set once, rather than once per recursion level
    Set<String> tagNamesSet = new HashSet<String>(Arrays.asList(tagNames));
    StringBuilder buf = new StringBuilder();
    appendTextPreserveElements(buf, element, tagNamesSet);
    return buf.toString();
}

/** Appends the text of the children of <code>element</code> to <code>buf</code>, emitting
 * opening/closing tags for child elements whose tag name is in <code>tagNamesSet</code>.
 */
private static void appendTextPreserveElements(StringBuilder buf, Element element, Set<String> tagNamesSet) {
    NodeList children = element.getChildNodes();
    for (int i = 0; i < children.getLength(); ++i) {
        org.w3c.dom.Node child = children.item(i);
        short nodeType = child.getNodeType();
        if (nodeType == org.w3c.dom.Node.CDATA_SECTION_NODE || nodeType == org.w3c.dom.Node.TEXT_NODE) {
            buf.append(((org.w3c.dom.Text) child).getData());
        } else if (nodeType == org.w3c.dom.Node.ELEMENT_NODE) {
            Element childElement = (Element) child;
            String tagName = childElement.getTagName();
            boolean includeEl = tagNamesSet.contains(tagName);
            if (includeEl) {
                buf.append('<').append(tagName);
                NamedNodeMap nnm = childElement.getAttributes();
                for (int j = 0; j < nnm.getLength(); j++) {
                    Attr attr = (Attr) nnm.item(j);
                    buf.append(' ').append(attr.getName());
                    if (attr.getValue() != null) {
                        buf.append("=\"").append(attr.getValue()).append('"');
                    }
                }
                buf.append('>');
            }
            appendTextPreserveElements(buf, childElement, tagNamesSet);
            if (includeEl) {
                // always emit an explicit closing tag, so <br/> becomes <br></br>
                buf.append("</").append(tagName).append('>');
            }
        }
    }
}
/**
 * Collects the character data held directly by an element.
 *
 * <p>Only the element's immediate Text and CDATA children contribute to the result;
 * child elements (and anything nested inside them) are ignored.
 *
 * @param element the element whose direct text is to be returned
 * @return the concatenated data of the element's immediate Text and CDATA children
 * @throws NullPointerException if element is null
 */
public static String getTextNonRecursive(Element element)
{
    if (element == null) { throw new NullPointerException("null element"); }
    StringBuffer directText = new StringBuffer();
    NodeList kids = element.getChildNodes();
    int kidCount = kids.getLength();
    for (int idx = 0; idx < kidCount; idx++) {
        org.w3c.dom.Node kid = kids.item(idx);
        short kind = kid.getNodeType();
        boolean isCharacterData = kind == org.w3c.dom.Node.TEXT_NODE
            || kind == org.w3c.dom.Node.CDATA_SECTION_NODE;
        if (!isCharacterData) {
            continue; // child elements and other node types are ignored
        }
        directText.append(((org.w3c.dom.Text) kid).getData());
    }
    return directText.toString();
}
/** Return a DOM document object from an XML string.
 *
 * <p>The text is parsed as a character stream, so the result does not depend on the
 * platform default charset, and any encoding named in the XML declaration is ignored
 * (the text has already been decoded).
 *
 * @param text the string representation of the XML to parse; must not be null
 *
 * @return the parsed document, with its document element normalised
 *
 * @throws SAXException if the text could not be parsed as XML
 * @throws NullPointerException if text is null
 * @throws IllegalStateException if the DOM parser could not be created, or an I/O error occurred
 */
public static Document toDocument(String text) throws SAXException {
    return parseDocument(new InputSource(new StringReader(text)));
}

/** Return a DOM document object from an InputStream.
 *
 * <p>The stream is not closed by this method.
 *
 * @param is the InputStream containing the XML to parse
 *
 * @return the parsed document, with its document element normalised
 *
 * @throws SAXException if the stream contents could not be parsed as XML
 * @throws IllegalArgumentException if is is null
 * @throws IllegalStateException if the DOM parser could not be created, or an I/O error
 *   occurred reading the stream
 */
public static Document toDocument(InputStream is) throws SAXException {
    // same null behaviour as DocumentBuilder.parse(InputStream)
    if (is == null) { throw new IllegalArgumentException("InputStream cannot be null"); }
    return parseDocument(new InputSource(is));
}

/** Parses the supplied source with a default-configured DocumentBuilder and normalises the result. */
private static Document parseDocument(InputSource source) throws SAXException {
    // NOTE(review): the factory is used with default settings, so DTDs and external
    // entities are processed; harden this if untrusted XML can reach these methods.
    try {
        DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
        Document doc = docBuilder.parse(source);
        doc.getDocumentElement().normalize(); // Collapses adjacent text nodes into one node.
        return doc;
    } catch (ParserConfigurationException pce) {
        // not expected for a default-configured factory
        throw (IllegalStateException) new IllegalStateException("Error creating DOM parser").initCause(pce);
    } catch (IOException ioe) {
        // cannot occur for in-memory strings; may occur for arbitrary input streams
        throw (IllegalStateException) new IllegalStateException("Error retrieving information").initCause(ioe);
    }
}
/** Converts a document node subtree back into an XML string.
 *
 * <p>The transformer writes characters straight into a StringWriter, so the result
 * does not depend on the platform default charset.
 *
 * @param node a DOM node
 * @param omitXmlDeclaration if true, omits the XML declaration from the returned result
 *
 * @return the XML for this node
 *
 * @throws TransformerException if the transformation to XML failed
 * @throws IllegalStateException if the transformer could not be initialised
 */
public static String getXmlString(Node node, boolean omitXmlDeclaration)
    throws TransformerException
{
    try {
        StringWriter xmlText = new StringWriter();
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        // a transformer created without a stylesheet copies the source to the result
        Transformer transformer = transformerFactory.newTransformer();
        DOMSource source = new DOMSource(node);
        StreamResult result = new StreamResult(xmlText);
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXmlDeclaration ? "yes" : "no");
        transformer.transform(source, result);
        return xmlText.toString();
    } catch (TransformerConfigurationException tce) {
        throw (IllegalStateException) new IllegalStateException("Could not initialise transfoermer").initCause(tce);
    }
}
/** Remove leading/trailing whitespace from all text nodes in this nodeList.
* Will iterate through subnodes recursively.
*
* @param nodeList
*/
public static void compact(Node node) {
if (node.getNodeType()==Node.TEXT_NODE) {
org.w3c.dom.Text el = (org.w3c.dom.Text) node;
if (el.getNodeValue()!=null) {
el.setNodeValue(el.getNodeValue().trim());
}
} else if (node.getNodeType()==Node.ELEMENT_NODE) {
NodeList childNodes = node.getChildNodes();
if (childNodes != null && childNodes.getLength() > 0) {
int len = childNodes.getLength();
for (int i=0; iThis class will probably not work when tables are embedded within other tables
*/
public static class SimpleTableContentHandler
implements ContentHandler
{
/** Logger instance for this class */
public static final Logger logger = Logger.getLogger(SimpleTableContentHandler.class);
/** Current table */
List> thisTable = null;
/** Current row in table */
List thisRow = null;
/** Current cell in row */
String thisCell = "";
/** The state of this parser */
private enum State {
/** start of doc, expecting 'table' */
START,
/** in table element, expecting 'tr' */
IN_TABLE,
/** in tr element, expecting 'td' (or other ignored elements) */
IN_TR,
/** in td element, capturing to closing tag */
IN_TD
}
State state = State.START;
// unused interface methods
public void setDocumentLocator(Locator locator) { }
public void startDocument() throws SAXException { }
public void endDocument() throws SAXException { }
public void startPrefixMapping(String prefix, String uri) throws SAXException { }
public void endPrefixMapping(String prefix) throws SAXException { }
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
public void processingInstruction(String target, String data) throws SAXException { }
public void skippedEntity(String name) throws SAXException { }
public void startElement(String uri, String localName, String qName, Attributes atts)
throws SAXException
{
switch (state) {
case START:
if (qName.equals("table")) {
thisTable = new ArrayList>();
state = State.IN_TABLE;
} else {
logger.warn("Warning: top-level element '" + qName + "' found (expected 'table')");
}
break;
case IN_TABLE:
if (qName.equals("tr")) {
thisRow = new ArrayList();
thisTable.add(thisRow);
state = State.IN_TR;
}
break;
case IN_TR:
if (qName.equals("td")) {
thisCell = "";
state = State.IN_TD;
}
break;
case IN_TD:
break;
default:
throw new IllegalStateException("Illegal state " + state + " in SimpleTableContentHandler");
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
if (state==State.IN_TD) {
thisCell += new String(ch, start, length);
}
}
public void endElement(String uri, String localName, String qName)
throws SAXException
{
if (state == State.IN_TD && qName.equals("td")) {
thisRow.add(thisCell);
state = State.IN_TR;
} else if (state == State.IN_TR && qName.equals("tr")) {
state = State.IN_TABLE;
}
}
public List> getTable() {
return thisTable;
}
}
/** An abstract stack-based XML parser. Similar to the apache digester, but without
 * the dozen or so dependent JARs.
 *
 * <p>The handler tracks the slash-separated path of qNames from the root element to
 * the element currently being parsed (e.g. <code>a/b/c</code>). Subclasses are told
 * when an element opens via {@link #element(String)}, and when it closes, along with
 * the text captured for it, via {@link #elementText(String, String)}.
 *
 * <p>Only element text is captured.
 *
 * <p>Element attributes are not parsed by this class.
 *
 * <p>Mixed text/element nodes are not parsed by this class: the captured text is reset
 * whenever an element opens or closes.
 */
public abstract static class AbstractStackContentHandler implements ContentHandler
{
    /** Logger instance for this class */
    public static final Logger logger = Logger.getLogger(AbstractStackContentHandler.class);

    /** Slash-separated qNames from the root to the current element; empty outside the root */
    private String currentPath = "";

    /** Text captured since the most recent element start or end */
    private String pendingText = null;

    /** Invoked when an element is opened.
     *
     * @param path slash-separated path of the element that has just been opened
     */
    public abstract void element(String path) throws SAXException;

    /** Invoked when an element is closed.
     *
     * @param path slash-separated path of the element that is being closed
     * @param content the text captured since the last element start or end
     */
    public abstract void elementText(String path, String content) throws SAXException;

    /** Pushes the element onto the path, resets the captured text and notifies the subclass. */
    public void startElement(String uri, String localName, String qName, Attributes atts)
        throws SAXException
    {
        if (currentPath.length() == 0) {
            currentPath = qName;
        } else {
            currentPath = currentPath + "/" + qName;
        }
        pendingText = "";
        element(currentPath);
    }

    /** Accumulates character data for the element currently being parsed. */
    public void characters(char[] ch, int start, int length) throws SAXException {
        pendingText = pendingText + String.valueOf(ch, start, length);
    }

    /** Reports the captured text to the subclass, then pops the element off the path. */
    public void endElement(String uri, String localName, String qName)
        throws SAXException
    {
        elementText(currentPath, pendingText);
        pendingText = ""; // probably not necessary
        int lastSlash = currentPath.lastIndexOf('/');
        currentPath = (lastSlash < 0) ? "" : currentPath.substring(0, lastSlash);
    }

    // unused interface methods
    public void setDocumentLocator(Locator locator) { }
    public void startDocument() throws SAXException { }
    public void endDocument() throws SAXException { }
    public void startPrefixMapping(String prefix, String uri) throws SAXException { }
    public void endPrefixMapping(String prefix) throws SAXException { }
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { }
    public void processingInstruction(String target, String data) throws SAXException { }
    public void skippedEntity(String name) throws SAXException { }
}
/** Convert a NodeList into something that Java1.5 can treat as Iterable,
 * so that it can be used in <code>for (Node node : nodeList) { ... }</code> style
 * constructs.
 *
 * <p>(org.w3c.dom.traversal.NodeListIterator doesn't currently implement Iterable)
 *
 * <p>Each call to {@link #iterator()} returns a fresh iterator that starts at the first
 * node. The iterators read from the wrapped NodeList on every call rather than from a
 * snapshot, and do not support removal.
 */
public static class NodeListIterator implements Iterable<org.w3c.dom.Node> {
    /** The wrapped node list */
    private final NodeList nodeList;

    /** Creates an Iterable view of a node list.
     *
     * @param nodeList the node list to iterate over
     */
    public NodeListIterator(NodeList nodeList) {
        this.nodeList = nodeList;
    }

    /** Returns an iterator over the nodes of the wrapped list, in index order.
     *
     * @return a new iterator positioned before the first node
     */
    public Iterator<org.w3c.dom.Node> iterator() {
        return new Iterator<org.w3c.dom.Node>() {
            /** Index of the node returned by the next call to next() */
            private int index = 0;

            public boolean hasNext() {
                return index < nodeList.getLength();
            }

            public org.w3c.dom.Node next() {
                // honour the Iterator contract instead of returning null past the end
                if (!hasNext()) {
                    throw new java.util.NoSuchElementException("no more nodes in NodeList (index " + index + ")");
                }
                return nodeList.item(index++);
            }

            public void remove() {
                throw new UnsupportedOperationException("remove() not allowed in NodeList");
            }
        };
    }
}
}