org.jsoup.helper.W3CDom Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jsoup Show documentation
jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jquery-like methods. jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers do.
package org.jsoup.helper;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.regex.Pattern;
/**
 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
 * for integration with toolsets that use the W3C DOM.
 */
public class W3CDom {
    /** Factory used by {@link #fromJsoup(org.jsoup.nodes.Document)} to create the output document's builder. */
    protected DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

    /**
     * Convert a jsoup Document to a W3C Document.
     * <p>
     * The input is checked with {@code Validate.notNull}. The shared {@link #factory} is switched to
     * namespace-aware mode on every call before the builder is created.
     *
     * @param in jsoup doc
     * @return w3c doc
     * @throws IllegalStateException if the factory cannot create a {@link DocumentBuilder}
     */
    public Document fromJsoup(org.jsoup.nodes.Document in) {
        Validate.notNull(in);
        try {
            // set the factory to be namespace-aware
            factory.setNamespaceAware(true);
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document out = builder.newDocument();
            convert(in, out);
            return out;
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output document
     * before converting.
     * <p>
     * When the input's location is not blank it is copied to the output as its document URI. The conversion then
     * walks the tree starting at the first child element of {@code in}.
     *
     * @param in jsoup doc
     * @param out w3c doc
     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Document)
     */
    public void convert(org.jsoup.nodes.Document in, Document out) {
        if (!StringUtil.isBlank(in.location()))
            out.setDocumentURI(in.location());

        org.jsoup.nodes.Element rootEl = in.child(0); // skip the #root node
        NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
        traversor.traverse(rootEl);
    }

    /**
     * Implements the conversion by walking the input.
     * <p>
     * Each visited jsoup node is mirrored into the target W3C document; {@code dest} tracks the W3C element that
     * newly created nodes are appended to.
     */
    protected static class W3CBuilder implements NodeVisitor {
        private static final String xmlnsKey = "xmlns";
        private static final String xmlnsPrefix = "xmlns:";

        // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
        // Compiled once here instead of on every attribute copied.
        private static final Pattern attrKeyInvalidChars = Pattern.compile("[^-a-zA-Z0-9_:.]");
        private static final Pattern attrKeyValid = Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

        private final Document doc;
        // NOTE(review): entries are only ever added or overwritten, never removed, so a declaration seen on one
        // element stays visible to every element visited afterwards, not just to its descendants.
        private final HashMap<String, String> namespaces = new HashMap<String, String>(); // prefix => urn
        private Element dest;

        /**
         * Creates a builder that writes into the given W3C document.
         * @param doc the W3C document that receives the converted nodes
         */
        public W3CBuilder(Document doc) {
            this.doc = doc;
        }

        /**
         * Called when a node is first visited. Elements are created with the namespace registered for their tag
         * prefix and become the new append target; text, comment and data nodes are appended to the current target
         * (data nodes are written as text nodes). Any other node type is ignored.
         *
         * @param source the jsoup node being visited
         * @param depth depth of the node (unused here)
         */
        public void head(org.jsoup.nodes.Node source, int depth) {
            if (source instanceof org.jsoup.nodes.Element) {
                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;

                String prefix = updateNamespaces(sourceEl);
                String namespace = namespaces.get(prefix); // null when nothing is registered for the prefix

                Element el = doc.createElementNS(namespace, sourceEl.tagName());
                copyAttributes(sourceEl, el);
                if (dest == null) { // sets up the root
                    doc.appendChild(el);
                } else {
                    dest.appendChild(el);
                }
                dest = el; // descend
            } else if (source instanceof org.jsoup.nodes.TextNode) {
                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
                Text text = doc.createTextNode(sourceText.getWholeText());
                dest.appendChild(text);
            } else if (source instanceof org.jsoup.nodes.Comment) {
                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
                Comment comment = doc.createComment(sourceComment.getData());
                dest.appendChild(comment);
            } else if (source instanceof org.jsoup.nodes.DataNode) {
                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
                Text node = doc.createTextNode(sourceData.getWholeData());
                dest.appendChild(node);
            }
            // any other node type is left unhandled
        }

        /**
         * Called when leaving a node. After an element, the append target moves back up to its parent, unless the
         * parent is not an element (i.e. the target is the root element attached to the document).
         *
         * @param source the jsoup node being left
         * @param depth depth of the node (unused here)
         */
        public void tail(org.jsoup.nodes.Node source, int depth) {
            if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
                dest = (Element) dest.getParentNode(); // undescend. cromulent.
            }
        }

        /**
         * Copies the source node's attributes onto the W3C element. Characters that are not allowed in an XML
         * attribute name are stripped from each key; the attribute is skipped if what remains is still not a
         * valid name (for example, empty or starting with a digit).
         */
        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
            for (Attribute attribute : source.attributes()) {
                String key = attrKeyInvalidChars.matcher(attribute.getKey()).replaceAll("");
                if (attrKeyValid.matcher(key).matches())
                    el.setAttribute(key, attribute.getValue());
            }
        }

        /**
         * Finds any namespaces defined in this element. Returns any tag prefix.
         * <p>
         * Declarations of the form {@code xmlns="..."} are stored under the empty prefix and
         * {@code xmlns:prefix="..."} under {@code prefix}.
         *
         * @return the part of the tag name before the first colon, or the empty string if there is no colon or
         *         the colon is the first character
         */
        private String updateNamespaces(org.jsoup.nodes.Element el) {
            // scan the element for namespace declarations
            // like: xmlns="blah" or xmlns:prefix="blah"
            Attributes attributes = el.attributes();
            for (Attribute attr : attributes) {
                String key = attr.getKey();
                String prefix;
                if (key.equals(xmlnsKey)) {
                    prefix = "";
                } else if (key.startsWith(xmlnsPrefix)) {
                    prefix = key.substring(xmlnsPrefix.length());
                } else {
                    continue;
                }
                namespaces.put(prefix, attr.getValue());
            }

            // get the element prefix if any
            String tagName = el.tagName();
            int pos = tagName.indexOf(":");
            return pos > 0 ? tagName.substring(0, pos) : "";
        }
    }

    /**
     * Serialize a W3C document to a String, using a {@link Transformer} obtained from a new
     * {@link TransformerFactory} with no further configuration.
     *
     * @param doc Document
     * @return Document as string
     * @throws IllegalStateException if the transformer cannot be created or the transformation fails
     */
    public String asString(Document doc) {
        try {
            DOMSource domSource = new DOMSource(doc);
            StringWriter writer = new StringWriter();
            StreamResult result = new StreamResult(writer);
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transformer = tf.newTransformer();
            transformer.transform(domSource, result);
            return writer.toString();
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }
}