com.googlecode.gwt.test.internal.utils.GwtHtmlParser Maven / Gradle / Ivy
package com.googlecode.gwt.test.internal.utils;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.NodeList;
import com.googlecode.gwt.test.exceptions.GwtTestPatchException;
import com.googlecode.gwt.test.internal.AfterTestCallback;
import com.googlecode.gwt.test.internal.AfterTestCallbackManager;
import com.googlecode.html.filters.DefaultFilter;
import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.StringReader;
import java.util.Collections;
/**
* HTML parser used by gwt-test-utils. It relies on a repackaged NekoHTML SAX
* parser. For internal use only.
*
* @author Gael Lazzari
*/
public class GwtHtmlParser implements AfterTestCallback {

    /**
     * Filter which keeps the literal {@code &nbsp;} text in the parsed output instead of letting
     * the parser replace the entity with the character it expands to.
     * <p>
     * When the {@code nbsp} general entity starts, the six characters {@code &nbsp;} are emitted
     * downstream as character data, and the characters reported by the parser until the entity
     * ends are dropped. All other entities are passed through untouched.
     */
    private static class NbspRemover extends DefaultFilter {

        /** Name under which the parser reports the non-breaking-space entity. */
        private static final String NBSP_ENTITY_NAME = "nbsp";

        /** Text emitted downstream in place of the expanded entity. */
        private static final String NBSP_ENTITY_TEXT = "&nbsp;";

        /** Reusable XNI string holding {@link #NBSP_ENTITY_TEXT}. */
        private final XMLString nbspXMLString;

        /** {@code true} between the start and the end of an {@code nbsp} entity reference. */
        private boolean inNbspEntityRef;

        private NbspRemover() {
            char[] entityChars = NBSP_ENTITY_TEXT.toCharArray();
            nbspXMLString = new XMLString();
            nbspXMLString.setValues(entityChars, 0, entityChars.length);
        }

        /**
         * Forwards character data downstream, except for the characters reported while inside an
         * {@code nbsp} entity reference (the replacement text has already been emitted).
         */
        @Override
        public void characters(XMLString text, Augmentations augs) throws XNIException {
            if (!inNbspEntityRef) {
                super.characters(text, augs);
            }
        }

        /**
         * Ends the current entity reference. The end of an {@code nbsp} entity is swallowed,
         * because its start was never forwarded; the end of any other entity is forwarded so that
         * downstream handlers receive balanced start/end notifications.
         */
        @Override
        public void endGeneralEntity(String name, Augmentations augs) throws XNIException {
            // The flag is cleared unconditionally, whatever the entity name is.
            inNbspEntityRef = false;
            if (!NBSP_ENTITY_NAME.equals(name)) {
                super.endGeneralEntity(name, augs);
            }
        }

        /** Resets the filter state at the beginning of every document. */
        @Override
        public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
                throws XNIException {
            super.startDocument(locator, encoding, augs);
            inNbspEntityRef = false;
        }

        /**
         * Replaces the start of an {@code nbsp} entity with the literal {@code &nbsp;} text and
         * starts suppressing the expanded characters; any other entity start is forwarded as is.
         */
        @Override
        public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding,
                Augmentations augs) throws XNIException {
            if (NBSP_ENTITY_NAME.equals(name)) {
                inNbspEntityRef = true;
                super.characters(nbspXMLString, augs);
            } else {
                super.startGeneralEntity(name, id, encoding, augs);
            }
        }
    }

    /** Single shared instance; it registers itself as an after-test callback on creation. */
    private static final GwtHtmlParser INSTANCE = new GwtHtmlParser();

    /**
     * Parses an HTML fragment.
     *
     * @param html the HTML text to parse; may be {@code null} or blank
     * @return the parsed nodes, or an empty node list when {@code html} is {@code null} or
     *         contains only whitespace
     * @throws GwtTestPatchException if the underlying parser fails
     */
    public static NodeList parse(String html) {
        return INSTANCE.parseInternal(html);
    }

    /** Lazily created reader, reused between parses until {@link #afterTest()} discards it. */
    private XMLReader reader;

    private GwtHtmlParser() {
        AfterTestCallbackManager.get().registerCallback(this);
    }

    /**
     * Discards the cached reader so that the next parse builds and configures a new one.
     */
    public void afterTest() throws Throwable {
        reader = null;
    }

    /**
     * Returns the cached reader, creating and configuring it on first use.
     *
     * @return the configured reader
     * @throws SAXException if the reader cannot be created or rejects a feature/property
     */
    private XMLReader getXMLReader() throws SAXException {
        if (reader == null) {
            reader = XMLReaderFactory.createXMLReader("com.googlecode.html.parsers.SAXParser");
            // FIXME : this feature does not work with the NekoHTML version included in gwt-dev.jar
            // (1.9.13) that's why we had to copy neko 1.9.15 sources in gwt-test-utils
            reader.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
                    true);
            // Required by NbspRemover: entity references must be reported as entity events.
            reader.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
            reader.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");
            XMLDocumentFilter[] filters = {new NbspRemover()};
            reader.setProperty("http://cyberneko.org/html/properties/filters", filters);
        }
        return reader;
    }

    /**
     * Instance-level implementation of {@link #parse(String)}.
     *
     * @param html the HTML text to parse; may be {@code null} or blank
     * @return the parsed nodes, or an empty node list for {@code null}/blank input
     * @throws GwtTestPatchException wrapping any exception raised while parsing
     */
    private NodeList parseInternal(String html) {
        if (html == null || html.trim().length() == 0) {
            return JsoUtils.newNodeList(Collections.emptyList());
        }
        try {
            XMLReader xmlReader = getXMLReader();
            // A fresh handler per parse: it accumulates the nodes of this fragment only.
            GwtHtmlContentHandler contentHandler = new GwtHtmlContentHandler();
            xmlReader.setContentHandler(contentHandler);
            xmlReader.parse(new InputSource(new StringReader(html)));
            return contentHandler.getParsedNodes();
        } catch (Exception e) {
            throw new GwtTestPatchException("Error while parsing HTML '" + html + "'", e);
        }
    }
}