com.gargoylesoftware.htmlunit.html.HTMLParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of htmlunit Show documentation
Show all versions of htmlunit Show documentation
A headless browser intended for use in testing web-based applications.
/*
* Copyright (c) 2002-2015 Gargoyle Software Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.gargoylesoftware.htmlunit.html;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.DOCTYPE_IS_COMMENT;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTMLCONDITIONAL_COMMENTS;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTMLIFRAME_IGNORE_SELFCLOSING;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTMLPARSER_REMOVE_EMPTY_CONTENT;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTML_ATTRIBUTE_LOWER_CASE;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTML_CDATA_AS_COMMENT;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.IGNORE_CONTENTS_OF_INNER_HEAD;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.JS_DEFINE_GETTER;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.META_X_UA_COMPATIBLE;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.PAGE_WAIT_LOAD_BEFORE_BODY;
import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.SVG;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.apache.xerces.util.DefaultErrorHandler;
import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParseException;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLEventInfo;
import org.cyberneko.html.HTMLScanner;
import org.cyberneko.html.HTMLTagBalancer;
import org.cyberneko.html.HTMLTagBalancingListener;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ObjectInstantiationException;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.SgmlPage;
import com.gargoylesoftware.htmlunit.WebAssert;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.WebWindow;
import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLBodyElement;
import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLDocument;
import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLElement;
import com.gargoylesoftware.htmlunit.svg.SvgElementFactory;
import net.sourceforge.htmlunit.corejs.javascript.Scriptable;
import net.sourceforge.htmlunit.corejs.javascript.ScriptableObject;
/**
* SAX parser implementation that uses the NekoHTML {@link org.cyberneko.html.HTMLConfiguration}
* to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.
*
* @version $Revision: 10913 $
* @author Christian Sell
* @author David K. Taylor
* @author Chris Erskine
* @author Ahmed Ashour
* @author Marc Guillemot
* @author Ethan Glasser-Camp
* @author Sudhan Moghe
* @author Ronald Brill
* @author Frank Danek
* @author Carsten Steul
*/
public final class HTMLParser {
/** XHTML namespace. */
public static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
/** SVG namespace. */
public static final String SVG_NAMESPACE = "http://www.w3.org/2000/svg";
/**
* The SVG factory.
*/
public static final ElementFactory SVG_FACTORY = new SvgElementFactory();
private static final Map ELEMENT_FACTORIES = new HashMap<>();
static {
ELEMENT_FACTORIES.put(HtmlInput.TAG_NAME, InputElementFactory.instance);
final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
}
}
/**
* You should never need to create one of these!
*/
private HTMLParser() {
// Empty.
}
/**
* Parses the HTML content from the given string into an object tree representation.
*
* @param parent the parent for the new nodes
* @param source the (X)HTML to be parsed
* @throws SAXException if a SAX error occurs
* @throws IOException if an IO error occurs
*/
public static void parseFragment(final DomNode parent, final String source) throws SAXException, IOException {
parseFragment(parent, parent, source);
}
/**
* Parses the HTML content from the given string into an object tree representation.
*
* @param parent where the new parsed nodes will be added to
* @param context the context to build the fragment context stack
* @param source the (X)HTML to be parsed
* @throws SAXException if a SAX error occurs
* @throws IOException if an IO error occurs
*/
public static void parseFragment(final DomNode parent, final DomNode context, final String source)
throws SAXException, IOException {
final HtmlPage page = (HtmlPage) parent.getPage();
final URL url = page.getUrl();
final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(parent, url, source);
domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
// build fragment context stack
DomNode node = context;
final List ancestors = new ArrayList<>();
while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
ancestors.add(0, new QName(null, node.getNodeName(), null, null));
node = node.getParentNode();
}
if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).localpart)) {
ancestors.add(0, new QName(null, "html", null, null));
}
if (ancestors.size() == 1 || !"body".equals(ancestors.get(1).localpart)) {
ancestors.add(1, new QName(null, "body", null, null));
}
domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[] {}));
final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
page.registerParsingStart();
page.registerSnippetParsingStart();
try {
domBuilder.parse(in);
}
finally {
page.registerParsingEnd();
page.registerSnippetParsingEnd();
}
}
/**
* Parses the HTML content from the specified WebResponse into an object tree representation.
*
* @param webResponse the response data
* @param webWindow the web window into which the page is to be loaded
* @return the page object which is the root of the DOM tree
* @throws IOException if there is an IO error
*/
public static HtmlPage parseHtml(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
final HtmlPage page = new HtmlPage(webResponse.getWebRequest().getUrl(), webResponse, webWindow);
parse(webResponse, webWindow, page, false);
return page;
}
/**
* Parses the XHTML content from the specified WebResponse into an object tree representation.
*
* @param webResponse the response data
* @param webWindow the web window into which the page is to be loaded
* @return the page object which is the root of the DOM tree
* @throws IOException if there is an IO error
*/
public static XHtmlPage parseXHtml(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
final XHtmlPage page = new XHtmlPage(webResponse.getWebRequest().getUrl(), webResponse, webWindow);
parse(webResponse, webWindow, page, true);
return page;
}
private static void parse(final WebResponse webResponse, final WebWindow webWindow, final HtmlPage page,
final boolean xhtml)
throws IOException {
webWindow.setEnclosedPage(page);
final URL url = webResponse.getWebRequest().getUrl();
final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(page, url, null);
String charset = webResponse.getContentCharsetOrNull();
try {
// handle charset
if (charset != null) {
domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
}
else {
final String specifiedCharset = webResponse.getWebRequest().getCharset();
if (specifiedCharset != null) {
charset = specifiedCharset;
}
}
// xml content is different
if (xhtml) {
domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
}
}
catch (final Exception e) {
throw new ObjectInstantiationException("Error setting HTML parser feature", e);
}
final InputStream content = webResponse.getContentAsStream();
final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, charset);
page.registerParsingStart();
try {
domBuilder.parse(in);
}
catch (final XNIException e) {
// extract enclosed exception
final Throwable origin = extractNestedException(e);
throw new RuntimeException("Failed parsing content from " + url, origin);
}
finally {
IOUtils.closeQuietly(content);
page.registerParsingEnd();
}
addBodyToPageIfNecessary(page, true, domBuilder.body_ != null);
}
/**
* Adds a body element to the current page, if necessary. Strictly speaking, this should
* probably be done by NekoHTML. See the bug linked below. If and when that bug is fixed,
* we may be able to get rid of this code.
*
* http://sourceforge.net/p/nekohtml/bugs/15/
* @param page
* @param originalCall
* @param checkInsideFrameOnly true if the original page had body that was removed by JavaScript
*/
private static void addBodyToPageIfNecessary(
final HtmlPage page, final boolean originalCall, final boolean checkInsideFrameOnly) {
// IE waits for the whole page to load before initializing bodies for frames.
final boolean waitToLoad = page.hasFeature(PAGE_WAIT_LOAD_BEFORE_BODY);
if (page.getEnclosingWindow() instanceof FrameWindow && originalCall && waitToLoad) {
return;
}
// Find out if the document already has a body element (or frameset).
final Element doc = page.getDocumentElement();
boolean hasBody = false;
for (Node child = doc.getFirstChild(); child != null; child = child.getNextSibling()) {
if (child instanceof HtmlBody || child instanceof HtmlFrameSet) {
hasBody = true;
break;
}
}
// If the document does not have a body, add it.
if (!hasBody && !checkInsideFrameOnly) {
final HtmlBody body = new HtmlBody("body", page, null, false);
doc.appendChild(body);
}
// If this is IE, we need to initialize the bodies of any frames, as well.
// This will already have been done when emulating FF (see above).
if (waitToLoad) {
for (final FrameWindow frame : page.getFrames()) {
final Page containedPage = frame.getEnclosedPage();
if (containedPage != null && containedPage.isHtmlPage()) {
addBodyToPageIfNecessary((HtmlPage) containedPage, false, false);
}
}
}
}
/**
* Extract nested exception within an XNIException (Nekohtml uses reflection and generated
* exceptions are wrapped many times within XNIException and InvocationTargetException)
*
* @param e the original XNIException
* @return the cause exception
*/
static Throwable extractNestedException(final Throwable e) {
Throwable originalException = e;
Throwable cause = ((XNIException) e).getException();
while (cause != null) {
originalException = cause;
if (cause instanceof XNIException) {
cause = ((XNIException) cause).getException();
}
else if (cause instanceof InvocationTargetException) {
cause = cause.getCause();
}
else {
cause = null;
}
}
return originalException;
}
/**
* @param tagName an HTML element tag name
* @return a factory for creating HtmlElements representing the given tag
*/
public static ElementFactory getFactory(final String tagName) {
final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
if (result != null) {
return result;
}
return UnknownElementFactory.instance;
}
/**
* Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
* @param page the page
* @param namespaceURI the namespace URI
* @param qualifiedName the qualified name
* @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
*/
static ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
final String qualifiedName) {
if (SVG_NAMESPACE.equals(namespaceURI) && page.hasFeature(SVG)) {
return SVG_FACTORY;
}
if (namespaceURI == null || namespaceURI.isEmpty()
|| !qualifiedName.contains(":") || namespaceURI.equals(XHTML_NAMESPACE)) {
String tagName = qualifiedName;
final int index = tagName.indexOf(':');
if (index != -1) {
tagName = tagName.substring(index + 1);
}
else {
tagName = tagName.toLowerCase(Locale.ENGLISH);
}
final ElementFactory factory = ELEMENT_FACTORIES.get(tagName);
if (factory != null) {
return factory;
}
}
return UnknownElementFactory.instance;
}
/**
* The parser and DOM builder. This class subclasses Xerces's AbstractSAXParser and implements
* the ContentHandler interface. Thus all parser APIs are kept private. The ContentHandler methods
* consume SAX events to build the page DOM
*/
static final class HtmlUnitDOMBuilder extends AbstractSAXParser
implements ContentHandler, LexicalHandler, HTMLTagBalancingListener {
private enum HeadParsed { YES, SYNTHESIZED, NO };
private final HtmlPage page_;
private Locator locator_;
private final Deque stack_ = new ArrayDeque();
private DomNode currentNode_;
private StringBuilder characters_;
private HeadParsed headParsed_ = HeadParsed.NO;
private boolean parsingInnerHead_ = false;
private HtmlElement head_;
private HtmlElement body_;
private boolean lastTagWasSynthesized_;
private HtmlForm formWaitingForLostChildren_;
private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
private static final String FEATURE_PARSE_NOSCRIPT
= "http://cyberneko.org/html/features/parse-noscript-content";
/**
* Parses and then inserts the specified HTML content into the HTML content currently being parsed.
* @param html the HTML content to push
*/
public void pushInputString(final String html) {
page_.registerParsingStart();
page_.registerInlineSnippetParsingStart();
try {
final WebResponse webResponse = page_.getWebResponse();
final String charset = webResponse.getContentCharset();
final String url = webResponse.getWebRequest().getUrl().toString();
final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset);
((HTMLConfiguration) fConfiguration).evaluateInputSource(in);
}
finally {
page_.registerParsingEnd();
page_.registerInlineSnippetParsingEnd();
}
}
/**
* Creates a new builder for parsing the specified response contents.
* @param node the location at which to insert the new content
* @param url the page's URL
*/
private HtmlUnitDOMBuilder(final DomNode node, final URL url, final String htmlContent) {
super(createConfiguration(node.getPage().getWebClient()));
page_ = (HtmlPage) node.getPage();
currentNode_ = node;
for (final Node ancestor : currentNode_.getAncestors(true)) {
stack_.push((DomNode) ancestor);
}
final WebClient webClient = page_.getWebClient();
final HTMLParserListener listener = webClient.getHTMLParserListener();
final boolean reportErrors;
if (listener != null) {
reportErrors = true;
fConfiguration.setErrorHandler(new HTMLErrorHandler(listener, url, htmlContent));
}
else {
reportErrors = false;
}
try {
setFeature(FEATURE_AUGMENTATIONS, true);
setProperty("http://cyberneko.org/html/properties/names/elems", "default");
if (!webClient.getBrowserVersion().hasFeature(HTML_ATTRIBUTE_LOWER_CASE)) {
setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
}
setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.getOptions().isJavaScriptEnabled());
setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME,
!webClient.getBrowserVersion().hasFeature(HTMLIFRAME_IGNORE_SELFCLOSING));
setContentHandler(this);
setLexicalHandler(this); //comments and CDATA
}
catch (final SAXException e) {
throw new ObjectInstantiationException("unable to create HTML parser", e);
}
}
/**
* Create the configuration depending on the simulated browser
* @param webClient the current WebClient
* @return the configuration
*/
private static XMLParserConfiguration createConfiguration(final WebClient webClient) {
final BrowserVersion browserVersion = webClient.getBrowserVersion();
// for IE we need a special scanner that will be able to understand conditional comments
if (browserVersion.hasFeature(HTMLCONDITIONAL_COMMENTS)) {
return new HTMLConfiguration() {
@Override
protected HTMLScanner createDocumentScanner() {
return new HTMLScannerForIE(browserVersion);
}
};
}
return new HTMLConfiguration();
}
/**
* @return the document locator
*/
public Locator getLocator() {
return locator_;
}
/** {@inheritDoc ContentHandler#setDocumentLocator} */
@Override
public void setDocumentLocator(final Locator locator) {
locator_ = locator;
}
/** {@inheritDoc ContentHandler#startDocument()} */
@Override
public void startDocument() throws SAXException {
}
/** {@inheritDoc} */
@Override
public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
throws XNIException {
// augs might change so we store only the interesting part
lastTagWasSynthesized_ = isSynthesized(augs);
super.startElement(element, attributes, augs);
}
/** {@inheritDoc ContentHandler#startElement(String,String,String,Attributes)} */
@Override
public void startElement(
String namespaceURI, final String localName,
final String qName, final Attributes atts)
throws SAXException {
handleCharacters();
final String tagLower = localName.toLowerCase(Locale.ENGLISH);
if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
return;
}
if (parsingInnerHead_ && page_.hasFeature(IGNORE_CONTENTS_OF_INNER_HEAD)) {
return;
}
if (namespaceURI != null) {
namespaceURI = namespaceURI.trim();
}
if ("head".equals(tagLower)) {
if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
parsingInnerHead_ = true;
return;
}
headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
}
// add a head if none was there
else if (headParsed_ == HeadParsed.NO && ("body".equals(tagLower) || "frameset".equals(tagLower))) {
final ElementFactory factory = getElementFactory(page_, namespaceURI, "head");
final DomElement newElement = factory.createElement(page_, "head", null);
currentNode_.appendChild(newElement);
headParsed_ = HeadParsed.SYNTHESIZED;
}
// If we're adding a body element, keep track of any temporary synthetic ones
// that we may have had to create earlier (for document.write(), for example).
HtmlBody oldBody = null;
if ("body".equals(qName) && page_.getBody() instanceof HtmlBody) {
oldBody = (HtmlBody) page_.getBody();
}
// Need to reset this at each starting form tag because it could be set from a synthesized
// end tag.
if ("form".equals(tagLower)) {
formWaitingForLostChildren_ = null;
}
// Add the new node.
if (!(page_ instanceof XHtmlPage) && XHTML_NAMESPACE.equals(namespaceURI)) {
namespaceURI = null;
}
final ElementFactory factory = getElementFactory(page_, namespaceURI, qName);
final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts, true);
newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
// parse can't replace everything as it does not buffer elements while parsing
addNodeToRightParent(currentNode_, newElement);
// If we had an old synthetic body and we just added a real body element, quietly
// remove the old body and move its children to the real body element we just added.
if (oldBody != null) {
oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
}
if ("body".equals(tagLower)) {
body_ = (HtmlElement) newElement;
}
else if ("head".equals(tagLower)) {
head_ = (HtmlElement) newElement;
}
else if ("html".equals(tagLower)) {
if (!page_.hasFeature(JS_DEFINE_GETTER) && page_.isQuirksMode()) {
// this is not really correct; a following meta tag may disable the quirks
// mode; but at the moment i have no idea for a better place for this
removePrototypeProperties((Scriptable) page_.getEnclosingWindow().getScriptObject(), "Array",
"every", "filter", "forEach", "indexOf", "lastIndexOf", "map", "reduce",
"reduceRight", "some");
}
}
else if ("meta".equals(tagLower)) {
// i like the IE
if (page_.hasFeature(META_X_UA_COMPATIBLE)) {
final HtmlMeta meta = (HtmlMeta) newElement;
if ("X-UA-Compatible".equals(meta.getHttpEquivAttribute())) {
final String content = meta.getContentAttribute();
if (content.startsWith("IE=")) {
final String mode = content.substring(3).trim();
final int version = (int) page_.getWebClient().getBrowserVersion().
getBrowserVersionNumeric();
if ("edge".equals(mode)) {
((HTMLDocument) page_.getScriptObject()).forceDocumentMode(version);
}
else {
try {
int value = Integer.parseInt(mode);
if (value > version) {
value = version;
}
((HTMLDocument) page_.getScriptObject()).forceDocumentMode(value);
}
catch (final Exception e) {
// ignore
}
}
}
}
}
}
currentNode_ = newElement;
stack_.push(currentNode_);
}
/**
* Removes prototype properties.
* @param scope the scope
* @param className the class for which properties should be removed
* @param properties the properties to remove
*/
private void removePrototypeProperties(final Scriptable scope, final String className,
final String... properties) {
final ScriptableObject prototype = (ScriptableObject) ScriptableObject.getClassPrototype(scope, className);
for (final String property : properties) {
prototype.delete(property);
}
}
/**
* Adds the new node to the right parent that is not necessary the currentNode in case of
* malformed HTML code. The method tries to emulate the behaviour of Firefox.
*/
private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
final String currentNodeName = currentNode.getNodeName();
final String newNodeName = newElement.getNodeName();
DomNode parent = currentNode;
// If the new node is a table element and the current node isn't one search the stack for the
// correct parent.
if ("tr".equals(newNodeName) && !isTableChild(currentNodeName)) {
parent = findElementOnStack("tbody", "thead", "tfoot");
}
else if (isTableChild(newNodeName) && !"table".equals(currentNodeName)) {
parent = findElementOnStack("table");
}
else if (isTableCell(newNodeName) && !"tr".equals(currentNodeName)) {
parent = findElementOnStack("tr");
}
// If the parent changed and the old parent was a form it is now waiting for lost children.
if (parent != currentNode && "form".equals(currentNodeName)) {
formWaitingForLostChildren_ = (HtmlForm) currentNode;
}
final String parentNodeName = parent.getNodeName();
if (("table".equals(parentNodeName) && !isTableChild(newNodeName))
|| (isTableChild(parentNodeName) && !"caption".equals(parentNodeName)
&& !"colgroup".equals(parentNodeName) && !"tr".equals(newNodeName))
|| ("colgroup".equals(parentNodeName) && !"col".equals(newNodeName))
|| ("tr".equals(parentNodeName) && !isTableCell(newNodeName))) {
// If its a form or submittable just add it even though the resulting DOM is incorrect.
// Otherwise insert the element before the table.
if ("form".equals(newNodeName)) {
formWaitingForLostChildren_ = (HtmlForm) newElement;
parent.appendChild(newElement);
}
else if (newElement instanceof SubmittableElement) {
if (formWaitingForLostChildren_ != null) {
formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
}
parent.appendChild(newElement);
}
else {
parent = findElementOnStack("table");
parent.insertBefore(newElement);
}
}
else if (head_ != null && "title".equals(newNodeName) && !parsingInnerHead_) {
head_.appendChild(newElement);
}
else if (formWaitingForLostChildren_ != null && "form".equals(parentNodeName)) {
// Do not append any children to invalid form. Submittable are inserted after the form,
// everything else before the table.
if (newElement instanceof SubmittableElement) {
formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
parent.getParentNode().appendChild(newElement);
}
else {
parent = findElementOnStack("table");
parent.insertBefore(newElement);
}
}
else if (formWaitingForLostChildren_ != null && newElement instanceof SubmittableElement) {
formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
parent.appendChild(newElement);
}
else {
parent.appendChild(newElement);
}
}
private DomNode findElementOnStack(final String... searchedElementNames) {
DomNode searchedNode = null;
for (final DomNode node : stack_) {
if (ArrayUtils.contains(searchedElementNames, node.getNodeName())) {
searchedNode = node;
break;
}
}
if (searchedNode == null) {
searchedNode = stack_.peek(); // this is surely wrong but at least it won't throw a NPE
}
return searchedNode;
}
private boolean isTableChild(final String nodeName) {
return "thead".equals(nodeName) || "tbody".equals(nodeName)
|| "tfoot".equals(nodeName) || "caption".equals(nodeName)
|| "colgroup".equals(nodeName);
}
private boolean isTableCell(final String nodeName) {
return "td".equals(nodeName) || "th".equals(nodeName);
}
/** {@inheritDoc} */
@Override
public void endElement(final QName element, final Augmentations augs)
throws XNIException {
// augs might change so we store only the interesting part
lastTagWasSynthesized_ = isSynthesized(augs);
super.endElement(element, augs);
}
/** {@inheritDoc ContentHandler@endElement(String,String,String)} */
@Override
public void endElement(final String namespaceURI, final String localName, final String qName)
throws SAXException {
handleCharacters();
final String tagLower = localName.toLowerCase(Locale.ENGLISH);
if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
return;
}
if (parsingInnerHead_) {
if ("head".equals(tagLower)) {
parsingInnerHead_ = false;
}
if ("head".equals(tagLower) || page_.hasFeature(IGNORE_CONTENTS_OF_INNER_HEAD)) {
return;
}
}
// Need to reset this at each closing form tag because a valid form could start afterwards.
if ("form".equals(tagLower)) {
formWaitingForLostChildren_ = null;
}
final DomNode previousNode = stack_.pop(); //remove currentElement from stack
previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
// special handling for form lost children (malformed HTML code where is synthesized)
if (previousNode instanceof HtmlForm && lastTagWasSynthesized_) {
formWaitingForLostChildren_ = (HtmlForm) previousNode;
}
if (!stack_.isEmpty()) {
currentNode_ = stack_.peek();
}
final boolean postponed = page_.isParsingInlineHtmlSnippet();
previousNode.onAllChildrenAddedToPage(postponed);
}
/** {@inheritDoc} */
@Override
public void characters(final char[] ch, final int start, final int length) throws SAXException {
if ((characters_ == null || characters_.length() == 0)
&& page_.hasFeature(HTMLPARSER_REMOVE_EMPTY_CONTENT)
&& StringUtils.isBlank(new String(ch, start, length))) {
DomNode node = currentNode_.getLastChild();
if (currentNode_ instanceof HTMLElement.ProxyDomNode) {
final HTMLElement.ProxyDomNode proxyNode = (HTMLElement.ProxyDomNode) currentNode_;
node = proxyNode.getDomNode();
if (!proxyNode.isAppend()) {
node = node.getPreviousSibling();
if (node == null) {
node = proxyNode.getDomNode().getParentNode();
}
}
}
if (removeEmptyCharacters(node)) {
return;
}
}
if (characters_ == null) {
characters_ = new StringBuilder();
}
characters_.append(ch, start, length);
}
private boolean removeEmptyCharacters(final DomNode node) {
if (node != null) {
if (node instanceof HtmlInput) {
return false;
}
if (node.getFirstChild() != null
&& (node instanceof HtmlAnchor || node instanceof HtmlSpan
|| node instanceof HtmlFont
|| node instanceof HtmlStrong || node instanceof HtmlBold
|| node instanceof HtmlItalic || node instanceof HtmlUnderlined
|| node instanceof HtmlEmphasis
|| node instanceof HtmlAbbreviated || node instanceof HtmlAcronym
|| node instanceof HtmlBaseFont || node instanceof HtmlBidirectionalOverride
|| node instanceof HtmlBig || node instanceof HtmlBlink
|| node instanceof HtmlCitation || node instanceof HtmlCode
|| node instanceof HtmlDeletedText || node instanceof HtmlDefinition
|| node instanceof HtmlInsertedText || node instanceof HtmlKeyboard
|| node instanceof HtmlLabel || node instanceof HtmlMap
|| node instanceof HtmlNoBreak || node instanceof HtmlInlineQuotation
|| node instanceof HtmlS || node instanceof HtmlSample
|| node instanceof HtmlSmall || node instanceof HtmlStrike
|| node instanceof HtmlSubscript || node instanceof HtmlSuperscript
|| node instanceof HtmlTeletype || node instanceof HtmlVariable
)) {
return false;
}
}
else {
if (currentNode_ instanceof HtmlFont) {
return false;
}
}
return true;
}
/** {@inheritDoc} */
@Override
public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
if (characters_ == null) {
characters_ = new StringBuilder();
}
characters_.append(ch, start, length);
}
/**
* Picks up the character data accumulated so far and add it to the current element as a text node.
*/
private void handleCharacters() {
if (characters_ != null && characters_.length() != 0) {
if (currentNode_ instanceof HtmlHtml) {
// In HTML, the node only has two possible children:
// the and the ; any text is ignored.
characters_.setLength(0);
}
else {
// Use the normal behavior: append a text node for the accumulated text.
final String textValue = characters_.toString();
final DomText text = new DomText(page_, textValue);
characters_.setLength(0);
// malformed HTML: some text => text comes before the table
if (currentNode_ instanceof HtmlTableRow && StringUtils.isNotBlank(textValue)) {
final HtmlTableRow row = (HtmlTableRow) currentNode_;
final HtmlTable enclosingTable = row.getEnclosingTable();
if (enclosingTable != null) { // may be null when called from Range.createContextualFragment
enclosingTable.insertBefore(text);
}
}
else {
currentNode_.appendChild(text);
}
}
}
}
/** {@inheritDoc} */
@Override
public void endDocument() throws SAXException {
handleCharacters();
final DomNode currentPage = page_;
currentPage.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
}
/** {@inheritDoc} */
@Override
public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
}
/** {@inheritDoc} */
@Override
public void endPrefixMapping(final String prefix) throws SAXException {
}
/** {@inheritDoc} */
@Override
public void processingInstruction(final String target, final String data) throws SAXException {
}
/** {@inheritDoc} */
@Override
public void skippedEntity(final String name) throws SAXException {
}
// LexicalHandler methods
/** {@inheritDoc} */
@Override
public void comment(final char[] ch, final int start, final int length) {
handleCharacters();
final String data = new String(ch, start, length);
if (!data.startsWith("[CDATA")
|| page_.hasFeature(HTML_CDATA_AS_COMMENT)) {
final DomComment comment = new DomComment(page_, data);
currentNode_.appendChild(comment);
}
}
/** {@inheritDoc} */
@Override
public void endCDATA() {
}
/** {@inheritDoc} */
@Override
public void endDTD() {
}
/** {@inheritDoc} */
@Override
public void endEntity(final String name) {
}
/** {@inheritDoc} */
@Override
public void startCDATA() {
}
/** {@inheritDoc} */
@Override
public void startDTD(final String name, final String publicId, final String systemId) {
final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
page_.setDocumentType(type);
final Node child;
if (page_.hasFeature(DOCTYPE_IS_COMMENT)) {
child = new DomComment(page_, "DOCTYPE " + name + " PUBLIC \""
+ publicId + "\" \"" + systemId + '"');
}
else {
child = type;
}
page_.appendChild(child);
}
/** {@inheritDoc} */
@Override
public void startEntity(final String name) {
}
/**
* {@inheritDoc}
*/
@Override
public void ignoredEndElement(final QName element, final Augmentations augs) {
// if real is reached, don't accept fields anymore as lost children
if ("form".equals(element.localpart)) {
formWaitingForLostChildren_ = null;
}
if (parsingInnerHead_ && "head".equalsIgnoreCase(element.localpart)) {
parsingInnerHead_ = false;
}
}
/**
* {@inheritDoc}
*/
@Override
public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
// when multiple body elements are encountered, the attributes of the discarded
// elements are used when not previously defined
if (body_ != null && "body".equalsIgnoreCase(elem.localpart) && attrs != null) {
// add the attributes that don't already exist
final int length = attrs.getLength();
for (int i = 0; i < length; ++i) {
final String attrName = attrs.getLocalName(i).toLowerCase(Locale.ENGLISH);
if (body_.getAttributes().getNamedItem(attrName) == null) {
body_.setAttribute(attrName, attrs.getValue(i));
if (attrName.startsWith("on") && body_.getScriptObject() != null) {
final HTMLBodyElement jsBody = (HTMLBodyElement) body_.getScriptObject();
jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
}
}
}
}
if (headParsed_ == HeadParsed.YES && "head".equalsIgnoreCase(elem.localpart)) {
parsingInnerHead_ = true;
}
}
/**
* {@inheritDoc}
*/
@Override
public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
final HtmlUnitDOMBuilder oldBuilder = page_.getBuilder();
page_.setBuilder(this);
try {
super.parse(inputSource);
}
finally {
page_.setBuilder(oldBuilder);
}
}
private boolean isSynthesized(final Augmentations augs) {
final HTMLEventInfo info = (augs == null) ? null
: (HTMLEventInfo) augs.getItem(FEATURE_AUGMENTATIONS);
return info != null ? info.isSynthesized() : false;
}
}
}
/**
* Utility to transmit parsing errors to a {@link HTMLParserListener}.
*/
class HTMLErrorHandler extends DefaultErrorHandler {
private final HTMLParserListener listener_;
private final URL url_;
private String html_;
HTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
WebAssert.notNull("listener", listener);
WebAssert.notNull("url", url);
listener_ = listener;
url_ = url;
html_ = htmlContent;
}
/** @see DefaultErrorHandler#error(String,String,XMLParseException) */
@Override
public void error(final String domain, final String key,
final XMLParseException exception) throws XNIException {
listener_.error(exception.getMessage(),
url_,
html_,
exception.getLineNumber(),
exception.getColumnNumber(),
key);
}
/** @see DefaultErrorHandler#warning(String,String,XMLParseException) */
@Override
public void warning(final String domain, final String key,
final XMLParseException exception) throws XNIException {
listener_.warning(exception.getMessage(),
url_,
html_,
exception.getLineNumber(),
exception.getColumnNumber(),
key);
}
}
class HTMLScannerForIE extends org.cyberneko.html.HTMLScanner {
HTMLScannerForIE(final BrowserVersion browserVersion) {
fContentScanner = new ContentScannerForIE(browserVersion);
}
class ContentScannerForIE extends HTMLScanner.ContentScanner {
private final BrowserVersion browserVersion_;
ContentScannerForIE(final BrowserVersion browserVersion) {
browserVersion_ = browserVersion;
}
@Override
protected void scanComment() throws IOException {
final String s = nextContent(30); // [if ...
if (s.startsWith("[if ") && s.contains("]>")) {
final String condition = StringUtils.substringBefore(s.substring(4), "]>");
try {
if (IEConditionalCommentExpressionEvaluator.evaluate(condition, browserVersion_)) {
// skip until ">"
for (int i = 0; i < condition.length() + 6; ++i) {
read();
}
if (s.contains("]>")) {
skip("", false);
}
else if (s.contains("]>-->")) {
skip("-->", false);
}
}
else {
final StringBuilder builder = new StringBuilder();
while (!builder.toString().endsWith("-->")) {
builder.append((char) read());
}
}
return;
}
catch (final Exception e) { // incorrect expression => handle it as plain text
// TODO: report it!
final XMLStringBuffer buffer = new XMLStringBuffer("");
fDocumentHandler.characters(buffer, locationAugs());
return;
}
}
// this is a normal comment, not a conditional comment for IE
super.scanComment();
}
@Override
public String nextContent(final int len) throws IOException {
return super.nextContent(len);
}
@Override
public boolean scanMarkupContent(final XMLStringBuffer buffer, final char cend) throws IOException {
return super.scanMarkupContent(buffer, cend);
}
}
@Override
protected boolean skipMarkup(final boolean balance) throws IOException {
final ContentScannerForIE contentScanner = (ContentScannerForIE) fContentScanner;
final String s = contentScanner.nextContent(30);
if (s.startsWith("[if ") && s.contains("]>")) {
final String condition = StringUtils.substringBefore(s.substring(4), "]>");
try {
if (IEConditionalCommentExpressionEvaluator.evaluate(condition, contentScanner.browserVersion_)) {
// skip until ">"
for (int i = 0; i < condition.length() + 6; ++i) {
read();
}
return true;
}
final XMLStringBuffer buffer = new XMLStringBuffer();
int ch;
while ((ch = read()) != -1) {
buffer.append((char) ch);
if (buffer.toString().endsWith("")) {
final XMLStringBuffer trimmedBuffer
= new XMLStringBuffer(buffer.ch, 0, buffer.length - 3);
fDocumentHandler.comment(trimmedBuffer, locationAugs());
return true;
}
}
}
catch (final Exception e) { // incorrect expression => handle it as plain text
// TODO: report it!
final XMLStringBuffer buffer = new XMLStringBuffer("");
fDocumentHandler.characters(buffer, locationAugs());
return true;
}
}
return super.skipMarkup(balance);
}
}