org.htmlunit.html.parser.neko.HtmlUnitNekoDOMBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of xlt Show documentation
Show all versions of xlt Show documentation
XLT (Xceptance LoadTest) is an extensive load and performance test tool developed and maintained by Xceptance.
/*
* Copyright (c) 2002-2024 Gargoyle Software Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.htmlunit.html.parser.neko;
import static org.htmlunit.BrowserVersionFeatures.HTML_ATTRIBUTE_LOWER_CASE;
import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
import static org.htmlunit.BrowserVersionFeatures.HTML_ISINDEX_TAG;
import static org.htmlunit.BrowserVersionFeatures.HTML_MAIN_TAG;
import static org.htmlunit.BrowserVersionFeatures.META_X_UA_COMPATIBLE;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Locale;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.tuple.Triple;
import org.htmlunit.BrowserVersion;
import org.htmlunit.ObjectInstantiationException;
import org.htmlunit.WebClient;
import org.htmlunit.WebResponse;
import org.htmlunit.cyberneko.HTMLConfiguration;
import org.htmlunit.cyberneko.HTMLElements;
import org.htmlunit.cyberneko.HTMLEventInfo;
import org.htmlunit.cyberneko.HTMLScanner;
import org.htmlunit.cyberneko.HTMLTagBalancingListener;
import org.htmlunit.cyberneko.util.FastHashMap;
import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
import org.htmlunit.cyberneko.xerces.xni.Augmentations;
import org.htmlunit.cyberneko.xerces.xni.QName;
import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
import org.htmlunit.cyberneko.xerces.xni.XMLString;
import org.htmlunit.cyberneko.xerces.xni.XNIException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
import org.htmlunit.html.DomComment;
import org.htmlunit.html.DomDocumentType;
import org.htmlunit.html.DomElement;
import org.htmlunit.html.DomNode;
import org.htmlunit.html.DomText;
import org.htmlunit.html.ElementFactory;
import org.htmlunit.html.Html;
import org.htmlunit.html.HtmlBody;
import org.htmlunit.html.HtmlElement;
import org.htmlunit.html.HtmlForm;
import org.htmlunit.html.HtmlHiddenInput;
import org.htmlunit.html.HtmlHtml;
import org.htmlunit.html.HtmlImage;
import org.htmlunit.html.HtmlMeta;
import org.htmlunit.html.HtmlPage;
import org.htmlunit.html.HtmlSvg;
import org.htmlunit.html.HtmlTable;
import org.htmlunit.html.HtmlTableRow;
import org.htmlunit.html.HtmlTemplate;
import org.htmlunit.html.ScriptElement;
import org.htmlunit.html.SubmittableElement;
import org.htmlunit.html.XHtmlPage;
import org.htmlunit.html.parser.HTMLParser;
import org.htmlunit.html.parser.HTMLParserDOMBuilder;
import org.htmlunit.html.parser.HTMLParserListener;
import org.htmlunit.javascript.host.html.HTMLBodyElement;
import org.htmlunit.javascript.host.html.HTMLDocument;
import org.htmlunit.util.StringUtils;
import org.w3c.dom.Node;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
/**
* INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.
*
* The parser and DOM builder. This class subclasses Xerces's AbstractSAXParser and implements
* the ContentHandler interface. Thus all parser APIs are kept private. The ContentHandler methods
* consume SAX events to build the page DOM
*
* @author Christian Sell
* @author David K. Taylor
* @author Chris Erskine
* @author Ahmed Ashour
* @author Marc Guillemot
* @author Ethan Glasser-Camp
* @author Sudhan Moghe
* @author Ronald Brill
* @author Frank Danek
* @author Carsten Steul
* @author Ronny Shapiro
* @author Atsushi Nakagawa
*/
final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
// Cache of Neko element tables for performance and memory efficiency.
// The key holds the browser features in the order
// (HTML_COMMAND_TAG, HTML_ISINDEX_TAG, HTML_MAIN_TAG), matching the lookup in createConfiguration().
private static final FastHashMap<Triple<Boolean, Boolean, Boolean>, HTMLElements>
        HTMLELEMENTS_CACHE = new FastHashMap<>();

static {
    // continue short code enumeration
    final short isIndexShortCode = HTMLElements.UNKNOWN + 1;
    final short commandShortCode = isIndexShortCode + 1;
    final short mainShortCode = commandShortCode + 1;

    // isIndex is special - we have to add it here because all browsers move it to
    // the body (even if it is not supported)
    final HTMLElements.Element isIndex = new HTMLElements.Element(isIndexShortCode, "ISINDEX",
            HTMLElements.Element.CONTAINER, HTMLElements.BODY, null);
    final HTMLElements.Element isIndexSupported = new HTMLElements.Element(isIndexShortCode, "ISINDEX",
            HTMLElements.Element.BLOCK, HTMLElements.BODY, new short[] {isIndexShortCode});
    final HTMLElements.Element command = new HTMLElements.Element(commandShortCode, "COMMAND",
            HTMLElements.Element.EMPTY, new short[] {HTMLElements.BODY, HTMLElements.HEAD}, null);
    final HTMLElements.Element main = new HTMLElements.Element(mainShortCode, "MAIN",
            HTMLElements.Element.INLINE, HTMLElements.BODY, null);

    // !COMMAND_TAG !ISINDEX_TAG !MAIN_TAG
    cacheElements(false, false, false, isIndex);
    // !COMMAND_TAG !ISINDEX_TAG MAIN_TAG
    cacheElements(false, false, true, main, isIndex);
    // !COMMAND_TAG ISINDEX_TAG !MAIN_TAG
    cacheElements(false, true, false, isIndexSupported);
    // !COMMAND_TAG ISINDEX_TAG MAIN_TAG
    cacheElements(false, true, true, isIndexSupported, main);
    // COMMAND_TAG !ISINDEX_TAG !MAIN_TAG
    cacheElements(true, false, false, command, isIndex);
    // COMMAND_TAG !ISINDEX_TAG MAIN_TAG
    cacheElements(true, false, true, command, isIndex, main);
    // COMMAND_TAG ISINDEX_TAG !MAIN_TAG
    cacheElements(true, true, false, command, isIndexSupported);
    // COMMAND_TAG ISINDEX_TAG MAIN_TAG
    cacheElements(true, true, true, command, isIndexSupported, main);
}

/**
 * Builds a fresh {@link HTMLElements} table, registers the given additional elements in the
 * given order and stores the table in the cache under the feature combination.
 *
 * @param commandTag whether the key stands for browsers with HTML_COMMAND_TAG
 * @param isIndexTag whether the key stands for browsers with HTML_ISINDEX_TAG
 * @param mainTag whether the key stands for browsers with HTML_MAIN_TAG
 * @param additionalElements the elements to register on top of the defaults
 */
private static void cacheElements(final boolean commandTag, final boolean isIndexTag, final boolean mainTag,
        final HTMLElements.Element... additionalElements) {
    final HTMLElements elements = new HTMLElements();
    for (final HTMLElements.Element element : additionalElements) {
        elements.setElement(element);
    }

    final Triple<Boolean, Boolean, Boolean> key = Triple.of(commandTag, isIndexTag, mainTag);
    HTMLELEMENTS_CACHE.put(key, elements);
}
/** Tells whether a head start tag was really parsed, synthesized by the parser, or not seen so far. */
private enum HeadParsed { YES, SYNTHESIZED, NO }

private final HTMLParser htmlParser_;
private final HtmlPage page_;
private Locator locator_;

/** The nodes that are currently open; seeded with the ancestors of the insertion node. */
private final Deque<DomNode> stack_ = new ArrayDeque<>();

/** Did the snippet try to overwrite the start node? */
private boolean snippetStartNodeOverwritten_;

/** The size of the stack right after it was seeded in the constructor. */
private final int initialSize_;
private DomNode currentNode_;
private final boolean createdByJavascript_;
private final XMLString characters_ = new XMLString();
private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
private HtmlElement body_;
private boolean lastTagWasSynthesized_;

/** The most recently started form; submittable elements created afterwards get it as owning form. */
private HtmlForm consumingForm_;
private boolean formEndingIsAdjusting_;

/** Becomes true as soon as a svg element was added to the DOM. */
private boolean insideSvg_;

private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
private static final String FEATURE_PARSE_NOSCRIPT
    = "http://cyberneko.org/html/features/parse-noscript-content";
/**
 * Parses the given HTML snippet and feeds the result into the content that is currently
 * being parsed. The page is told about the start and the end of the (inline snippet) parsing,
 * even if the evaluation fails.
 *
 * @param html the HTML content to push
 */
@Override
public void pushInputString(final String html) {
    page_.registerParsingStart();
    page_.registerInlineSnippetParsingStart();
    try {
        final WebResponse response = page_.getWebResponse();
        final Charset contentCharset = response.getContentCharset();
        final String systemId = response.getWebRequest().getUrl().toString();

        final StringReader reader = new StringReader(html);
        final XMLInputSource source = new XMLInputSource(null, systemId, null, reader, contentCharset.name());

        final HTMLConfiguration configuration = (HTMLConfiguration) parserConfiguration_;
        configuration.evaluateInputSource(source);
    }
    finally {
        page_.registerParsingEnd();
        page_.registerInlineSnippetParsingEnd();
    }
}
/**
 * Creates a new builder that inserts the parsed content at the given node.
 *
 * @param htmlParser the parser that provides the element factories
 * @param node the location at which to insert the new content
 * @param url the page's URL
 * @param htmlContent the HTML content; handed to the error handler if a parser listener is registered
 * @param createdByJavascript if true the (script) tag was created by javascript
 */
HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
        final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
    super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));

    htmlParser_ = htmlParser;
    page_ = (HtmlPage) node.getPage();
    createdByJavascript_ = createdByJavascript;

    // start at the insertion point and remember all its ancestors as already open nodes
    currentNode_ = node;
    for (final Node ancestor : node.getAncestors()) {
        stack_.push((DomNode) ancestor);
    }
    initialSize_ = stack_.size();

    final WebClient client = page_.getWebClient();
    final HTMLParserListener parserListener = client.getHTMLParserListener();
    final boolean reportErrors = parserListener != null;
    if (reportErrors) {
        final HtmlUnitNekoHTMLErrorHandler errorHandler =
                new HtmlUnitNekoHTMLErrorHandler(parserListener, url, htmlContent);
        parserConfiguration_.setErrorHandler(errorHandler);
    }

    try {
        setFeature(FEATURE_AUGMENTATIONS, true);

        final boolean lowerCaseAttributes = client.getBrowserVersion().hasFeature(HTML_ATTRIBUTE_LOWER_CASE);
        if (!lowerCaseAttributes) {
            setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
        }

        setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
        setFeature(FEATURE_PARSE_NOSCRIPT, !client.isJavaScriptEnabled());
        setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);

        setContentHandler(this);
        // the lexical handler receives comments and CDATA
        setLexicalHandler(this);
    }
    catch (final SAXException e) {
        throw new ObjectInstantiationException("unable to create HTML parser", e);
    }
}
/**
 * Creates the parser configuration for the simulated browser. The element table is taken
 * from the cache, selected by the browser's support for the command, isindex and main tags.
 *
 * @param browserVersion the simulated browser
 * @return the configuration
 */
private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
    final boolean commandTag = browserVersion.hasFeature(HTML_COMMAND_TAG);
    final boolean isIndexTag = browserVersion.hasFeature(HTML_ISINDEX_TAG);
    final boolean mainTag = browserVersion.hasFeature(HTML_MAIN_TAG);

    final HTMLElements cachedElements = HTMLELEMENTS_CACHE.get(Triple.of(commandTag, isIndexTag, mainTag));
    return new HTMLConfiguration(cachedElements);
}
/**
 * {@inheritDoc}
 *
 * The locator is kept to be able to record the start location of created elements.
 */
@Override
public void setDocumentLocator(final Locator locator) {
    this.locator_ = locator;
}
/**
 * {@inheritDoc}
 *
 * This implementation is intentionally empty.
 */
@Override
public void startDocument() throws SAXException {
    // no preparation required at the start of the document
}
/**
 * {@inheritDoc}
 *
 * Remembers whether the tag was synthesized before the event is passed on to the super class.
 */
@Override
public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
    throws XNIException {
    // the augmentations might change later on, therefore only the interesting part is stored
    final boolean synthesized = isSynthesized(augs);
    lastTagWasSynthesized_ = synthesized;

    super.startElement(element, attributes, augs);
}
/**
* {@inheritDoc}
*
* Creates the DOM element for the reported start tag using the element factory provided by
* the parser, records its start location, attaches it to the tree (see addNodeToRightParent)
* and makes it the current node. Along the way the svg, form, body and head state of this
* builder is updated.
*
* html/body/head start tags found while parsing a snippet (and head start tags found after a
* head was already parsed) do not create an element; only the current node is pushed onto the
* stack once more.
*/
@Override
public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
throws SAXException {
// the flag is consumed here: this start tag is dropped and the flag is reset
if (snippetStartNodeOverwritten_) {
snippetStartNodeOverwritten_ = false;
return;
}
// NOTE(review): handleCharacters() is defined outside of this view; presumably it flushes
// the character data collected so far - confirm
handleCharacters();
final String tagLower = org.htmlunit.util.StringUtils.toRootLowerCase(localName);
if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
// we have to push the current node on the stack to make sure
// the endElement call is able to remove a node from the stack
stack_.push(currentNode_);
return;
}
if ("head".equals(tagLower)) {
if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
// we have to push the current node on the stack to make sure
// the endElement call is able to remove a node from the stack
stack_.push(currentNode_);
return;
}
// remember whether this head came from the source or was synthesized by the parser
headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
}
if (namespaceURI != null) {
namespaceURI = namespaceURI.trim();
}
// If we're adding a body element, keep track of any temporary synthetic ones
// that we may have had to create earlier (for document.write(), for example).
HtmlBody oldBody = null;
if ("body".equals(qName) && page_.getBody() instanceof HtmlBody) {
oldBody = (HtmlBody) page_.getBody();
}
// Add the new node.
// The XHTML namespace is only kept for XHtmlPages.
if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
namespaceURI = null;
}
final ElementFactory factory =
htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
// elements created by the svg factory always get the svg namespace
if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
namespaceURI = Html.SVG_NAMESPACE;
}
final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts, true);
newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
// parse can't replace everything as it does not buffer elements while parsing
addNodeToRightParent(currentNode_, newElement);
if (newElement instanceof HtmlSvg) {
insideSvg_ = true;
}
// Forms own elements simply by enclosing source-wise rather than DOM parent-child relationship
// Forms without a closing tag will keep consuming forever
if (newElement instanceof HtmlForm) {
consumingForm_ = (HtmlForm) newElement;
formEndingIsAdjusting_ = false;
}
else if (consumingForm_ != null) {
// If the current form enclosed a suitable element
if (newElement instanceof SubmittableElement) {
// Let these be owned by the form
if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
((HtmlElement) newElement).setOwningForm(consumingForm_);
}
}
}
// If we had an old synthetic body and we just added a real body element, quietly
// remove the old body and move its children to the real body element we just added.
if (oldBody != null) {
oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
}
if (!insideSvg_ && "body".equals(tagLower)) {
body_ = (HtmlElement) newElement;
}
else if (newElement instanceof HtmlMeta && page_.hasFeature(META_X_UA_COMPATIBLE)) {
// <meta http-equiv="X-UA-Compatible" content="IE=n"> forces the document mode;
// the requested value is capped at the numeric version of the simulated browser
final HtmlMeta meta = (HtmlMeta) newElement;
if ("X-UA-Compatible".equals(meta.getHttpEquivAttribute())) {
final String content = meta.getContentAttribute();
if (content.startsWith("IE=")) {
final String mode = content.substring(3).trim();
final int version = page_.getWebClient().getBrowserVersion().getBrowserVersionNumeric();
try {
int value = Integer.parseInt(mode);
if (value > version) {
value = version;
}
((HTMLDocument) page_.getScriptableObject()).forceDocumentMode(value);
}
catch (final Exception e) {
// deliberately ignored: any failure while parsing or applying the mode
// (not only a NumberFormatException) is swallowed here
}
}
}
}
else if (createdByJavascript_ && newElement instanceof ScriptElement) {
final ScriptElement script = (ScriptElement) newElement;
script.markAsCreatedByDomParser();
}
// the new element is the parent candidate for everything that follows
currentNode_ = newElement;
stack_.push(currentNode_);
}
/**
* Adds the new node to the right parent, which is not necessarily the currentNode in case of
* malformed HTML code. The method tries to emulate the behavior of Firefox.
*/
private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
final String currentNodeName = currentNode.getNodeName();
final String newNodeName = newElement.getNodeName();
// First ensure table elements are housed correctly
if (isTableChild(newNodeName)) {
final DomNode parent =
"table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
appendChild(parent, newElement);
return;
}
if ("tr".equals(newNodeName)) {
final DomNode parent =
isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
appendChild(parent, newElement);
return;
}
if (isTableCell(newNodeName)) {
final DomNode parent =
"tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
appendChild(parent, newElement);
return;
}
// Next ensure non-table elements don't appear in tables
if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
if ("template".equals(newNodeName)) {
currentNode.appendChild(newElement);
}
// Scripts, forms, and styles are exempt
else if (!"colgroup".equals(currentNodeName)
&& ("script".equals(newNodeName)
|| "form".equals(newNodeName)
|| "style".equals(newNodeName))) {
currentNode.appendChild(newElement);
}
// These are good
else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
currentNode.appendChild(newElement);
}
else if ("caption".equals(currentNodeName)) {
currentNode.appendChild(newElement);
}
else if (newElement instanceof HtmlHiddenInput) {
currentNode.appendChild(newElement);
}
else {
// Move before the table
final DomNode parent = findElementOnStack("table");
parent.insertBefore(newElement);
}
return;
}
if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
// We cater to HTMLTagBalancer's shortcomings by moving this node out of the was on the same DOM tree depth as the was found in the source on a different
// DOM tree depth (either above or below) to the just goes missing for these (really? just tables?)
}
else {
/*
* This was ignored by HTMLTagBalancer as it generates its own
* at the end of the depth with the starting
*
* but this isn't suitable for us because shouldn't be ignored but
* rather moved directly behind the tree it's in to instead become:
* |
* |
*/
// We cater for this by moving out nodes such as the in the above
// diagram out of the form
formEndingIsAdjusting_ = true;
}
}
}
/**
 * {@inheritDoc}
 *
 * When multiple html/body elements are encountered, the attributes of the discarded
 * elements are copied to the body (or to the parent of the body for html) as long as
 * the attribute is not already defined there.
 */
@Override
public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
    if (attrs == null || body_ == null) {
        return;
    }

    final String localPart = elem.getLocalpart();
    // both tag names of interest have exactly four characters
    if (localPart == null || localPart.length() != 4) {
        return;
    }

    switch (localPart.toLowerCase(Locale.ROOT)) {
        case "body":
            copyAttributes(body_, attrs);
            break;

        case "html":
            copyAttributes((DomElement) body_.getParentNode(), attrs);
            break;

        default:
            break;
    }
}
/**
 * Copies all attributes the target element does not define so far; the attribute names are
 * converted to lower case. For copied attributes starting with "on" an event handler is
 * created as well, if the JavaScript engine is enabled and the scriptable object of the
 * target is a {@link HTMLBodyElement}.
 *
 * @param to the element receiving the attributes
 * @param attrs the attributes to copy
 */
private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
    final int count = attrs.getLength();
    for (int index = 0; index < count; index++) {
        final String name = StringUtils.toRootLowerCase(attrs.getLocalName(index));
        if (to.getAttributes().getNamedItem(name) != null) {
            // already defined attributes are never overwritten
            continue;
        }

        to.setAttribute(name, attrs.getValue(index));

        if (!name.startsWith("on")) {
            continue;
        }
        if (!to.getPage().getWebClient().isJavaScriptEngineEnabled()) {
            continue;
        }
        if (to.getScriptableObject() instanceof HTMLBodyElement) {
            final HTMLBodyElement jsBody = to.getScriptableObject();
            jsBody.createEventHandlerFromAttribute(name, attrs.getValue(index));
        }
    }
}
/**
 * {@inheritDoc}
 *
 * While parsing, this builder is registered as DOM builder of the page; the builder
 * registered before is restored afterwards, also in case of a failure.
 */
@Override
public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
    final HTMLParserDOMBuilder previousBuilder = page_.getDOMBuilder();

    page_.setDOMBuilder(this);
    try {
        super.parse(inputSource);
    }
    finally {
        page_.setDOMBuilder(previousBuilder);
    }
}
/**
 * @return the body element created by this builder, or null if none was created so far
 */
HtmlElement getBody() {
    return this.body_;
}
/**
 * @param augs the augmentations of a parser event, might be null
 * @return true if the augmentations are a {@link HTMLEventInfo} reporting a synthesized event
 */
private static boolean isSynthesized(final Augmentations augs) {
    if (!(augs instanceof HTMLEventInfo)) {
        return false;
    }

    final HTMLEventInfo eventInfo = (HTMLEventInfo) augs;
    return eventInfo.isSynthesized();
}
/**
 * Appends the child to the parent; for a {@link HtmlTemplate} parent the child is
 * appended to the content of the template instead.
 *
 * @param parent the designated parent
 * @param child the node to append
 */
private static void appendChild(final DomNode parent, final DomNode child) {
    final DomNode target = parent instanceof HtmlTemplate
            ? ((HtmlTemplate) parent).getContent()
            : parent;

    target.appendChild(child);
}
}