com.itextpdf.styledxmlparser.jsoup.parser.HtmlTreeBuilder Maven / Gradle / Ivy
Show all versions of styled-xml-parser Show documentation
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2024 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
package com.itextpdf.styledxmlparser.jsoup.parser;
import com.itextpdf.styledxmlparser.jsoup.helper.Validate;
import com.itextpdf.styledxmlparser.jsoup.internal.StringUtil;
import com.itextpdf.styledxmlparser.jsoup.nodes.CDataNode;
import com.itextpdf.styledxmlparser.jsoup.nodes.Comment;
import com.itextpdf.styledxmlparser.jsoup.nodes.DataNode;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
import com.itextpdf.styledxmlparser.jsoup.nodes.FormElement;
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import com.itextpdf.styledxmlparser.jsoup.nodes.TextNode;
import com.itextpdf.styledxmlparser.jsoup.select.Elements;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
* HTML Tree Builder; creates a DOM from Tokens.
*/
public class HtmlTreeBuilder extends TreeBuilder {
// tag searches. must be sorted, used in StringUtil.inSorted. HtmlTreeBuilderTest validates they're sorted.
static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "marquee", "object", "table", "td", "th"};
static final String[] TagSearchList = new String[]{"ol", "ul"};
static final String[] TagSearchButton = new String[]{"button"};
static final String[] TagSearchTableScope = new String[]{"html", "table"};
static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"};
static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rp", "rt"};
static final String[] TagSearchSpecial = new String[]{"address", "applet", "area", "article", "aside", "base", "basefont", "bgsound",
"blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd",
"details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form",
"frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
"iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav",
"noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
"section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
"title", "tr", "ul", "wbr", "xmp"};
public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages
private HtmlTreeBuilderState state; // the current state
private HtmlTreeBuilderState originalState; // original / marked state
private boolean baseUriSetFromDoc;
private Element headElement; // the current head element
private FormElement formElement; // the current form element
private Element contextElement; // fragment parse context -- could be null even if fragment parsing
private ArrayList formattingElements; // active (open) formatting elements
private List pendingTableCharacters; // chars in table to be shifted out
private Token.EndTag emptyEnd; // reused empty end tag
private boolean framesetOk; // if ok to go into frameset
private boolean fosterInserts; // if next inserts should be fostered
private boolean fragmentParsing; // if parsing a fragment of html
ParseSettings defaultSettings() {
return ParseSettings.htmlDefault;
}
@Override
TreeBuilder newInstance() {
return new HtmlTreeBuilder();
}
@Override
protected void initialiseParse(Reader input, String baseUri, Parser parser) {
super.initialiseParse(input, baseUri, parser);
// this is a bit mucky.
state = HtmlTreeBuilderState.Initial;
originalState = null;
baseUriSetFromDoc = false;
headElement = null;
formElement = null;
contextElement = null;
formattingElements = new ArrayList<>();
pendingTableCharacters = new ArrayList<>();
emptyEnd = new Token.EndTag();
framesetOk = true;
fosterInserts = false;
fragmentParsing = false;
}
List parseFragment(String inputFragment, Element context, String baseUri, Parser parser) {
// context may be null
state = HtmlTreeBuilderState.Initial;
initialiseParse(new StringReader(inputFragment), baseUri, parser);
contextElement = context;
fragmentParsing = true;
Element root = null;
if (context != null) {
if (context.ownerDocument() != null) // quirks setup:
doc.quirksMode(context.ownerDocument().quirksMode());
// initialise the tokeniser state:
String contextTag = context.normalName();
if (StringUtil.in(contextTag, "title", "textarea"))
tokeniser.transition(TokeniserState.Rcdata);
else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp"))
tokeniser.transition(TokeniserState.Rawtext);
else if (contextTag.equals("script"))
tokeniser.transition(TokeniserState.ScriptData);
else if (contextTag.equals(("noscript")))
tokeniser.transition(TokeniserState.Data); // if scripting enabled, rawtext
else if (contextTag.equals("plaintext"))
tokeniser.transition(TokeniserState.Data);
else
tokeniser.transition(TokeniserState.Data); // default
root = new Element(Tag.valueOf(contextTag, settings), baseUri);
doc.appendChild(root);
stack.add(root);
resetInsertionMode();
// setup form element to nearest form on context (up ancestor chain). ensures form controls are associated
// with form correctly
Elements contextChain = context.parents();
contextChain.add(0, context);
for (Element parent: contextChain) {
if (parent instanceof FormElement) {
formElement = (FormElement) parent;
break;
}
}
}
runParser();
if (context != null) {
// depending on context and the input html, content may have been added outside of the root el
// e.g. context=p, input=div, the div will have been pushed out.
List nodes = root.siblingNodes();
if (!nodes.isEmpty())
root.insertChildren(-1, nodes);
return root.childNodes();
}
else
return doc.childNodes();
}
@Override
protected boolean process(Token token) {
currentToken = token;
return this.state.process(token, this);
}
boolean process(Token token, HtmlTreeBuilderState state) {
currentToken = token;
return state.process(token, this);
}
void transition(HtmlTreeBuilderState state) {
this.state = state;
}
HtmlTreeBuilderState state() {
return state;
}
void markInsertionMode() {
originalState = state;
}
HtmlTreeBuilderState originalState() {
return originalState;
}
void framesetOk(boolean framesetOk) {
this.framesetOk = framesetOk;
}
boolean framesetOk() {
return framesetOk;
}
Document getDocument() {
return doc;
}
String getBaseUri() {
return baseUri;
}
void maybeSetBaseUri(Element base) {
if (baseUriSetFromDoc) // only listen to the first in parse
return;
String href = base.absUrl("href");
if (href.length() != 0) { // ignore etc
baseUri = href;
baseUriSetFromDoc = true;
doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
}
}
boolean isFragmentParsing() {
return fragmentParsing;
}
void error(HtmlTreeBuilderState state) {
if (parser.getErrors().canAddError())
parser.getErrors().add(new ParseError(reader.pos(), "Unexpected token [{0}] when in state [{1}]", currentToken.tokenType(), state));
}
Element insert(final Token.StartTag startTag) {
// cleanup duplicate attributes:
if (startTag.hasAttributes() && !startTag.attributes.isEmpty()) {
int dupes = startTag.attributes.deduplicate(settings);
if (dupes > 0) {
error("Duplicate attribute");
}
}
// handle empty unknown tags
// when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag.
if (startTag.isSelfClosing()) {
Element el = insertEmpty(startTag);
stack.add(el);
tokeniser.transition(TokeniserState.Data); // handles , otherwise needs breakout steps from script data
tokeniser.emit(((Token.Tag) emptyEnd.reset()).name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing
return el;
}
Element el = new Element(Tag.valueOf(startTag.name(), settings), null, settings.normalizeAttributes(startTag.attributes));
insert(el);
return el;
}
Element insertStartTag(String startTagName) {
Element el = new Element(Tag.valueOf(startTagName, settings), null);
insert(el);
return el;
}
void insert(Element el) {
insertNode(el);
stack.add(el);
}
Element insertEmpty(Token.StartTag startTag) {
Tag tag = Tag.valueOf(startTag.name(), settings);
Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes));
insertNode(el);
if (startTag.isSelfClosing()) {
if (tag.isKnownTag()) {
if (!tag.isEmpty())
tokeniser.error("Tag cannot be self closing; not a void tag");
}
else // unknown tag, remember this is self closing for output
tag.setSelfClosing();
}
return el;
}
FormElement insertForm(Token.StartTag startTag, boolean onStack) {
Tag tag = Tag.valueOf(startTag.name(), settings);
FormElement el = new FormElement(tag, null, settings.normalizeAttributes(startTag.attributes));
setFormElement(el);
insertNode(el);
if (onStack)
stack.add(el);
return el;
}
void insert(Token.Comment commentToken) {
Comment comment = new Comment(commentToken.getData());
insertNode(comment);
}
void insert(Token.Character characterToken) {
final Node node;
Element el = currentElement();
if (el == null)
el = doc; // allows for whitespace to be inserted into the doc root object (not on the stack)
final String tagName = el.normalName();
final String data = characterToken.getData();
if (characterToken.isCData())
node = new CDataNode(data);
else if (isContentForTagData(tagName))
node = new DataNode(data);
else
node = new TextNode(data);
el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack.
}
private void insertNode(Node node) {
// if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
if (stack.isEmpty())
doc.appendChild(node);
else if (isFosterInserts())
insertInFosterParent(node);
else
currentElement().appendChild(node);
// connect form controls to their form element
if (node instanceof Element && ((Element) node).tag().isFormListed()) {
if (formElement != null)
formElement.addElement((Element) node);
}
}
Element pop() {
int size = stack.size();
return stack.remove(size-1);
}
void push(Element element) {
stack.add(element);
}
ArrayList getStack() {
return stack;
}
boolean onStack(Element el) {
return isElementInQueue(stack, el);
}
private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain
private boolean isElementInQueue(ArrayList queue, Element element) {
final int bottom = queue.size() - 1;
final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
for (int pos = bottom; pos >= upper; pos--) {
Element next = queue.get(pos);
if (next == element) {
return true;
}
}
return false;
}
Element getFromStack(String elName) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element next = stack.get(pos);
if (next.normalName().equals(elName)) {
return next;
}
}
return null;
}
boolean removeFromStack(Element el) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element next = stack.get(pos);
if (next == el) {
stack.remove(pos);
return true;
}
}
return false;
}
Element popStackToClose(String elName) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element el = stack.get(pos);
stack.remove(pos);
if (el.normalName().equals(elName))
return el;
}
return null;
}
// elnames is sorted, comes from Constants
void popStackToClose(String... elNames) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element next = stack.get(pos);
stack.remove(pos);
if (StringUtil.inSorted(next.normalName(), elNames))
break;
}
}
void popStackToBefore(String elName) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element next = stack.get(pos);
if (next.normalName().equals(elName)) {
break;
} else {
stack.remove(pos);
}
}
}
void clearStackToTableContext() {
clearStackToContext("table");
}
void clearStackToTableBodyContext() {
clearStackToContext("tbody", "tfoot", "thead", "template");
}
void clearStackToTableRowContext() {
clearStackToContext("tr", "template");
}
private void clearStackToContext(String... nodeNames) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element next = stack.get(pos);
if (StringUtil.in(next.normalName(), nodeNames) || next.normalName().equals("html"))
break;
else
stack.remove(pos);
}
}
Element aboveOnStack(Element el) {
assert onStack(el);
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element next = stack.get(pos);
if (next == el) {
return stack.get(pos-1);
}
}
return null;
}
void insertOnStackAfter(Element after, Element in) {
int i = stack.lastIndexOf(after);
Validate.isTrue(i != -1);
stack.add(i+1, in);
}
void replaceOnStack(Element out, Element in) {
replaceInQueue(stack, out, in);
}
private void replaceInQueue(ArrayList queue, Element out, Element in) {
int i = queue.lastIndexOf(out);
Validate.isTrue(i != -1);
queue.set(i, in);
}
void resetInsertionMode() {
boolean last = false;
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element node = stack.get(pos);
if (pos == 0) {
last = true;
node = contextElement;
}
String name = node != null ? node.normalName() : "";
if ("select".equals(name)) {
transition(HtmlTreeBuilderState.InSelect);
break; // frag
} else if (("td".equals(name) || "th".equals(name) && !last)) {
transition(HtmlTreeBuilderState.InCell);
break;
} else if ("tr".equals(name)) {
transition(HtmlTreeBuilderState.InRow);
break;
} else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) {
transition(HtmlTreeBuilderState.InTableBody);
break;
} else if ("caption".equals(name)) {
transition(HtmlTreeBuilderState.InCaption);
break;
} else if ("colgroup".equals(name)) {
transition(HtmlTreeBuilderState.InColumnGroup);
break; // frag
} else if ("table".equals(name)) {
transition(HtmlTreeBuilderState.InTable);
break;
} else if ("head".equals(name)) {
transition(HtmlTreeBuilderState.InBody);
break; // frag
} else if ("body".equals(name)) {
transition(HtmlTreeBuilderState.InBody);
break;
} else if ("frameset".equals(name)) {
transition(HtmlTreeBuilderState.InFrameset);
break; // frag
} else if ("html".equals(name)) {
transition(HtmlTreeBuilderState.BeforeHead);
break; // frag
} else if (last) {
transition(HtmlTreeBuilderState.InBody);
break; // frag
}
}
}
private String[] specificScopeTarget = {null};
private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) {
specificScopeTarget[0] = targetName;
return inSpecificScope(specificScopeTarget, baseTypes, extraTypes);
}
private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) {
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
final int bottom = stack.size() -1;
final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
// don't walk too far up the tree
for (int pos = bottom; pos >= top; pos--) {
final String elName = stack.get(pos).normalName();
if (StringUtil.inSorted(elName, targetNames))
return true;
if (StringUtil.inSorted(elName, baseTypes))
return false;
if (extraTypes != null && StringUtil.inSorted(elName, extraTypes))
return false;
}
//Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes)
return false;
}
boolean inScope(String[] targetNames) {
return inSpecificScope(targetNames, TagsSearchInScope, null);
}
boolean inScope(String targetName) {
return inScope(targetName, null);
}
boolean inScope(String targetName, String[] extras) {
return inSpecificScope(targetName, TagsSearchInScope, extras);
}
boolean inListItemScope(String targetName) {
return inScope(targetName, TagSearchList);
}
boolean inButtonScope(String targetName) {
return inScope(targetName, TagSearchButton);
}
boolean inTableScope(String targetName) {
return inSpecificScope(targetName, TagSearchTableScope, null);
}
boolean inSelectScope(String targetName) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element el = stack.get(pos);
String elName = el.normalName();
if (elName.equals(targetName))
return true;
if (!StringUtil.inSorted(elName, TagSearchSelectScope)) // all elements except
return false;
}
Validate.fail("Should not be reachable");
return false;
}
void setHeadElement(Element headElement) {
this.headElement = headElement;
}
Element getHeadElement() {
return headElement;
}
boolean isFosterInserts() {
return fosterInserts;
}
void setFosterInserts(boolean fosterInserts) {
this.fosterInserts = fosterInserts;
}
FormElement getFormElement() {
return formElement;
}
void setFormElement(FormElement formElement) {
this.formElement = formElement;
}
void newPendingTableCharacters() {
pendingTableCharacters = new ArrayList<>();
}
List getPendingTableCharacters() {
return pendingTableCharacters;
}
/**
* 11.2.5.2 Closing elements that have implied end tags
*
* When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a
* dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element,
* the UA must pop the current node off the stack of open elements.
*
* @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the
* process, then the UA must perform the above steps as if that element was not in the above list.
*/
void generateImpliedEndTags(String excludeTag) {
while ((excludeTag != null && !currentElement().normalName().equals(excludeTag)) &&
StringUtil.inSorted(currentElement().normalName(), TagSearchEndTags))
pop();
}
void generateImpliedEndTags() {
generateImpliedEndTags(null);
}
boolean isSpecial(Element el) {
String name = el.normalName();
return StringUtil.inSorted(name, TagSearchSpecial);
}
Element lastFormattingElement() {
return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null;
}
int positionOfElement(Element el){
for (int i = 0; i < formattingElements.size(); i++){
if (el == formattingElements.get(i))
return i;
}
return -1;
}
Element removeLastFormattingElement() {
int size = formattingElements.size();
if (size > 0)
return formattingElements.remove(size-1);
else
return null;
}
// active formatting elements
void pushActiveFormattingElements(Element in) {
this.checkActiveFormattingElements(in);
formattingElements.add(in);
}
void pushWithBookmark(Element in,int bookmark){
this.checkActiveFormattingElements(in);
formattingElements.add(bookmark, in);
}
void checkActiveFormattingElements(Element in){
int numSeen = 0;
for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
Element el = formattingElements.get(pos);
if (el == null) // marker
break;
if (isSameFormattingElement(in, el))
numSeen++;
if (numSeen == 3) {
formattingElements.remove(pos);
break;
}
}
}
private boolean isSameFormattingElement(Element a, Element b) {
// same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children
return a.normalName().equals(b.normalName()) &&
// a.namespace().equals(b.namespace()) &&
a.attributes().equals(b.attributes());
}
void reconstructFormattingElements() {
Element last = lastFormattingElement();
if (last == null || onStack(last))
return;
Element entry = last;
int size = formattingElements.size();
int pos = size - 1;
boolean skip = false;
while (true) {
if (pos == 0) { // step 4. if none before, skip to 8
skip = true;
break;
}
entry = formattingElements.get(--pos); // step 5. one earlier than entry
if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack
break; // jump to 8, else continue back to 4
}
while(true) {
if (!skip) // step 7: on later than entry
entry = formattingElements.get(++pos);
Validate.notNull(entry); // should not occur, as we break at last element
// 8. create new element from element, 9 insert into current node, onto stack
skip = false; // can only skip increment from 4.
Element newEl = insertStartTag(entry.normalName());
newEl.attributes().addAll(entry.attributes());
// 10. replace entry with new entry
formattingElements.set(pos, newEl);
// 11
if (pos == size-1) // if not last entry in list, jump to 7
break;
}
}
void clearFormattingElementsToLastMarker() {
while (!formattingElements.isEmpty()) {
Element el = removeLastFormattingElement();
if (el == null)
break;
}
}
void removeFromActiveFormattingElements(Element el) {
for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
Element next = formattingElements.get(pos);
if (next == el) {
formattingElements.remove(pos);
break;
}
}
}
boolean isInActiveFormattingElements(Element el) {
return isElementInQueue(formattingElements, el);
}
Element getActiveFormattingElement(String nodeName) {
for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
Element next = formattingElements.get(pos);
if (next == null) // scope marker
break;
else if (next.normalName().equals(nodeName))
return next;
}
return null;
}
void replaceActiveFormattingElement(Element out, Element in) {
replaceInQueue(formattingElements, out, in);
}
void insertMarkerToFormattingElements() {
formattingElements.add(null);
}
void insertInFosterParent(Node in) {
Element fosterParent;
Element lastTable = getFromStack("table");
boolean isLastTableParent = false;
if (lastTable != null) {
if (lastTable.parent() != null) {
fosterParent = (Element) lastTable.parent();
isLastTableParent = true;
} else
fosterParent = aboveOnStack(lastTable);
} else { // no table == frag
fosterParent = stack.get(0);
}
if (isLastTableParent) {
Validate.notNull(lastTable); // last table cannot be null by this point.
lastTable.before(in);
}
else
fosterParent.appendChild(in);
}
@Override
public String toString() {
return "TreeBuilder{" +
"currentToken=" + currentToken +
", state=" + state +
", currentElement=" + currentElement() +
'}';
}
protected boolean isContentForTagData(final String normalName) {
return (normalName.equals("script") || normalName.equals("style"));
}
}