org.jsoup.parser.HtmlTreeBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jsoup Show documentation
jsoup HTML parser
The newest version!
package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.FormElement;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jspecify.annotations.Nullable;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import static org.jsoup.internal.StringUtil.inSorted;
import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster;
import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent;
import static org.jsoup.parser.Parser.NamespaceHtml;

/**
 * HTML Tree Builder; creates a DOM from Tokens.
 */
public class HtmlTreeBuilder extends TreeBuilder {
    // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted.
    // Keep every array below in ascending String order when adding names: a sorted lookup on an unsorted array
    // gives wrong answers rather than failing loudly.
    static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "marquee", "object", "table", "td", "th"};
    static final String[] TagSearchList = new String[]{"ol", "ul"};
    static final String[] TagSearchButton = new String[]{"button"};
    static final String[] TagSearchTableScope = new String[]{"html", "table"};
    static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"};
    static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"};
    static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"};
    static final String[] TagSearchSpecial = new String[]{"address", "applet", "area", "article", "aside", "base", "basefont", "bgsound",
        "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd",
        "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form",
        "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
        "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav",
        "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
        "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
        "title", "tr", "ul", "wbr", "xmp"};
    // MathML element names tested (by normal name, within the MathML namespace) in isMathmlTextIntegration.
    static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"};
    // SVG element names tested in isHtmlIntegration. Matched case-sensitively against tagName(), hence the
    // mixed-case "foreignObject".
    static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"};

    public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages

    private HtmlTreeBuilderState state; // the current state
    private HtmlTreeBuilderState originalState; // original / marked state; null until markInsertionMode() is called

    private boolean baseUriSetFromDoc; // set once maybeSetBaseUri has applied a base href; later ones are then ignored
    private @Nullable Element headElement; // the current head element
    private @Nullable FormElement formElement; // the current form element
    private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing
    // The two lists below were declared as raw types; the element types are restored so that reads no longer
    // need casts and a wrong-typed add is rejected at compile time.
    private ArrayList<Element> formattingElements; // active (open) formatting elements
    private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
    // NOTE(review): still a raw List. Its element type is not determinable from the code visible here;
    // parameterize it once the accessor / adder methods have been checked.
    private List pendingTableCharacters; // chars in table to be shifted out
    private Token.EndTag emptyEnd; // reused empty end tag

    private boolean framesetOk; // if ok to go into frameset
    private boolean fosterInserts; // if next inserts should be fostered
    private boolean fragmentParsing; // if parsing a fragment of html

    /**
     * Supplies the parse settings this builder uses by default.
     * @return {@link ParseSettings#htmlDefault}
     */
    @Override ParseSettings defaultSettings() {
        return ParseSettings.htmlDefault;
    }

    /**
     * Creates a new, independent builder of the same type.
     * @return a freshly constructed {@code HtmlTreeBuilder}
     */
    @Override
    HtmlTreeBuilder newInstance() {
        return new HtmlTreeBuilder();
    }

    /**
     * Prepares this builder for a new parse: delegates the shared setup to the superclass, then puts every
     * HTML-specific field back to its starting value.
     *
     * @param input the reader supplying the HTML to parse
     * @param baseUri the base URI handed on to the superclass
     * @param parser the parser driving this builder
     */
    @Override
    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
        super.initialiseParse(input, baseUri, parser);

        // All per-parse state is reset by hand, which is a bit mucky.
        // todo - probably just create new parser objects to ensure all reset.

        // insertion mode bookkeeping
        state = HtmlTreeBuilderState.Initial;
        originalState = null;
        tmplInsertMode = new ArrayList<>();

        // element pointers
        headElement = null;
        formElement = null;
        contextElement = null;

        // working collections and the reusable end tag
        formattingElements = new ArrayList<>();
        pendingTableCharacters = new ArrayList<>();
        emptyEnd = new Token.EndTag(this);

        // flags
        baseUriSetFromDoc = false;
        framesetOk = true;
        fosterInserts = false;
        fragmentParsing = false;
    }

    /**
     * Sets this builder up to parse a fragment, optionally within a context element.
     * <p>With a context, a name-only copy of it becomes the fragment root: it is appended to the document and
     * pushed onto the stack, the tokeniser is put into the state that matches the context's tag name, the
     * insertion mode is reset, and the nearest enclosing form (the context itself or an ancestor) becomes the
     * current form element.</p>
     *
     * @param context the element the fragment is parsed in the context of; may be null
     */
    @Override
    void initialiseParseFragment(@Nullable Element context) {
        state = HtmlTreeBuilderState.Initial;
        fragmentParsing = true;

        if (context == null)
            return; // no context: nothing further to prepare

        final String contextName = context.normalName();
        contextElement = new Element(tagFor(contextName, settings), baseUri);

        // quirks setup: inherit the mode of the context's document, when it has one
        if (context.ownerDocument() != null)
            doc.quirksMode(context.ownerDocument().quirksMode());

        // pick the tokeniser state that suits the context element, then apply it
        final TokeniserState tokeniserStart;
        switch (contextName) {
            case "title":
            case "textarea":
                tokeniserStart = TokeniserState.Rcdata;
                break;
            case "iframe":
            case "noembed":
            case "noframes":
            case "style":
            case "xmp":
                tokeniserStart = TokeniserState.Rawtext;
                break;
            case "script":
                tokeniserStart = TokeniserState.ScriptData;
                break;
            case "plaintext":
                tokeniserStart = TokeniserState.PLAINTEXT;
                break;
            default: // includes "template", which tokenises as plain data
                tokeniserStart = TokeniserState.Data;
        }
        tokeniser.transition(tokeniserStart);
        if ("template".equals(contextName))
            pushTemplateMode(HtmlTreeBuilderState.InTemplate);

        doc.appendChild(contextElement);
        push(contextElement);
        resetInsertionMode();

        // walk up from the context to the nearest form, so that form controls in the fragment are associated
        // with the form correctly
        for (Element candidate = context; candidate != null; candidate = candidate.parent()) {
            if (candidate instanceof FormElement) {
                formElement = (FormElement) candidate;
                break;
            }
        }
    }

    /**
     * Finishes a fragment parse and hands back the resulting nodes.
     *
     * @return the children of the context element when there is one (after pulling in any siblings it gained
     * during the parse), otherwise the children of the document
     */
    @Override
    List completeParseFragment() {
        if (contextElement == null)
            return doc.childNodes();

        // Depending on the context and the input html, content may have landed outside of the root el;
        // e.g. context=p, input=div, the div will have been pushed out. Move such siblings back inside, at the end.
        final List strays = contextElement.siblingNodes();
        if (!strays.isEmpty())
            contextElement.insertChildren(-1, strays);
        return contextElement.childNodes();
    }

    /**
     * Routes a token either to the current insertion mode or to the foreign content handler, as decided by
     * {@link #useCurrentOrForeignInsert(Token)}.
     *
     * @param token the token to process
     * @return the result reported by the state that handled the token
     */
    @Override
    protected boolean process(Token token) {
        final HtmlTreeBuilderState handler;
        if (useCurrentOrForeignInsert(token))
            handler = this.state;
        else
            handler = ForeignContent;
        return handler.process(token, this);
    }

    /**
     * Decides whether a token is handled by the current insertion mode or by the foreign content rules.
     * See https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
     *
     * @param token the token about to be processed
     * @return true to use the current insertion mode; false to use foreign content
     */
    boolean useCurrentOrForeignInsert(Token token) {
        // If the stack of open elements is empty
        if (stack.isEmpty())
            return true;

        final Element current = currentElement();
        final String namespace = current.tag().namespace();

        // If the adjusted current node is an element in the HTML namespace
        if (NamespaceHtml.equals(namespace))
            return true;

        final boolean startTag = token.isStartTag();
        final boolean character = token.isCharacter();
        final String startName = startTag ? token.asStartTag().normalName : null;

        // MathML text integration point: character tokens, and start tags other than "mglyph" / "malignmark"
        if (isMathmlTextIntegration(current)) {
            if (character)
                return true;
            if (startTag && !"mglyph".equals(startName) && !"malignmark".equals(startName))
                return true;
        }

        // MathML annotation-xml element with an "svg" start tag
        if (startTag
            && "svg".equals(startName)
            && Parser.NamespaceMathml.equals(namespace)
            && current.nameIs("annotation-xml"))
            return true;

        // HTML integration point: start tags and character tokens
        if ((startTag || character) && isHtmlIntegration(current))
            return true;

        // Otherwise only an end-of-file token stays with the current insertion mode
        return token.isEOF();
    }

    /**
     * Tests if the element is a MathML text integration point: a MathML {@code mi}, {@code mo}, {@code mn},
     * {@code ms}, or {@code mtext} element.
     *
     * @param el the element to test
     * @return true if the element is in the MathML namespace and its normal name is one of those listed
     */
    static boolean isMathmlTextIntegration(Element el) {
        if (!Parser.NamespaceMathml.equals(el.tag().namespace()))
            return false;
        return StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration);
    }

    /**
     * Tests if the element is an HTML integration point. That is either:
     * <ul>
     * <li>a MathML {@code annotation-xml} element whose {@code encoding} attribute, once normalized, is
     * {@code text/html} or {@code application/xhtml+xml}; or</li>
     * <li>an SVG {@code foreignObject}, {@code desc}, or {@code title} element.</li>
     * </ul>
     *
     * @param el the element to test
     * @return true if the element is an HTML integration point
     */
    static boolean isHtmlIntegration(Element el) {
        final String namespace = el.tag().namespace();

        if (Parser.NamespaceMathml.equals(namespace) && el.nameIs("annotation-xml")) {
            final String encoding = Normalizer.normalize(el.attr("encoding"));
            if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml"))
                return true;
        }

        // note using .tagName for case-sensitive hit here of foreignObject
        return Parser.NamespaceSvg.equals(namespace)
            && StringUtil.in(el.tagName(), TagSvgHtmlIntegration);
    }

    /**
     * Processes a token with an explicitly supplied state, instead of dispatching on the current state.
     * This method itself does not assign the builder's current state.
     *
     * @param token the token to process
     * @param state the state whose rules should handle the token
     * @return the result reported by that state
     */
    boolean process(Token token, HtmlTreeBuilderState state) {
        return state.process(token, this);
    }

    /**
     * Makes the given state the current state.
     * @param state the state to switch to
     */
    void transition(HtmlTreeBuilderState state) {
        this.state = state;
    }

    /**
     * Gets the current state.
     * @return the current state
     */
    HtmlTreeBuilderState state() {
        return state;
    }

    /**
     * Remembers the current state as the original / marked state, retrievable via {@link #originalState()}.
     */
    void markInsertionMode() {
        originalState = state;
    }

    /**
     * Gets the state saved by the most recent {@link #markInsertionMode()} call.
     * @return the marked state; null if nothing has been marked since the parse was initialised
     */
    HtmlTreeBuilderState originalState() {
        return originalState;
    }

    /**
     * Sets the frameset-ok flag.
     * @param framesetOk whether it is ok to go into a frameset
     */
    void framesetOk(boolean framesetOk) {
        this.framesetOk = framesetOk;
    }

    /**
     * Gets the frameset-ok flag. It starts out as true when a parse is initialised.
     * @return whether it is ok to go into a frameset
     */
    boolean framesetOk() {
        return framesetOk;
    }

    /**
     * Gets the document this builder is working on.
     * @return the {@code doc} document
     */
    Document getDocument() {
        return doc;
    }

    /**
     * Gets the base URI currently in effect; {@link #maybeSetBaseUri(Element)} may have replaced the one the
     * parse was started with.
     * @return the current base URI
     */
    String getBaseUri() {
        return baseUri;
    }

    /**
     * Adopts the absolute {@code href} of the given base element as the base URI, unless a base URI has
     * already been taken from the document during this parse.
     *
     * @param base the base element encountered in the input
     */
    void maybeSetBaseUri(Element base) {
        // only the first <base href> seen in a parse is listened to
        if (baseUriSetFromDoc)
            return;

        final String resolved = base.absUrl("href");
        // an empty result means there is no usable href (e.g. a <base target> only): ignore it
        if (resolved.isEmpty())
            return;

        baseUri = resolved;
        baseUriSetFromDoc = true;
        // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
        doc.setBaseUri(resolved);
    }

    /**
     * Tests if this builder is parsing a fragment of html; the flag is raised by
     * {@code initialiseParseFragment} and cleared by {@code initialiseParse}.
     * @return true if fragment parsing
     */
    boolean isFragmentParsing() {
        return fragmentParsing;
    }

    /**
     * Records a parse error describing the current token as unexpected in the given state, provided the
     * parser's error list can still take another error.
     *
     * @param state the state that found the token unexpected
     */
    void error(HtmlTreeBuilderState state) {
        if (!parser.getErrors().canAddError())
            return; // error list will not take more; drop silently

        parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]",
            currentToken.tokenType(), currentToken, state));
    }

    /**
     * Builds (but does not insert) the element for a start tag.
     * <p>Attributes are normalized with the current settings unless case is being preserved, and duplicate
     * attributes are dropped with an error reported. A tag whose normal name is {@code form} yields a
     * {@link FormElement}; anything else yields a plain {@link Element}.</p>
     *
     * @param startTag the start tag token to create an element from
     * @param namespace the namespace to resolve the tag in
     * @param forcePreserveCase if true, skip attribute normalization and resolve the tag with
     * {@link ParseSettings#preserveCase} rather than the current settings
     * @return the new element, created with a null base URI
     */
    Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) {
        // normalize, then dedupe, the attributes
        Attributes attrs = startTag.attributes;
        if (!forcePreserveCase)
            attrs = settings.normalizeAttributes(attrs);

        final boolean hasAttributes = attrs != null && !attrs.isEmpty();
        if (hasAttributes && attrs.deduplicate(settings) > 0)
            error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName);

        final ParseSettings tagSettings = forcePreserveCase ? ParseSettings.preserveCase : settings;
        final Tag tag = tagFor(startTag.tagName, namespace, tagSettings);

        if (tag.normalName().equals("form"))
            return new FormElement(tag, null, attrs);
        return new Element(tag, null, attrs);
    }

    /** Inserts an HTML element for the given tag. */
    Element insertElementFor(final Token.StartTag startTag) {
        Element el = createElementFor(startTag, NamespaceHtml, false);
        doInsertElement(el, startTag);

        // handle self-closing tags. when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag.
        if (startTag.isSelfClosing()) {
            Tag tag = el.tag();
            if (tag.isKnownTag()) {
                if (!tag.isEmpty())
                    tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName());
                // else: ok
            }
            else { // unknown tag: remember this is self-closing, for output
                tag.setSelfClosing();
            }

            // effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state
            tokeniser.transition(TokeniserState.Data); // handles