All downloads are free. Search and download functionality uses the official Maven repository.

commonMain.com.fleeksoft.ksoup.parser.HtmlTreeBuilder.kt Maven / Gradle / Ivy

package com.fleeksoft.ksoup.parser

import com.fleeksoft.ksoup.helper.Validate
import com.fleeksoft.ksoup.internal.Normalizer
import com.fleeksoft.ksoup.internal.StringUtil
import com.fleeksoft.ksoup.nodes.*
import com.fleeksoft.ksoup.parser.HtmlTreeBuilderState.Constants.InTableFoster
import com.fleeksoft.ksoup.parser.HtmlTreeBuilderState.ForeignContent
import com.fleeksoft.ksoup.parser.Parser.Companion.NamespaceHtml
import com.fleeksoft.ksoup.ported.BufferReader
import com.fleeksoft.ksoup.ported.assert
import kotlin.jvm.JvmOverloads

/**
 * HTML Tree Builder; creates a DOM from Tokens.
 */
internal open class HtmlTreeBuilder : TreeBuilder() {
    // Per-parse state of the HTML tree builder. Every field in this group is (re)assigned by
    // initialiseParse() before a parse starts.
    // NOTE(review): the ArrayList / MutableList declarations below carry no type arguments in this
    // copy; they look like they were lost in extraction -- confirm against the upstream source.
    private var state: HtmlTreeBuilderState? = null // the current state; set via transition(), read by process()
    private var originalState: HtmlTreeBuilderState? = null // original / marked state; saved by markInsertionMode()
    private var baseUriSetFromDoc = false // set to true by maybeSetBaseUri() once it has applied a non-empty href

    private var headElement: Element? = null // the current head element

    private var formElement: FormElement? = null // the current form element

    // fragment parse context; assigned from the `context` argument of parseFragment()
    private var contextElement: Element? =
        null // fragment parse context -- could be null even if fragment parsing
    private var formattingElements: ArrayList? =
        null // active (open) formatting elements
    private var tmplInsertMode: ArrayList? =
        null // stack of Template Insertion modes
    private var pendingTableCharacters: MutableList? =
        null // chars in table to be shifted out
    private var emptyEnd: Token.EndTag? = null // reused empty end tag; created in initialiseParse()
    private var framesetOk = false // if ok to go into frameset; reset to true by initialiseParse()
    var isFosterInserts = false // if next inserts should be fostered
    var isFragmentParsing = false // if parsing a fragment of html; set to true by parseFragment()
        private set

    /** Returns the parse settings this builder uses by default: [ParseSettings.htmlDefault]. */
    override fun defaultSettings(): ParseSettings? = ParseSettings.htmlDefault

    /** Creates a fresh, independent [HtmlTreeBuilder]. */
    override fun newInstance(): HtmlTreeBuilder = HtmlTreeBuilder()

    /**
     * Prepares this builder for a new parse.
     *
     * Delegates to the superclass first, then resets every piece of HTML-specific state held by
     * this class so nothing leaks over from a previous parse.
     *
     * @param input reader over the markup to parse
     * @param baseUri base URI handed to the superclass
     * @param parser the parser driving this builder
     */
    override fun initialiseParse(
        input: BufferReader,
        baseUri: String,
        parser: Parser,
    ) {
        super.initialiseParse(input, baseUri, parser)

        // todo - resetting by hand is a bit mucky; probably just create new parser objects to ensure all reset.

        // insertion state
        state = HtmlTreeBuilderState.Initial
        originalState = null

        // element pointers and base-uri tracking
        headElement = null
        formElement = null
        contextElement = null
        baseUriSetFromDoc = false

        // working collections start out empty
        formattingElements = ArrayList()
        tmplInsertMode = ArrayList()
        pendingTableCharacters = ArrayList()

        // reusable end tag token bound to this builder
        emptyEnd = Token.EndTag(this)

        // flags
        framesetOk = true
        isFosterInserts = false
        isFragmentParsing = false
    }

    /**
     * Parses [inputFragment] as an HTML fragment, optionally within [context].
     *
     * When [context] is non-null, the document's quirks mode is copied from the context's owner
     * document (if it has one), the tokeniser's initial state is chosen from the context's tag
     * name, and a new root element with that tag is appended to the document and pushed on the
     * stack before parsing. The nearest [FormElement] on the context's ancestor chain (including
     * the context itself) becomes the current form element.
     *
     * @param inputFragment the markup to parse
     * @param context the element the fragment is parsed within; may be null
     * @param baseUri base URI for the parse; null is passed on as an empty string
     * @param parser the parser handed to [initialiseParse]
     * @return the root element's child nodes when [context] is given, otherwise the document's
     * child nodes
     */
    override fun parseFragment(
        inputFragment: String,
        context: Element?,
        baseUri: String?,
        parser: Parser,
    ): List {
        // NOTE(review): the return type above has no type argument in this copy (it looks lost in
        // extraction) -- confirm against the declaration this method overrides.

        // initialiseParse() assigns the same initial state again; the explicit reset is kept so it
        // does not rely on that call.
        state = HtmlTreeBuilderState.Initial
        initialiseParse(BufferReader(inputFragment), baseUri ?: "", parser)
        contextElement = context
        isFragmentParsing = true

        var root: Element? = null
        if (context != null) {
            // quirks setup: look the owner document up once instead of twice with a `!!`
            context.ownerDocument()?.let { ownerDoc -> doc.quirksMode(ownerDoc.quirksMode()) }

            // initialise the tokeniser state from the context tag:
            val contextTag: String = context.normalName()
            when (contextTag) {
                "title", "textarea" -> tokeniser!!.transition(TokeniserState.Rcdata)
                "iframe", "noembed", "noframes", "style", "xmp" ->
                    tokeniser!!.transition(TokeniserState.Rawtext)

                "script" -> tokeniser!!.transition(TokeniserState.ScriptData)
                "plaintext" -> tokeniser!!.transition(TokeniserState.PLAINTEXT)
                "template" -> {
                    tokeniser!!.transition(TokeniserState.Data)
                    pushTemplateMode(HtmlTreeBuilderState.InTemplate)
                }

                else -> tokeniser!!.transition(TokeniserState.Data)
            }

            val fragmentRoot = Element(tagFor(contextTag, settings), baseUri)
            root = fragmentRoot
            doc.appendChild(fragmentRoot)
            push(fragmentRoot)
            resetInsertionMode()

            // set the form element to the nearest form on the context's ancestor chain, so form
            // controls are associated with the form correctly. Left untouched when none is found.
            generateSequence(context) { it.parent() }
                .filterIsInstance<FormElement>()
                .firstOrNull()
                ?.let { nearestForm -> formElement = nearestForm }
        }

        runParser()

        // root is only assigned on the context path, so a null root means "no context".
        val fragmentRoot = root ?: return doc.childNodes()

        // depending on context and the input html, content may have been added outside of the root el
        // e.g. context=p, input=div, the div will have been pushed out. Pull those siblings back in.
        val strayNodes = fragmentRoot.siblingNodes()
        if (strayNodes.isNotEmpty()) fragmentRoot.insertChildren(-1, strayNodes)
        return fragmentRoot.childNodes()
    }

    /**
     * Routes [token] to the current insertion state, or to [ForeignContent] when
     * [useCurrentOrForeignInsert] returns false.
     *
     * @return the result of the chosen state's `process` call
     */
    public override fun process(token: Token): Boolean =
        when {
            useCurrentOrForeignInsert(token) -> state!!.process(token, this)
            else -> ForeignContent.process(token, this)
        }

    /**
     * Decides whether [token] is handled by the current insertion state (`true`) or by the
     * foreign-content rules (`false`).
     *
     * Follows the dispatcher rules in
     * https://html.spec.whatwg.org/multipage/parsing.html#tree-construction ; the conditions are
     * checked in the order listed below and the first match wins.
     */
    fun useCurrentOrForeignInsert(token: Token): Boolean {
        // If the stack of open elements is empty
        if (stack.isEmpty()) return true

        val current: Element = currentElement()
        val namespace: String = current.tag().namespace()

        return when {
            // the current element is in the HTML namespace
            NamespaceHtml == namespace -> true

            // MathML text integration point, and the token is either a start tag named neither
            // "mglyph" nor "malignmark", or a character token
            isMathmlTextIntegration(current) &&
                (
                    (
                        token.isStartTag() &&
                            "mglyph" != token.asStartTag().normalName &&
                            "malignmark" != token.asStartTag().normalName
                    ) || token.isCharacter()
                ) -> true

            // MathML annotation-xml element, and the token is a start tag named "svg"
            Parser.NamespaceMathml == namespace &&
                current.nameIs("annotation-xml") &&
                token.isStartTag() &&
                "svg" == token.asStartTag().normalName -> true

            // HTML integration point, and the token is a start tag or a character token
            isHtmlIntegration(current) && (token.isStartTag() || token.isCharacter()) -> true

            // otherwise only an end-of-file token stays with the current state
            else -> token.isEOF()
        }
    }

    /**
     * Processes [token] with the explicitly supplied [state] instead of the builder's current one.
     *
     * @return the result of [state]'s `process` call
     */
    fun process(
        token: Token,
        state: HtmlTreeBuilderState,
    ): Boolean = state.process(token, this)

    /** Makes [state] the builder's current state, i.e. the one returned by [state]. */
    fun transition(state: HtmlTreeBuilderState?) {
        this.state = state
    }

    /** Returns the builder's current state, as last set by [transition] or [initialiseParse]. */
    fun state(): HtmlTreeBuilderState? = state

    /** Saves the current state so it can later be read back through [originalState]. */
    fun markInsertionMode() {
        originalState = state
    }

    /** Returns the state saved by the last [markInsertionMode] call, or null if none was saved this parse. */
    fun originalState(): HtmlTreeBuilderState? = originalState

    /** Sets the frameset-ok flag (whether it is ok to go into frameset) to [framesetOk]. */
    fun framesetOk(framesetOk: Boolean) {
        this.framesetOk = framesetOk
    }

    /** Returns the frameset-ok flag; [initialiseParse] resets it to true. */
    fun framesetOk(): Boolean = framesetOk

    /** Read-only accessor for `doc`, the document this builder works on. */
    val document: Document
        get() = doc

    /**
     * Adopts the `href` of [base] as the base URI, at most once per parse.
     *
     * Does nothing if a base URI has already been taken from the document, or if the absolute
     * `href` of [base] is empty (e.g. a `<base>` that only carries a `target`).
     *
     * @param base the element whose `href` attribute is considered
     */
    fun maybeSetBaseUri(base: Element) {
        // only the first <base href> in a parse is listened to
        if (baseUriSetFromDoc) return

        val resolvedHref = base.absUrl("href")
        // ignore <base target> etc: nothing to adopt without an href
        if (resolvedHref.isEmpty()) return

        baseUri = resolvedHref
        baseUriSetFromDoc = true
        // also set on the doc so doc.createElement(Tag) will get the updated base, and to update all descendants
        doc.setBaseUri(resolvedHref)
    }

    /**
     * Records a parse error describing the current token as unexpected in [state].
     *
     * Nothing is recorded when the parser's error list reports that it cannot take another error.
     *
     * @param state the state named in the error message
     */
    fun error(state: HtmlTreeBuilderState?) {
        val errors = parser.getErrors()
        if (!errors.canAddError()) return

        val message = "Unexpected ${currentToken!!.tokenType()} token [$currentToken] when in state [$state]"
        errors.add(ParseError(reader, message))
    }

    /**
     * Creates the element for [startTag] in [namespace].
     *
     * Attributes are normalized through the parse settings unless [forcePreserveCase] is set, and
     * duplicate attributes are dropped, with an error reported when any were removed. A tag
     * whose normal name is `form` yields a [FormElement]; every other tag yields a plain [Element].
     *
     * @param startTag the start tag token supplying the tag name and attributes
     * @param namespace the namespace the tag is resolved in
     * @param forcePreserveCase when true, attributes are not normalized and the tag is resolved
     * with [ParseSettings.preserveCase] instead of the current settings
     * @return the newly created element
     */
    fun createElementFor(
        startTag: Token.StartTag,
        namespace: String,
        forcePreserveCase: Boolean,
    ): Element {
        // normalize the attributes, unless the original case has to be kept
        val attributes =
            if (forcePreserveCase) {
                startTag.attributes
            } else {
                settings!!.normalizeAttributes(startTag.attributes)
            }

        // dedupe: only worth doing when there is at least one attribute
        attributes?.takeUnless { it.isEmpty() }?.let { present ->
            val dropped = present.deduplicate(settings!!)
            if (dropped > 0) {
                error("Dropped duplicate attribute(s) in tag [${startTag.normalName}]")
            }
        }

        val tagSettings = if (forcePreserveCase) ParseSettings.preserveCase else settings
        val tag = tagFor(startTag.tagName!!, namespace, tagSettings)

        return when (tag.normalName()) {
            "form" -> FormElement(tag, null, attributes)
            else -> Element(tag, null, attributes)
        }
    }

    /** Inserts an HTML element for the given tag)  */
    fun insertElementFor(startTag: Token.StartTag): Element {
        val el = createElementFor(startTag, NamespaceHtml, false)
        doInsertElement(el, startTag)

        // handle self-closing tags. when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag.
        if (startTag.isSelfClosing) {
            val tag = el.tag()
            if (tag.isKnownTag()) {
                if (!tag.isEmpty) tokeniser!!.error("Tag [${tag.normalName()}] cannot be self closing; not a void tag")
                // else: ok
            } else { // unknown tag: remember this is self-closing, for output
                tag.setSelfClosing()
            }

            // effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state
            tokeniser!!.transition(TokeniserState.Data) // handles