// All downloads are free. Search and download functionality uses the official Maven repository.

// commonMain.com.fleeksoft.ksoup.parser.XmlTreeBuilder.kt (Maven / Gradle / Ivy)

package com.fleeksoft.ksoup.parser

import com.fleeksoft.ksoup.helper.Validate
import com.fleeksoft.ksoup.nodes.*
import com.fleeksoft.ksoup.parser.Parser.Companion.NamespaceXml
import com.fleeksoft.ksoup.ported.BufferReader

/**
 * Use the `XmlTreeBuilder` when you want to parse XML without any of the HTML DOM rules being applied to the
 * document.
 *
 * Usage example: `Document xmlDoc = Ksoup.parse(html, baseUrl, Parser.xmlParser());`
 *
 * @author Sabeeh
 */
internal open class XmlTreeBuilder : TreeBuilder() {
    /** Returns [ParseSettings.preserveCase] as the default settings for this builder. */
    override fun defaultSettings(): ParseSettings = ParseSettings.preserveCase

    /**
     * Prepares a parse run: delegates to the base initialisation, places the document on the stack and switches
     * the document's output settings to XML syntax, XHTML escaping and no pretty-printing.
     *
     * @param input reader supplying the XML to parse
     * @param baseUri base URI handed to the base initialisation
     * @param parser the parser driving this builder
     */
    override fun initialiseParse(
        input: BufferReader,
        baseUri: String,
        parser: Parser,
    ) {
        super.initialiseParse(input, baseUri, parser)

        // place the document onto the stack. differs from HtmlTreeBuilder (not on stack). Note not push()ed, so not onNodeInserted.
        stack.add(doc)

        doc.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml)
            .prettyPrint(false) // as XML, we don't understand what whitespace is significant or not
    }

    /**
     * Parses [input] with a new [Parser] wrapping this builder.
     *
     * @param input reader supplying the XML to parse
     * @param baseUri base URI for the document; `null` is treated as an empty string
     * @return the parsed document
     */
    fun parse(
        input: BufferReader,
        baseUri: String? = null,
    ): Document = parse(input, baseUri ?: "", Parser(this))

    /**
     * Parses the XML held in [input] by wrapping it in a [BufferReader] and delegating to the reader overload.
     *
     * @param input the XML text to parse
     * @param baseUri base URI for the document; `null` is treated as an empty string
     * @return the parsed document
     */
    fun parse(
        input: String,
        baseUri: String? = null,
    ): Document = parse(BufferReader(input), baseUri)

    /** Creates a fresh, independent [XmlTreeBuilder]. */
    override fun newInstance(): XmlTreeBuilder = XmlTreeBuilder()

    /** Returns [NamespaceXml] as the default namespace. */
    override fun defaultNamespace(): String = NamespaceXml

    /**
     * Dispatches a single token to the matching insert/close handler and records it as the current token.
     *
     * EOF tokens are ignored; any token type without a handler is reported through [Validate.fail].
     *
     * @param token the token to process
     * @return always `true`
     */
    override fun process(token: Token): Boolean {
        currentToken = token

        when (token.type) {
            Token.TokenType.StartTag -> insertElementFor(token.asStartTag())
            Token.TokenType.EndTag -> popStackToClose(token.asEndTag())
            Token.TokenType.Comment -> insertCommentFor(token.asComment())
            Token.TokenType.Character -> insertCharacterFor(token.asCharacter())
            Token.TokenType.Doctype -> insertDoctypeFor(token.asDoctype())
            Token.TokenType.EOF -> {}
            else -> Validate.fail("Unexpected token type: ${token.type}")
        }
        return true
    }

    /**
     * Creates an [Element] for [startTag], appends it to the current element and pushes it onto the stack.
     * Attributes are de-duplicated and normalised according to the active settings. A self-closing tag is
     * marked as such and popped again immediately.
     *
     * @param startTag the start tag token to materialise
     */
    fun insertElementFor(startTag: Token.StartTag) {
        val tag = tagFor(startTag.name(), settings)
        // the safe call only evaluates its argument when attributes are present, same as the explicit null check
        startTag.attributes?.deduplicate(settings!!)

        val el = Element(tag, null, settings!!.normalizeAttributes(startTag.attributes))
        currentElement().appendChild(el)
        push(el)

        if (startTag.isSelfClosing) {
            tag.setSelfClosing()
            pop() // push & pop ensures onNodeInserted & onNodeClosed
        }
    }

    /**
     * Appends [node] to the current element and signals the insertion via `onNodeInserted`.
     *
     * @param node the leaf node to insert; must not be `null` (a `null` fails the `!!` assertion)
     */
    fun insertLeafNode(node: LeafNode?) {
        currentElement().appendChild(node!!)
        onNodeInserted(node)
    }

    /**
     * Inserts a comment node for [commentToken]. A bogus comment that looks like an XML declaration is
     * inserted as the declaration node instead, when it can be converted.
     *
     * @param commentToken the comment token to materialise
     */
    fun insertCommentFor(commentToken: Token.Comment) {
        val comment = Comment(commentToken.getData())
        val insert: LeafNode =
            if (commentToken.bogus && comment.isXmlDeclaration()) {
                // xml declarations are emitted as bogus comments (which is right for html, but not xml)
                // so we do a bit of a hack and parse the data as an element to pull the attributes out
                // todo - refactor this to parse more appropriately
                // if it can't be parsed as a declaration, leave it as a comment
                comment.asXmlDeclaration() ?: comment
            } else {
                comment
            }
        insertLeafNode(insert)
    }

    /**
     * Inserts the character data of [token] as a [CDataNode] when the token is CDATA, otherwise as a [TextNode].
     *
     * @param token the character token to materialise; its data must not be `null`
     */
    fun insertCharacterFor(token: Token.Character) {
        val data: String = token.data!!
        insertLeafNode(if (token.isCData()) CDataNode(data) else TextNode(data))
    }

    /**
     * Inserts a [DocumentType] node built from [token], using the settings-normalised name, the public and
     * system identifiers, and the token's pub/sys key.
     *
     * @param token the doctype token to materialise
     */
    fun insertDoctypeFor(token: Token.Doctype) {
        val doctypeNode =
            DocumentType(
                settings!!.normalizeTag(token.getName()),
                token.getPublicIdentifier(),
                token.getSystemIdentifier(),
            )
        doctypeNode.setPubSysKey(token.pubSysKey)
        insertLeafNode(doctypeNode)
    }

    /**
     * Appends [node] to the current element and signals the insertion via `onNodeInserted`.
     *
     * @param node the node to insert; must not be `null`
     */
    @Deprecated("unused and will be removed. ")
    protected fun insertNode(node: Node?) {
        currentElement().appendChild(node!!)
        onNodeInserted(node)
    }

    /**
     * Appends [node] to the current element and signals the insertion via `onNodeInserted`.
     *
     * @param node the node to insert; must not be `null`
     * @param token not used by this implementation
     */
    @Deprecated("unused and will be removed. ")
    protected fun insertNode(
        node: Node?,
        token: Token?,
    ) {
        currentElement().appendChild(node!!)
        onNodeInserted(node)
    }

    /**
     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
     * found, skips.
     *
     * The search starts at the top of the stack and is bounded by `maxQueueDepth`, so a very deep stack is not
     * scanned in full.
     *
     * @param endTag tag to close
     */
    protected fun popStackToClose(endTag: Token.EndTag) {
        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
        val elName = settings!!.normalizeTag(endTag.tagName!!)
        var firstFound: Element? = null

        val bottom: Int = stack.size - 1
        val upper = if (bottom >= maxQueueDepth) bottom - maxQueueDepth else 0

        for (pos in bottom downTo upper) {
            val next = stack[pos]!!
            if (next.nodeName() == elName) {
                firstFound = next
                break
            }
        }
        if (firstFound == null) return // not found, skip

        // pop everything above the match, then the match itself; the bound is the stack size on entry
        for (ignored in stack.size - 1 downTo 0) {
            if (pop() === firstFound) break
        }
    }

    /**
     * Parses an XML fragment. The [context] element is ignored; the call is forwarded to the context-free
     * overload.
     *
     * @param inputFragment the XML fragment to parse
     * @param context unused by this builder
     * @param baseUri base URI for the fragment; `null` is treated as an empty string
     * @param parser the parser driving this builder
     * @return the child nodes of the document produced by the parse
     */
    override fun parseFragment(
        inputFragment: String,
        context: Element?,
        baseUri: String?,
        parser: Parser,
    ): List<Node> = parseFragment(inputFragment, baseUri, parser)

    /**
     * Parses an XML fragment by initialising a parse over [inputFragment], running the parser and returning
     * the resulting document's child nodes.
     *
     * @param inputFragment the XML fragment to parse
     * @param baseUri base URI for the fragment; `null` is treated as an empty string
     * @param parser the parser driving this builder
     * @return the child nodes of the document produced by the parse
     */
    fun parseFragment(
        inputFragment: String,
        baseUri: String?,
        parser: Parser,
    ): List<Node> {
        initialiseParse(BufferReader(inputFragment), baseUri ?: "", parser)
        runParser()
        return doc.childNodes()
    }

    companion object {
        private const val maxQueueDepth = 256 // an arbitrary tension point between real XML and crafted pain
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy