All Downloads are FREE. Search and download functionalities are using the official Maven repository.

commonMain.com.fleeksoft.ksoup.parser.TreeBuilder.kt Maven / Gradle / Ivy

package com.fleeksoft.ksoup.parser

import com.fleeksoft.ksoup.internal.SharedConstants
import com.fleeksoft.ksoup.nodes.*
import com.fleeksoft.ksoup.parser.Parser.Companion.NamespaceHtml
import com.fleeksoft.ksoup.ported.BufferReader

/**
 * Base class for tree builders. A tree builder pulls [Token]s from a [Tokeniser], hands each one to [process],
 * and maintains the stack of open elements together with the [Document] being built.
 *
 * Parse-scoped state ([tokeniser], [seenTags]) is only available between [initialiseParse] and the end of [parse].
 *
 * @author Sabeeh
 */
internal abstract class TreeBuilder {
    lateinit var parser: Parser
        internal set
    lateinit var reader: CharacterReader
        private set
    var tokeniser: Tokeniser? = null
        private set

    protected lateinit var doc: Document // current doc we are building into
        private set

    // The stack of open elements. Entries are nullable (element type restored from how pop/currentElement
    // unwrap entries); push() itself only ever adds non-null elements.
    lateinit var stack: ArrayList<Element?>
    open var baseUri: String? = null // current base uri, for creating new elements
    var currentToken: Token? = null // currentToken is used only for error tracking.
    var settings: ParseSettings? = null

    // tags we've used in this parse; saves tag GC for custom tags. Keyed by the (un-normalized) tag name.
    protected var seenTags: MutableMap<String, Tag>? = null
    private lateinit var start: Token.StartTag // reusable virtual start tag
    private lateinit var end: Token.EndTag // reusable virtual end tag

    abstract fun defaultSettings(): ParseSettings?

    var trackSourceRange = false // optionally tracks the source range of nodes

    /**
     * Prepares this builder for a new parse: creates the target document, reader, tokeniser, element stack and
     * tag cache, and copies the relevant options from [parser].
     *
     * @param input the source to read from
     * @param baseUri base URI for the new document and for elements created during the parse
     * @param parser the parser supplying settings, error tracking and position tracking options
     */
    open fun initialiseParse(
        input: BufferReader,
        baseUri: String,
        parser: Parser,
    ) {
        end = Token.EndTag(this)
        doc = Document(parser.defaultNamespace(), baseUri)
        doc.parser(parser)
        this.parser = parser
        settings = parser.settings()
        reader = CharacterReader(input)
        trackSourceRange = parser.isTrackPosition
        // when tracking errors or source ranges, enable newline tracking for better legibility
        reader.trackNewlines(parser.isTrackErrors() || trackSourceRange)
        tokeniser = Tokeniser(this)
        stack = ArrayList(32)
        seenTags = HashMap()
        start = Token.StartTag(this)
        currentToken = start // init current token to the virtual start token.
        this.baseUri = baseUri
    }

    /**
     * Runs a complete parse of [input] and returns the resulting document.
     *
     * The reader is closed and the parse-scoped state is released even when parsing fails, because the Parser and
     * TreeBuilder are retained in the document for settings / fragments.
     *
     * @param input the source to read from
     * @param baseUri base URI of the document
     * @param parser the parser driving this parse
     * @return the parsed document
     */
    fun parse(
        input: BufferReader,
        baseUri: String,
        parser: Parser,
    ): Document {
        // Kept outside the try block: if initialisation fails, the lateinit state touched by the cleanup
        // below may not exist yet.
        initialiseParse(input, baseUri, parser)
        try {
            runParser()
        } finally {
            reader.close()
            tokeniser = null
            stack.clear()
            seenTags = null
        }
        return doc
    }

    /**
     * Create a new copy of this TreeBuilder
     * @return copy, ready for a new parse
     */
    abstract fun newInstance(): TreeBuilder

    /**
     * Parses a fragment of markup.
     *
     * @param inputFragment the fragment source
     * @param context the element the fragment is parsed in the context of, if any
     * @param baseUri base URI for the created nodes
     * @param parser the parser driving this parse
     * @return the nodes parsed from the fragment
     */
    abstract fun parseFragment(
        inputFragment: String,
        context: Element?,
        baseUri: String?,
        parser: Parser,
    ): List<Node>

    /**
     * Reads tokens until EOF, passing each one to [process], then pops whatever is left on the stack.
     */
    protected fun runParser() {
        val tokeniser = checkNotNull(this.tokeniser) { "runParser() called before initialiseParse()" }
        val eof = Token.TokenType.EOF

        while (true) {
            val token = tokeniser.read()
            currentToken = token
            process(token)
            if (token.type === eof) break
            token.reset()
        }

        // once we hit the end, pop remaining items off the stack
        while (stack.isNotEmpty()) pop()
    }

    /**
     * Handles a single token.
     * @param token the token to process
     * @return the implementation-defined result of processing the token
     */
    abstract fun process(token: Token): Boolean

    /**
     * Processes a "virtual" start tag (auto-created by the treebuilder), so the start position is not tracked.
     * @param name the tag name
     * @return the result of [process] for the start tag
     */
    fun processStartTag(name: String): Boolean {
        val start = this.start
        if (currentToken === start) { // don't recycle an in-use token
            return process(Token.StartTag(this).name(name))
        }
        return process(start.reset().name(name))
    }

    /**
     * Processes a "virtual" start tag carrying attributes.
     * @param name the tag name
     * @param attrs the attributes for the tag, if any
     * @return the result of [process] for the start tag
     */
    fun processStartTag(
        name: String,
        attrs: Attributes?,
    ): Boolean {
        val start = this.start
        if (currentToken === start) { // don't recycle an in-use token
            return process(Token.StartTag(this).nameAttr(name, attrs))
        }
        start.reset()
        start.nameAttr(name, attrs)
        return process(start)
    }

    /**
     * Processes a "virtual" end tag.
     * @param name the tag name
     * @return the result of [process] for the end tag
     */
    fun processEndTag(name: String): Boolean {
        if (currentToken === end) { // don't recycle an in-use token
            return process(Token.EndTag(this).name(name))
        }
        return process(end.reset().name(name))
    }

    /**
     * Removes the last Element from the stack, hits onNodeClosed, and then returns it.
     * @return the element that was removed
     */
    fun pop(): Element {
        val removed = checkNotNull(stack.removeAt(stack.size - 1)) { "Null entry popped from the element stack" }
        onNodeClosed(removed)
        return removed
    }

    /**
     * Adds the specified Element to the end of the stack, and hits onNodeInserted.
     * @param element the element to push
     */
    fun push(element: Element) {
        stack.add(element)
        onNodeInserted(element)
    }

    /**
     * Get the current element (last on the stack). If all items have been removed, returns the document instead
     * (which might not actually be on the stack; use `stack.size == 0` to test if required).
     * @return the last element on the stack, if any; or the root document
     */
    fun currentElement(): Element {
        if (stack.isEmpty()) return doc
        return checkNotNull(stack[stack.size - 1]) { "Null entry on top of the element stack" }
    }

    /**
     * Checks if the Current Element's normal name equals the supplied name, in the HTML namespace.
     * @param normalName name to check
     * @return true if there is a current element on the stack, and its name equals the supplied
     */
    fun currentElementIs(normalName: String?): Boolean = currentElementIs(normalName, NamespaceHtml)

    /**
     * Checks if the Current Element's normal name equals the supplied name, in the specified namespace.
     * @param normalName name to check
     * @param namespace the namespace
     * @return true if there is a current element on the stack, and its name equals the supplied
     */
    fun currentElementIs(
        normalName: String?,
        namespace: String?,
    ): Boolean {
        if (stack.isEmpty()) return false
        val current = currentElement()
        return current.normalName() == normalName && current.tag().namespace() == namespace
    }

    /**
     * If the parser is tracking errors, add an error at the current position.
     * @param msg error message template
     */
    protected fun error(msg: String) {
        val errors: ParseErrorList = parser.getErrors()
        if (errors.canAddError()) errors.add(ParseError(reader, msg))
    }

    /**
     * (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
     * Data Nodes).
     * @param normalName the normalized tag name
     * @return always false in this base implementation
     */
    open fun isContentForTagData(normalName: String): Boolean = false

    /**
     * Returns the tag for [tagName] in [namespace], reusing the tag cached for this parse when its namespace matches.
     *
     * @param tagName the tag name; used as the cache key without normalization
     * @param namespace the namespace the tag must belong to
     * @param settings parse settings passed through to [Tag.valueOf]
     * @return the cached or newly created tag
     */
    protected fun tagFor(
        tagName: String,
        namespace: String,
        settings: ParseSettings?,
    ): Tag {
        val cache = checkNotNull(seenTags) { "tagFor() called outside of an active parse" }
        // note that we don't normalize the cache key. But tag via valueOf may be normalized.
        val cached = cache[tagName]
        // only return from cache if the namespace is the same. not running nested cache to save double hit on the common flow
        if (cached != null && cached.namespace() == namespace) return cached

        val tag = Tag.valueOf(tagName, namespace, settings)
        cache[tagName] = tag
        return tag
    }

    /**
     * Returns the tag for [tagName] in this builder's [defaultNamespace].
     * @param tagName the tag name
     * @param settings parse settings passed through to [Tag.valueOf]
     * @return the cached or newly created tag
     */
    fun tagFor(
        tagName: String,
        settings: ParseSettings?,
    ): Tag = tagFor(tagName, defaultNamespace(), settings)

    /**
     * Gets the default namespace for this TreeBuilder
     * @return the default namespace
     */
    open fun defaultNamespace(): String = NamespaceHtml

    /**
     * Called by implementing TreeBuilders when a node has been inserted. This implementation includes optionally tracking
     * the source range of the node.
     * @param node the node that was just inserted
     */
    fun onNodeInserted(node: Node) {
        trackNodePosition(node, true)
    }

    /**
     * Called by implementing TreeBuilders when a node is explicitly closed. This implementation includes optionally
     * tracking the closing source range of the node.
     * @param node the node being closed
     */
    fun onNodeClosed(node: Node) {
        trackNodePosition(node, false)
    }

    /**
     * Records the source range of [node] as user data on its attributes, when [trackSourceRange] is enabled.
     *
     * @param node the node to annotate
     * @param isStart true to record the opening range, false to record the closing range
     */
    private fun trackNodePosition(
        node: Node,
        isStart: Boolean,
    ) {
        if (!trackSourceRange) return

        val token = checkNotNull(currentToken) { "No current token while tracking node positions" }
        var startPos = token.startPos()
        var endPos = token.endPos()

        // handle implicit element open / closes.
        if (node is Element) {
            when {
                token.isEOF() -> {
                    // /body and /html are left on stack until EOF, don't reset them
                    if (node.endSourceRange().isTracked()) return

                    endPos = reader.pos()
                    startPos = endPos
                }

                isStart -> { // opening tag
                    // the current token did not open this element, so the open was implicit: zero-width range
                    if (!token.isStartTag() || node.normalName() != token.asStartTag().normalName) {
                        endPos = startPos
                    }
                }

                !node.tag().isEmpty && !node.tag().isSelfClosing() -> { // closing tag
                    // the current token did not close this element, so the close was implicit: zero-width range
                    if (!token.isEndTag() || node.normalName() != token.asEndTag().normalName) {
                        endPos = startPos
                    }
                }
            }
        }

        val startPosition = Range.Position(startPos, reader.lineNumber(startPos), reader.columnNumber(startPos))
        val endPosition = Range.Position(endPos, reader.lineNumber(endPos), reader.columnNumber(endPos))
        val rangeKey = if (isStart) SharedConstants.RangeKey else SharedConstants.EndRangeKey
        node.attributes().userData(rangeKey, Range(startPosition, endPosition))
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy