// All downloads are free. Search and download functionality uses the official Maven repository.

// commonMain.com.fleeksoft.ksoup.parser.TokenQueue.kt (Maven / Gradle / Ivy)

package com.fleeksoft.ksoup.parser

import com.fleeksoft.ksoup.helper.Validate
import com.fleeksoft.ksoup.internal.StringUtil

/**
 * A character queue with parsing helpers.
 *
 * The queue is a backing string plus a cursor: consuming characters advances the cursor instead of copying the
 * string, and "the queue" always means the characters from the cursor to the end of the backing string.
 *
 * @param data string of data to back queue.
 * @author Sabeeh
 */
internal class TokenQueue(data: String) {
    private var queue: String = data
    private var pos = 0

    /**
     * Tests whether every character has been consumed.
     * @return true if nothing is left on the queue.
     */
    fun isEmpty(): Boolean = remainingLength() == 0

    private fun remainingLength(): Int = queue.length - pos

    /**
     * Add a string to the start of the queue.
     * @param seq string to add.
     */
    fun addFirst(seq: String) {
        // not very performant, but an edge case: rebuilds the backing string and rewinds the cursor
        queue = seq + queue.substring(pos)
        pos = 0
    }

    /**
     * Tests if the next characters on the queue match the sequence. Case insensitive.
     * @param seq String to check queue for.
     * @return true if the next characters match.
     */
    fun matches(seq: String): Boolean = queue.regionMatches(pos, seq, 0, seq.length, ignoreCase = true)

    /**
     * Tests if the next characters match any of the sequences. Case insensitive.
     * @param seq list of strings to case insensitively check for
     * @return true if any matched, false if none did
     */
    fun matchesAny(vararg seq: String): Boolean = seq.any { matches(it) }

    /**
     * Tests if the next character is one of the supplied characters. Case sensitive.
     * @param seq characters to check for
     * @return true if the queue is not empty and its next character is in [seq]
     */
    fun matchesAny(vararg seq: Char): Boolean = !isEmpty() && queue[pos] in seq

    /**
     * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the
     * queue.
     * @param seq String to search for, and if found, remove from queue.
     * @return true if found and removed, false if not found.
     */
    fun matchChomp(seq: String): Boolean {
        val matched = matches(seq)
        if (matched) pos += seq.length
        return matched
    }

    /**
     * Tests if queue starts with a whitespace character.
     * @return if starts with whitespace
     */
    fun matchesWhitespace(): Boolean = !isEmpty() && StringUtil.isWhitespace(queue[pos].code)

    /**
     * Test if the queue matches a word character (letter or digit).
     * @return if matches a word character
     */
    fun matchesWord(): Boolean = !isEmpty() && queue[pos].isLetterOrDigit()

    /**
     * Drops the next character off the queue. Does nothing if the queue is already empty.
     */
    fun advance() {
        if (!isEmpty()) pos++
    }

    /**
     * Consume one character off queue.
     *
     * The character is read before the cursor moves, so a failed read on an exhausted queue leaves the cursor
     * where it was and [isEmpty] keeps reporting true afterwards.
     * @return first character on queue.
     */
    fun consume(): Char {
        val c = queue[pos]
        pos++
        return c
    }

    /**
     * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will
     * throw an illegal state exception -- but you should be running match() against that condition.
     *
     * Case insensitive.
     * @param seq sequence to remove from head of queue.
     * @throws IllegalStateException if the queue does not start with [seq].
     */
    fun consume(seq: String) {
        // matches() is a bounded region comparison, so a successful match already guarantees that
        // seq.length characters remain; no separate length check is required.
        check(matches(seq)) { "Queue did not match expected sequence" }
        pos += seq.length
    }

    /**
     * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
     * @param seq String to end on (and not include in return, but leave on queue). **Case sensitive.** Must not be
     * null.
     * @return The matched data consumed from queue.
     * @throws NullPointerException if [seq] is null.
     */
    fun consumeTo(seq: String?): String {
        val terminator = seq ?: throw NullPointerException("seq must not be null")
        val offset = queue.indexOf(terminator, pos)
        if (offset == -1) return remainder()
        val consumed = queue.substring(pos, offset)
        pos = offset
        return consumed
    }

    /**
     * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
     * @param seq String to end on (and not include in return, but leave on queue). **Case insensitive.** An empty
     * sequence matches immediately, so nothing is consumed.
     * @return The matched data consumed from queue.
     */
    fun consumeToIgnoreCase(seq: String): String {
        // An empty terminator matches at the current position; bail out before taking its first character.
        if (seq.isEmpty()) return ""
        val start = pos
        val first = seq.substring(0, 1)
        // if first is not cased, indexOf can be used to jump straight to the next candidate position
        val canScan = first.lowercase() == first.uppercase()
        while (!isEmpty() && !matches(seq)) {
            if (canScan) {
                val next = queue.indexOf(first, pos)
                pos = when {
                    next < 0 -> queue.length // no chance of finding, grab to end
                    next == pos -> pos + 1 // this char is the scan char, but not a match, so force advance
                    else -> next
                }
            } else {
                pos++
            }
        }
        return queue.substring(start, pos)
    }

    /**
     * Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
     * @param seq any number of terminators to consume to. **Case insensitive.**
     * @return consumed string
     */
    // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this
    // is a case sensitive time...
    fun consumeToAny(vararg seq: String): String {
        val start = pos
        while (!isEmpty() && !matchesAny(*seq)) {
            pos++
        }
        return queue.substring(start, pos)
    }

    /**
     * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it).
     *
     * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go
     * isEmpty() == true).
     * @param seq String to match up to, and not include in return, and to pull off queue. **Case sensitive.**
     * @return Data matched from queue.
     */
    fun chompTo(seq: String): String {
        val data = consumeTo(seq)
        matchChomp(seq)
        return data
    }

    /**
     * Same as [chompTo], but scans for the terminator case insensitively.
     * @param seq String to match up to, and not include in return, and to pull off queue. **Case insensitive.**
     * @return Data matched from queue.
     */
    fun chompToIgnoreCase(seq: String): String {
        val data = consumeToIgnoreCase(seq) // case insensitive scan
        matchChomp(seq)
        return data
    }

    /**
     * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
     * and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). Those escapes will be left
     * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for
     * contains text strings; use unescape for that.
     *
     * If the queue runs out while openers are still unclosed, the failure is reported through `Validate.fail`.
     * @param open opener
     * @param close closer
     * @return data matched from the queue
     */
    fun chompBalanced(
        open: Char,
        close: Char,
    ): String {
        var start = -1
        var end = -1
        var depth = 0
        var last = NUL
        var inSingleQuote = false
        var inDoubleQuote = false
        var inRegexQE = false // regex \Q .. \E escapes from Pattern.quote()
        do {
            if (isEmpty()) break
            val c = consume()
            if (last != ESC) {
                // a quote character only toggles quoting when it is not itself the opener
                if (c == '\'' && c != open && !inDoubleQuote) {
                    inSingleQuote = !inSingleQuote
                } else if (c == '"' && c != open && !inSingleQuote) {
                    inDoubleQuote = !inDoubleQuote
                }
                if (inSingleQuote || inDoubleQuote || inRegexQE) {
                    // quoted content never changes the depth; jump to the loop condition
                    last = c
                    continue
                }
                if (c == open) {
                    depth++
                    if (start == -1) start = pos
                } else if (c == close) {
                    depth--
                }
            } else if (c == 'Q') {
                inRegexQE = true
            } else if (c == 'E') {
                inRegexQE = false
            }
            if (depth > 0 && last != NUL) {
                end = pos // don't include the outer match pair in the return
            }
            last = c
        } while (depth > 0)
        val out = if (end >= 0) queue.substring(start, end) else ""
        if (depth > 0) { // ran out of queue before seeing enough )
            Validate.fail("Did not find balanced marker at '$out'")
        }
        return out
    }

    /**
     * Pulls the next run of whitespace characters of the queue.
     * @return Whether consuming whitespace or not
     */
    fun consumeWhitespace(): Boolean {
        var seen = false
        while (matchesWhitespace()) {
            pos++
            seen = true
        }
        return seen
    }

    /**
     * Retrieves the next run of word type (letter or digit) off the queue.
     * @return String of word characters from queue, or empty string if none.
     */
    fun consumeWord(): String {
        val start = pos
        while (matchesWord()) pos++
        return queue.substring(start, pos)
    }

    /**
     * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
     *
     * @return tag name
     */
    fun consumeElementSelector(): String = consumeEscapedCssIdentifier(*ElementSelectorChars)

    /**
     * Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
     * http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
     * @return identifier
     */
    fun consumeCssIdentifier(): String = consumeEscapedCssIdentifier(*CssIdentifierChars)

    /**
     * Consumes a run of identifier characters, treating a backslash followed by any character as part of the
     * identifier. If at least one escape was seen, the result is passed through [unescape].
     * @param matches extra sequences (besides letters and digits) that count as identifier characters
     * @return the consumed identifier, unescaped when it contained escapes
     */
    private fun consumeEscapedCssIdentifier(vararg matches: String): String {
        val start = pos
        var escaped = false
        while (!isEmpty()) {
            if (queue[pos] == ESC && remainingLength() > 1) {
                escaped = true
                pos += 2 // skip the escape and the escaped
            } else if (matchesCssIdentifier(*matches)) {
                pos++
            } else {
                break
            }
        }
        val consumed = queue.substring(start, pos)
        return if (escaped) unescape(consumed) else consumed
    }

    private fun matchesCssIdentifier(vararg matches: String): Boolean = matchesWord() || matchesAny(*matches)

    /**
     * Consume and return whatever is left on the queue.
     * @return remainder of queue.
     */
    fun remainder(): String {
        val remainder = queue.substring(pos)
        pos = queue.length
        return remainder
    }

    /**
     * Returns the unconsumed part of the queue without consuming it.
     */
    override fun toString(): String = queue.substring(pos)

    companion object {
        private const val ESC = '\\' // escape char for chomp balanced.
        private const val NUL = '\u0000' // "no previous character" marker

        /**
         * Unescape a \ escaped string: a lone backslash is dropped, a doubled backslash yields a single one.
         * @param input backslash escaped string
         * @return unescaped string
         */
        fun unescape(input: String): String {
            val output = StringBuilder(input.length)
            var last = NUL
            for (c in input) {
                if (c != ESC) {
                    output.append(c)
                    last = c
                } else if (last == ESC) {
                    // escaped backslash: emit it and reset, so a third backslash starts a new escape
                    output.append(c)
                    last = NUL
                } else {
                    last = c
                }
            }
            return output.toString()
        }

        /**
         * Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would
         * otherwise not be valid in a selector.
         * @param in identifier to escape
         * @return the identifier with a backslash inserted before every character that is not a letter, a digit,
         * or one of the element selector characters
         */
        fun escapeCssIdentifier(`in`: String): String {
            val out: StringBuilder = StringUtil.borrowBuilder()
            val q = TokenQueue(`in`)
            while (!q.isEmpty()) {
                if (q.matchesCssIdentifier(*ElementSelectorChars)) {
                    out.append(q.consume())
                } else {
                    out.append(ESC).append(q.consume())
                }
            }
            return StringUtil.releaseBuilder(out)
        }

        private val ElementSelectorChars = arrayOf("*|", "|", "_", "-")
        private val CssIdentifierChars = arrayOf("-", "_")
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy