// Source artifact: kotlinx-serialization-json (Kotlin multiplatform serialization runtime library).
// File: commonMain.kotlinx.serialization.json.internal.lexer.AbstractJsonLexer.kt
// Note: a newer version of this artifact (1.7.3) exists.
/*
 * Copyright 2017-2021 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license.
 */

package kotlinx.serialization.json.internal

import kotlinx.serialization.json.internal.CharMappings.CHAR_TO_TOKEN
import kotlinx.serialization.json.internal.CharMappings.ESCAPE_2_CHAR
import kotlin.js.*
import kotlin.jvm.*
import kotlin.math.*

// Hint texts that name the 'Json {}' builder option relevant to a decoding failure.
// Two of them (coerceInputValuesHint, ignoreUnknownKeysHint) are passed in this file as the `hint`
// argument of AbstractJsonLexer.fail, which appends the hint to the error message on a new line.
// NOTE(review): the first two hints close 'Json {} with a backtick instead of an apostrophe; the text is
// emitted at runtime, so it is left untouched here.
internal const val lenientHint = "Use 'isLenient = true' in 'Json {}` builder to accept non-compliant JSON."
internal const val coerceInputValuesHint = "Use 'coerceInputValues = true' in 'Json {}` builder to coerce nulls to default values."
internal const val specialFlowingValuesHint =
    "It is possible to deserialize them using 'JsonBuilder.allowSpecialFloatingPointValues = true'"
internal const val ignoreUnknownKeysHint = "Use 'ignoreUnknownKeys = true' in 'Json {}' builder to ignore unknown keys."
internal const val allowStructuredMapKeysHint =
    "Use 'allowStructuredMapKeys = true' in 'Json {}' builder to convert such maps to [key1, value1, key2, value2,...] arrays."

// Special strings: the textual form of the JSON null literal.
internal const val NULL = "null"

// Special chars: the structural characters of JSON plus the string delimiter and the escape prefix.
internal const val COMMA = ','
internal const val COLON = ':'
internal const val BEGIN_OBJ = '{'
internal const val END_OBJ = '}'
internal const val BEGIN_LIST = '['
internal const val END_LIST = ']'
internal const val STRING = '"'
internal const val STRING_ESC = '\\'

// INVALID (char code 0) is what escapeToChar returns for an escape character it has no mapping for.
internal const val INVALID = 0.toChar()
// The character that follows a backslash in a \uXXXX escape.
internal const val UNICODE_ESC = 'u'

// Token classes: the values stored in CharMappings.CHAR_TO_TOKEN and returned by charToTokenClass.
// TC_OTHER is also the class of every char that has no explicit entry in that table.
internal const val TC_OTHER: Byte = 0
internal const val TC_STRING: Byte = 1
internal const val TC_STRING_ESC: Byte = 2
internal const val TC_WHITESPACE: Byte = 3
internal const val TC_COMMA: Byte = 4
internal const val TC_COLON: Byte = 5
internal const val TC_BEGIN_OBJ: Byte = 6
internal const val TC_END_OBJ: Byte = 7
internal const val TC_BEGIN_LIST: Byte = 8
internal const val TC_END_LIST: Byte = 9
internal const val TC_EOF: Byte = 10
internal const val TC_INVALID: Byte = Byte.MAX_VALUE

// Size of the char -> token class table; charToTokenClass treats any char code >= CTC_MAX as TC_OTHER.
private const val CTC_MAX = 0x7e

// Size of the escape char -> real char table; escapeToChar returns INVALID for any code >= ESC2C_MAX.
private const val ESC2C_MAX = 0x75

// OR-ing a char code with this bit is how the boolean-literal readers compare letters case-insensitively.
internal const val asciiCaseMask = 1 shl 5

// object instead of @SharedImmutable because there is mutual initialization in [initC2ESC] and [initC2TC]
internal object CharMappings {
    /**
     * Indexed by the code of the char that follows a backslash; holds the char that escape decodes to.
     * Slots that are never registered keep the array default (char code 0, i.e. [INVALID]).
     */
    @JvmField
    val ESCAPE_2_CHAR = CharArray(ESC2C_MAX)

    /** Indexed by char code; holds the `TC_*` class of that char. Unregistered slots stay 0 ([TC_OTHER]). */
    @JvmField
    val CHAR_TO_TOKEN = ByteArray(CTC_MAX)

    init {
        initEscape()
        initCharToToken()
    }

    /** Fills [ESCAPE_2_CHAR] with every short (single-letter) escape. */
    private fun initEscape() {
        // Control chars are registered under the unicode escape, which initC2ESC deliberately does not store.
        (0x00..0x1f).forEach { code -> initC2ESC(code, UNICODE_ESC) }

        // Control chars that do have a short escape letter.
        initC2ESC(0x08, 'b')
        initC2ESC(0x09, 't')
        initC2ESC(0x0a, 'n')
        initC2ESC(0x0c, 'f')
        initC2ESC(0x0d, 'r')

        // Chars that escape to themselves.
        for (self in charArrayOf('/', STRING, STRING_ESC)) {
            initC2ESC(self, self)
        }
    }

    /** Fills [CHAR_TO_TOKEN]: control chars are invalid, then whitespace and structural chars get their class. */
    private fun initCharToToken() {
        // Everything up to and including the space starts out invalid...
        repeat(0x21) { code -> initC2TC(code, TC_INVALID) }

        // ...and the four JSON whitespace chars are then re-classified.
        for (whitespace in intArrayOf(0x09, 0x0a, 0x0d, 0x20)) {
            initC2TC(whitespace, TC_WHITESPACE)
        }

        initC2TC(COMMA, TC_COMMA)
        initC2TC(COLON, TC_COLON)
        initC2TC(BEGIN_OBJ, TC_BEGIN_OBJ)
        initC2TC(END_OBJ, TC_END_OBJ)
        initC2TC(BEGIN_LIST, TC_BEGIN_LIST)
        initC2TC(END_LIST, TC_END_LIST)
        initC2TC(STRING, TC_STRING)
        initC2TC(STRING_ESC, TC_STRING_ESC)
    }

    /** Registers that the escape letter [esc] decodes to the char with code [c]; the unicode escape is skipped. */
    private fun initC2ESC(c: Int, esc: Char) {
        if (esc == UNICODE_ESC) return
        ESCAPE_2_CHAR[esc.code] = c.toChar()
    }

    private fun initC2ESC(c: Char, esc: Char) {
        initC2ESC(c.code, esc)
    }

    /** Registers the token class [cl] for the char with code [c]. */
    private fun initC2TC(c: Int, cl: Byte) {
        CHAR_TO_TOKEN[c] = cl
    }

    private fun initC2TC(c: Char, cl: Byte) {
        initC2TC(c.code, cl)
    }
}

/** Returns the `TC_*` token class of [c]; chars whose code lies outside the lookup table are [TC_OTHER]. */
internal fun charToTokenClass(c: Char): Byte {
    val code = c.code
    return if (code >= CTC_MAX) TC_OTHER else CHAR_TO_TOKEN[code]
}

/** Decodes the escape letter with code [c] (the char after a backslash); [INVALID] when the code is past the table. */
internal fun escapeToChar(c: Int): Char = when {
    c < ESC2C_MAX -> ESCAPE_2_CHAR[c]
    else -> INVALID
}

/**
 * The base class that reads the JSON from the given char sequence source.
 * It has two implementations: one over the raw [String] instance, [StringJsonLexer],
 * and one over an arbitrary stream of data, [ReaderJsonLexer] (JVM-only).
 *
 * [AbstractJsonLexer] contains the base implementation of cold or otherwise not performance-sensitive
 * methods on top of [CharSequence]; [StringJsonLexer] overrides some
 * of them for performance reasons (devirtualization of [CharSequence] and avoidance
 * of additional spills).
 */
internal abstract class AbstractJsonLexer {

    /** The characters being lexed; every index-based read in this class goes through it. */
    protected abstract val source: CharSequence

    // Index in [source] of the next char to read.
    // Note: several methods of this class store -1 here once prefetchOrEof has reported the end of input.
    @JvmField
    protected var currentPosition: Int = 0 // position in source

    /** Its `getPath()` result is embedded into the message of every exception raised through `fail`. */
    @JvmField
    val path = JsonPath()

    /**
     * Hook with an empty default body, invoked before scanning in `consumeNextToken(Char)` and `appendHex`.
     * NOTE(review): presumably lets buffered implementations refill [source] — confirm in the subclasses.
     */
    open fun ensureHaveChars() {}

    /** `true` when [peekNextToken] finds anything but [TC_EOF]; like that method, it skips leading whitespace. */
    fun isNotEof(): Boolean = peekNextToken() != TC_EOF

    /**
     * Used as the bound check in loops: callers in this class treat a result of -1 as the end of input and
     * any other result as the index in [source] to continue reading from.
     */
    abstract fun prefetchOrEof(position: Int): Int

    // NOTE(review): contract not visible in this file; by its name it consumes a comma if one is next — confirm in subclasses.
    abstract fun tryConsumeComma(): Boolean

    // NOTE(review): contract not visible in this file; presumably related to isValidValueStart — confirm in subclasses.
    abstract fun canConsumeValue(): Boolean

    /** Consumes the next token and returns its `TC_*` class; [expectEof] relies on [TC_EOF] being returned at the end. */
    abstract fun consumeNextToken(): Byte

    /** Tells whether [c] may begin a value: anything except a closing brace/bracket, a colon or a comma. */
    protected fun isValidValueStart(c: Char): Boolean =
        c != '}' && c != ']' && c != ':' && c != ','

    /** Consumes the next token and fails unless it is the end of input. */
    fun expectEof() {
        if (consumeNextToken() == TC_EOF) return
        fail("Expected EOF after parsing, but had ${source[currentPosition - 1]} instead")
    }

    /*
     * Peeked string for coerced enums.
     * If a value was peeked (see 'peekString'), 'consumeString' and 'consumeStringLenient' return it
     * without scanning the source again.
     */
    private var peekedString: String? = null
    // Collects string content that cannot be returned as a plain substring of the source (escapes were decoded
    // or the string crossed a chunk boundary); 'decodedString' drains it and resets its length to 0.
    protected var escapedString = StringBuilder()

    // TODO consider replacing usages of this method in JsonParser with char overload
    /** Consumes the next token and returns it, failing when its class differs from [expected]. */
    fun consumeNextToken(expected: Byte): Byte {
        val actual = consumeNextToken()
        return if (actual == expected) actual else fail(expected)
    }

    /**
     * Skips whitespace and consumes exactly one char, which must be [expected].
     * Any other char, or the end of input, is reported through [unexpectedToken].
     */
    open fun consumeNextToken(expected: Char) {
        ensureHaveChars()
        val chars = source
        var index = prefetchOrEof(currentPosition) // could be inline function but KT-1436
        while (index != -1) {
            val candidate = chars[index]
            index++
            when (candidate) {
                ' ', '\n', '\r', '\t' -> Unit // whitespace: keep scanning
                else -> {
                    currentPosition = index
                    if (candidate == expected) return
                    unexpectedToken(expected)
                }
            }
            index = prefetchOrEof(index)
        }
        currentPosition = index
        unexpectedToken(expected) // EOF
    }

    /**
     * Raises the "expected X" error for [expected]. When a string was expected and an unquoted `null`
     * stands there instead, a dedicated message with the coercion hint is produced.
     */
    protected fun unexpectedToken(expected: Char) {
        currentPosition -= 1 // To properly handle null
        val stringWasExpected = currentPosition >= 0 && expected == STRING
        if (stringWasExpected && consumeStringLenient() == NULL) {
            fail("Expected string literal but 'null' literal was found", currentPosition - 4, coerceInputValuesHint)
        }
        fail(charToTokenClass(expected))
    }

    /**
     * Fails with an "Expected <token>, but had '<char>' instead" message, where the char is the one just
     * before [currentPosition] (or "EOF" when the position is at the end of [source] or not positive).
     */
    internal fun fail(expectedToken: Byte): Nothing {
        // We know that the token was consumed prior to this call
        // Slow path, never called in normal code, can avoid optimizing it
        val position = currentPosition
        val actual = when {
            position == source.length -> "EOF"
            position <= 0 -> "EOF"
            else -> source[position - 1].toString()
        }
        val expected = when (expectedToken) {
            TC_STRING -> "quotation mark '\"'"
            TC_COMMA -> "comma ','"
            TC_COLON -> "colon ':'"
            TC_BEGIN_OBJ -> "start of the object '{'"
            TC_END_OBJ -> "end of the object '}'"
            TC_BEGIN_LIST -> "start of the array '['"
            TC_END_LIST -> "end of the array ']'"
            else -> "valid token" // should never happen
        }
        fail("Expected $expected, but had '$actual' instead", position - 1)
    }

    /**
     * Skips whitespace and returns the class of the next char without consuming it; [TC_EOF] at the end of
     * input. [currentPosition] is left on that char (or on -1 at the end of input).
     */
    fun peekNextToken(): Byte {
        val chars = source
        var index = prefetchOrEof(currentPosition)
        while (index != -1) {
            val candidate = chars[index]
            val isWhitespace = candidate == ' ' || candidate == '\n' || candidate == '\r' || candidate == '\t'
            if (!isWhitespace) {
                currentPosition = index
                return charToTokenClass(candidate)
            }
            index = prefetchOrEof(index + 1)
        }
        currentPosition = index
        return TC_EOF
    }

    /**
     * Checks whether the `null` literal comes next (after optional whitespace).
     * Returns `true` when it does, moving past it only if [doConsume] is `true`;
     * returns `false` when the next 4 chars are not `null` or there are fewer than 4 chars left.
     */
    fun tryConsumeNull(doConsume: Boolean = true): Boolean {
        val start = prefetchOrEof(skipWhitespaces())
        // Cannot consume null due to EOF, maybe something else
        val remaining = source.length - start
        if (start == -1 || remaining < 4) return false
        if ((0..3).any { offset -> NULL[offset] != source[start + offset] }) return false
        /*
         * If we're in lenient mode, this might be the string with 'null' prefix,
         * distinguish it from 'null'
         */
        val continuesAsLiteral = remaining > 4 && charToTokenClass(source[start + 4]) == TC_OTHER
        if (continuesAsLiteral) return false

        if (doConsume) currentPosition = start + 4
        return true
    }

    /**
     * Advances [currentPosition] past spaces, line feeds, carriage returns and tabs.
     * Returns the new position, which is -1 when [prefetchOrEof] reported the end of input.
     */
    open fun skipWhitespaces(): Int {
        var index = prefetchOrEof(currentPosition)
        while (index != -1) {
            val c = source[index]
            // Faster than char2TokenClass actually
            if (c != ' ' && c != '\n' && c != '\r' && c != '\t') break
            index = prefetchOrEof(index + 1)
        }
        currentPosition = index
        return index
    }

    /**
     * NOTE(review): the contract is not visible in this file — implementations live in subclasses.
     * Judging by the signature it peeks a leading string value to compare with [keyToMatch], with [isLenient]
     * selecting lenient string handling; confirm when `null` is returned and whether the position is preserved.
     */
    abstract fun peekLeadingMatchingValue(keyToMatch: String, isLenient: Boolean): String?

    /**
     * Reads the upcoming string and caches it so that the next `consumeString`/`consumeStringLenient` call
     * returns it without re-scanning. Returns `null` when no string starts here: a quoted string is always
     * accepted, an unquoted one only when [isLenient] is `true`.
     */
    fun peekString(isLenient: Boolean): String? {
        val token = peekNextToken()
        val startsString = token == TC_STRING || (isLenient && token == TC_OTHER)
        if (!startsString) return null
        val string = if (isLenient) consumeStringLenient() else consumeString()
        return string.also { peekedString = it }
    }

    /** Forgets the string cached by [peekString], so the next string read scans the source again. */
    fun discardPeeked() {
        peekedString = null
    }

    // Thin, overridable wrappers over the corresponding [source] operations.
    /** Index of the first occurrence of [char] in [source] at or after [startPos], as reported by `CharSequence.indexOf`. */
    open fun indexOf(char: Char, startPos: Int) = source.indexOf(char, startPos)
    /** The part of [source] from [startPos] (inclusive) to [endPos] (exclusive), as a [String]. */
    open fun substring(startPos: Int, endPos: Int) =  source.substring(startPos, endPos)

    /*
     * This method is a copy of consumeString, but is used for the keys of JSON objects, so there
     * is no need to look up the peeked string.
     * Within this class it is also what consumeString() falls back to when nothing was peeked.
     */
    abstract fun consumeKeyString(): String

    /**
     * Tells whether [char] still belongs to the string being read: in lenient mode any char of class
     * [TC_OTHER], otherwise anything but the closing quotation mark.
     */
    private fun insideString(isLenient: Boolean, char: Char): Boolean = when {
        isLenient -> charToTokenClass(char) == TC_OTHER
        else -> char != STRING
    }

    /**
     * Reads a string and hands its content to [consumeChunk] piece by piece instead of building one [String].
     *
     * In non-lenient mode the string must be quoted and escape sequences are decoded; in lenient mode an
     * unquoted run of [TC_OTHER] chars is read verbatim (nothing is consumed if the next token is not of
     * that class). A chunk is emitted whenever the end of the currently available [source] is reached and
     * once more at the end of the string.
     *
     * Fails with "EOF" when the input ends inside the string, including directly after an escape sequence.
     */
    open fun consumeStringChunked(isLenient: Boolean, consumeChunk: (stringChunk: String) -> Unit) { // open to allow simpler implementations (i.e. StringJsonLexer)
        val nextToken = peekNextToken()
        if (isLenient && nextToken != TC_OTHER) return // nothing to consume

        if (!isLenient) {
            consumeNextToken(STRING)
        }
        var currentPosition = this.currentPosition
        var lastPosition = currentPosition
        var char = source[currentPosition] // Avoid two range checks visible in the profiler
        var usedAppend = false
        while (insideString(isLenient, char)) {
            if (!isLenient && char == STRING_ESC) { // handle escaping only in non-lenient mode
                usedAppend = true
                currentPosition = prefetchOrEof(appendEscape(lastPosition, currentPosition))
                // The input may end right after the escape; without this check the -1 marker would be
                // used as an index below. Mirrors the same check in consumeString.
                if (currentPosition == -1)
                    fail("EOF", currentPosition)
                lastPosition = currentPosition
            } else {
                currentPosition++
            }
            if (currentPosition >= source.length) {
                // end of chunk
                writeRange(lastPosition, currentPosition, usedAppend, consumeChunk)
                usedAppend = false
                currentPosition = prefetchOrEof(currentPosition)
                if (currentPosition == -1)
                    fail("EOF", currentPosition)
                lastPosition = currentPosition
            }
            char = source[currentPosition]
        }
        writeRange(lastPosition, currentPosition, usedAppend, consumeChunk)
        this.currentPosition = currentPosition
        if (!isLenient) {
            consumeNextToken(STRING)
        }
    }

    /**
     * Emits one chunk to [consumeChunk]: the decoded buffer plus the given range when the chunk contained
     * an escape, otherwise the plain substring `[fromIndex, toIndex)`.
     */
    private fun writeRange(fromIndex: Int, toIndex: Int, currentChunkHasEscape: Boolean, consumeChunk: (stringChunk: String) -> Unit) {
        val chunk = if (currentChunkHasEscape) decodedString(fromIndex, toIndex) else substring(fromIndex, toIndex)
        consumeChunk(chunk)
    }


    /** Returns the previously peeked string if there is one, otherwise reads a quoted string from the source. */
    fun consumeString(): String =
        if (peekedString != null) takePeeked() else consumeKeyString()

    /**
     * Reads the content of a quoted string up to its closing quotation mark.
     *
     * Scanning starts at [current]; [startPosition] is the index from which content has not yet been copied
     * into the escape buffer. Escape sequences are decoded, and when the end of the available [source] is
     * hit the content read so far is buffered and more input is requested through [prefetchOrEof].
     * On return `this.currentPosition` points just past the closing quotation mark.
     * Fails with "EOF" when the input ends before the closing quotation mark.
     *
     * NOTE(review): the [source] parameter shadows the property of the same name, while the helpers called
     * here (`substring`, `appendRange`) read the property — presumably callers always pass the same object.
     */
    @JsName("consumeString2") // WA for JS issue
    protected fun consumeString(source: CharSequence, startPosition: Int, current: Int): String {
        var currentPosition = current
        var lastPosition = startPosition
        var char = source[currentPosition] // Avoid two range checks visible in the profiler
        // Becomes true as soon as anything was written to the escape buffer
        var usedAppend = false
        while (char != STRING) {
            if (char == STRING_ESC) {
                usedAppend = true
                // Buffers the text before the backslash plus the decoded escape, then resumes after it
                currentPosition = prefetchOrEof(appendEscape(lastPosition, currentPosition))
                if (currentPosition == -1)
                    fail("EOF", currentPosition)
                lastPosition = currentPosition
            } else if (++currentPosition >= source.length) {
                usedAppend = true
                // end of chunk
                appendRange(lastPosition, currentPosition)
                currentPosition = prefetchOrEof(currentPosition)
                if (currentPosition == -1)
                    fail("EOF", currentPosition)
                lastPosition = currentPosition
            }
            char = source[currentPosition]
        }

        val string = if (!usedAppend) {
            // there were no escaped chars
            substring(lastPosition, currentPosition)
        } else {
            // some escaped chars were there
            decodedString(lastPosition, currentPosition)
        }
        // Step over the closing quotation mark
        this.currentPosition = currentPosition + 1
        return string
    }

    /**
     * Buffers the pending text `[lastPosition, current)` and then the decoded escape whose backslash sits at
     * [current]. Returns the position right after the escape sequence.
     */
    private fun appendEscape(lastPosition: Int, current: Int): Int {
        val afterBackslash = current + 1
        appendRange(lastPosition, current)
        return appendEsc(afterBackslash)
    }

    /** Appends the range `[lastPosition, currentPosition)` to the escape buffer, returns its content and empties it. */
    private fun decodedString(lastPosition: Int, currentPosition: Int): String {
        appendRange(lastPosition, currentPosition)
        return escapedString.toString().also { escapedString.setLength(0) }
    }

    /** Returns the peeked string and clears it; callers check beforehand that one is present. */
    private fun takePeeked(): String {
        val taken = peekedString!!
        peekedString = null
        return taken
    }

    /**
     * Same as [consumeStringLenient], but fails when the value read is `null` _without_ quotation marks,
     * i.e. the null literal where a non-nullable string is required.
     */
    fun consumeStringLenientNotNull(): String {
        val result = consumeStringLenient()
        val isBareNull = result == NULL && wasUnquotedString()
        if (isBareNull) fail("Unexpected 'null' value instead of string literal")
        return result
    }

    // Is invoked _only_ when the 'null' string was read, thus 'cP - 1' is always within bounds
    private fun wasUnquotedString(): Boolean = source[currentPosition - 1] != STRING

    /**
     * Reads a string that may be unquoted.
     *
     * A previously peeked string is returned as is. Otherwise, after skipping whitespace, a quoted string is
     * delegated to [consumeString]; an unquoted one is the run of chars of class [TC_OTHER] starting at the
     * current position. Fails with "EOF" at the end of input and with "Expected beginning of the string..."
     * when the first char is neither a quotation mark nor of class [TC_OTHER].
     */
    // Allows consuming unquoted string
    fun consumeStringLenient(): String {
        if (peekedString != null) {
            return takePeeked()
        }
        var current = skipWhitespaces()
        if (current >= source.length || current == -1) fail("EOF", current)
        val token = charToTokenClass(source[current])
        if (token == TC_STRING) {
            return consumeString()
        }

        if (token != TC_OTHER) {
            fail("Expected beginning of the string, but got ${source[current]}")
        }
        // Becomes true once part of the value had to be copied into the escape buffer
        var usedAppend = false
        while (charToTokenClass(source[current]) == TC_OTHER) {
            ++current
            if (current >= source.length) {
                // End of the available chars: buffer what was read so far and ask for more input
                usedAppend = true
                appendRange(currentPosition, current)
                val eof = prefetchOrEof(current)
                if (eof == -1) {
                    // to handle plain lenient strings, such as top-level
                    currentPosition = current
                    // Empty range: only drains what is already in the buffer
                    return decodedString(0, 0)
                } else {
                    current = eof
                }
            }
        }
        val result = if (!usedAppend) {
            substring(currentPosition, current)
        } else {
            decodedString(currentPosition, current)
        }
        currentPosition = current
        return result
    }

    // initializes buf usage upon the first encountered escaped char
    /** Appends the chars of [source] in `[fromIndex, toIndex)` to the escape buffer; overridable by subclasses. */
    protected open fun appendRange(fromIndex: Int, toIndex: Int) {
        escapedString.append(source, fromIndex, toIndex)
    }

    /**
     * Decodes the escape whose letter (the char after the backslash) is at [startPosition] and appends the
     * result to the escape buffer. Returns the position after the escape sequence.
     * Fails at the end of input and on an escape letter that has no mapping.
     */
    private fun appendEsc(startPosition: Int): Int {
        val letterIndex = prefetchOrEof(startPosition)
        if (letterIndex == -1) fail("Expected escape sequence to continue, got EOF")
        val currentChar = source[letterIndex]
        val afterLetter = letterIndex + 1
        if (currentChar == UNICODE_ESC) return appendHex(source, afterLetter)

        val decoded = escapeToChar(currentChar.code)
        if (decoded == INVALID) fail("Invalid escaped char '$currentChar'")
        escapedString.append(decoded)
        return afterLetter
    }

    /**
     * Decodes the four hex digits of a `\uXXXX` escape starting at [startPos] and appends the resulting char
     * to the escape buffer. Returns the position after the digits.
     * When the digits are not fully available, more input is requested once; if that does not help the
     * call fails with "Unexpected EOF during unicode escape".
     */
    private fun appendHex(source: CharSequence, startPos: Int): Int {
        if (startPos + 4 < source.length) {
            // Fold the digits left to right, four bits each
            var codeUnit = 0
            for (offset in 0..3) {
                codeUnit = (codeUnit shl 4) + fromHexChar(source, startPos + offset)
            }
            escapedString.append(codeUnit.toChar())
            return startPos + 4
        }

        currentPosition = startPos
        ensureHaveChars()
        if (currentPosition + 4 >= source.length)
            fail("Unexpected EOF during unicode escape")
        return appendHex(source, currentPosition)
    }

    /**
     * Fails at [position] (the current position by default) with the lazily built [message] when
     * [condition] is `false`; the message lambda is not invoked otherwise.
     */
    internal inline fun require(condition: Boolean, position: Int = currentPosition, message: () -> String) {
        if (!condition) fail(message(), position)
    }

    /** Returns the value (0..15) of the hex digit at [currentPosition] in [source]; fails on any other char. */
    private fun fromHexChar(source: CharSequence, currentPosition: Int): Int {
        val character = source[currentPosition]
        return when {
            character in '0'..'9' -> character - '0'
            character in 'a'..'f' -> character - 'a' + 10
            character in 'A'..'F' -> character - 'A' + 10
            else -> fail("Invalid toHexChar char '$character' in unicode escape")
        }
    }

    /**
     * Skips one complete JSON element starting at the current position.
     *
     * A primitive is skipped with a single lenient string read. For an object or an array, tokens are
     * consumed until the opening bracket is balanced; a stack of opening token classes is kept so that a
     * closing bracket of the wrong kind is reported.
     *
     * @param allowLenientStrings whether quoted strings inside the element are read with
     * [consumeStringLenient] instead of [consumeKeyString].
     * @throws JsonDecodingException on mismatched brackets or when the input ends inside the element.
     */
    fun skipElement(allowLenientStrings: Boolean) {
        // Explicit element type: an empty mutableListOf() call gives the compiler nothing to infer it from.
        val tokenStack = mutableListOf<Byte>()
        var lastToken = peekNextToken()
        if (lastToken != TC_BEGIN_LIST && lastToken != TC_BEGIN_OBJ) {
            consumeStringLenient()
            return
        }
        while (true) {
            lastToken = peekNextToken()
            if (lastToken == TC_STRING) {
                if (allowLenientStrings) consumeStringLenient() else consumeKeyString()
                continue
            }
            when (lastToken) {
                TC_BEGIN_LIST, TC_BEGIN_OBJ -> {
                    tokenStack.add(lastToken)
                }
                TC_END_LIST -> {
                    if (tokenStack.last() != TC_BEGIN_LIST) throw JsonDecodingException(
                        currentPosition,
                        "found ] instead of } at path: $path",
                        source
                    )
                    tokenStack.removeAt(tokenStack.lastIndex)
                }
                TC_END_OBJ -> {
                    if (tokenStack.last() != TC_BEGIN_OBJ) throw JsonDecodingException(
                        currentPosition,
                        "found } instead of ] at path: $path",
                        source
                    )
                    tokenStack.removeAt(tokenStack.lastIndex)
                }
                TC_EOF -> fail("Unexpected end of input due to malformed JSON during ignoring unknown keys")
            }
            consumeNextToken()
            // The stack is empty again once the bracket that opened the element has been closed
            if (tokenStack.size == 0) return
        }
    }

    /** Debug representation showing the whole source and the current position. */
    override fun toString(): String =
        "JsonReader(source='$source', currentPosition=$currentPosition)"

    /** Fails with an "unknown key" error that points at the last occurrence of [key] before the current position. */
    fun failOnUnknownKey(key: String) {
        // At this moment we already have both the key and the colon (and whitespaces!) consumed,
        // but still would like the error to point to the beginning of the key, so we are backtracking to it
        val keyStart = substring(0, currentPosition).lastIndexOf(key)
        fail("Encountered an unknown key '$key'", keyStart, ignoreUnknownKeysHint)
    }

    /**
     * Throws a [JsonDecodingException] for [position] (the current position by default). The message is
     * [message] followed by the current path and, when [hint] is not empty, the hint on a new line.
     */
    fun fail(message: String, position: Int = currentPosition, hint: String = ""): Nothing {
        val hintSuffix = if (hint.isNotEmpty()) "\n$hint" else ""
        throw JsonDecodingException(position, "$message at path: ${path.getPath()}$hintSuffix", source)
    }

    /**
     * Reads an integral number and returns it as a [Long].
     *
     * Accepted form: optional surrounding quotation marks, an optional leading '-', decimal digits and an
     * optional exponent ('e' or 'E', optionally followed by '+' or '-', then digits). With an exponent the
     * value is computed in floating point and must still be integral and within the [Long] range.
     *
     * Fails on the end of input, on any unexpected symbol, on a missing closing quotation mark, on a value
     * that does not fit into [Long] ("Numeric value overflow") and on a non-integral result.
     */
    fun consumeNumericLiteral(): Long {
        /*
         * This is an optimized (~40% for numbers) version of consumeString().toLong()
         * that doesn't allocate and also doesn't support any radix but 10
         */
        var current = skipWhitespaces()
        current = prefetchOrEof(current)
        if (current >= source.length || current == -1) fail("EOF")
        val hasQuotation = if (source[current] == STRING) {
            // Check it again
            // not sure if should call ensureHaveChars() because threshold is far greater than chars count in MAX_LONG
            if (++current == source.length) fail("EOF")
            true
        } else {
            false
        }
        // The mantissa is accumulated as a NEGATIVE number so that Long.MIN_VALUE is representable
        var accumulator = 0L
        var exponentAccumulator = 0L
        var isNegative = false
        var isExponentPositive = false
        var hasExponent = false
        val start = current
        while (current != source.length) {
            val ch: Char = source[current]
            if ((ch == 'e' || ch == 'E') && !hasExponent) {
                if (current == start) fail("Unexpected symbol $ch in numeric literal")
                isExponentPositive = true
                hasExponent = true
                ++current
                continue
            }
            if (ch == '-' && hasExponent) {
                if (current == start) fail("Unexpected symbol '-' in numeric literal")
                isExponentPositive = false
                ++current
                continue
            }
            if (ch == '+' && hasExponent) {
                if (current == start) fail("Unexpected symbol '+' in numeric literal")
                isExponentPositive = true
                ++current
                continue
            }
            if (ch == '-') {
                if (current != start) fail("Unexpected symbol '-' in numeric literal")
                isNegative = true
                ++current
                continue
            }
            val token = charToTokenClass(ch)
            if (token != TC_OTHER) break
            ++current
            val digit = ch - '0'
            if (digit !in 0..9) fail("Unexpected symbol '$ch' in numeric literal")
            if (hasExponent) {
                exponentAccumulator = exponentAccumulator * 10 + digit
                continue
            }
            // Detect overflow BEFORE multiplying: a wrapped product may stay negative, so inspecting the
            // sign afterwards is not enough. Integer division truncates toward zero, which for this
            // negative bound is exactly the smallest accumulator that can still take the digit.
            if (accumulator < (Long.MIN_VALUE + digit) / 10) fail("Numeric value overflow")
            accumulator = accumulator * 10 - digit
        }
        if (start == current || (isNegative && start == current - 1)) {
            fail("Expected numeric literal")
        }
        if (hasQuotation) {
            // The loop above may have stopped at the end of the available chars
            if (current == source.length) fail("EOF")
            if (source[current] != STRING) fail("Expected closing quotation mark")
            ++current
        }
        currentPosition = current

        fun calculateExponent(exponentAccumulator: Long, isExponentPositive: Boolean): Double = when (isExponentPositive) {
            false -> 10.0.pow(-exponentAccumulator.toDouble())
            true -> 10.0.pow(exponentAccumulator.toDouble())
        }

        if (hasExponent) {
            val doubleAccumulator  = accumulator.toDouble() * calculateExponent(exponentAccumulator, isExponentPositive)
            if (doubleAccumulator > Long.MAX_VALUE || doubleAccumulator < Long.MIN_VALUE) fail("Numeric value overflow")
            if (floor(doubleAccumulator) != doubleAccumulator) fail("Can't convert $doubleAccumulator to Long")
            accumulator = doubleAccumulator.toLong()
        }

        // Undo the negative accumulation for positive literals; -Long.MIN_VALUE is not representable
        return when {
            isNegative -> accumulator
            accumulator != Long.MIN_VALUE -> -accumulator
            else -> fail("Numeric value overflow")
        }
    }


    /** Skips whitespace and reads an unquoted boolean literal (matched case-insensitively). */
    fun consumeBoolean(): Boolean = consumeBoolean(skipWhitespaces())

    /**
     * Reads a boolean literal that may be wrapped in quotation marks.
     * Fails with "EOF" at the end of input and with "Expected closing quotation mark" when an opening
     * quotation mark has no counterpart after the literal.
     */
    fun consumeBooleanLenient(): Boolean {
        var current = skipWhitespaces()
        // The default skipWhitespaces() reports the end of input as -1, so both forms must be checked
        // before indexing into the source (same guard as in consumeStringLenient).
        if (current == -1 || current >= source.length) fail("EOF")
        val hasQuotation = if (source[current] == STRING) {
            ++current
            true
        } else {
            false
        }
        val result = consumeBoolean(current)
        if (hasQuotation) {
            if (currentPosition == source.length) fail("EOF")
            if (source[currentPosition] != STRING)
                fail("Expected closing quotation mark")
            ++currentPosition
        }
        return result
    }

    /**
     * Reads `true` or `false` starting at [start] and moves [currentPosition] past the literal.
     *
     * In ASCII representation, upper and lower case letters differ only in the bit set by [asciiCaseMask];
     * forcing that bit on before comparing makes the literal match in a case-insensitive manner.
     */
    @JsName("consumeBoolean2") // WA for JS issue
    private fun consumeBoolean(start: Int): Boolean {
        val first = prefetchOrEof(start)
        if (first == -1 || first >= source.length) fail("EOF")
        val rest = first + 1
        val loweredFirst = source[first].code or asciiCaseMask
        if (loweredFirst == 't'.code) {
            consumeBooleanLiteral("rue", rest)
            return true
        }
        if (loweredFirst == 'f'.code) {
            consumeBooleanLiteral("alse", rest)
            return false
        }
        fail("Expected valid boolean literal prefix, but had '${consumeStringLenient()}'")
    }

    /**
     * Checks that the chars starting at [current] spell [literalSuffix] (ignoring ASCII case) and moves
     * [currentPosition] past them; fails when too few chars remain or on the first mismatch.
     */
    private fun consumeBooleanLiteral(literalSuffix: String, current: Int) {
        val available = source.length - current
        if (available < literalSuffix.length) fail("Unexpected end of boolean literal")

        literalSuffix.forEachIndexed { offset, expected ->
            val loweredActual = source[current + offset].code or asciiCaseMask
            if (expected.code != loweredActual) {
                fail("Expected valid boolean literal prefix, but had '${consumeStringLenient()}'")
            }
        }

        currentPosition = current + literalSuffix.length
    }
}