commonMain.com.fleeksoft.ksoup.parser.TokeniserState.kt
Ksoup is a Kotlin Multiplatform library for working with HTML and XML, and offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM and CSS selectors.

package com.fleeksoft.ksoup.parser

import com.fleeksoft.ksoup.nodes.DocumentType

/**
 * States and transition activations for the Tokeniser.
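 *
 * Each constant implements [read] for one state of the HTML tokenisation state machine: the active state consumes
 * input from the [CharacterReader], emits tokens through the [Tokeniser], and hands control to the next state via
 * transition / advanceTransition.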
 */
internal enum class TokeniserState {
    Data {
        // in data state, gather characters until a character reference or tag is found
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (r.current()) {
                '&' -> t.advanceTransition(CharacterReferenceInData)
                '<' -> t.advanceTransition(TagOpen)
                nullChar -> {
                    t.error(this) // NOT replacement character (oddly?)
                    t.emit(r.consume())
                }

                eof -> t.emit(Token.EOF())
                else -> {
                    val data: String = r.consumeData()
                    t.emit(data)
                }
            }
        }
    },
    CharacterReferenceInData {
        // from & in data
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            readCharRef(t, Data)
        }
    },
    Rcdata {
        // handles data in title, textarea etc
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (r.current()) {
                '&' -> t.advanceTransition(CharacterReferenceInRcdata)
                '<' -> t.advanceTransition(RcdataLessthanSign)
                nullChar -> {
                    t.error(this)
                    r.advance()
                    t.emit(replacementChar)
                }

                eof -> t.emit(Token.EOF())
                else -> {
                    val data: String = r.consumeData()
                    t.emit(data)
                }
            }
        }
    },
    CharacterReferenceInRcdata {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            readCharRef(t, Rcdata)
        }
    },
    Rawtext {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            readRawData(t, r, this, RawtextLessthanSign)
        }
    },
    ScriptData {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            readRawData(t, r, this, ScriptDataLessthanSign)
        }
    },
    PLAINTEXT {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (r.current()) {
                nullChar -> {
                    t.error(this)
                    r.advance()
                    t.emit(replacementChar)
                }

                eof -> t.emit(Token.EOF())
                else -> {
                    val data: String = r.consumeTo(nullChar)
                    t.emit(data)
                }
            }
        }
    },
    TagOpen {
        // from < in data
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (r.current()) {
                '!' -> t.advanceTransition(MarkupDeclarationOpen)
                '/' -> t.advanceTransition(EndTagOpen)
                '?' -> {
                    t.createBogusCommentPending()
                    t.transition(BogusComment)
                }

                else ->
                    if (r.matchesAsciiAlpha()) {
                        t.createTagPending(true)
                        t.transition(TagName)
                    } else {
                        t.error(this)
                        t.emit('<') // char that got us here
                        t.transition(Data)
                    }
            }
        }
    },
    EndTagOpen {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.isEmpty()) {
                t.eofError(this)
                t.emit("</")
                t.transition(Data)
            } else if (r.matchesAsciiAlpha()) {
                t.createTagPending(false)
                t.transition(TagName)
            } else if (r.matches('>')) {
                t.error(this)
                t.advanceTransition(Data)
            } else {
                t.error(this)
                t.createBogusCommentPending()
                t.commentPending.append('/') // push the / back on that got us here
                t.transition(BogusComment)
            }
        }
    },
    TagName {
        // from < or </ in data, will have start or end tag pending
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            // previous TagOpen state did NOT consume, will have a letter char in current
            val tagName: String = r.consumeTagName()
            t.tagPending.appendTagName(tagName)
            when (val c: Char = r.consume()) {
                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(BeforeAttributeName)
                '/' -> t.transition(SelfClosingStartTag)
                '<' -> {
                    r.unconsume()
                    t.error(this)
                    t.emitTagPending()
                    t.transition(Data)
                }

                '>' -> {
                    t.emitTagPending()
                    t.transition(Data)
                }

                nullChar -> t.tagPending.appendTagName(replacementStr)
                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> t.tagPending.appendTagName(c)
            }
        }
    },
    RcdataLessthanSign {
        // from < in rcdata
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matches('/')) {
                t.createTempBuffer()
                t.advanceTransition(RCDATAEndTagOpen)
            } else if (r.readFully() && r.matchesAsciiAlpha() && t.appropriateEndTagName() != null &&
                !r.containsIgnoreCase(
                    t.appropriateEndTagSeq(),
                )
            ) {
                // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
                // consuming to EOF; break out here
                t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName() ?: "")
                t.emitTagPending()
                t.transition(TagOpen) // straight into TagOpen, as we came from < and looks like we're on a start tag
            } else {
                t.emit("<")
                t.transition(Rcdata)
            }
        }
    },
    RCDATAEndTagOpen {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matchesAsciiAlpha()) {
                t.createTagPending(false)
                t.tagPending.appendTagName(r.current())
                t.dataBuffer.append(r.current())
                t.advanceTransition(RCDATAEndTagName)
            } else {
                t.emit("</")
                t.transition(Rcdata)
            }
        }
    },
    RCDATAEndTagName {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matchesLetter()) {
                val name: String = r.consumeLetterSequence()
                t.tagPending.appendTagName(name)
                t.dataBuffer.append(name)
                return
            }
            when (r.consume()) {
                '\t', '\n', '\r', '\u000c', ' ' ->
                    if (t.isAppropriateEndTagToken()) {
                        t.transition(
                            BeforeAttributeName,
                        )
                    } else {
                        anythingElse(t, r)
                    }

                '/' ->
                    if (t.isAppropriateEndTagToken()) {
                        t.transition(SelfClosingStartTag)
                    } else {
                        anythingElse(
                            t,
                            r,
                        )
                    }

                '>' ->
                    if (t.isAppropriateEndTagToken()) {
                        t.emitTagPending()
                        t.transition(Data)
                    } else {
                        anythingElse(t, r)
                    }

                else -> anythingElse(t, r)
            }
        }

        private fun anythingElse(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            t.emit(" {
                    t.createTempBuffer()
                    t.transition(ScriptDataEndTagOpen)
                }

                '!' -> {
                    t.emit(" {
                    t.emit("<")
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> {
                    t.emit("<")
                    r.unconsume()
                    t.transition(ScriptData)
                }
            }
        }
    },
    ScriptDataEndTagOpen {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            readEndTag(t, r, ScriptDataEndTagName, ScriptData)
        }
    },
    ScriptDataEndTagName {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            handleDataEndTag(t, r, ScriptData)
        }
    },
    ScriptDataEscapeStart {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matches('-')) {
                t.emit('-')
                t.advanceTransition(ScriptDataEscapeStartDash)
            } else {
                t.transition(ScriptData)
            }
        }
    },
    ScriptDataEscapeStartDash {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matches('-')) {
                t.emit('-')
                t.advanceTransition(ScriptDataEscapedDashDash)
            } else {
                t.transition(ScriptData)
            }
        }
    },
    ScriptDataEscaped {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.isEmpty()) {
                t.eofError(this)
                t.transition(Data)
                return
            }
            when (r.current()) {
                '-' -> {
                    t.emit('-')
                    t.advanceTransition(ScriptDataEscapedDash)
                }

                '<' -> t.advanceTransition(ScriptDataEscapedLessthanSign)
                nullChar -> {
                    t.error(this)
                    r.advance()
                    t.emit(replacementChar)
                }

                else -> {
                    val data: String = r.consumeToAny('-', '<', nullChar)
                    t.emit(data)
                }
            }
        }
    },
    ScriptDataEscapedDash {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.isEmpty()) {
                t.eofError(this)
                t.transition(Data)
                return
            }
            val c: Char = r.consume()
            when (c) {
                '-' -> {
                    t.emit(c)
                    t.transition(ScriptDataEscapedDashDash)
                }

                '<' -> t.transition(ScriptDataEscapedLessthanSign)
                nullChar -> {
                    t.error(this)
                    t.emit(replacementChar)
                    t.transition(ScriptDataEscaped)
                }

                else -> {
                    t.emit(c)
                    t.transition(ScriptDataEscaped)
                }
            }
        }
    },
    ScriptDataEscapedDashDash {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.isEmpty()) {
                t.eofError(this)
                t.transition(Data)
                return
            }
            val c: Char = r.consume()
            when (c) {
                '-' -> t.emit(c)
                '<' -> t.transition(ScriptDataEscapedLessthanSign)
                '>' -> {
                    t.emit(c)
                    t.transition(ScriptData)
                }

                nullChar -> {
                    t.error(this)
                    t.emit(replacementChar)
                    t.transition(ScriptDataEscaped)
                }

                else -> {
                    t.emit(c)
                    t.transition(ScriptDataEscaped)
                }
            }
        }
    },
    ScriptDataEscapedLessthanSign {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matchesAsciiAlpha()) {
                t.createTempBuffer()
                t.dataBuffer.append(r.current())
                t.emit("<")
                t.emit(r.current())
                t.advanceTransition(ScriptDataDoubleEscapeStart)
            } else if (r.matches('/')) {
                t.createTempBuffer()
                t.advanceTransition(ScriptDataEscapedEndTagOpen)
            } else {
                t.emit('<')
                t.transition(ScriptDataEscaped)
            }
        }
    },
    ScriptDataEscapedEndTagOpen {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matchesAsciiAlpha()) {
                t.createTagPending(false)
                t.tagPending.appendTagName(r.current())
                t.dataBuffer.append(r.current())
                t.advanceTransition(ScriptDataEscapedEndTagName)
            } else {
                t.emit(" {
                    t.emit(c)
                    t.advanceTransition(ScriptDataDoubleEscapedDash)
                }

                '<' -> {
                    t.emit(c)
                    t.advanceTransition(ScriptDataDoubleEscapedLessthanSign)
                }

                nullChar -> {
                    t.error(this)
                    r.advance()
                    t.emit(replacementChar)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> {
                    val data: String = r.consumeToAny('-', '<', nullChar)
                    t.emit(data)
                }
            }
        }
    },
    ScriptDataDoubleEscapedDash {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '-' -> {
                    t.emit(c)
                    t.transition(ScriptDataDoubleEscapedDashDash)
                }

                '<' -> {
                    t.emit(c)
                    t.transition(ScriptDataDoubleEscapedLessthanSign)
                }

                nullChar -> {
                    t.error(this)
                    t.emit(replacementChar)
                    t.transition(ScriptDataDoubleEscaped)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> {
                    t.emit(c)
                    t.transition(ScriptDataDoubleEscaped)
                }
            }
        }
    },
    ScriptDataDoubleEscapedDashDash {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '-' -> t.emit(c)
                '<' -> {
                    t.emit(c)
                    t.transition(ScriptDataDoubleEscapedLessthanSign)
                }

                '>' -> {
                    t.emit(c)
                    t.transition(ScriptData)
                }

                nullChar -> {
                    t.error(this)
                    t.emit(replacementChar)
                    t.transition(ScriptDataDoubleEscaped)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> {
                    t.emit(c)
                    t.transition(ScriptDataDoubleEscaped)
                }
            }
        }
    },
    ScriptDataDoubleEscapedLessthanSign {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matches('/')) {
                t.emit('/')
                t.createTempBuffer()
                t.advanceTransition(ScriptDataDoubleEscapeEnd)
            } else {
                t.transition(ScriptDataDoubleEscaped)
            }
        }
    },
    ScriptDataDoubleEscapeEnd {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            handleDataDoubleEscapeTag(t, r, ScriptDataEscaped, ScriptDataDoubleEscaped)
        }
    },
    BeforeAttributeName {
        // from tagname <xxx
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                '/' -> t.transition(SelfClosingStartTag)
                '<' -> {
                    r.unconsume()
                    t.error(this)
                    t.emitTagPending()
                    t.transition(Data)
                }

                '>' -> {
                    t.emitTagPending()
                    t.transition(Data)
                }

                nullChar -> {
                    r.unconsume()
                    t.error(this)
                    t.tagPending.newAttribute()
                    t.transition(AttributeName)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                '"', '\'', '=' -> {
                    t.error(this)
                    t.tagPending.newAttribute()
                    t.tagPending.appendAttributeName(c, r.pos() - 1, r.pos())
                    t.transition(AttributeName)
                }

                else -> {
                    t.tagPending.newAttribute()
                    r.unconsume()
                    t.transition(AttributeName)
                }
            }
        }
    },
    AttributeName {
        // from before attribute name
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            var pos = r.pos()
            // spec deviate - consume and emit nulls in one hit vs stepping
            val name: String = r.consumeToAnySorted(*attributeNameCharsSorted)
            t.tagPending.appendAttributeName(name, pos, r.pos())

            pos = r.pos()
            when (val c: Char = r.consume()) {
                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(AfterAttributeName)
                '/' -> t.transition(SelfClosingStartTag)
                '=' -> t.transition(BeforeAttributeValue)
                '>' -> {
                    t.emitTagPending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                '"', '\'', '<' -> {
                    t.error(this)
                    t.tagPending.appendAttributeName(c, pos, r.pos())
                }

                else -> t.tagPending.appendAttributeName(c, pos, r.pos())
            }
        }
    },
    AfterAttributeName {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                '/' -> t.transition(SelfClosingStartTag)
                '=' -> t.transition(BeforeAttributeValue)
                '>' -> {
                    t.emitTagPending()
                    t.transition(Data)
                }

                nullChar -> {
                    t.error(this)
                    t.tagPending.appendAttributeName(TokeniserState.replacementChar, r.pos() - 1, r.pos())
                    t.transition(AttributeName)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                '"', '\'', '<' -> {
                    t.error(this)
                    t.tagPending.newAttribute()
                    t.tagPending.appendAttributeName(c, r.pos() - 1, r.pos())
                    t.transition(AttributeName)
                }

                else -> {
                    t.tagPending.newAttribute()
                    r.unconsume()
                    t.transition(AttributeName)
                }
            }
        }
    },
    BeforeAttributeValue {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                '"' -> t.transition(AttributeValue_doubleQuoted)
                '&' -> {
                    r.unconsume()
                    t.transition(AttributeValue_unquoted)
                }

                '\'' -> t.transition(AttributeValue_singleQuoted)
                nullChar -> {
                    t.error(this)
                    t.tagPending.appendAttributeValue(TokeniserState.replacementChar, r.pos() - 1, r.pos())
                    t.transition(AttributeValue_unquoted)
                }

                eof -> {
                    t.eofError(this)
                    t.emitTagPending()
                    t.transition(Data)
                }

                '>' -> {
                    t.error(this)
                    t.emitTagPending()
                    t.transition(Data)
                }

                '<', '=', '`' -> {
                    t.error(this)
                    t.tagPending.appendAttributeValue(c, r.pos() - 1, r.pos())
                    t.transition(AttributeValue_unquoted)
                }

                else -> {
                    r.unconsume()
                    t.transition(AttributeValue_unquoted)
                }
            }
        }
    },
    AttributeValue_doubleQuoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            var pos = r.pos()
            val value: String = r.consumeAttributeQuoted(false)
            if (value.isNotEmpty()) {
                t.tagPending.appendAttributeValue(value, pos, r.pos())
            } else {
                t.tagPending.setEmptyAttributeValue()
            }
            pos = r.pos()
            when (val c: Char = r.consume()) {
                '"' -> t.transition(AfterAttributeValue_quoted)
                '&' -> {
                    val ref: IntArray? = t.consumeCharacterReference('"', true)
                    if (ref != null) {
                        t.tagPending.appendAttributeValue(ref, pos, r.pos())
                    } else {
                        t.tagPending.appendAttributeValue('&', pos, r.pos())
                    }
                }

                nullChar -> {
                    t.error(this)
                    t.tagPending.appendAttributeValue(TokeniserState.replacementChar, pos, r.pos())
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> t.tagPending.appendAttributeValue(c, pos, r.pos())
            }
        }
    },
    AttributeValue_singleQuoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            var pos = r.pos()
            val value: String = r.consumeAttributeQuoted(true)
            if (value.isNotEmpty()) {
                t.tagPending.appendAttributeValue(
                    value,
                    pos,
                    r.pos(),
                )
            } else {
                t.tagPending.setEmptyAttributeValue()
            }
            pos = r.pos()
            when (val c: Char = r.consume()) {
                '\'' -> t.transition(AfterAttributeValue_quoted)
                '&' -> {
                    val ref: IntArray? = t.consumeCharacterReference('\'', true)
                    if (ref != null) {
                        t.tagPending.appendAttributeValue(ref, pos, r.pos())
                    } else {
                        t.tagPending.appendAttributeValue('&', pos, r.pos())
                    }
                }

                nullChar -> {
                    t.error(this)
                    t.tagPending.appendAttributeValue(TokeniserState.replacementChar, pos, r.pos())
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> t.tagPending.appendAttributeValue(c, pos, r.pos())
            }
        }
    },
    AttributeValue_unquoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            var pos = r.pos()
            val value: String = r.consumeToAnySorted(*attributeValueUnquoted)
            if (value.isNotEmpty()) t.tagPending.appendAttributeValue(value, pos, r.pos())
            pos = r.pos()
            when (val c: Char = r.consume()) {
                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(BeforeAttributeName)
                '&' -> {
                    val ref: IntArray? = t.consumeCharacterReference('>', true)
                    if (ref != null) {
                        t.tagPending.appendAttributeValue(ref, pos, r.pos())
                    } else {
                        t.tagPending.appendAttributeValue('&', pos, r.pos())
                    }
                }

                '>' -> {
                    t.emitTagPending()
                    t.transition(Data)
                }

                nullChar -> {
                    t.error(this)
                    t.tagPending.appendAttributeValue(TokeniserState.replacementChar, pos, r.pos())
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                '"', '\'', '<', '=', '`' -> {
                    t.error(this)
                    t.tagPending.appendAttributeValue(c, pos, r.pos())
                }

                else -> t.tagPending.appendAttributeValue(c, pos, r.pos())
            }
        }
    }, // CharacterReferenceInAttributeValue state handled inline
    AfterAttributeValue_quoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(BeforeAttributeName)
                '/' -> t.transition(SelfClosingStartTag)
                '>' -> {
                    t.emitTagPending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> {
                    r.unconsume()
                    t.error(this)
                    t.transition(BeforeAttributeName)
                }
            }
        }
    },
    SelfClosingStartTag {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '>' -> {
                    t.tagPending.isSelfClosing = true
                    t.emitTagPending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.transition(Data)
                }

                else -> {
                    r.unconsume()
                    t.error(this)
                    t.transition(BeforeAttributeName)
                }
            }
        }
    },
    BogusComment {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            // todo: handle bogus comment starting from eof. when does that trigger?
            t.commentPending.append(r.consumeTo('>'))
            // todo: replace nullChar with replaceChar
            val next: Char = r.current()
            if (next == '>' || next == eof) {
                r.consume()
                t.emitCommentPending()
                t.transition(Data)
            }
        }
    },
    MarkupDeclarationOpen {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matchConsume("--")) {
                t.createCommentPending()
                t.transition(CommentStart)
            } else if (r.matchConsumeIgnoreCase("DOCTYPE")) {
                t.transition(Doctype)
            } else if (r.matchConsume("[CDATA[")) {
                // todo: should actually check current namespace, and only non-html allows cdata. until namespace
                // is implemented properly, keep handling as cdata
                // } else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
                t.createTempBuffer()
                t.transition(CdataSection)
            } else {
                t.error(this)
                t.createBogusCommentPending()
                t.transition(BogusComment)
            }
        }
    },
    CommentStart {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '-' -> t.transition(CommentStartDash)
                nullChar -> {
                    t.error(this)
                    t.commentPending.append(replacementChar)
                    t.transition(Comment)
                }

                '>' -> {
                    t.error(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                else -> {
                    r.unconsume()
                    t.transition(Comment)
                }
            }
        }
    },
    CommentStartDash {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (val c: Char = r.consume()) {
                '-' -> t.transition(CommentEnd)
                nullChar -> {
                    t.error(this)
                    t.commentPending.append(replacementChar)
                    t.transition(Comment)
                }

                '>' -> {
                    t.error(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                else -> {
                    t.commentPending.append(c)
                    t.transition(Comment)
                }
            }
        }
    },
    Comment {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.current()
            when (c) {
                '-' -> t.advanceTransition(CommentEndDash)
                nullChar -> {
                    t.error(this)
                    r.advance()
                    t.commentPending.append(replacementChar)
                }

                eof -> {
                    t.eofError(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                else -> t.commentPending.append(r.consumeToAny('-', nullChar))
            }
        }
    },
    CommentEndDash {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '-' -> t.transition(CommentEnd)
                nullChar -> {
                    t.error(this)
                    t.commentPending.append('-').append(replacementChar)
                    t.transition(Comment)
                }

                eof -> {
                    t.eofError(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                else -> {
                    t.commentPending.append('-').append(c)
                    t.transition(Comment)
                }
            }
        }
    },
    CommentEnd {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (val c: Char = r.consume()) {
                '>' -> {
                    t.emitCommentPending()
                    t.transition(Data)
                }

                nullChar -> {
                    t.error(this)
                    t.commentPending.append("--").append(replacementChar)
                    t.transition(Comment)
                }

                '!' -> t.transition(CommentEndBang)
                '-' -> t.commentPending.append('-')
                eof -> {
                    t.eofError(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                else -> {
                    t.commentPending.append("--").append(c)
                    t.transition(Comment)
                }
            }
        }
    },
    CommentEndBang {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (val c: Char = r.consume()) {
                '-' -> {
                    t.commentPending.append("--!")
                    t.transition(CommentEndDash)
                }

                '>' -> {
                    t.emitCommentPending()
                    t.transition(Data)
                }

                nullChar -> {
                    t.error(this)
                    t.commentPending.append("--!").append(replacementChar)
                    t.transition(Comment)
                }

                eof -> {
                    t.eofError(this)
                    t.emitCommentPending()
                    t.transition(Data)
                }

                else -> {
                    t.commentPending.append("--!").append(c)
                    t.transition(Comment)
                }
            }
        }
    },
    Doctype {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(BeforeDoctypeName)
                eof -> {
                    t.eofError(this)
                    t.error(this)
                    t.createDoctypePending()
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                '>' -> {
                    t.error(this)
                    t.createDoctypePending()
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.transition(BeforeDoctypeName)
                }
            }
        }
    },
    BeforeDoctypeName {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matchesAsciiAlpha()) {
                t.createDoctypePending()
                t.transition(DoctypeName)
                return
            }
            when (val c: Char = r.consume()) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                nullChar -> {
                    t.error(this)
                    t.createDoctypePending()
                    t.doctypePending.name.append(replacementChar)
                    t.transition(DoctypeName)
                }

                eof -> {
                    t.eofError(this)
                    t.createDoctypePending()
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.createDoctypePending()
                    t.doctypePending.name.append(c)
                    t.transition(DoctypeName)
                }
            }
        }
    },
    DoctypeName {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.matchesLetter()) {
                val name: String = r.consumeLetterSequence()
                t.doctypePending.name.append(name)
                return
            }
            when (val c: Char = r.consume()) {
                '>' -> {
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(AfterDoctypeName)
                nullChar -> {
                    t.error(this)
                    t.doctypePending.name.append(replacementChar)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> t.doctypePending.name.append(c)
            }
        }
    },
    AfterDoctypeName {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            if (r.isEmpty()) {
                t.eofError(this)
                t.doctypePending.isForceQuirks = true
                t.emitDoctypePending()
                t.transition(Data)
                return
            }
            if (r.matchesAny('\t', '\n', '\r', '\u000c', ' ')) {
                r.advance() // ignore whitespace
            } else if (r.matches('>')) {
                t.emitDoctypePending()
                t.advanceTransition(Data)
            } else if (r.matchConsumeIgnoreCase(DocumentType.PUBLIC_KEY)) {
                t.doctypePending.pubSysKey = DocumentType.PUBLIC_KEY
                t.transition(AfterDoctypePublicKeyword)
            } else if (r.matchConsumeIgnoreCase(DocumentType.SYSTEM_KEY)) {
                t.doctypePending.pubSysKey = DocumentType.SYSTEM_KEY
                t.transition(AfterDoctypeSystemKeyword)
            } else {
                t.error(this)
                t.doctypePending.isForceQuirks = true
                t.advanceTransition(BogusDoctype)
            }
        }
    },
    AfterDoctypePublicKeyword {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(BeforeDoctypePublicIdentifier)
                '"' -> {
                    t.error(this)
                    // set public id to empty string
                    t.transition(DoctypePublicIdentifier_doubleQuoted)
                }

                '\'' -> {
                    t.error(this)
                    // set public id to empty string
                    t.transition(DoctypePublicIdentifier_singleQuoted)
                }

                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.transition(BogusDoctype)
                }
            }
        }
    },
    BeforeDoctypePublicIdentifier {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                '"' -> // set public id to empty string
                    t.transition(DoctypePublicIdentifier_doubleQuoted)

                '\'' -> // set public id to empty string
                    t.transition(DoctypePublicIdentifier_singleQuoted)

                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.transition(BogusDoctype)
                }
            }
        }
    },
    DoctypePublicIdentifier_doubleQuoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (val c: Char = r.consume()) {
                '"' -> t.transition(AfterDoctypePublicIdentifier)
                nullChar -> {
                    t.error(this)
                    t.doctypePending.publicIdentifier.append(replacementChar)
                }

                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> t.doctypePending.publicIdentifier.append(c)
            }
        }
    },
    DoctypePublicIdentifier_singleQuoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\'' -> t.transition(AfterDoctypePublicIdentifier)
                nullChar -> {
                    t.error(this)
                    t.doctypePending.publicIdentifier.append(replacementChar)
                }

                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> t.doctypePending.publicIdentifier.append(c)
            }
        }
    },
    AfterDoctypePublicIdentifier {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' ->
                    t.transition(
                        BetweenDoctypePublicAndSystemIdentifiers,
                    )

                '>' -> {
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                '"' -> {
                    t.error(this)
                    // system id empty
                    t.transition(DoctypeSystemIdentifier_doubleQuoted)
                }

                '\'' -> {
                    t.error(this)
                    // system id empty
                    t.transition(DoctypeSystemIdentifier_singleQuoted)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.transition(BogusDoctype)
                }
            }
        }
    },
    BetweenDoctypePublicAndSystemIdentifiers {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                '>' -> {
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                '"' -> {
                    t.error(this)
                    // system id empty
                    t.transition(DoctypeSystemIdentifier_doubleQuoted)
                }

                '\'' -> {
                    t.error(this)
                    // system id empty
                    t.transition(DoctypeSystemIdentifier_singleQuoted)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.transition(BogusDoctype)
                }
            }
        }
    },
    AfterDoctypeSystemKeyword {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> t.transition(BeforeDoctypeSystemIdentifier)
                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                '"' -> {
                    t.error(this)
                    // system id empty
                    t.transition(DoctypeSystemIdentifier_doubleQuoted)
                }

                '\'' -> {
                    t.error(this)
                    // system id empty
                    t.transition(DoctypeSystemIdentifier_singleQuoted)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                }
            }
        }
    },
    BeforeDoctypeSystemIdentifier {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                '"' -> // set system id to empty string
                    t.transition(DoctypeSystemIdentifier_doubleQuoted)

                '\'' -> // set system id to empty string
                    t.transition(DoctypeSystemIdentifier_singleQuoted)

                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.transition(BogusDoctype)
                }
            }
        }
    },
    DoctypeSystemIdentifier_doubleQuoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '"' -> t.transition(AfterDoctypeSystemIdentifier)
                nullChar -> {
                    t.error(this)
                    t.doctypePending.systemIdentifier.append(replacementChar)
                }

                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> t.doctypePending.systemIdentifier.append(c)
            }
        }
    },
    DoctypeSystemIdentifier_singleQuoted {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            when (val c: Char = r.consume()) {
                '\'' -> t.transition(AfterDoctypeSystemIdentifier)
                nullChar -> {
                    t.error(this)
                    t.doctypePending.systemIdentifier.append(replacementChar)
                }

                '>' -> {
                    t.error(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> t.doctypePending.systemIdentifier.append(c)
            }
        }
    },
    AfterDoctypeSystemIdentifier {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ' -> {}
                '>' -> {
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.eofError(this)
                    t.doctypePending.isForceQuirks = true
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {
                    t.error(this)
                    t.transition(BogusDoctype)
                }
            }
        }
    },
    BogusDoctype {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val c: Char = r.consume()
            when (c) {
                '>' -> {
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                eof -> {
                    t.emitDoctypePending()
                    t.transition(Data)
                }

                else -> {}
            }
        }
    },
    CdataSection {
        override fun read(
            t: Tokeniser,
            r: CharacterReader,
        ) {
            val data: String = r.consumeTo("]]>")
            t.dataBuffer.append(data)
            if (r.matchConsume("]]>") || r.isEmpty()) {
                t.emit(Token.CData(t.dataBuffer.toString()))
                t.transition(Data)
            } // otherwise, buffer underrun, stay in data section
        }
    },
    ;

    abstract fun read(
        t: Tokeniser,
        r: CharacterReader,
    )

    companion object {
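        // A U+0000 in the input marks a parse error in most states; it is either emitted as-is (in Data) or
        // replaced with replacementChar before emitting.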
        const val nullChar = '\u0000'

        // char searches. must be sorted, used in inSorted. MUST update TokeniserStateTest if more arrays are added.
        val attributeNameCharsSorted =
            charArrayOf('\t', '\n', '\u000c', '\r', ' ', '"', '\'', '/', '<', '=', '>')
        val attributeValueUnquoted =
            charArrayOf(
                nullChar,
                '\t',
                '\n',
                '\u000c',
                '\r',
                ' ',
                '"',
                '&',
                '\'',
                '<',
                '=',
                '>',
                '`',
            )
        private val replacementChar: Char = Tokeniser.replacementChar
        private val replacementStr: String = Tokeniser.replacementChar.toString()
        private val eof: Char = CharacterReader.EOF

        /**
         * Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just
         * different else exit transitions.
         */
        private fun handleDataEndTag(
            t: Tokeniser,
            r: CharacterReader,
            elseTransition: TokeniserState,
        ) {
            if (r.matchesLetter()) {
                val name: String = r.consumeLetterSequence()
                t.tagPending.appendTagName(name)
                t.dataBuffer.append(name)
                return
            }
            var needsExitTransition = false
            if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
                val c: Char = r.consume()
                when (c) {
                    '\t', '\n', '\r', '\u000c', ' ' -> t.transition(BeforeAttributeName)
                    '/' -> t.transition(SelfClosingStartTag)
                    '>' -> {
                        t.emitTagPending()
                        t.transition(Data)
                    }

                    else -> {
                        t.dataBuffer.append(c)
                        needsExitTransition = true
                    }
                }
            } else {
                needsExitTransition = true
            }
            if (needsExitTransition) {
                t.emit("</")
                t.emit(t.dataBuffer)
                t.transition(elseTransition)
            }
        }

        private fun readRawData(
            t: Tokeniser,
            r: CharacterReader,
            current: TokeniserState,
            advance: TokeniserState,
        ) {
            when (r.current()) {
                '<' -> t.advanceTransition(advance)
                nullChar -> {
                    t.error(current)
                    r.advance()
                    t.emit(replacementChar)
                }

                eof -> t.emit(Token.EOF())
                else -> {
                    val data: String = r.consumeRawData()
                    t.emit(data)
                }
            }
        }

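        /**
         * Consumes a character reference (following '&' in Data or Rcdata); emits the resolved character(s), or a
         * literal '&' if no valid reference was found, then transitions to the supplied state.
         */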
        private fun readCharRef(
            t: Tokeniser,
            advance: TokeniserState,
        ) {
            val c: IntArray? = t.consumeCharacterReference(null, false)
            if (c == null) t.emit('&') else t.emit(c)
            t.transition(advance)
        }

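        /**
         * Shared by RawtextEndTagOpen and ScriptDataEndTagOpen: if the next character is an ASCII letter, creates a
         * pending end tag and moves to state a (the end-tag-name state); otherwise emits a literal "</" and falls
         * back to state b.
         */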
        private fun readEndTag(
            t: Tokeniser,
            r: CharacterReader,
            a: TokeniserState,
            b: TokeniserState,
        ) {
            if (r.matchesAsciiAlpha()) {
                t.createTagPending(false)
                t.transition(a)
            } else {
                t.emit("</")
                t.transition(b)
            }
        }

        private fun handleDataDoubleEscapeTag(
            t: Tokeniser,
            r: CharacterReader,
            primary: TokeniserState,
            fallback: TokeniserState,
        ) {
            if (r.matchesLetter()) {
                val name: String = r.consumeLetterSequence()
                t.dataBuffer.append(name)
                t.emit(name)
                return
            }
            val c: Char = r.consume()
            when (c) {
                '\t', '\n', '\r', '\u000c', ' ', '/', '>' -> {
                    if (t.dataBuffer.toString() == "script") {
                        t.transition(primary)
                    } else {
                        t.transition(fallback)
                    }
                    t.emit(c)
                }

                else -> {
                    r.unconsume()
                    t.transition(fallback)
                }
            }
        }
    }
}
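
For context, the states above are driven internally by the Tokeniser while a document is parsed; user code never touches this enum directly. Below is a minimal usage sketch of the public API described in the header, assuming the jsoup-style entry points (Ksoup.parse, Document.title, Elements.first, Element.attr) that Ksoup mirrors; treat the exact names and signatures as assumptions rather than guarantees.

import com.fleeksoft.ksoup.Ksoup

fun main() {
    // Parsing runs the tokeniser over the input, starting in the Data state above.
    val html = "<html><head><title>Demo</title></head><body><a href='/x'>link</a></body></html>"
    val doc = Ksoup.parse(html)                            // assumed entry point, mirrors Jsoup.parse
    println(doc.title())                                   // "Demo"
    println(doc.select("a[href]").first()?.attr("href"))   // "/x"
}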



