// commonMain.com.fleeksoft.ksoup.parser.Tokeniser.kt Maven / Gradle / Ivy
package com.fleeksoft.ksoup.parser
import com.fleeksoft.ksoup.helper.Validate
import com.fleeksoft.ksoup.internal.StringUtil
import com.fleeksoft.ksoup.nodes.Entities
import com.fleeksoft.ksoup.ported.codePointsToString
import de.cketti.codepoints.appendCodePoint
/**
* Reads the input stream into tokens.
*/
internal class Tokeniser(private val treeBuilder: TreeBuilder) {
// Character source, shared with the owning tree builder.
private val reader: CharacterReader = treeBuilder.reader
// Parse error list obtained from the tree builder's parser.
private val errors: ParseErrorList = treeBuilder.parser.getErrors()
// Current tokeniser state; tokenising starts in Data.
private var state = TokeniserState.Data
// Token handed to emit(Token) that read() has not returned yet.
private var emitPending: Token? = null
// True from emit(Token) until read() returns that token.
private var isEmitPending = false
// First run of character data emitted since the last read(); null when nothing is buffered.
private var charsString: String? = null
// Takes over from charsString once a second text emit arrives before read().
private val charsBuilder = StringBuilder(1024)
// Scratch buffer; createTempBuffer() passes it to Token.reset.
val dataBuffer = StringBuilder(1024)
// Reusable start / end tag tokens; createTagPending() resets one and points tagPending at it.
private val startPending = Token.StartTag(treeBuilder)
private val endPending = Token.EndTag(treeBuilder)
var tagPending: Token.Tag = startPending
// Reusable character token returned by read() for buffered text.
private val charPending = Token.Character()
// Reusable doctype / comment tokens, reset by the matching create*Pending() method.
val doctypePending = Token.Doctype()
val commentPending = Token.Comment()
// Tag name of the most recently emitted start tag (set in emit(Token)); null before the first one.
private var lastStartTag: String? = null
// Cleared whenever a start tag is emitted; it is populated lazily by code outside this view.
private var lastStartCloseSeq: String? = null
// Reader position recorded on transition to TagOpen; stamped as the start position of emitted tokens.
private var markupStartPos = Unset
// Reader position recorded on transition to Data while Unset; reset to Unset by emit(Token).
private var charStartPos = Unset
private val codepointHolder = IntArray(1) // holder to not have to keep creating arrays
// Two-slot holder filled by Entities.codepointsForName in consumeCharacterReference().
private val multipointHolder = IntArray(2)
/**
 * Drives the state machine until a token has been emitted via [emit], then returns the next token.
 *
 * Buffered character data is returned first as the shared character token. In that case the
 * non-character token stays queued and is returned by a later call.
 */
fun read(): Token {
    while (!isEmitPending) {
        state.read(this, reader)
    }

    // Several text emits were merged into the builder: flush them as one character token.
    if (charsBuilder.isNotEmpty()) {
        val merged = charsBuilder.toString()
        charsBuilder.clear()
        val textToken = charPending.data(merged)
        charsString = null
        return textToken
    }

    // Exactly one text emit is buffered.
    val single = charsString
    if (single != null) {
        val textToken = charPending.data(single)
        charsString = null
        return textToken
    }

    // No text waiting: release the queued token itself.
    isEmitPending = false
    return emitPending!!
}
/**
 * Queues a non-character [token] for the next [read] and stamps its source positions.
 *
 * Only one token may be queued at a time; this is asserted with `Validate.isFalse`.
 * A start tag updates the remembered last start tag name; an end tag carrying attributes
 * is reported as a parse error.
 */
fun emit(token: Token) {
    Validate.isFalse(isEmitPending)

    emitPending = token
    isEmitPending = true
    token.startPos(markupStartPos)
    token.endPos(reader.pos())
    charStartPos = Unset

    val kind = token.type
    if (kind == Token.TokenType.StartTag) {
        lastStartTag = (token as Token.StartTag).tagName
        lastStartCloseSeq = null // only lazy inits
    } else if (kind == Token.TokenType.EndTag) {
        val closing = token as Token.EndTag
        if (closing.hasAttributes()) {
            // NOTE(review): `error` is expected to resolve to this class's own error-recording
            // member (declared outside this view), not kotlin.error which would throw — confirm.
            error("Attributes incorrectly present on end tag [/${closing.retrieveNormalName()}]")
        }
    }
}
/**
 * Buffers character data so that consecutive text emits are returned by [read] as one token.
 *
 * Does not mark a token as pending; [read] checks the text buffers itself.
 */
fun emit(str: String) {
    val buffered = charsString
    if (buffered != null) {
        // Second (or later) emit before a read: switch over to the builder.
        if (charsBuilder.isEmpty()) {
            charsBuilder.append(buffered)
        }
        charsBuilder.append(str)
    } else {
        charsString = str
    }
    charPending.startPos(charStartPos)
    charPending.endPos(reader.pos())
}
/**
 * Buffers the current contents of [strBuilder] as character data, merging with any text
 * already waiting for [read].
 */
fun emit(strBuilder: StringBuilder) {
    val buffered = charsString
    if (buffered != null) {
        // Second (or later) emit before a read: accumulate in the builder.
        if (charsBuilder.isEmpty()) {
            charsBuilder.append(buffered)
        }
        charsBuilder.append(strBuilder)
    } else {
        charsString = strBuilder.toString()
    }
    charPending.startPos(charStartPos)
    charPending.endPos(reader.pos())
}
/**
 * Buffers a single character as character data, merging with any text already waiting
 * for [read].
 */
fun emit(c: Char) {
    val buffered = charsString
    if (buffered != null) {
        // Second (or later) emit before a read: accumulate in the builder.
        if (charsBuilder.isEmpty()) {
            charsBuilder.append(buffered)
        }
        charsBuilder.append(c)
    } else {
        charsString = c.toString()
    }
    charPending.startPos(charStartPos)
    charPending.endPos(reader.pos())
}
/** Buffers the characters in [chars] as character data by delegating to the `String` overload. */
fun emit(chars: CharArray) {
    val text = chars.concatToString()
    emit(text)
}
/** Converts [codepoints] to a string and buffers it as character data via the `String` overload. */
fun emit(codepoints: IntArray) {
    val text = codepoints.codePointsToString()
    emit(text)
}
/** Returns the state the tokeniser is currently in. */
fun getState(): TokeniserState = state
/**
 * Switches the tokeniser to [newState], recording source positions on the way:
 * entering `TagOpen` marks where markup starts, and entering `Data` marks where a character
 * run starts unless a start is already recorded.
 */
fun transition(newState: TokeniserState) {
    if (newState == TokeniserState.TagOpen) {
        markupStartPos = reader.pos()
    } else if (newState == TokeniserState.Data && charStartPos == Unset) {
        charStartPos = reader.pos()
    }
    state = newState
}
/**
 * Transitions to [newState] and then advances the reader.
 *
 * The transition runs first, so any position it records is the reader position before the advance.
 */
fun advanceTransition(newState: TokeniserState) {
transition(newState)
reader.advance()
}
/**
 * Attempts to consume a character reference (numeric `#…` / `#x…` or named) at the reader's
 * current position. The reader is expected to sit on the text following the `&`.
 *
 * When nothing is decoded the reader is rewound to where it started and `null` is returned.
 * Problems are recorded through `characterReferenceError`.
 *
 * @param additionalAllowedCharacter if non-null and equal to the current character, nothing is
 *   consumed and `null` is returned.
 * @param inAttribute whether the reference sits in an attribute value. There, a named reference
 *   followed by a letter, a digit, `=`, `-` or `_` is left undecoded.
 * @return the decoded code point(s), or `null` if no reference was consumed. The array is one of
 *   this tokeniser's shared holders and is overwritten by the next call, so callers must use
 *   its contents before calling again.
 */
fun consumeCharacterReference(
    additionalAllowedCharacter: Char?,
    inAttribute: Boolean,
): IntArray? {
    if (reader.isEmpty()) return null
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null
    if (reader.matchesAnySorted(notCharRefCharsSorted)) return null

    val codeRef = codepointHolder
    reader.mark()
    if (reader.matchConsume("#")) {
        // Numeric reference.
        val isHexMode = reader.matchConsumeIgnoreCase("X")
        val numRef =
            if (isHexMode) reader.consumeHexSequence() else reader.consumeDigitSequence()
        if (numRef.isEmpty()) {
            characterReferenceError("numeric reference with no numerals")
            reader.rewindToMark()
            return null
        }
        reader.unmark()
        if (!reader.matchConsume(";")) {
            // Include the "&#" prefix so the message mirrors the named-reference one below.
            characterReferenceError("missing semicolon on [&#$numRef]")
        }
        // -1 marks a digit run that does not fit in an Int.
        var charval = numRef.toIntOrNull(if (isHexMode) 16 else 10) ?: -1
        // Surrogates (U+D800..U+DFFF) are not valid scalar values, so they are rejected
        // along with overflow and anything beyond U+10FFFF.
        if (charval == -1 || charval in 0xD800..0xDFFF || charval > 0x10FFFF) {
            characterReferenceError("character [$charval] outside of valid range")
            codeRef[0] = Tokeniser.replacementChar.code
        } else {
            // Remap values in the win1252 extension window through the table.
            if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.size) {
                characterReferenceError("character [$charval] is not a valid unicode code point")
                charval = win1252Extensions[charval - win1252ExtensionsStart]
            }
            codeRef[0] = charval
        }
        return codeRef
    } else {
        // Named reference: take as many letters (then digits) as possible and look them up.
        val nameRef = reader.consumeLetterThenDigitSequence()
        val looksLegit = reader.matches(';')
        // Found if it is a base entity (no ';' needed) or an extended entity followed by ';'.
        val found =
            (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit))
        if (!found) {
            reader.rewindToMark()
            if (looksLegit) characterReferenceError("invalid named reference [$nameRef]")
            return null
        }
        if (inAttribute) {
            // In an attribute value, a match that runs straight into an alphanumeric or one of
            // '=', '-', '_' is treated as literal text (e.g. query strings such as "?a=1&copy=2").
            val next = if (reader.isEmpty()) null else reader.current()
            val continuesName = next != null && (next.isLetter() || next in '0'..'9')
            if (continuesName || reader.matchesAny('=', '-', '_')) {
                reader.rewindToMark() // don't want that to match
                return null
            }
        }
        reader.unmark()
        if (!reader.matchConsume(";")) characterReferenceError("missing semicolon on [&$nameRef]")
        val numChars = Entities.codepointsForName(nameRef, multipointHolder)
        return when (numChars) {
            1 -> {
                codeRef[0] = multipointHolder[0]
                codeRef
            }
            2 -> multipointHolder
            else -> {
                Validate.fail("Unexpected characters returned for $nameRef")
                multipointHolder
            }
        }
    }
}
/**
 * Resets the reusable start tag (when [start] is true) or end tag token, makes it the
 * pending tag, and returns it.
 */
fun createTagPending(start: Boolean): Token.Tag {
    val fresh: Token.Tag = if (start) startPending.reset() else endPending.reset()
    tagPending = fresh
    return fresh
}
/** Finalises the pending tag and then emits it as the next token. */
fun emitTagPending() {
tagPending.finaliseTag()
emit(tagPending)
}
/** Resets the reusable comment token so a new comment can be built in it. */
fun createCommentPending() {
commentPending.reset()
}
/** Emits the pending comment token as the next token. */
fun emitCommentPending() = emit(commentPending)
/** Resets the reusable comment token and flags it as a bogus comment. */
fun createBogusCommentPending() {
    with(commentPending) {
        reset()
        bogus = true
    }
}
/** Resets the reusable doctype token so a new doctype can be built in it. */
fun createDoctypePending() {
doctypePending.reset()
}
/** Emits the pending doctype token as the next token. */
fun emitDoctypePending() = emit(doctypePending)
/**
 * Prepares [dataBuffer] for fresh use by handing it to `Token.reset`.
 * NOTE(review): presumably this empties the builder — confirm against `Token.reset`.
 */
fun createTempBuffer() {
Token.reset(dataBuffer)
}
/**
 * Tells whether the pending tag's name matches, ignoring case, the name of the last start
 * tag that was emitted. Returns false when no start tag has been emitted yet.
 */
fun isAppropriateEndTagToken(): Boolean {
    val lastName = lastStartTag ?: return false
    return tagPending.name().equals(lastName, ignoreCase = true)
}
/** Returns the name of the last emitted start tag, or null if none has been emitted. */
fun appropriateEndTagName(): String? = lastStartTag
/** Returns the closer sequence `
© 2015 - 2025 Weber Informatics LLC | Privacy Policy