commonMain.com.xebia.functional.tokenizer.TokenVocabulary.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of xef-tokenizer-jvm Show documentation
Show all versions of xef-tokenizer-jvm Show documentation
Building applications with LLMs through composability in Kotlin
The newest version!
package com.xebia.functional.tokenizer
/**
 * Lookup table from a token id to the decoded text of that token.
 *
 * Instances are created through the companion [invoke] operator, e.g.
 * `TokenVocabulary(encodingType)`.
 */
interface TokenVocabulary {
    /** Decoded token text, keyed by token id. */
    val decodedTokens: Map<Int, String>

    companion object {
        /**
         * Builds the [TokenVocabulary] for [encodingType].
         *
         * The map is computed eagerly, while the returned object is being constructed.
         *
         * @param encodingType encoding whose `base` ranks, `encoding` decoder and
         *   `specialTokensBase` entries are used to populate the vocabulary.
         * @return a vocabulary holding every regular token id found in `base` plus the special tokens.
         * @throws NumberFormatException if the second column of a non-blank line is not an integer.
         * @throws IndexOutOfBoundsException if a non-blank line has no second column.
         */
        operator fun invoke(encodingType: EncodingType): TokenVocabulary =
            object : TokenVocabulary {
                override val decodedTokens: Map<Int, String> = encodingType.buildDecodedTokenVocabulary()

                /**
                 * Reads `base` line by line, takes the second whitespace-separated column as the
                 * token id, and stores the text obtained by decoding that single id.
                 * Special tokens are added last, so they win if an id appears in both sources.
                 */
                private fun EncodingType.buildDecodedTokenVocabulary(): Map<Int, String> = buildMap {
                    // Compiled once instead of once per vocabulary line.
                    val columnSeparator = Regex("\\s+")
                    base.lineSequence()
                        // A trailing newline or empty line carries no entry; skip it instead of
                        // failing on the destructuring below.
                        .filter { line -> line.isNotBlank() }
                        .forEach { line ->
                            // First column is ignored (presumably the encoded token bytes — confirm
                            // against the rank file format); only the rank/id column is needed.
                            val (_, rank) = line.split(columnSeparator, limit = 2)
                            val tokenId: Int = rank.toInt()
                            val token: String = encoding.decode(listOf(tokenId))
                            put(tokenId, token)
                        }
                    // specialTokensBase entries are inserted inverted: entry value becomes the key
                    // (token id) and entry key becomes the stored text.
                    specialTokensBase.forEach { put(it.value, it.key) }
                }
            }
    }
}