All downloads are free. The search and download features use the official Maven repository.

commonMain.com.xebia.functional.tokenizer.TokenVocabulary.kt Maven / Gradle / Ivy

There is a newer version: 0.0.5-alpha.113
Show newest version
package com.xebia.functional.tokenizer

/**
 * Lookup table from a token id to the decoded text of that token.
 *
 * Instances are obtained through the companion's [invoke] operator,
 * i.e. `TokenVocabulary(encodingType)`.
 */
interface TokenVocabulary {
    /** Token id mapped to its decoded string, including the encoding's special tokens. */
    val decodedTokens: Map<Int, String>

    companion object {
        // Compiled once instead of once per vocabulary line.
        private val WHITESPACE = Regex("\\s+")

        /**
         * Builds the vocabulary for [encodingType]. The map is computed eagerly, when the
         * returned object is created.
         *
         * Every non-blank line of `encodingType.base` is split on the first run of whitespace;
         * the second field is parsed as the token id and decoded through
         * `encodingType.encoding`. Entries from `encodingType.specialTokensBase` are added
         * afterwards, so they replace a regular entry that shares the same id.
         *
         * @throws NumberFormatException if the second field of a line is not an integer.
         * @throws IndexOutOfBoundsException if a non-blank line contains no whitespace separator.
         */
        operator fun invoke(encodingType: EncodingType): TokenVocabulary =
            object : TokenVocabulary {
                override val decodedTokens: Map<Int, String> = encodingType.buildDecodedTokenVocabulary()

                private fun EncodingType.buildDecodedTokenVocabulary(): Map<Int, String> = buildMap {
                    base.lineSequence()
                        // A blank line (e.g. a trailing newline) carries no entry; skipping it
                        // avoids a destructuring failure on a single-element split result.
                        .filter { it.isNotBlank() }
                        .forEach { line ->
                            // First field is ignored here; the id alone is enough to decode the token.
                            val (_, rank) = line.split(WHITESPACE, limit = 2)
                            val tokenId: Int = rank.toInt()
                            val token: String = encoding.decode(listOf(tokenId))
                            put(tokenId, token)
                        }
                    // Special tokens are stored name -> id; invert them into id -> name.
                    specialTokensBase.forEach { put(it.value, it.key) }
                }
            }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy