com.komputation.demos.trec.NLP.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of komputation Show documentation
Show all versions of komputation Show documentation
Komputation is a neural network framework for the JVM written in the Kotlin programming language.
package com.komputation.demos.trec
import com.komputation.matrix.Matrix
import com.komputation.matrix.intMatrix
import com.komputation.matrix.oneHotArray
import java.io.File
object NLP {
/**
 * Collects the distinct tokens that occur in [documents].
 *
 * Tokens appear in the result in the order of their first occurrence.
 *
 * @param documents tokenized documents, one token list per document
 * @return the set of all distinct tokens across all documents
 */
fun generateVocabulary(documents: Iterable<List<String>>): Set<String> =
    // Collect straight into an insertion-ordered set instead of building an intermediate list.
    documents.flatMapTo(linkedSetOf<String>()) { tokens -> tokens }
/**
 * Loads the embedding vectors of the words in [vocabulary] from a text file.
 *
 * Each line of [embeddingFile] is read as a word followed by its space-separated vector components.
 * Lines whose word is not contained in [vocabulary] are skipped without being parsed.
 * If a word occurs on several lines, the last occurrence wins.
 *
 * @param vocabulary words whose embeddings should be loaded
 * @param embeddingFile text file with one `word v1 v2 ...` entry per line
 * @return map from each vocabulary word found in the file to its vector; words without an entry are absent
 * @throws NumberFormatException if a component on a selected line is not a valid float
 */
fun embedVocabulary(vocabulary: Set<String>, embeddingFile: File): Map<String, FloatArray> {
    val embeddingMap = hashMapOf<String, FloatArray>()

    embeddingFile.useLines { lines ->
        for (line in lines) {
            // Look at the word first so that lines for unwanted words are never split or parsed.
            val word = line.substringBefore(' ')
            if (word !in vocabulary) continue

            embeddingMap[word] = line
                .split(' ')
                .drop(1)
                // Trailing or repeated spaces produce empty tokens, which toFloat() would reject.
                .filter { component -> component.isNotEmpty() }
                .map { component -> component.toFloat() }
                .toFloatArray()
        }
    }

    return embeddingMap
}
/**
 * Removes every token that is not part of [vocabulary] from each document.
 *
 * The number and order of documents is unchanged; a document may become empty.
 *
 * @param documents tokenized documents, one token list per document
 * @param vocabulary tokens to keep
 * @return one filtered token list per input document, in input order
 */
fun filterTokens(documents: Iterable<List<String>>, vocabulary: Collection<String>): List<List<String>> {
    // A list-backed vocabulary would be scanned linearly for every token, so hash it once up front.
    val knownTokens = vocabulary as? Set<String> ?: vocabulary.toHashSet()

    return documents.map { document -> document.filter { token -> token in knownTokens } }
}
/**
 * Finds the documents that contain at least [minLength] tokens.
 *
 * @param documents tokenized documents, one token list per document
 * @param minLength minimum number of tokens (inclusive) a document needs to be selected
 * @return the zero-based positions of the selected documents, in ascending order
 */
fun filterDocuments(documents: Iterable<List<String>>, minLength: Int): List<Int> =
    documents.mapIndexedNotNull { index, document ->
        // Only the position is kept; callers use it to select matching rows elsewhere.
        if (document.size >= minLength) index else null
    }
/**
 * Converts each document into a matrix of vocabulary positions.
 *
 * Every token is replaced by its zero-based position in the iteration order of [vocabulary].
 * A token that is not part of [vocabulary] is encoded as -1, and a token that occurs more than once
 * in [vocabulary] is encoded with its first position.
 *
 * @param documents tokenized documents, one token list per document
 * @param vocabulary tokens whose iteration order defines the encoding
 * @return one matrix per document, built by passing the document's positions to `intMatrix`
 */
fun vectorizeDocuments(documents: Iterable<List<String>>, vocabulary: Collection<String>): Array<Matrix> {
    // Build the token -> position table once instead of scanning the vocabulary for every token.
    val positionByToken = HashMap<String, Int>()
    vocabulary.forEachIndexed { position, token ->
        // Keep the first position of a repeated token, matching what indexOf would report.
        if (token !in positionByToken) positionByToken[token] = position
    }

    return documents
        .map { tokens -> tokens.map { token -> positionByToken[token] ?: -1 }.toIntArray() }
        .map { indices -> intMatrix(*indices) as Matrix }
        .toTypedArray()
}
/**
 * Assigns a zero-based index to every category.
 *
 * Categories are numbered in their natural (lexicographic) sort order, so the mapping does not
 * depend on the iteration order of [categories].
 *
 * @param categories distinct category labels
 * @return map from each category to its index in sorted order
 */
fun indexCategories(categories: Set<String>): Map<String, Int> =
    categories
        .sorted()
        .withIndex()
        .associate { (index, category) -> category to index }
/**
 * Builds one target per entry of [categories] by looking up the category's index and
 * passing it to `oneHotArray`.
 *
 * `oneHotArray` is called with the number of indexed categories and the category's index;
 * presumably that is (length, hot position) — confirm against its definition.
 *
 * @param categories category label of each instance, in instance order
 * @param indexedCategories map from category label to index
 * @return array with one `oneHotArray` result per entry of [categories], in input order
 * @throws NullPointerException if a category has no entry in [indexedCategories]
 */
fun createTargets(categories: Iterable<String>, indexedCategories: Map<String, Int>) =
    categories
        .map { category ->
            // NullPointerException matches what a bare `!!` throws, but now names the missing category.
            val index = indexedCategories[category]
                ?: throw NullPointerException("Category '$category' has no entry in indexedCategories")
            oneHotArray(indexedCategories.size, index)
        }
        .toTypedArray()
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy