All downloads are free. The search and download features use the official Maven repository.

com.komputation.demos.trec.NLP.kt Maven / Gradle / Ivy

Go to download

Komputation is a neural network framework for the JVM written in the Kotlin programming language.

There is a newer version: 0.12.5
Show newest version
package com.komputation.demos.trec

import com.komputation.matrix.Matrix
import com.komputation.matrix.intMatrix
import com.komputation.matrix.oneHotArray
import java.io.File

/**
 * Text-preprocessing helpers for the TREC demos: vocabulary construction,
 * loading of pre-trained embeddings, token/document filtering, and conversion
 * of tokens and category labels into numeric form.
 */
object NLP {

    /**
     * Collects the distinct tokens that occur in [documents].
     *
     * @param documents tokenized documents, one token list per document.
     * @return the set of all tokens, iterated in order of first occurrence.
     */
    fun generateVocabulary(documents: Iterable<List<String>>): Set<String> =
        documents.flatten().toSet()

    /**
     * Reads word embeddings for the words of [vocabulary] from [embeddingFile].
     *
     * Each line of the file is split on single spaces: the first field is the
     * word, every remaining field is parsed as a `Float` component of its
     * embedding. Lines whose word is not in [vocabulary] are skipped without
     * being parsed. If a word occurs on several lines, the last line wins.
     *
     * @param vocabulary the words to look up.
     * @param embeddingFile text file with one `word v1 v2 ... vn` entry per line.
     * @return a map from each vocabulary word found in the file to its embedding.
     * @throws NumberFormatException if a component of a selected line is not a valid float.
     */
    fun embedVocabulary(vocabulary: Set<String>, embeddingFile: File): Map<String, FloatArray> {

        val embeddingMap = hashMapOf<String, FloatArray>()

        embeddingFile.useLines { lines ->

            for (line in lines) {

                // Look at the leading word first so that the (much longer) vector
                // part is only split and parsed for words we actually need.
                val word = line.substringBefore(' ')

                if (word in vocabulary) {

                    embeddingMap[word] = line
                        .split(" ")
                        .drop(1)
                        .map { it.toFloat() }
                        .toFloatArray()

                }

            }

        }

        return embeddingMap

    }

    /**
     * Removes from every document the tokens that are not part of [vocabulary].
     *
     * @param documents tokenized documents.
     * @param vocabulary the tokens to keep.
     * @return one filtered token list per input document, in the same order.
     */
    fun filterTokens(documents: Iterable<List<String>>, vocabulary: Collection<String>): List<List<String>> {

        // Membership tests on an arbitrary collection may be linear; use a set.
        val knownTokens = vocabulary as? Set<String> ?: vocabulary.toHashSet()

        return documents.map { document -> document.filter { token -> token in knownTokens } }

    }

    /**
     * Finds the documents that contain at least [minLength] tokens.
     *
     * @param documents tokenized documents.
     * @param minLength minimum number of tokens a document must have.
     * @return the zero-based positions (within [documents]) of the documents that qualify.
     */
    fun filterDocuments(documents: Iterable<List<String>>, minLength : Int): List<Int> =
        documents.mapIndexedNotNull { index, document ->
            index.takeIf { document.size >= minLength }
        }

    /**
     * Replaces every token by its position in [vocabulary] and wraps the
     * resulting indices of each document in an integer matrix.
     *
     * A token that is not contained in [vocabulary] is mapped to `-1`; if the
     * vocabulary contains a token more than once, its first position is used.
     *
     * @param documents tokenized documents.
     * @param vocabulary the tokens, whose iteration order defines the indices.
     * @return one matrix of token indices per document, in input order.
     */
    fun vectorizeDocuments(documents: Iterable<List<String>>, vocabulary: Collection<String>): Array<Matrix> {

        // Build the token -> index lookup once instead of scanning the
        // vocabulary for every single token.
        val indexByToken = hashMapOf<String, Int>()

        for ((index, token) in vocabulary.withIndex()) {

            if (token !in indexByToken) {
                indexByToken[token] = index
            }

        }

        return documents
            .map { tokens -> tokens.map { token -> indexByToken[token] ?: -1 }.toIntArray() }
            .map { indices -> intMatrix(*indices) as Matrix }
            .toTypedArray()

    }

    /**
     * Assigns consecutive indices, starting at zero, to the categories in
     * their natural (sorted) order.
     *
     * @param categories the distinct category labels.
     * @return a map from category label to its index.
     */
    fun indexCategories(categories: Set<String>): Map<String, Int> =
        categories
            .sorted()
            .mapIndexed { index, category -> category to index }
            .toMap()

    /**
     * Builds a one-hot target for every entry of [categories].
     *
     * The one-hot size is the number of entries in [indexedCategories] and the
     * hot position is the index that [indexedCategories] assigns to the category.
     *
     * @param categories the category label of each example.
     * @param indexedCategories mapping from category label to index.
     * @return one target per entry of [categories], in input order.
     * @throws IllegalArgumentException if a category has no entry in [indexedCategories].
     */
    fun createTargets(categories: Iterable<String>, indexedCategories: Map<String, Int>) =
        categories
            .map { category ->
                val index = requireNotNull(indexedCategories[category]) {
                    "Category '$category' is not contained in the indexed categories."
                }
                oneHotArray(indexedCategories.size, index)
            }
            .toTypedArray()

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy