utils.CorpusReader.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of simplednn Show documentation
Show all versions of simplednn Show documentation
SimpleDNN is a machine learning lightweight open-source library written in Kotlin whose purpose is to
support the development of feed-forward and recurrent Artificial Neural Networks.
/* Copyright 2016-present The KotlinNLP Authors. All Rights Reserved.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at http://mozilla.org/MPL/2.0/.
* ------------------------------------------------------------------*/
package utils
import com.jsoniter.JsonIterator
import com.kotlinnlp.simplednn.dataset.Corpus
import com.kotlinnlp.simplednn.dataset.Example
import utils.exampleextractor.ExampleExtractor
import java.io.BufferedInputStream
import java.io.FileInputStream
import CorpusPaths
/**
 * A helper to read corpora from file (containing training, validation and test sets).
 *
 * NOTE(review): `Corpus`, `ExampleExtractor` and `ArrayList` are written without type arguments throughout this
 * class, and the imported `Example` is never referenced. Kotlin requires explicit type arguments for `ArrayList`;
 * confirm the intended type arguments.
 */
class CorpusReader {

  /**
   * Read the corpus from the given [corpusPath], extracting examples with the given [exampleExtractor] from a pure
   * JSON file if [perLine] is false, otherwise from a JSON-line file.
   *
   * The training, validation and test sets are read in this order. The elapsed time (in seconds) and the number of
   * examples of each set are printed to the standard output.
   *
   * @param corpusPath the [CorpusPaths] from which to read the datasets
   * @param exampleExtractor an [ExampleExtractor]
   * @param perLine a Boolean indicating if the file contains a JSON object per line, or a unique pure JSON
   *
   * @return the read [Corpus]
   */
  fun read(corpusPath: CorpusPaths,
           exampleExtractor: ExampleExtractor,
           perLine: Boolean): Corpus {

    println("\n-- CORPUS READING")

    val startTime = System.currentTimeMillis()

    val dataset = Corpus(
      training = this.readDataset(corpusPath.training, exampleExtractor, perLine = perLine),
      validation = this.readDataset(corpusPath.validation, exampleExtractor, perLine = perLine),
      test = this.readDataset(corpusPath.test, exampleExtractor, perLine = perLine))

    // The clock difference is in milliseconds: convert it to seconds to match the printed unit.
    val elapsedSeconds = (System.currentTimeMillis() - startTime) / 1000.0

    println("Elapsed time: %s s".format(elapsedSeconds))
    println("Train: %d examples".format(dataset.training.size))
    println("Validation: %d examples".format(dataset.validation.size))
    println("Test: %d examples".format(dataset.test.size))

    return dataset
  }

  /**
   * Read a dataset from the given file extracting examples with the given [exampleExtractor].
   *
   * @param filename the name of the dataset file
   * @param exampleExtractor an [ExampleExtractor]
   * @param perLine a Boolean indicating if the file contains a JSON object per line, or a unique pure JSON
   *
   * @return the read dataset
   */
  private fun readDataset(filename: String,
                          exampleExtractor: ExampleExtractor,
                          perLine: Boolean): ArrayList {

    return if (perLine)
      this.readDatasetPerLine(filename = filename, exampleExtractor = exampleExtractor)
    else
      this.readDatasetFromWholeFile(filename = filename, exampleExtractor = exampleExtractor)
  }

  /**
   * Read a dataset per line from the given file extracting examples with the given [exampleExtractor].
   *
   * Each line of the file is parsed as a separate JSON document and passed to the [exampleExtractor].
   *
   * @param filename the name of the dataset file
   * @param exampleExtractor an [ExampleExtractor]
   *
   * @return the read dataset
   */
  private fun readDatasetPerLine(filename: String,
                                 exampleExtractor: ExampleExtractor): ArrayList {

    val examples = ArrayList()

    // 'forEachLine' closes the reader (and with it the underlying file stream) when the iteration ends.
    FileInputStream(filename).reader().forEachLine { line ->
      examples.add(exampleExtractor.extract(JsonIterator.parse(line)))
    }

    return examples
  }

  /**
   * Read a dataset from the given JSON file extracting examples with the given [exampleExtractor].
   *
   * The file is consumed as a single JSON array: the [exampleExtractor] is called once per array element, with the
   * iterator positioned on that element.
   *
   * @param filename the name of the dataset file
   * @param exampleExtractor an [ExampleExtractor]
   *
   * @return the read dataset
   */
  private fun readDatasetFromWholeFile(filename: String,
                                       exampleExtractor: ExampleExtractor): ArrayList {

    val examples = ArrayList()

    // 'use' guarantees that the file stream is closed when parsing ends, either normally or with an exception.
    BufferedInputStream(FileInputStream(filename)).use { stream ->

      // The second argument is presumably the size of the parser buffer - confirm against the jsoniter API.
      val iterator = JsonIterator.parse(stream, 2048)

      while (iterator.readArray()) {
        examples.add(exampleExtractor.extract(iterator))
      }
    }

    return examples
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy