/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.training

import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence
import com.johnsnowlabs.nlp.annotators.common._
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DocumentAssembler}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Dataset, SparkSession}
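
/** Intermediate representation of one parsed CoNLL-U document: the raw text together with its
  * universal POS, language-specific POS, and lemma annotations, each stored as tagged sentences.
  */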
case class CoNLLUDocument(
    text: String,
    uPosTagged: Seq[PosTaggedSentence],
    xPosTagged: Seq[PosTaggedSentence],
    lemma: Seq[PosTaggedSentence])

/** Instantiates the class to read a CoNLL-U dataset.
  *
  * The dataset should be in the format of
  * [[https://universaldependencies.org/format.html CoNLL-U]] and needs to be specified with
  * `readDataset`, which will create a dataframe with the data.
  *
  * ==Example==
  * {{{
  * import com.johnsnowlabs.nlp.training.CoNLLU
  * import com.johnsnowlabs.nlp.util.io.ResourceHelper
  *
  * val conlluFile = "src/test/resources/conllu/en.test.conllu"
  * val conllDataSet = CoNLLU(explodeSentences = false).readDataset(ResourceHelper.spark, conlluFile)
  * conllDataSet
  *   .selectExpr("text", "form.result as form", "upos.result as upos", "xpos.result as xpos", "lemma.result as lemma")
  *   .show(1, false)
  * +--------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
  * |text                                  |form                                          |upos                                         |xpos                          |lemma                                       |
  * +--------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
  * |What if Google Morphed Into GoogleOS? |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
  * +--------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
  * }}}
  *
  * @param conllTextCol
  *   Name of the column holding the raw document text
  * @param documentCol
  *   Name of the output column with document annotations
  * @param sentenceCol
  *   Name of the output column with sentence annotations
  * @param formCol
  *   Name of the output column with token (word form) annotations
  * @param uposCol
  *   Name of the output column with universal part-of-speech tags
  * @param xposCol
  *   Name of the output column with language-specific part-of-speech tags
  * @param lemmaCol
  *   Name of the output column with lemma annotations
  * @param explodeSentences
  *   Whether to split each sentence into a separate row
  */
case class CoNLLU(
    conllTextCol: String = "text",
    documentCol: String = "document",
    sentenceCol: String = "sentence",
    formCol: String = CoNLLUCols.FORM.toString.toLowerCase,
    uposCol: String = CoNLLUCols.UPOS.toString.toLowerCase,
    xposCol: String = CoNLLUCols.XPOS.toString.toLowerCase,
    lemmaCol: String = CoNLLUCols.LEMMA.toString.toLowerCase,
    explodeSentences: Boolean = true) {

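  /** Spark SQL type shared by every annotation column: an array of Annotation structs. */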
  private val annotationType = ArrayType(Annotation.dataType)

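  /** Packs already-loaded CoNLL-U lines into a Dataset with the annotation schema, without going
    * through an external resource.
    */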
  def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_] = {
    val docs = CoNLLHelper.readLines(lines, explodeSentences)
    packDocs(docs, spark)
  }

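  /** Reads a CoNLL-U dataset from `path` and packs it into a DataFrame with one column per
    * annotation type, as described by `schema`.
    *
    * @param spark
    *   Active SparkSession
    * @param path
    *   Path to the CoNLL-U file
    * @param readAs
    *   How the resource should be read, `ReadAs.TEXT` by default
    */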
  def readDataset(
      spark: SparkSession,
      path: String,
      readAs: String = ReadAs.TEXT.toString): Dataset[_] = {
    val er = ExternalResource(path, readAs, Map("format" -> "text"))
    val docs = readDocs(er)
    packDocs(docs, spark)
  }

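  /** Converts parsed CoNLL-U documents into a DataFrame of annotation columns matching `schema`.
    */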
  def packDocs(docs: Seq[CoNLLUDocument], spark: SparkSession): Dataset[_] = {
    import spark.implicits._

    val rows = docs
      .map { doc =>
        val text = doc.text
        val document = packAssembly(text)
        val sentences = packSentence(text, doc.uPosTagged)
        val tokenized = packTokenized(doc.uPosTagged)
        val uPosTagged = packPosTagged(doc.uPosTagged)
        val xPosTagged = packPosTagged(doc.xPosTagged)
        val lemma = packTokenized(doc.lemma)

        (text, document, sentences, tokenized, uPosTagged, xPosTagged, lemma)
      }
      .toDF
      .rdd

    spark.createDataFrame(rows, schema)
  }

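  /** Wraps the raw text into document annotations with a DocumentAssembler, storing the
    * `training` flag in the annotation metadata.
    */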
  def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation] = {
    new DocumentAssembler()
      .assemble(text, Map("training" -> isTraining.toString))
  }

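  /** Rebuilds sentence annotations from the character offsets of each sentence's tagged words. */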
  def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation] = {
    val indexedSentences = sentences.zipWithIndex.map { case (sentence, index) =>
      val start = sentence.indexedTaggedWords.map(t => t.begin).min
      val end = sentence.indexedTaggedWords.map(t => t.end).max
      val sentenceText = text.substring(start, end + 1)
      new Sentence(sentenceText, start, end, index)
    }
    SentenceSplit.pack(indexedSentences)
  }

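  /** Packs the tagged words of each sentence into token annotations, keeping their offsets. */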
  def packTokenized(sentences: Seq[TaggedSentence]): Seq[Annotation] = {
    val tokenizedSentences = sentences.zipWithIndex.map { case (sentence, index) =>
      val tokens = sentence.indexedTaggedWords.map(t => IndexedToken(t.word, t.begin, t.end))
      TokenizedSentence(tokens, index)
    }
    TokenizedWithSentence.pack(tokenizedSentences)
  }

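  /** Packs tagged sentences into part-of-speech annotations. */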
  def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation] = {
    PosTagged.pack(sentences)
  }

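  /** Parses the lines of an external resource into CoNLL-U documents. */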
  def readDocs(er: ExternalResource): Seq[CoNLLUDocument] = {
    val lines = ResourceHelper.parseLines(er)
    CoNLLHelper.readLines(lines, explodeSentences)
  }

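  /** Schema of the output DataFrame: the raw text column plus annotation columns for document,
    * sentence, form, upos, xpos and lemma.
    */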
  def schema: StructType = {
    val text = StructField(conllTextCol, StringType)
    val doc = getAnnotationType(documentCol, AnnotatorType.DOCUMENT)
    val sentence = getAnnotationType(sentenceCol, AnnotatorType.DOCUMENT)
    val token = getAnnotationType(formCol, AnnotatorType.TOKEN)
    val uPos = getAnnotationType(uposCol, AnnotatorType.POS)
    val xPos = getAnnotationType(xposCol, AnnotatorType.POS)
    val lemma = getAnnotationType(lemmaCol, AnnotatorType.TOKEN)

    StructType(Seq(text, doc, sentence, token, uPos, xPos, lemma))
  }

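  /** Builds the StructField for an annotation column, optionally attaching the annotator type as
    * column metadata.
    */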
  def getAnnotationType(
      column: String,
      annotatorType: String,
      addMetadata: Boolean = true): StructField = {
    if (!addMetadata)
      StructField(column, annotationType, nullable = false)
    else {
      val metadataBuilder: MetadataBuilder = new MetadataBuilder()
      metadataBuilder.putString("annotatorType", annotatorType)
      StructField(column, annotationType, nullable = false, metadataBuilder.build)
    }
  }
}