/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators.pos.perceptron
import com.johnsnowlabs.nlp.annotators.common.{IndexedTaggedWord, TaggedSentence}
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorApproach, AnnotatorType}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{IntParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions.rand
import org.apache.spark.util.LongAccumulator
import scala.collection.mutable.{ListBuffer, Map => MMap}
/** Distributed Averaged Perceptron model to tag words part-of-speech.
*
* Sets a POS tag to each word within a sentence. Its training data (`train_pos`) is a Spark
* dataset of POS format values with Annotation columns.
*
* See
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/DistributedPos.scala]]
* for further reference on how to use this API.
*
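* ==Example==
* A minimal training sketch; the corpus path, delimiter and `trainingData` dataset below are
* placeholders, not values taken from this file:
* {{{
* import com.johnsnowlabs.nlp.DocumentAssembler
* import com.johnsnowlabs.nlp.annotators.Tokenizer
* import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproachDistributed
* import org.apache.spark.ml.Pipeline
*
* val documentAssembler = new DocumentAssembler()
*   .setInputCol("text")
*   .setOutputCol("document")
*
* val tokenizer = new Tokenizer()
*   .setInputCols("document")
*   .setOutputCol("token")
*
* // The corpus is expected to contain "word<delimiter>tag" pairs, here delimited by "|"
* val posTagger = new PerceptronApproachDistributed()
*   .setInputCols("document", "token")
*   .setOutputCol("pos")
*   .setCorpus("path/to/pos-corpus", "|")
*   .setNIterations(5)
*
* val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, posTagger))
* val model = pipeline.fit(trainingData)
* }}}
*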
* @param uid
* internal uid required to generate writable annotators
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio param 1
* @groupprio anno 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class PerceptronApproachDistributed(override val uid: String)
extends AnnotatorApproach[PerceptronModel]
with PerceptronTrainingUtils {
import com.johnsnowlabs.nlp.AnnotatorType._
/** Averaged Perceptron model to tag words part-of-speech */
override val description: String = "Averaged Perceptron model to tag words part-of-speech"
/** column of Array of POS tags that match tokens
*
* @group param
*/
val posCol = new Param[String](this, "posCol", "column of Array of POS tags that match tokens")
/** POS tags delimited corpus. Needs 'delimiter' in options
*
* @group param
*/
val corpus = new ExternalResourceParam(
this,
"corpus",
"POS tags delimited corpus. Needs 'delimiter' in options")
/** Number of iterations in training, converges to better accuracy
*
* @group param
*/
val nIterations = new IntParam(
this,
"nIterations",
"Number of iterations in training, converges to better accuracy")
setDefault(nIterations, 5)
/** Column containing an array of POS Tags matching every token on the line.
*
* @group setParam
*/
def setPosColumn(value: String): this.type = set(posCol, value)
/** POS tags delimited corpus. Needs 'delimiter' in options
*
* @group setParam
*/
def setCorpus(value: ExternalResource): this.type = {
require(
value.options.contains("delimiter"),
"PerceptronApproach needs 'delimiter' in options to associate words with tags")
set(corpus, value)
}
/** POS tags delimited corpus. Needs 'delimiter' in options
*
* @group setParam
*/
def setCorpus(
path: String,
delimiter: String,
readAs: ReadAs.Format = ReadAs.SPARK,
options: Map[String, String] = Map("format" -> "text")): this.type =
set(corpus, ExternalResource(path, readAs, options ++ Map("delimiter" -> delimiter)))
/** Number of iterations for training. May improve accuracy but takes longer. Default 5.
*
* @group setParam
*/
def setNIterations(value: Int): this.type = set(nIterations, value)
def this() = this(Identifiable.randomUID("POS"))
/** Output annotator types : POS
*
* @group anno
*/
override val outputAnnotatorType: AnnotatorType = POS
/** Input annotator types : TOKEN, DOCUMENT
*
* @group anno
*/
override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN, DOCUMENT)
/** Finds tags that appear very frequently for a word during training and marks those words as
* unambiguous, based on the tuning parameters. ToDo: Move such parameters to configuration
*
* @param taggedSentences
* Takes entire tagged sentences to find frequent tags
* @param frequencyThreshold
* Minimum number of times a word must appear in the corpus to be considered frequent
* @param ambiguityThreshold
* Minimum fraction of a word's occurrences that its most frequent tag must cover for the
* word to be marked as unambiguous
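*
* For example (hypothetical counts, not taken from any corpus): a word seen 1000 times,
* tagged "DT" 980 times and "NN" 20 times, passes with the defaults, since 1000 >= 20 and
* 980 / 1000 = 0.98 >= 0.97, so it is stored in the tag book as an unambiguous "DT".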
*/
def buildTagBook(
taggedSentences: Dataset[TaggedSentence],
frequencyThreshold: Int = 20,
ambiguityThreshold: Double = 0.97): Map[String, String] = {
import ResourceHelper.spark.implicits._
val tagFrequenciesByWord = taggedSentences
.flatMap(_.taggedWords)
.groupByKey(tw => tw.word.toLowerCase)
.mapGroups { case (lw, tw) => (lw, tw.toSeq.groupBy(_.tag).mapValues(_.length)) }
.filter { lwtw =>
val (_, mode) = lwtw._2.maxBy(t => t._2)
val n = lwtw._2.values.sum
n >= frequencyThreshold && (mode / n.toDouble) >= ambiguityThreshold
}
tagFrequenciesByWord
.map { case (word, tagFrequencies) =>
val (tag, _) = tagFrequencies.maxBy(_._2)
logger.debug(s"TRAINING: Ambiguity discarded on: << $word >> set to: << $tag >>")
(word, tag)
}
.collect
.toMap
}
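/** Collapses the accumulated weights into their running averages, in the usual averaged
* perceptron fashion: for every (feature, tag) pair the running total is first brought up to
* date with (updateCount - lastUpdateTimestamp) * weight and then divided by the total number
* of updates. Illustrative numbers only: a weight of 2.0 last updated at step 10, with a
* running total of 50.0 and 100 updates overall, averages to (50.0 + (100 - 10) * 2.0) / 100 = 2.3.
*/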
private[pos] def averageWeights(
tags: Broadcast[Array[String]],
taggedWordBook: Broadcast[Map[String, String]],
featuresWeight: StringMapStringDoubleAccumulator,
updateIteration: LongAccumulator,
timetotals: TupleKeyLongDoubleMapAccumulator): AveragedPerceptron = {
val fw = featuresWeight.value
val uiv = updateIteration.value
val totals = timetotals.value
featuresWeight.reset()
updateIteration.reset()
timetotals.reset()
val finalfw = fw.map { case (feature, weights) =>
(
feature,
weights.map { case (tag, weight) =>
val param = (feature, tag)
val total = totals
.get(param)
.map(_._2)
.getOrElse(0.0) + ((uiv - totals.get(param).map(_._1).getOrElse(0L)) * weight)
(tag, total / uiv.toDouble)
})
}
val apr = AveragedPerceptron(tags.value, taggedWordBook.value, finalfw)
taggedWordBook.destroy()
tags.destroy()
apr
}
/** Trains a model based on the provided corpus
*
* @return
* A trained averaged model
*/
override def train(
dataset: Dataset[_],
recursivePipeline: Option[PipelineModel]): PerceptronModel = {
val featuresWeightAcc = new StringMapStringDoubleAccumulator()
val timeTotalsAcc = new TupleKeyLongDoubleMapAccumulator()
val updateIterationAcc = new LongAccumulator()
dataset.sparkSession.sparkContext.register(featuresWeightAcc)
dataset.sparkSession.sparkContext.register(timeTotalsAcc)
dataset.sparkSession.sparkContext.register(updateIterationAcc)
/** Generates the tag book, which holds all unambiguous word-to-tag mappings */
val taggedSentences: Dataset[TaggedSentence] = if (get(posCol).isDefined) {
import ResourceHelper.spark.implicits._
val tokenColumn = dataset.schema.fields
.find(f =>
f.metadata.contains("annotatorType") && f.metadata
.getString("annotatorType") == AnnotatorType.TOKEN)
.map(_.name)
.get
dataset
.select(tokenColumn, $(posCol))
.as[(Array[Annotation], Array[String])]
.map { case (annotations, posTags) =>
lazy val strTokens = annotations.map(_.result).mkString("#")
lazy val strPosTags = posTags.mkString("#")
require(
annotations.length == posTags.length,
s"Cannot train from $posCol since there" +
s" is a row with different amount of tags and tokens:\n$strTokens\n$strPosTags")
TaggedSentence(
annotations
.zip(posTags)
.map { case (annotation, posTag) =>
IndexedTaggedWord(annotation.result, posTag, annotation.begin, annotation.end)
})
}
} else {
ResourceHelper.parseTupleSentencesDS($(corpus))
}
val nPartitions = $(corpus).options.get("repartition").map(_.toInt).getOrElse(0)
val doCache = $(corpus).options.get("cache").exists(_.toBoolean == true)
val repartitioned =
if (nPartitions > 0 && nPartitions != taggedSentences.rdd.partitions.length)
taggedSentences.repartition(nPartitions)
else
taggedSentences
val cachedSentences =
if (doCache)
repartitioned.cache
else
repartitioned
val taggedWordBook =
dataset.sparkSession.sparkContext.broadcast(buildTagBook(taggedSentences))
/** finds all distinct tags and stores them */
val classes = {
import ResourceHelper.spark.implicits._
dataset.sparkSession.sparkContext
.broadcast(taggedSentences.flatMap(_.tags).distinct.collect)
}
/** Iterates for training */
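/** Each iteration broadcasts the weights, timestamps and update count accumulated so far,
* lets every partition train locally on its shuffled share of sentences, and merges the
* per-partition deltas back on the driver through the registered accumulators.
*/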
(1 to $(nIterations)).foreach { iteration =>
{
logger.debug(s"TRAINING: Iteration n: $iteration")
val iterationTimestamps = if (iteration == 1) {
dataset.sparkSession.sparkContext.broadcast(Map.empty[(String, String), Long])
} else {
dataset.sparkSession.sparkContext.broadcast(timeTotalsAcc.value.mapValues(_._1))
}
val iterationWeights = if (iteration == 1) {
dataset.sparkSession.sparkContext.broadcast(Map.empty[String, Map[String, Double]])
} else {
dataset.sparkSession.sparkContext.broadcast(featuresWeightAcc.value)
}
val iterationUpdateCount = if (iteration == 1) {
dataset.sparkSession.sparkContext.broadcast[Long](0L)
} else {
dataset.sparkSession.sparkContext.broadcast[Long](updateIterationAcc.value)
}
val sortedSentences: Dataset[TaggedSentence] =
cachedSentences.sort(rand()).sortWithinPartitions(rand())
/** Caching the iteration datasets does not show any improvement; try sampling? */
sortedSentences.foreachPartition((partition: Iterator[TaggedSentence]) => {
val _temp1 = ListBuffer.empty[((String, String), Long)]
iterationTimestamps.value.copyToBuffer(_temp1)
val newPartitionTimeTotals = MMap.empty[(String, String), (Long, Double)]
val partitionTimestamps = _temp1.toMap
_temp1.clear()
val _temp2 = ListBuffer.empty[(String, Map[String, Double])]
iterationWeights.value.copyToBuffer(_temp2)
val newPartitionWeights = MMap.empty[String, MMap[String, Double]]
val partitionWeights = _temp2.toMap
_temp2.clear()
var partitionUpdateCount: Long = iterationUpdateCount.value
val partitionUpdateCountOriginal = partitionUpdateCount
val partitionTotals: MMap[(String, String), Double] =
MMap.empty[(String, String), Double]
val twb = taggedWordBook.value
val cls = classes.value
def update(truth: String, guess: String, features: Iterable[String]): Unit = {
def updateFeature(
tag: String,
feature: String,
weight: Double,
value: Double): Unit = {
/** update totals and timestamps */
val param = (feature, tag)
val newTimestamp = partitionUpdateCount
partitionTotals.update(
param,
partitionTotals.getOrElse(param, 0.0) + ((newTimestamp - newPartitionTimeTotals
.get(param)
.map(_._1)
.getOrElse(partitionTimestamps.getOrElse(param, 0L))) * weight))
newPartitionTimeTotals.update(param, (newTimestamp, partitionTotals(param)))
/** update weights */
val newWeights =
newPartitionWeights.getOrElse(feature, MMap()) ++ MMap(tag -> (weight + value))
newPartitionWeights.update(feature, newWeights)
}
/** If the prediction was wrong, take all features and, for each feature, look up the
* feature's current tags and their weights; reward the correct tag and penalize the
* guessed tag by adjusting their weights.
*/
if (truth != guess) {
features.foreach { feature =>
val weights = newPartitionWeights
.get(feature)
.map(pw => partitionWeights.getOrElse(feature, Map()) ++ pw)
.orElse(partitionWeights.get(feature))
.getOrElse(Map())
updateFeature(truth, feature, weights.getOrElse(truth, 0.0), 1.0)
updateFeature(guess, feature, weights.getOrElse(guess, 0.0), -1.0)
}
}
}
def predict(features: Map[String, Int]): String = {
/** Scores default to 0. For each feature with a relevant weight, look up all of its
* possible tags and multiply each tag's weight by the number of times the feature
* appears. Return the tag with the highest total score.
*/
val scoresByTag = features
.filter { case (feature, value) =>
(partitionWeights.contains(feature) || newPartitionWeights
.contains(feature)) && value != 0
}
.map { case (feature, value) =>
newPartitionWeights
.get(feature)
.map(pw => partitionWeights.getOrElse(feature, Map()) ++ pw)
.getOrElse(partitionWeights(feature))
.map { case (tag, weight) =>
(tag, value * weight)
}
}
.aggregate(Map[String, Double]())(
(tagsScores, tagScore) =>
tagScore ++ tagsScores.map { case (tag, score) =>
(tag, tagScore.getOrElse(tag, 0.0) + score)
},
(pTagScore, cTagScore) =>
pTagScore.map { case (tag, score) =>
(tag, cTagScore.getOrElse(tag, 0.0) + score)
})
/** ToDo: Watch this. Because of a missing training corpus, default values are used to
* make the tests pass. The secondary sort by tag is only there to match the original
* Python behavior.
*/
cls.maxBy { tag =>
(scoresByTag.getOrElse(tag, 0.0), tag)
}
}
/** For each sentence in the shuffled list, try to predict each word's tag while keeping
* the correct answer at hand.
*/
partition.foreach { taggedSentence =>
/** Defines a sentence context, with room for look-back */
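/** START and END (defined in the mixed-in perceptron utilities) pad the normalized words with
* sentinel tokens, so look-back features stay well-defined at the sentence boundaries.
*/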
var prev = START(0)
var prev2 = START(1)
val context = START ++: taggedSentence.words.map(w => normalized(w)) ++: END
taggedSentence.words.zipWithIndex.foreach { case (word, i) =>
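/** Words already present in the unambiguous tag book are tagged directly and skip both
* prediction and weight updates; only ambiguous words contribute to training.
*/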
val guess =
twb.getOrElse(
word.toLowerCase, {
val features = getFeatures(i, word, context, prev, prev2)
val guess = predict(features)
partitionUpdateCount += 1L
update(taggedSentence.tags(i), guess, features.keys)
guess
})
/** shift the context */
prev2 = prev
prev = guess
}
}
featuresWeightAcc.addMany(newPartitionWeights)
timeTotalsAcc.updateMany(newPartitionTimeTotals)
updateIterationAcc.add(partitionUpdateCount - partitionUpdateCountOriginal)
})
if (doCache) { sortedSentences.unpersist() }
iterationTimestamps.unpersist(true)
iterationWeights.unpersist(true)
iterationUpdateCount.unpersist(true)
}
}
logger.debug("TRAINING: Finished all iterations")
new PerceptronModel().setModel(
averageWeights(
classes,
taggedWordBook,
featuresWeightAcc,
updateIterationAcc,
timeTotalsAcc))
}
}
/** This is the companion object of [[PerceptronApproachDistributed]]. Please refer to that class
* for the documentation.
*/
object PerceptronApproachDistributed extends DefaultParamsReadable[PerceptronApproachDistributed]