
package com.johnsnowlabs.nlp.annotators.sda.vivekn

import com.johnsnowlabs.nlp.annotators.common.{TokenizedSentence, TokenizedWithSentence}
import com.johnsnowlabs.nlp.serialization.{ArrayFeature, MapFeature}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, ParamsAndFeaturesReadable}
import com.johnsnowlabs.util.ConfigLoader
import com.typesafe.config.Config
import org.apache.spark.ml.param.LongParam
import org.apache.spark.ml.util.Identifiable
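
/**
  * Naive Bayes style sentiment model (the "Vivekn" approach): pre-computed word counts for the
  * positive and negative classes are stored as serialized features, and at annotation time each
  * sentence is labeled by comparing the summed class log-probabilities of its features (see `classify`).
  */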
class ViveknSentimentModel(override val uid: String)
  extends AnnotatorModel[ViveknSentimentModel] with ViveknSentimentUtils {

  import com.johnsnowlabs.nlp.AnnotatorType._

  private val config: Config = ConfigLoader.retrieve
  private val importantFeatureRatio = config.getDouble("nlp.viveknSentiment.importantFeaturesRatio")
  private val unimportantFeatureStep = config.getDouble("nlp.viveknSentiment.unimportantFeaturesStepRatio")
  private val featureLimit = config.getInt("nlp.viveknSentiment.featuresLimit")

  override val annotatorType: AnnotatorType = SENTIMENT
  override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN, DOCUMENT)
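
  // Trained state serialized with the model: per-word counts gathered from positive and negative
  // training sentences, the pruned feature vocabulary, and the per-class totals used as
  // normalizers in classify.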
  protected val positive: MapFeature[String, Long] = new MapFeature(this, "positive_sentences")
  protected val negative: MapFeature[String, Long] = new MapFeature(this, "negative_sentences")
  protected val words: ArrayFeature[String] = new ArrayFeature(this, "words")

  protected val positiveTotals: LongParam = new LongParam(this, "positive_totals", "count of positive words")
  protected val negativeTotals: LongParam = new LongParam(this, "negative_totals", "count of negative words")

  def this() = this(Identifiable.randomUID("VIVEKN"))

  private[vivekn] def getPositive: Map[String, Long] = $$(positive)
  private[vivekn] def getNegative: Map[String, Long] = $$(negative)
  private[vivekn] def getFeatures: Array[String] = $$(words)

  private[vivekn] def setPositive(value: Map[String, Long]): this.type = set(positive, value)
  private[vivekn] def setNegative(value: Map[String, Long]): this.type = set(negative, value)
  private[vivekn] def setPositiveTotals(value: Long): this.type = set(positiveTotals, value)
  private[vivekn] def setNegativeTotals(value: Long): this.type = set(negativeTotals, value)
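
  // Prunes the ranked word list before storing it as the feature vocabulary: the leading
  // importantFeatureRatio fraction is always kept, and the remainder is walked in step-sized
  // slices up to featureLimit (or over the whole list when featureLimit is -1).
  // Hypothetical example: with 1,000 ranked words and importantFeatureRatio = 0.5, the first
  // 500 words are kept outright before the stepped pass over the tail.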
  private[vivekn] def setWords(value: Array[String]): this.type = {
    require(value.nonEmpty, "Word analysis for features cannot be empty. Set prune to false if training is small")
    val currentFeatures = scala.collection.mutable.Set.empty[String]
    val start = (value.length * importantFeatureRatio).ceil.toInt
    val afterStart = {
      if (featureLimit == -1) value.length
      else featureLimit
    }
    val step = (afterStart * unimportantFeatureStep).ceil.toInt
    value.take(start).foreach(currentFeatures.add)
    Range(start, afterStart, step).foreach(k => {
      value.slice(k, k + step).foreach(currentFeatures.add)
    })
    set(words, currentFeatures.toArray)
  }
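
  // Naive Bayes style scoring with add-one smoothing: each stored feature found in the sentence
  // contributes log((count + 1) / (2 * classTotal)) per class, and the sentence is labeled
  // positive when the summed positive log-probability wins (also the default when none of the
  // sentence's tokens match the stored vocabulary). negateSequence, provided by
  // ViveknSentimentUtils, applies the approach's negation handling to the tokens before they
  // are matched against the vocabulary.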
  def classify(sentence: TokenizedSentence): Boolean = {
    val wordFeatures = negateSequence(sentence.tokens.toList).intersect($$(words)).distinct
    if (wordFeatures.isEmpty) return true
    val positiveProbability = wordFeatures.map(word => scala.math.log(($$(positive).getOrElse(word, 0L) + 1.0) / (2.0 * $(positiveTotals)))).sum
    val negativeProbability = wordFeatures.map(word => scala.math.log(($$(negative).getOrElse(word, 0L) + 1.0) / (2.0 * $(negativeTotals)))).sum
    positiveProbability > negativeProbability
  }
  /**
    * Tokens are needed to identify each word within its sentence boundary.
    * POS tags and lemmas may be supplied by earlier annotators but are not used by this model.
    * Sentiment bounds are taken from the first and last token of each sentence.
    * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators, if any
    * @return any number of annotations processed for every input annotation; not necessarily a one-to-one relationship
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = TokenizedWithSentence.unpack(annotations)
    sentences.filter(s => s.indexedTokens.nonEmpty).map(sentence => {
      Annotation(
        annotatorType,
        sentence.indexedTokens.map(t => t.begin).min,
        sentence.indexedTokens.map(t => t.end).max,
        if (classify(sentence)) "positive" else "negative",
        Map.empty[String, String]
      )
    })
  }
}
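
// The companion object mixes in ParamsAndFeaturesReadable, which supplies the MLReadable
// entry points (read/load) so a saved model is restored together with its serialized features.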
object ViveknSentimentModel extends ParamsAndFeaturesReadable[ViveknSentimentModel]
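
// A minimal usage sketch (not part of this file): restore a previously saved model and run it in
// a Spark ML pipeline after a DocumentAssembler and Tokenizer. The path and column names below
// are illustrative assumptions, and the exact loader/trainer API depends on the Spark NLP version.
//
//   import com.johnsnowlabs.nlp.DocumentAssembler
//   import com.johnsnowlabs.nlp.annotators.Tokenizer
//   import org.apache.spark.ml.Pipeline
//
//   val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
//   val token = new Tokenizer().setInputCols(Array("document")).setOutputCol("token")
//   val vivekn = ViveknSentimentModel.load("/path/to/saved/vivekn_model")
//     .setInputCols(Array("document", "token")).setOutputCol("sentiment")
//   val pipeline = new Pipeline().setStages(Array(document, token, vivekn))
//   val result = pipeline.fit(trainingData).transform(testData)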