com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentModel.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-nlp_2.11
spark-nlp
There is a newer version: 1.6.2
package com.johnsnowlabs.nlp.annotators.sda.vivekn

import com.johnsnowlabs.nlp.annotators.common.{TokenizedSentence, TokenizedWithSentence}
import com.johnsnowlabs.nlp.serialization.{ArrayFeature, MapFeature}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, ParamsAndFeaturesReadable}
import com.johnsnowlabs.util.ConfigLoader
import com.typesafe.config.Config
import org.apache.spark.ml.param.LongParam
import org.apache.spark.ml.util.Identifiable

/**
  * Annotator model that labels every tokenized sentence as "positive" or "negative".
  *
  * It requires TOKEN and DOCUMENT annotations as input and emits SENTIMENT annotations.
  * The decision is based on per-word counts stored in the `positive` and `negative` features,
  * restricted to the vocabulary stored in the `words` feature.
  *
  * @param uid unique identifier of this model instance
  */
class ViveknSentimentModel(override val uid: String) extends AnnotatorModel[ViveknSentimentModel] with ViveknSentimentUtils {

  import com.johnsnowlabs.nlp.AnnotatorType._

  // Feature-selection settings read once from the loaded configuration; they are only used by setWords.
  private val config: Config = ConfigLoader.retrieve
  private val importantFeatureRatio = config.getDouble("nlp.viveknSentiment.importantFeaturesRatio")
  private val unimportantFeatureStep = config.getDouble("nlp.viveknSentiment.unimportantFeaturesStepRatio")
  private val featureLimit = config.getInt("nlp.viveknSentiment.featuresLimit")

  override val annotatorType: AnnotatorType = SENTIMENT

  override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN, DOCUMENT)

  // Per-word counts, looked up by word in classify (a missing word counts as 0).
  protected val positive: MapFeature[String, Long] = new MapFeature(this, "positive_sentences")
  protected val negative: MapFeature[String, Long] = new MapFeature(this, "negative_sentences")
  // Vocabulary that classify intersects each sentence with.
  protected val words: ArrayFeature[String] = new ArrayFeature(this, "words")

  // Denominators used by classify when turning the per-word counts into log scores.
  protected val positiveTotals: LongParam = new LongParam(this, "positive_totals", "count of positive words")
  protected val negativeTotals: LongParam = new LongParam(this, "negative_totals", "count of negative words")

  def this() = this(Identifiable.randomUID("VIVEKN"))

  private[vivekn] def getPositive: Map[String, Long] = $$(positive)
  private[vivekn] def getNegative: Map[String, Long] = $$(negative)
  private[vivekn] def getFeatures: Array[String] = $$(words)

  private[vivekn] def setPositive(value: Map[String, Long]): this.type = set(positive, value)
  private[vivekn] def setNegative(value: Map[String, Long]): this.type = set(negative, value)
  private[vivekn] def setPositiveTotals(value: Long): this.type = set(positiveTotals, value)
  private[vivekn] def setNegativeTotals(value: Long): this.type = set(negativeTotals, value)

  /**
    * Selects the vocabulary from `value` and stores it, de-duplicated, in the `words` feature.
    *
    * The first `ceil(value.length * importantFeaturesRatio)` entries are always kept. After that,
    * slices of `step` entries are added starting at every `step`-th index up to the feature limit
    * (`value.length` when the configured limit is -1), where `step = ceil(limit * unimportantFeaturesStepRatio)`.
    *
    * @param value candidate words; must not be empty
    * @return this model, for call chaining
    * @throws IllegalArgumentException if `value` is empty
    */
  private[vivekn] def setWords(value: Array[String]): this.type = {
    require(value.nonEmpty, "Word analysis for features cannot be empty. Set prune to false if training is small")
    val importantCount = (value.length * importantFeatureRatio).ceil.toInt
    val upperBound = if (featureLimit == -1) value.length else featureLimit
    val step = (upperBound * unimportantFeatureStep).ceil.toInt

    val selected = scala.collection.mutable.Set.empty[String]
    selected ++= value.take(importantCount)
    // Range refuses a step of 0 with a bare "step cannot be 0" IllegalArgumentException. That happens when the
    // configured limit or step ratio is 0; in that case only the leading entries are kept instead of failing.
    if (step != 0) {
      Range(importantCount, upperBound, step).foreach { from =>
        // slice clamps to the array bounds, so the last slice may be shorter than `step`
        selected ++= value.slice(from, from + step)
      }
    }

    set(words, selected.toArray)
  }

  /**
    * Decides whether a sentence is positive.
    *
    * The tokens are passed through `negateSequence`, intersected with the stored vocabulary and de-duplicated.
    * For each remaining word, `log((count + 1.0) / (2.0 * total))` is summed separately over the positive and
    * the negative counts; the sentence is positive when the positive sum is strictly greater.
    *
    * @param sentence tokenized sentence to score
    * @return true when no token is in the vocabulary or when the positive score is strictly greater than the
    *         negative score, false otherwise
    */
  def classify(sentence: TokenizedSentence): Boolean = {
    val wordFeatures = negateSequence(sentence.tokens.toList).intersect($$(words)).distinct

    // Resolve the counts and total once per class rather than once per word.
    def logScore(counts: Map[String, Long], total: Long): Double =
      wordFeatures.map(word => scala.math.log((counts.getOrElse(word, 0L) + 1.0) / (2.0 * total))).sum

    if (wordFeatures.isEmpty) true
    else logScore($$(positive), $(positiveTotals)) > logScore($$(negative), $(negativeTotals))
  }

  /**
    * Produces one sentiment annotation per sentence that has at least one indexed token.
    *
    * Each result spans from the smallest token `begin` to the largest token `end` of its sentence, carries
    * "positive" or "negative" as its result and has empty metadata. Sentences without indexed tokens are skipped.
    *
    * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any
    * @return the sentiment annotations, in the order the sentences were unpacked. Not necessarily a one to one
    *         relationship with the input
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    TokenizedWithSentence.unpack(annotations)
      .filter(_.indexedTokens.nonEmpty)
      .map { sentence =>
        val tokens = sentence.indexedTokens
        Annotation(
          annotatorType,
          tokens.map(_.begin).min,
          tokens.map(_.end).max,
          if (classify(sentence)) "positive" else "negative",
          Map.empty[String, String]
        )
      }
  }

}

/**
  * Companion object of the ViveknSentimentModel class. It declares no members of its own; everything it offers
  * is inherited from ParamsAndFeaturesReadable.
  * NOTE(review): presumably this supplies the reader used to load a saved model - confirm against
  * ParamsAndFeaturesReadable.
  */
object ViveknSentimentModel extends ParamsAndFeaturesReadable[ViveknSentimentModel]