com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings.scala Maven / Gradle / Ivy

Go to download
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, SENTENCE_EMBEDDINGS, WORD_EMBEDDINGS}
import com.johnsnowlabs.nlp.annotators.common.{SentenceSplit, WordpieceEmbeddingsSentence}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasSimpleAnnotate}
import com.johnsnowlabs.storage.HasStorageRef
import org.apache.spark.ml.param.{IntParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}

/** Converts the results from [[WordEmbeddings]], [[BertEmbeddings]], or [[ElmoEmbeddings]] into
  * sentence or document embeddings by either summing up or averaging all the word embeddings in a
  * sentence or a document (depending on the inputCols).
  *
  * This can be configured with `setPoolingStrategy`, which can either be `"AVERAGE"` or `"SUM"`.
  *
  * For more extended examples see the
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb Examples]]
  * and the
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddingsTestSpec.scala SentenceEmbeddingsTestSpec]].
  *
  * ==Example==
  * {{{
  * import spark.implicits._
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotators.Tokenizer
  * import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel
  * import com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings
  * import com.johnsnowlabs.nlp.EmbeddingsFinisher
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new Tokenizer()
  *   .setInputCols(Array("document"))
  *   .setOutputCol("token")
  *
  * val embeddings = WordEmbeddingsModel.pretrained()
  *   .setInputCols("document", "token")
  *   .setOutputCol("embeddings")
  *
  * val embeddingsSentence = new SentenceEmbeddings()
  *   .setInputCols(Array("document", "embeddings"))
  *   .setOutputCol("sentence_embeddings")
  *   .setPoolingStrategy("AVERAGE")
  *
  * val embeddingsFinisher = new EmbeddingsFinisher()
  *   .setInputCols("sentence_embeddings")
  *   .setOutputCols("finished_embeddings")
  *   .setOutputAsVector(true)
  *   .setCleanAnnotations(false)
  *
  * val pipeline = new Pipeline()
  *   .setStages(Array(
  *     documentAssembler,
  *     tokenizer,
  *     embeddings,
  *     embeddingsSentence,
  *     embeddingsFinisher
  *   ))
  *
  * val data = Seq("This is a sentence.").toDF("text")
  * val result = pipeline.fit(data).transform(data)
  *
  * result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
  * +--------------------------------------------------------------------------------+
  * |                                                                          result|
  * +--------------------------------------------------------------------------------+
  * |[-0.22093398869037628,0.25130119919776917,0.41810303926467896,-0.380883991718...|
  * +--------------------------------------------------------------------------------+
  * }}}
  *
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupname Ungrouped Members
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class SentenceEmbeddings(override val uid: String)
    extends AnnotatorModel[SentenceEmbeddings]
    with HasSimpleAnnotate[SentenceEmbeddings]
    with HasEmbeddingsProperties
    with HasStorageRef {

  /** Output annotator type : SENTENCE_EMBEDDINGS
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = SENTENCE_EMBEDDINGS

  /** Input annotator type : DOCUMENT, WORD_EMBEDDINGS
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT, WORD_EMBEDDINGS)

  /** Number of embedding dimensions (Default: `100`)
    *
    * @group param
    */
  override val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  /** Returns the currently configured number of embedding dimensions (Default: `100`).
    *
    * @return
    *   the value of the `dimension` parameter
    * @group getParam
    */
  override def getDimension: Int = $(dimension)

  /** Choose how you would like to aggregate Word Embeddings to Sentence Embeddings (Default:
    * `"AVERAGE"`). Can either be `"AVERAGE"` or `"SUM"`.
    *
    * @group param
    */
  val poolingStrategy = new Param[String](
    this,
    "poolingStrategy",
    "Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM")

  /** Choose how you would like to aggregate Word Embeddings to Sentence Embeddings (Default:
    * `"AVERAGE"`). Can either be `"AVERAGE"` or `"SUM"`; the value is matched case-insensitively
    * and stored in upper case.
    *
    * @param strategy
    *   pooling strategy name, `"AVERAGE"` or `"SUM"` in any letter case
    * @return
    *   this annotator, to allow chaining of setters
    * @throws scala.MatchError
    *   if `strategy` is neither `"AVERAGE"` nor `"SUM"`
    * @group setParam
    */
  def setPoolingStrategy(strategy: String): this.type = {
    strategy.toLowerCase() match {
      case "average" => set(poolingStrategy, "AVERAGE")
      case "sum" => set(poolingStrategy, "SUM")
      case _ => throw new MatchError("poolingStrategy must be either AVERAGE or SUM")
    }
  }

  setDefault(
    inputCols -> Array(DOCUMENT, WORD_EMBEDDINGS),
    outputCol -> "sentence_embeddings",
    poolingStrategy -> "AVERAGE",
    dimension -> 100)

  /** Internal constructor to submit a random UID */
  def this() = this(Identifiable.randomUID("SENTENCE_EMBEDDINGS"))

  /** Pools a matrix of token embeddings (one row per token) into a single vector.
    *
    * Every component of the result is the sum of that component over all rows; when the pooling
    * strategy is `"AVERAGE"` the sum is additionally divided by the number of rows. The width of
    * the first row is taken as the embedding width and is also written to the `dimension`
    * parameter via `setDimension`.
    *
    * @param matrix
    *   token embeddings, one row per token
    * @return
    *   the pooled vector, or an empty array when `matrix` has no rows
    */
  private def calculateSentenceEmbeddings(matrix: Array[Array[Float]]): Array[Float] =
    matrix.headOption match {
      // No token rows: there is nothing to pool. Returning an empty vector (instead of failing
      // on matrix(0)) matches what annotate emits for a sentence without any embeddings.
      case None => Array.empty[Float]

      case Some(firstRow) =>
        val width = firstRow.length
        setDimension(width)

        val pooled = Array.ofDim[Float](width)
        // Rows are accumulated in their original order, so each component is summed in the
        // same order as a per-column traversal would sum it.
        matrix.foreach { row =>
          firstRow.indices.foreach(j => pooled(j) += row(j))
        }

        // The strategy does not change while pooling, so it is looked up once here rather
        // than once per dimension.
        if ($(poolingStrategy) == "AVERAGE") {
          val tokenCount = matrix.length
          firstRow.indices.foreach(j => pooled(j) /= tokenCount)
        }
        pooled
    }

  /** takes a document and annotations and produces new annotations of this annotator's annotation
    * type
    *
    * One output annotation is produced per sentence unpacked from the input. It carries the
    * sentence's span and content together with the pooled embeddings of the word-embedding
    * entries whose `sentenceId` equals the sentence index; a sentence without matching entries
    * gets an empty embeddings array.
    *
    * @param annotations
    *   Annotations that correspond to inputAnnotationCols generated by previous annotators if any
    * @return
    *   any number of annotations processed for every input annotation. Not necessary one to one
    *   relationship
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = SentenceSplit.unpack(annotations)

    val embeddingsSentences = WordpieceEmbeddingsSentence.unpack(annotations)

    sentences.map { sentence =>
      // Word-embedding entries that belong to the current sentence.
      val matchingEmbeddings =
        embeddingsSentences.filter(candidate => candidate.sentenceId == sentence.index)

      val sentenceEmbeddings = matchingEmbeddings.flatMap { tokenEmbedding =>
        val allEmbeddings = tokenEmbedding.tokens.map(_.embeddings)
        calculateSentenceEmbeddings(allEmbeddings)
      }.toArray

      Annotation(
        annotatorType = outputAnnotatorType,
        begin = sentence.start,
        end = sentence.end,
        result = sentence.content,
        metadata = Map(
          "sentence" -> sentence.index.toString,
          "token" -> sentence.content,
          "pieceId" -> "-1",
          "isWordStart" -> "true"),
        embeddings = sentenceEmbeddings)
    }
  }

  /** Reads the storage reference from the WORD_EMBEDDINGS input column and adopts it as this
    * annotator's storage reference when none has been set yet.
    *
    * @param dataset
    *   the dataset about to be annotated
    * @return
    *   the same dataset, unchanged
    */
  override protected def beforeAnnotate(dataset: Dataset[_]): Dataset[_] = {
    val ref =
      HasStorageRef.getStorageRefFromInput(dataset, $(inputCols), AnnotatorType.WORD_EMBEDDINGS)
    if (get(storageRef).isEmpty)
      setStorageRef(ref)
    dataset
  }

  /** Replaces the output column with the result of `wrapSentenceEmbeddingsMetadata`, which is
    * given the column, the configured dimension and the storage reference.
    *
    * @param dataset
    *   the annotated DataFrame
    * @return
    *   the DataFrame with the rewrapped output column
    */
  override protected def afterAnnotate(dataset: DataFrame): DataFrame = {
    dataset.withColumn(
      getOutputCol,
      wrapSentenceEmbeddingsMetadata(
        dataset.col(getOutputCol),
        $(dimension),
        Some($(storageRef))))
  }
}

/** This is the companion object of [[SentenceEmbeddings]]. Please refer to that class for the
  * documentation.
  *
  * It mixes in Spark ML's `DefaultParamsReadable`, which supplies the default reader used to
  * restore a previously saved [[SentenceEmbeddings]] instance (e.g. `SentenceEmbeddings.load(path)`).
  */
object SentenceEmbeddings extends DefaultParamsReadable[SentenceEmbeddings]