com.johnsnowlabs.nlp.embeddings.Doc2VecApproach.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType.{SENTENCE_EMBEDDINGS, TOKEN}
import com.johnsnowlabs.nlp.{AnnotatorApproach, HasEnableCachingProperties, HasProtectedParams}
import com.johnsnowlabs.storage.HasStorageRef
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{DoubleParam, IntParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.mllib.feature.Word2Vec
import org.apache.spark.sql.{Dataset, SparkSession}

/** Trains a Word2Vec model that creates vector representations of words in a text corpus.
  *
  * The algorithm first constructs a vocabulary from the corpus and then learns vector
  * representation of words in the vocabulary. The vector representation can be used as features
  * in natural language processing and machine learning algorithms.
  *
  * We use the Word2Vec implementation from Spark MLlib. It uses the skip-gram model with
  * hierarchical softmax to train the model. The variable names in the implementation match
  * the original C implementation.
  *
  * For instantiated/pretrained models, see [[Doc2VecModel]].
  *
  * '''Sources''' :
  *
  * For the original C implementation, see https://code.google.com/p/word2vec/
  *
  * For the research paper, see
  * [[https://arxiv.org/abs/1301.3781 Efficient Estimation of Word Representations in Vector Space]]
  * and
  * [[https://arxiv.org/pdf/1310.4546v1.pdf Distributed Representations of Words and Phrases and their Compositionality]].
  *
  * ==Example==
  * {{{
  * import spark.implicits._
  * import com.johnsnowlabs.nlp.annotator.{Tokenizer, Doc2VecApproach}
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new Tokenizer()
  *   .setInputCols(Array("document"))
  *   .setOutputCol("token")
  *
  * val embeddings = new Doc2VecApproach()
  *   .setInputCols("token")
  *   .setOutputCol("embeddings")
  *
  * val pipeline = new Pipeline()
  *   .setStages(Array(
  *     documentAssembler,
  *     tokenizer,
  *     embeddings
  *   ))
  *
  * val path = "src/test/resources/spell/sherlockholmes.txt"
  * val dataset = spark.sparkContext.textFile(path)
  *   .toDF("text")
  * val pipelineModel = pipeline.fit(dataset)
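  *
  * // Illustrative follow-up (column names assumed from the pipeline above):
  * // apply the fitted pipeline to produce one sentence embedding per document
  * val result = pipelineModel.transform(dataset)
  * result.select("embeddings").show()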
  * }}}
  *
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class Doc2VecApproach(override val uid: String)
    extends AnnotatorApproach[Doc2VecModel]
    with HasStorageRef
    with HasEnableCachingProperties
    with HasProtectedParams {

  def this() = this(Identifiable.randomUID("Doc2VecApproach"))

  override val description =
    "Distributed Representations of Words and Phrases and their Compositionality"

  /** Input Annotator Types: TOKEN
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

  /** Output Annotator Types: SENTENCE_EMBEDDINGS
    *
    * @group anno
    */
  override val outputAnnotatorType: String = SENTENCE_EMBEDDINGS

  /** The dimension of the vectors that words are mapped to (Default: `100`).
    *
    * @group param
    */
  val vectorSize =
    new IntParam(this, "vectorSize", "the dimension of codes after transforming from words (> 0)")
      .setProtected()

  /** @group setParam */
  def setVectorSize(value: Int): this.type = {
    require(value > 0, s"vector size must be positive but got $value")
    set(vectorSize, value)
  }

  /** @group getParam */
  def getVectorSize: Int = $(vectorSize)

  /** The window size (context words from [-window, window]) (Default: `5`)
    *
    * @group param
    */
  val windowSize = new IntParam(
    this,
    "windowSize",
    "the window size (context words from [-window, window]) (> 0)")

  /** @group setParam */
  def setWindowSize(value: Int): this.type = {
    require(value > 0, s"Window of words must be positive but got $value")
    set(windowSize, value)
    this
  }

  /** @group getParam */
  def getWindowSize: Int = $(windowSize)

  /** Number of partitions for sentences of words (Default: `1`).
    *
    * @group param
    */
  val numPartitions =
    new IntParam(this, "numPartitions", "number of partitions for sentences of words (> 0)")

  /** @group setParam */
  def setNumPartitions(value: Int): this.type = {
    require(value > 0, s"Number of partitions must be positive but got $value")
    set(numPartitions, value)
    this
  }

  /** @group getParam */
  def getNumPartitions: Int = $(numPartitions)

  /** The minimum number of times a token must appear to be included in the Word2Vec model's
    * vocabulary (Default: `1`).
    *
    * @group param
    */
  val minCount = new IntParam(
    this,
    "minCount",
    "the minimum number of times a token must " +
      "appear to be included in the word2vec model's vocabulary (>= 0)")

  /** @group setParam */
  def setMinCount(value: Int): this.type = {
    require(value >= 0, s"Minimum count must be nonnegative but got $value")
    set(minCount, value)
    this
  }

  /** @group getParam */
  def getMinCount: Int = $(minCount)

  /** The maximum length (in words) of each sentence in the input data (Default: `1000`). Any
    * sentence longer than this threshold will be divided into chunks of up to
    * `maxSentenceLength` size.
    *
    * @group param
    */
  val maxSentenceLength = new IntParam(
    this,
    "maxSentenceLength",
    "Maximum length " +
      "(in words) of each sentence in the input data. Any sentence longer than this threshold will " +
      "be divided into chunks up to the size (> 0)")

  /** @group setParam */
  def setMaxSentenceLength(value: Int): this.type = {
    require(value > 0, s"Maximum length of sentences must be positive but got $value")
    set(maxSentenceLength, value)
    this
  }

  /** @group getParam */
  def getMaxSentenceLength: Int = $(maxSentenceLength)

  /** Step size (learning rate) to be used for each iteration of optimization (> 0) (Default:
    * `0.025`).
    *
    * @group param
    */
  val stepSize: DoubleParam = new DoubleParam(
    this,
    "stepSize",
    "Step size (learning rate) to be used for each iteration of optimization (> 0)")

  /** @group setParam */
  def setStepSize(value: Double): this.type = {
    require(value > 0, s"Initial step size must be positive but got $value")
    set(stepSize, value)
    this
  }

  /** @group getParam */
  def getStepSize: Double = $(stepSize)

  /** Maximum number of iterations (> 0) (Default: `1`)
    *
    * @group param
    */
  val maxIter: IntParam = new IntParam(this, "maxIter", "maximum number of iterations (> 0)")

  /** @group setParam */
  def setMaxIter(value: Int): this.type = {
    require(value > 0, s"Number of iterations must be positive but got $value")
    set(maxIter, value)
    this
  }

  /** @group getParam */
  def getMaxIter: Int = $(maxIter)

  /** Random seed used by the underlying Word2Vec model (Default: `44`)
    *
    * @group param
    */
  val seed = new IntParam(this, "seed", "Random seed")

  /** @group setParam */
  def setSeed(value: Int): this.type = {
    require(value > 0, s"random seed must be positive but got $value")
    set(seed, value)
    this
  }

  /** @group getParam */
  def getSeed: Int = $(seed)

  setDefault(
    vectorSize -> 100,
    windowSize -> 5,
    numPartitions -> 1,
    minCount -> 1,
    maxSentenceLength -> 1000,
    stepSize -> 0.025,
    maxIter -> 1,
    seed -> 44,
    enableCaching -> false)

  override def beforeTraining(spark: SparkSession): Unit = {}

  override def train(
      dataset: Dataset[_],
      recursivePipeline: Option[PipelineModel]): Doc2VecModel = {

    // Spark NLP annotation columns are structs; the raw token strings live in the
    // "result" field, so we read from e.g. "token.result"
    val tokenResult: String = ".result"
    val inputColumns = getInputCols(0) + tokenResult

    val word2Vec = new Word2Vec()
      .setLearningRate($(stepSize))
      .setMinCount($(minCount))
      .setNumIterations($(maxIter))
      .setNumPartitions($(numPartitions))
      .setVectorSize($(vectorSize))
      .setWindowSize($(windowSize))
      .setMaxSentenceLength($(maxSentenceLength))
      .setSeed($(seed))

    // Extract each row's tokens as a Seq[String], the input format Word2Vec expects
    val input = dataset.select(dataset.col(inputColumns)).rdd.map(r => r.getSeq[String](0))

    // Optionally cache the tokenized RDD, since Word2Vec makes multiple passes over it
    if (getEnableCaching)
      input.cache()

    val model = word2Vec.fit(input)

    if (getEnableCaching)
      input.unpersist()

    // Wrap the learned word vectors in a Doc2VecModel, which averages the vectors of a
    // document's tokens to produce its sentence embedding
    new Doc2VecModel()
      .setWordVectors(model.getVectors)
      .setVectorSize($(vectorSize))
      .setStorageRef($(storageRef))
      .setDimension($(vectorSize))

  }

}

/** This is the companion object of [[Doc2VecApproach]]. Please refer to that class for the
  * documentation.
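  *
  * ==Example==
  * A minimal sketch of saving and restoring an (untrained) approach through the standard
  * Spark ML params persistence; the path below is hypothetical:
  * {{{
  * val approach = new Doc2VecApproach()
  *   .setInputCols("token")
  *   .setOutputCol("embeddings")
  *
  * // save the parameterized (not yet fitted) annotator
  * approach.write.overwrite().save("/tmp/doc2vec_approach")
  *
  * // restore it via the companion's DefaultParamsReadable.load
  * val restored = Doc2VecApproach.load("/tmp/doc2vec_approach")
  * }}}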
  */
object Doc2VecApproach extends DefaultParamsReadable[Doc2VecApproach]