/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators.parser.dep

import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType._
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.annotators.parser.dep.GreedyTransition._
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.IntParam
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset
import org.slf4j.LoggerFactory

/** Trains an unlabeled parser that finds grammatical relations between two words in a sentence.
*
* For instantiated/pretrained models, see [[DependencyParserModel]].
*
  * The dependency parser provides information about word relationships. For example,
  * dependency parsing can tell you what the subjects and objects of a verb are, as well as
  * which words modify (describe) the subject. This can help you find precise answers to
  * specific questions.
*
* The required training data can be set in two different ways (only one can be chosen for a
* particular model):
* - Dependency treebank in the [[http://www.nltk.org/nltk_data/ Penn Treebank format]] set
* with `setDependencyTreeBank`
* - Dataset in the [[https://universaldependencies.org/format.html CoNLL-U format]] set with
* `setConllU`
*
* Apart from that, no additional training data is needed.
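  *
  * For instance, a CoNLL-U training file could be set instead of a treebank (the path here
  * is merely illustrative):
  * {{{
  * val dependencyParser = new DependencyParserApproach()
  *   .setInputCols("sentence", "pos", "token")
  *   .setOutputCol("dependency")
  *   .setConllU("src/test/resources/parser/unlabeled/conll-u/train_small.conllu.txt")
  * }}}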
*
* See
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserApproachTestSpec.scala DependencyParserApproachTestSpec]]
* for further reference on how to use this API.
*
* ==Example==
* {{{
* import spark.implicits._
* import com.johnsnowlabs.nlp.base.DocumentAssembler
* import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
* import com.johnsnowlabs.nlp.annotators.Tokenizer
* import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel
* import com.johnsnowlabs.nlp.annotators.parser.dep.DependencyParserApproach
* import org.apache.spark.ml.Pipeline
*
* val documentAssembler = new DocumentAssembler()
* .setInputCol("text")
* .setOutputCol("document")
*
* val sentence = new SentenceDetector()
* .setInputCols("document")
* .setOutputCol("sentence")
*
* val tokenizer = new Tokenizer()
* .setInputCols("sentence")
* .setOutputCol("token")
*
* val posTagger = PerceptronModel.pretrained()
* .setInputCols("sentence", "token")
* .setOutputCol("pos")
*
* val dependencyParserApproach = new DependencyParserApproach()
* .setInputCols("sentence", "pos", "token")
* .setOutputCol("dependency")
* .setDependencyTreeBank("src/test/resources/parser/unlabeled/dependency_treebank")
*
* val pipeline = new Pipeline().setStages(Array(
* documentAssembler,
* sentence,
* tokenizer,
* posTagger,
* dependencyParserApproach
* ))
*
  * // Additional training data is not needed; the dependency parser relies on the dependency treebank / CoNLL-U data only.
* val emptyDataSet = Seq.empty[String].toDF("text")
* val pipelineModel = pipeline.fit(emptyDataSet)
* }}}
*
* @see
* [[com.johnsnowlabs.nlp.annotators.parser.typdep.TypedDependencyParserApproach TypedDependencyParserApproach]]
* to extract labels for the dependencies
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio param 1
* @groupprio anno 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class DependencyParserApproach(override val uid: String)
extends AnnotatorApproach[DependencyParserModel] {
override val description: String =
"Dependency Parser is an unlabeled parser that finds a grammatical relation between two words in a sentence"
private val logger = LoggerFactory.getLogger("DependencyParserApproach")
def this() = this(Identifiable.randomUID(DEPENDENCY))
/** Dependency treebank source files
*
* @group param
*/
val dependencyTreeBank =
new ExternalResourceParam(this, "dependencyTreeBank", "Dependency treebank source files")
/** Universal Dependencies source files
*
* @group param
*/
val conllU = new ExternalResourceParam(this, "conllU", "Universal Dependencies source files")
  /** Number of training iterations; accuracy generally improves with more iterations (Default: `10`)
*
* @group param
*/
val numberOfIterations = new IntParam(
this,
"numberOfIterations",
"Number of iterations in training, converges to better accuracy")
/** Dependency treebank folder with files in
* [[http://www.nltk.org/nltk_data/ Penn Treebank format]]
*
* @group setParam
*/
def setDependencyTreeBank(
path: String,
readAs: ReadAs.Format = ReadAs.TEXT,
options: Map[String, String] = Map.empty[String, String]): this.type =
set(dependencyTreeBank, ExternalResource(path, readAs, options))
/** Path to a file in [[https://universaldependencies.org/format.html CoNLL-U format]]
*
* @group setParam
*/
def setConllU(
path: String,
readAs: ReadAs.Format = ReadAs.TEXT,
options: Map[String, String] = Map.empty[String, String]): this.type =
set(conllU, ExternalResource(path, readAs, options))
  /** Number of training iterations; accuracy generally improves with more iterations
*
* @group setParam
*/
def setNumberOfIterations(value: Int): this.type = set(numberOfIterations, value)
setDefault(dependencyTreeBank, ExternalResource("", ReadAs.TEXT, Map.empty[String, String]))
setDefault(conllU, ExternalResource("", ReadAs.TEXT, Map.empty[String, String]))
setDefault(numberOfIterations, 10)
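
  // Exactly one of the two training resources (dependencyTreeBank or conllU) must be
  // set before fitting; validateTrainingFiles() enforces this when train() runs.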
  /** Number of training iterations; accuracy generally improves with more iterations
*
* @group getParam
*/
def getNumberOfIterations: Int = $(numberOfIterations)
  /** Output annotation type: DEPENDENCY
*
* @group anno
*/
override val outputAnnotatorType: String = DEPENDENCY
  /** Input annotation types: DOCUMENT, POS, TOKEN
*
* @group anno
*/
override val inputAnnotatorTypes = Array(DOCUMENT, POS, TOKEN)
private lazy val conllUAsArray = ResourceHelper.parseLines($(conllU))
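
  /** Parses raw dependency treebank content into training sentences.
    *
    * Each line is expected to carry three whitespace-separated columns (word, POS tag,
    * 1-based head index, with 0 denoting the root), and sentences are separated by blank
    * lines. A sketch of the expected layout:
    * {{{
    * Pierre NNP 2
    * Vinken NNP 8
    * will   MD  8
    * }}}
    */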
def readCONLL(filesContent: Seq[Iterator[String]]): List[Sentence] = {
    val buffer = new StringBuilder()
filesContent.foreach { fileContent =>
fileContent.foreach(line => buffer.append(line + System.lineSeparator()))
}
val wholeText = buffer.toString()
val sections = wholeText.split(s"${System.lineSeparator()}${System.lineSeparator()}").toList
val sentences = sections.map(s => {
val lines = s.split(s"${System.lineSeparator()}").toList
val body = lines.map(l => {
val arr = l.split("\\s+")
val (raw, pos, dep) = (arr(0), arr(1), arr(2).toInt)
        // The CoNLL head column is 1-based with 0 denoting the root: [root, word1, ..., wordN].
        // Our layout is 0-based with the root slot after the last word: [word0, ..., word(N-1), root],
        // so head 0 maps to index N (== lines.length) and any other head maps to head - 1.
        val depEx = if (dep == 0) lines.length else dep - 1
        WordData(raw, pos, depEx)
})
body // Don't pretty up the sentence itself
})
sentences
}
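
  /** Builds a tag dictionary and `Tagger` from the training sentences, then trains the
    * `DependencyMaker` for `numberOfIterations` passes and wraps the result in a
    * [[DependencyParserModel]].
    */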
override def train(
dataset: Dataset[_],
recursivePipeline: Option[PipelineModel]): DependencyParserModel = {
validateTrainingFiles()
val trainingSentences = getTrainingSentences
val (classes, tagDictionary) = TagDictionary.classesAndTagDictionary(trainingSentences)
val tagger = new Tagger(classes, tagDictionary)
val taggerNumberOfIterations = getNumberOfIterations
val dependencyMaker = new DependencyMaker(tagger)
val dependencyMakerPerformanceProgress = (0 until taggerNumberOfIterations).map { seed =>
dependencyMaker.train(trainingSentences, seed)
}
logger.info(s"Dependency Maker Performance = $dependencyMakerPerformanceProgress")
new DependencyParserModel()
.setPerceptron(dependencyMaker)
}
def validateTrainingFiles(): Unit = {
if ($(dependencyTreeBank).path != "" && $(conllU).path != "") {
      throw new IllegalArgumentException(
        "Use either a TreeBank or a CoNLL-U file; setting both is not allowed.")
}
if ($(dependencyTreeBank).path == "" && $(conllU).path == "") {
throw new IllegalArgumentException("Either TreeBank or CoNLL-U format file is required.")
}
}
  /** Gets the list of training sentences, read from either the TreeBank or the CoNLL-U resource */
def getTrainingSentences: List[Sentence] = {
if ($(dependencyTreeBank).path != "") {
val filesContentTreeBank = getFilesContentTreeBank
readCONLL(filesContentTreeBank)
} else {
getTrainingSentencesFromConllU(conllUAsArray)
}
}
  /** Gets the content of the TreeBank files as a sequence of line iterators */
def getFilesContentTreeBank: Seq[Iterator[String]] =
ResourceHelper.getFilesContentBuffer($(dependencyTreeBank))
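
  /** Splits the comment-free CoNLL-U lines into sentence blocks (delimited by empty
    * lines) and converts each block into a training `Sentence`.
    */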
def getTrainingSentencesFromConllU(conllUAsArray: Array[String]): List[Sentence] = {
val conllUSentences = conllUAsArray.filterNot(line => lineIsComment(line))
val indexSentenceBoundaries = conllUSentences.zipWithIndex.filter(_._1 == "").map(_._2)
val cleanConllUSentences = indexSentenceBoundaries.zipWithIndex.map {
case (indexSentenceBoundary, index) =>
if (index == 0) {
conllUSentences.slice(index, indexSentenceBoundary)
} else {
conllUSentences.slice(indexSentenceBoundaries(index - 1) + 1, indexSentenceBoundary)
}
}
val sentences = cleanConllUSentences.map { cleanConllUSentence =>
transformToSentences(cleanConllUSentence)
}
sentences.toList
}
  def lineIsComment(line: String): Boolean = line.startsWith("#")
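
  /** Converts one CoNLL-U sentence block into a training `Sentence`.
    *
    * Only the ID, FORM, XPOS and HEAD columns of the 10-column CoNLL-U layout are used;
    * tokens whose ID is not a plain integer (multiword ranges like `1-2` or empty nodes
    * like `1.1`) are dropped. A typical line (columns are tab-separated; spaces shown
    * here for readability):
    * {{{
    * 1  They  they  PRON  PRP  Case=Nom|Number=Plur  2  nsubj  _  _
    * }}}
    */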
def transformToSentences(cleanConllUSentence: Array[String]): Sentence = {
val ID_INDEX = 0
val WORD_INDEX = 1
val POS_INDEX = 4
val HEAD_INDEX = 6
val SEPARATOR = "\\t"
val sentences = cleanConllUSentence.map { conllUWord =>
val wordArray = conllUWord.split(SEPARATOR)
if (wordArray(ID_INDEX).matches("\\d+") && !wordArray(ID_INDEX).contains(".")) {
        val rawHead = wordArray(HEAD_INDEX).toInt
        // Head 0 marks the root: map it to the slot just past the last word.
        // Otherwise convert the 1-based CoNLL-U head to a 0-based index.
        val head = if (rawHead == 0) cleanConllUSentence.length else rawHead - 1
WordData(wordArray(WORD_INDEX), wordArray(POS_INDEX), head)
} else {
WordData("", "", -1)
}
}
sentences.filter(word => word.dep != -1).toList
}
}
/** This is the companion object of [[DependencyParserApproach]]. Please refer to that class for
* the documentation.
*/
object DependencyParserApproach extends DefaultParamsReadable[DependencyParserApproach]