
com.johnsnowlabs.nlp.annotators.RecursiveTokenizerModel.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.annotators.common.{
  InfixToken,
  PrefixedToken,
  PreprocessingParser,
  SuffixedToken
}
import com.johnsnowlabs.nlp.serialization.{ArrayFeature, SetFeature}
import com.johnsnowlabs.nlp._
import org.apache.spark.ml.util.Identifiable

/** Instantiated model of the [[RecursiveTokenizer]]. For usage and examples see the documentation
  * of the main class.
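  *
  * This model is the result of fitting a [[RecursiveTokenizer]]. A minimal pipeline sketch
  * (the example data is illustrative):
  * {{{
  * import spark.implicits._
  * import com.johnsnowlabs.nlp.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotators.RecursiveTokenizer
  * import org.apache.spark.ml.Pipeline
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new RecursiveTokenizer()
  *   .setInputCols("document")
  *   .setOutputCol("token")
  *
  * val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer))
  *
  * val data = Seq("One day I will see the world.").toDF("text")
  * // fit returns a PipelineModel whose tokenizer stage is a RecursiveTokenizerModel
  * val result = pipeline.fit(data).transform(data)
  * }}}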
  *
  * @param uid
  *   required internal uid for saving annotator
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class RecursiveTokenizerModel(override val uid: String)
    extends AnnotatorModel[RecursiveTokenizerModel]
    with HasSimpleAnnotate[RecursiveTokenizerModel]
    with ParamsAndFeaturesWritable {

  /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
    * type
    */
  def this() = this(Identifiable.randomUID("RecursiveTokenizerModel"))

  /** Output Annotator types: TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.TOKEN

  /** Input Annotator types: DOCUMENT
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[String] = Array(AnnotatorType.DOCUMENT)

  /** Strings to be detached from the beginning of tokens
    *
    * @group param
    */
  val prefixes: ArrayFeature[String] = new ArrayFeature[String](this, "prefixes")

  /** Sets the prefix strings, stored sorted by length in descending order so that longer
    * prefixes are matched first
    *
    * @group setParam
    */
  def setPrefixes(p: Array[String]): this.type = set(prefixes, p.sortBy(_.length).reverse)

  /** Strings to be detached from the end of tokens
    *
    * @group param
    */
  val suffixes: ArrayFeature[String] = new ArrayFeature[String](this, "suffixes")

  /** Sets the suffix strings, stored sorted by length in descending order so that longer
    * suffixes are matched first
    *
    * @group setParam
    */
  def setSuffixes(s: Array[String]): this.type = set(suffixes, s.sortBy(_.length).reverse)

  /** Strings on which tokens are split internally
    *
    * @group param
    */
  val infixes: ArrayFeature[String] = new ArrayFeature[String](this, "infixes")

  /** Sets the infix strings, stored sorted by length in descending order so that longer
    * infixes are matched first
    *
    * @group setParam
    */
  def setInfixes(s: Array[String]): this.type = set(infixes, s.sortBy(_.length).reverse)

  /** Tokens in this set are passed through unchanged by all parsing passes
    *
    * @group param
    */
  val whitelist: SetFeature[String] = new SetFeature[String](this, "whitelist")

  /** Sets the whitelist of tokens that should never be split
    *
    * @group setParam
    */
  def setWhitelist(wlist: Set[String]): this.type = set(whitelist, wlist)
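
  // A direct-configuration sketch (all pattern values below are illustrative assumptions):
  //   new RecursiveTokenizerModel()
  //     .setPrefixes(Array("(", "\"", "'"))
  //     .setSuffixes(Array(".", ",", ")", "\"", "'"))
  //     .setInfixes(Array("(", ")"))
  //     .setWhitelist(Set("e.g.", "i.e.")) // whitelisted tokens are never split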

  /** Takes a document and annotations and produces new annotations of this annotator's
    * annotation type
    *
    * @param annotations
    *   Annotations that correspond to inputAnnotationCols generated by previous annotators, if any
    * @return
    *   any number of annotations processed for every input annotation. Not necessarily a
    *   one-to-one relationship
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    annotations.flatMap { annotation =>
      tokenize(annotation.result, annotation.begin).map(token =>
        annotation.copy(
          annotatorType = AnnotatorType.TOKEN,
          result = token._1,
          begin = token._2,
          end = token._3,
          metadata = annotation.metadata
            .updated("sentence", annotation.metadata.getOrElse("sentence", "0"))))
    }
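
  // Illustrative example: a DOCUMENT annotation covering "Hello world." yields TOKEN
  // annotations ("Hello", 0, 4), ("world", 6, 10) and (".", 11, 11), assuming "." is
  // configured as a suffix.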

  // The parsing passes are hardcoded at this time: infixes are handled first,
  // then suffixes and prefixes.
  @transient
  private lazy val firstPass = Seq(InfixToken($$(infixes)))
  @transient
  private lazy val secondPass = Seq(SuffixedToken($$(suffixes)), PrefixedToken($$(prefixes)))

  private def tokenize(text: String, beginTextIndex: Int): Seq[(String, Int, Int)] = {
    // Split on single spaces. Empty tokens produced by consecutive spaces are kept
    // so that begin/end offsets stay aligned with the original text; they are
    // dropped by the final filter below.
    val splitText = text.split(" ")
    var previousBegin = beginTextIndex

    splitText.zipWithIndex
      .flatMap { case (token, tokenIndex) =>
        if (tokenIndex > 0) {
          previousBegin = previousBegin + splitText(tokenIndex - 1).length + 1
        }
        var tmpToken: Seq[(String, Int, Int)] =
          Seq((token, previousBegin, previousBegin + token.length - 1))

        // Apply the parsing passes in order (infixes first, then suffixes and prefixes),
        // leaving whitelisted tokens untouched.
        (firstPass ++ secondPass).foreach { parser =>
          tmpToken = tmpToken.flatMap { case t @ (form, begin, _) =>
            if (whitelist.getOrDefault.contains(form)) Seq(t)
            else parseSeparator(parser, form, begin)
          }
        }
        tmpToken
      }
      .filter(_._1.nonEmpty)
  }
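
  // Worked example (illustrative; assumes infixes include "(" and ")"):
  //   tokenize("see (it).", 0) ==
  //     Seq(("see", 0, 2), ("(", 4, 4), ("it", 5, 6), (")", 7, 7), (".", 8, 8))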

  private def parseSeparator(
      parser: PreprocessingParser,
      token: String,
      begin: Int): Array[(String, Int, Int)] = {
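    // The parser re-emits the token with spaces inserted around each detached piece,
    // so splitting on " " recovers the pieces. Offsets advance by the length of the
    // previous piece only, because the inserted spaces do not exist in the original text.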
    val parserSeparator = parser.separate(token).split(" ")
    var currentParsedTokenBegin = begin
    val tokensResult = parserSeparator.zipWithIndex.map { case (parsedToken, index) =>
      if (index > 0)
        currentParsedTokenBegin = currentParsedTokenBegin + parserSeparator(index - 1).length
      (parsedToken, currentParsedTokenBegin, currentParsedTokenBegin + parsedToken.length - 1)
    }

    tokensResult
  }

}

object RecursiveTokenizerModel extends ParamsAndFeaturesReadable[RecursiveTokenizerModel]



