com.johnsnowlabs.nlp.annotators.NormalizerModel.scala Maven / Gradle / Ivy

Go to download
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.AnnotatorType.TOKEN
import com.johnsnowlabs.nlp.serialization.MapFeature
import com.johnsnowlabs.nlp.{
  Annotation,
  AnnotatorModel,
  HasSimpleAnnotate,
  ParamsAndFeaturesReadable
}
import org.apache.spark.ml.param.{BooleanParam, IntParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

/** Instantiated Model of the [[Normalizer]]. For usage and examples, please see the documentation
  * of that class.
  *
  * @see
  *   [[Normalizer]] for the base class
  * @param uid
  *   required internal uid for saving annotator
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupname Ungrouped Members
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class NormalizerModel(override val uid: String)
    extends AnnotatorModel[NormalizerModel]
    with HasSimpleAnnotate[NormalizerModel] {

  /** Output annotator type : TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = TOKEN

  /** Input annotator type : TOKEN
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

  case class TokenizerAndNormalizerMap(
      beginTokenizer: Int,
      endTokenizer: Int,
      token: String,
      beginNormalizer: Int,
      endNormalizer: Int,
      normalizer: String)

  /** normalization regex patterns which match will be removed from token
    *
    * @group param
    */
  val cleanupPatterns = new StringArrayParam(
    this,
    "cleanupPatterns",
    "normalization regex patterns which match will be removed from token")

  /** @group setParam */
  def setCleanupPatterns(value: Array[String]): this.type = set(cleanupPatterns, value)

  /** @group setParam */
  def getCleanupPatterns: Array[String] = $(cleanupPatterns)

  /** whether to convert strings to lowercase
    *
    * @group param
    */
  val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")

  /** @group setParam */
  def setLowercase(value: Boolean): this.type = set(lowercase, value)

  /** @group setParam */
  def getLowercase: Boolean = $(lowercase)

  /** slangDict
    *
    * @group param
    */
  protected val slangDict: MapFeature[String, String] = new MapFeature(this, "slangDict")

  /** whether or not to be case sensitive to match slangs. Defaults to false.
    *
    * @group param
    */
  val slangMatchCase = new BooleanParam(
    this,
    "slangMatchCase",
    "whether or not to be case sensitive to match slangs. Defaults to false.")

  /** @group setParam */
  def setSlangMatchCase(value: Boolean): this.type = set(slangMatchCase, value)

  /** @group getParam */
  def getSlangMatchCase: Boolean = $(slangMatchCase)

  def this() = this(Identifiable.randomUID("NORMALIZER"))

  /** @group setParam */
  def setSlangDict(value: Map[String, String]): this.type = set(slangDict, value)

  /** Set the minimum allowed length for each token
    *
    * @group param
    */
  val minLength = new IntParam(this, "minLength", "Set the minimum allowed length for each token")

  /** @group setParam */
  def setMinLength(value: Int): this.type = {
    require(value >= 0, "minLength must be greater equal than 0")
    require(value.isValidInt, "minLength must be Int")
    set(minLength, value)
  }

  /** @group getParam */
  def getMinLength: Int = $(minLength)

  /** Set the maximum allowed length for each token
    *
    * @group param
    */
  val maxLength = new IntParam(this, "maxLength", "Set the maximum allowed length for each token")

  /** @group setParam */
  def setMaxLength(value: Int): this.type = {
    require(
      value >= $ {
        minLength
      },
      "maxLength must be greater equal than minLength")
    require(value.isValidInt, "minLength must be Int")
    set(maxLength, value)
  }

  /** @group getParam */
  def getMaxLength: Int = $(maxLength)

  def applyRegexPatterns(word: String): String = {

    val nToken = {
      get(cleanupPatterns)
        .map(_.foldLeft(word)((currentText, compositeToken) => {
          currentText.replaceAll(compositeToken, "")
        }))
        .getOrElse(word)
    }
    nToken
  }

  /** Txt file with delimited words to be transformed into something else
    *
    * @group getParam
    */
  protected def getSlangDict: Map[String, String] = $$(slangDict)

  /** ToDo: Review implementation, Current implementation generates spaces between non-words,
    * potentially breaking tokens
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val normalizedAnnotations = annotations.flatMap { originalToken =>
      /** slang dictionary keys should have been lowercased if slangMatchCase is false */
      val unslanged = $$(slangDict).get(
        if ($(slangMatchCase)) originalToken.result
        else originalToken.result.toLowerCase)

      /** simple-tokenize the unslanged slag phrase */
      val tokenizedUnslang = {
        unslanged
          .map(unslang => {
            unslang.split(" ")
          })
          .getOrElse(Array(originalToken.result))
      }

      val cleaned = tokenizedUnslang.map(word => applyRegexPatterns(word))

      val cased = if ($(lowercase)) cleaned.map(_.toLowerCase) else cleaned

      cased
        .filter(t =>
          t.nonEmpty && t.length >= $(minLength) && get(maxLength).forall(m => t.length <= m))
        .map { finalToken =>
          {
            Annotation(
              outputAnnotatorType,
              originalToken.begin,
              originalToken.begin + finalToken.length - 1,
              finalToken,
              originalToken.metadata)
          }
        }

    }

    normalizedAnnotations

  }

}

object NormalizerModel extends ParamsAndFeaturesReadable[NormalizerModel]