/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType.TOKEN
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{BooleanParam, IntParam, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset
/** Annotator that cleans out tokens. It requires `TOKEN` input (for example from a
* [[Tokenizer]]), removes all dirty characters from text following a regex pattern, and
* transforms words based on a provided dictionary.
*
* For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb Spark NLP Workshop]].
*
* ==Example==
* {{{
* import spark.implicits._
* import com.johnsnowlabs.nlp.DocumentAssembler
* import com.johnsnowlabs.nlp.annotator.{Normalizer, Tokenizer}
* import org.apache.spark.ml.Pipeline
* val documentAssembler = new DocumentAssembler()
* .setInputCol("text")
* .setOutputCol("document")
*
* val tokenizer = new Tokenizer()
* .setInputCols("document")
* .setOutputCol("token")
*
* val normalizer = new Normalizer()
* .setInputCols("token")
* .setOutputCol("normalized")
* .setLowercase(true)
* .setCleanupPatterns(Array("""[^\w\d\s]""")) // remove punctuation (keep alphanumeric chars)
* // if cleanupPatterns is not set, the default pattern [^\pL+] keeps only letters
*
* val pipeline = new Pipeline().setStages(Array(
* documentAssembler,
* tokenizer,
* normalizer
* ))
*
* val data = Seq("John and Peter are brothers. However they don't support each other that much.")
* .toDF("text")
* val result = pipeline.fit(data).transform(data)
*
* result.selectExpr("normalized.result").show(truncate = false)
* +----------------------------------------------------------------------------------------+
* |result |
* +----------------------------------------------------------------------------------------+
* |[john, and, peter, are, brothers, however, they, dont, support, each, other, that, much]|
* +----------------------------------------------------------------------------------------+
* }}}
* @param uid
* required internal uid for saving annotator
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio param 1
* @groupprio anno 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class Normalizer(override val uid: String) extends AnnotatorApproach[NormalizerModel] {
/** Cleans out tokens */
override val description: String = "Cleans out tokens"
/** Output Annotator Type : TOKEN
*
* @group anno
*/
override val outputAnnotatorType: AnnotatorType = TOKEN
/** Input Annotator Type : TOKEN
*
* @group anno
*/
override val inputAnnotatorTypes: Array[String] = Array(TOKEN)
/** Normalization regex patterns; matching substrings will be removed from the token (Default:
* `Array("[^\\pL+]")`)
*
* @group param
*/
val cleanupPatterns = new StringArrayParam(
this,
"cleanupPatterns",
"Normalization regex patterns; matching substrings will be removed from the token")
/** Normalization regex patterns; matching substrings will be removed from the token (Default:
* `Array("[^\\pL+]")`)
* @group getParam
*/
def getCleanupPatterns: Array[String] = $(cleanupPatterns)
/** Normalization regex patterns; matching substrings will be removed from the token (Default:
* `Array("[^\\pL+]")`)
* @group setParam
*/
def setCleanupPatterns(value: Array[String]): this.type = set(cleanupPatterns, value)
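// A minimal sketch of a custom cleanup pattern (the values below are illustrative, not
// defaults): keep digits and whitespace in addition to letters, instead of the letters-only
// default pattern.
//
//   val digitFriendlyNormalizer = new Normalizer()
//     .setInputCols("token")
//     .setOutputCol("normalized")
//     .setCleanupPatterns(Array("[^\\w\\d\\s]")) // strip all but word chars, digits, whitespace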
/** Whether to convert strings to lowercase (Default: `false`)
*
* @group param
*/
val lowercase = new BooleanParam(
this,
"lowercase",
"Whether to convert strings to lowercase (Default: `false`)")
/** Whether to convert strings to lowercase (Default: `false`)
* @group getParam
*/
def getLowercase: Boolean = $(lowercase)
/** Whether to convert strings to lowercase (Default: `false`)
* @group setParam
*/
def setLowercase(value: Boolean): this.type = set(lowercase, value)
/** Delimited text file with a list of custom words to be manually corrected
*
* @group param
*/
val slangDictionary = new ExternalResourceParam(
this,
"slangDictionary",
"Delimited text file with a list of custom words to be manually corrected")
/** Delimited text file with a list of custom words to be manually corrected
* @group setParam
*/
def setSlangDictionary(value: ExternalResource): this.type = {
require(
value.options.contains("delimiter"),
"slang dictionary is a delimited text file; 'delimiter' must be set in options")
set(slangDictionary, value)
}
/** Delimited text file with a list of custom words to be manually corrected
* @group setParam
*/
def setSlangDictionary(
path: String,
delimiter: String,
readAs: ReadAs.Format = ReadAs.TEXT,
options: Map[String, String] = Map("format" -> "text")): this.type =
set(slangDictionary, ExternalResource(path, readAs, options ++ Map("delimiter" -> delimiter)))
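// A minimal usage sketch for the slang dictionary (file path and contents are assumptions):
// given a plain-text file "slangs.txt" where each line is "slang,replacement", e.g.
//   gr8,great
//   u,you
// the annotator can be configured to replace matching tokens:
//
//   val slangNormalizer = new Normalizer()
//     .setInputCols("token")
//     .setOutputCol("normalized")
//     .setSlangDictionary("slangs.txt", ",")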
/** Whether to be case sensitive when matching slang entries (Default: `false`)
*
* @group param
*/
val slangMatchCase = new BooleanParam(
this,
"slangMatchCase",
"Whether to be case sensitive when matching slang entries. Defaults to false.")
/** Whether to be case sensitive when matching slang entries (Default: `false`)
* @group setParam
*/
def setSlangMatchCase(value: Boolean): this.type = set(slangMatchCase, value)
/** Whether to be case sensitive when matching slang entries (Default: `false`)
* @group getParam
*/
def getSlangMatchCase: Boolean = $(slangMatchCase)
/** The minimum allowed length for each token (Default: `0`)
*
* @group param
*/
val minLength = new IntParam(this, "minLength", "The minimum allowed length for each token")
/** Set the minimum allowed length for each token (Default: `0`)
* @group setParam
*/
def setMinLength(value: Int): this.type = {
require(value >= 0, "minLength must be greater than or equal to 0")
set(minLength, value)
}
/** Get the minimum allowed length for each token (Default: `0`)
* @group getParam
*/
def getMinLength: Int = $(minLength)
/** The maximum allowed length for each token
*
* @group param
*/
val maxLength = new IntParam(this, "maxLength", "The maximum allowed length for each token")
/** Set the maximum allowed length for each token
* @group setParam
*/
def setMaxLength(value: Int): this.type = {
require(
value >= $(minLength),
"maxLength must be greater than or equal to minLength")
set(maxLength, value)
}
/** Get the maximum allowed length for each token
* @group getParam
*/
def getMaxLength: Int = $(maxLength)
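// A minimal sketch of token length filtering (the bounds below are illustrative): tokens whose
// normalized form is shorter than minLength or longer than maxLength are dropped.
//
//   val lengthFilteredNormalizer = new Normalizer()
//     .setInputCols("token")
//     .setOutputCol("normalized")
//     .setMinLength(3)  // drops short tokens such as "a" or "is"
//     .setMaxLength(15) // drops overly long tokens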
setDefault(
lowercase -> false,
cleanupPatterns -> Array("[^\\pL+]"),
slangMatchCase -> false,
minLength -> 0)
def this() = this(Identifiable.randomUID("NORMALIZER"))
override def train(
dataset: Dataset[_],
recursivePipeline: Option[PipelineModel]): NormalizerModel = {
val loadSlangs = if (get(slangDictionary).isDefined) {
val parsed = ResourceHelper.parseKeyValueText($(slangDictionary))
if ($(slangMatchCase))
parsed.mapValues(_.trim)
else
parsed.map { case (k, v) => (k.toLowerCase, v.trim) }
} else
Map.empty[String, String]
val raw = new NormalizerModel()
.setCleanupPatterns($(cleanupPatterns))
.setLowercase($(lowercase))
.setSlangDict(loadSlangs)
.setSlangMatchCase($(slangMatchCase))
.setMinLength($(minLength))
if (isDefined(maxLength))
raw.setMaxLength($(maxLength))
raw
}
}
/** This is the companion object of [[Normalizer]]. Please refer to that class for the
* documentation.
*/
object Normalizer extends DefaultParamsReadable[Normalizer]
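// Because the companion object extends DefaultParamsReadable, a previously saved Normalizer can
// be restored through standard Spark ML persistence (the path below is an assumption):
//
//   val restoredNormalizer = Normalizer.load("/tmp/normalizer_stage")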