// com.johnsnowlabs.nlp.annotators.DocumentNormalizer.scala Maven / Gradle / Ivy
/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasSimpleAnnotate}
import org.apache.spark.ml.param.{BooleanParam, Param, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import java.nio.charset.{Charset, StandardCharsets}
import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex
import scala.util.matching.Regex.Match
import scala.util.{Failure, Success, Try}
import scala.xml.XML
/** Annotator which normalizes raw text from tagged text, e.g. scraped web pages or XML documents,
* from document type columns into Sentence. Removes all dirty characters from the text following
* one or more input regex patterns. Can remove unwanted characters according to a specific
* policy. Can also apply lowercase normalization.
*
* For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb Spark NLP Workshop]].
*
* ==Example==
* {{{
* import spark.implicits._
* import com.johnsnowlabs.nlp.DocumentAssembler
* import com.johnsnowlabs.nlp.annotator.DocumentNormalizer
* import org.apache.spark.ml.Pipeline
*
* val documentAssembler = new DocumentAssembler()
* .setInputCol("text")
* .setOutputCol("document")
*
* val cleanUpPatterns = Array("<[^>]*>")
*
* val documentNormalizer = new DocumentNormalizer()
* .setInputCols("document")
* .setOutputCol("normalizedDocument")
* .setAction("clean")
* .setPatterns(cleanUpPatterns)
* .setReplacement(" ")
* .setPolicy("pretty_all")
* .setLowercase(true)
*
* val pipeline = new Pipeline().setStages(Array(
* documentAssembler,
* documentNormalizer
* ))
*
* val text =
* """
*
*
*