/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.annotators.common.{
  InfixToken,
  PrefixedToken,
  PreprocessingParser,
  SuffixedToken
}
import com.johnsnowlabs.nlp.serialization.{ArrayFeature, SetFeature}
import com.johnsnowlabs.nlp._
import org.apache.spark.ml.util.Identifiable

/** Instantiated model of the [[RecursiveTokenizer]]. For usage and examples see the documentation
  * of the main class.
  *
  * @param uid
  *   required internal uid for saving annotator
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param 1
  * @groupprio anno 2
  * @groupprio Ungrouped 3
  * @groupprio setParam 4
  * @groupprio getParam 5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
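  *
  * ==Example==
  * A minimal usage sketch (illustrative; the input data and column names below are assumptions,
  * not part of this file). The model is obtained by fitting a [[RecursiveTokenizer]]:
  * {{{
  * import com.johnsnowlabs.nlp.DocumentAssembler
  * import com.johnsnowlabs.nlp.annotators.RecursiveTokenizer
  * import org.apache.spark.ml.Pipeline
  * import spark.implicits._
  *
  * val data = Seq("One document. Two sentences.").toDF("text")
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("text")
  *   .setOutputCol("document")
  *
  * val tokenizer = new RecursiveTokenizer()
  *   .setInputCols("document")
  *   .setOutputCol("token")
  *
  * val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer))
  *
  * // Fitting produces a RecursiveTokenizerModel stage inside the PipelineModel
  * val result = pipeline.fit(data).transform(data)
  * }}}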
  */
class RecursiveTokenizerModel(override val uid: String)
    extends AnnotatorModel[RecursiveTokenizerModel]
    with HasSimpleAnnotate[RecursiveTokenizerModel]
    with ParamsAndFeaturesWritable {

  /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
    * type.
    */
  def this() = this(Identifiable.randomUID("RecursiveTokenizerModel"))

  /** Output Annotator types: TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.TOKEN

  /** Input Annotator types: DOCUMENT
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[String] = Array(AnnotatorType.DOCUMENT)

  /** Strings that will be split off from the beginning of tokens as separate tokens (e.g.
    * opening quotes or brackets).
    *
    * @group param
    */
  val prefixes: ArrayFeature[String] = new ArrayFeature[String](this, "prefixes")

  /** Sets the prefixes. They are stored sorted longest-first so that longer prefixes take
    * precedence during matching.
    *
    * @group setParam
    */
  def setPrefixes(p: Array[String]): this.type = set(prefixes, p.sortBy(_.length).reverse)

  /** Strings that will be split off from the end of tokens as separate tokens (e.g. punctuation
    * or contraction endings).
    *
    * @group param
    */
  val suffixes: ArrayFeature[String] = new ArrayFeature[String](this, "suffixes")

  /** Sets the suffixes. They are stored sorted longest-first so that longer suffixes take
    * precedence during matching.
    *
    * @group setParam
    */
  def setSuffixes(s: Array[String]): this.type = set(suffixes, s.sortBy(_.length).reverse)

  /** Strings on which tokens will be split internally (e.g. line breaks or parentheses).
    *
    * @group param
    */
  val infixes: ArrayFeature[String] = new ArrayFeature[String](this, "infixes")

  /** Sets the infixes. They are stored sorted longest-first so that longer infixes take
    * precedence during matching.
    *
    * @group setParam
    */
  def setInfixes(s: Array[String]): this.type = set(infixes, s.sortBy(_.length).reverse)

  /** Tokens that are passed through unchanged, skipping all prefix, suffix and infix splitting.
    *
    * @group param
    */
  val whitelist: SetFeature[String] = new SetFeature[String](this, "whitelist")

  /** Sets the whitelist of tokens that must never be split.
    *
    * @group setParam
    */
  def setWhitelist(wlist: Set[String]): this.type = set(whitelist, wlist)
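
  // Illustrative sketch (not from the original source): setWhitelist(Set("don't")) keeps
  // "don't" as a single token even when "'" or "'t" is configured as a prefix or suffix,
  // because whitelisted tokens bypass the parser passes in tokenize below.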

  /** Takes a document and annotations and produces new annotations of this annotator's
    * annotation type.
    *
    * @param annotations
    *   Annotations that correspond to inputAnnotationCols generated by previous annotators if any
    * @return
    *   any number of annotations processed for every input annotation. Not necessarily a
    *   one-to-one relationship
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    annotations.flatMap { annotation =>
      tokenize(annotation.result, annotation.begin).map(token =>
        annotation.copy(
          annotatorType = AnnotatorType.TOKEN,
          result = token._1,
          begin = token._2,
          end = token._3,
          metadata = annotation.metadata
            .updated("sentence", annotation.metadata.getOrElse("sentence", "0"))))
    }
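
  // Illustrative sketch (not from the original source): given a DOCUMENT annotation covering
  // "Tell me, did you?" and assuming "," and "?" are among the configured suffixes, annotate
  // would emit TOKEN annotations such as ("Tell", 0, 3), ("me", 5, 6), (",", 7, 7),
  // ("did", 9, 11), ("you", 13, 15) and ("?", 16, 16).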

  // The parser passes are hardcoded at this time: infixes are handled in a first pass,
  // then suffixes and prefixes are peeled off in a second pass.
  @transient
  private lazy val firstPass = Seq(InfixToken($$(infixes)))

  @transient
  private lazy val secondPass = Seq(SuffixedToken($$(suffixes)), PrefixedToken($$(prefixes)))

  private def tokenize(text: String, beginTextIndex: Int): Seq[(String, Int, Int)] = {
    // Split on single spaces. Empty strings produced by consecutive spaces are kept here so
    // that the begin/end offset arithmetic below stays aligned with the original text; they
    // are dropped from the final result instead.
    val splitText = text.split(" ")
    var previousBegin = beginTextIndex
    splitText.zipWithIndex
      .flatMap { case (token, tokenIndex) =>
        if (tokenIndex > 0) {
          // Advance past the previous token and the single space that followed it.
          previousBegin = previousBegin + splitText(tokenIndex - 1).length + 1
        }
        var tmpToken: Seq[(String, Int, Int)] =
          Seq((token, previousBegin, previousBegin + token.length - 1))
        firstPass.foreach { parser =>
          tmpToken = tmpToken.flatMap { token =>
            if (whitelist.getOrDefault.contains(token._1))
              Seq(token)
            else
              parseSeparator(parser, token._1, token._2)
          }
        }
        secondPass.foreach { parser =>
          tmpToken = tmpToken.flatMap { token =>
            if (whitelist.getOrDefault.contains(token._1))
              Seq(token)
            else
              parseSeparator(parser, token._1, token._2)
          }
        }
        tmpToken
      }
      .filter(!_._1.equals(""))
  }
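
  // Worked sketch (not from the original source): tokenize("a  b", 0) splits into "a", "" and
  // "b"; the empty token keeps the running offset correct across the double space, so "b" is
  // emitted at (3, 3), and the trailing filter then drops the empty token.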

  private def parseSeparator(
      parser: PreprocessingParser,
      token: String,
      begin: Int): Array[(String, Int, Int)] = {
    // The parser marks affix boundaries by inserting spaces, e.g. "said:" becomes "said :".
    val parserSeparator = parser.separate(token).split(" ")
    var currentParsedTokenBegin = begin
    val tokensResult = parserSeparator.zipWithIndex.map { case (parsedToken, index) =>
      // Unlike tokenize above, there is no +1 here: the separating space was inserted by the
      // parser and does not exist in the original text.
      if (index > 0)
        currentParsedTokenBegin = currentParsedTokenBegin + parserSeparator(index - 1).length
      (parsedToken, currentParsedTokenBegin, currentParsedTokenBegin + parsedToken.length - 1)
    }
    tokensResult
  }
}
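
/** Companion object for reading saved instances of [[RecursiveTokenizerModel]]. A minimal
  * loading sketch (illustrative; the path is an assumption):
  * {{{
  * val restored = RecursiveTokenizerModel.load("/path/to/saved/model")
  * }}}
  */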
object RecursiveTokenizerModel extends ParamsAndFeaturesReadable[RecursiveTokenizerModel]