
// com.johnsnowlabs.nlp.annotators.SimpleTokenizer.scala (Maven / Gradle / Ivy)
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.annotators.common.{InfixToken, PrefixedToken, SuffixedToken}
import com.johnsnowlabs.nlp.serialization.ArrayFeature
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType}
import org.apache.spark.ml.util.Identifiable
/**
  * Whitespace-based tokenizer: splits each input text on single spaces and then lets a
  * sequence of parsers separate embedded newlines, configured suffixes and configured
  * prefixes from every resulting chunk.
  *
  * Requires DOCUMENT annotations as input and declares TOKEN as its output annotator type.
  *
  * @param uid unique identifier of this annotator instance
  */
class SimpleTokenizer(override val uid: String) extends AnnotatorModel[SimpleTokenizer] {

  /** Creates an instance with a randomly generated uid. */
  def this() = this(Identifiable.randomUID("SILLY_TOKENIZER"))

  /** Strings handed to the prefix parser. */
  val prefixes = new ArrayFeature[String](this, "prefixes")

  /** Stores the given prefixes ordered from longest to shortest. */
  def setPrefixes(p: Array[String]): this.type = set(prefixes, p.sortBy(_.size).reverse)

  /** Strings handed to the suffix parser. */
  val suffixes = new ArrayFeature[String](this, "suffixes")

  /** Stores the given suffixes ordered from longest to shortest. */
  def setSuffixes(s: Array[String]): this.type = set(suffixes, s.sortBy(_.size).reverse)

  setDefault(prefixes, () => Array("'", "\"", "(", "[", "\n"))
  setDefault(suffixes, () => Array(".", ":", "%", ",", ";", "?", "'", "\"", ")", "]", "\n"))

  /**
    * Takes the incoming annotations and produces one new annotation per token found in
    * each annotation's `result`.
    *
    * Every produced annotation is a copy of its source annotation with `result` replaced by
    * the token and with a "sentence" metadata entry guaranteed to exist (defaulting to "0").
    * NOTE(review): all other fields of the source annotation are carried over unchanged by
    * `copy` -- presumably including its offsets and annotator type; confirm this is intended.
    *
    * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any
    * @return any number of annotations processed for every input annotation. Not necessarily a one to one relationship
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    annotations.flatMap { annotation =>
      // The metadata is identical for every token of the same annotation, so build it once.
      val sentenceId = annotation.metadata.getOrElse("sentence", "0")
      val tokenMetadata = annotation.metadata.updated("sentence", sentenceId)
      tokenize(annotation.result).map { token =>
        annotation.copy(result = token, metadata = tokenMetadata)
      }
    }

  // hardcoded at this time
  private lazy val firstPass = Seq(InfixToken(Array("\n")))

  // Built on demand rather than cached: a cached value would keep using the prefixes and
  // suffixes that were current on first use and ignore later setPrefixes/setSuffixes calls.
  private def secondPass = Seq(SuffixedToken($$(suffixes)), PrefixedToken($$(prefixes)))

  /**
    * Splits `text` on single spaces, runs every chunk through the first-pass and then the
    * second-pass parsers (re-splitting on spaces after each parser), and drops empty pieces.
    *
    * @param text raw text to tokenize
    * @return the non-empty tokens in their original order
    */
  private def tokenize(text: String): Seq[String] = {
    // Read the configured affixes once per call instead of once per chunk.
    val affixParsers = secondPass

    text.split(" ").flatMap { chunk =>
      val afterFirstPass = firstPass.foldLeft(Seq(chunk)) { (pieces, parser) =>
        pieces.flatMap(piece => parser.separate(piece).split(" "))
      }
      affixParsers.foldLeft(afterFirstPass) { (pieces, parser) =>
        pieces.flatMap(piece => parser.separate(piece).split(" "))
      }
    }.filter(_.nonEmpty)
  }

  /** Annotator type declared for the annotations produced by this annotator. */
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.TOKEN

  /** Annotator types this annotator requires as input. */
  override val inputAnnotatorTypes: Array[String] = Array(AnnotatorType.DOCUMENT)
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy