// com.johnsnowlabs.nlp.annotators.SimpleTokenizer.scala Maven / Gradle / Ivy
//
// Go to download
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.annotators.common.{InfixToken, PrefixedToken, SuffixedToken}
import com.johnsnowlabs.nlp.serialization.ArrayFeature
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType}
import org.apache.spark.ml.util.Identifiable


/**
  * Whitespace-based tokenizer that additionally detaches configurable prefixes and suffixes
  * from each space-separated chunk of the input text.
  *
  * Tokenization of one text runs in three steps:
  *  1. the text is split on single space characters;
  *  2. every chunk goes through the hard-coded infix pass (newline);
  *  3. every resulting piece goes through the suffix parser and then the prefix parser.
  *
  * Each parser's `separate` result is re-split on spaces, and empty strings are dropped at the end.
  *
  * @param uid unique identifier of this annotator instance
  */
class SimpleTokenizer(override val uid: String) extends AnnotatorModel[SimpleTokenizer] {

  /** Creates an instance with a randomly generated uid. */
  def this() = this(Identifiable.randomUID("SILLY_TOKENIZER"))

  /** Strings to detach from the beginning of a token. */
  val prefixes: ArrayFeature[String] = new ArrayFeature[String](this, "prefixes")

  /**
    * Sets the prefixes. They are stored ordered from longest to shortest
    * (presumably so that longer prefixes are tried first -- depends on PrefixedToken; confirm).
    */
  def setPrefixes(p: Array[String]): this.type = set(prefixes, p.sortBy(_.length).reverse)

  /** Strings to detach from the end of a token. */
  val suffixes: ArrayFeature[String] = new ArrayFeature[String](this, "suffixes")

  /**
    * Sets the suffixes. They are stored ordered from longest to shortest
    * (presumably so that longer suffixes are tried first -- depends on SuffixedToken; confirm).
    */
  def setSuffixes(s: Array[String]): this.type = set(suffixes, s.sortBy(_.length).reverse)

  setDefault(prefixes, () => Array("'", "\"", "(", "[", "\n"))
  setDefault(suffixes, () => Array(".", ":", "%", ",", ";", "?", "'", "\"", ")", "]", "\n"))

  /**
    * Tokenizes the `result` text of every input annotation and emits one annotation per token.
    *
    * Each emitted annotation is a copy of its source annotation with `result` replaced by the token
    * and the "sentence" metadata key guaranteed to be present (defaulting to "0" when absent).
    *
    * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any
    * @return any number of annotations for every input annotation; not necessarily a one-to-one relationship
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    annotations.flatMap { annotation =>
      // The metadata is identical for every token of this annotation, so build it once.
      val sentenceId = annotation.metadata.getOrElse("sentence", "0")
      val tokenMetadata = annotation.metadata.updated("sentence", sentenceId)

      // NOTE(review): copy only overrides result and metadata, so every other field of the source
      // annotation is carried over unchanged to each token -- confirm this is what consumers expect.
      tokenize(annotation.result).map { token =>
        annotation.copy(result = token, metadata = tokenMetadata)
      }
    }

  // hardcoded at this time
  private lazy val firstPass = Seq(InfixToken(Array("\n")))

  // Built on demand rather than cached in a lazy val: a cached value would freeze the prefixes and
  // suffixes seen on first use and silently ignore later setPrefixes / setSuffixes calls.
  private def affixPass = Seq(SuffixedToken($$(suffixes)),
    PrefixedToken($$(prefixes)))

  /**
    * Splits `text` into tokens.
    *
    * @param text raw text to tokenize
    * @return the non-empty tokens, in order of appearance
    */
  private def tokenize(text: String): Seq[String] = {
    // Resolve the configured parsers once per call so all chunks of this text share one configuration.
    val affixParsers = affixPass

    text.split(" ").flatMap { chunk =>
      val afterInfix = firstPass.foldLeft(Seq(chunk)) { (pieces, parser) =>
        pieces.flatMap(piece => parser.separate(piece).split(" "))
      }

      affixParsers.foldLeft(afterInfix) { (pieces, parser) =>
        pieces.flatMap(piece => parser.separate(piece).split(" "))
      }
    }.filter(_.nonEmpty)
  }

  /** Type of the annotations produced by this annotator: TOKEN. */
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.TOKEN

  /** Types of the annotations this annotator consumes: DOCUMENT. */
  override val inputAnnotatorTypes: Array[String] = Array(AnnotatorType.DOCUMENT)
}