/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile
import scala.language.implicitConversions
import scala.jdk.CollectionConverters._
import smile.math.MathEx
import smile.nlp.dictionary.StopWords
import smile.nlp.pos.{HMMPOSTagger, PennTreebankPOS}
import smile.nlp.stemmer.{LancasterStemmer, PorterStemmer}
import smile.util.time
/** Natural language processing.
*
* @author Haifeng Li
*/
package object nlp {
implicit def pimpString(string: String): PimpedString = new PimpedString(string)
/** Porter's stemming algorithm. The stemmer is based on the idea that the
* suffixes in the English language are mostly made up of a combination of
* smaller and simpler suffixes. This is a linear step stemmer.
* Specifically, it has five steps, applying rules within each step. Within
* each step, if a suffix rule matches a word, the conditions attached to
* that rule are tested on what would be the resulting stem if that suffix
* were removed, in the way defined by the rule. Once a rule passes its
* conditions and is accepted, the rule fires, the suffix is removed, and
* control moves to the next step. If the rule is not accepted, the next
* rule in the step is tested, until either a rule from that step fires and
* control passes to the next step, or there are no more rules in that
* step, in which case control also moves to the next step.
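*
* A usage sketch (the stemmed form shown is illustrative):
* {{{
*   porter("democratization") // e.g. "democrat"
* }}}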
*/
val porter = new stemmer.PorterStemmer {
def apply(word: String): String = stem(word)
}
/** The Paice/Husk Lancaster stemming algorithm. The stemmer is a
* conflation-based iterative stemmer. Although efficient and easily
* implemented, it is known to be very strong and aggressive. The stemmer
* utilizes a single table of rules, each of which may specify
* the removal or replacement of an ending.
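*
* A usage sketch (the stemmed form shown is illustrative):
* {{{
*   lancaster("running") // e.g. "run"
* }}}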
*/
val lancaster = new stemmer.LancasterStemmer {
def apply(word: String): String = stem(word)
}
/** Creates an in-memory text corpus.
*
* @param text a collection of text documents.
*/
def corpus(text: scala.collection.Seq[String]): SimpleCorpus = {
val corpus = new SimpleCorpus
text.foreach(doc => corpus.add(new Text(doc)))
corpus
}
/** Identify bigram collocations (words that often appear consecutively) within
* corpora. They may also be used to find other associations between word
* occurrences.
*
* Finding collocations requires first calculating the frequencies of words
* and their appearance in the context of other words. Often the collection
* of words will then require filtering to retain only useful content terms.
* Each n-gram of words may then be scored according to some association measure,
* in order to determine the relative likelihood of each n-gram being a
* collocation.
*
* @param k the number of top bigrams to find.
* @param minFreq the minimum frequency of collocation.
* @param text input text.
* @return significant bigram collocations in descending order
* of likelihood ratio.
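*
* A usage sketch (the input document is hypothetical):
* {{{
*   val top = bigram(10, 5, "a long document ...")
*   top.foreach(println)
* }}}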
*/
def bigram(k: Int, minFreq: Int, text: String*): Array[smile.nlp.collocation.Bigram] = time("Bi-gram collocation") {
smile.nlp.collocation.Bigram.of(corpus(text), k, minFreq)
}
/** Identify bigram collocations whose p-value is less than
* the given threshold.
*
* @param p the p-value threshold
* @param minFreq the minimum frequency of collocation.
* @param text input text.
* @return significant bigram collocations in descending order
* of likelihood ratio.
*/
def bigram(p: Double, minFreq: Int, text: String*): Array[smile.nlp.collocation.Bigram] = time("Bi-gram collocation") {
smile.nlp.collocation.Bigram.of(corpus(text), p, minFreq)
}
/** An Apriori-like algorithm to extract n-gram phrases.
*
* @param maxNGramSize the maximum length of n-grams.
* @param minFreq the minimum frequency of n-grams in the sentences.
* @param text input text.
* @return an array of sets of n-grams. The i-th entry is the set of i-grams.
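*
* A usage sketch (the input document is hypothetical):
* {{{
*   val grams = ngram(4, 2, "a long document ...")
*   grams(2).foreach(println) // the extracted 2-grams
* }}}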
*/
def ngram(maxNGramSize: Int, minFreq: Int, text: String*): Array[Array[smile.nlp.collocation.NGram]] = time("N-gram collocation") {
val sentences = text.flatMap { doc =>
doc.sentences.map { sentence =>
sentence.words("none").map { word =>
porter.stripPluralParticiple(word).toLowerCase
}
}
}
smile.nlp.collocation.NGram.of(sentences.asJava, maxNGramSize, minFreq)
}
/** Part-of-speech tagging with a hidden Markov model.
*
* @param sentence a sentence that is already segmented to words.
* @return the pos tags.
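*
* For example (the tags shown are what the default model typically produces):
* {{{
*   postag(Array("I", "love", "Scala"))
*   // e.g. Array(PRP, VBP, NNP)
* }}}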
*/
def postag(sentence: Array[String]): Array[PennTreebankPOS] = time("PoS tagging with Hidden Markov Model") {
HMMPOSTagger.getDefault.tag(sentence)
}
/** Converts a bag of words to a feature vector.
*
* @param terms the token list used as features.
* @param bag the bag of words.
* @return a vector of frequency of feature tokens in the bag.
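*
* For example:
* {{{
*   vectorize(Array("apple", "banana"), Map("apple" -> 2))
*   // Array(2.0, 0.0)
* }}}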
*/
def vectorize(terms: Array[String], bag: Map[String, Int]): Array[Double] = {
terms.map(bag.getOrElse(_, 0).toDouble)
}
/** Converts a binary bag of words to a sparse feature vector.
*
* @param terms the token list used as features.
* @param bag the bag of words.
* @return an integer vector whose elements are the indices of the
* feature tokens present in the bag, in ascending order.
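*
* For example:
* {{{
*   vectorize(Array("apple", "banana", "cherry"), Set("cherry", "apple"))
*   // Array(0, 2)
* }}}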
*/
def vectorize(terms: Array[String], bag: Set[String]): Array[Int] = {
terms.zipWithIndex.filter { case (w, _) => bag.contains(w)}.map(_._2)
}
/** Returns the document frequencies, i.e. the number of documents that contain each term.
*
* @param terms the token list used as features.
* @param corpus the training corpus.
* @return the array of document frequencies.
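*
* For example:
* {{{
*   df(Array("apple", "kiwi"), Array(Map("apple" -> 3), Map("apple" -> 1)))
*   // Array(2, 0)
* }}}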
*/
def df(terms: Array[String], corpus: Array[Map[String, Int]]): Array[Int] = {
terms.map { term =>
corpus.count(_.contains(term))
}
}
/** TF-IDF relevance score between a term and a document based on a corpus.
*
* @param tf the frequency of the search term in the document to rank.
* @param maxtf the maximum frequency over all terms in the document.
* @param n the number of documents in the corpus.
* @param df the number of documents containing the given term in the corpus.
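*
* The score is computed as
* {{{
*   (tf / maxtf) * log((1 + n) / (1 + df))
* }}}
* i.e. the maximum-frequency-normalized term frequency weighted by a
* smoothed inverse document frequency.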
*/
private def tfidf(tf: Double, maxtf: Double, n: Int, df: Int): Double = {
(tf / maxtf) * Math.log((1.0 + n) / (1.0 + df))
}
/** Converts a corpus to TF-IDF feature vectors, which
* are normalized to L2 norm 1.
*
* @param corpus the corpus of documents in bag-of-words representation.
* @return a matrix whose rows are the TF-IDF feature vectors.
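*
* A usage sketch chaining the helpers in this package (the documents
* and terms are hypothetical):
* {{{
*   val terms = Array("apple", "banana")
*   val bags = Array("apple apple banana", "banana banana").map(_.bag(stemmer = None))
*   val counts = bags.map(vectorize(terms, _))
*   val features = tfidf(counts)
* }}}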
*/
def tfidf(corpus: Array[Array[Double]]): Array[Array[Double]] = {
val n = corpus.length
val df = new Array[Int](corpus(0).length)
corpus.foreach { bag =>
for (i <- df.indices) {
if (bag(i) > 0) df(i) += 1
}
}
corpus.map { bag =>
tfidf(bag, n, df)
}
}
/** Converts a bag of words to a feature vector by TF-IDF, which
* is normalized to L2 norm 1.
*
* @param bag the bag-of-words feature vector of a document.
* @param n the number of documents in training corpus.
* @param df the number of documents containing the given term in the corpus.
* @return TF-IDF feature vector
*/
def tfidf(bag: Array[Double], n: Int, df: Array[Int]): Array[Double] = {
import Ordering.Double.TotalOrdering
val maxtf = bag.max
val features = new Array[Double](bag.length)
for (i <- features.indices) {
features(i) = tfidf(bag(i), maxtf, n, df(i))
}
MathEx.unitize(features) // normalizes the vector in place to unit L2 norm
features
}
}
package nlp {
import tokenizer.{SimpleSentenceSplitter, SimpleTokenizer}
import smile.nlp.dictionary.{EnglishPunctuations, EnglishStopWords}
import smile.nlp.normalizer.SimpleNormalizer
import smile.nlp.pos.{HMMPOSTagger, PennTreebankPOS}
import smile.nlp.stemmer.{PorterStemmer, Stemmer}
private[nlp] class PimpedString(text: String) {
val tokenizer = new SimpleTokenizer(true)
/**
* Normalizes Unicode text.
*
* - Apply Unicode normalization form NFKC.
* - Strip, trim, normalize, and compress whitespace.
* - Remove control and formatting characters.
* - Normalize double and single quotes.
*
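* For example (the output shown is illustrative):
* {{{
*   "The  “quick”  brown\r\nfox".normalize
*   // e.g. "The \"quick\" brown fox"
* }}}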
*/
def normalize: String = {
SimpleNormalizer.getInstance().normalize(text)
}
/** Splits English text into sentences. Given an English text,
* it returns a list of strings, where each element is an
* English sentence. By default, it treats occurrences of '.', '?' and '!' as
* sentence delimiters, but does its best to determine when an occurrence of '.'
* does not have this role (e.g. in abbreviations, URLs, numbers, etc.).
*
* Recognizing the end of a sentence is not an easy task for a computer.
* In English, punctuation marks that usually appear at the end of a sentence
* may not indicate the end of a sentence. The period is the worst offender.
* A period can end a sentence but it can also be part of an abbreviation
* or acronym, an ellipsis, a decimal number, or part of a bracket of periods
* surrounding a Roman numeral. A period can even act both as the end of an
* abbreviation and the end of a sentence at the same time. On the other
* hand, some poems may not contain any sentence punctuation at all.
*
* Another problematic punctuation mark is the single quote, which can introduce
* a quote or start a contraction such as 'tis. Leading-quote contractions
* are uncommon in contemporary English texts, but appear frequently in Early
* Modern English texts.
*
* This tokenizer assumes that the text has already been segmented into
* paragraphs. Any carriage returns will be replaced by whitespace.
*
* ====References:====
* - Paul Clough. A Perl program for sentence splitting using rules.
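*
* For example (the exact split is illustrative):
* {{{
*   "The U.S. is big. It has 50 states.".sentences
*   // e.g. Array("The U.S. is big.", "It has 50 states.")
* }}}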
*/
def sentences: Array[String] = {
SimpleSentenceSplitter.getInstance.split(text)
}
/** Tokenizes English sentences with some differences from
* TreebankWordTokenizer, notably on handling not-contractions. If a period
* serves as both the end of sentence and a part of abbreviation, e.g. etc. at
* the end of sentence, it will generate tokens of "etc." and "." while
* TreebankWordTokenizer will generate "etc" and ".".
*
* Most punctuation is split from adjoining words. Verb contractions and the
* Anglo-Saxon genitive of nouns are split into their component morphemes,
* and each morpheme is tagged separately.
*
* This tokenizer assumes that the text has already been segmented into
* sentences. Any periods -- apart from those at the end of a string or before
* newline -- are assumed to be part of the word they are attached to (e.g. for
* abbreviations, etc), and are not separately tokenized.
*
* If the parameter filter is not "none", the method will also filter
* out stop words and punctuation marks. There is no definitive list of
* stop words that all tools incorporate. The valid values of the
* parameter filter include
* - "none": no filtering
* - "default": the default English stop word list
* - "comprehensive": a more comprehensive English stop word list
* - "google": the stop words list used by Google search engine
* - "mysql": the stop words list used by MySQL FullText feature
* - custom stop word list: a comma-separated list of stop words
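*
* For example (the surviving tokens depend on the chosen stop word list):
* {{{
*   "The quick brown fox jumps over the lazy dog".words()
*   // e.g. Array("quick", "brown", "fox", "jumps", "lazy", "dog")
* }}}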
*/
def words(filter: String = "default"): Array[String] = {
val tokens = tokenizer.split(text)
if (filter == "none") return tokens
val dict = filter.toLowerCase match {
case "default" => EnglishStopWords.DEFAULT
case "comprehensive" => EnglishStopWords.COMPREHENSIVE
case "google" => EnglishStopWords.GOOGLE
case "mysql" => EnglishStopWords.MYSQL
case _ => new StopWords {
val dict: Set[String] = filter.split(",").toSet
override def contains(word: String): Boolean = dict.contains(word)
override def size: Int = dict.size
override def iterator: java.util.Iterator[String] = dict.iterator.asJava
}
}
val punctuations = EnglishPunctuations.getInstance()
tokens.filter { word =>
!(dict.contains(word.toLowerCase) || punctuations.contains(word))
}
}
/** Returns the bag of words. The bag-of-words model is a simple
* representation of text as the bag of its words, disregarding
* grammar and word order but keeping multiplicity.
*
* @param filter stop list for filtering.
* @param stemmer stemmer to transform a word into its root form.
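*
* For example (the stemmed keys shown are illustrative):
* {{{
*   "The cats sat on the mat. The dog sat too.".bag()
*   // e.g. Map("cat" -> 1, "sat" -> 2, "mat" -> 1, "dog" -> 1)
* }}}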
*/
def bag(filter: String = "default", stemmer: Option[Stemmer] = Some(new PorterStemmer())): Map[String, Int] = {
val words = text.normalize.sentences.flatMap(_.words(filter))
val tokens = stemmer.map { stemmer =>
words.map(stemmer.stem)
}.getOrElse(words)
val map = tokens.map(_.toLowerCase).groupBy(identity)
map.map { case (k, v) => (k, v.length) }.withDefaultValue(0)
}
/** Returns the binary bag of words. Presence/absence is used instead
* of frequencies.
*
* @param filter stop list for filtering.
* @param stemmer stemmer to transform a word into its root form.
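*
* For example (the stemmed tokens shown are illustrative):
* {{{
*   "The cats sat. The dog sat.".bag2()
*   // e.g. Set("cat", "sat", "dog")
* }}}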
*/
def bag2(filter: String = "default", stemmer: Option[Stemmer] = Some(new PorterStemmer())): Set[String] = {
val words = text.normalize.sentences.flatMap(_.words(filter))
val tokens = stemmer.map { stemmer =>
words.map(stemmer.stem)
}.getOrElse(words)
tokens.map(_.toLowerCase).toSet
}
/** Returns the (word, part-of-speech) pairs.
* The text should be a single sentence.
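*
* For example (the tags shown are what the default model typically produces):
* {{{
*   "I love Scala".postag
*   // e.g. Array(("I", PRP), ("love", VBP), ("Scala", NNP))
* }}}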
*/
def postag: Array[(String, PennTreebankPOS)] = {
val words = text.words("none")
words.zip(HMMPOSTagger.getDefault.tag(words))
}
/** Keyword extraction from a single document using word co-occurrence
* statistical information.
*
* @param k the number of top keywords to return.
* @return the top keywords.
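*
* A usage sketch (the input document is hypothetical):
* {{{
*   "a long document ...".keywords(5).foreach(println)
* }}}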
*/
def keywords(k: Int = 10): Array[smile.nlp.collocation.NGram] = {
smile.nlp.keyword.CooccurrenceKeywords.of(text, k)
}
}
/** Hacking scaladoc [[https://github.com/scala/bug/issues/8124 issue-8124]].
* The user should ignore this object. */
object $dummy
}