/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 */

package smile

import scala.language.implicitConversions
import scala.jdk.CollectionConverters._
import smile.math.MathEx
import smile.nlp.dictionary.StopWords
import smile.nlp.pos.{HMMPOSTagger, PennTreebankPOS}
import smile.nlp.stemmer.{LancasterStemmer, PorterStemmer}
import smile.util.time

/** Natural language processing.
  *
  * @author Haifeng Li
  */
package object nlp {
  implicit def pimpString(string: String): PimpedString = new PimpedString(string)

  /** Porter's stemming algorithm. The stemmer is based on the idea that the
    * suffixes in the English language are mostly made up of a combination of
    * smaller and simpler suffixes. It is a linear-step stemmer with five steps,
    * applying rules within each step. Within each step, if a suffix rule matches
    * a word, the conditions attached to that rule are tested on what would be
    * the resulting stem if that suffix were removed, in the way defined by the
    * rule. Once a rule passes its conditions and is accepted, it fires: the
    * suffix is removed and control moves to the next step. If the rule is not
    * accepted, the next rule in the step is tested, until either a rule in that
    * step fires and control passes to the next step, or there are no more rules
    * in that step, in which case control also moves to the next step.
    */
  val porter: PorterStemmer = new stemmer.PorterStemmer {
    def apply(word: String): String = stem(word)
  }
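
  // Usage sketch (illustrative input, not from the source): the stemmer reduces
  // an inflected word to its root form, e.g.
  //   porter.stem("running")   // typically "run"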

  /** The Paice/Husk Lancaster stemming algorithm. The stemmer is a
    * conflation-based iterative stemmer. Although it remains efficient and
    * easy to implement, it is known to be very strong and aggressive. The
    * stemmer utilizes a single table of rules, each of which may specify
    * the removal or replacement of an ending.
    */
  val lancaster: LancasterStemmer = new stemmer.LancasterStemmer {
    def apply(word: String): String = stem(word)
  }
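
  // Usage sketch (illustrative input): Lancaster is more aggressive than Porter
  // and often produces shorter stems, e.g.
  //   lancaster.stem("maximum")   // typically a heavily truncated stem such as "maxim"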

  /** Creates an in-memory text corpus.
    *
    * @param text a collection of text documents.
    */
  def corpus(text: scala.collection.Seq[String]): SimpleCorpus = {
    val corpus = new SimpleCorpus
    text.foreach(text => corpus.add(new Text(text)))
    corpus
  }
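
  // Usage sketch (made-up documents): build a small in-memory corpus.
  //   val docs = Seq("The quick brown fox jumps over the lazy dog.",
  //                  "The lazy dog sleeps all day.")
  //   val simple = corpus(docs)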

  /** Identify bigram collocations (words that often appear consecutively) within
    * corpora. They may also be used to find other associations between word
    * occurrences.
    *
    * Finding collocations requires first calculating the frequencies of words
    * and their appearance in the context of other words. Often the collection
    * of words will then require filtering to retain only useful content terms.
    * Each n-gram of words may then be scored according to some association measure,
    * in order to determine the relative likelihood of each n-gram being a
    * collocation.
    *
    * @param k finds the top k bigrams.
    * @param minFreq the minimum frequency of collocation.
    * @param text input text.
    * @return significant bigram collocations in descending order
    *         of likelihood ratio.
    */
  def bigram(k: Int, minFreq: Int, text: String*): Array[smile.nlp.collocation.Bigram] = time("Bi-gram collocation") {
    smile.nlp.collocation.Bigram.of(corpus(text), k, minFreq)
  }
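
  // Usage sketch (made-up text): find the top 5 bigram collocations occurring at
  // least twice in the given documents, in descending order of likelihood ratio.
  //   val top = bigram(5, 2, "first document text ...", "second document text ...")
  //   top.foreach(println)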

  /** Identify bigram collocations whose p-value is less than
    * the given threshold.
    *
    * @param p the p-value threshold
    * @param minFreq the minimum frequency of collocation.
    * @param text input text.
    * @return significant bigram collocations in descending order
    *         of likelihood ratio.
    */
  def bigram(p: Double, minFreq: Int, text: String*): Array[smile.nlp.collocation.Bigram] = time("Bi-gram collocation") {
    smile.nlp.collocation.Bigram.of(corpus(text), p, minFreq)
  }
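
  // Usage sketch: as above, but keep every bigram whose p-value is below 0.001
  // instead of using a fixed top-k cutoff.
  //   val significant = bigram(0.001, 2, "first document text ...", "second document text ...")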

  /** An Apriori-like algorithm to extract n-gram phrases.
    *
    * @param maxNGramSize The maximum length of n-gram
    * @param minFreq The minimum frequency of n-gram in the sentences.
    * @param text input text.
    * @return An array of sets of n-grams. The i-th entry is the set of i-grams.
    */
  def ngram(maxNGramSize: Int, minFreq: Int, text: String*): Array[Array[smile.nlp.collocation.NGram]] = time("N-gram collocation") {
    val sentences = text.flatMap { text =>
      text.sentences.map { sentence =>
        sentence.words("none").map { word =>
          porter.stripPluralParticiple(word).toLowerCase
        }
      }
    }

    smile.nlp.collocation.NGram.of(sentences.asJava, maxNGramSize, minFreq)
  }
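
  // Usage sketch (made-up text): extract phrases of up to 4 words occurring at
  // least 4 times; entry i of the result holds the i-gram phrases.
  //   val phrases = ngram(4, 4, "a long document ...", "another long document ...")
  //   phrases(2).foreach(println)   // the 2-gram phrases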

  /** Part-of-speech tagging.
    *
    * @param sentence a sentence that is already segmented to words.
    * @return the pos tags.
    */
  def postag(sentence: Array[String]): Array[PennTreebankPOS] = time("PoS tagging with Hidden Markov Model") {
    HMMPOSTagger.getDefault.tag(sentence)
  }
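
  // Usage sketch: tag a pre-tokenized sentence; the result is an array of
  // Penn Treebank tags, e.g. DT, NN, VBZ for "The", "dog", "runs".
  //   postag(Array("The", "dog", "runs"))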

  /** Converts a bag of words to a feature vector.
    *
    * @param terms the token list used as features.
    * @param bag the bag of words.
    * @return a vector of frequency of feature tokens in the bag.
    */
  def vectorize(terms: Array[String], bag: Map[String, Int]): Array[Double] = {
    terms.map(bag.getOrElse(_, 0).toDouble)
  }
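
  // Worked example: with terms ("apple", "banana", "cherry") and a bag
  // Map("apple" -> 2, "cherry" -> 1), the result is Array(2.0, 0.0, 1.0).
  //   vectorize(Array("apple", "banana", "cherry"), Map("apple" -> 2, "cherry" -> 1))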

  /** Converts a binary bag of words to a sparse feature vector.
    *
    * @param terms the token list used as features.
    * @param bag the bag of words.
    * @return an integer vector whose elements are the indices of the feature
    *         tokens present in the bag, in ascending order.
    */
  def vectorize(terms: Array[String], bag: Set[String]): Array[Int] = {
    terms.zipWithIndex.filter { case (w, _) => bag.contains(w)}.map(_._2)
  }
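
  // Worked example: with the same terms and the set Set("apple", "cherry"),
  // the result is Array(0, 2), the indices of the terms present in the bag.
  //   vectorize(Array("apple", "banana", "cherry"), Set("apple", "cherry"))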

  /** Returns the document frequencies, i.e. the number of documents that contain each term.
    *
    * @param terms the token list used as features.
    * @param corpus the training corpus.
    * @return the array of document frequencies.
    */
  def df(terms: Array[String], corpus: Array[Map[String, Int]]): Array[Int] = {
    terms.map { term =>
      corpus.count(_.contains(term))
    }
  }
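
  // Worked example: "apple" occurs in both documents below and "kiwi" in one,
  // so the result is Array(2, 1).
  //   df(Array("apple", "kiwi"), Array(Map("apple" -> 1), Map("apple" -> 2, "kiwi" -> 1)))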

  /** TF-IDF relevance score between a term and a document based on a corpus.
    *
    * @param tf    the frequency of the search term in the document to rank.
    * @param maxtf the maximum frequency over all terms in the document.
    * @param n     the number of documents in the corpus.
    * @param df    the number of documents containing the given term in the corpus.
    */
  private def tfidf(tf: Double, maxtf: Double, n: Int, df: Int): Double = {
    (tf / maxtf) * Math.log((1.0 + n) / (1.0 + df))
  }
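
  // Worked example: with tf = 3, maxtf = 10, n = 1000 and df = 9, the score is
  // (3 / 10) * ln(1001 / 10) = 0.3 * 4.606 ≈ 1.38.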

  /** Converts a corpus to TF-IDF feature vectors, which
    * are normalized to L2 norm 1.
    *
    * @param corpus the corpus of documents in bag-of-words representation.
    * @return a matrix in which each row is the TF-IDF feature vector of a document.
    */
  def tfidf(corpus: Array[Array[Double]]): Array[Array[Double]] = {
    val n = corpus.length
    val df = new Array[Int](corpus(0).length)
    corpus.foreach { bag =>
      for (i <- df.indices) {
        if (bag(i) > 0) df(i) = df(i) + 1
      }
    }

    corpus.map { bag =>
      tfidf(bag, n, df)
    }
  }
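
  // Usage sketch: rows are documents, columns are raw term counts over a fixed
  // vocabulary; each returned row is a unit-length TF-IDF vector.
  //   val counts = Array(Array(2.0, 0.0, 1.0), Array(0.0, 3.0, 1.0))
  //   val features = tfidf(counts)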

  /** Converts a bag of words to a feature vector by TF-IDF, which
    * is normalized to L2 norm 1.
    *
    * @param bag the bag-of-words feature vector of a document.
    * @param n the number of documents in training corpus.
    * @param df the number of documents containing each term in the corpus.
    * @return TF-IDF feature vector
    */
  def tfidf(bag: Array[Double], n: Int, df: Array[Int]): Array[Double] = {
    import Ordering.Double.TotalOrdering
    val maxtf = bag.max
    val features = new Array[Double](bag.length)

    for (i <- features.indices) {
      features(i) = tfidf(bag(i), maxtf, n, df(i))
    }

    MathEx.unitize(features)

    features
  }
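
  // Usage sketch (hypothetical `terms`, `n` and `docfreq` from a training corpus):
  // vectorize a new document and weight it by TF-IDF.
  //   val counts = vectorize(terms, "a new document to score".bag())
  //   val features = tfidf(counts, n, docfreq)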
}

package nlp {
  import tokenizer.{SimpleSentenceSplitter, SimpleTokenizer}
  import smile.nlp.dictionary.{EnglishPunctuations, EnglishStopWords}
  import smile.nlp.normalizer.SimpleNormalizer
  import smile.nlp.pos.{HMMPOSTagger, PennTreebankPOS}
  import smile.nlp.stemmer.{PorterStemmer, Stemmer}

  private[nlp] class PimpedString(text: String) {
    val tokenizer = new SimpleTokenizer(true)

    /**
      * Normalizes Unicode text:
      *  - Apply Unicode normalization form NFKC.
      *  - Strip, trim, normalize, and compress whitespace.
      *  - Remove control and formatting characters.
      *  - Normalize double and single quotes.
      */
    def normalize: String = {
      SimpleNormalizer.getInstance().normalize(text)
    }

    /** Splits English text into sentences. Given an English text,
      * it returns a list of strings, where each element is an
      * English sentence. By default, it treats occurrences of '.', '?' and '!' as
      * sentence delimiters, but does its best to determine when an occurrence of '.'
      * does not have this role (e.g. in abbreviations, URLs, numbers, etc.).
      *
      * Recognizing the end of a sentence is not an easy task for a computer.
      * In English, punctuation marks that usually appear at the end of a sentence
      * may not indicate the end of a sentence. The period is the worst offender.
      * A period can end a sentence, but it can also be part of an abbreviation
      * or acronym, an ellipsis, a decimal number, or part of a bracket of periods
      * surrounding a Roman numeral. A period can even act both as the end of an
      * abbreviation and the end of a sentence at the same time. On the other
      * hand, some poems may not contain any sentence punctuation at all.
      *
      * Another problematic punctuation mark is the single quote, which can introduce
      * a quote or start a contraction such as 'tis. Leading-quote contractions
      * are uncommon in contemporary English texts, but appear frequently in Early
      * Modern English texts.
      *
      * This tokenizer assumes that the text has already been segmented into
      * paragraphs. Any carriage returns will be replaced by whitespace.
      *
      * ====References:====
      *  - Paul Clough. A Perl program for sentence splitting using rules.
      */
    def sentences: Array[String] = {
      SimpleSentenceSplitter.getInstance.split(text)
    }

    /** Tokenizes English sentences with some differences from
      * TreebankWordTokenizer, notably on handling not-contractions. If a period
      * serves as both the end of a sentence and part of an abbreviation, e.g. etc.
      * at the end of a sentence, it will generate the tokens "etc." and "." while
      * TreebankWordTokenizer will generate "etc" and ".".
      *
      * Most punctuation is split from adjoining words. Verb contractions and the
      * Anglo-Saxon genitive of nouns are split into their component morphemes,
      * and each morpheme is tagged separately.
      *
      * This tokenizer assumes that the text has already been segmented into
      * sentences. Any periods -- apart from those at the end of a string or before
      * a newline -- are assumed to be part of the word they are attached to (e.g.
      * for abbreviations, etc.), and are not separately tokenized.
      *
      * If the parameter filter is not "none", the method will also filter
      * out stop words and punctuation. There is no single definitive list of
      * stop words that all tools incorporate. The valid values of the parameter
      * filter include
      *  - "none": no filtering
      *  - "default": the default English stop word list
      *  - "comprehensive": a more comprehensive English stop word list
      *  - "google": the stop words list used by Google search engine
      *  - "mysql": the stop words list used by MySQL FullText feature
      *  - custom stop word list: comma separated stop word list
      */
    def words(filter: String = "default"): Array[String] = {
      val tokens = tokenizer.split(text)
      if (filter == "none") return tokens

      val dict = filter.toLowerCase match {
        case "default" => EnglishStopWords.DEFAULT
        case "comprehensive" => EnglishStopWords.COMPREHENSIVE
        case "google" => EnglishStopWords.GOOGLE
        case "mysql" => EnglishStopWords.MYSQL
        case _ => new StopWords {
          val dict: Set[String] = filter.split(",").toSet

          override def contains(word: String): Boolean = dict.contains(word)
          override def size: Int = dict.size
          override def iterator: java.util.Iterator[String] = dict.iterator.asJava
        }
      }

      val punctuations = EnglishPunctuations.getInstance()
      tokens.filter { word =>
        !(dict.contains(word.toLowerCase) || punctuations.contains(word))
      }
    }

    /** Returns the bag of words. The bag-of-words model is a simple
      * representation of text as the bag of its words, disregarding
      * grammar and word order but keeping multiplicity.
      *
      * @param filter stop list for filtering.
      * @param stemmer stemmer to transform a word into its root form.
      */
    def bag(filter: String = "default", stemmer: Option[Stemmer] = Some(new PorterStemmer())): Map[String, Int] = {
      val words = text.normalize.sentences.flatMap(_.words(filter))
      val tokens = stemmer.map { stemmer =>
        words.map(stemmer.stem)
      }.getOrElse(words)

      val map = tokens.map(_.toLowerCase).groupBy(identity)
      map.map { case (k, v) => (k, v.length) }.withDefaultValue(0)
    }

    /** Returns the binary bag of words. Presence/absence is used instead
      * of frequencies.
      *
      * @param filter stop list for filtering.
      * @param stemmer stemmer to transform a word into its root form.
      */
    def bag2(filter: String = "default", stemmer: Option[Stemmer] = Some(new PorterStemmer())): Set[String] = {
      val words = text.normalize.sentences.flatMap(_.words(filter))
      val tokens = stemmer.map { stemmer =>
        words.map(stemmer.stem)
      }.getOrElse(words)

      tokens.map(_.toLowerCase).toSet
    }

    /** Returns the (word, part-of-speech) pairs.
      * The text should be a single sentence.
      */
    def postag: Array[(String, PennTreebankPOS)] = {
      val words = text.words("none")
      words.zip(HMMPOSTagger.getDefault.tag(words))
    }

    /** Keyword extraction from a single document using word co-occurrence
      * statistical information.
      *
      * @param k the number of top keywords to return.
      * @return the top keywords.
      */
    def keywords(k: Int = 10): Array[smile.nlp.collocation.NGram] = {
      smile.nlp.keyword.CooccurrenceKeywords.of(text, k)
    }
  }

  /** Hacking scaladoc [[https://github.com/scala/bug/issues/8124 issue-8124]].
    * The user should ignore this object.
    */
  object $dummy
}
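
// Usage sketch (made-up text): with `import smile.nlp._`, the implicit PimpedString
// conversion adds these NLP operations directly to strings.
//   val text = "Smile is a machine learning library. It supports NLP, too."
//   text.sentences        // two sentences
//   text.words()          // tokens with default stop words removed
//   text.bag()            // stemmed, lower-cased word counts
//   "The dog runs".postag // (word, Penn Treebank tag) pairs
//   text.keywords(5)      // top 5 keywords by co-occurrence statistics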



