Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
com.johnsnowlabs.nlp.annotators.Stemmer.scala Maven / Gradle / Ivy
/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import scala.language.postfixOps
/** Returns hard-stems out of words with the objective of retrieving the meaningful part of the
* word. For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb Examples]].
*
* ==Example==
* {{{
* import spark.implicits._
* import com.johnsnowlabs.nlp.DocumentAssembler
* import com.johnsnowlabs.nlp.annotator.{Stemmer, Tokenizer}
* import org.apache.spark.ml.Pipeline
*
* val documentAssembler = new DocumentAssembler()
* .setInputCol("text")
* .setOutputCol("document")
*
* val tokenizer = new Tokenizer()
* .setInputCols("document")
* .setOutputCol("token")
*
* val stemmer = new Stemmer()
* .setInputCols("token")
* .setOutputCol("stem")
*
* val pipeline = new Pipeline().setStages(Array(
* documentAssembler,
* tokenizer,
* stemmer
* ))
*
* val data = Seq("Peter Pipers employees are picking pecks of pickled peppers.")
* .toDF("text")
* val result = pipeline.fit(data).transform(data)
*
* result.selectExpr("stem.result").show(truncate = false)
* +-------------------------------------------------------------+
* |result |
* +-------------------------------------------------------------+
* |[peter, piper, employe, ar, pick, peck, of, pickl, pepper, .]|
* +-------------------------------------------------------------+
* }}}
*
* @param uid
* internal uid element for storing annotator into disk
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio anno 1
* @groupprio param 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class Stemmer(override val uid: String)
    extends AnnotatorModel[Stemmer]
    with HasSimpleAnnotate[Stemmer] {

  import com.johnsnowlabs.nlp.AnnotatorType._

  /** Builds a stemmer whose uid is generated from the `STEMMER` prefix. */
  def this() = this(Identifiable.randomUID("STEMMER"))

  /** Language of the text (Default: `"english"`).
    *
    * The value is stored and exposed through [[setLanguage]] / [[getLanguage]]; `annotate` in
    * this class does not read it and always delegates to `EnglishStemmer`.
    *
    * @group param
    */
  val language: Param[String] = new Param[String](this, "language", "Language of the text")

  // Must run after `language` has been initialised.
  setDefault(language, "english")

  /** Annotator type produced by this stage: TOKEN
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = TOKEN

  /** Annotator types consumed by this stage: TOKEN
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

  /** Sets the language of the text (Default: `"english"`).
    *
    * @param value
    *   language name to store in the `language` parameter
    * @return
    *   this stemmer, to allow call chaining
    * @group setParam
    */
  def setLanguage(value: String): Stemmer = set(language, value)

  /** Current value of the `language` parameter (Default: `"english"`).
    *
    * @group getParam
    */
  def getLanguage: String = $(language)

  /** Produces exactly one stem annotation per incoming token annotation.
    *
    * Each output keeps the `begin`, `end` and `metadata` of its source token; only the result is
    * replaced by the stem computed by `EnglishStemmer.stem`.
    *
    * @param annotations
    *   token annotations to stem
    * @return
    *   stem annotations, in the same order as the input
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
    for (token <- annotations)
      yield Annotation(
        outputAnnotatorType,
        token.begin,
        token.end,
        EnglishStemmer.stem(token.result),
        token.metadata)
}
/** Companion object of [[Stemmer]]; see that class for the annotator's documentation.
  *
  * It extends `DefaultParamsReadable[Stemmer]` and declares no members of its own.
  */
object Stemmer extends DefaultParamsReadable[Stemmer]
/** Suffix-stripping stemmer for English words, written as a small rule DSL.
  *
  * A rule is a pair `pattern -> builder`. The pattern combines a suffix with a [[Condition]] that
  * is checked against the word once that suffix has been removed; the builder produces the new
  * word from the remaining prefix. Within one `applyReplaces` call the rules are tried in the
  * order given and only the first matching rule is applied.
  *
  * The `(~v ~)` spelling relies on postfix operators, so `scala.language.postfixOps` has to be
  * enabled in the enclosing file.
  */
object EnglishStemmer {
  import scala.language.implicitConversions

  /** Computes the stem of a single word.
    *
    * @param word
    *   word to stem; it is lower-cased before any rule is applied
    * @return
    *   the lower-cased stem, or the lower-cased word itself when no rule matches
    */
  def stem(word: String): String = {
    // Step 1a: plurals. "ss" -> "ss" is listed before "s" so that words ending in "ss" are
    // caught by it and left unchanged.
    val step1a =
      new Word(word).applyReplaces("sses" -> "ss", "ies" -> "i", "ss" -> "ss", "s" -> "")

    // Step 1b: past participles. "eed" is the longest suffix of this step, so a word ending in
    // "eed" is only ever considered for the eed -> ee rule and never falls through to the
    // shorter "ed" rule (keeps "feed" intact instead of cutting it down to "fe").
    val step1b =
      if (step1a.endsWith("eed")) {
        step1a.applyReplaces(((m > 0) + "eed") -> "ee")
      } else if ((step1a matchedBy ((~v ~) + "ed")) || (step1a matchedBy ((~v ~) + "ing"))) {
        step1a
          .applyReplaces(~v ~)("ed" -> "", "ing" -> "")
          // Repair the stem left behind after removing "ed" / "ing".
          .applyReplaces(
            "at" -> "ate",
            "bl" -> "ble",
            "iz" -> "ize",
            (~d and not(~L or ~S or ~Z)) -> singleLetter,
            (m == 1 and ~o) -> "e")
      } else {
        step1a
      }

    // Step 1c: a trailing "y" becomes "i" when the rest of the word contains a vowel.
    val step1c = step1b.applyReplaces(((~v ~) + "y") -> "i")

    // Step 2: map double suffixes to single ones (prefix measure must be > 0).
    val step2 = step1c.applyReplaces(m > 0)(
      "ational" -> "ate",
      "tional" -> "tion",
      "enci" -> "ence",
      "anci" -> "ance",
      "izer" -> "ize",
      "abli" -> "able",
      "alli" -> "al",
      "entli" -> "ent",
      "eli" -> "e",
      "ousli" -> "ous",
      "ization" -> "ize",
      "ation" -> "ate",
      "ator" -> "ate",
      "alism" -> "al",
      "iveness" -> "ive",
      "fulness" -> "ful",
      "ousness" -> "ous",
      "aliti" -> "al",
      "iviti" -> "ive",
      "biliti" -> "ble")

    // Step 3: further suffix simplification (prefix measure must be > 0).
    val step3 = step2.applyReplaces(m > 0)(
      "icate" -> "ic",
      "ative" -> "",
      "alize" -> "al",
      "iciti" -> "ic",
      "ical" -> "ic",
      "ful" -> "",
      "ness" -> "")

    // Step 4: drop remaining suffixes (prefix measure must be > 1).
    val step4 = step3.applyReplaces(m > 1)(
      "al" -> "",
      "ance" -> "",
      "ence" -> "",
      "er" -> "",
      "ic" -> "",
      "able" -> "",
      "ible" -> "",
      "ant" -> "",
      "ement" -> "",
      "ment" -> "",
      "ent" -> "",
      ((~S or ~T) + "ion") -> "",
      "ou" -> "",
      "ism" -> "",
      "ate" -> "",
      "iti" -> "",
      "ous" -> "",
      "ive" -> "",
      "ize" -> "")

    // Step 5: tidy up a little bit -- trailing "e" and a doubled trailing "l".
    val step5a = step4.applyReplaces(
      ((m > 1) + "e") -> "",
      (((m == 1) and not(~o)) + "e") -> "")
    val step5b = step5a.applyReplaces((m > 1 and ~d and ~L) -> singleLetter)

    step5b.toString
  }

  /** Pattern that is matched against the word. The end of the word is compared to `suffix`, and
    * the part before the suffix is checked against `condition`.
    */
  private case class Pattern(condition: Condition, suffix: String)

  /** Predicate applied to the part of the word that precedes a suffix.
    *
    * @param predicate
    *   test to run against that part of the word
    */
  private case class Condition(predicate: Word => Boolean) {

    /** `condition + "suffix"` builds the [[Pattern]] for that suffix guarded by this condition. */
    def + = new Pattern(this, _: String)

    def unary_~ : Condition = this // just syntactic sugar

    def ~ = this // syntactic sugar as well; used postfix, as in `(~v ~)`

    /** Condition that holds when both this and `condition` hold. */
    def and(condition: Condition): Condition =
      Condition((word) => predicate(word) && condition.predicate(word))

    /** Condition that holds when this or `condition` holds. */
    def or(condition: Condition): Condition =
      Condition((word) => predicate(word) || condition.predicate(word))
  }

  /** Negates a condition. */
  private def not: Condition => Condition = { case Condition(predicate) =>
    Condition(!predicate(_))
  }

  /** Condition that accepts every word; used for rules that only name a suffix. */
  private val emptyCondition = Condition(_ => true)

  /** Conditions on the measure of a word (see `Word.measure`). */
  private object m {
    def >(measure: Int) = Condition(_.measure > measure)
    def ==(measure: Int) = Condition(_.measure == measure)
  }

  // Conditions on the last letter of the word.
  private val S = Condition(_ endsWith "s")
  private val Z = Condition(_ endsWith "z")
  private val L = Condition(_ endsWith "l")
  private val T = Condition(_ endsWith "t")

  // d: ends with a double consonant; o: ends consonant-vowel-consonant; v: contains a vowel.
  private val d = Condition(_.endsWithCC)
  private val o = Condition(_.endsWithCVC)
  private val v = Condition(_.containsVowels)

  /** Builder of the stem
    *
    * @param build
    *   Function to be called to build a stem
    */
  private case class StemBuilder(build: Word => Word)

  /** Builder that appends `suffix` to the word it is given. */
  private def suffixStemBuilder(suffix: String) = StemBuilder(_ + suffix)

  /** Builder that drops the last letter of the word it is given. */
  private val singleLetter = StemBuilder(_ trimSuffix 1)

  /** Lower-cased word together with the letter-level queries used by the rules.
    *
    * @param string
    *   the text to wrap
    */
  private class Word(string: String) {
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
    val word: String = string.toLowerCase(java.util.Locale.ROOT)

    /** The word without its last `suffixLength` characters. */
    def trimSuffix(suffixLength: Int): Word =
      new Word(word.substring(0, word.length - suffixLength))

    def endsWith: String => Boolean = word.endsWith(_)

    /** The word with `suffix` appended. */
    def +(suffix: String): Word = new Word(word + suffix)

    def satisfies: Condition => Boolean = _.predicate(this)

    /** True when `position` is inside the word and holds a consonant. A 'y' counts as a vowel
      * only when it directly follows a consonant.
      */
    def hasConsonantAt(position: Int): Boolean =
      (word.indices contains position) && (word(position) match {
        case 'a' | 'e' | 'i' | 'o' | 'u' => false
        case 'y' if hasConsonantAt(position - 1) => false
        case _ => true
      })

    // Plain negation of hasConsonantAt, so it is also true for positions outside the word;
    // callers in this class guard the index range themselves.
    def hasVowelAt: Int => Boolean = position => !hasConsonantAt(position)

    def containsVowels: Boolean = word.indices exists hasVowelAt

    /** True when the word ends with the same consonant twice. */
    def endsWithCC: Boolean =
      (word.length > 1) &&
        (word(word.length - 1) == word(word.length - 2)) &&
        hasConsonantAt(word.length - 1)

    /** True when the word ends consonant-vowel-consonant and the final consonant is not 'w',
      * 'x' or 'y'.
      */
    def endsWithCVC: Boolean =
      (word.length > 2) &&
        hasConsonantAt(word.length - 1) &&
        hasVowelAt(word.length - 2) &&
        hasConsonantAt(word.length - 3) &&
        // The exclusion applies to the LAST letter; the letter before it is already known to
        // be a vowel at this point.
        !(Set('w', 'x', 'y') contains word(word.length - 1))

    /** Measure of the word -- the number of VCs
      *
      * @return
      *   number of positions holding a vowel that is directly followed by a consonant
      */
    def measure: Int = word.indices.count(pos => hasVowelAt(pos) && hasConsonantAt(pos + 1))

    /** True when the word ends with the pattern's suffix and the part before that suffix
      * satisfies the pattern's condition.
      */
    def matchedBy: Pattern => Boolean = { case Pattern(condition, suffix) =>
      endsWith(suffix) && (trimSuffix(suffix.length) satisfies condition)
    }

    /** Applies the first rule whose pattern matches this word.
      *
      * @param replaces
      *   rules, tried in the order given
      * @return
      *   the word built by the first matching rule from this word minus the matched suffix, or
      *   this word unchanged when no rule matches
      */
    def applyReplaces(replaces: (Pattern, StemBuilder)*): Word =
      replaces
        .find { case (pattern, _) => matchedBy(pattern) }
        .map { case (pattern, stemBuilder) =>
          stemBuilder.build(trimSuffix(pattern.suffix.length))
        }
        .getOrElse(this)

    /** Same as the single-list overload, with `commonCondition` additionally required by every
      * rule.
      */
    def applyReplaces(commonCondition: Condition)(replaces: (Pattern, StemBuilder)*): Word =
      applyReplaces(replaces map { case (Pattern(condition, suffix), stemBuilder) =>
        (Pattern(commonCondition and condition, suffix), stemBuilder)
      }: _*)

    override def toString: String = word
  }

  // Implicit conversions that let rules be written as `"suffix" -> "replacement"`,
  // `condition -> builder`, and so on.

  /** Lifts a pair whose halves are convertible to a pattern and a stem builder into a rule. */
  private implicit def pimpMyRule[P, SB](rule: (P, SB))(implicit
      toPattern: P => Pattern,
      toBuilder: SB => StemBuilder): (Pattern, StemBuilder) =
    (toPattern(rule._1), toBuilder(rule._2))

  /** A bare suffix is a pattern without any condition on the rest of the word. */
  private implicit def emptyConditionPattern: String => Pattern = Pattern(emptyCondition, _)

  /** A bare condition is a pattern with an empty suffix. */
  private implicit def emptySuffixPattern: Condition => Pattern = Pattern(_, "")

  /** A bare string on the right-hand side of a rule is appended to the trimmed word. */
  private implicit def suffixedStemBuilder: String => StemBuilder = suffixStemBuilder
}