
// smile.nlp.stemmer.PorterStemmer (Maven / Gradle / Ivy)
/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.nlp.stemmer;
/**
* Porter's stemming algorithm. The stemmer is based on the idea that the
* suffixes in the English language are mostly made up of a combination of
* smaller and simpler suffixes. This is a linear step stemmer.
* Specifically it has five steps applying rules within each step. Within
* each step, if a suffix rule matched to a word, then the conditions
* attached to that rule are tested on what would be the resulting stem,
* if that suffix was removed, in the way defined by the rule. Once a Rule
* passes its conditions and is accepted the rule fires and the suffix is
* removed and control moves to the next step. If the rule is not accepted
* then the next rule in the step is tested, until either a rule from that
* step fires and control passes to the next step or there are no more rules
* in that step whence control moves to the next step. For details, see
*
* Martin Porter, An algorithm for suffix stripping, Program, 14(3), 130-137, 1980.
*
* Note that this class is NOT multi-thread safe.
*
* The code is based on http://www.tartarus.org/~martin/PorterStemmer
*
* History:
*
* Release 1
*
* Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
* The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
* is then outside the bounds of b.
*
* Release 2
*
* Similarly,
*
* Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
* 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
* b[j] is then outside the bounds of b.
*
* Release 3
*
* Considerably revised 4/9/00 in the light of many helpful suggestions
* from Brian Goetz of Quiotix Corporation ([email protected]).
*
* Release 4
*/
public class PorterStemmer implements Stemmer {
    /**
     * Suffixes that get a replacement right after an -ed / -ing ending has
     * been removed in step 1. Each row is {suffix, replacement}.
     */
    private static final String[][] RESTORE_E_RULES = {
        {"at", "ate"}, {"bl", "ble"}, {"iz", "ize"},
    };

    /**
     * Step 3 rules, each row being {suffix, replacement}. Rows are tried in
     * order and the first suffix that matches wins, so a longer suffix must
     * come before any shorter suffix it ends with (e.g. "ational" before
     * "tional", "ization" before "ation").
     */
    private static final String[][] DOUBLE_SUFFIX_RULES = {
        {"ational", "ate"}, {"tional", "tion"},
        {"enci", "ence"}, {"anci", "ance"},
        {"izer", "ize"},
        {"bli", "ble"}, {"alli", "al"}, {"entli", "ent"}, {"eli", "e"}, {"ousli", "ous"},
        {"ization", "ize"}, {"ation", "ate"}, {"ator", "ate"},
        {"alism", "al"}, {"iveness", "ive"}, {"fulness", "ful"}, {"ousness", "ous"},
        {"aliti", "al"}, {"iviti", "ive"}, {"biliti", "ble"},
        {"logi", "log"},
    };

    /**
     * Step 4 rules, each row being {suffix, replacement}; first match wins.
     */
    private static final String[][] SINGLE_SUFFIX_RULES = {
        {"icate", "ic"}, {"ative", ""}, {"alize", "al"},
        {"iciti", "ic"},
        {"ical", "ic"}, {"ful", ""},
        {"ness", ""},
    };

    /**
     * Step 5 suffixes, tried in order; first match wins. "ement" and "ment"
     * precede "ent" so that the longest of the three is the one recognized.
     */
    private static final String[] RESIDUAL_SUFFIXES = {
        "al",
        "ance", "ence",
        "er",
        "ic",
        "able", "ible",
        "ant", "ement", "ment", "ent",
        "ion", "ou",
        "ism",
        "ate", "iti",
        "ous",
        "ive",
        "ize",
    };

    /**
     * Words that {@link #stripPluralParticiple(String)} returns untouched
     * (compared ignoring case).
     */
    private static final String[] PARTICIPLE_EXCEPTIONS = {
        "is", "was", "has", "his", "this",
    };

    /**
     * Working buffer holding the characters of the word being stemmed.
     */
    private char[] b;

    /**
     * General-purpose offset into the buffer. A successful
     * {@link #endWith(String)} leaves it on the last character before the
     * matched suffix (or -1 if the suffix is the whole word).
     */
    private int j;

    /**
     * Index of the last character of the current (partially stemmed) word.
     */
    private int k;

    /**
     * Constructor.
     */
    public PorterStemmer() {
    }

    /**
     * Returns true if b[i] is a consonant. The letters a, e, i, o, u are
     * vowels; a 'y' is a consonant at position 0 or when the preceding
     * character is not a consonant; everything else is a consonant.
     */
    private boolean isConsonant(int i) {
        char c = b[i];
        if (c == 'y') {
            return i == 0 || !isConsonant(i - 1);
        }
        return c != 'a' && c != 'e' && c != 'i' && c != 'o' && c != 'u';
    }

    /**
     * Measures b[0..j]: the number of places where a consonant immediately
     * follows a vowel. Writing c for a consonant sequence, v for a vowel
     * sequence and [..] for an optional part:
     *
     * <pre>
     *   [c][v]        gives 0
     *   [c]vc[v]      gives 1
     *   [c]vcvc[v]    gives 2
     *   [c]vcvcvc[v]  gives 3
     *   ....
     * </pre>
     */
    private int m() {
        int transitions = 0;
        boolean previousWasVowel = false;
        for (int i = 0; i <= j; i++) {
            boolean consonant = isConsonant(i);
            if (consonant && previousWasVowel) {
                transitions++;
            }
            previousWasVowel = !consonant;
        }
        return transitions;
    }

    /**
     * Returns true if b[0..j] contains at least one vowel.
     */
    private boolean vowelinstem() {
        int i = 0;
        while (i <= j) {
            if (!isConsonant(i)) {
                return true;
            }
            i++;
        }
        return false;
    }

    /**
     * Returns true if positions pos-1 and pos hold the same character and
     * that character is a consonant.
     */
    private boolean doublec(int pos) {
        return pos >= 1 && b[pos] == b[pos - 1] && isConsonant(pos);
    }

    /**
     * Returns true if i-2, i-1, i has the form consonant - vowel - consonant
     * and the final consonant is not w, x or y. Used when deciding whether
     * to restore an e at the end of a short word, e.g.
     * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
     */
    private boolean cvc(int i) {
        boolean shaped = i >= 2
                && isConsonant(i)
                && !isConsonant(i - 1)
                && isConsonant(i - 2);
        if (!shaped) {
            return false;
        }
        char last = b[i];
        return last != 'w' && last != 'x' && last != 'y';
    }

    /**
     * Returns true if b[0..k] ends with the string s. On success j is moved
     * to the position just before the suffix; on failure j is left alone.
     */
    private boolean endWith(String s) {
        int len = s.length();
        int start = k - len + 1;
        if (start < 0) {
            // The suffix is longer than the word.
            return false;
        }
        for (int i = len - 1; i >= 0; i--) {
            if (s.charAt(i) != b[start + i]) {
                return false;
            }
        }
        j = k - len;
        return true;
    }

    /**
     * Overwrites the buffer from position j+1 with the characters of s and
     * moves k to the last character written.
     */
    private void setto(String s) {
        int len = s.length();
        s.getChars(0, len, b, j + 1);
        k = j + len;
    }

    /**
     * Replaces the text after position j with s, but only when the measure
     * of b[0..j] is positive.
     */
    private void r(String s) {
        if (m() <= 0) {
            return;
        }
        setto(s);
    }

    /**
     * step1 without the special handling of endings that turn into y.
     */
    private void step1() {
        step1(false);
    }

    /**
     * step1() gets rid of plurals and -ed or -ing. If the argument y is true,
     * an "ies" ending (and an "i" left after removing -ed / -ing) that follows
     * a consonant is turned into "y" instead. With y false:
     *
     * caresses -> caress
     * ponies -> poni
     * ties -> ti
     * caress -> caress
     * cats -> cat
     *
     * feed -> feed
     * agreed -> agree
     * disabled -> disable
     *
     * matting -> mat
     * mating -> mate
     * meeting -> meet
     * milling -> mill
     * messing -> mess
     *
     * meetings -> meet
     */
    private void step1(boolean y) {
        removePlural(y);
        removePastOrGerund(y);
    }

    /**
     * First half of step 1: handles words ending in s (-sses, -ies, -s).
     */
    private void removePlural(boolean y) {
        if (b[k] != 's') {
            return;
        }
        if (endWith("sses")) {
            k -= 2;
            return;
        }
        if (endWith("ies")) {
            boolean toY = y && k >= 3 && isConsonant(k - 3);
            setto(toY ? "y" : "i");
            return;
        }
        if (b[k - 1] != 's') {
            k--;
        }
    }

    /**
     * Second half of step 1: handles -eed, -ed and -ing, then tidies up the
     * stem that is left behind.
     */
    private void removePastOrGerund(boolean y) {
        if (endWith("eed")) {
            if (m() > 0) {
                k--;
            }
            return;
        }
        if (!(endWith("ed") || endWith("ing")) || !vowelinstem()) {
            return;
        }

        // Drop the -ed / -ing ending.
        k = j;

        for (String[] rule : RESTORE_E_RULES) {
            if (endWith(rule[0])) {
                setto(rule[1]);
                return;
            }
        }

        if (y && endWith("i") && k >= 1 && isConsonant(k - 1)) {
            setto("y");
        } else if (doublec(k)) {
            // Undouble the final consonant unless it is l, s or z.
            char doubled = b[k];
            if (doubled != 'l' && doubled != 's' && doubled != 'z') {
                k--;
            }
        } else if (m() == 1 && cvc(k)) {
            setto("e");
        }
    }

    /**
     * step2() turns terminal y to i when there is another vowel in the stem.
     */
    private void step2() {
        boolean replace = endWith("y") && vowelinstem();
        if (replace) {
            b[k] = 'i';
        }
    }

    /**
     * step3() maps double suffixes to single ones, so -ization ( = -ize plus
     * -ation) maps to -ize etc. The replacement is only applied when the
     * string before the suffix gives m() > 0; either way, no further rule is
     * tried once a suffix has matched.
     */
    private void step3() {
        if (k == 0) {
            return;
        }
        applyFirstMatch(DOUBLE_SUFFIX_RULES);
    }

    /**
     * step4() deals with -ic-, -full, -ness etc. Same strategy as step3.
     */
    private void step4() {
        applyFirstMatch(SINGLE_SUFFIX_RULES);
    }

    /**
     * Finds the first rule whose suffix ends the current word and hands its
     * replacement to {@link #r(String)}. Does nothing if no suffix matches.
     */
    private void applyFirstMatch(String[][] rules) {
        for (String[] rule : rules) {
            if (endWith(rule[0])) {
                r(rule[1]);
                return;
            }
        }
    }

    /**
     * step5() takes off -ant, -ence etc., when the remaining stem has
     * m() > 1.
     */
    private void step5() {
        if (k == 0) {
            return;
        }
        for (String suffix : RESIDUAL_SUFFIXES) {
            if (!endWith(suffix)) {
                continue;
            }
            if (suffix.equals("ion")) {
                // -ion only counts when preceded by s or t; the j >= 0 test
                // covers the word "ion" itself, where nothing precedes it.
                boolean afterSOrT = j >= 0 && (b[j] == 's' || b[j] == 't');
                if (!afterSOrT) {
                    return;
                }
            }
            if (m() > 1) {
                k = j;
            }
            return;
        }
    }

    /**
     * step6() removes a final -e if m() > 1 (or m() == 1 and the stem does
     * not end consonant-vowel-consonant), and then reduces a final double l
     * to a single l if m() > 1.
     */
    private void step6() {
        j = k;
        if (b[k] == 'e') {
            int measure = m();
            boolean dropE = measure > 1 || (measure == 1 && !cvc(k - 1));
            if (dropE) {
                k--;
            }
        }
        if (b[k] == 'l' && doublec(k) && m() > 1) {
            k--;
        }
    }

    /**
     * Stems a word. Words of one or two characters are returned unchanged
     * (as a new string); longer words go through all six steps. The word is
     * processed as given, with no case conversion. This method overwrites
     * the instance's working state, so one instance must not be used from
     * several threads at once.
     *
     * @param word the word to stem.
     * @return the stem of the word.
     */
    @Override
    public String stem(String word) {
        b = word.toCharArray();
        k = b.length - 1;
        if (k > 1) {
            step1();
            step2();
            step3();
            step4();
            step5();
            step6();
        }
        return new String(b, 0, k + 1);
    }

    /**
     * Remove plurals and participles. Only step 1 is applied, in the mode
     * that turns a trailing "ies" (or an "i" left after removing -ed / -ing)
     * following a consonant into "y". Words of one or two characters, and the
     * words "is", "was", "has", "his" and "this" in any letter case, are
     * returned as they are.
     *
     * @param word the word to process.
     * @return the word with plural and participle endings removed.
     */
    public String stripPluralParticiple(String word) {
        b = word.toCharArray();
        k = b.length - 1;
        if (k <= 1) {
            return word;
        }
        for (String exception : PARTICIPLE_EXCEPTIONS) {
            if (word.equalsIgnoreCase(exception)) {
                return word;
            }
        }
        step1(true);
        return new String(b, 0, k + 1);
    }
}