com.github.chen0040.data.text.PorterStemmer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of java-data-text Show documentation
Show all versions of java-data-text Show documentation
Java implementation of text processing such as stemmers
package com.github.chen0040.data.text;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
 * Porter stemmer: reduces a word to its stem by stripping common English
 * suffixes in six ordered steps.
 *
 * <p>The helpers share their working state through two arguments:
 * <ul>
 *   <li>{@code b} - the character buffer holding the word being stemmed;</li>
 *   <li>{@code jk} - a two-element array where {@code jk[1]} ("k") is the index
 *       of the last character of the current word and {@code jk[0]} ("j") is
 *       the index of the last character of the stem preceding the most
 *       recently matched suffix (written by {@code EndsWith}).</li>
 * </ul>
 *
 * <p>Only the lower-case letters a, e, i, o, u (and y, depending on position)
 * are recognised as vowels; the input is not lower-cased by this class.
 */
public class PorterStemmer implements TextFilter, Serializable {
    private static final long serialVersionUID = -283938886140264059L;

    /**
     * Stems every word of the given list.
     *
     * @param words the words to stem
     * @return a new list holding the stem of each input word, in the same order
     */
    @Override
    public List<String> filter(List<String> words) {
        List<String> result = new ArrayList<>();
        for (String word : words) {
            result.add(Stem(word));
        }
        return result;
    }

    /**
     * Stems a single word.
     *
     * <p>Words of fewer than three characters are returned unchanged.
     *
     * @param word the word to stem; must not be {@code null}
     * @return the stem of {@code word}
     * @throws NullPointerException if {@code word} is {@code null}
     */
    public static String Stem(String word) {
        char[] b = word.toCharArray();
        // jk[0] = j (end of stem before the last matched suffix), jk[1] = k (end of word)
        int[] jk = {0, b.length - 1};
        if (jk[1] > 1) {
            Step1(b, jk);
            Step2(b, jk);
            Step3(b, jk);
            Step4(b, jk);
            Step5(b, jk);
            Step6(b, jk);
        }
        return new String(b, 0, jk[1] + 1);
    }

    /**
     * m() measures the number of consonant sequences between 0 and j
     * (inclusive). If c is a consonant sequence and v a vowel sequence, and
     * {@code <..>} indicates arbitrary presence:
     *
     * <pre>{@code
     *   <c><v>       gives 0
     *   <c>vc<v>     gives 1
     *   <c>vcvc<v>   gives 2
     *   <c>vcvcvc<v> gives 3
     *   ....
     * }</pre>
     *
     * @param b the word buffer
     * @param j index of the last character to consider
     * @return the number of vowel-sequence/consonant-sequence pairs in b[0..j]
     */
    private static int GetConsonantSeqCount(char[] b, int j) {
        int n = 0;
        int i = 0;
        // skip the optional leading consonant sequence; stop at the first vowel
        while (true) {
            if (i > j) return n;
            if (!IsConsonant(b, i)) break;
            i++;
        }
        i++;
        while (true) {
            // skip the rest of the vowel sequence; stop at the next consonant
            while (true) {
                if (i > j) return n;
                if (IsConsonant(b, i)) break;
                i++;
            }
            i++;
            n++; // a vowel sequence followed by a consonant has been seen
            // skip the rest of the consonant sequence; stop at the next vowel
            while (true) {
                if (i > j) return n;
                if (!IsConsonant(b, i)) break;
                i++;
            }
            i++;
        }
    }

    /**
     * Tells whether b[i] is a consonant.
     *
     * <p>a, e, i, o, u are never consonants. 'y' is a consonant at index 0;
     * elsewhere it is a consonant exactly when the preceding character is not.
     * Every other character counts as a consonant.
     *
     * @param b the word buffer
     * @param i index of the character to classify
     * @return {@code true} if b[i] is a consonant
     */
    private static boolean IsConsonant(char[] b, int i) {
        switch (b[i]) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
                return false;
            case 'y':
                return i == 0 || !IsConsonant(b, i - 1);
            default:
                return true;
        }
    }

    /**
     * vowelinstem() is true <=> b[0..j] contains a vowel.
     *
     * @param b the word buffer
     * @param j index of the last character to inspect
     * @return {@code true} if any of b[0..j] is not a consonant
     */
    private static boolean ContainsVowel(char[] b, int j) {
        for (int i = 0; i <= j; i++) {
            if (!IsConsonant(b, i)) return true;
        }
        return false;
    }

    /**
     * cvc(i) is true <=> i-2, i-1, i has the form consonant - vowel - consonant
     * and also the second consonant is not w, x or y. This is used when trying
     * to restore an e at the end of a short word, e.g.
     *
     * <pre>
     *   cav(e), lov(e), hop(e), crim(e), but
     *   snow, box, tray.
     * </pre>
     *
     * @param b the word buffer
     * @param i index of the final consonant of the candidate triple
     * @return {@code true} if the triple ending at i matches the pattern
     */
    private static boolean IsConsonantVowelConsonant(char[] b, int i) {
        if (i < 2 || !IsConsonant(b, i) || IsConsonant(b, i - 1) || !IsConsonant(b, i - 2)) {
            return false;
        }
        char ch = b[i];
        return ch != 'w' && ch != 'x' && ch != 'y';
    }

    /**
     * Tests whether the current word b[0..k] ends with {@code s}.
     *
     * <p>On a match, jk[0] is set to the index just before the suffix; on a
     * mismatch jk is left untouched.
     *
     * @param b  the word buffer
     * @param jk the j/k state; jk[0] is updated on success
     * @param s  the suffix to look for
     * @return {@code true} if the word ends with {@code s}
     */
    private static boolean EndsWith(char[] b, int[] jk, String s) {
        int l = s.length();
        int o = jk[1] - l + 1;
        if (o < 0) return false;
        for (int i = 0; i < l; i++) {
            if (b[o + i] != s.charAt(i)) return false;
        }
        jk[0] = jk[1] - l;
        return true;
    }

    /**
     * Overwrites the buffer from index jk[0]+1 with {@code s} and moves the
     * end-of-word index jk[1] to the last character written.
     *
     * <p>No bounds check is done here; callers only use it where the
     * replacement fits inside the buffer.
     *
     * @param b  the word buffer
     * @param jk the j/k state; jk[1] is updated
     * @param s  the replacement suffix
     */
    private static void Replace(char[] b, int[] jk, String s) {
        int l = s.length();
        int o = jk[0] + 1;
        for (int i = 0; i < l; i++) {
            b[o + i] = s.charAt(i);
        }
        jk[1] = jk[0] + l;
    }

    /**
     * doublec(j) is true <=> j, (j-1) contain a double consonant.
     *
     * @param b the word buffer
     * @param j index of the second character of the pair
     * @return {@code true} if b[j-1] == b[j] and that character is a consonant
     */
    private static boolean IsDoubleConsonant(char[] b, int j) {
        if (j < 1) return false;
        if (b[j] != b[j - 1]) return false;
        return IsConsonant(b, j);
    }

    /**
     * Step1() gets rid of plurals and -ed or -ing. e.g.
     *
     * <pre>
     *   caresses  ->  caress
     *   ponies    ->  poni
     *   ties      ->  ti
     *   caress    ->  caress
     *   cats      ->  cat
     *
     *   feed      ->  feed
     *   agreed    ->  agree
     *   disabled  ->  disable
     *
     *   matting   ->  mat
     *   mating    ->  mate
     *   meeting   ->  meet
     *   milling   ->  mill
     *   messing   ->  mess
     *
     *   meetings  ->  meet
     * </pre>
     *
     * @param b  the word buffer
     * @param jk the j/k state
     */
    private static void Step1(char[] b, int[] jk) {
        // plural endings
        if (b[jk[1]] == 's') {
            if (EndsWith(b, jk, "sses")) {
                jk[1] -= 2;
            } else if (EndsWith(b, jk, "ies")) {
                Replace(b, jk, "i");
            } else if (b[jk[1] - 1] != 's') {
                jk[1]--;
            }
        }

        // -eed, -ed, -ing
        if (EndsWith(b, jk, "eed")) {
            if (GetConsonantSeqCount(b, jk[0]) > 0) jk[1]--;
        } else if ((EndsWith(b, jk, "ed") || EndsWith(b, jk, "ing")) && ContainsVowel(b, jk[0])) {
            jk[1] = jk[0]; // drop the suffix; j and k now both mark the end of the stem
            if (EndsWith(b, jk, "at")) {
                Replace(b, jk, "ate");
            } else if (EndsWith(b, jk, "bl")) {
                Replace(b, jk, "ble");
            } else if (EndsWith(b, jk, "iz")) {
                Replace(b, jk, "ize");
            } else if (IsDoubleConsonant(b, jk[1])) {
                // undouble the final consonant, except for l, s and z
                jk[1]--;
                char ch = b[jk[1]];
                if (ch == 'l' || ch == 's' || ch == 'z') jk[1]++;
            } else if (GetConsonantSeqCount(b, jk[0]) == 1 && IsConsonantVowelConsonant(b, jk[1])) {
                // short stem ending consonant-vowel-consonant: restore the e
                Replace(b, jk, "e");
            }
        }
    }

    /**
     * Step2() turns terminal y to i when there is another vowel in the stem,
     * e.g. happy -> happi, while sky stays sky.
     *
     * @param b  the word buffer
     * @param jk the j/k state
     */
    private static void Step2(char[] b, int[] jk) {
        // jk[0] must be read AFTER EndsWith has set it to the end of the stem
        // before the y; a value left over from Step1 would test the wrong range.
        if (EndsWith(b, jk, "y") && ContainsVowel(b, jk[0])) {
            b[jk[1]] = 'i';
        }
    }

    /**
     * Replaces the suffix just matched by {@code EndsWith} with {@code s},
     * but only if the stem before it has m() > 0.
     *
     * @param b  the word buffer
     * @param jk the j/k state, with jk[0] set by the preceding EndsWith call
     * @param s  the replacement suffix
     */
    private static void r(char[] b, int[] jk, String s) {
        if (GetConsonantSeqCount(b, jk[0]) > 0) {
            Replace(b, jk, s);
        }
    }

    /**
     * Step3() maps double suffices to single ones. so -ization ( = -ize plus
     * -ation) maps to -ize etc. note that the string before the suffix must
     * give m() > 0.
     *
     * @param b  the word buffer
     * @param jk the j/k state
     */
    private static void Step3(char[] b, int[] jk) {
        if (jk[1] == 0) return;
        // dispatch on the penultimate letter; within a case the first matching
        // suffix wins, whether or not the m() > 0 condition then allows the rewrite
        switch (b[jk[1] - 1]) {
            case 'a':
                if (EndsWith(b, jk, "ational")) r(b, jk, "ate");
                else if (EndsWith(b, jk, "tional")) r(b, jk, "tion");
                break;
            case 'c':
                if (EndsWith(b, jk, "enci")) r(b, jk, "ence");
                else if (EndsWith(b, jk, "anci")) r(b, jk, "ance");
                break;
            case 'e':
                if (EndsWith(b, jk, "izer")) r(b, jk, "ize");
                break;
            case 'l':
                if (EndsWith(b, jk, "bli")) r(b, jk, "ble");
                else if (EndsWith(b, jk, "alli")) r(b, jk, "al");
                else if (EndsWith(b, jk, "entli")) r(b, jk, "ent");
                else if (EndsWith(b, jk, "eli")) r(b, jk, "e");
                else if (EndsWith(b, jk, "ousli")) r(b, jk, "ous");
                break;
            case 'o':
                if (EndsWith(b, jk, "ization")) r(b, jk, "ize");
                else if (EndsWith(b, jk, "ation")) r(b, jk, "ate");
                else if (EndsWith(b, jk, "ator")) r(b, jk, "ate");
                break;
            case 's':
                if (EndsWith(b, jk, "alism")) r(b, jk, "al");
                else if (EndsWith(b, jk, "iveness")) r(b, jk, "ive");
                else if (EndsWith(b, jk, "fulness")) r(b, jk, "ful");
                else if (EndsWith(b, jk, "ousness")) r(b, jk, "ous");
                break;
            case 't':
                if (EndsWith(b, jk, "aliti")) r(b, jk, "al");
                else if (EndsWith(b, jk, "iviti")) r(b, jk, "ive");
                else if (EndsWith(b, jk, "biliti")) r(b, jk, "ble");
                break;
            case 'g':
                if (EndsWith(b, jk, "logi")) r(b, jk, "log");
                break;
        }
    }

    /**
     * Step4() deals with -ic-, -full, -ness etc. similar strategy to Step3.
     *
     * @param b  the word buffer
     * @param jk the j/k state
     */
    private static void Step4(char[] b, int[] jk) {
        // dispatch on the final letter
        switch (b[jk[1]]) {
            case 'e':
                if (EndsWith(b, jk, "icate")) r(b, jk, "ic");
                else if (EndsWith(b, jk, "ative")) r(b, jk, "");
                else if (EndsWith(b, jk, "alize")) r(b, jk, "al");
                break;
            case 'i':
                if (EndsWith(b, jk, "iciti")) r(b, jk, "ic");
                break;
            case 'l':
                if (EndsWith(b, jk, "ical")) r(b, jk, "ic");
                else if (EndsWith(b, jk, "ful")) r(b, jk, "");
                break;
            case 's':
                if (EndsWith(b, jk, "ness")) r(b, jk, "");
                break;
        }
    }

    /**
     * Step5() takes off -ant, -ence etc., in context {@code <c>vcvc<v>}.
     *
     * <p>A {@code break} out of the switch means "a suffix matched, now apply
     * the m() > 1 test"; a {@code return} means "nothing to strip".
     *
     * @param b  the word buffer
     * @param jk the j/k state
     */
    private static void Step5(char[] b, int[] jk) {
        if (jk[1] == 0) return; /* for Bug 1 */
        switch (b[jk[1] - 1]) {
            case 'a':
                if (EndsWith(b, jk, "al")) break;
                return;
            case 'c':
                if (EndsWith(b, jk, "ance")) break;
                if (EndsWith(b, jk, "ence")) break;
                return;
            case 'e':
                if (EndsWith(b, jk, "er")) break;
                return;
            case 'i':
                if (EndsWith(b, jk, "ic")) break;
                return;
            case 'l':
                if (EndsWith(b, jk, "able")) break;
                if (EndsWith(b, jk, "ible")) break;
                return;
            case 'n':
                if (EndsWith(b, jk, "ant")) break;
                if (EndsWith(b, jk, "ement")) break;
                if (EndsWith(b, jk, "ment")) break;
                /* element etc. not stripped before the m */
                if (EndsWith(b, jk, "ent")) break;
                return;
            case 'o':
                // -ion is only stripped after s or t
                if (EndsWith(b, jk, "ion") && jk[0] >= 0 && (b[jk[0]] == 's' || b[jk[0]] == 't')) break;
                /* j >= 0 fixes Bug 2 */
                if (EndsWith(b, jk, "ou")) break;
                return;
            /* takes care of -ous */
            case 's':
                if (EndsWith(b, jk, "ism")) break;
                return;
            case 't':
                if (EndsWith(b, jk, "ate")) break;
                if (EndsWith(b, jk, "iti")) break;
                return;
            case 'u':
                if (EndsWith(b, jk, "ous")) break;
                return;
            case 'v':
                if (EndsWith(b, jk, "ive")) break;
                return;
            case 'z':
                if (EndsWith(b, jk, "ize")) break;
                return;
            default:
                return;
        }
        if (GetConsonantSeqCount(b, jk[0]) > 1) {
            jk[1] = jk[0];
        }
    }

    /**
     * Step6() removes a final -e if m() > 1 (or m() == 1 and the stem does not
     * end consonant-vowel-consonant), and reduces a final -ll to -l if m() > 1.
     *
     * @param b  the word buffer
     * @param jk the j/k state
     */
    private static void Step6(char[] b, int[] jk) {
        jk[0] = jk[1]; // measure over the whole current word
        if (b[jk[1]] == 'e') {
            int a = GetConsonantSeqCount(b, jk[0]);
            if (a > 1 || (a == 1 && !IsConsonantVowelConsonant(b, jk[1] - 1))) {
                jk[1]--;
            }
        }
        if (b[jk[1]] == 'l' && IsDoubleConsonant(b, jk[1]) && GetConsonantSeqCount(b, jk[0]) > 1) {
            jk[1]--;
        }
    }

    /**
     * Returns a fresh stemmer; the class declares no instance fields, so there
     * is nothing to copy.
     *
     * @return a new {@code PorterStemmer}
     */
    @Override
    public Object clone() {
        return new PorterStemmer();
    }
}