org.apache.lucene.analysis.ru.RussianStemmer Maven / Gradle / Ivy
The newest version!
package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 * <p>
 * The stemmer works on the RV region of a word (everything after the first vowel) and
 * removes, in order: a perfective gerund ending or else reflexive + adjectival/verb/noun
 * endings (step 1), a trailing "i" (step 2), a derivational ending located in R2 (step 3),
 * and finally a superlative ending, a doubled "n" and a trailing soft sign (step 4).
 * <p>
 * All ending tables hold lower-case Cyrillic letters only, so upper-case input is not matched.
 * The region offsets of the word being stemmed are kept in instance fields; a single instance
 * must therefore not be used by several threads at the same time.
 *
 * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 4.0
 */
@Deprecated
class RussianStemmer
{
    // Start offsets (in the original word) of the RV and R2 regions of the word being stemmed.
    // rv == 0 means "no RV region"; r2 == word length means "R2 region is empty".
    private int rv;
    private int r2;

    // Lower-case Cyrillic letters used by the ending tables below.
    private static final char A = '\u0430';
    private static final char V = '\u0432';
    private static final char G = '\u0433';
    private static final char E = '\u0435';
    private static final char I = '\u0438';
    private static final char I_ = '\u0439';
    private static final char L = '\u043B';
    private static final char M = '\u043C';
    private static final char N = '\u043D';
    private static final char O = '\u043E';
    private static final char S = '\u0441';
    private static final char T = '\u0442';
    private static final char U = '\u0443';
    private static final char X = '\u0445';
    private static final char SH = '\u0448';
    private static final char SHCH = '\u0449';
    private static final char Y = '\u044B';
    private static final char SOFT = '\u044C';
    private static final char AE = '\u044D';
    private static final char IU = '\u044E';
    private static final char IA = '\u044F';

    // Ending tables. Within each table shorter endings come first; findEnding() scans a table
    // from its last entry to its first, so longer endings are tried before shorter ones.

    private static final char[] VOWELS = { A, E, I, O, U, Y, AE, IU, IA };

    // Endings that are only removed when directly preceded by one of these letters.
    private static final char[][] A_OR_IA = {
        { A },
        { IA }
    };

    private static final char[][] PERFECTIVE_GERUND_ENDINGS_1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static final char[][] PERFECTIVE_GERUND_ENDINGS_2 = {
        { I, V },
        { Y, V },
        { I, V, SH, I },
        { Y, V, SH, I },
        { I, V, SH, I, S, SOFT },
        { Y, V, SH, I, S, SOFT }
    };

    private static final char[][] ADJECTIVE_ENDINGS = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        { O, M, U }
    };

    private static final char[][] PARTICIPLE_ENDINGS_1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static final char[][] PARTICIPLE_ENDINGS_2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static final char[][] REFLEXIVE_ENDINGS = {
        { S, IA },
        { S, SOFT }
    };

    private static final char[][] VERB_ENDINGS_1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static final char[][] VERB_ENDINGS_2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static final char[][] NOUN_ENDINGS = {
        { A },
        { U },
        { I_ },
        { O },
        // This slot used to repeat { U }; the noun class of the algorithm lists IU here.
        // Results do not change: VERB_ENDINGS_2 contains { IU } and verb() runs before noun().
        { IU },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static final char[][] SUPERLATIVE_ENDINGS = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static final char[][] DERIVATIONAL_ENDINGS = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    private static final char[][] DOUBLE_N = {
        { N, N }
    };

    /**
     * Creates a stemmer. The instance carries no configuration, only per-word scratch state.
     */
    public RussianStemmer()
    {
    }

    /**
     * Finds the stem for given Russian word.
     *
     * @param input the word to stem; must not be null
     * @return the stem, or {@code input} itself when the word has no RV region
     *         (no vowel, or the first vowel is the last character)
     */
    public String stem(String input)
    {
        markPositions(input);
        if (rv == 0)
        {
            return input; // RV wasn't detected, nothing to stem
        }

        // All stemming happens inside RV; the prefix before RV is re-attached untouched.
        StringBuilder stemmingZone = new StringBuilder(input.substring(rv));

        // Step 1
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);

        return input.substring(0, rv) + stemmingZone;
    }

    /**
     * Static method for stemming. Uses a fresh stemmer instance for every call.
     *
     * @param theWord the word to stem; must not be null
     * @return the stem of {@code theWord}
     */
    public static String stemWord(String theWord)
    {
        return new RussianStemmer().stem(theWord);
    }

    /**
     * Marks positions of RV and R2 in a given word (R1 is only needed as an intermediate).
     * <p>
     * {@code rv} is left at 0 when RV is empty. {@code r2} is left at the word length when R2
     * is empty, so that no suffix can ever be considered to lie inside an empty R2.
     */
    private void markPositions(String word)
    {
        final int length = word.length();
        rv = 0;
        r2 = length;

        // RV: everything after the first vowel.
        int i = skipRun(word, 0, false) + 1;
        if (i >= length)
        {
            return; // RV zone is empty
        }
        rv = i;

        // R1: everything after the first non-vowel that follows a vowel.
        i = skipRun(word, i, true) + 1;
        if (i >= length)
        {
            return; // R1 zone is empty
        }

        // R2: the same rule applied again inside R1.
        i = skipRun(word, i, false) + 1;
        if (i >= length)
        {
            return; // R2 zone is empty
        }
        i = skipRun(word, i, true) + 1;
        if (i >= length)
        {
            return; // R2 zone is empty
        }
        r2 = i;
    }

    /**
     * Returns the first index at or after {@code from} whose character is not a vowel
     * (when {@code vowelRun} is true) or is a vowel (when it is false); returns the word
     * length if the run extends to the end of the word.
     */
    private static int skipRun(String word, int from, boolean vowelRun)
    {
        int i = from;
        while (i < word.length() && isVowel(word.charAt(i)) == vowelRun)
        {
            i++;
        }
        return i;
    }

    /**
     * Checks if character is one of the lower-case Russian vowels.
     */
    private static boolean isVowel(char letter)
    {
        for (char vowel : VOWELS)
        {
            if (letter == vowel)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Finds an ending of the given class that terminates at {@code startIndex} (inclusive)
     * in the stemming zone. The class is scanned from its last entry to its first.
     *
     * @return the length of the ending found, or 0 if none matches
     */
    private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
    {
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] ending = theEndingClass[i];
            if (matchesEndingAt(stemmingZone, startIndex, ending))
            {
                return ending.length;
            }
        }
        return 0;
    }

    /**
     * Finds an ending of the given class at the very end of the stemming zone.
     *
     * @return the length of the ending found, or 0 if none matches
     */
    private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Tells whether {@code ending} occupies the characters of the zone that end at
     * {@code lastIndex} (inclusive). Endings longer than the available text never match.
     */
    private static boolean matchesEndingAt(StringBuilder stemmingZone, int lastIndex, char[] ending)
    {
        int first = lastIndex - ending.length + 1;
        if (first < 0)
        {
            return false; // the ending is bigger than the remaining stemming zone
        }
        for (int j = 0; j < ending.length; j++)
        {
            if (stemmingZone.charAt(first + j) != ending[j])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     *
     * @return true if an ending was removed
     */
    private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predecessors, and if so, removes the ending (the predecessor
     * itself stays). Only the first ending found is examined; if its predecessor check
     * fails, other endings of the class are not tried.
     *
     * @return true if an ending was removed
     */
    private boolean findAndRemoveEnding(StringBuilder stemmingZone,
        char[][] theEndingClass, char[][] thePredecessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
        {
            return false;
        }
        int lastBeforeEnding = stemmingZone.length() - endingLength - 1;
        if (findEnding(stemmingZone, lastBeforeEnding, thePredecessors) == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - endingLength);
        return true;
    }

    /**
     * Removes the last character of the zone if it equals {@code letter}.
     *
     * @return true if a character was removed
     */
    private static boolean removeTrailing(StringBuilder stemmingZone, char letter)
    {
        int last = stemmingZone.length() - 1;
        if (last >= 0 && stemmingZone.charAt(last) == letter)
        {
            stemmingZone.setLength(last);
            return true;
        }
        return false;
    }

    /**
     * Perfective gerund endings: group 1 (must follow "a" or "ia"), otherwise group 2.
     */
    private boolean perfectiveGerund(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, PERFECTIVE_GERUND_ENDINGS_1, A_OR_IA)
            || findAndRemoveEnding(stemmingZone, PERFECTIVE_GERUND_ENDINGS_2);
    }

    /**
     * Reflexive endings.
     */
    private boolean reflexive(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, REFLEXIVE_ENDINGS);
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     *
     * @return true if an adjective ending was removed (whether or not a participle
     *         ending was removed as well)
     */
    private boolean adjectival(StringBuilder stemmingZone)
    {
        if (!findAndRemoveEnding(stemmingZone, ADJECTIVE_ENDINGS))
        {
            return false;
        }
        // An adjective ending was found; additionally strip a participle ending if present.
        if (!findAndRemoveEnding(stemmingZone, PARTICIPLE_ENDINGS_1, A_OR_IA))
        {
            findAndRemoveEnding(stemmingZone, PARTICIPLE_ENDINGS_2);
        }
        return true;
    }

    /**
     * Verb endings: group 1 (must follow "a" or "ia"), otherwise group 2.
     */
    private boolean verb(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, VERB_ENDINGS_1, A_OR_IA)
            || findAndRemoveEnding(stemmingZone, VERB_ENDINGS_2);
    }

    /**
     * Noun endings.
     */
    private boolean noun(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, NOUN_ENDINGS);
    }

    /**
     * Removes a trailing "i" (step 2).
     */
    private boolean removeI(StringBuilder stemmingZone)
    {
        return removeTrailing(stemmingZone, I);
    }

    /**
     * Derivational endings; an ending is removed only when it lies entirely inside R2.
     */
    private boolean derivational(StringBuilder stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, DERIVATIONAL_ENDINGS);
        if (endingLength == 0)
        {
            return false; // no derivational ending found
        }
        // The zone starts at rv, so the ending starts at word offset rv + endingStart.
        int endingStart = stemmingZone.length() - endingLength;
        if (r2 - rv > endingStart)
        {
            return false; // the ending starts before R2
        }
        stemmingZone.setLength(endingStart);
        return true;
    }

    /**
     * Superlative endings.
     */
    private boolean superlative(StringBuilder stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, SUPERLATIVE_ENDINGS);
    }

    /**
     * Undoubles N: a zone ending in "nn" loses its last "n".
     */
    private boolean undoubleN(StringBuilder stemmingZone)
    {
        if (findEnding(stemmingZone, DOUBLE_N) == 0)
        {
            return false;
        }
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Removes a trailing soft sign.
     */
    private boolean removeSoft(StringBuilder stemmingZone)
    {
        return removeTrailing(stemmingZone, SOFT);
    }
}