// org.apache.lucene.analysis.cz.CzechStemmer Maven / Gradle / Ivy
// Show all versions of lucene-analysis-common Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cz;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Light Stemmer for Czech.
*
* Implements the algorithm described in: Indexing and stemming approaches for the Czech
* language http://portal.acm.org/citation.cfm?id=1598600
*/
class CzechStemmer {

  /** Five-letter case endings; only tried when the word is longer than 7 chars. */
  private static final String[] CASE_SUFFIXES_LEN5 = {"atech"};

  /** Four-letter case endings; only tried when the word is longer than 6 chars. */
  private static final String[] CASE_SUFFIXES_LEN4 = {"ětem", "etem", "atům"};

  /** Three-letter case endings; only tried when the word is longer than 5 chars. */
  private static final String[] CASE_SUFFIXES_LEN3 = {
    "ech", "ich", "ích", "ého", "ěmi", "emi", "ému", "ěte", "ete", "ěti", "eti", "ího", "iho",
    "ími", "ímu", "imu", "ách", "ata", "aty", "ých", "ama", "ami", "ové", "ovi", "ými"
  };

  /** Two-letter case endings; only tried when the word is longer than 4 chars. */
  private static final String[] CASE_SUFFIXES_LEN2 = {
    "em", "es", "ém", "ím", "ům", "at", "ám", "os", "us", "ým", "mi", "ou"
  };

  /** Single trailing letters dropped as a case ending when the word is longer than 3 chars. */
  private static final String CASE_FINAL_LETTERS = "aeiouůyáéíýě";

  /** Possessive endings; only tried when the word is longer than 5 chars. */
  private static final String[] POSSESSIVE_SUFFIXES = {"ov", "in", "ův"};

  /**
   * Stems a buffer of Czech text in three passes: case-ending removal, possessive-ending removal
   * and, if anything is left, normalization of the final characters (which may rewrite {@code s}
   * in place).
   *
   * <p><b>NOTE</b>: Input is expected to be in lowercase, but with diacritical marks.
   *
   * @param s input buffer
   * @param len number of valid characters in {@code s}
   * @return number of characters of {@code s} that make up the stem
   */
  int stem(char[] s, int len) {
    final int stripped = removePossessives(s, removeCase(s, len));
    // normalize() reads s[len - 1], so it must never see an empty word.
    return stripped > 0 ? normalize(s, stripped) : stripped;
  }

  /**
   * Tests the candidate suffixes in array order and stops at the first match.
   *
   * @return {@code true} if the first {@code len} chars of {@code s} end with any candidate
   */
  private static boolean endsWithAny(char[] s, int len, String[] suffixes) {
    for (String suffix : suffixes) {
      if (endsWith(s, len, suffix)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Drops at most one case ending. Longer endings are tried first, and each ending length has its
   * own minimum word length; the first tier that matches decides the result.
   *
   * @return the length without the matched ending, or {@code len} if nothing matched
   */
  private int removeCase(char[] s, int len) {
    if (len > 7 && endsWithAny(s, len, CASE_SUFFIXES_LEN5)) {
      return len - 5;
    }
    if (len > 6 && endsWithAny(s, len, CASE_SUFFIXES_LEN4)) {
      return len - 4;
    }
    if (len > 5 && endsWithAny(s, len, CASE_SUFFIXES_LEN3)) {
      return len - 3;
    }
    if (len > 4 && endsWithAny(s, len, CASE_SUFFIXES_LEN2)) {
      return len - 2;
    }
    if (len > 3 && CASE_FINAL_LETTERS.indexOf(s[len - 1]) >= 0) {
      return len - 1;
    }
    return len;
  }

  /**
   * Drops a trailing possessive ending ("ov", "in" or "ův") from words longer than 5 chars.
   *
   * @return the length without the ending, or {@code len} if nothing matched
   */
  private int removePossessives(char[] s, int len) {
    final boolean longEnough = len > 5;
    return longEnough && endsWithAny(s, len, POSSESSIVE_SUFFIXES) ? len - 2 : len;
  }

  /**
   * Rewrites the end of the word in place. Exactly one of the rules below is applied, in this
   * order of precedence. Callers must pass {@code len > 0}.
   *
   * @return the resulting length; it only shrinks (by one) when a penultimate 'e' is removed
   */
  private int normalize(char[] s, int len) {
    // Two-letter clusters: čt -> ck, št -> sk
    final boolean endsInCt = endsWith(s, len, "čt");
    if (endsInCt || endsWith(s, len, "št")) {
      s[len - 2] = endsInCt ? 'c' : 's';
      s[len - 1] = 'k';
      return len;
    }

    final char last = s[len - 1];
    if (last == 'c' || last == 'č') { // [cč] -> k
      s[len - 1] = 'k';
      return len;
    }
    if (last == 'z' || last == 'ž') { // [zž] -> h
      s[len - 1] = 'h';
      return len;
    }

    if (len > 1) {
      final char beforeLast = s[len - 2];
      if (beforeLast == 'e') {
        // e* -> * : shift the final letter over the 'e' and shorten the word by one
        s[len - 2] = last;
        return len - 1;
      }
      if (len > 2 && beforeLast == 'ů') {
        s[len - 2] = 'o'; // *ů* -> *o*
      }
    }
    return len;
  }
}