org.apache.lucene.analysis.de.GermanStemmer Maven / Gradle / Ivy
Show all versions of lucene-analysis-common Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.de;
import java.util.Locale;
// This file is encoded in UTF-8
/**
* A stemmer for German words.
*
* The algorithm is based on the report "A Fast and Simple Stemming Algorithm for German Words"
* by Jörg Caumanns (joerg.caumanns at isst.fhg.de).
*/
public class GermanStemmer {
/** Buffer for the terms while stemming them. */
private StringBuilder sb = new StringBuilder();
/** Amount of characters that are removed with substitute()
while stemming. */
private int substCount = 0;
private static final Locale locale = new Locale("de", "DE");
/**
* Stemms the given term to an unique discriminator
.
*
* @param term The term that should be stemmed.
* @return Discriminator for term
*/
protected String stem(String term) {
// Use lowercase for medium stemming.
term = term.toLowerCase(locale);
if (!isStemmable(term)) return term;
// Reset the StringBuilder.
sb.delete(0, sb.length());
sb.insert(0, term);
// Stemming starts here...
substitute(sb);
strip(sb);
optimize(sb);
resubstitute(sb);
removeParticleDenotion(sb);
return sb.toString();
}
/**
* Checks if a term could be stemmed.
*
* @return true if, and only if, the given term consists in letters.
*/
private boolean isStemmable(String term) {
for (int c = 0; c < term.length(); c++) {
if (!Character.isLetter(term.charAt(c))) return false;
}
return true;
}
/**
* suffix stripping (stemming) on the current term. The stripping is reduced to the seven "base"
* suffixes "e", "s", "n", "t", "em", "er" and * "nd", from which all regular suffixes are build
* of. The simplification causes some overstemming, and way more irregular stems, but still
* provides unique. discriminators in the most of those cases. The algorithm is context free,
* except of the length restrictions.
*/
private void strip(StringBuilder buffer) {
boolean doMore = true;
while (doMore && buffer.length() > 3) {
if ((buffer.length() + substCount > 5)
&& buffer.substring(buffer.length() - 2, buffer.length()).equals("nd")) {
buffer.delete(buffer.length() - 2, buffer.length());
} else if ((buffer.length() + substCount > 4)
&& buffer.substring(buffer.length() - 2, buffer.length()).equals("em")) {
buffer.delete(buffer.length() - 2, buffer.length());
} else if ((buffer.length() + substCount > 4)
&& buffer.substring(buffer.length() - 2, buffer.length()).equals("er")) {
buffer.delete(buffer.length() - 2, buffer.length());
} else if (buffer.charAt(buffer.length() - 1) == 'e') {
buffer.deleteCharAt(buffer.length() - 1);
} else if (buffer.charAt(buffer.length() - 1) == 's') {
buffer.deleteCharAt(buffer.length() - 1);
} else if (buffer.charAt(buffer.length() - 1) == 'n') {
buffer.deleteCharAt(buffer.length() - 1);
}
// "t" occurs only as suffix of verbs.
else if (buffer.charAt(buffer.length() - 1) == 't') {
buffer.deleteCharAt(buffer.length() - 1);
} else {
doMore = false;
}
}
}
/** Does some optimizations on the term. This optimisations are contextual. */
private void optimize(StringBuilder buffer) {
// Additional step for female plurals of professions and inhabitants.
if (buffer.length() > 5
&& buffer.substring(buffer.length() - 5, buffer.length()).equals("erin*")) {
buffer.deleteCharAt(buffer.length() - 1);
strip(buffer);
}
// Additional step for irregular plural nouns like "Matrizen -> Matrix".
// NOTE: this length constraint is probably not a great value, it's just to prevent AIOOBE on
// empty terms
if (buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == ('z')) {
buffer.setCharAt(buffer.length() - 1, 'x');
}
}
/** Removes a particle denotion ("ge") from a term. */
private void removeParticleDenotion(StringBuilder buffer) {
if (buffer.length() > 4) {
for (int c = 0; c < buffer.length() - 3; c++) {
if (buffer.substring(c, c + 4).equals("gege")) {
buffer.delete(c, c + 2);
return;
}
}
}
}
/**
* Do some substitutions for the term to reduce overstemming:
*
*
- Substitute Umlauts with their corresponding vowel:{@code äöü -> aou}, "ß" is substituted
* by "ss" - Substitute a second char of a pair of equal characters with an asterisk: {@code ?? ->
* ?*} - Substitute some common character combinations with a token: {@code sch/ch/ei/ie/ig/st ->
* $/§/%/&/#/!}
*/
private void substitute(StringBuilder buffer) {
substCount = 0;
for (int c = 0; c < buffer.length(); c++) {
// Replace the second char of a pair of the equal characters with an asterisk
if (c > 0 && buffer.charAt(c) == buffer.charAt(c - 1)) {
buffer.setCharAt(c, '*');
}
// Substitute Umlauts.
else if (buffer.charAt(c) == 'ä') {
buffer.setCharAt(c, 'a');
} else if (buffer.charAt(c) == 'ö') {
buffer.setCharAt(c, 'o');
} else if (buffer.charAt(c) == 'ü') {
buffer.setCharAt(c, 'u');
}
// Fix bug so that 'ß' at the end of a word is replaced.
else if (buffer.charAt(c) == 'ß') {
buffer.setCharAt(c, 's');
buffer.insert(c + 1, 's');
substCount++;
}
// Take care that at least one character is left left side from the current one
if (c < buffer.length() - 1) {
// Masking several common character combinations with an token
if ((c < buffer.length() - 2)
&& buffer.charAt(c) == 's'
&& buffer.charAt(c + 1) == 'c'
&& buffer.charAt(c + 2) == 'h') {
buffer.setCharAt(c, '$');
buffer.delete(c + 1, c + 3);
substCount += 2;
} else if (buffer.charAt(c) == 'c' && buffer.charAt(c + 1) == 'h') {
buffer.setCharAt(c, '§');
buffer.deleteCharAt(c + 1);
substCount++;
} else if (buffer.charAt(c) == 'e' && buffer.charAt(c + 1) == 'i') {
buffer.setCharAt(c, '%');
buffer.deleteCharAt(c + 1);
substCount++;
} else if (buffer.charAt(c) == 'i' && buffer.charAt(c + 1) == 'e') {
buffer.setCharAt(c, '&');
buffer.deleteCharAt(c + 1);
substCount++;
} else if (buffer.charAt(c) == 'i' && buffer.charAt(c + 1) == 'g') {
buffer.setCharAt(c, '#');
buffer.deleteCharAt(c + 1);
substCount++;
} else if (buffer.charAt(c) == 's' && buffer.charAt(c + 1) == 't') {
buffer.setCharAt(c, '!');
buffer.deleteCharAt(c + 1);
substCount++;
}
}
}
}
/**
* Undoes the changes made by substitute(). That are character pairs and character combinations.
* Umlauts will remain as their corresponding vowel, as "ß" remains as "ss".
*/
private void resubstitute(StringBuilder buffer) {
for (int c = 0; c < buffer.length(); c++) {
if (buffer.charAt(c) == '*') {
char x = buffer.charAt(c - 1);
buffer.setCharAt(c, x);
} else if (buffer.charAt(c) == '$') {
buffer.setCharAt(c, 's');
buffer.insert(c + 1, new char[] {'c', 'h'}, 0, 2);
} else if (buffer.charAt(c) == '§') {
buffer.setCharAt(c, 'c');
buffer.insert(c + 1, 'h');
} else if (buffer.charAt(c) == '%') {
buffer.setCharAt(c, 'e');
buffer.insert(c + 1, 'i');
} else if (buffer.charAt(c) == '&') {
buffer.setCharAt(c, 'i');
buffer.insert(c + 1, 'e');
} else if (buffer.charAt(c) == '#') {
buffer.setCharAt(c, 'i');
buffer.insert(c + 1, 'g');
} else if (buffer.charAt(c) == '!') {
buffer.setCharAt(c, 's');
buffer.insert(c + 1, 't');
}
}
}
}