org.apache.lucene.analysis.de.GermanStemmer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-common Show documentation
Apache Lucene (module: common)
There is a newer version: 10.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.de;

import java.util.Locale;

// This file is encoded in UTF-8

/**
 * A stemmer for German words.
 *
 * <p>The algorithm is based on the report "A Fast and Simple Stemming Algorithm for German Words"
 * by J&ouml;rg Caumanns (joerg.caumanns at isst.fhg.de).
 *
 * <p>An instance keeps its working buffer and its substitution counter in fields that every call
 * to {@link #stem(String)} overwrites, so concurrent calls on the same instance would interfere
 * with each other.
 */
public class GermanStemmer {
  // Non-ASCII characters are written as Unicode escapes so that compiling this class does not
  // depend on the source encoding handed to the compiler.

  /** Lowercase a-umlaut, replaced by {@code a} in substitute(). */
  private static final char A_UMLAUT = '\u00e4';

  /** Lowercase o-umlaut, replaced by {@code o} in substitute(). */
  private static final char O_UMLAUT = '\u00f6';

  /** Lowercase u-umlaut, replaced by {@code u} in substitute(). */
  private static final char U_UMLAUT = '\u00fc';

  /** Sharp s, replaced by {@code ss} in substitute(). */
  private static final char SHARP_S = '\u00df';

  /** Stands in for the second character of a pair of equal characters. */
  private static final char DOUBLE_MARK = '*';

  /** Token masking {@code sch}. */
  private static final char SCH_TOKEN = '$';

  /** Token masking {@code ch} (the section sign). */
  private static final char CH_TOKEN = '\u00a7';

  /** Token masking {@code ei}. */
  private static final char EI_TOKEN = '%';

  /** Token masking {@code ie}. */
  private static final char IE_TOKEN = '&';

  /** Token masking {@code ig}. */
  private static final char IG_TOKEN = '#';

  /** Token masking {@code st}. */
  private static final char ST_TOKEN = '!';

  /** Buffer for the term while it is being stemmed; reset at the start of every stem() call. */
  private final StringBuilder sb = new StringBuilder();

  /**
   * Counter maintained by substitute(): it grows by the number of characters removed whenever a
   * character combination is masked by a token, and by one for every sharp s that is expanded to
   * "ss". strip() adds it to the buffer length in its length guards.
   */
  private int substCount = 0;

  /** Locale used for lowercasing terms (German, Germany). */
  private static final Locale locale = Locale.GERMANY;

  /**
   * Stems the given term to a unique discriminator.
   *
   * <p>The term is lowercased first. If it contains any character that is not a letter, the
   * lowercased term is returned without further processing.
   *
   * @param term The term that should be stemmed; must not be null.
   * @return Discriminator for term
   */
  protected String stem(String term) {
    // Use lowercase for medium stemming.
    term = term.toLowerCase(locale);
    if (!isStemmable(term)) {
      return term;
    }
    // Reset the shared buffer and load the term into it.
    sb.setLength(0);
    sb.append(term);
    // Stemming starts here. The order matters: strip() and optimize() work on the masked form
    // produced by substitute(), and the participle check needs the unmasked form again.
    substitute(sb);
    strip(sb);
    optimize(sb);
    resubstitute(sb);
    removeParticleDenotion(sb);
    return sb.toString();
  }

  /**
   * Checks if a term could be stemmed.
   *
   * @return true if, and only if, every character of the given term is a letter according to
   *     {@link Character#isLetter(char)}; an empty term therefore counts as stemmable.
   */
  private boolean isStemmable(String term) {
    for (int i = 0; i < term.length(); i++) {
      if (!Character.isLetter(term.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether the buffer ends with the given suffix, without allocating a substring.
   *
   * @return false if the buffer is shorter than the suffix.
   */
  private static boolean endsWith(StringBuilder buffer, String suffix) {
    int offset = buffer.length() - suffix.length();
    if (offset < 0) {
      return false;
    }
    for (int i = 0; i < suffix.length(); i++) {
      if (buffer.charAt(offset + i) != suffix.charAt(i)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Suffix stripping (stemming) on the current term. The stripping is reduced to the seven "base"
   * suffixes "e", "s", "n", "t", "em", "er" and "nd", from which all regular suffixes are built.
   * The simplification causes some overstemming, and way more irregular stems, but still provides
   * unique discriminators in most of those cases. The algorithm is context free, except for the
   * length restrictions.
   *
   * <p>Suffixes are removed repeatedly while the buffer is longer than three characters. "nd" is
   * only removed while buffer length plus substCount exceeds five, "em" and "er" only while it
   * exceeds four.
   */
  private void strip(StringBuilder buffer) {
    while (buffer.length() > 3) {
      int length = buffer.length();
      int effectiveLength = length + substCount;
      if (effectiveLength > 5 && endsWith(buffer, "nd")) {
        buffer.setLength(length - 2);
      } else if (effectiveLength > 4 && (endsWith(buffer, "em") || endsWith(buffer, "er"))) {
        buffer.setLength(length - 2);
      } else {
        char last = buffer.charAt(length - 1);
        // "t" occurs only as suffix of verbs.
        if (last == 'e' || last == 's' || last == 'n' || last == 't') {
          buffer.setLength(length - 1);
        } else {
          // Nothing left to strip.
          return;
        }
      }
    }
  }

  /** Does some optimizations on the term. These optimizations are contextual. */
  private void optimize(StringBuilder buffer) {
    // Additional step for female plurals of professions and inhabitants. The trailing '*' is the
    // mask substitute() leaves for the second "n" of "...erinn...".
    if (buffer.length() > 5 && endsWith(buffer, "erin*")) {
      buffer.setLength(buffer.length() - 1);
      strip(buffer);
    }
    // Additional step for irregular plural nouns like "Matrizen -> Matrix".
    // NOTE: this length constraint is probably not a great value, it only prevents an
    // out-of-bounds access on empty terms.
    int length = buffer.length();
    if (length > 0 && buffer.charAt(length - 1) == 'z') {
      buffer.setCharAt(length - 1, 'x');
    }
  }

  /**
   * Removes a participle prefix ("ge") from a term: in a buffer longer than four characters, the
   * first two characters of the first occurrence of "gege" are deleted.
   */
  private void removeParticleDenotion(StringBuilder buffer) {
    if (buffer.length() > 4) {
      int index = buffer.indexOf("gege");
      if (index >= 0) {
        buffer.delete(index, index + 2);
      }
    }
  }

  /**
   * Replaces the {@code width} characters starting at {@code index} with a single token and adds
   * the number of removed characters to substCount.
   */
  private void mask(StringBuilder buffer, int index, char token, int width) {
    buffer.setCharAt(index, token);
    buffer.delete(index + 1, index + width);
    substCount += width - 1;
  }

  /**
   * Do some substitutions for the term to reduce overstemming:
   *
   * <ul>
   *   <li>Substitute umlauts with their corresponding vowel: <code>&auml;&ouml;&uuml; -&gt; aou
   *       </code>; "&szlig;" is substituted by "ss".
   *   <li>Substitute the second char of a pair of equal characters with an asterisk: <code>
   *       ?? -&gt; ?*</code>.
   *   <li>Substitute some common character combinations with a token: <code>
   *       sch/ch/ei/ie/ig/st -&gt; $/&sect;/%/&amp;/#/!</code>.
   * </ul>
   *
   * <p>Resets substCount to zero before it starts.
   */
  private void substitute(StringBuilder buffer) {
    substCount = 0;
    for (int c = 0; c < buffer.length(); c++) {
      char current = buffer.charAt(c);
      if (c > 0 && current == buffer.charAt(c - 1)) {
        // Replace the second char of a pair of equal characters with an asterisk.
        buffer.setCharAt(c, DOUBLE_MARK);
      } else if (current == A_UMLAUT) {
        buffer.setCharAt(c, 'a');
      } else if (current == O_UMLAUT) {
        buffer.setCharAt(c, 'o');
      } else if (current == U_UMLAUT) {
        buffer.setCharAt(c, 'u');
      } else if (current == SHARP_S) {
        // Handled here, before the look-ahead below, so that a sharp s at the very end of a
        // word is replaced as well. The inserted second 's' is turned into an asterisk by the
        // next iteration.
        buffer.setCharAt(c, 's');
        buffer.insert(c + 1, 's');
        substCount++;
      }
      // The combinations need at least one character to the right of the current one.
      if (c < buffer.length() - 1) {
        // Re-read: the current character may just have been replaced above.
        char first = buffer.charAt(c);
        char second = buffer.charAt(c + 1);
        if (first == 's'
            && second == 'c'
            && c < buffer.length() - 2
            && buffer.charAt(c + 2) == 'h') {
          mask(buffer, c, SCH_TOKEN, 3);
        } else if (first == 'c' && second == 'h') {
          mask(buffer, c, CH_TOKEN, 2);
        } else if (first == 'e' && second == 'i') {
          mask(buffer, c, EI_TOKEN, 2);
        } else if (first == 'i' && second == 'e') {
          mask(buffer, c, IE_TOKEN, 2);
        } else if (first == 'i' && second == 'g') {
          mask(buffer, c, IG_TOKEN, 2);
        } else if (first == 's' && second == 't') {
          mask(buffer, c, ST_TOKEN, 2);
        }
      }
    }
  }

  /**
   * Undoes the changes made by substitute(), that is the masked character pairs and character
   * combinations. Umlauts remain as their corresponding vowel, and "&szlig;" remains as "ss".
   */
  private void resubstitute(StringBuilder buffer) {
    for (int c = 0; c < buffer.length(); c++) {
      switch (buffer.charAt(c)) {
        case DOUBLE_MARK:
          // substitute() only writes the mark at an index greater than zero and the later steps
          // only remove characters from the end, so there is always a character to copy.
          buffer.setCharAt(c, buffer.charAt(c - 1));
          break;
        case SCH_TOKEN:
          buffer.replace(c, c + 1, "sch");
          break;
        case CH_TOKEN:
          buffer.replace(c, c + 1, "ch");
          break;
        case EI_TOKEN:
          buffer.replace(c, c + 1, "ei");
          break;
        case IE_TOKEN:
          buffer.replace(c, c + 1, "ie");
          break;
        case IG_TOKEN:
          buffer.replace(c, c + 1, "ig");
          break;
        case ST_TOKEN:
          buffer.replace(c, c + 1, "st");
          break;
        default:
          break;
      }
    }
  }
}