weka.core.stemmers.LovinsStemmer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 * LovinsStemmer.java
 * Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stemmers;

import java.util.HashMap;

import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;

/**
 
 * A stemmer based on the Lovins stemmer, described here:

 * 

 * Julie Beth Lovins (1968). Development of a stemming algorithm. Mechanical Translation and Computational Linguistics. 11:22-31.
 * 
 
 * 
 
 * BibTeX:
 * 
 * @article{Lovins1968,
 *    author = {Julie Beth Lovins},
 *    journal = {Mechanical Translation and Computational Linguistics},
 *    pages = {22-31},
 *    title = {Development of a stemming algorithm},
 *    volume = {11},
 *    year = {1968}
 * }
 * 
 * 
 
 *
 * @author  Eibe Frank (eibe at cs dot waikato dot ac dot nz)
 * @version $Revision: 8034 $
 */
public class LovinsStemmer 
  implements Stemmer, TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = -6113024782588197L;
  
  /** Enters C version compatibility mode if set to true (emulates
    features of the original C implementation that are inconsistent
    with the algorithm as described in Lovins's paper) */
  private static boolean m_CompMode = false;

  /** The hash tables containing the list of endings. */
  private static HashMap m_l11 = null;
  private static HashMap m_l10 = null;
  private static HashMap m_l9 = null;
  private static HashMap m_l8 = null;
  private static HashMap m_l7 = null;
  private static HashMap m_l6 = null;
  private static HashMap m_l5 = null;
  private static HashMap m_l4 = null;
  private static HashMap m_l3 = null;
  private static HashMap m_l2 = null;
  private static HashMap m_l1 = null;

  static {

    m_l11 = new HashMap();
    m_l11.put("alistically", "B");
    m_l11.put("arizability", "A");
    m_l11.put("izationally", "B");
    m_l10 = new HashMap();
    m_l10.put("antialness", "A");
    m_l10.put("arisations", "A");
    m_l10.put("arizations", "A");
    m_l10.put("entialness", "A");
    m_l9 = new HashMap();
    m_l9.put("allically", "C");
    m_l9.put("antaneous", "A");
    m_l9.put("antiality", "A");
    m_l9.put("arisation", "A");
    m_l9.put("arization", "A");
    m_l9.put("ationally", "B");
    m_l9.put("ativeness", "A");
    m_l9.put("eableness", "E");
    m_l9.put("entations", "A");
    m_l9.put("entiality", "A");
    m_l9.put("entialize", "A");
    m_l9.put("entiation", "A");
    m_l9.put("ionalness", "A");
    m_l9.put("istically", "A");
    m_l9.put("itousness", "A");
    m_l9.put("izability", "A");
    m_l9.put("izational", "A");
    m_l8 = new HashMap();
    m_l8.put("ableness", "A");
    m_l8.put("arizable", "A");
    m_l8.put("entation", "A");
    m_l8.put("entially", "A");
    m_l8.put("eousness", "A");
    m_l8.put("ibleness", "A");
    m_l8.put("icalness", "A");
    m_l8.put("ionalism", "A");
    m_l8.put("ionality", "A");
    m_l8.put("ionalize", "A");
    m_l8.put("iousness", "A");
    m_l8.put("izations", "A");
    m_l8.put("lessness", "A");
    m_l7 = new HashMap();
    m_l7.put("ability", "A");
    m_l7.put("aically", "A");
    m_l7.put("alistic", "B");
    m_l7.put("alities", "A");
    m_l7.put("ariness", "E");
    m_l7.put("aristic", "A");
    m_l7.put("arizing", "A");
    m_l7.put("ateness", "A");
    m_l7.put("atingly", "A");
    m_l7.put("ational", "B");
    m_l7.put("atively", "A");
    m_l7.put("ativism", "A");
    m_l7.put("elihood", "E");
    m_l7.put("encible", "A");
    m_l7.put("entally", "A");
    m_l7.put("entials", "A");
    m_l7.put("entiate", "A");
    m_l7.put("entness", "A");
    m_l7.put("fulness", "A");
    m_l7.put("ibility", "A");
    m_l7.put("icalism", "A");
    m_l7.put("icalist", "A");
    m_l7.put("icality", "A");
    m_l7.put("icalize", "A");
    m_l7.put("ication", "G");
    m_l7.put("icianry", "A");
    m_l7.put("ination", "A");
    m_l7.put("ingness", "A");
    m_l7.put("ionally", "A");
    m_l7.put("isation", "A");
    m_l7.put("ishness", "A");
    m_l7.put("istical", "A");
    m_l7.put("iteness", "A");
    m_l7.put("iveness", "A");
    m_l7.put("ivistic", "A");
    m_l7.put("ivities", "A");
    m_l7.put("ization", "F");
    m_l7.put("izement", "A");
    m_l7.put("oidally", "A");
    m_l7.put("ousness", "A");
    m_l6 = new HashMap();
    m_l6.put("aceous", "A");
    m_l6.put("acious", "B");
    m_l6.put("action", "G");
    m_l6.put("alness", "A");
    m_l6.put("ancial", "A");
    m_l6.put("ancies", "A");
    m_l6.put("ancing", "B");
    m_l6.put("ariser", "A");
    m_l6.put("arized", "A");
    m_l6.put("arizer", "A");
    m_l6.put("atable", "A");
    m_l6.put("ations", "B");
    m_l6.put("atives", "A");
    m_l6.put("eature", "Z");
    m_l6.put("efully", "A");
    m_l6.put("encies", "A");
    m_l6.put("encing", "A");
    m_l6.put("ential", "A");
    m_l6.put("enting", "C");
    m_l6.put("entist", "A");
    m_l6.put("eously", "A");
    m_l6.put("ialist", "A");
    m_l6.put("iality", "A");
    m_l6.put("ialize", "A");
    m_l6.put("ically", "A");
    m_l6.put("icance", "A");
    m_l6.put("icians", "A");
    m_l6.put("icists", "A");
    m_l6.put("ifully", "A");
    m_l6.put("ionals", "A");
    m_l6.put("ionate", "D");
    m_l6.put("ioning", "A");
    m_l6.put("ionist", "A");
    m_l6.put("iously", "A");
    m_l6.put("istics", "A");
    m_l6.put("izable", "E");
    m_l6.put("lessly", "A");
    m_l6.put("nesses", "A");
    m_l6.put("oidism", "A");
    m_l5 = new HashMap();
    m_l5.put("acies", "A");
    m_l5.put("acity", "A");
    m_l5.put("aging", "B");
    m_l5.put("aical", "A");
    if (!m_CompMode) {
      m_l5.put("alist", "A");
    }
    m_l5.put("alism", "B");
    m_l5.put("ality", "A");
    m_l5.put("alize", "A");
    m_l5.put("allic", "b");
    m_l5.put("anced", "B");
    m_l5.put("ances", "B");
    m_l5.put("antic", "C");
    m_l5.put("arial", "A");
    m_l5.put("aries", "A");
    m_l5.put("arily", "A");
    m_l5.put("arity", "B");
    m_l5.put("arize", "A");
    m_l5.put("aroid", "A");
    m_l5.put("ately", "A");
    m_l5.put("ating", "I");
    m_l5.put("ation", "B");
    m_l5.put("ative", "A");
    m_l5.put("ators", "A");
    m_l5.put("atory", "A");
    m_l5.put("ature", "E");
    m_l5.put("early", "Y");
    m_l5.put("ehood", "A");
    m_l5.put("eless", "A");
    if (!m_CompMode) {
      m_l5.put("elily", "A");
    } else {
      m_l5.put("elity", "A");
    }
    m_l5.put("ement", "A");
    m_l5.put("enced", "A");
    m_l5.put("ences", "A");
    m_l5.put("eness", "E");
    m_l5.put("ening", "E");
    m_l5.put("ental", "A");
    m_l5.put("ented", "C");
    m_l5.put("ently", "A");
    m_l5.put("fully", "A");
    m_l5.put("ially", "A");
    m_l5.put("icant", "A");
    m_l5.put("ician", "A");
    m_l5.put("icide", "A");
    m_l5.put("icism", "A");
    m_l5.put("icist", "A");
    m_l5.put("icity", "A");
    m_l5.put("idine", "I");
    m_l5.put("iedly", "A");
    m_l5.put("ihood", "A");
    m_l5.put("inate", "A");
    m_l5.put("iness", "A");
    m_l5.put("ingly", "B");
    m_l5.put("inism", "J");
    m_l5.put("inity", "c");
    m_l5.put("ional", "A");
    m_l5.put("ioned", "A");
    m_l5.put("ished", "A");
    m_l5.put("istic", "A");
    m_l5.put("ities", "A");
    m_l5.put("itous", "A");
    m_l5.put("ively", "A");
    m_l5.put("ivity", "A");
    m_l5.put("izers", "F");
    m_l5.put("izing", "F");
    m_l5.put("oidal", "A");
    m_l5.put("oides", "A");
    m_l5.put("otide", "A");
    m_l5.put("ously", "A");
    m_l4 = new HashMap();
    m_l4.put("able", "A");
    m_l4.put("ably", "A");
    m_l4.put("ages", "B");
    m_l4.put("ally", "B");
    m_l4.put("ance", "B");
    m_l4.put("ancy", "B");
    m_l4.put("ants", "B");
    m_l4.put("aric", "A");
    m_l4.put("arly", "K");
    m_l4.put("ated", "I");
    m_l4.put("ates", "A");
    m_l4.put("atic", "B");
    m_l4.put("ator", "A");
    m_l4.put("ealy", "Y");
    m_l4.put("edly", "E");
    m_l4.put("eful", "A");
    m_l4.put("eity", "A");
    m_l4.put("ence", "A");
    m_l4.put("ency", "A");
    m_l4.put("ened", "E");
    m_l4.put("enly", "E");
    m_l4.put("eous", "A");
    m_l4.put("hood", "A");
    m_l4.put("ials", "A");
    m_l4.put("ians", "A");
    m_l4.put("ible", "A");
    m_l4.put("ibly", "A");
    m_l4.put("ical", "A");
    m_l4.put("ides", "L");
    m_l4.put("iers", "A");
    m_l4.put("iful", "A");
    m_l4.put("ines", "M");
    m_l4.put("ings", "N");
    m_l4.put("ions", "B");
    m_l4.put("ious", "A");
    m_l4.put("isms", "B");
    m_l4.put("ists", "A");
    m_l4.put("itic", "H");
    m_l4.put("ized", "F");
    m_l4.put("izer", "F");
    m_l4.put("less", "A");
    m_l4.put("lily", "A");
    m_l4.put("ness", "A");
    m_l4.put("ogen", "A");
    m_l4.put("ward", "A");
    m_l4.put("wise", "A");
    m_l4.put("ying", "B");
    m_l4.put("yish", "A");
    m_l3 = new HashMap();
    m_l3.put("acy", "A");
    m_l3.put("age", "B");
    m_l3.put("aic", "A");
    m_l3.put("als", "b");
    m_l3.put("ant", "B");
    m_l3.put("ars", "O");
    m_l3.put("ary", "F");
    m_l3.put("ata", "A");
    m_l3.put("ate", "A");
    m_l3.put("eal", "Y");
    m_l3.put("ear", "Y");
    m_l3.put("ely", "E");
    m_l3.put("ene", "E");
    m_l3.put("ent", "C");
    m_l3.put("ery", "E");
    m_l3.put("ese", "A");
    m_l3.put("ful", "A");
    m_l3.put("ial", "A");
    m_l3.put("ian", "A");
    m_l3.put("ics", "A");
    m_l3.put("ide", "L");
    m_l3.put("ied", "A");
    m_l3.put("ier", "A");
    m_l3.put("ies", "P");
    m_l3.put("ily", "A");
    m_l3.put("ine", "M");
    m_l3.put("ing", "N");
    m_l3.put("ion", "Q");
    m_l3.put("ish", "C");
    m_l3.put("ism", "B");
    m_l3.put("ist", "A");
    m_l3.put("ite", "a");
    m_l3.put("ity", "A");
    m_l3.put("ium", "A");
    m_l3.put("ive", "A");
    m_l3.put("ize", "F");
    m_l3.put("oid", "A");
    m_l3.put("one", "R");
    m_l3.put("ous", "A");
    m_l2 = new HashMap();
    m_l2.put("ae", "A"); 
    m_l2.put("al", "b");
    m_l2.put("ar", "X");
    m_l2.put("as", "B");
    m_l2.put("ed", "E");
    m_l2.put("en", "F");
    m_l2.put("es", "E");
    m_l2.put("ia", "A");
    m_l2.put("ic", "A");
    m_l2.put("is", "A");
    m_l2.put("ly", "B");
    m_l2.put("on", "S");
    m_l2.put("or", "T");
    m_l2.put("um", "U");
    m_l2.put("us", "V");
    m_l2.put("yl", "R");
    m_l2.put("s\'", "A");
    m_l2.put("\'s", "A");
    m_l1 = new HashMap();
    m_l1.put("a", "A");
    m_l1.put("e", "A");
    m_l1.put("i", "A");
    m_l1.put("o", "A");
    m_l1.put("s", "W");
    m_l1.put("y", "B");	
  }

  /**
   * Returns a string describing the stemmer
   * @return a description suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return 
        "A stemmer based on the Lovins stemmer, described here:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing 
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   * 
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation 	result;
    
    result = new TechnicalInformation(Type.ARTICLE);
    result.setValue(Field.AUTHOR, "Julie Beth Lovins");
    result.setValue(Field.YEAR, "1968");
    result.setValue(Field.TITLE, "Development of a stemming algorithm");
    result.setValue(Field.JOURNAL, "Mechanical Translation and Computational Linguistics");
    result.setValue(Field.VOLUME, "11");
    result.setValue(Field.PAGES, "22-31");

    return result;
  }

  /**
   * Finds and removes ending from given word.
   * 
   * @param word	the word to work on
   * @return 		the processed word
   */
  private String removeEnding(String word) {

    int length = word.length();
    int el = 11;

    while (el > 0) {
      if (length - el > 1) {
        String ending = word.substring(length - el);
        String conditionCode = null;
        switch (el) {
          case 11: conditionCode = (String)m_l11.get(ending);
                   break;
          case 10: conditionCode = (String)m_l10.get(ending);
                   break; 
          case 9: conditionCode = (String)m_l9.get(ending);
                  break;
          case 8: conditionCode = (String)m_l8.get(ending);
                  break;   
          case 7: conditionCode = (String)m_l7.get(ending);
                  break;   
          case 6: conditionCode = (String)m_l6.get(ending);
                  break;   
          case 5: conditionCode = (String)m_l5.get(ending);
                  break;   
          case 4: conditionCode = (String)m_l4.get(ending);
                  break;   
          case 3: conditionCode = (String)m_l3.get(ending);
                  break;   
          case 2: conditionCode = (String)m_l2.get(ending);
                  break;   
          case 1: conditionCode = (String)m_l1.get(ending);
                  break;   
          default:
        }
        if (conditionCode != null) {
          switch (conditionCode.charAt(0)) {
            case 'A':
              return word.substring(0, length - el);
            case 'B':
              if (length - el > 2) {
                return word.substring(0, length - el);
              }
              break;
            case 'C':
              if (length - el > 3) {
                return word.substring(0, length - el);
              }
              break;
            case 'D':
              if (length - el > 4) {
                return word.substring(0, length - el);
              }
              break;
            case 'E':
              if (word.charAt(length - el - 1) != 'e') {
                return word.substring(0, length - el);
              }
              break;
            case 'F':
              if ((length - el > 2) &&
                  (word.charAt(length - el - 1) != 'e')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'G':
              if ((length - el > 2) &&
                  (word.charAt(length - el - 1) == 'f')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'H':
              if ((word.charAt(length - el - 1) == 't') ||
                  ((word.charAt(length - el - 1) == 'l') &&
                   (word.charAt(length - el - 2) == 'l'))) {
                return word.substring(0, length - el);
                   }
              break;
            case 'I':
              if ((word.charAt(length - el - 1) != 'o') &&
                  (word.charAt(length - el - 1) != 'e')) { 
                return word.substring(0, length - el);
                  }
              break;
            case 'J':
              if ((word.charAt(length - el - 1) != 'a') &&
                  (word.charAt(length - el - 1) != 'e')) { 
                return word.substring(0, length - el);
                  }
              break;
            case 'K':
              if ((length - el > 2) &&
                  ((word.charAt(length - el - 1) == 'l') ||
                   (word.charAt(length - el - 1) == 'i') ||
                   ((word.charAt(length - el - 1) == 'e') &&
                    (word.charAt(length - el - 3) == 'u')))) {
                return word.substring(0, length - el);
                    }
              break;
            case 'L':
              if ((word.charAt(length - el - 1) != 'u') &&
                  (word.charAt(length - el - 1) != 'x') &&
                  ((word.charAt(length - el - 1) != 's') ||
                   (word.charAt(length - el - 2) == 'o'))) {
                return word.substring(0, length - el);
                   }
              break;
            case 'M':
              if ((word.charAt(length - el - 1) != 'a') &&
                  (word.charAt(length - el - 1) != 'c') &&
                  (word.charAt(length - el - 1) != 'e') &&
                  (word.charAt(length - el - 1) != 'm')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'N':
              if ((length - el > 3) || 
                  ((length - el == 3) &&
                   ((word.charAt(length - el - 3) != 's')))) {
                return word.substring(0, length - el);
                   }
              break;
            case 'O':
              if ((word.charAt(length - el - 1) == 'l') ||
                  (word.charAt(length - el - 1) == 'i')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'P':
              if (word.charAt(length - el - 1) != 'c') {
                return word.substring(0, length - el);
              }
              break;
            case 'Q':
              if ((length - el > 2) &&
                  (word.charAt(length - el - 1) != 'l') &&
                  (word.charAt(length - el - 1) != 'n')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'R':
              if ((word.charAt(length - el - 1) == 'n') ||
                  (word.charAt(length - el - 1) == 'r')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'S':
              if (((word.charAt(length - el - 1) == 'r') &&
                    (word.charAt(length - el - 2) == 'd')) ||
                  ((word.charAt(length - el - 1) == 't') &&
                   (word.charAt(length - el - 2) != 't'))) {
                return word.substring(0, length - el);
                   }
              break;
            case 'T':
              if ((word.charAt(length - el - 1) == 's') ||
                  ((word.charAt(length - el - 1) == 't') &&
                   (word.charAt(length - el - 2) != 'o'))) {
                return word.substring(0, length - el);
                   }
              break;
            case 'U':
              if ((word.charAt(length - el - 1) == 'l') ||
                  (word.charAt(length - el - 1) == 'm') ||
                  (word.charAt(length - el - 1) == 'n') ||
                  (word.charAt(length - el - 1) == 'r')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'V':
              if (word.charAt(length - el - 1) == 'c') {
                return word.substring(0, length - el);
              }
              break;
            case 'W':
              if ((word.charAt(length - el - 1) != 's') &&
                  (word.charAt(length - el - 1) != 'u')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'X':
              if ((word.charAt(length - el - 1) == 'l') ||
                  (word.charAt(length - el - 1) == 'i') ||
                  ((length - el > 2) &&
                   (word.charAt(length - el - 1) == 'e') &&
                   (word.charAt(length - el - 3) == 'u'))) {
                return word.substring(0, length - el);
                   }
              break;
            case 'Y':
              if ((word.charAt(length - el - 1) == 'n') &&
                  (word.charAt(length - el - 2) == 'i')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'Z':
              if (word.charAt(length - el - 1) != 'f') {
                return word.substring(0, length - el);
              }
              break;
            case 'a':
              if ((word.charAt(length - el - 1) == 'd') ||
                  (word.charAt(length - el - 1) == 'f') ||
                  (((word.charAt(length - el - 1) == 'h') &&
                    (word.charAt(length - el - 2) == 'p'))) ||
                  (((word.charAt(length - el - 1) == 'h') &&
                    (word.charAt(length - el - 2) == 't'))) ||
                  (word.charAt(length - el - 1) == 'l') ||
                  (((word.charAt(length - el - 1) == 'r') &&
                    (word.charAt(length - el - 2) == 'e'))) ||
                  (((word.charAt(length - el - 1) == 'r') &&
                    (word.charAt(length - el - 2) == 'o'))) ||
                  (((word.charAt(length - el - 1) == 's') &&
                    (word.charAt(length - el - 2) == 'e'))) ||
                  (word.charAt(length - el - 1) == 't')) {
                return word.substring(0, length - el);
                  }
              break;
            case 'b':
              if (m_CompMode) {
                if (((length - el == 3 ) &&
                      (!((word.charAt(length - el - 1) == 't') &&
                         (word.charAt(length - el - 2) == 'e') &&
                         (word.charAt(length - el - 3) == 'm')))) ||
                    ((length - el > 3) &&
                     (!((word.charAt(length - el - 1) == 't') &&
                        (word.charAt(length - el - 2) == 's') &&
                        (word.charAt(length - el - 3) == 'y') &&
                        (word.charAt(length - el - 4) == 'r'))))) {
                  return word.substring(0, length - el);
                        }
              } else {
                if ((length - el > 2) &&
                    (!((word.charAt(length - el - 1) == 't') &&
                       (word.charAt(length - el - 2) == 'e') &&
                       (word.charAt(length - el - 3) == 'm'))) &&
                    ((length - el < 4) ||
                     (!((word.charAt(length - el - 1) == 't') &&
                        (word.charAt(length - el - 2) == 's') &&
                        (word.charAt(length - el - 3) == 'y') &&
                        (word.charAt(length - el - 4) == 'r'))))) {
                  return word.substring(0, length - el);
                        }
              } 
              break;
            case 'c':
              if (word.charAt(length - el - 1) == 'l') {
                return word.substring(0, length - el);
              }
              break;
            default:
              throw new IllegalArgumentException("Fatal error.");
          }
        }
      }
      el--;
    }
    return word;
  }

  /**
   * Recodes ending of given word.
   * 
   * @param word	the word to work on
   * @return		the processed word
   */
  private String recodeEnding(String word) {

    int lastPos = word.length() - 1;

    // Rule 1
    if (word.endsWith("bb") ||
        word.endsWith("dd") ||
        word.endsWith("gg") ||
        word.endsWith("ll") ||
        word.endsWith("mm") ||
        word.endsWith("nn") ||
        word.endsWith("pp") ||
        word.endsWith("rr") ||
        word.endsWith("ss") ||
        word.endsWith("tt")) {
      word = word.substring(0, lastPos);
      lastPos--;
        }

    // Rule 2
    if (word.endsWith("iev")) {
      word = word.substring(0, lastPos - 2).concat("ief");
    }

    // Rule 3
    if (word.endsWith("uct")) {
      word = word.substring(0, lastPos - 2).concat("uc");
      lastPos--;
    }

    // Rule 4
    if (word.endsWith("umpt")) {
      word = word.substring(0, lastPos - 3).concat("um");
      lastPos -= 2;
    }

    // Rule 5
    if (word.endsWith("rpt")) {
      word = word.substring(0, lastPos - 2).concat("rb");
      lastPos--;
    }

    // Rule 6
    if (word.endsWith("urs")) {
      word = word.substring(0, lastPos - 2).concat("ur");
      lastPos--;
    }

    // Rule 7
    if (word.endsWith("istr")) {
      word = word.substring(0, lastPos - 3).concat("ister");
      lastPos++;
    }

    // Rule 7a
    if (word.endsWith("metr")) {
      word = word.substring(0, lastPos - 3).concat("meter");
      lastPos++;
    }

    // Rule 8
    if (word.endsWith("olv")) {
      word = word.substring(0, lastPos - 2).concat("olut");
      lastPos++;
    }

    // Rule 9
    if (word.endsWith("ul")) {
      if ((lastPos - 2 < 0) ||
          ((word.charAt(lastPos - 2) != 'a') &&
           (word.charAt(lastPos - 2) != 'i') &&
           (word.charAt(lastPos - 2) != 'o'))) {
        word = word.substring(0, lastPos - 1).concat("l");
        lastPos--;
           }
    }

    // Rule 10
    if (word.endsWith("bex")) {
      word = word.substring(0, lastPos - 2).concat("bic");
    }

    // Rule 11
    if (word.endsWith("dex")) {
      word = word.substring(0, lastPos - 2).concat("dic");
    }

    // Rule 12
    if (word.endsWith("pex")) {
      word = word.substring(0, lastPos - 2).concat("pic");
    }

    // Rule 13
    if (word.endsWith("tex")) {
      word = word.substring(0, lastPos - 2).concat("tic");
    }

    // Rule 14
    if (word.endsWith("ax")) {
      word = word.substring(0, lastPos - 1).concat("ac");
    }

    // Rule 15
    if (word.endsWith("ex")) {
      word = word.substring(0, lastPos - 1).concat("ec");
    }

    // Rule 16
    if (word.endsWith("ix")) {
      word = word.substring(0, lastPos - 1).concat("ic");
    }

    // Rule 17
    if (word.endsWith("lux")) {
      word = word.substring(0, lastPos - 2).concat("luc");
    }

    // Rule 18
    if (word.endsWith("uad")) {
      word = word.substring(0, lastPos - 2).concat("uas");
    }

    // Rule 19
    if (word.endsWith("vad")) {
      word = word.substring(0, lastPos - 2).concat("vas");
    }

    // Rule 20
    if (word.endsWith("cid")) {
      word = word.substring(0, lastPos - 2).concat("cis");
    }

    // Rule 21
    if (word.endsWith("lid")) {
      word = word.substring(0, lastPos - 2).concat("lis");
    }

    // Rule 22
    if (word.endsWith("erid")) {
      word = word.substring(0, lastPos - 3).concat("eris");
    }

    // Rule 23
    if (word.endsWith("pand")) {
      word = word.substring(0, lastPos - 3).concat("pans");
    }

    // Rule 24
    if (word.endsWith("end")) {
      if ((lastPos - 3 < 0) ||
          (word.charAt(lastPos - 3) != 's')) {
        word = word.substring(0, lastPos - 2).concat("ens");
          }
    }

    // Rule 25
    if (word.endsWith("ond")) {
      word = word.substring(0, lastPos - 2).concat("ons");
    }

    // Rule 26
    if (word.endsWith("lud")) {
      word = word.substring(0, lastPos - 2).concat("lus");
    }

    // Rule 27
    if (word.endsWith("rud")) {
      word = word.substring(0, lastPos - 2).concat("rus");
    }

    // Rule 28
    if (word.endsWith("her")) {
      if ((lastPos - 3 < 0) ||
          ((word.charAt(lastPos - 3) != 'p') &&
           (word.charAt(lastPos - 3) != 't'))) {
        word = word.substring(0, lastPos - 2).concat("hes");
           }
    }

    // Rule 29
    if (word.endsWith("mit")) {
      word = word.substring(0, lastPos - 2).concat("mis");
    }

    // Rule 30
    if (word.endsWith("end")) {
      if ((lastPos - 3 < 0) ||
          (word.charAt(lastPos - 3) != 'm')) {
        word = word.substring(0, lastPos - 2).concat("ens");
          }
    }

    // Rule 31
    if (word.endsWith("ert")) {
      word = word.substring(0, lastPos - 2).concat("ers");
    }

    // Rule 32
    if (word.endsWith("et")) {
      if ((lastPos - 2 < 0) ||
          (word.charAt(lastPos - 2) != 'n')) {
        word = word.substring(0, lastPos - 1).concat("es");
          }
    }

    // Rule 33
    if (word.endsWith("yt")) {
      word = word.substring(0, lastPos - 1).concat("ys");
    }

    // Rule 34
    if (word.endsWith("yz")) {
      word = word.substring(0, lastPos - 1).concat("ys");
    }

    return word;
  }

  /**
   * Returns the stemmed version of the given word.
   * Word is converted to lower case before stemming.
   * 
   * @param word 	a string consisting of a single word
   * @return 		the stemmed word
   */
  public String stem(String word) {

    if (word.length() > 2) {
      return recodeEnding(removeEnding(word.toLowerCase()));
    } else {
      return word.toLowerCase();
    }
  }

  /**
   * Stems everything in the given string. String
   * is converted to lower case before stemming.
   * 
   * @param str		the string to stem
   * @return 		the processed string
   */
  public String stemString(String str) {

    StringBuffer result = new StringBuffer();
    int start = -1;
    for (int j = 0; j < str.length(); j++) {
      char c = str.charAt(j);
      if (Character.isLetterOrDigit(c)) {
        if (start == -1) {
          start = j;
        }
      } else if (c == '\'') {
        if (start == -1) {
          result.append(c);
        }
      } else {
        if (start != -1) {
          result.append(stem(str.substring(start, j)));
          start = -1;
        }
        result.append(c);
      }
    }
    if (start != -1) {
      result.append(stem(str.substring(start, str.length())));
    }
    return result.toString();  
  }

  /**
   * returns a string representation of the stemmer
   * 
   * @return a string representation of the stemmer
   */
  public String toString() {
    return getClass().getName();
  }
  
  /**
   * Returns the revision string.
   * 
   * @return		the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 8034 $");
  }

  /**
   * Runs the stemmer with the given options
   *
   * @param args      the options
   */
  public static void main(String[] args) {
    try {
      Stemming.useStemmer(new LovinsStemmer(), args);
    }
    catch (Exception e) {
      e.printStackTrace();
    }
  }
}