All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.analysis.en.KStemmer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
This file was partially derived from the
original CIIR University of Massachusetts Amherst version of KStemmer.java (license for
the original shown below)
 */

/*
 Copyright © 2003,
 Center for Intelligent Information Retrieval,
 University of Massachusetts, Amherst.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice, this
 list of conditions and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright notice,
 this list of conditions and the following disclaimer in the documentation
 and/or other materials provided with the distribution.

 3. The names "Center for Intelligent Information Retrieval" and
 "University of Massachusetts" must not be used to endorse or promote products
 derived from this software without prior written permission. To obtain
 permission, contact [email protected].

 THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.
 */
package org.apache.lucene.analysis.en;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
 * <p>Title: Kstemmer</p>
 * <p>Description: This is a java version of Bob Krovetz' kstem stemmer</p>
 * <p>Copyright: Copyright 2008, Lucid Imagination, Inc.</p>
 * <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu)</p>
*/ /** * This class implements the Kstem algorithm */ public class KStemmer { static private final int MaxWordLen = 50; static private final String[] exceptionWords = {"aide", "bathe", "caste", "cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage", "grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane", "pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune", "sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite", "swinge", "tare", "tine", "tope", "tripe", "twine"}; static private final String[][] directConflations = { {"aging", "age"}, {"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"}, {"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"}, {"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"}, {"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"}, {"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"}, {"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"}, {"aeronautical", "aeronautics"}, {"mathematical", "mathematics"}, {"political", "politics"}, {"metaphysical", "metaphysics"}, {"cylindrical", "cylinder"}, {"nazism", "nazi"}, {"ambiguity", "ambiguous"}, {"barbarity", "barbarous"}, {"credulity", "credulous"}, {"generosity", "generous"}, {"spontaneity", "spontaneous"}, {"unanimity", "unanimous"}, {"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}}; static private final String[][] countryNationality = { {"afghan", "afghanistan"}, {"african", "africa"}, {"albanian", "albania"}, {"algerian", "algeria"}, {"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"}, {"arabian", "arabia"}, {"argentine", "argentina"}, {"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"}, {"austrian", "austria"}, {"azerbaijani", "azerbaijan"}, {"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"}, {"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"}, {"bosnian", "bosnia"}, 
{"botswanan", "botswana"}, {"brazilian", "brazil"}, {"british", "britain"}, {"bulgarian", "bulgaria"}, {"burmese", "burma"}, {"californian", "california"}, {"cambodian", "cambodia"}, {"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"}, {"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"}, {"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"}, {"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"}, {"egyptian", "egypt"}, {"equadorian", "equador"}, {"eritrean", "eritrea"}, {"estonian", "estonia"}, {"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"}, {"filipino", "philippines"}, {"finnish", "finland"}, {"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"}, {"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"}, {"grenadan", "grenada"}, {"guamian", "guam"}, {"guatemalan", "guatemala"}, {"guinean", "guinea"}, {"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"}, {"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"}, {"icelandic", "iceland"}, {"indonesian", "indonesia"}, {"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"}, {"irish", "ireland"}, {"israeli", "israel"}, {"italian", "italy"}, {"jamaican", "jamaica"}, {"japanese", "japan"}, {"jordanian", "jordan"}, {"kampuchean", "cambodia"}, {"kenyan", "kenya"}, {"korean", "korea"}, {"kuwaiti", "kuwait"}, {"lankan", "lanka"}, {"laotian", "laos"}, {"latvian", "latvia"}, {"lebanese", "lebanon"}, {"liberian", "liberia"}, {"libyan", "libya"}, {"lithuanian", "lithuania"}, {"macedonian", "macedonia"}, {"madagascan", "madagascar"}, {"malaysian", "malaysia"}, {"maltese", "malta"}, {"mauritanian", "mauritania"}, {"mexican", "mexico"}, {"micronesian", "micronesia"}, {"moldovan", "moldova"}, {"monacan", "monaco"}, {"mongolian", "mongolia"}, {"montenegran", "montenegro"}, {"moroccan", "morocco"}, {"myanmar", "burma"}, {"namibian", "namibia"}, {"nepalese", "nepal"}, // {"netherlands", "dutch"}, 
{"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"}, {"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"}, {"panamanian", "panama"}, {"papuan", "papua"}, {"paraguayan", "paraguay"}, {"peruvian", "peru"}, {"portuguese", "portugal"}, {"romanian", "romania"}, {"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"}, {"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"}, {"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"}, {"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"}, {"slovenian", "slovenia"}, {"somali", "somalia"}, {"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"}, {"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"}, {"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"}, {"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"}, {"ukrainian", "ukraine"}, {"uruguayan", "uruguay"}, {"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"}, {"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"}, {"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"}, {"zambian", "zambia"}, {"zealander", "zealand"}, {"zimbabwean", "zimbabwe"}}; static private final String[] supplementDict = {"aids", "applicator", "capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere", "extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph", "toroid", "superconduct", "backscatter", "connectionism"}; static private final String[] properNouns = {"abrams", "achilles", "acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred", "algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles", "annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher", "ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor", "barbados", "barger", "bering", "brahms", "brandeis", "brussels", "bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker", "charles", "cheops", "ching", "christmas", 
"cocos", "collins", "columbus", "confucius", "conners", "connolly", "copernicus", "cramer", "cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies", "davis", "decker", "denning", "dennis", "descartes", "dickens", "doris", "douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries", "ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans", "everglades", "fairbanks", "federales", "fisher", "fitzsimmons", "fleming", "forbes", "fowler", "france", "francis", "goering", "goodling", "goths", "grenadines", "guiness", "hades", "harding", "harris", "hastings", "hawkes", "hawking", "hayes", "heights", "hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras", "hopkins", "hughes", "humphreys", "illinois", "indianapolis", "inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james", "jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius", "kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds", "levis", "leviticus", "lewis", "louis", "maccabees", "madras", "maimonides", "maldive", "massachusetts", "matthews", "mauritius", "memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed", "moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing", "nantes", "naples", "neal", "netherlands", "nevis", "nostradamus", "oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker", "pauling", "peking", "pershing", "peter", "peters", "philippines", "phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais", "ramses", "reynolds", "rhesus", "rhodes", "richards", "robins", "rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates", "texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius", "wales", "warner", "wilkins", "williams", "wyoming", "xmas", "yonkers", "zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares", "aquinas", "arcturus", "ares", "artemis", "augustus", "ayers", "barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing", 
"boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius", "cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester", "connally", "conner", "coors", "cummings", "curtis", "daedalus", "dionysus", "dobbs", "dolores", "edmonds"}; static class DictEntry { boolean exception; String root; DictEntry(String root, boolean isException) { this.root = root; this.exception = isException; } } private static final CharArrayMap dict_ht = initializeDictHash(); /*** * caching off private int maxCacheSize; private CharArrayMap cache = * null; private static final String SAME = "SAME"; // use if stemmed form is * the same ***/ private final OpenStringBuilder word = new OpenStringBuilder(); private int j; /* index of final letter in stem (within word) */ private int k; /* * INDEX of final letter in word. You must add 1 to k to get * the current length of word. When you want the length of * word, use the method wordLength, which returns (k+1). */ /* * private void initializeStemHash() { if (maxCacheSize > 0) cache = new * CharArrayMap(maxCacheSize,false); } ***/ private char finalChar() { return word.charAt(k); } private char penultChar() { return word.charAt(k - 1); } private boolean isVowel(int index) { return !isCons(index); } private boolean isCons(int index) { char ch; ch = word.charAt(index); if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false; if ((ch != 'y') || (index == 0)) return true; else return (!isCons(index - 1)); } private static CharArrayMap initializeDictHash() { DictEntry defaultEntry; DictEntry entry; CharArrayMap d = new CharArrayMap<>(1000, false); for (int i = 0; i < exceptionWords.length; i++) { if (!d.containsKey(exceptionWords[i])) { entry = new DictEntry(exceptionWords[i], true); d.put(exceptionWords[i], entry); } else { throw new RuntimeException("Warning: Entry [" + exceptionWords[i] + "] already in dictionary 1"); } } for (int i = 0; i < directConflations.length; i++) { if 
(!d.containsKey(directConflations[i][0])) { entry = new DictEntry(directConflations[i][1], false); d.put(directConflations[i][0], entry); } else { throw new RuntimeException("Warning: Entry [" + directConflations[i][0] + "] already in dictionary 2"); } } for (int i = 0; i < countryNationality.length; i++) { if (!d.containsKey(countryNationality[i][0])) { entry = new DictEntry(countryNationality[i][1], false); d.put(countryNationality[i][0], entry); } else { throw new RuntimeException("Warning: Entry [" + countryNationality[i][0] + "] already in dictionary 3"); } } defaultEntry = new DictEntry(null, false); String[] array; array = KStemData1.data; for (int i = 0; i < array.length; i++) { if (!d.containsKey(array[i])) { d.put(array[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); } } array = KStemData2.data; for (int i = 0; i < array.length; i++) { if (!d.containsKey(array[i])) { d.put(array[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); } } array = KStemData3.data; for (int i = 0; i < array.length; i++) { if (!d.containsKey(array[i])) { d.put(array[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); } } array = KStemData4.data; for (int i = 0; i < array.length; i++) { if (!d.containsKey(array[i])) { d.put(array[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); } } array = KStemData5.data; for (int i = 0; i < array.length; i++) { if (!d.containsKey(array[i])) { d.put(array[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); } } array = KStemData6.data; for (int i = 0; i < array.length; i++) { if (!d.containsKey(array[i])) { d.put(array[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + array[i] + "] already in 
dictionary 4"); } } array = KStemData7.data; for (int i = 0; i < array.length; i++) { if (!d.containsKey(array[i])) { d.put(array[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + array[i] + "] already in dictionary 4"); } } for (int i = 0; i < KStemData8.data.length; i++) { if (!d.containsKey(KStemData8.data[i])) { d.put(KStemData8.data[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + KStemData8.data[i] + "] already in dictionary 4"); } } for (int i = 0; i < supplementDict.length; i++) { if (!d.containsKey(supplementDict[i])) { d.put(supplementDict[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + supplementDict[i] + "] already in dictionary 5"); } } for (int i = 0; i < properNouns.length; i++) { if (!d.containsKey(properNouns[i])) { d.put(properNouns[i], defaultEntry); } else { throw new RuntimeException("Warning: Entry [" + properNouns[i] + "] already in dictionary 6"); } } return d; } private boolean isAlpha(char ch) { return ch >= 'a' && ch <= 'z'; // terms must be lowercased already } /* length of stem within word */ private int stemLength() { return j + 1; }; private boolean endsIn(char[] s) { if (s.length > k) return false; int r = word.length() - s.length; /* length of word before this suffix */ j = k; for (int r1 = r, i = 0; i < s.length; i++, r1++) { if (s[i] != word.charAt(r1)) return false; } j = r - 1; /* index of the character BEFORE the posfix */ return true; } private boolean endsIn(char a, char b) { if (2 > k) return false; // check left to right since the endings have often already matched if (word.charAt(k - 1) == a && word.charAt(k) == b) { j = k - 2; return true; } return false; } private boolean endsIn(char a, char b, char c) { if (3 > k) return false; if (word.charAt(k - 2) == a && word.charAt(k - 1) == b && word.charAt(k) == c) { j = k - 3; return true; } return false; } private boolean endsIn(char a, char b, char c, char d) { if (4 > k) return false; if 
(word.charAt(k - 3) == a && word.charAt(k - 2) == b && word.charAt(k - 1) == c && word.charAt(k) == d) { j = k - 4; return true; } return false; } private DictEntry wordInDict() { /*** * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0, * word.size()) != matchedEntry) { * System.out.println("Uh oh... cached entry doesn't match"); } return * matchedEntry; } ***/ if (matchedEntry != null) return matchedEntry; DictEntry e = dict_ht.get(word.getArray(), 0, word.length()); if (e != null && !e.exception) { matchedEntry = e; // only cache if it's not an exception. } // lookups.add(word.toString()); return e; } /* Convert plurals to singular form, and '-ies' to 'y' */ private void plural() { if (word.charAt(k) == 's') { if (endsIn('i', 'e', 's')) { word.setLength(j + 3); k--; if (lookup()) /* ensure calories -> calorie */ return; k++; word.unsafeWrite('s'); setSuffix("y"); lookup(); } else if (endsIn('e', 's')) { /* try just removing the "s" */ word.setLength(j + 2); k--; /* * note: don't check for exceptions here. So, `aides' -> `aide', but * `aided' -> `aid'. The exception for double s is used to prevent * crosses -> crosse. This is actually correct if crosses is a plural * noun (a type of racket used in lacrosse), but the verb is much more * common */ /**** * YCS: this was the one place where lookup was not followed by return. * So restructure it. 
if ((j>0)&&(lookup(word.toString())) && * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return; *****/ boolean tryE = j > 0 && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's')); if (tryE && lookup()) return; /* try removing the "es" */ word.setLength(j + 1); k--; if (lookup()) return; /* the default is to retain the "e" */ word.unsafeWrite('e'); k++; if (!tryE) lookup(); // if we didn't try the "e" ending before return; } else { if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) { /* unless the word ends in "ous" or a double "s", remove the final "s" */ word.setLength(k); k--; lookup(); } } } } private void setSuffix(String s) { setSuff(s, s.length()); } /* replace old suffix with s */ private void setSuff(String s, int len) { word.setLength(j + 1); for (int l = 0; l < len; l++) { word.unsafeWrite(s.charAt(l)); } k = j + len; } /* Returns true if the word is found in the dictionary */ // almost all uses of lookup() return immediately and are // followed by another lookup in the dict. Store the match // to avoid this double lookup. DictEntry matchedEntry = null; private boolean lookup() { /****** * debugging code String thisLookup = word.toString(); boolean added = * lookups.add(thisLookup); if (!added) { * System.out.println("######extra lookup:" + thisLookup); // occaasional * extra lookups aren't necessarily errors... could happen by diff * manipulations // throw new RuntimeException("######extra lookup:" + * thisLookup); } else { // System.out.println("new lookup:" + thisLookup); * } ******/ matchedEntry = dict_ht.get(word.getArray(), 0, word.size()); return matchedEntry != null; } // Set lookups = new HashSet<>(); /* convert past tense (-ed) to present, and `-ied' to `y' */ private void pastTense() { /* * Handle words less than 5 letters with a direct mapping This prevents * (fled -> fl). 
*/ if (word.length() <= 4) return; if (endsIn('i', 'e', 'd')) { word.setLength(j + 3); k--; if (lookup()) /* we almost always want to convert -ied to -y, but */ return; /* this isn't true for short words (died->die) */ k++; /* I don't know any long words that this applies to, */ word.unsafeWrite('d'); /* but just in case... */ setSuffix("y"); lookup(); return; } /* the vowelInStem() is necessary so we don't stem acronyms */ if (endsIn('e', 'd') && vowelInStem()) { /* see if the root ends in `e' */ word.setLength(j + 2); k = j + 1; DictEntry entry = wordInDict(); if (entry != null) if (!entry.exception) /* * if it's in the dictionary and * not an exception */ return; /* try removing the "ed" */ word.setLength(j + 1); k = j; if (lookup()) return; /* * try removing a doubled consonant. if the root isn't found in the * dictionary, the default is to leave it doubled. This will correctly * capture `backfilled' -> `backfill' instead of `backfill' -> * `backfille', and seems correct most of the time */ if (doubleC(k)) { word.setLength(k); k--; if (lookup()) return; word.unsafeWrite(word.charAt(k)); k++; lookup(); return; } /* if we have a `un-' prefix, then leave the word alone */ /* (this will sometimes screw up with `under-', but we */ /* will take care of that later) */ if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) { word.unsafeWrite('e'); word.unsafeWrite('d'); k = k + 2; // nolookup() return; } /* * it wasn't found by just removing the `d' or the `ed', so prefer to end * with an `e' (e.g., `microcoded' -> `microcode'). 
*/ word.setLength(j + 1); word.unsafeWrite('e'); k = j + 1; // nolookup() - we already tried the "e" ending return; } } /* return TRUE if word ends with a double consonant */ private boolean doubleC(int i) { if (i < 1) return false; if (word.charAt(i) != word.charAt(i - 1)) return false; return (isCons(i)); } private boolean vowelInStem() { for (int i = 0; i < stemLength(); i++) { if (isVowel(i)) return true; } return false; } /* handle `-ing' endings */ private void aspect() { /* * handle short words (aging -> age) via a direct mapping. This prevents * (thing -> the) in the version of this routine that ignores inflectional * variants that are mentioned in the dictionary (when the root is also * present) */ if (word.length() <= 5) return; /* the vowelinstem() is necessary so we don't stem acronyms */ if (endsIn('i', 'n', 'g') && vowelInStem()) { /* try adding an `e' to the stem and check against the dictionary */ word.setCharAt(j + 1, 'e'); word.setLength(j + 2); k = j + 1; DictEntry entry = wordInDict(); if (entry != null) { if (!entry.exception) /* if it's in the dictionary and not an exception */ return; } /* adding on the `e' didn't work, so remove it */ word.setLength(k); k--; /* note that `ing' has also been removed */ if (lookup()) return; /* if I can remove a doubled consonant and get a word, then do so */ if (doubleC(k)) { k--; word.setLength(k + 1); if (lookup()) return; word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */ /* the default is to leave the consonant doubled */ /* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */ /* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */ /* Without making the algorithm significantly more complicated, this */ /* is the best I can do */ k++; lookup(); return; } /* * the word wasn't in the dictionary after removing the stem, and then * checking with and without a final `e'. 
The default is to add an `e' * unless the word ends in two consonants, so `microcoding' -> * `microcode'. The two consonants restriction wouldn't normally be * necessary, but is needed because we don't try to deal with prefixes and * compounds, and most of the time it is correct (e.g., footstamping -> * footstamp, not footstampe; however, decoupled -> decoupl). We can * prevent almost all of the incorrect stems if we try to do some prefix * analysis first */ if ((j > 0) && isCons(j) && isCons(j - 1)) { k = j; word.setLength(k + 1); // nolookup() because we already did according to the comment return; } word.setLength(j + 1); word.unsafeWrite('e'); k = j + 1; // nolookup(); we already tried an 'e' ending return; } } /* * this routine deals with -ity endings. It accepts -ability, -ibility, and * -ality, even without checking the dictionary because they are so * productive. The first two are mapped to -ble, and the -ity is remove for * the latter */ private void ityEndings() { int old_k = k; if (endsIn('i', 't', 'y')) { word.setLength(j + 1); /* try just removing -ity */ k = j; if (lookup()) return; word.unsafeWrite('e'); /* try removing -ity and adding -e */ k = j + 1; if (lookup()) return; word.setCharAt(j + 1, 'i'); word.append("ty"); k = old_k; /* * the -ability and -ibility endings are highly productive, so just accept * them */ if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) { word.setLength(j - 1); word.append("le"); /* convert to -ble */ k = j; lookup(); return; } /* ditto for -ivity */ if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) { word.setLength(j + 1); word.unsafeWrite('e'); /* convert to -ive */ k = j + 1; lookup(); return; } /* ditto for -ality */ if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) { word.setLength(j + 1); k = j; lookup(); return; } /* * if the root isn't in the dictionary, and the variant *is* there, then * use the variant. 
This allows `immunity'->`immune', but prevents * `capacity'->`capac'. If neither the variant nor the root form are in * the dictionary, then remove the ending as a default */ if (lookup()) return; /* the default is to remove -ity altogether */ word.setLength(j + 1); k = j; // nolookup(), we already did it. return; } } /* handle -ence and -ance */ private void nceEndings() { int old_k = k; char word_char; if (endsIn('n', 'c', 'e')) { word_char = word.charAt(j); if (!((word_char == 'e') || (word_char == 'a'))) return; word.setLength(j); word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */ k = j; if (lookup()) return; word.setLength(j); /* * try removing -e/ance altogether * (disappearance/disappear) */ k = j - 1; if (lookup()) return; word.unsafeWrite(word_char); /* restore the original ending */ word.append("nce"); k = old_k; // nolookup() because we restored the original ending } return; } /* handle -ness */ private void nessEndings() { if (endsIn('n', 'e', 's', 's')) { /* * this is a very productive endings, so * just accept it */ word.setLength(j + 1); k = j; if (word.charAt(j) == 'i') word.setCharAt(j, 'y'); lookup(); } return; } /* handle -ism */ private void ismEndings() { if (endsIn('i', 's', 'm')) { /* * this is a very productive ending, so just * accept it */ word.setLength(j + 1); k = j; lookup(); } return; } /* this routine deals with -ment endings. */ private void mentEndings() { int old_k = k; if (endsIn('m', 'e', 'n', 't')) { word.setLength(j + 1); k = j; if (lookup()) return; word.append("ment"); k = old_k; // nolookup } return; } /* this routine deals with -ize endings. 
*/ private void izeEndings() { int old_k = k; if (endsIn('i', 'z', 'e')) { word.setLength(j + 1); /* try removing -ize entirely */ k = j; if (lookup()) return; word.unsafeWrite('i'); if (doubleC(j)) { /* allow for a doubled consonant */ word.setLength(j); k = j - 1; if (lookup()) return; word.unsafeWrite(word.charAt(j - 1)); } word.setLength(j + 1); word.unsafeWrite('e'); /* try removing -ize and adding -e */ k = j + 1; if (lookup()) return; word.setLength(j + 1); word.append("ize"); k = old_k; // nolookup() } return; } /* handle -ency and -ancy */ private void ncyEndings() { if (endsIn('n', 'c', 'y')) { if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return; word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */ word.setLength(j + 3); k = j + 2; if (lookup()) return; word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */ word.unsafeWrite('e'); k = j + 3; lookup(); } return; } /* handle -able and -ible */ private void bleEndings() { int old_k = k; char word_char; if (endsIn('b', 'l', 'e')) { if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return; word_char = word.charAt(j); word.setLength(j); /* try just removing the ending */ k = j - 1; if (lookup()) return; if (doubleC(k)) { /* allow for a doubled consonant */ word.setLength(k); k--; if (lookup()) return; k++; word.unsafeWrite(word.charAt(k - 1)); } word.setLength(j); word.unsafeWrite('e'); /* try removing -a/ible and adding -e */ k = j; if (lookup()) return; word.setLength(j); word.append("ate"); /* try removing -able and adding -ate */ /* (e.g., compensable/compensate) */ k = j + 2; if (lookup()) return; word.setLength(j); word.unsafeWrite(word_char); /* restore the original values */ word.append("ble"); k = old_k; // nolookup() } return; } /* * handle -ic endings. This is fairly straightforward, but this is also the * only place we try *expanding* an ending, -ic -> -ical. 
This is to handle * cases like `canonic' -> `canonical' */ private void icEndings() { if (endsIn('i', 'c')) { word.setLength(j + 3); word.append("al"); /* try converting -ic to -ical */ k = j + 4; if (lookup()) return; word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */ word.setLength(j + 2); k = j + 1; if (lookup()) return; word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */ if (lookup()) return; word.setLength(j + 1); /* try removing -ic altogether */ k = j; if (lookup()) return; word.append("ic"); /* restore the original ending */ k = j + 2; // nolookup() } return; } private static char[] ization = "ization".toCharArray(); private static char[] ition = "ition".toCharArray(); private static char[] ation = "ation".toCharArray(); private static char[] ication = "ication".toCharArray(); /* handle some derivational endings */ /* * this routine deals with -ion, -ition, -ation, -ization, and -ication. The * -ization ending is always converted to -ize */ private void ionEndings() { int old_k = k; if (!endsIn('i', 'o', 'n')) { return; } if (endsIn(ization)) { /* * the -ize ending is very productive, so simply * accept it as the root */ word.setLength(j + 3); word.unsafeWrite('e'); k = j + 3; lookup(); return; } if (endsIn(ition)) { word.setLength(j + 1); word.unsafeWrite('e'); k = j + 1; if (lookup()) /* * remove -ition and add `e', and check against the * dictionary */ return; /* (e.g., definition->define, opposition->oppose) */ /* restore original values */ word.setLength(j + 1); word.append("ition"); k = old_k; // nolookup() } else if (endsIn(ation)) { word.setLength(j + 3); word.unsafeWrite('e'); k = j + 3; if (lookup()) /* remove -ion and add `e', and check against the dictionary */ return; /* (elmination -> eliminate) */ word.setLength(j + 1); word.unsafeWrite('e'); /* * remove -ation and add `e', and check against the * dictionary */ k = j + 1; if (lookup()) return; word.setLength(j + 1);/* * just remove -ation (resignation->resign) and * check 
dictionary */ k = j; if (lookup()) return; /* restore original values */ word.setLength(j + 1); word.append("ation"); k = old_k; // nolookup() } /* * test -ication after -ation is attempted (e.g., `complication->complicate' * rather than `complication->comply') */ if (endsIn(ication)) { word.setLength(j + 1); word.unsafeWrite('y'); k = j + 1; if (lookup()) /* * remove -ication and add `y', and check against the * dictionary */ return; /* (e.g., amplification -> amplify) */ /* restore original values */ word.setLength(j + 1); word.append("ication"); k = old_k; // nolookup() } // if (endsIn(ion)) { if (true) { // we checked for this earlier... just need to set "j" j = k - 3; // YCS word.setLength(j + 1); word.unsafeWrite('e'); k = j + 1; if (lookup()) /* remove -ion and add `e', and check against the dictionary */ return; word.setLength(j + 1); k = j; if (lookup()) /* remove -ion, and if it's found, treat that as the root */ return; /* restore original values */ word.setLength(j + 1); word.append("ion"); k = old_k; // nolookup() } // nolookup(); all of the other paths restored original values return; } /* * this routine deals with -er, -or, -ier, and -eer. The -izer ending is * always converted to -ize */ private void erAndOrEndings() { int old_k = k; if (word.charAt(k) != 'r') return; // YCS char word_char; /* so we can remember if it was -er or -or */ if (endsIn('i', 'z', 'e', 'r')) { /* * -ize is very productive, so accept it * as the root */ word.setLength(j + 4); k = j + 3; lookup(); return; } if (endsIn('e', 'r') || endsIn('o', 'r')) { word_char = word.charAt(j + 1); if (doubleC(j)) { word.setLength(j); k = j - 1; if (lookup()) return; word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */ } if (word.charAt(j) == 'i') { /* do we have a -ier ending? 
*/ word.setCharAt(j, 'y'); word.setLength(j + 1); k = j; if (lookup()) /* yes, so check against the dictionary */ return; word.setCharAt(j, 'i'); /* restore the endings */ word.unsafeWrite('e'); } if (word.charAt(j) == 'e') { /* handle -eer */ word.setLength(j); k = j - 1; if (lookup()) return; word.unsafeWrite('e'); } word.setLength(j + 2); /* remove the -r ending */ k = j + 1; if (lookup()) return; word.setLength(j + 1); /* try removing -er/-or */ k = j; if (lookup()) return; word.unsafeWrite('e'); /* try removing -or and adding -e */ k = j + 1; if (lookup()) return; word.setLength(j + 1); word.unsafeWrite(word_char); word.unsafeWrite('r'); /* restore the word to the way it was */ k = old_k; // nolookup() } } /* * this routine deals with -ly endings. The -ally ending is always converted * to -al Sometimes this will temporarily leave us with a non-word (e.g., * heuristically maps to heuristical), but then the -al is removed in the next * step. */ private void lyEndings() { int old_k = k; if (endsIn('l', 'y')) { word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */ if (lookup()) return; word.setCharAt(j + 2, 'y'); word.setLength(j + 1); /* try just removing the -ly */ k = j; if (lookup()) return; if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) /* * always * convert * - * ally * to * - * al */ return; word.append("ly"); k = old_k; if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) { /* * always * convert * - * ably * to * - * able */ word.setCharAt(j + 2, 'e'); k = j + 2; return; } if (word.charAt(j) == 'i') { /* e.g., militarily -> military */ word.setLength(j); word.unsafeWrite('y'); k = j; if (lookup()) return; word.setLength(j); word.append("ily"); k = old_k; } word.setLength(j + 1); /* the default is to remove -ly */ k = j; // nolookup()... we already tried removing the "ly" variant } return; } /* * this routine deals with -al endings. Some of the endings from the previous * routine are finished up here. 
*/ private void alEndings() { int old_k = k; if (word.length() < 4) return; if (endsIn('a', 'l')) { word.setLength(j + 1); k = j; if (lookup()) /* try just removing the -al */ return; if (doubleC(j)) { /* allow for a doubled consonant */ word.setLength(j); k = j - 1; if (lookup()) return; word.unsafeWrite(word.charAt(j - 1)); } word.setLength(j + 1); word.unsafeWrite('e'); /* try removing the -al and adding -e */ k = j + 1; if (lookup()) return; word.setLength(j + 1); word.append("um"); /* try converting -al to -um */ /* (e.g., optimal - > optimum ) */ k = j + 2; if (lookup()) return; word.setLength(j + 1); word.append("al"); /* restore the ending to the way it was */ k = old_k; if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) { word.setLength(j - 1); /* try removing -ical */ k = j - 2; if (lookup()) return; word.setLength(j - 1); word.unsafeWrite('y');/* try turning -ical to -y (e.g., bibliographical) */ k = j - 1; if (lookup()) return; word.setLength(j - 1); word.append("ic"); /* the default is to convert -ical to -ic */ k = j; // nolookup() ... converting ical to ic means removing "al" which we // already tried // ERROR lookup(); return; } if (word.charAt(j) == 'i') { /* sometimes -ial endings should be removed */ word.setLength(j); /* (sometimes it gets turned into -y, but we */ k = j - 1; /* aren't dealing with that case for now) */ if (lookup()) return; word.append("ial"); k = old_k; lookup(); } } return; } /* * this routine deals with -ive endings. It normalizes some of the -ative * endings directly, and also maps some -ive endings to -ion. 
*/ private void iveEndings() { int old_k = k; if (endsIn('i', 'v', 'e')) { word.setLength(j + 1); /* try removing -ive entirely */ k = j; if (lookup()) return; word.unsafeWrite('e'); /* try removing -ive and adding -e */ k = j + 1; if (lookup()) return; word.setLength(j + 1); word.append("ive"); if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) { word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */ word.setLength(j); /* (e.g., determinative -> determine) */ k = j - 1; if (lookup()) return; word.setLength(j - 1); /* try just removing -ative */ if (lookup()) return; word.append("ative"); k = old_k; } /* try mapping -ive to -ion (e.g., injunctive/injunction) */ word.setCharAt(j + 2, 'o'); word.setCharAt(j + 3, 'n'); if (lookup()) return; word.setCharAt(j + 2, 'v'); /* restore the original values */ word.setCharAt(j + 3, 'e'); k = old_k; // nolookup() } return; } KStemmer() {} String stem(String term) { boolean changed = stem(term.toCharArray(), term.length()); if (!changed) return term; return asString(); } /** * Returns the result of the stem (assuming the word was changed) as a String. */ String asString() { String s = getString(); if (s != null) return s; return word.toString(); } CharSequence asCharSequence() { return result != null ? result : word; } String getString() { return result; } char[] getChars() { return word.getArray(); } int getLength() { return word.length(); } String result; private boolean matched() { /*** * if (!lookups.contains(word.toString())) { throw new * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup); * } ***/ // lookup(); return matchedEntry != null; } /** * Stems the text in the token. Returns true if changed. */ boolean stem(char[] term, int len) { result = null; k = len - 1; if ((k <= 1) || (k >= MaxWordLen - 1)) { return false; // don't stem } // first check the stemmer dictionaries, and avoid using the // cache if it's in there. 
DictEntry entry = dict_ht.get(term, 0, len); if (entry != null) { if (entry.root != null) { result = entry.root; return true; } return false; } /*** * caching off is normally faster if (cache == null) initializeStemHash(); * * // now check the cache, before we copy chars to "word" if (cache != null) * { String val = cache.get(term, 0, len); if (val != null) { if (val != * SAME) { result = val; return true; } return false; } } ***/ word.reset(); // allocate enough space so that an expansion is never needed word.reserve(len + 10); for (int i = 0; i < len; i++) { char ch = term[i]; if (!isAlpha(ch)) return false; // don't stem // don't lowercase... it's a requirement that lowercase filter be // used before this stemmer. word.unsafeWrite(ch); } matchedEntry = null; /*** * lookups.clear(); lookups.add(word.toString()); ***/ /* * This while loop will never be executed more than one time; it is here * only to allow the break statement to be used to escape as soon as a word * is recognized */ while (true) { // YCS: extra lookup()s were inserted so we don't need to // do an extra wordInDict() here. 
plural(); if (matched()) break; pastTense(); if (matched()) break; aspect(); if (matched()) break; ityEndings(); if (matched()) break; nessEndings(); if (matched()) break; ionEndings(); if (matched()) break; erAndOrEndings(); if (matched()) break; lyEndings(); if (matched()) break; alEndings(); if (matched()) break; entry = wordInDict(); iveEndings(); if (matched()) break; izeEndings(); if (matched()) break; mentEndings(); if (matched()) break; bleEndings(); if (matched()) break; ismEndings(); if (matched()) break; icEndings(); if (matched()) break; ncyEndings(); if (matched()) break; nceEndings(); matched(); break; } /* * try for a direct mapping (allows for cases like `Italian'->`Italy' and * `Italians'->`Italy') */ entry = matchedEntry; if (entry != null) { result = entry.root; // may be null, which means that "word" is the stem } /*** * caching off is normally faster if (cache != null && cache.size() < * maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0, * key, 0, len); if (result != null) { cache.put(key, result); } else { * cache.put(key, word.toString()); } } ***/ /*** * if (entry == null) { if (!word.toString().equals(new String(term,0,len))) * { System.out.println("CASE:" + word.toString() + "," + new * String(term,0,len)); * * } } ***/ // no entry matched means result is "word" return true; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy