All Downloads are FREE. Search and download functionalities are using the official Maven repository.

simplenlg.lexicon.Lexicon Maven / Gradle / Ivy

/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is "Simplenlg".
 *
 * The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
 * Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
 *
 * Contributor(s): Ehud Reiter, Albert Gatt, Dave Westwater, Roman Kutlak, Margaret Mitchell.
 */
package simplenlg.lexicon;

import java.util.List;

import simplenlg.framework.LexicalCategory;
import simplenlg.framework.WordElement;

/**
 * This is the generic abstract class for a Lexicon. In simplenlg V4, a
 * Lexicon is a collection of
 * {@link simplenlg.framework.WordElement} objects; it does not do any
 * morphological processing (as was the case in simplenlg V3). Information about
 * WordElement can be obtained from a database (
 * {@link simplenlg.lexicon.NIHDBLexicon}) or from an XML file (
 * {@link simplenlg.lexicon.XMLLexicon}). Simplenlg V4 comes with a default
 * (XML) lexicon, which is retrieved by the getDefaultLexicon
 * method.
 * 
 * There are several ways of retrieving words. If in doubt, use
 * lookupWord. More control is available from the
 * getXXXX methods, which allow words to retrieved in several ways
 * 
    *
  1. baseform and {@link simplenlg.framework.LexicalCategory}; for example * "university" and Noun *
  2. just baseform; for example, "university" *
  3. ID string (if this is supported by the underlying DB or XML file); for * example "E0063257" is the ID for "university" in the NIH Specialist lexicon *
  4. variant; this looks for a word given a form of the word which may be * inflected (eg, "universities") or a spelling variant (eg, "color" for * "colour"). Acronyms are not considered to be variants (eg, "UK" and * "United Kingdom" are regarded as different words).
    * Note: variant lookup is not guaranteed, this is a feature which * hopefully will develop over time *
  5. variant and {@link simplenlg.framework.LexicalCategory}; for example * "universities" and Noun *
* * For each type of lookup, there are three methods *
    *
  • getWords: get all matching * {@link simplenlg.framework.WordElement} in the Lexicon. For example, * getWords("dog") would return a List of two * WordElement, one for the noun "dog" and one for the verb "dog". * If there are no matching entries in the lexicon, this method returns an empty * collection *
  • getWord: get a single matching * {@link simplenlg.framework.WordElement} in the Lexicon. For example, * getWord("dog") would a for either the noun "dog" or the * verb "dog" (unpredictable). If there are no matching entries in * the lexicon, this method will create a default WordElement based * on the information specified. *
  • hasWord: returns true if the Lexicon contains * at least one matching WordElement *
* * @author Albert Gatt (simplenlg v3 lexicon) * @author Ehud Reiter (simplenlg v4 lexicon) */ public abstract class Lexicon { /****************************************************************************/ // constructors and related /****************************************************************************/ /** * returns the default built-in lexicon * * @return default lexicon */ public static Lexicon getDefaultLexicon() { return new XMLLexicon(); } /** * create a default WordElement. May be overridden by specific types of * lexicon * * @param baseForm * - base form of word * @param category * - category of word * @return WordElement entry for specified info */ protected WordElement createWord(String baseForm, LexicalCategory category) { return new WordElement(baseForm, category); // return default // WordElement of this // baseForm, category } /** * create a default WordElement. May be overridden by specific types of * lexicon * * @param baseForm * - base form of word * @return WordElement entry for specified info */ protected WordElement createWord(String baseForm) { return new WordElement(baseForm); // return default WordElement of this // baseForm } /***************************************************************************/ // default methods for looking up words // These try the following (in this order) // 1) word with matching base // 2) word with matching variant // 3) word with matching ID // 4) create a new workd /***************************************************************************/ /** * General word lookup method, tries base form, variant, ID (in this order) * Creates new word if can't find existing word * * @param baseForm * @param category * @return word */ public WordElement lookupWord(String baseForm, LexicalCategory category) { if (hasWord(baseForm, category)) return getWord(baseForm, category); else if (hasWordFromVariant(baseForm, category)) return getWordFromVariant(baseForm, category); else if (hasWordByID(baseForm)) return 
getWordByID(baseForm); else return createWord(baseForm, category); } /** * General word lookup method, tries base form, variant, ID (in this order) * Creates new word if can't find existing word * * @param baseForm * @return word */ public WordElement lookupWord(String baseForm) { return lookupWord(baseForm, LexicalCategory.ANY); } /****************************************************************************/ // get words by baseform and category // fundamental version is getWords(String baseForm, Category category), // this must be defined by subclasses. Other versions are convenience // methods. These may be overriden for efficiency, but this is not required. /****************************************************************************/ /** * returns all Words which have the specified base form and category * * @param baseForm * - base form of word, eg "be" or "dog" (not "is" or "dogs") * @param category * - syntactic category of word (ANY for unknown) * @return collection of all matching Words (may be empty) */ abstract public List getWords(String baseForm, LexicalCategory category); /** * get a WordElement which has the specified base form and category * * @param baseForm * - base form of word, eg "be" or "dog" (not "is" or "dogs") * @param category * - syntactic category of word (ANY for unknown) * @return if Lexicon contains such a WordElement, it is returned (the first * match is returned if there are several matches). If the Lexicon * does not contain such a WordElement, a new WordElement is created * and returned */ public WordElement getWord(String baseForm, LexicalCategory category) {// convenience // method // derived // from // other // methods List wordElements = getWords(baseForm, category); if (wordElements.isEmpty()) return createWord(baseForm, category); // return default WordElement // of this baseForm, // category else return selectMatchingWord(wordElements, baseForm); } /** choose a single WordElement from a list of WordElements. 
Prefer one * which exactly matches the baseForm * @param wordElements * - list of WordElements retrieved from lexicon * @param baseForm - base form of word, eg "be" or "dog" (not "is" or "dogs") * @return single WordElement (from list) */ private WordElement selectMatchingWord(List wordElements, String baseForm) { // EHUD REITER - this method added because some DBs are case-insensitive, // so a query on "man" returns both "man" and "MAN". In such cases, the // exact match (eg, "man") should be returned // below check is redundant, since caller should check this if (wordElements == null || wordElements.isEmpty()) return createWord(baseForm); // look for exact match in base form for (WordElement wordElement: wordElements) if (wordElement.getBaseForm().equals(baseForm)) return wordElement; // Roman Kutlak: I don't think it is a good idea to return a word whose // case does not match because if a word appears in the lexicon // as an acronym only, it will be replaced as such. For example, // "foo" will return as the acronym "FOO". This does not seem desirable. 
// else return first element in list if(wordElements.get(0).getBaseForm().equalsIgnoreCase(baseForm)) { return createWord(baseForm, LexicalCategory.ANY); } return wordElements.get(0); } /** * return true if the lexicon contains a WordElement which has * the specified base form and category * * @param baseForm * - base form of word, eg "be" or "dog" (not "is" or "dogs") * @param category * - syntactic category of word (ANY for unknown) * @return true if Lexicon contains such a WordElement */ public boolean hasWord(String baseForm, LexicalCategory category) {// convenience // method // derived // from // other // methods) // { return !getWords(baseForm, category).isEmpty(); } /** * returns all Words which have the specified base form * * @param baseForm * - base form of word, eg "be" or "dog" (not "is" or "dogs") * @return collection of all matching Words (may be empty) */ public List getWords(String baseForm) { // convenience method // derived from // other methods return getWords(baseForm, LexicalCategory.ANY); } /** * get a WordElement which has the specified base form (of any category) * * @param baseForm * - base form of word, eg "be" or "dog" (not "is" or "dogs") * @return if Lexicon contains such a WordElement, it is returned (the first * match is returned if there are several matches). 
If the Lexicon * does not contain such a WordElement, a new WordElement is created * and returned */ public WordElement getWord(String baseForm) { // convenience method derived // from other methods List wordElements = getWords(baseForm); if (wordElements.isEmpty()) return createWord(baseForm); // return default WordElement of this // baseForm else return selectMatchingWord(wordElements, baseForm); } /** * return true if the lexicon contains a WordElement which has * the specified base form (in any category) * * @param baseForm * - base form of word, eg "be" or "dog" (not "is" or "dogs") * @return true if Lexicon contains such a WordElement */ public boolean hasWord(String baseForm) {// convenience method derived from // other methods) { return !getWords(baseForm).isEmpty(); } /****************************************************************************/ // get words by ID // fundamental version is getWordsByID(String id), // this must be defined by subclasses. // Other versions are convenience methods // These may be overriden for efficiency, but this is not required. /****************************************************************************/ /** * returns a List of WordElement which have this ID. IDs are * lexicon-dependent, and should be unique. Therefore the list should * contain either zero elements (if no such word exists) or one element (if * the word is found) * * @param id * - internal lexicon ID for a word * @return either empty list (if no word with this ID exists) or list * containing the matching word */ abstract public List getWordsByID(String id); /** * get a WordElement with the specified ID * * @param id * internal lexicon ID for a word * @return WordElement with this ID if found; otherwise a new WordElement is * created with the ID as the base form */ public WordElement getWordByID(String id) { List wordElements = getWordsByID(id); if (wordElements.isEmpty()) return createWord(id); // return WordElement based on ID; may help // in debugging... 
else return wordElements.get(0); // else return first match } /** * return true if the lexicon contains a WordElement which the * specified ID * * @param id * - internal lexicon ID for a word * @return true if Lexicon contains such a WordElement */ public boolean hasWordByID(String id) {// convenience method derived from // other methods) { return !getWordsByID(id).isEmpty(); } /****************************************************************************/ // get words by variant - try to return a WordElement given an inflectional // or spelling // variant. For the moment, acronyms are considered as separate words, not // variants // (this may change in the future) // fundamental version is getWordsFromVariant(String baseForm, Category // category), // this must be defined by subclasses. Other versions are convenience // methods. These may be overriden for efficiency, but this is not required. /****************************************************************************/ /** * returns Words which have an inflected form and/or spelling variant that * matches the specified variant, and are in the specified category.
* Note: the returned word list may not be complete, it depends on * how it is implemented by the underlying lexicon * * @param variant * - base form, inflected form, or spelling variant of word * @param category * - syntactic category of word (ANY for unknown) * @return list of all matching Words (empty list if no matching WordElement * found) */ abstract public List getWordsFromVariant(String variant, LexicalCategory category); /** * returns a WordElement which has the specified inflected form and/or * spelling variant that matches the specified variant, of the specified * category * * @param variant * - base form, inflected form, or spelling variant of word * @param category * - syntactic category of word (ANY for unknown) * @return a matching WordElement (if found), otherwise a new word is * created using thie variant as the base form */ public WordElement getWordFromVariant(String variant, LexicalCategory category) { List wordElements = getWordsFromVariant(variant, category); if (wordElements.isEmpty()) return createWord(variant, category); // return default WordElement // using variant as base // form else return selectMatchingWord(wordElements, variant); } /** * return true if the lexicon contains a WordElement which * matches the specified variant form and category * * @param variant * - base form, inflected form, or spelling variant of word * @param category * - syntactic category of word (ANY for unknown) * @return true if Lexicon contains such a WordElement */ public boolean hasWordFromVariant(String variant, LexicalCategory category) {// convenience // method // derived // from // other // methods) // { return !getWordsFromVariant(variant, category).isEmpty(); } /** * returns Words which have an inflected form and/or spelling variant that * matches the specified variant, of any category.
* Note: the returned word list may not be complete, it depends on * how it is implemented by the underlying lexicon * * @param variant * - base form, inflected form, or spelling variant of word * @return list of all matching Words (empty list if no matching WordElement * found) */ public List getWordsFromVariant(String variant) { return getWordsFromVariant(variant, LexicalCategory.ANY); } /** * returns a WordElement which has the specified inflected form and/or * spelling variant that matches the specified variant, of any category. * * @param variant * - base form, inflected form, or spelling variant of word * @return a matching WordElement (if found), otherwise a new word is * created using thie variant as the base form */ public WordElement getWordFromVariant(String variant) { List wordElements = getWordsFromVariant(variant); if (wordElements.isEmpty()) return createWord(variant); // return default WordElement using // variant as base form else return wordElements.get(0); // else return first match } /** * return true if the lexicon contains a WordElement which * matches the specified variant form (in any category) * * @param variant * - base form, inflected form, or spelling variant of word * @return true if Lexicon contains such a WordElement */ public boolean hasWordFromVariant(String variant) {// convenience method // derived from other // methods) { return !getWordsFromVariant(variant).isEmpty(); } /****************************************************************************/ // other methods /****************************************************************************/ /** * close the lexicon (if necessary) if lexicon does not need to be closed, * this does nothing */ public void close() { // default method does nothing } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy