![JAR search and dependency download from the Maven repository](/logo.png)
simplenlg.lexicon.XMLLexicon Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SimpleNLG Show documentation
Show all versions of SimpleNLG Show documentation
Java API for Natural Language Generation
The newest version!
/*
* The contents of this file are subject to the Mozilla Public License
* Version 2.0 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* https://www.mozilla.org/en-US/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is "Simplenlg".
*
* The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
* Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
*
* Contributor(s): Ehud Reiter, Albert Gatt, Dave Westwater, Roman Kutlak, Margaret Mitchell, and Saad Mahamood.
*/
package simplenlg.lexicon;
import java.io.File;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import simplenlg.features.Inflection;
import simplenlg.features.LexicalFeature;
import simplenlg.framework.ElementCategory;
import simplenlg.framework.LexicalCategory;
import simplenlg.framework.WordElement;
/**
* This class loads words from an XML lexicon. All features specified in the
* lexicon are loaded.
*
* @author ereiter
*/
public class XMLLexicon extends Lexicon {
// node names in lexicon XML files
private static final String XML_BASE = "base"; // base form of Word
private static final String XML_CATEGORY = "category"; // base form of Word
private static final String XML_ID = "id"; // base form of Word
private static final String XML_WORD = "word"; // node defining a word
// lexicon
private Set words; // set of words
private Map indexByID; // map from ID to word
private Map> indexByBase; // map from base to set
// of words with this
// baseform
private Map> indexByVariant; // map from variants
// to set of words
// with this variant
/**********************************************************************/
// constructors
/**********************************************************************/
/**
 * Builds a lexicon from the XML file found at the given path.
 *
 * @param filename path of the lexicon XML file
 */
public XMLLexicon(String filename) {
    super();
    URI location = new File(filename).toURI();
    createLexicon(location);
}
/**
* Load an XML Lexicon from a File
*/
public XMLLexicon(File file) {
super();
createLexicon(file.toURI());
}
/**
 * Builds a lexicon from the XML document located at the given URI.
 *
 * @param lexiconURI location of the lexicon XML document
 */
public XMLLexicon(URI lexiconURI) {
    // the superclass no-argument constructor runs implicitly
    this.createLexicon(lexiconURI);
}
/**
 * Load the default XML lexicon from the classpath.
 * <p>
 * The resource {@code default-lexicon.xml} is looked up through this class's
 * class loader first; if it is not found there,
 * {@code /simplenlg/lexicon/default-lexicon.xml} is tried.
 * <p>
 * If the resource URL cannot be converted to a URI, the exception is printed
 * to standard output and the lexicon is not loaded.
 *
 * @throws IllegalStateException if neither resource can be found
 */
public XMLLexicon() {
    try {
        URL defaultLexicon = this.getClass().getClassLoader().getResource("default-lexicon.xml");
        if (null == defaultLexicon) {
            defaultLexicon = this.getClass().getResource("/simplenlg/lexicon/default-lexicon.xml");
        }
        if (null == defaultLexicon) {
            // fail with a descriptive message instead of a bare NullPointerException
            throw new IllegalStateException("Default lexicon not found on the classpath: tried "
                    + "default-lexicon.xml and /simplenlg/lexicon/default-lexicon.xml");
        }
        createLexicon(defaultLexicon.toURI());
    } catch (URISyntaxException ex) {
        System.out.println(ex.toString());
    }
}
/**
* method to actually load and index the lexicon from a URI
*/
private void createLexicon(URI lexiconURI) {
// initialise objects
words = new HashSet();
indexByID = new HashMap();
indexByBase = new HashMap>();
indexByVariant = new HashMap>();
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.parse(lexiconURI.toString());
if(doc != null) {
Element lexRoot = doc.getDocumentElement();
NodeList wordNodes = lexRoot.getChildNodes();
for(int i = 0; i < wordNodes.getLength(); i++) {
Node wordNode = wordNodes.item(i);
// ignore things that aren't elements
if(wordNode.getNodeType() == Node.ELEMENT_NODE) {
WordElement word = convertNodeToWord(wordNode);
if(word != null) {
words.add(word);
IndexWord(word);
}
}
}
}
} catch(Exception ex) {
System.out.println(ex.toString());
}
addSpecialCases();
}
/**
 * Registers the irregular forms of the verb "be" in the variant index, so
 * that they resolve to the "be" entry. Does nothing when the lexicon has no
 * verb "be".
 */
private void addSpecialCases() {
    WordElement be = getWord("be", LexicalCategory.VERB);
    if (be == null) {
        return;
    }
    String[] irregularForms = {"is", "am", "are", "was", "were"};
    for (String form : irregularForms) {
        updateIndex(be, form, indexByVariant);
    }
}
/**
 * Creates a simplenlg WordElement from a word node in a lexicon XML file.
 * <p>
 * Each element child of the node is treated as a feature. The base, category
 * and id children set the corresponding word properties. A child with empty
 * text is either an inflection code or, failing that, a boolean feature set
 * to {@code true}. Any other child becomes a string-valued feature. Feature
 * names are compared case-insensitively.
 * <p>
 * A word without any inflection code is given {@code Inflection.REGULAR}.
 * The default inflection is {@code REGULAR} when present, otherwise the
 * first code listed.
 *
 * @param wordNode the XML node to convert
 * @return the new word, or {@code null} if the node is not a word node
 * @throws IllegalArgumentException if the category text does not name a
 *                                  {@code LexicalCategory} constant
 */
private WordElement convertNodeToWord(Node wordNode) {
    // if this isn't a Word node, ignore it
    if (!wordNode.getNodeName().equalsIgnoreCase(XML_WORD)) {
        return null;
    }
    // note: a word node without a base child is not rejected here
    WordElement word = new WordElement();
    List<Inflection> inflections = new ArrayList<Inflection>();
    // now copy features
    NodeList nodes = wordNode.getChildNodes();
    for (int i = 0; i < nodes.getLength(); i++) {
        Node featureNode = nodes.item(i);
        if (featureNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        String feature = featureNode.getNodeName().trim();
        String value = featureNode.getTextContent();
        if (value != null) {
            value = value.trim();
        }
        if (feature.equalsIgnoreCase(XML_BASE)) {
            word.setBaseForm(value);
        } else if (feature.equalsIgnoreCase(XML_CATEGORY)) {
            // Locale.ROOT keeps the conversion independent of the default
            // locale (a Turkish locale would turn "i" into a dotted capital I)
            word.setCategory(LexicalCategory.valueOf(value.toUpperCase(Locale.ROOT)));
        } else if (feature.equalsIgnoreCase(XML_ID)) {
            word.setId(value);
        } else if (value == null || value.equals("")) {
            // if this is an infl code, add it to inflections
            Inflection infl = Inflection.getInflCode(feature);
            if (infl != null) {
                inflections.add(infl);
            } else {
                // otherwise assume it's a boolean feature
                word.setFeature(feature, true);
            }
        } else {
            word.setFeature(feature, value);
        }
    }
    // if no infl specified, assume regular
    if (inflections.isEmpty()) {
        inflections.add(Inflection.REGULAR);
    }
    // default inflection code is "reg" if we have it, else the first of the
    // infl codes available
    Inflection defaultInfl = inflections.contains(Inflection.REGULAR) ? Inflection.REGULAR : inflections.get(0);
    word.setFeature(LexicalFeature.DEFAULT_INFL, defaultInfl);
    word.setDefaultInflectionalVariant(defaultInfl);
    for (Inflection infl : inflections) {
        word.addInflectionalVariant(infl);
    }
    // done, return word
    return word;
}
/**
 * Registers a word in the base-form, ID and variant indices.
 * <p>
 * A word without a base form is left out of the base-form index, and a word
 * without an ID is left out of the ID index. When an ID is already present a
 * message is printed to standard output and the newer word replaces the
 * older one.
 */
private void IndexWord(WordElement word) {
    // base-form index; the null check should not really be needed, as all
    // words are expected to have a base form
    String baseForm = word.getBaseForm();
    if (baseForm != null) {
        updateIndex(word, baseForm, indexByBase);
    }

    // ID index; IDs should be unique when present
    String wordId = word.getId();
    if (wordId != null) {
        boolean alreadyIndexed = indexByID.containsKey(wordId);
        if (alreadyIndexed) {
            System.out.println("Lexicon error: ID " + wordId + " occurs more than once");
        }
        indexByID.put(wordId, word);
    }

    // variant index
    for (String variant : getVariants(word)) {
        updateIndex(word, variant, indexByVariant);
    }
}
/**
* convenience method to update an index
*/
private void updateIndex(WordElement word, String base, Map> index) {
if(!index.containsKey(base))
index.put(base, new ArrayList());
index.get(base).add(word);
}
/******************************************************************************************/
// main methods to get data from lexicon
/******************************************************************************************/
/**
 * Looks up words by base form, restricted to the given category.
 *
 * @param baseForm the base form to look up
 * @param category the category the words must have; {@code LexicalCategory.ANY}
 *                 accepts every category
 * @return copies of the matching words; empty if there are none
 * @see simplenlg.lexicon.Lexicon#getWords(java.lang.String,
 * simplenlg.framework.LexicalCategory)
 */
@Override
public List<WordElement> getWords(String baseForm, LexicalCategory category) {
    return getWordsFromIndex(baseForm, category, indexByBase);
}
/**
* get matching keys from an index map
*/
private List getWordsFromIndex(String indexKey,
LexicalCategory category,
Map> indexMap) {
List result = new ArrayList();
// case 1: unknown, return empty list
if(!indexMap.containsKey(indexKey)) {
return result;
}
// case 2: category is ANY, return everything
if(category == LexicalCategory.ANY) {
for(WordElement word : indexMap.get(indexKey)) {
result.add(new WordElement(word));
}
return result;
} else {
// case 3: other category, search for match
for(WordElement word : indexMap.get(indexKey)) {
if(word.getCategory() == category) {
result.add(new WordElement(word));
}
}
}
return result;
}
/**
 * Looks up the word registered under the given ID.
 *
 * @param id the ID to look up
 * @return a list holding a copy of the word with this ID, or an empty list
 *         if the ID is unknown
 * @see simplenlg.lexicon.Lexicon#getWordsByID(java.lang.String)
 */
@Override
public List<WordElement> getWordsByID(String id) {
    List<WordElement> result = new ArrayList<WordElement>();
    if (indexByID.containsKey(id)) {
        result.add(new WordElement(indexByID.get(id)));
    }
    return result;
}
/**
 * Looks up words by one of their indexed variants, restricted to the given
 * category.
 *
 * @param variant  the variant form to look up
 * @param category the category the words must have; {@code LexicalCategory.ANY}
 *                 accepts every category
 * @return copies of the matching words; empty if there are none
 * @see simplenlg.lexicon.Lexicon#getWordsFromVariant(java.lang.String,
 * simplenlg.framework.LexicalCategory)
 */
@Override
public List<WordElement> getWordsFromVariant(String variant, LexicalCategory category) {
    return getWordsFromIndex(variant, category, indexByVariant);
}
/**
 * Quick-and-dirty routine for getting the morphological variants of a word;
 * should be replaced by something better!
 * <p>
 * The base form is always included. Nouns add their plural, adjectives their
 * comparative and superlative, and verbs their third-person present, past,
 * past participle and present participle. Other categories contribute the
 * base form only.
 *
 * @param word the word whose variants are wanted
 * @return the set of variant strings for this word
 */
private Set<String> getVariants(WordElement word) {
    Set<String> variants = new HashSet<String>();
    variants.add(word.getBaseForm());
    ElementCategory category = word.getCategory();
    if (category instanceof LexicalCategory) {
        switch ((LexicalCategory) category) {
        case NOUN:
            variants.add(getVariant(word, LexicalFeature.PLURAL, "s"));
            break;
        case ADJECTIVE:
            variants.add(getVariant(word, LexicalFeature.COMPARATIVE, "er"));
            variants.add(getVariant(word, LexicalFeature.SUPERLATIVE, "est"));
            break;
        case VERB:
            variants.add(getVariant(word, LexicalFeature.PRESENT3S, "s"));
            variants.add(getVariant(word, LexicalFeature.PAST, "ed"));
            variants.add(getVariant(word, LexicalFeature.PAST_PARTICIPLE, "ed"));
            variants.add(getVariant(word, LexicalFeature.PRESENT_PARTICIPLE, "ing"));
            break;
        default:
            // only base needed for other forms
            break;
        }
    }
    return variants;
}
/**
 * Quick-and-dirty routine for obtaining one morphological form of a word;
 * should be replaced by something better!
 * <p>
 * When the word carries the given feature, that feature's string value is
 * used; otherwise the form is built from the base form and the suffix.
 *
 * @param word    the word to inflect
 * @param feature name of the feature that may hold the form
 * @param suffix  suffix used when the feature is absent
 * @return the form for this feature
 */
private String getVariant(WordElement word, String feature, String suffix) {
    return word.hasFeature(feature)
            ? word.getFeatureAsString(feature)
            : getForm(word.getBaseForm(), suffix);
}
/**
 * Quick-and-dirty routine for standard orthographic changes when adding a
 * suffix to a base form. Should be replaced by something better!
 * <p>
 * Rules applied, in order:
 * <ol>
 * <li>final "y" after a consonant becomes "ie" unless the suffix starts with
 * "i" (cry + s = cries, but play + s = plays and cry + ing = crying);</li>
 * <li>final "ie" becomes "y" before a suffix starting with "i"
 * (die + ing = dying);</li>
 * <li>final "e" is dropped before a suffix starting with "e" or "i"
 * (like + ed = liked), except that "ee", "oe" and "ye" keep their "e" before
 * "i" (see + ing = seeing);</li>
 * <li>"e" is inserted before an "s" suffix when the base ends in s, x, z, ch
 * or sh (watch + s = watches).</li>
 * </ol>
 *
 * @param base   the base form; must not be {@code null}
 * @param suffix the suffix to append; must not be {@code null}
 * @return the base form with the suffix attached
 */
private String getForm(String base, String suffix) {
    String stem = base;
    boolean suffixStartsWithI = suffix.startsWith("i");
    // rule 1 - consonant + "y" becomes "ie" if suffix does not start with "i"
    // eg, cry + s = cries, not crys; but play + s = plays, not plaies
    if (!suffixStartsWithI && endsWithConsonantY(stem)) {
        stem = stem.substring(0, stem.length() - 1) + "ie";
    }
    if (suffixStartsWithI && stem.endsWith("ie")) {
        // rule 2 - "ie" becomes "y" before "i", eg die + ing = dying
        stem = stem.substring(0, stem.length() - 2) + "y";
    } else if (stem.endsWith("e")) {
        // rule 3 - drop final "e" if suffix starts with "e" or "i"
        // eg, like + ed = liked, not likeed; but see + ing = seeing
        boolean keepsE = stem.endsWith("ee") || stem.endsWith("oe") || stem.endsWith("ye");
        if (suffix.startsWith("e") || (suffixStartsWithI && !keepsE)) {
            stem = stem.substring(0, stem.length() - 1);
        }
    }
    // rule 4 - insert "e" if suffix is "s" and base ends in s, x, z, ch, sh
    // eg, watch + s -> watches, not watchs
    if (suffix.startsWith("s") && (stem.endsWith("s") || stem.endsWith("x") || stem.endsWith("z")
            || stem.endsWith("ch") || stem.endsWith("sh"))) {
        stem = stem + "e";
    }
    // have made changes, now append and return
    return stem + suffix; // eg, want + s = wants
}

/**
 * Tells whether the text ends in "y" that is not preceded by a vowel. A "u"
 * following "q" counts as a consonant (soliloquy), and a lone "y" counts as
 * consonant-y because nothing precedes it.
 */
private boolean endsWithConsonantY(String text) {
    if (!text.endsWith("y")) {
        return false;
    }
    int length = text.length();
    if (length < 2) {
        return true;
    }
    char before = text.charAt(length - 2);
    if (before == 'u' && length >= 3 && text.charAt(length - 3) == 'q') {
        return true;
    }
    return "aeiou".indexOf(before) < 0;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy