simplenlg.lexicon.util.NIHLexiconXMLDumpUtil Maven / Gradle / Ivy
/*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is "Simplenlg".
*
* The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
* Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
*
* Contributor(s): Ehud Reiter, Albert Gatt, Dave Westwater, Roman Kutlak, Margaret Mitchell, and Saad Mahamood.
*/
package simplenlg.lexicon.util;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.LineNumberReader;
import simplenlg.framework.LexicalCategory;
import simplenlg.framework.WordElement;
import simplenlg.lexicon.Lexicon;
import simplenlg.lexicon.NIHDBLexicon;
/**
* This class reads in a CSV word list, looks up the words in the NIH lexicon,
* and writes the XML words into an output file. This XML file can then be used as the XML Lexicon source for SimpleNLG.
*
* @author Ehud Reiter
*/
public class NIHLexiconXMLDumpUtil {
// filenames
private static String DB_FILENAME; // DB location
private static String WORDLIST_FILENAME; // word list
private static String XML_FILENAME; // word list
/**
* This main method reads a list of CSV words and POS tags and looks up against
* the NIHDB Lexicon for a corresponding entry. If found the baseform is written out into a XML
* file, which can be used in SimpleNLG or elsewhere.
*
* @param args : List of Arguments that this command line application must be provided with in order:
*
* - The full path to the NIHDB Lexicon database file e.g. C:\\NIHDB\\lexAccess2009
* - The full path to the list of baseforms and POS tags to include in the written out XML Lexicon file
* - The full path to the XML file that the XML Lexicon will be written out to.
*
*
*Example usage:
* java simplenlg.lexicon.util.NIHLexiconXMLDumpUtil C:\\NIHDB\\lexAccess2009 C:\\NIHDB\\wordlist.csv C:\\NIHDB\\default-lexicon.xml
*
* You will need to have the HSQLDB driver (org.hsqldb.jdbc.JDBCDriver) on your Java classpath before running this application.
*
*/
public static void main(String[] args) {
Lexicon lex = null;
if(args.length == 3) {
DB_FILENAME = args[0];
WORDLIST_FILENAME = args[1];
XML_FILENAME = args[2];
// Check to see if the HSQLDB driver is available on the classpath:
boolean dbDriverAvaliable = false;
try {
Class driverClass = Class.forName("org.hsqldb.jdbc.JDBCDriver", false, NIHLexiconXMLDumpUtil.class.getClassLoader());
if(null != driverClass) {
dbDriverAvaliable = true;
}
} catch(ClassNotFoundException cnfe) {
System.err.println("*** Please add the HSQLDB JDBCDriver to your Java classpath and try again.");
}
if((null != DB_FILENAME && !DB_FILENAME.isEmpty()) &&
(null != WORDLIST_FILENAME && !WORDLIST_FILENAME.isEmpty()) &&
(null != XML_FILENAME && !XML_FILENAME.isEmpty()) && dbDriverAvaliable) {
lex = new NIHDBLexicon(DB_FILENAME);
try {
LineNumberReader wordListFile = new LineNumberReader(new FileReader (WORDLIST_FILENAME));
FileWriter xmlFile = new FileWriter(XML_FILENAME);
xmlFile.write(String.format("%n"));
String line = wordListFile.readLine();
while (line != null) {
String[] cols = line.split(",");
String base = cols[0];
String cat = cols[1];
WordElement word = null;
if (cat.equalsIgnoreCase("noun"))
word = lex.getWord(base, LexicalCategory.NOUN);
else if (cat.equalsIgnoreCase("verb"))
word = lex.getWord(base, LexicalCategory.VERB);
else if (cat.equalsIgnoreCase("adv"))
word = lex.getWord(base, LexicalCategory.ADVERB);
else if (cat.equalsIgnoreCase("adj"))
word = lex.getWord(base, LexicalCategory.ADJECTIVE);
else if (cat.equalsIgnoreCase("det"))
word = lex.getWord(base, LexicalCategory.DETERMINER);
else if (cat.equalsIgnoreCase("prep"))
word = lex.getWord(base, LexicalCategory.PREPOSITION);
else if (cat.equalsIgnoreCase("pron"))
word = lex.getWord(base, LexicalCategory.PRONOUN);
else if (cat.equalsIgnoreCase("conj"))
word = lex.getWord(base, LexicalCategory.CONJUNCTION);
else if (cat.equalsIgnoreCase("modal"))
word = lex.getWord(base, LexicalCategory.MODAL);
else if (cat.equalsIgnoreCase("interjection"))
word = lex.getWord(base, LexicalCategory.NOUN); // Kilgarriff;s interjections are mostly nouns in the lexicon
if (word == null)
System.out.println("*** The following baseform and POS tag is not found: " + base + ":" + cat);
else
xmlFile.write(word.toXML());
line = wordListFile.readLine();;
}
xmlFile.write(String.format(" %n"));
wordListFile.close();
xmlFile.close();
lex.close();
System.out.println("*** XML Lexicon Export Completed.");
} catch (Exception e) {
System.err.println("*** An Error occured during the export. The Exception message is below: ");
System.err.println(e.getMessage());
System.err.println("************************");
System.err.println("Please make sure you have the correct application arguments: ");
printArgumentsMessage();
}
}
else {
printErrorArgumentMessage();
}
} else {
printErrorArgumentMessage();
}
}
/**
* Prints Arguments Error Messages if incorrect or not enough parameters have been supplied.
*/
private static void printErrorArgumentMessage() {
System.err.println("Insuffient number of arguments supplied. Please supply the following Arguments: \n");
printArgumentsMessage();
}
/**
* Prints this utility applications arguments requirements.
*/
private static void printArgumentsMessage() {
System.err.println("\t\t 1. The full path to the NIHDB Lexicon database file e.g. C:\\NIHDB\\lexAccess2009 ");
System.err.println("\t\t 2. The full path to the list of baseforms and POS tags to include in the written out XML Lexicon file");
System.err.println("\t\t 3. The full path to the XML file that the XML Lexicon will be written out to.");
}
}