All Downloads are FREE. Search and download functionalities are using the official Maven repository.

simplenlg.lexicon.util.NIHLexiconXMLDumpUtil Maven / Gradle / Ivy

/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is "Simplenlg".
 *
 * The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
 * Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
 *
 * Contributor(s): Ehud Reiter, Albert Gatt, Dave Westwater, Roman Kutlak, Margaret Mitchell, and Saad Mahamood.
 */
package simplenlg.lexicon.util;

import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;

import simplenlg.framework.LexicalCategory;
import simplenlg.framework.WordElement;
import simplenlg.lexicon.Lexicon;
import simplenlg.lexicon.NIHDBLexicon;


/**
 * 

This class reads in a CSV word list, looks up the words in the NIH lexicon, * and writes the XML words into an output file. This XML file can then be used as the XML Lexicon source for SimpleNLG.

* * @author Ehud Reiter */ public class NIHLexiconXMLDumpUtil { // filenames private static String DB_FILENAME; // DB location private static String WORDLIST_FILENAME; // word list private static String XML_FILENAME; // word list /** * This main method reads a list of CSV words and POS tags and looks up against * the NIHDB Lexicon for a corresponding entry. If found the baseform is written out into a XML * file, which can be used in SimpleNLG or elsewhere. * * @param args : List of Arguments that this command line application must be provided with in order: *
    *
  1. The full path to the NIHDB Lexicon database file e.g. C:\\NIHDB\\lexAccess2009
  2. *
  3. The full path to the list of baseforms and POS tags to include in the written out XML Lexicon file
  4. *
  5. The full path to the XML file that the XML Lexicon will be written out to.
  6. *
* *

Example usage: * java simplenlg.lexicon.util.NIHLexiconXMLDumpUtil C:\\NIHDB\\lexAccess2009 C:\\NIHDB\\wordlist.csv C:\\NIHDB\\default-lexicon.xml * * You will need to have the HSQLDB driver (org.hsqldb.jdbc.JDBCDriver) on your Java classpath before running this application. *

*/ public static void main(String[] args) { Lexicon lex = null; if(args.length == 3) { DB_FILENAME = args[0]; WORDLIST_FILENAME = args[1]; XML_FILENAME = args[2]; // Check to see if the HSQLDB driver is available on the classpath: boolean dbDriverAvaliable = false; try { Class driverClass = Class.forName("org.hsqldb.jdbc.JDBCDriver", false, NIHLexiconXMLDumpUtil.class.getClassLoader()); if(null != driverClass) { dbDriverAvaliable = true; } } catch(ClassNotFoundException cnfe) { System.err.println("*** Please add the HSQLDB JDBCDriver to your Java classpath and try again."); } if((null != DB_FILENAME && !DB_FILENAME.isEmpty()) && (null != WORDLIST_FILENAME && !WORDLIST_FILENAME.isEmpty()) && (null != XML_FILENAME && !XML_FILENAME.isEmpty()) && dbDriverAvaliable) { lex = new NIHDBLexicon(DB_FILENAME); try { LineNumberReader wordListFile = new LineNumberReader(new FileReader (WORDLIST_FILENAME)); FileWriter xmlFile = new FileWriter(XML_FILENAME); xmlFile.write(String.format("%n")); String line = wordListFile.readLine(); while (line != null) { String[] cols = line.split(","); String base = cols[0]; String cat = cols[1]; WordElement word = null; if (cat.equalsIgnoreCase("noun")) word = lex.getWord(base, LexicalCategory.NOUN); else if (cat.equalsIgnoreCase("verb")) word = lex.getWord(base, LexicalCategory.VERB); else if (cat.equalsIgnoreCase("adv")) word = lex.getWord(base, LexicalCategory.ADVERB); else if (cat.equalsIgnoreCase("adj")) word = lex.getWord(base, LexicalCategory.ADJECTIVE); else if (cat.equalsIgnoreCase("det")) word = lex.getWord(base, LexicalCategory.DETERMINER); else if (cat.equalsIgnoreCase("prep")) word = lex.getWord(base, LexicalCategory.PREPOSITION); else if (cat.equalsIgnoreCase("pron")) word = lex.getWord(base, LexicalCategory.PRONOUN); else if (cat.equalsIgnoreCase("conj")) word = lex.getWord(base, LexicalCategory.CONJUNCTION); else if (cat.equalsIgnoreCase("modal")) word = lex.getWord(base, LexicalCategory.MODAL); else if 
(cat.equalsIgnoreCase("interjection")) word = lex.getWord(base, LexicalCategory.NOUN); // Kilgarriff;s interjections are mostly nouns in the lexicon if (word == null) System.out.println("*** The following baseform and POS tag is not found: " + base + ":" + cat); else xmlFile.write(word.toXML()); line = wordListFile.readLine();; } xmlFile.write(String.format("%n")); wordListFile.close(); xmlFile.close(); lex.close(); System.out.println("*** XML Lexicon Export Completed."); } catch (Exception e) { System.err.println("*** An Error occured during the export. The Exception message is below: "); System.err.println(e.getMessage()); System.err.println("************************"); System.err.println("Please make sure you have the correct application arguments: "); printArgumentsMessage(); } } else { printErrorArgumentMessage(); } } else { printErrorArgumentMessage(); } } /** * Prints Arguments Error Messages if incorrect or not enough parameters have been supplied. */ private static void printErrorArgumentMessage() { System.err.println("Insuffient number of arguments supplied. Please supply the following Arguments: \n"); printArgumentsMessage(); } /** * Prints this utility applications arguments requirements. */ private static void printArgumentsMessage() { System.err.println("\t\t 1. The full path to the NIHDB Lexicon database file e.g. C:\\NIHDB\\lexAccess2009 "); System.err.println("\t\t 2. The full path to the list of baseforms and POS tags to include in the written out XML Lexicon file"); System.err.println("\t\t 3. The full path to the XML file that the XML Lexicon will be written out to."); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy