
edu.stanford.nlp.parser.lexparser.LexicalizedParser Maven / Gradle / Ivy


Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

A newer version of this artifact is available: 3.9.2.
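For build setup, a minimal Maven dependency sketch. The coordinates below are assumptions read off this page's listing (groupId edu.stanford.nlp, artifactId stanford-parser); the version shown is the 3.9.2 release noted above, so substitute whichever version you actually want:

    <dependency>
      <groupId>edu.stanford.nlp</groupId>
      <artifactId>stanford-parser</artifactId>
      <version>3.9.2</version>
    </dependency>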
// Stanford Parser -- a probabilistic lexicalized NL CFG parser
// Copyright (c) 2002 - 2014 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software Foundation,
// Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    [email protected]
//    http://nlp.stanford.edu/software/lex-parser.shtml

package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.common.ArgUtils;
import edu.stanford.nlp.parser.common.ParserGrammar;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.common.ParserUtils;
import edu.stanford.nlp.parser.metrics.Eval;
import edu.stanford.nlp.parser.metrics.ParserQueryEval;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.util.ErasureUtils;
import java.util.function.Function;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.tagger.io.TaggedFileRecord;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.Triple;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;

import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;

/**
 * This class provides the top-level API and command-line interface to a set
 * of reasonably good treebank-trained parsers.  The name reflects the main
 * factored parsing model, which provides a lexicalized PCFG parser
 * implemented as a product
 * model of a plain PCFG parser and a lexicalized dependency parser.
 * But you can also run either component parser alone.  In particular, it
 * is often useful to do unlexicalized PCFG parsing by using just that
 * component parser.
 *
 * See the package documentation for more details and examples of use.
 *
 * For information on invoking the parser from the command-line, and for
 * a more detailed list of options, see the {@link #main} method.
 *
 * Note that training on a 1 million word treebank requires a fair amount of
 * memory to run.  Try -mx1500m to increase the memory allocated by the JVM.
 *
 * @author Dan Klein (original version)
 * @author Christopher Manning (better features, ParserParams, serialization)
 * @author Roger Levy (internationalization)
 * @author Teg Grenager (grammar compaction, tokenization, etc.)
 * @author Galen Andrew (considerable refactoring)
 * @author John Bauer (made threadsafe)
 */
public class LexicalizedParser extends ParserGrammar implements Serializable {

  public Lexicon lex;
  public BinaryGrammar bg;
  public UnaryGrammar ug;
  public DependencyGrammar dg;
  public Index<String> stateIndex, wordIndex, tagIndex;

  private Options op;

  @Override
  public Options getOp() { return op; }

  public Reranker reranker; // = null;

  @Override
  public TreebankLangParserParams getTLPParams() { return op.tlpParams; }

  @Override
  public TreebankLanguagePack treebankLanguagePack() { return getTLPParams().treebankLanguagePack(); }

  @Override
  public String[] defaultCoreNLPFlags() { return getTLPParams().defaultCoreNLPFlags(); }

  @Override
  public boolean requiresTags() { return false; }

  private static final String SERIALIZED_PARSER_PROPERTY = "edu.stanford.nlp.SerializedLexicalizedParser";

  public static final String DEFAULT_PARSER_LOC = ((System.getenv("NLP_PARSER") != null) ?
                                                   System.getenv("NLP_PARSER") :
                                                   "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

  /**
   * Construct a new LexicalizedParser object from a previously
   * serialized grammar read from a System property
   * {@code edu.stanford.nlp.SerializedLexicalizedParser}, or a
   * default classpath location
   * ({@code edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz}).
   */
  public static LexicalizedParser loadModel() {
    return loadModel(new Options());
  }

  /**
   * Construct a new LexicalizedParser object from a previously
   * serialized grammar read from a System property
   * {@code edu.stanford.nlp.SerializedLexicalizedParser}, or a
   * default classpath location
   * ({@code edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz}).
   *
   * @param op Options to the parser.  These get overwritten by the
   *     Options read from the serialized parser; I think the only
   *     thing determined by them is the encoding of the grammar
   *     iff it is a text grammar
   */
  public static LexicalizedParser loadModel(Options op, String ... extraFlags) {
    String source = System.getProperty(SERIALIZED_PARSER_PROPERTY);
    if (source == null) {
      source = DEFAULT_PARSER_LOC;
    }
    return loadModel(source, op, extraFlags);
  }

  public static LexicalizedParser loadModel(String parserFileOrUrl, String ... extraFlags) {
    return loadModel(parserFileOrUrl, new Options(), extraFlags);
  }

  public static LexicalizedParser loadModel(String parserFileOrUrl, List<String> extraFlags) {
    String[] flags = new String[extraFlags.size()];
    extraFlags.toArray(flags);
    return loadModel(parserFileOrUrl, flags);
  }

  /**
   * Construct a new LexicalizedParser.  This loads a grammar
   * that was previously assembled and stored as a serialized file.
   *
   * @param parserFileOrUrl Filename/URL to load parser from
   * @param op Options for this parser. These will normally be overwritten
   *     by options stored in the file
   * @throws IllegalArgumentException If parser data cannot be loaded
   */
  public static LexicalizedParser loadModel(String parserFileOrUrl, Options op,
                                            String ... extraFlags) {
    //    System.err.print("Loading parser from file " + parserFileOrUrl);
    LexicalizedParser parser = getParserFromFile(parserFileOrUrl, op);
    if (extraFlags.length > 0) {
      parser.setOptionFlags(extraFlags);
    }
    return parser;
  }
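
  // Illustrative usage of the loadModel() overloads above (not part of the
  // original source). The path and flags here are assumptions; any serialized
  // grammar on the classpath, filesystem, or at a URL works the same way:
  //
  //   LexicalizedParser lp = LexicalizedParser.loadModel();  // default englishPCFG.ser.gz
  //   LexicalizedParser lp2 = LexicalizedParser.loadModel(
  //       "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
  //       "-maxLength", "80");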

  /**
   * Reads one object from the given ObjectInputStream, which is
   * assumed to be a LexicalizedParser.  Throws a ClassCastException
   * if this is not true.  The stream is not closed.
   */
  public static LexicalizedParser loadModel(ObjectInputStream ois) {
    try {
      Object o = ois.readObject();
      if (o instanceof LexicalizedParser) {
        return (LexicalizedParser) o;
      }
      throw new ClassCastException("Wanted LexicalizedParser, got " + o.getClass());
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }

  public static LexicalizedParser loadModelFromZip(String zipFilename, String modelName) {
    LexicalizedParser parser = null;
    try {
      File file = new File(zipFilename);
      if (file.exists()) {
        ZipFile zin = new ZipFile(file);
        ZipEntry zentry = zin.getEntry(modelName);
        if (zentry != null) {
          InputStream in = zin.getInputStream(zentry);
          // gunzip it if necessary
          if (modelName.endsWith(".gz")) {
            in = new GZIPInputStream(in);
          }
          ObjectInputStream ois = new ObjectInputStream(in);
          parser = loadModel(ois);
          ois.close();
          in.close();
        }
        zin.close();
      } else {
        throw new FileNotFoundException("Could not find " + modelName + " inside " + zipFilename);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return parser;
  }

  public static LexicalizedParser copyLexicalizedParser(LexicalizedParser parser) {
    return new LexicalizedParser(parser.lex, parser.bg, parser.ug, parser.dg, parser.stateIndex, parser.wordIndex, parser.tagIndex, parser.op);
  }

  public LexicalizedParser(Lexicon lex, BinaryGrammar bg, UnaryGrammar ug, DependencyGrammar dg, Index<String> stateIndex, Index<String> wordIndex, Index<String> tagIndex, Options op) {
    this.lex = lex;
    this.bg = bg;
    this.ug = ug;
    this.dg = dg;
    this.stateIndex = stateIndex;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    this.op = op;
  }

  /**
   * Construct a new LexicalizedParser.
   *
   * @param trainTreebank a treebank to train from
   */
  public static LexicalizedParser trainFromTreebank(Treebank trainTreebank,
                                                    GrammarCompactor compactor,
                                                    Options op) {
    return getParserFromTreebank(trainTreebank, null, 1.0, compactor, op, null, null);
  }

  public static LexicalizedParser trainFromTreebank(String treebankPath,
                                                    FileFilter filt,
                                                    Options op) {
    return trainFromTreebank(makeTreebank(treebankPath, op, filt), op);
  }

  public static LexicalizedParser trainFromTreebank(Treebank trainTreebank,
                                                    Options op) {
    return trainFromTreebank(trainTreebank, null, op);
  }

  /**
   * Will process a list of strings into a list of HasWord and return
   * the parse tree associated with that list.
   */
  public Tree parseStrings(List<String> lst) {
    List<Word> words = new ArrayList<Word>();
    for (String word : lst) {
      words.add(new Word(word));
    }
    return parse(words);
  }

  /**
   * Parses the list of HasWord.  If the parse fails for some reason,
   * an X tree is returned instead of barfing.
   */
  public Tree parse(List<? extends HasWord> lst) {
    try {
      ParserQuery pq = parserQuery();
      if (pq.parse(lst)) {
        Tree bestparse = pq.getBestParse();
        // -10000 denotes unknown words
        bestparse.setScore(pq.getPCFGScore() % -10000.0);
        return bestparse;
      }
    } catch (Exception e) {
      System.err.println("Following exception caught during parsing:");
      e.printStackTrace();
      System.err.println("Recovering using fall through strategy: will construct an (X ...) tree.");
    }
    // if can't parse or exception, fall through
    return ParserUtils.xTree(lst);
  }
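
  // Illustrative sketch (not part of the original source): parsing a
  // pre-tokenized sentence with parseStrings() above, and checking for the
  // (X ...) fallback tree that parse() substitutes on failure. The sentence
  // and the root-label check are example assumptions:
  //
  //   Tree tree = lp.parseStrings(Arrays.asList("The", "dog", "barked", "."));
  //   if ("X".equals(tree.value())) {
  //     // the sentence could not be parsed; an (X ...) tree was returned instead
  //   }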

  public List<Tree> parseMultiple(final List<? extends List<? extends HasWord>> sentences) {
    List<Tree> trees = new ArrayList<Tree>();
    for (List<? extends HasWord> sentence : sentences) {
      trees.add(parse(sentence));
    }
    return trees;
  }

  /**
   * Will launch multiple threads which calls parse on
   * each of the sentences in order, returning the
   * resulting parse trees in the same order.
   */
  public List<Tree> parseMultiple(final List<? extends List<? extends HasWord>> sentences, final int nthreads) {
    MulticoreWrapper<List<? extends HasWord>, Tree> wrapper = new MulticoreWrapper<List<? extends HasWord>, Tree>(nthreads, new ThreadsafeProcessor<List<? extends HasWord>, Tree>() {
        public Tree process(List<? extends HasWord> sentence) {
          return parse(sentence);
        }
        public ThreadsafeProcessor<List<? extends HasWord>, Tree> newInstance() {
          return this;
        }
      });
    List<Tree> trees = new ArrayList<Tree>();
    for (List<? extends HasWord> sentence : sentences) {
      wrapper.put(sentence);
      while (wrapper.peek()) {
        trees.add(wrapper.poll());
      }
    }
    wrapper.join();
    while (wrapper.peek()) {
      trees.add(wrapper.poll());
    }
    return trees;
  }

  /** Return a TreePrint for formatting parsed output trees.
   *  @return A TreePrint for formatting parsed output trees.
   */
  public TreePrint getTreePrint() {
    return op.testOptions.treePrint(op.tlpParams);
  }

  /**
   * Similar to parse(), but instead of returning an X tree on failure, returns null.
   */
  public Tree parseTree(List<? extends HasWord> sentence) {
    ParserQuery pq = parserQuery();
    if (pq.parse(sentence)) {
      return pq.getBestParse();
    } else {
      return null;
    }
  }

  public List<Eval> getExtraEvals() {
    if (reranker != null) {
      return reranker.getEvals();
    } else {
      return Collections.emptyList();
    }
  }

  public List<ParserQueryEval> getParserQueryEvals() {
    return Collections.emptyList();
  }

  @Override
  public ParserQuery parserQuery() {
    if (reranker == null) {
      return new LexicalizedParserQuery(this);
    } else {
      return new RerankingParserQuery(op, new LexicalizedParserQuery(this), reranker);
    }
  }

  public LexicalizedParserQuery lexicalizedParserQuery() {
    return new LexicalizedParserQuery(this);
  }

  public static LexicalizedParser getParserFromFile(String parserFileOrUrl, Options op) {
    LexicalizedParser pd = getParserFromSerializedFile(parserFileOrUrl);
    if (pd == null) {
      pd = getParserFromTextFile(parserFileOrUrl, op);
    }
    return pd;
  }

  private static Treebank makeTreebank(String treebankPath, Options op, FileFilter filt) {
    System.err.println("Training a parser from treebank dir: " + treebankPath);
    Treebank trainTreebank = op.tlpParams.diskTreebank();
    System.err.print("Reading trees...");
    if (filt == null) {
      trainTreebank.loadPath(treebankPath);
    } else {
      trainTreebank.loadPath(treebankPath, filt);
    }
    Timing.tick("done [read " + trainTreebank.size() + " trees].");
    return trainTreebank;
  }

  private static DiskTreebank makeSecondaryTreebank(String treebankPath, Options op, FileFilter filt) {
    System.err.println("Additionally training using secondary disk treebank: " + treebankPath + ' ' + filt);
    DiskTreebank trainTreebank = op.tlpParams.diskTreebank();
    System.err.print("Reading trees...");
    if (filt == null) {
      trainTreebank.loadPath(treebankPath);
    } else {
      trainTreebank.loadPath(treebankPath, filt);
    }
    Timing.tick("done [read " + trainTreebank.size() + " trees].");
    return trainTreebank;
  }

  public Lexicon getLexicon() {
    return lex;
  }
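
  // Illustrative sketch (not part of the original source): parsing a batch of
  // pre-tokenized sentences on multiple threads with parseMultiple() above.
  // The sentence list and thread count are arbitrary example values:
  //
  //   List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  //   // ... fill sentences with tokenized input ...
  //   List<Tree> trees = lp.parseMultiple(sentences, 4);  // 4 worker threads
  //   // trees.get(i) is the parse of sentences.get(i); order is preserved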

  /**
   * Saves the parser defined by pd to the given filename.
   * If there is an error, a RuntimeIOException is thrown.
   */
  public void saveParserToSerialized(String filename) {
    try {
      System.err.print("Writing parser in serialized format to file " + filename + ' ');
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(this);
      out.close();
      System.err.println("done.");
    } catch (IOException ioe) {
      throw new RuntimeIOException(ioe);
    }
  }

  /**
   * Saves the parser defined by pd to the given filename.
   * If there is an error, a RuntimeIOException is thrown.
   */
  public void saveParserToTextFile(String filename) {
    if (reranker != null) {
      throw new UnsupportedOperationException("Sorry, but parsers with rerankers cannot be saved to text file");
    }
    try {
      System.err.print("Writing parser in text grammar format to file " + filename);
      OutputStream os;
      if (filename.endsWith(".gz")) {
        // it's faster to do the buffering _outside_ the gzipping as here
        os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename)));
      } else {
        os = new BufferedOutputStream(new FileOutputStream(filename));
      }
      PrintWriter out = new PrintWriter(os);
      String prefix = "BEGIN ";

      out.println(prefix + "OPTIONS");
      op.writeData(out);
      out.println();
      System.err.print(".");

      out.println(prefix + "STATE_INDEX");
      stateIndex.saveToWriter(out);
      out.println();
      System.err.print(".");

      out.println(prefix + "WORD_INDEX");
      wordIndex.saveToWriter(out);
      out.println();
      System.err.print(".");

      out.println(prefix + "TAG_INDEX");
      tagIndex.saveToWriter(out);
      out.println();
      System.err.print(".");

      String uwmClazz = ((lex.getUnknownWordModel() == null) ? "null" :
                         lex.getUnknownWordModel().getClass().getCanonicalName());
      out.println(prefix + "LEXICON " + uwmClazz);
      lex.writeData(out);
      out.println();
      System.err.print(".");

      out.println(prefix + "UNARY_GRAMMAR");
      ug.writeData(out);
      out.println();
      System.err.print(".");

      out.println(prefix + "BINARY_GRAMMAR");
      bg.writeData(out);
      out.println();
      System.err.print(".");

      out.println(prefix + "DEPENDENCY_GRAMMAR");
      if (dg != null) {
        dg.writeData(out);
      }
      out.println();
      System.err.print(".");

      out.flush();
      out.close();
      System.err.println("done.");
    } catch (IOException e) {
      System.err.println("Trouble saving parser data to ASCII format.");
      throw new RuntimeIOException(e);
    }
  }
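
  // Illustrative sketch (not part of the original source): a save/reload
  // round trip using the two save methods above. The file names are
  // arbitrary example values:
  //
  //   lp.saveParserToSerialized("myGrammar.ser.gz");  // compact, fast to load
  //   lp.saveParserToTextFile("myGrammar.txt.gz");    // human-inspectable text grammar
  //   LexicalizedParser reloaded = LexicalizedParser.loadModel("myGrammar.ser.gz");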

  private static void confirmBeginBlock(String file, String line) {
    if (line == null) {
      throw new RuntimeException(file + ": expecting BEGIN block; got end of file.");
    } else if (! line.startsWith("BEGIN")) {
      throw new RuntimeException(file + ": expecting BEGIN block; got " + line);
    }
  }

  protected static LexicalizedParser getParserFromTextFile(String textFileOrUrl, Options op) {
    try {
      Timing tim = new Timing();
      System.err.print("Loading parser from text file " + textFileOrUrl + ' ');
      BufferedReader in = IOUtils.readerFromString(textFileOrUrl);
      Timing.startTime();

      String line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      op.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> stateIndex = HashIndex.loadFromReader(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> wordIndex = HashIndex.loadFromReader(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> tagIndex = HashIndex.loadFromReader(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Lexicon lex = op.tlpParams.lex(op, wordIndex, tagIndex);
      String uwmClazz = line.split(" +")[2];
      if (!uwmClazz.equals("null")) {
        UnknownWordModel model = ReflectionLoading.loadByReflection(uwmClazz, op, lex, wordIndex, tagIndex);
        lex.setUnknownWordModel(model);
      }
      lex.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      UnaryGrammar ug = new UnaryGrammar(stateIndex);
      ug.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      BinaryGrammar bg = new BinaryGrammar(stateIndex);
      bg.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      DependencyGrammar dg = new MLEDependencyGrammar(op.tlpParams, op.directional, op.distance, op.coarseDistance, op.trainOptions.basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
      dg.readData(in);
      System.err.print(".");

      in.close();
      System.err.println(" done [" + tim.toSecondsString() + " sec].");
      return new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
    } catch (IOException e) {
      e.printStackTrace();
    }
    return null;
  }
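
  // For reference (not part of the original source): getParserFromTextFile()
  // above expects the BEGIN blocks written by saveParserToTextFile(), read
  // back in this fixed order:
  //
  //   BEGIN OPTIONS
  //   BEGIN STATE_INDEX
  //   BEGIN WORD_INDEX
  //   BEGIN TAG_INDEX
  //   BEGIN LEXICON <unknownWordModelClass or "null">
  //   BEGIN UNARY_GRAMMAR
  //   BEGIN BINARY_GRAMMAR
  //   BEGIN DEPENDENCY_GRAMMAR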

  public static LexicalizedParser getParserFromSerializedFile(String serializedFileOrUrl) {
    try {
      Timing tim = new Timing();
      System.err.print("Loading parser from serialized file " + serializedFileOrUrl + " ...");
      ObjectInputStream in = IOUtils.readStreamFromString(serializedFileOrUrl);
      LexicalizedParser pd = loadModel(in);

      in.close();
      System.err.println(" done [" + tim.toSecondsString() + " sec].");
      return pd;
    } catch (InvalidClassException ice) {
      // For this, it's not a good idea to continue and try it as a text file!
      System.err.println();   // as in middle of line from above message
      throw new RuntimeException("Invalid class in file: " + serializedFileOrUrl, ice);
    } catch (FileNotFoundException fnfe) {
      // For this, it's not a good idea to continue and try it as a text file!
      System.err.println();   // as in middle of line from above message
      throw new RuntimeException("File not found: " + serializedFileOrUrl, fnfe);
    } catch (StreamCorruptedException sce) {
      // suppress error message, on the assumption that we've really got
      // a text grammar, and that'll be tried next
      System.err.println();
    } catch (Exception e) {
      System.err.println();   // as in middle of line from above message
      e.printStackTrace();
    }
    return null;
  }

  private static void printOptions(boolean train, Options op) {
    op.display();
    if (train) {
      op.trainOptions.display();
    } else {
      op.testOptions.display();
    }
    op.tlpParams.display();
  }

  public static TreeAnnotatorAndBinarizer buildTrainBinarizer(Options op) {
    TreebankLangParserParams tlpParams = op.tlpParams;
    if (!op.trainOptions.leftToRight) {
      return new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), !op.trainOptions.predictSplits, op);
    } else {
      return new TreeAnnotatorAndBinarizer(tlpParams.headFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), !op.trainOptions.predictSplits, op);
    }
  }

  public static CompositeTreeTransformer buildTrainTransformer(Options op) {
    TreeAnnotatorAndBinarizer binarizer = buildTrainBinarizer(op);
    return buildTrainTransformer(op, binarizer);
  }

  public static CompositeTreeTransformer buildTrainTransformer(Options op, TreeAnnotatorAndBinarizer binarizer) {
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    if (op.trainOptions.preTransformer != null) {
      trainTransformer.addTransformer(op.trainOptions.preTransformer);
    }
    if (op.trainOptions.collinsPunc) {
      CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
      trainTransformer.addTransformer(collinsPuncTransformer);
    }
    trainTransformer.addTransformer(binarizer);
    if (op.wordFunction != null) {
      TreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
      trainTransformer.addTransformer(wordFunctionTransformer);
    }
    return trainTransformer;
  }

  /** @return a pair of binaryTrainTreebank,binaryTuneTreebank.
   */
  public static Triple<Treebank, Treebank, Treebank> getAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank,
                                                                                            Treebank secondaryTreebank,
                                                                                            Treebank tuneTreebank,
                                                                                            Options op) {
    // setup tree transforms
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();

    if (op.testOptions.verbose) {
      PrintWriter pwErr = tlpParams.pw(System.err);
      pwErr.print("Training ");
      pwErr.println(trainTreebank.textualSummary(tlp));
      if (secondaryTreebank != null) {
        pwErr.print("Secondary training ");
        pwErr.println(secondaryTreebank.textualSummary(tlp));
      }
    }

    System.err.print("Binarizing trees...");

    TreeAnnotatorAndBinarizer binarizer = buildTrainBinarizer(op);
    CompositeTreeTransformer trainTransformer = buildTrainTransformer(op, binarizer);

    Treebank wholeTreebank;
    if (secondaryTreebank == null) {
      wholeTreebank = trainTreebank;
    } else {
      wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    }

    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
      removeDeleteSplittersFromSplitters(tlp, op);
      if (op.testOptions.verbose) {
        List<String> list = new ArrayList<String>(op.trainOptions.splitters);
        Collections.sort(list);
        System.err.println("Parent split categories: " + list);
      }
    }

    if (op.trainOptions.selectivePostSplit) {
      // Do all the transformations once just to learn selective splits on annotated categories
      TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
      wholeTreebank = wholeTreebank.transform(myTransformer);
      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
      if (op.testOptions.verbose) {
        System.err.println("Parent post annotation split categories: " + op.trainOptions.postSplitters);
      }
    }

    if (op.trainOptions.hSelSplit) {
      // We run through all the trees once just to gather counts for hSelSplit!
      int ptt = op.trainOptions.printTreeTransformations;
      op.trainOptions.printTreeTransformations = 0;
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : wholeTreebank) {
        trainTransformer.transformTree(tree);
      }
      binarizer.setDoSelectiveSplit(true);
      op.trainOptions.printTreeTransformations = ptt;
    }

    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.transform(trainTransformer);
    if (secondaryTreebank != null) {
      secondaryTreebank = secondaryTreebank.transform(trainTransformer);
    }
    if (op.trainOptions.printAnnotatedStateCounts) {
      binarizer.printStateCounts();
    }
    if (op.trainOptions.printAnnotatedRuleCounts) {
      binarizer.printRuleCounts();
    }

    if (tuneTreebank != null) {
      tuneTreebank = tuneTreebank.transform(trainTransformer);
    }

    Timing.tick("done.");

    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }

    return new Triple<Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank);
  }

  private static void removeDeleteSplittersFromSplitters(TreebankLanguagePack tlp, Options op) {
    if (op.trainOptions.deleteSplitters != null) {
      List<String> deleted = new ArrayList<String>();
      for (String del : op.trainOptions.deleteSplitters) {
        String baseDel = tlp.basicCategory(del);
        boolean checkBasic = del.equals(baseDel);
        for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
          String elem = it.next();
          String baseElem = tlp.basicCategory(elem);
          boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
          if (delStr) {
            it.remove();
            deleted.add(elem);
          }
        }
      }
      if (op.testOptions.verbose) {
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
  }

  // TODO: Make below method work with arbitrarily large secondary treebank via iteration
  // TODO: Have weight implemented for training lexicon

  /**
   * A method for training from two different treebanks, the second of which is presumed
   * to be orders of magnitude larger.

   *
   * Trees are not read into memory but processed as they are read from disk.
   *
   * A weight (typically &lt;= 1) can be put on the second treebank.
   *
   * @param trainTreebank A treebank to train from
   * @param secondaryTrainTreebank Another treebank to train from
   * @param weight A weight factor to give the secondary treebank. If the weight
   *     is 0.25, each example in the secondaryTrainTreebank will be treated as
   *     1/4 of an example sentence.
   * @param compactor A class for compacting grammars. May be null.
   * @param op Options for how the grammar is built from the treebank
   * @param tuneTreebank A treebank to tune free params on (may be null)
   * @param extraTaggedWords A list of words to add to the Lexicon
   * @return The trained LexicalizedParser
   */
  public static LexicalizedParser getParserFromTreebank(Treebank trainTreebank,
                                                        Treebank secondaryTrainTreebank,
                                                        double weight,
                                                        GrammarCompactor compactor,
                                                        Options op,
                                                        Treebank tuneTreebank,
                                                        List<List<TaggedWord>> extraTaggedWords) {
    // System.err.println("Currently " + new Date()); // now printed when command-line args are printed
    printOptions(true, op);
    Timing.startTime();

    Triple<Treebank, Treebank, Treebank> treebanks = TreeAnnotatorAndBinarizer.getAnnotatedBinaryTreebankFromTreebank(trainTreebank, secondaryTrainTreebank, tuneTreebank, op);
    Timing.tick("done.");

    Treebank trainTreebankRaw = trainTreebank;
    trainTreebank = treebanks.first();
    secondaryTrainTreebank = treebanks.second();
    tuneTreebank = treebanks.third();

    // +1 to account for the boundary symbol
    trainTreebank = new FilteringTreebank(trainTreebank, new LengthTreeFilter(op.trainOptions.trainLengthLimit + 1));
    if (secondaryTrainTreebank != null) {
      secondaryTrainTreebank = new FilteringTreebank(secondaryTrainTreebank, new LengthTreeFilter(op.trainOptions.trainLengthLimit + 1));
    }
    if (tuneTreebank != null) {
      tuneTreebank = new FilteringTreebank(tuneTreebank, new LengthTreeFilter(op.trainOptions.trainLengthLimit + 1));
    }

    Index<String> stateIndex;
    Index<String> wordIndex;
    Index<String> tagIndex;

    Pair<UnaryGrammar, BinaryGrammar> bgug;
    Lexicon lex;

    if (op.trainOptions.predictSplits) {
      SplittingGrammarExtractor extractor = new SplittingGrammarExtractor(op);
      System.err.print("Extracting PCFG...");
      // TODO: make use of the tagged text
      if (secondaryTrainTreebank == null) {
        extractor.extract(trainTreebank);
      } else {
        extractor.extract(trainTreebank, 1.0, secondaryTrainTreebank, weight);
      }
      bgug = extractor.bgug;
      lex = extractor.lex;
      stateIndex = extractor.stateIndex;
      wordIndex = extractor.wordIndex;
      tagIndex = extractor.tagIndex;
      Timing.tick("done.");
    } else {
      stateIndex = new HashIndex<String>();
      wordIndex = new HashIndex<String>();
      tagIndex = new HashIndex<String>();

      // extract grammars
      BinaryGrammarExtractor bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
      // Extractor lexExtractor = new LexiconExtractor();
      //TreeExtractor uwmExtractor = new UnknownWordModelExtractor(trainTreebank.size());
      System.err.print("Extracting PCFG...");
      if (secondaryTrainTreebank == null) {
        bgug = bgExtractor.extract(trainTreebank);
      } else {
        bgug = bgExtractor.extract(trainTreebank, 1.0, secondaryTrainTreebank, weight);
      }
      Timing.tick("done.");

      System.err.print("Extracting Lexicon...");
      lex = op.tlpParams.lex(op, wordIndex, tagIndex);

      double trainSize = trainTreebank.size();
      if (secondaryTrainTreebank != null) {
        trainSize += (secondaryTrainTreebank.size() * weight);
      }
      if (extraTaggedWords != null) {
        trainSize += extraTaggedWords.size();
      }

      lex.initializeTraining(trainSize);
      // wsg2012: The raw treebank has CoreLabels, which we need for FactoredLexicon
      // training. If TreeAnnotator is updated so that it produces CoreLabels, then we can
      // remove the trainTreebankRaw.
      lex.train(trainTreebank, trainTreebankRaw);
      if (secondaryTrainTreebank != null) {
        lex.train(secondaryTrainTreebank, weight);
      }
      if (extraTaggedWords != null) {
        for (List<TaggedWord> sentence : extraTaggedWords) {
          // TODO: specify a weight?
          lex.trainUnannotated(sentence, 1.0);
        }
      }
      lex.finishTraining();
      Timing.tick("done.");
    }

    //TODO: wsg2011 Not sure if this should come before or after
    //grammar compaction
    if (op.trainOptions.ruleSmoothing) {
      System.err.print("Smoothing PCFG...");
      Function<Pair<UnaryGrammar,BinaryGrammar>,Pair<UnaryGrammar,BinaryGrammar>> smoother = new LinearGrammarSmoother(op.trainOptions, stateIndex, tagIndex);
      bgug = smoother.apply(bgug);
      Timing.tick("done.");
    }

    if (compactor != null) {
      System.err.print("Compacting grammar...");
      Triple<Index<String>, UnaryGrammar, BinaryGrammar> compacted = compactor.compactGrammar(bgug, stateIndex);
      stateIndex = compacted.first();
      bgug.setFirst(compacted.second());
      bgug.setSecond(compacted.third());
      Timing.tick("done.");
    }

    System.err.print("Compiling grammar...");
    BinaryGrammar bg = bgug.second;
    bg.splitRules();
    UnaryGrammar ug = bgug.first;
    ug.purgeRules();
    Timing.tick("done");

    DependencyGrammar dg = null;
    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      AbstractTreeExtractor<DependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
      if (secondaryTrainTreebank == null) {
        dg = dgExtractor.extract(trainTreebank);
      } else {
        dg = dgExtractor.extract(trainTreebank, 1.0, secondaryTrainTreebank, weight);
      }
      //System.err.print("Extracting Unknown Word Model...");
      //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(trainTreebank);
      //Timing.tick("done.");
      Timing.tick("done.");
      if (tuneTreebank != null) {
        System.err.print("Tuning Dependency Model...");
        dg.setLexicon(lex); // MG2008: needed if using PwGt model
        dg.tune(tuneTreebank);
        Timing.tick("done.");
      }
    }

    System.err.println("Done training parser.");
    if (op.trainOptions.trainTreeFile != null) {
      try {
        System.err.print("Writing out binary trees to " + op.trainOptions.trainTreeFile + "...");
        IOUtils.writeObjectToFile(trainTreebank, op.trainOptions.trainTreeFile);
        IOUtils.writeObjectToFile(secondaryTrainTreebank, op.trainOptions.trainTreeFile);
        Timing.tick("done.");
      } catch (Exception e) {
        System.err.println("Problem writing out binary trees.");
      }
    }
    return new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
  }
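
  // Illustrative sketch (not part of the original source): programmatic
  // training via the trainFromTreebank() convenience methods, which delegate
  // to getParserFromTreebank() above. The treebank path and output file
  // name are assumptions:
  //
  //   Options op = new Options();  // defaults to English Penn Treebank params
  //   LexicalizedParser trained =
  //       LexicalizedParser.trainFromTreebank("/path/to/treebank", null, op);
  //   trained.saveParserToSerialized("trained.ser.gz");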

  /**
   * This will set options to the parser, in a way exactly equivalent to
   * passing in the same sequence of command-line arguments.  This is a useful
   * convenience method when building a parser programmatically.  The options
   * passed in should
   * be specified like command-line arguments, including with an initial
   * minus sign.
   *
   * Notes: This can be used to set parsing-time flags for a
   * serialized parser.  You can also still change things serialized
   * in Options, but this will probably degrade parsing performance.
   * The vast majority of command line flags can be passed to this
   * method, but you cannot pass in options that specify the treebank
   * or grammar to be loaded, the grammar to be written, trees or
   * files to be parsed or details of their encoding, nor the
   * TreebankLangParserParams ({@code -tLPP}) to use. The
   * TreebankLangParserParams should be set up on construction of a
   * LexicalizedParser, by constructing an Options that uses
   * the required TreebankLangParserParams, and passing that to a
   * LexicalizedParser constructor.  Note that despite this
   * method being an instance method, many flags are actually set as
   * static class variables.
   *
   * @param flags Arguments to the parser, for example,
   *     {"-outputFormat", "typedDependencies", "-maxLength", "70"}
   * @throws IllegalArgumentException If an unknown flag is passed in
   */
  public void setOptionFlags(String... flags) {
    op.setOptions(flags);
  }
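
  // Illustrative usage of setOptionFlags() above (not part of the original
  // source), mirroring the javadoc example; flags are passed exactly as they
  // would appear on the command line:
  //
  //   lp.setOptionFlags("-outputFormat", "typedDependencies", "-maxLength", "70");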

  /**
   * A main program for using the parser with various options.
   * This program can be used for building and serializing
   * a parser from treebank data, for parsing sentences from a file
   * or URL using a serialized or text grammar parser,
   * and (mainly for parser quality testing)
   * for training and testing a parser on a treebank all in one go.
   *
   * Sample Usages:
   * <ul>
   *   <li> <b>Train a parser (saved to <i>serializedGrammarFilename</i>)
   *     from a directory of trees (<i>trainFilesPath</i>, with an optional fileRange, e.g., 0-1000):</b>
   *     {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -saveToSerializedFile serializedGrammarFilename}
   *   </li>
   *
   *   <li> <b>Train a parser (not saved) from a directory of trees, and test it (reporting scores) on a directory of trees:</b>
   *     {@code java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -train trainFilesPath [fileRange] -testTreebank testFilePath [fileRange]}
   *   </li>
   *
   *   <li> <b>Parse one or more files, given a serialized grammar and a list of files:</b>
   *     {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] serializedGrammarPath filename [filename] ...}
   *   </li>
   *
   *   <li> <b>Test and report scores for a serialized grammar on trees in an output directory:</b>
   *     {@code java -mx512m edu.stanford.nlp.parser.lexparser.LexicalizedParser [-v] -loadFromSerializedFile serializedGrammarPath -testTreebank testFilePath [fileRange]}
   *   </li>
   * </ul>

   *
   * If the serializedGrammarPath ends in {@code .gz},
   * then the grammar is written and read as a compressed file (GZip).
   * If the serializedGrammarPath is a URL, starting with
   * {@code http://}, then the parser is read from the URL.
   * A fileRange specifies a numeric value that must be included within a
   * filename for it to be used in training or testing (this works well with
   * most current treebanks).  It can be specified like a range of pages to be
   * printed, for instance as {@code 200-2199} or
   * {@code 1-300,500-725,9000} or just as {@code 1} (if all your
   * trees are in a single file, just give a dummy argument such as
   * {@code 0} or {@code 1}).
   * The parser can write a grammar as either a serialized Java object file
   * or in a text format (or as both), specified with the following options:
   *
   * {@code java edu.stanford.nlp.parser.lexparser.LexicalizedParser
   * [-v] -train
   * trainFilesPath [fileRange] [-saveToSerializedFile grammarPath]
   * [-saveToTextFile grammarPath]}
   *
   * If no files are supplied to parse, then a hardwired sentence
   * is parsed.
   *
   * In the same position as the verbose flag ({@code -v}), many other
   * options can be specified.  The most useful to an end user are:
   * <ul>
   *   <li> {@code -tLPP class} Specify a different
   *     TreebankLangParserParams, for when using a different language or
   *     treebank (the default is English Penn Treebank). <i>This option MUST occur
   *     before any other language-specific options that are used (or else they
   *     are ignored!).</i>
   *     (It's usually a good idea to specify this option even when loading a
   *     serialized grammar; it is necessary if the language pack specifies a
   *     needed character encoding or you wish to specify language-specific
   *     options on the command line.)</li>
   *
   *   <li> {@code -encoding charset} Specify the character encoding of the
   *     input and output files.  This will override the value in the
   *     TreebankLangParserParams, provided this option appears
   *     <i>after</i> any {@code -tLPP} option.</li>
   *
   *   <li> {@code -tokenized} Says that the input is already separated
   *     into whitespace-delimited tokens.  If this option is specified, any
   *     tokenizer specified for the language is ignored, and a universal (Unicode)
   *     tokenizer, which divides only on whitespace, is used.
   *     Unless you also specify
   *     {@code -escaper}, the tokens <i>must</i> all be correctly
   *     tokenized tokens of the appropriate treebank for the parser to work
   *     well (for instance, if using the Penn English Treebank, you must have
   *     coded "(" as "-LRB-", "3/4" as "3\/4", etc.)</li>
   *
   *   <li> {@code -escaper class} Specify a class of type
   *     {@link Function}&lt;List&lt;HasWord&gt;,List&lt;HasWord&gt;&gt; to do
   *     customized escaping of tokenized text.  This class will be run over the
   *     tokenized text and can fix the representation of tokens.  For instance,
   *     it could change "(" to "-LRB-" for the Penn English Treebank.  A
   *     provided escaper that does such things for the Penn English Treebank is
   *     {@code edu.stanford.nlp.process.PTBEscapingProcessor}</li>
   *
   *   <li> {@code -tokenizerFactory class} Specifies a
   *     TokenizerFactory class to be used for tokenization</li>
   *
   *   <li> {@code -tokenizerOptions options} Specifies options to a
   *     TokenizerFactory class to be used for tokenization.  A comma-separated
   *     list.  For PTBTokenizer, options of interest include
   *     {@code americanize=false} and {@code asciiQuotes} (for German).
   *     Note that any choice of tokenizer options that conflicts with the
   *     tokenization used in the parser training data will likely degrade parser
   *     performance.</li>
   *
   *   <li> {@code -sentences token} Specifies a token that marks sentence
   *     boundaries.  A value of {@code newline} causes sentence breaking on
   *     newlines.  A value of {@code onePerElement} causes each element
   *     (using the XML {@code -parseInside} option) to be treated as a
   *     sentence.  All other tokens will be interpreted literally, and must be
   *     exactly the same as tokens returned by the tokenizer.  For example,
   *     you might specify "|||" and put that symbol sequence as a token between
   *     sentences.
   *     If no explicit sentence breaking option is chosen, sentence breaking
   *     is done based on a set of language-particular sentence-ending patterns.</li>
   *
   *   <li> {@code -parseInside element} Specifies that parsing should only
   *     be done for tokens inside the indicated XML-style
   *     elements (done as simple pattern matching, rather than XML parsing).
   *     For example, if this is specified as {@code sentence}, then
   *     the text inside the {@code sentence} element
   *     would be parsed.
   *     Using "-parseInside s" gives you support for the input format of
   *     Charniak's parser. Sentences cannot span elements. Whether the
   *     contents of the element are treated as one sentence or potentially
   *     multiple sentences is controlled by the {@code -sentences} flag.
   *     The default is potentially multiple sentences.
   *     This option gives support for extracting and parsing
   *     text from very simple SGML and XML documents, and is provided as a
   *     user convenience for that purpose. If you want to really parse XML
   *     documents before NLP parsing them, you should use an XML parser, and then
   *     call to a LexicalizedParser on appropriate CDATA.</li>
   *
   *   <li> {@code -tagSeparator char} Specifies to look for tags on words
   *     following the word and separated from it by a special character
   *     {@code char}.  For instance, many tagged corpora have the
   *     representation "house/NN" and you would use {@code -tagSeparator /}.
   *     Notes: This option requires that the input be pretokenized.
   *     The separator has to be only a single character, and there is no
   *     escaping mechanism. However, splitting is done on the <i>last</i>
   *     instance of the character in the token, so that cases like
   *     "3\/4/CD" are handled correctly.  The parser will in all normal
   *     circumstances use the tag you provide, but will override it in the
   *     case of very common words in cases where the tag that you provide
   *     is not one that it regards as a possible tagging for the word.
   *     The parser supports a format where only some of the words in a sentence
   *     have a tag (if you are calling the parser programmatically, you indicate
   *     them by having them implement the {@code HasTag} interface).
   *     You can do this at the command-line by only having tags after some words,
   *     but you are limited by the fact that there is no way to escape the
   *     tagSeparator character.</li>
   *
   *   <li> {@code -maxLength leng} Specify the longest sentence that
   *     will be parsed (and hence indirectly the amount of memory
   *     needed for the parser).  If this is not specified, the parser will
   *     try to dynamically grow its parse chart when long sentences are
   *     encountered, but may run out of memory trying to do so.</li>
   *
   *   <li> {@code -outputFormat styles} Choose the style(s) of output
   *     sentences: {@code penn} for prettyprinting as in the Penn
   *     treebank files, or {@code oneline} for printing sentences one
   *     per line, {@code words}, {@code wordsAndTags},
   *     {@code dependencies}, {@code typedDependencies},
   *     or {@code typedDependenciesCollapsed}.
   *     Multiple options may be specified as a comma-separated
   *     list.  See TreePrint class for further documentation.</li>
   *
   *   <li> {@code -outputFormatOptions} Provide options that control the
   *     behavior of various {@code -outputFormat} choices, such as
   *     {@code lexicalize}, {@code stem}, {@code markHeadNodes},
   *     or {@code xml}.  See {@link edu.stanford.nlp.trees.TreePrint}.
   *     Options are specified as a comma-separated list.</li>
   *
   *   <li> {@code -writeOutputFiles} Write output files corresponding
   *     to the input files, with the same name but a {@code ".stp"}
   *     file extension.  The format of these files depends on the
   *     {@code outputFormat} option.  (If not specified, output is sent
   *     to stdout.)</li>
   *
   *   <li> {@code -outputFilesExtension} The extension that is appended to
   *     the filename that is being parsed to produce an output file name (with the
   *     -writeOutputFiles option). The default is {@code stp}.  Don't
   *     include the period.</li>
   *
   *   <li> {@code -outputFilesDirectory} The directory in which output
   *     files are written (when the -writeOutputFiles option is specified).
   *     If not specified, output files are written in the same directory as the
   *     input files.</li>
   *
   *   <li> {@code -nthreads} Parsing files and testing on treebanks
   *     can use multiple threads.  This option tells the parser how many
   *     threads to use.  A negative number indicates to use as many
   *     threads as the machine has cores.</li>
   * </ul>
   *
   * See also the package documentation for more details and examples of use.
   *
   * @param args Command line arguments, as above
   */
  public static void main(String[] args) {
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank testTreebank = null;
    Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilter = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;

    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
      System.err.println("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
      return;
    }

    Options op = new Options();
    List<String> optionArgs = new ArrayList<String>();
    String encoding = null;
    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
      if (args[argIndex].equalsIgnoreCase("-train") ||
          args[argIndex].equalsIgnoreCase("-trainTreebank")) {
        train = true;
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        treebankPath = treebankDescription.first();
        trainFilter = treebankDescription.second();
      } else if (args[argIndex].equalsIgnoreCase("-train2")) {
        // train = true;     // cdm july 2005: should require -train for this
        Triple<String, FileFilter, Double> treebankDescription = ArgUtils.getWeightedTreebankDescription(args, argIndex, "-train2");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        secondaryTreebankPath = treebankDescription.first();
        secondaryTrainFilter = treebankDescription.second();
        secondaryTreebankWeight = treebankDescription.third();
      } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
        try {
          op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
        } catch (ClassNotFoundException e) {
          System.err.println("Class not found: " + args[argIndex + 1]);
          throw new RuntimeException(e);
        } catch (InstantiationException e) {
          System.err.println("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
          throw new RuntimeException(e);
        } catch (IllegalAccessException e) {
          System.err.println("Illegal access" + e);
          throw new RuntimeException(e);
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
        // sets encoding for TreebankLangParserParams
        // redone later to override any serialized parser one read in
        encoding = args[argIndex + 1];
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
        tokenized = true;
        argIndex += 1;
      } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
        try {
          escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
        } catch (Exception e) {
          System.err.println("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
        tokenizerOptions = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
        tokenizerFactoryClass = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
        tokenizerMethod = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
        sentenceDelimiter = args[argIndex + 1];
        if (sentenceDelimiter.equalsIgnoreCase("newline")) {
          sentenceDelimiter = "\n";
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
        elementDelimiter = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
        tagDelimiter = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") ||
                 args[argIndex].equalsIgnoreCase("-model")) {
        // load the parser from a binary serialized file
        // the next argument must be the path to the parser file
        serializedInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        // load the parser from declarative text file
        // the next argument must be the path to the parser file
        textInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
        saveToSerializedFile = true;
        if (ArgUtils.numSubArgs(args, argIndex) < 1) {
          System.err.println("Missing path: -saveToSerialized filename");
        } else {
          serializedOutputFileOrUrl = args[argIndex + 1];
        }
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
        // save the parser to declarative text file
        saveToTextFile = true;
        textOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
        // save the training trees to a binary file
        op.trainOptions.trainTreeFile = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-treebank") ||
                 args[argIndex].equalsIgnoreCase("-testTreebank") ||
                 args[argIndex].equalsIgnoreCase("-test")) {
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        testPath = treebankDescription.first();
        testFilter = treebankDescription.second();
      } else if (args[argIndex].equalsIgnoreCase("-tune")) {
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        tunePath = treebankDescription.first();
        tuneFilter = treebankDescription.second();
      } else {
        int oldIndex = argIndex;
        argIndex = op.setOptionOrWarn(args, argIndex);
        for (int i = oldIndex; i < argIndex; i++) {
          optionArgs.add(args[i]);
        }
      }
    } // end while loop through arguments

    // all other arguments are order dependent and
    // are processed in order below

    if (tuneFilter != null || tunePath != null) {
      if (tunePath == null) {
        if (treebankPath == null) {
          throw new RuntimeException("No tune treebank path specified...");
        } else {
          System.err.println("No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');
          tunePath = treebankPath;
        }
      }
      tuneTreebank = op.tlpParams.testMemoryTreebank();
      tuneTreebank.loadPath(tunePath, tuneFilter);
    }

    if (!train && op.testOptions.verbose) {
      StringUtils.printErrInvocationString("LexicalizedParser", args);
    }
    LexicalizedParser lp; // always initialized in next if-then-else block
    if (train) {
      StringUtils.printErrInvocationString("LexicalizedParser", args);

      // so we train a parser using the treebank
      GrammarCompactor compactor = null;
      if (op.trainOptions.compactGrammar() == 3) {
        compactor = new ExactGrammarCompactor(op, false, false);
      }

      Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);

      Treebank secondaryTrainTreebank = null;
      if (secondaryTreebankPath != null) {
        secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
      }

      List<List<TaggedWord>> extraTaggedWords = null;
      if (op.trainOptions.taggedFiles != null) {
        extraTaggedWords = new ArrayList<List<TaggedWord>>();
        List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
        for (TaggedFileRecord record : fileRecords) {
          for (List<TaggedWord> sentence : record.reader()) {
            extraTaggedWords.add(sentence);
          }
        }
      }

      lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords);
    } else if (textInputFileOrUrl != null) {
      // so we load the parser from a text grammar file
      lp = getParserFromTextFile(textInputFileOrUrl, op);
    } else {
      // so we load a serialized parser
      if (serializedInputFileOrUrl == null && argIndex < args.length) {
        // the next argument must be the path to the serialized parser
        serializedInputFileOrUrl = args[argIndex];
        argIndex++;
      }
      if (serializedInputFileOrUrl == null) {
        System.err.println("No grammar specified, exiting...");
        return;
      }
      String[] extraArgs = new String[optionArgs.size()];
      extraArgs = optionArgs.toArray(extraArgs);
      try {
        lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
        op = lp.op;
      } catch (IllegalArgumentException e) {
        System.err.println("Error loading parser, exiting...");
        throw e;
      }
    }

    // set up tokenizerFactory with options if provided
    if (tokenizerFactoryClass != null || tokenizerOptions != null) {
      try {
        if (tokenizerFactoryClass != null) {
          Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
          Method factoryMethod;
          if (tokenizerOptions != null) {
            factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
            tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
          } else {
            factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
            tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
          }
        } else {
          // have options but no tokenizer factory.  use the parser
          // langpack's factory and set its options
          tokenizerFactory = lp.op.langpack().getTokenizerFactory();
          tokenizerFactory.setOptions(tokenizerOptions);
        }
      } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
        System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
        throw new RuntimeException(e);
      }
    }

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
    // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
    if (encoding != null) {
      op.tlpParams.setInputEncoding(encoding);
      op.tlpParams.setOutputEncoding(encoding);
    }

    if (testFilter != null || testPath != null) {
      if (testPath == null) {
        if (treebankPath == null) {
          throw new RuntimeException("No test treebank path specified...");
        } else {
          System.err.println("No test treebank path specified.  Using train path: \"" + treebankPath + '\"');
          testPath = treebankPath;
        }
      }
      testTreebank = op.tlpParams.testMemoryTreebank();
      testTreebank.loadPath(testPath, testFilter);
    }

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again.  -- Roger

    // Now what do we do with the parser we've made
    if (saveToTextFile) {
      // save the parser to textGrammar format
      if (textOutputFileOrUrl != null) {
        lp.saveParserToTextFile(textOutputFileOrUrl);
      } else {
        System.err.println("Usage: must specify a text grammar output path");
      }
    }
    if (saveToSerializedFile) {
      if (serializedOutputFileOrUrl != null) {
        lp.saveParserToSerialized(serializedOutputFileOrUrl);
      } else if (textOutputFileOrUrl == null && testTreebank == null) {
        // no saving/parsing request has been specified
        System.err.println("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
      }
    }

    if (op.testOptions.verbose || train) {
      // Tell the user a little or a lot about what we have made
      // get lexicon size separately as it may have its own prints in it....
      String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()): "";
      System.err.println("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
      System.err.println("Grammar\t" +
          lp.stateIndex.size() + '\t' +
          lp.tagIndex.size() + '\t' +
          lp.wordIndex.size() + '\t' +
          (lp.ug != null ? lp.ug.numRules(): "") + '\t' +
          (lp.bg != null ? lp.bg.numRules(): "") + '\t' +
          lexNumRules);
      System.err.println("ParserPack is " + op.tlpParams.getClass().getName());
      System.err.println("Lexicon is " + lp.lex.getClass().getName());
      if (op.testOptions.verbose) {
        System.err.println("Tags are: " + lp.tagIndex);
        // System.err.println("States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out by the below printOptions command if the flag -printStates is given (at training time)!
      }
      printOptions(false, op);
    }

    if (testTreebank != null) {
      // test parser on treebank
      EvaluateTreebank evaluator = new EvaluateTreebank(lp);
      evaluator.testOnTreebank(testTreebank);
    } else if (argIndex >= args.length) {
      // no more arguments, so we just parse our own test sentence
      PrintWriter pwOut = op.tlpParams.pw();
      PrintWriter pwErr = op.tlpParams.pw(System.err);
      ParserQuery pq = lp.parserQuery();
      if (pq.parse(op.tlpParams.defaultTestSentence())) {
        lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
      } else {
        pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
      }
    } else {
      // We parse filenames given by the remaining arguments
      ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp);
    }
  } // end main

  private static final long serialVersionUID = 2;

} // end class LexicalizedParser



