edu.stanford.nlp.trees.GrammaticalStructureConversionUtils
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.trees;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.trees.international.pennchinese.CTBErrorCorrectingTreeNormalizer;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
/**
* Contains several utility methods to convert constituency trees to
* dependency trees.
*
* Used by {@link GrammaticalStructure#main(String[])}
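*
* A minimal usage sketch (illustrative only; it assumes you already have an
* English constituency {@code Tree} named {@code tree} and uses
* {@link EnglishGrammaticalStructure} to derive the dependencies):
*
* <pre>{@code
* GrammaticalStructure gs = new EnglishGrammaticalStructure(tree);
* // Stanford representation, e.g. nsubj(died-2, Sam-1)
* GrammaticalStructureConversionUtils.printDependencies(
*     gs, gs.typedDependencies(), tree, false, false, false);
* // The same dependencies in CoNLL-X format
* GrammaticalStructureConversionUtils.printDependencies(
*     gs, gs.typedDependencies(), tree, true, false, false);
* }</pre>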
*/
public class GrammaticalStructureConversionUtils {
public static final String DEFAULT_PARSER_FILE = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
/**
* Print typed dependencies in either the Stanford dependency representation
* or in the conllx format.
*
* @param deps Typed dependencies to print
* @param tree Tree corresponding to typed dependencies (only necessary if conllx
* == true)
* @param conllx If true use conllx format, otherwise use Stanford representation
* @param extraSep If true, in the Stanford representation, the extra dependencies
* (which do not preserve the tree structure) are printed after the
* basic dependencies
* @param convertToUPOS If true, convert the POS tags to universal POS tags and output
* them along with the original POS tags.
*/
public static void printDependencies(GrammaticalStructure gs, Collection<TypedDependency> deps, Tree tree,
boolean conllx, boolean extraSep, boolean convertToUPOS) {
System.out.println(dependenciesToString(gs, deps, tree, conllx, extraSep, convertToUPOS));
}
/**
* Calls dependenciesToCoNLLXString with the basic dependencies
* from a grammatical structure.
*
* (see {@link #dependenciesToCoNLLXString(Collection, CoreMap)})
*/
public static String dependenciesToCoNLLXString(GrammaticalStructure gs, CoreMap sentence) {
return dependenciesToCoNLLXString(gs.typedDependencies(), sentence);
}
/**
*
* Returns a dependency tree in CoNLL-X format.
* It requires a CoreMap for the sentence with a TokensAnnotation.
* Each token has to contain a word and a POS tag.
*
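* For illustration, each token becomes one row with the CoNLL-X columns
* ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL;
* the word, tags, and relation below are made-up example values:
*
* <pre>{@code
* 1    Sam    _    NNP    NNP    _    2    nsubj    _    _
* }</pre>
*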
* @param deps The list of TypedDependency relations.
* @param sentence The corresponding CoreMap for the sentence.
* @return Dependency tree in CoNLL-X format.
*/
public static String dependenciesToCoNLLXString(Collection<TypedDependency> deps, CoreMap sentence) {
StringBuilder bf = new StringBuilder();
HashMap<Integer, TypedDependency> indexedDeps = new HashMap<>(deps.size());
for (TypedDependency dep : deps) {
indexedDeps.put(dep.dep().index(), dep);
}
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
if (tokens == null) {
throw new RuntimeException("dependenciesToCoNLLXString: CoreMap does not have required TokensAnnotation.");
}
int idx = 1;
for (CoreLabel token : tokens) {
String word = token.value();
String pos = token.tag();
String cPos = (token.get(CoreAnnotations.CoarseTagAnnotation.class) != null) ?
token.get(CoreAnnotations.CoarseTagAnnotation.class) : pos;
String lemma = token.lemma() != null ? token.lemma() : "_";
Integer gov = indexedDeps.containsKey(idx) ? indexedDeps.get(idx).gov().index() : 0;
String reln = indexedDeps.containsKey(idx) ? indexedDeps.get(idx).reln().toString() : "erased";
String out = String.format("%d\t%s\t%s\t%s\t%s\t_\t%d\t%s\t_\t_\n", idx, word, lemma, cPos, pos, gov, reln);
bf.append(out);
idx++;
}
return bf.toString();
}
public static String dependenciesToString(GrammaticalStructure gs, Collection<TypedDependency> deps, Tree tree,
boolean conllx, boolean extraSep, boolean convertToUPOS) {
StringBuilder bf = new StringBuilder();
Map<Integer, Integer> indexToPos = Generics.newHashMap();
indexToPos.put(0,0); // to deal with the special node "ROOT"
List<Tree> gsLeaves = gs.root.getLeaves();
for (int i = 0; i < gsLeaves.size(); i++) {
TreeGraphNode leaf = (TreeGraphNode) gsLeaves.get(i);
indexToPos.put(leaf.label.index(), i + 1);
}
if (conllx) {
List<Tree> leaves = tree.getLeaves();
List<Label> uposLabels = null;
if (convertToUPOS) {
Tree uposTree = UniversalPOSMapper.mapTree(tree);
uposLabels = uposTree.preTerminalYield();
} else {
uposLabels = tree.preTerminalYield();
}
int index = 0;
CoreMap sentence = new CoreLabel();
List<CoreLabel> tokens = new ArrayList<>(leaves.size());
for (Tree leaf : leaves) {
index++;
if (!indexToPos.containsKey(index)) {
continue;
}
CoreLabel token = new CoreLabel();
token.setIndex(index);
token.setValue(leaf.value());
token.setWord(leaf.value());
token.setTag(leaf.parent(tree).value());
token.set(CoreAnnotations.CoarseTagAnnotation.class, uposLabels.get(index - 1).value());
tokens.add(token);
}
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
bf.append(dependenciesToCoNLLXString(deps, sentence));
} else {
if (extraSep) {
List extraDeps = new ArrayList<>();
for (TypedDependency dep : deps) {
if (dep.extra()) {
extraDeps.add(dep);
} else {
bf.append(toStringIndex(dep, indexToPos));
bf.append("\n");
}
}
// now we print the separator for extra dependencies, and print these if
// there are some
if (!extraDeps.isEmpty()) {
bf.append("======\n");
for (TypedDependency dep : extraDeps) {
bf.append(toStringIndex(dep, indexToPos));
bf.append("\n");
}
}
} else {
for (TypedDependency dep : deps) {
bf.append(toStringIndex(dep, indexToPos));
bf.append("\n");
}
}
}
return bf.toString();
}
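/**
* Formats a single dependency in the Stanford representation,
* {@code reln(gov-position, dep-position)}, for example {@code nsubj(died-2, Sam-1)},
* using the given mapping from token index to linear position.
*/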
private static String toStringIndex(TypedDependency td, Map<Integer, Integer> indexToPos) {
IndexedWord gov = td.gov();
IndexedWord dep = td.dep();
return td.reln() + "(" + gov.value() + "-" + indexToPos.get(gov.index()) + gov.toPrimes() + ", " + dep.value() + "-" + indexToPos.get(dep.index()) + dep.toPrimes() + ")";
}
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(GrammaticalStructureConversionUtils.class);
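/**
* Splits a class specification of the form {@code "my.pkg.MyReader(arg1,arg2)"}
* (a hypothetical name, for illustration) into the class name followed by its
* constructor arguments; a bare class name yields a single-element array.
*/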
private static String[] parseClassConstructArgs(String namePlusArgs) {
String[] args = StringUtils.EMPTY_STRING_ARRAY;
String name = namePlusArgs;
if (namePlusArgs.matches(".*\\([^)]*\\)$")) {
String argStr = namePlusArgs.replaceFirst("^.*\\(([^)]*)\\)$", "$1");
args = argStr.split(",");
name = namePlusArgs.replaceFirst("\\([^)]*\\)$", "");
}
String[] tokens = new String[1 + args.length];
tokens[0] = name;
System.arraycopy(args, 0, tokens, 1, args.length);
return tokens;
}
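/**
* Loads a {@link DependencyReader} implementation by reflection from a class name,
* optionally followed by String constructor arguments in parentheses. Unqualified
* names are also tried under {@code edu.stanford.nlp.trees}. Returns {@code null}
* (after logging) when the class or a suitable public constructor cannot be found.
*/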
private static DependencyReader loadAlternateDependencyReader(String altDepReaderName) {
Class<? extends DependencyReader> altDepReaderClass = null;
String[] toks = parseClassConstructArgs(altDepReaderName);
altDepReaderName = toks[0];
String[] depReaderArgs = new String[toks.length - 1];
System.arraycopy(toks, 1, depReaderArgs, 0, toks.length - 1);
try {
Class<?> cl = Class.forName(altDepReaderName);
altDepReaderClass = cl.asSubclass(DependencyReader.class);
} catch (ClassNotFoundException e) {
// have a second go below
}
if (altDepReaderClass == null) {
try {
Class<?> cl = Class.forName("edu.stanford.nlp.trees." + altDepReaderName);
altDepReaderClass = cl.asSubclass(DependencyReader.class);
} catch (ClassNotFoundException e) {
//
}
}
if (altDepReaderClass == null) {
log.info("Can't load dependency reader " + altDepReaderName + " or edu.stanford.nlp.trees." + altDepReaderName);
return null;
}
DependencyReader altDepReader; // initialized below
if (depReaderArgs.length == 0) {
try {
altDepReader = altDepReaderClass.newInstance();
} catch (InstantiationException e) {
throw new RuntimeException(e);
} catch (IllegalAccessException e) {
log.info("No-argument constructor to " + altDepReaderName + " is not public");
return null;
}
} else {
try {
altDepReader = altDepReaderClass.getConstructor(String[].class).newInstance((Object) depReaderArgs);
} catch (IllegalArgumentException | SecurityException | InvocationTargetException e) {
throw new RuntimeException(e);
} catch (InstantiationException e) {
e.printStackTrace();
return null;
} catch (IllegalAccessException e) {
log.info(depReaderArgs.length + " argument constructor to " + altDepReaderName + " is not public.");
return null;
} catch (NoSuchMethodException e) {
log.info("String arguments constructor to " + altDepReaderName + " does not exist.");
return null;
}
}
return altDepReader;
}
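/**
* Loads a {@link DependencyPrinter} implementation by reflection, using the same
* name-plus-constructor-arguments convention as
* {@link #loadAlternateDependencyReader(String)}; returns {@code null} if the class
* cannot be loaded or instantiated.
*/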
private static DependencyPrinter loadAlternateDependencyPrinter(String altDepPrinterName) {
Class<? extends DependencyPrinter> altDepPrinterClass = null;
String[] toks = parseClassConstructArgs(altDepPrinterName);
altDepPrinterName = toks[0];
String[] depPrintArgs = new String[toks.length - 1];
System.arraycopy(toks, 1, depPrintArgs, 0, toks.length - 1);
try {
Class<?> cl = Class.forName(altDepPrinterName);
altDepPrinterClass = cl.asSubclass(DependencyPrinter.class);
} catch (ClassNotFoundException e) {
//
}
if (altDepPrinterClass == null) {
try {
Class<?> cl = Class.forName("edu.stanford.nlp.trees." + altDepPrinterName);
altDepPrinterClass = cl.asSubclass(DependencyPrinter.class);
} catch (ClassNotFoundException e) {
//
}
}
if (altDepPrinterClass == null) {
System.err.printf("Unable to load alternative printer %s or %s. Is your classpath set correctly?\n", altDepPrinterName, "edu.stanford.nlp.trees." + altDepPrinterName);
return null;
}
try {
DependencyPrinter depPrinter;
if (depPrintArgs.length == 0) {
depPrinter = altDepPrinterClass.newInstance();
} else {
depPrinter = altDepPrinterClass.getConstructor(String[].class).newInstance((Object) depPrintArgs);
}
return depPrinter;
} catch (IllegalArgumentException e) {
e.printStackTrace();
return null;
} catch (SecurityException e) {
e.printStackTrace();
return null;
} catch (InstantiationException e) {
e.printStackTrace();
return null;
} catch (IllegalAccessException e) {
e.printStackTrace();
return null;
} catch (InvocationTargetException e) {
e.printStackTrace();
return null;
} catch (NoSuchMethodException e) {
if (depPrintArgs.length == 0) {
System.err.printf("Can't find no-argument constructor %s().%n", altDepPrinterName);
} else {
System.err.printf("Can't find constructor %s(%s).%n", altDepPrinterName, Arrays.toString(depPrintArgs));
}
return null;
}
}
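/**
* Loads the LexicalizedParser by reflection (so this class has no compile-time
* dependency on the parser package) and returns it as a sentence-parsing function.
* A null or empty {@code parserFile} falls back to {@link #DEFAULT_PARSER_FILE},
* with {@code -retainTmpSubcategories} when no parser options are given.
*/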
private static Function<List<? extends HasWord>, Tree> loadParser(String parserFile, String parserOptions, boolean makeCopulaHead) {
if (parserFile == null || "".equals(parserFile)) {
parserFile = DEFAULT_PARSER_FILE;
if (parserOptions == null) {
parserOptions = "-retainTmpSubcategories";
}
}
if (parserOptions == null) {
parserOptions = "";
}
if (makeCopulaHead) {
parserOptions = "-makeCopulaHead " + parserOptions;
}
parserOptions = parserOptions.trim();
// Load parser by reflection, so that this class doesn't require parser
// for runtime use
// LexicalizedParser lp = LexicalizedParser.loadModel(parserFile);
// For example, the tregex package uses TreePrint, which uses
// GrammaticalStructure, which would then import the
// LexicalizedParser. The tagger can read trees, which means it
// would depend on tregex and therefore depend on the parser.
Function<List<? extends HasWord>, Tree> lp;
try {
Class[] classes = new Class[] { String.class, String[].class };
Method method = Class.forName("edu.stanford.nlp.parser.lexparser.LexicalizedParser").getMethod("loadModel", classes);
String[] opts = StringUtils.EMPTY_STRING_ARRAY;
if ( ! parserOptions.isEmpty()) {
opts = parserOptions.split(" +");
}
lp = (Function<List<? extends HasWord>,Tree>) method.invoke(null, parserFile, opts);
} catch (Exception cnfe) {
throw new RuntimeException(cnfe);
}
return lp;
}
/**
* Allows a collection of trees, that is, a Treebank, to appear to be a collection
* of GrammaticalStructures.
*
* @author danielcer
*
*/
private static class TreeBankGrammaticalStructureWrapper implements Iterable<GrammaticalStructure> {
private final Iterable<Tree> trees;
private final boolean keepPunct;
private final TreebankLangParserParams params;
private final Map<GrammaticalStructure, Tree> origTrees = new WeakHashMap<>();
public TreeBankGrammaticalStructureWrapper(Iterable<Tree> wrappedTrees, boolean keepPunct, TreebankLangParserParams params) {
trees = wrappedTrees;
this.keepPunct = keepPunct;
this.params = params;
}
@Override
public Iterator<GrammaticalStructure> iterator() {
return new GsIterator();
}
public Tree getOriginalTree(GrammaticalStructure gs) {
return origTrees.get(gs);
}
private class GsIterator implements Iterator<GrammaticalStructure> {
private final Iterator<Tree> tbIterator = trees.iterator();
private final Predicate<String> puncFilter;
private final HeadFinder hf;
private GrammaticalStructure next;
public GsIterator() {
if (keepPunct) {
puncFilter = Filters.acceptFilter();
} else if (params.generateOriginalDependencies()) {
puncFilter = params.treebankLanguagePack().punctuationWordRejectFilter();
} else {
puncFilter = params.treebankLanguagePack().punctuationTagRejectFilter();
}
hf = params.typedDependencyHeadFinder();
primeGs();
}
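/** Advances the underlying tree iterator to the next tree that converts successfully,
* caching the resulting GrammaticalStructure in {@code next} ({@code null} when the
* treebank is exhausted). */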
private void primeGs() {
GrammaticalStructure gs = null;
while (gs == null && tbIterator.hasNext()) {
Tree t = tbIterator.next();
// log.info("GsIterator: Next tree is");
// log.info(t);
if (t == null) {
continue;
}
try {
gs = params.getGrammaticalStructure(t, puncFilter, hf);
origTrees.put(gs, t);
next = gs;
// log.info("GsIterator: Next tree is");
// log.info(t);
return;
} catch (NullPointerException npe) {
log.info("Bung tree caused below dump. Continuing....");
log.info(t);
npe.printStackTrace();
}
}
next = null;
}
@Override
public boolean hasNext() {
return next != null;
}
@Override
public GrammaticalStructure next() {
GrammaticalStructure ret = next;
if (ret == null) {
throw new NoSuchElementException();
}
primeGs();
return ret;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
} // end static class TreeBankGrammaticalStructureWrapper
/**
* Enum to identify the different converter options. To add a new
* option, add it to the list with an abbreviation, a TreeNormalizer, and a
* TreebankLangParserParams class name; getConverterOptions will then find it.
*/
public enum ConverterOptions {
UniversalEnglish("en", new NPTmpRetainingTreeNormalizer(0, false, 1, false),
"edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams", false, true),
UniversalChinese("zh", new CTBErrorCorrectingTreeNormalizer(false, false, false, false),
"edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams", false, false),
English("en-sd", new NPTmpRetainingTreeNormalizer(0, false, 1, false),
"edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams", true, true),
Chinese("zh-sd", new CTBErrorCorrectingTreeNormalizer(false, false, false, false),
"edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams", true, false);
public final String abbreviation;
public final TreeNormalizer treeNormalizer;
public final String tlPPClassName;
public final boolean stanfordDependencies;
/* Conversion to UPOS is currently only supported for English. */
public final boolean convertToUPOS;
ConverterOptions(String abbreviation, TreeNormalizer treeNormalizer, String tlPPClassName,
boolean stanfordDependencies, boolean convertToUPOS) {
this.abbreviation = abbreviation;
this.treeNormalizer = treeNormalizer;
this.tlPPClassName = tlPPClassName;
/* Generate old Stanford Dependencies instead of UD, when set to true. */
this.stanfordDependencies = stanfordDependencies;
this.convertToUPOS = convertToUPOS;
}
private static final Map<String, ConverterOptions> nameToTokenizerMap = initializeNameMap();
private static Map<String, ConverterOptions> initializeNameMap() {
Map<String, ConverterOptions> map = Generics.newHashMap();
for (ConverterOptions opts : ConverterOptions.values()) {
if (opts.abbreviation != null) {
map.put(opts.abbreviation.toUpperCase(), opts);
}
map.put(opts.toString().toUpperCase(), opts);
}
return Collections.unmodifiableMap(map);
}
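/**
* Looks up converter options by abbreviation (e.g. "en", "zh-sd") or enum name,
* case-insensitively; a null or unknown language falls back to Universal English ("en").
*/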
public static ConverterOptions getConverterOptions(String language) {
if (language == null) { return nameToTokenizerMap.get("EN"); }
ConverterOptions opts = nameToTokenizerMap.get(language.toUpperCase());
return opts != null ? opts : nameToTokenizerMap.get("EN");
}
}
/**
* Given sentences or trees, output the typed dependencies.
*
* By default, the method outputs the collapsed typed dependencies with
* processing of conjuncts. The input can be given as plain text (one sentence
* per line) using the option -sentFile, or as trees using the option
* -treeFile. For -sentFile, the input has to be strictly one sentence per
* line. You can specify where to find a parser with -parserFile
* serializedParserPath. See LexicalizedParser for more flexible processing of
* text files (including with Stanford Dependencies output). The above options
* assume a file as input. You can also feed trees (only) via stdin by using
* the option -filter. If one does not specify a -parserFile, one
* can specify which language pack to use with -tLPP. This option
* specifies a class which determines which GrammaticalStructure to
* use, which HeadFinder to use, etc. It will default to
* edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams,
* but any TreebankLangParserParams can be specified.
*
* If no method of producing trees is given other than to use the
* LexicalizedParser, but no parser is specified, a default parser
* is used, the English parser. You can specify options to load
* with the parser using the -parserOpts flag. If the default
* parser is used, and no options are provided, the option
* -retainTmpSubcategories is used.
*
* The following options can be used to specify the types of dependencies
* wanted:
*
* -collapsed collapsed dependencies
* -basic non-collapsed dependencies that preserve a tree structure
* -nonCollapsed non-collapsed dependencies that do not preserve a tree
* structure (the basic dependencies plus the extra ones)
* -CCprocessed
* collapsed dependencies and conjunctions processed (dependencies are added
* for each conjunct) -- this is the default if no options are passed
* -collapsedTree collapsed dependencies retaining a tree structure
* -makeCopulaHead Contrary to the approach argued for in the SD papers,
* nevertheless make the verb 'to be' the head, not the predicate noun, adjective,
* etc. (However, when the verb 'to be' is used as an auxiliary verb, the main
* verb is still treated as the head.)
* -originalDependencies generate the dependencies using the original converter
* instead of the Universal Dependencies converter.
*
*
* The {@code -conllx} option will output the dependencies in the CoNLL format,
* instead of in the standard Stanford format (relation(governor,dependent))
* and will retain punctuation by default.
* When used in the "collapsed" format, words such as prepositions and conjunctions
* that get collapsed into the grammatical relations and are no longer part of the
* sentence per se will be annotated with "erased" as the grammatical relation
* and attached to the fake "ROOT" node with index 0.
*
* There is also an option to retain dependencies involving punctuation:
* {@code -keepPunct}
*
* The {@code -extraSep} option used with -nonCollapsed will print the basic
* dependencies first, then a separator ======, and then the extra
* dependencies that do not preserve the tree structure. The -test option is
* used for debugging: it prints the grammatical structure, as well as the
* basic, collapsed and CCprocessed dependencies. It also checks the
* connectivity of the collapsed dependencies. If the collapsed dependencies
* list doesn't constitute a connected graph, it prints the possible offending
* nodes (one of them is the real root of the graph).
*
* Using the -conllxFile option, you can pass a file containing Stanford dependencies
* in the CoNLL format (e.g., the basic dependencies), and obtain another
* representation using one of the representation options.
*
* Usage:
* java edu.stanford.nlp.trees.GrammaticalStructure [-treeFile FILE | -sentFile FILE | -conllxFile FILE | -filter]
* [-collapsed -basic -CCprocessed -test -generateOriginalDependencies]
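*
* For example, to print basic dependencies in CoNLL-X format for a file of
* constituency trees (the tree file name here is only illustrative):
*
* <pre>{@code
* java edu.stanford.nlp.trees.GrammaticalStructure -treeFile trees.mrg -basic -conllx
* }</pre>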
*
* @param args Command-line arguments, as above
*/
@SuppressWarnings("unchecked")
public static void convertTrees(String[] args, String defaultLang) {
/* Use a tree normalizer that removes all empty nodes.
This prevents wrong indexing of the nodes in the dependency relations. */
Iterable<GrammaticalStructure> gsBank = null;
Properties props = StringUtils.argsToProperties(args);
String language = props.getProperty("language", defaultLang);
ConverterOptions opts = ConverterOptions.getConverterOptions(language);
MemoryTreebank tb = new MemoryTreebank(opts.treeNormalizer);
Iterable<Tree> trees = tb;
String encoding = props.getProperty("encoding", "utf-8");
try {
System.setOut(new PrintStream(System.out, true, encoding));
} catch (IOException e) {
throw new RuntimeException(e);
}
String treeFileName = props.getProperty("treeFile");
String sentFileName = props.getProperty("sentFile");
String conllXFileName = props.getProperty("conllxFile");
String altDepPrinterName = props.getProperty("altprinter");
String altDepReaderName = props.getProperty("altreader");
String altDepReaderFilename = props.getProperty("altreaderfile");
String filter = props.getProperty("filter");
boolean makeCopulaHead = props.getProperty("makeCopulaHead") != null;
boolean generateOriginalDependencies = props.getProperty("originalDependencies") != null || opts.stanfordDependencies;
// TODO: if a parser is specified, load this from the parser
// instead of ever loading it this way
String tLPP = props.getProperty("tLPP", opts.tlPPClassName);
TreebankLangParserParams params = ReflectionLoading.loadByReflection(tLPP);
params.setGenerateOriginalDependencies(generateOriginalDependencies);
if (makeCopulaHead) {
// TODO: generalize and allow for more options
String[] options = { "-makeCopulaHead" };
params.setOptionFlag(options, 0);
}
if (sentFileName == null && (altDepReaderName == null || altDepReaderFilename == null) && treeFileName == null && conllXFileName == null && filter == null) {
try {
System.err.printf("Usage: java %s%n", GrammaticalStructure.class.getCanonicalName());
System.err.println("Options:");
System.err.println(" Dependency representation:");
System.err.println(" -basic:\t\tGenerate basic dependencies.");
System.err.println(" -enhanced:\t\tGenerate enhanced dependencies, currently only implemented for English UD.");
System.err.println(" -enhanced++:\tGenerate enhanced++ dependencies (default), currently only implemented for English UD.");
System.err.println(" -collapsed:\t\tGenerate collapsed dependencies, deprecated.");
System.err.println(" -CCprocessed:\tGenerate CC-processed dependencies, deprecated.");
System.err.println(" -collapsedTree:\tGenerate collapsed-tree dependencies, deprecated.");
System.err.println("");
System.err.println(" Input:");
System.err.println(" -treeFile <file>:\tConvert from constituency trees in <file>");
System.err.println(" -sentFile <file>:\tParse and convert sentences from <file>. Only implemented for English.");
System.err.println("");
System.err.println(" Output:");
System.err.println(" -conllx:\t\tOutput dependencies in CoNLL format.");
System.err.println("");
System.err.println(" Language:");
System.err.println(" -language [en|zh|en-sd|zh-sd]:\t (Universal English Dependencies, Universal Chinese Dependencies, English Stanford Dependencies, Chinese Stanford Dependencies)");
System.err.println("");
System.err.println("");
System.err.println("");
System.err.println("Example:");
TreeReader tr = new PennTreeReader(new StringReader("((S (NP (NNP Sam)) (VP (VBD died) (NP-TMP (NN today)))))"));
tb.add(tr.readTree());
} catch (Exception e) {
log.info("Horrible error: " + e);
e.printStackTrace();
}
} else if (altDepReaderName != null && altDepReaderFilename != null) {
DependencyReader altDepReader = loadAlternateDependencyReader(altDepReaderName);
try {
gsBank = altDepReader.readDependencies(altDepReaderFilename);
} catch (IOException e) {
log.info("Error reading " + altDepReaderFilename);
return;
}
} else if (treeFileName != null) {
tb.loadPath(treeFileName);
} else if (filter != null) {
tb.load(IOUtils.readerFromStdin());
} else if (conllXFileName != null) {
try {
gsBank = params.readGrammaticalStructureFromFile(conllXFileName);
} catch (RuntimeIOException e) {
log.info("Error reading " + conllXFileName);
return;
}
} else {
String parserFile = props.getProperty("parserFile");
String parserOpts = props.getProperty("parserOpts");
boolean tokenized = props.getProperty("tokenized") != null;
Function<List<? extends HasWord>, Tree> lp = loadParser(parserFile, parserOpts, makeCopulaHead);
trees = new LazyLoadTreesByParsing(sentFileName, encoding, tokenized, lp);
// Instead of getting this directly from the LP, use reflection
// so that a package which uses GrammaticalStructure doesn't
// necessarily have to use LexicalizedParser
try {
Method method = lp.getClass().getMethod("getTLPParams");
params = (TreebankLangParserParams) method.invoke(lp);
params.setGenerateOriginalDependencies(generateOriginalDependencies);
} catch (Exception cnfe) {
throw new RuntimeException(cnfe);
}
}
// treats the output according to the options passed
boolean basic = props.getProperty("basic") != null;
boolean collapsed = props.getProperty("collapsed") != null;
boolean CCprocessed = props.getProperty("CCprocessed") != null;
boolean collapsedTree = props.getProperty("collapsedTree") != null;
boolean nonCollapsed = props.getProperty("nonCollapsed") != null;
boolean extraSep = props.getProperty("extraSep") != null;
boolean parseTree = props.getProperty("parseTree") != null;
boolean test = props.getProperty("test") != null;
boolean keepPunct = true; //always keep punctuation marks
boolean conllx = props.getProperty("conllx") != null;
// todo: Support checkConnected on more options (including basic)
boolean checkConnected = props.getProperty("checkConnected") != null;
boolean portray = props.getProperty("portray") != null;
boolean enhanced = props.getProperty("enhanced") != null;
boolean enhancedPlusPlus = props.getProperty("enhanced++") != null;
// If requested load alternative printer
DependencyPrinter altDepPrinter = null;
if (altDepPrinterName != null) {
altDepPrinter = loadAlternateDependencyPrinter(altDepPrinterName);
}
// log.info("First tree in tb is");
// log.info(((MemoryTreebank) tb).get(0));
Method m = null;
if (test) {
// see if we can use SemanticGraph(Factory) to check for being a DAG
// Do this by reflection to avoid this becoming a dependency when we distribute the parser
try {
Class sgf = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphFactory");
m = sgf.getDeclaredMethod("makeFromTree", GrammaticalStructure.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, Predicate.class, String.class, int.class);
} catch (Exception e) {
log.info("Test cannot check for cycles in tree format (classes not available)");
}
}
if (gsBank == null) {
gsBank = new TreeBankGrammaticalStructureWrapper(trees, keepPunct, params);
}
for (GrammaticalStructure gs : gsBank) {
Tree tree;
if (gsBank instanceof TreeBankGrammaticalStructureWrapper) {
// log.info("Using TreeBankGrammaticalStructureWrapper branch");
tree = ((TreeBankGrammaticalStructureWrapper) gsBank).getOriginalTree(gs);
// log.info("Tree is: ");
// log.info(t);
} else {
// log.info("Using gs.root() branch");
tree = gs.root(); // recover tree
// log.info("Tree from gs is");
// log.info(t);
}
if (test) { // print the grammatical structure, the basic, collapsed and CCprocessed
System.out.println("============= parse tree =======================");
tree.pennPrint();
System.out.println();
System.out.println("------------- GrammaticalStructure -------------");
System.out.println(gs);
boolean allConnected = true;
boolean connected;
Collection<TypedDependency> bungRoots = null;
System.out.println("------------- basic dependencies ---------------");
List<TypedDependency> gsb = gs.typedDependencies(GrammaticalStructure.Extras.NONE);
System.out.println(StringUtils.join(gsb, "\n"));
connected = GrammaticalStructure.isConnected(gsb);
if ( ! connected && bungRoots == null) {
bungRoots = GrammaticalStructure.getRoots(gsb);
}
allConnected = connected && allConnected;
System.out.println("------------- non-collapsed dependencies (basic + extra) ---------------");
List<TypedDependency> gse = gs.typedDependencies(GrammaticalStructure.Extras.MAXIMAL);
System.out.println(StringUtils.join(gse, "\n"));
connected = GrammaticalStructure.isConnected(gse);
if ( ! connected && bungRoots == null) {
bungRoots = GrammaticalStructure.getRoots(gse);
}
allConnected = connected && allConnected;
System.out.println("------------- collapsed dependencies -----------");
System.out.println(StringUtils.join(gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), "\n"));
System.out.println("------------- collapsed dependencies tree -----------");
System.out.println(StringUtils.join(gs.typedDependenciesCollapsedTree(), "\n"));
System.out.println("------------- CCprocessed dependencies --------");
List<TypedDependency> gscc = gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL);
System.out.println(StringUtils.join(gscc, "\n"));
System.out.println("-----------------------------------------------");
// connectivity tests
connected = GrammaticalStructure.isConnected(gscc);
if ( ! connected && bungRoots == null) {
bungRoots = GrammaticalStructure.getRoots(gscc);
}
allConnected = connected && allConnected;
if (allConnected) {
System.out.println("dependencies form connected graphs.");
} else {
System.out.println("dependency graph NOT connected! possible offending nodes: " + bungRoots);
}
// test for collapsed dependencies being a tree:
// make sure at least it doesn't contain cycles (i.e., is a DAG)
// Do this by reflection so parser doesn't need SemanticGraph and its
// libraries
if (m != null) {
try {
// the first arg is null because it's a static method....
Object semGraph = m.invoke(null, gs, false, true, false, false, false, false, null, null, 0);
Class sg = Class.forName("edu.stanford.nlp.semgraph.SemanticGraph");
Method mDag = sg.getDeclaredMethod("isDag");
boolean isDag = (Boolean) mDag.invoke(semGraph);
System.out.println("tree dependencies form a DAG: " + isDag);
} catch (Exception e) {
e.printStackTrace();
}
}
}// end of "test" output
else {
if (parseTree) {
System.out.println("============= parse tree =======================");
tree.pennPrint();
System.out.println();
}
if (basic) {
if (collapsed || CCprocessed || collapsedTree || nonCollapsed || enhanced || enhancedPlusPlus) {
System.out.println("------------- basic dependencies ---------------");
}
if (altDepPrinter == null) {
printDependencies(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree, conllx, false, opts.convertToUPOS);
} else {
System.out.println(altDepPrinter.dependenciesToString(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree));
}
}
if (nonCollapsed) {
if (basic || CCprocessed || collapsed || collapsedTree) {
System.out.println("----------- non-collapsed dependencies (basic + extra) -----------");
}
printDependencies(gs, gs.allTypedDependencies(), tree, conllx, extraSep, opts.convertToUPOS);
}
if (collapsed) {
if (basic || CCprocessed || collapsedTree || nonCollapsed) {
System.out.println("----------- collapsed dependencies -----------");
}
printDependencies(gs, gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
}
if (CCprocessed) {
if (basic || collapsed || collapsedTree || nonCollapsed) {
System.out.println("---------- CCprocessed dependencies ----------");
}
List<TypedDependency> deps = gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL);
if (checkConnected) {
if (!GrammaticalStructure.isConnected(deps)) {
log.info("Graph is not connected for:");
log.info(tree);
log.info("possible offending nodes: " + GrammaticalStructure.getRoots(deps));
}
}
printDependencies(gs, deps, tree, conllx, false, opts.convertToUPOS);
}
if (collapsedTree) {
if (basic || CCprocessed || collapsed || nonCollapsed) {
System.out.println("----------- collapsed dependencies tree -----------");
}
printDependencies(gs, gs.typedDependenciesCollapsedTree(), tree, conllx, false, opts.convertToUPOS);
}
if (enhanced) {
if (basic || enhancedPlusPlus) {
System.out.println("----------- enhanced dependencies tree -----------");
}
printDependencies(gs, gs.typedDependenciesEnhanced(), tree, conllx, false, opts.convertToUPOS);
}
if (enhancedPlusPlus) {
if (basic || enhanced) {
System.out.println("----------- enhanced++ dependencies tree -----------");
}
printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
}
// default use: enhanced++ for UD, CCprocessed for SD (to parallel what happens within the parser)
if (!basic && !collapsed && !CCprocessed && !collapsedTree && !nonCollapsed && !enhanced && !enhancedPlusPlus) {
// System.out.println("----------- CCprocessed dependencies -----------");
if (generateOriginalDependencies) {
printDependencies(gs, gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
} else {
printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
}
}
}
if (portray) {
try {
// put up a window showing it
Class sgu = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphUtils");
Method mRender = sgu.getDeclaredMethod("render", GrammaticalStructure.class, String.class);
// the first arg is null because it's a static method....
mRender.invoke(null, gs, "Collapsed, CC processed deps");
} catch (Exception e) {
throw new RuntimeException("Couldn't use swing to portray semantic graph", e);
}
}
} // end for
} // end convertTrees
// todo [cdm 2013]: Take this out and make it a trees class: TreeIterableByParsing
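/**
* Lazily produces trees by reading one sentence per line from a file and parsing each
* line on demand with the supplied parsing function; an empty line yields an empty
* {@link SimpleTree}.
*/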
static class LazyLoadTreesByParsing implements Iterable<Tree> {
final Reader reader;
final String filename;
final boolean tokenized;
final String encoding;
final Function<List<? extends HasWord>, Tree> lp;
public LazyLoadTreesByParsing(String filename, String encoding, boolean tokenized, Function<List<? extends HasWord>, Tree> lp) {
this.filename = filename;
this.encoding = encoding;
this.reader = null;
this.tokenized = tokenized;
this.lp = lp;
}
@Override
public Iterator<Tree> iterator() {
final BufferedReader iReader;
if (reader != null) {
iReader = new BufferedReader(reader);
} else {
try {
iReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return new Iterator<Tree>() {
String line; // = null;
@Override
public boolean hasNext() {
if (line != null) {
return true;
} else {
try {
line = iReader.readLine();
} catch (IOException e) {
throw new RuntimeException(e);
}
if (line == null) {
try {
if (reader == null) iReader.close();
} catch (Exception e) {
throw new RuntimeException(e);
}
return false;
}
return true;
}
}
@Override
public Tree next() {
if (line == null) {
throw new NoSuchElementException();
}
Reader lineReader = new StringReader(line);
line = null;
List<Word> words;
if (tokenized) {
words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize();
} else {
words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize();
}
if (!words.isEmpty()) {
// the parser throws an exception if told to parse an empty sentence.
Tree parseTree = lp.apply(words);
return parseTree;
} else {
return new SimpleTree();
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
} // end static class LazyLoadTreesByParsing
}