All downloads are free. The search and download functionalities use the official Maven repository.

edu.stanford.nlp.trees.GrammaticalStructureConversionUtils Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.trees.international.pennchinese.CTBErrorCorrectingTreeNormalizer;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;

/**
 * Contains several utility methods to convert constituency trees to
 * dependency trees.
 *
 * Used by {@link GrammaticalStructure#main(String[])}
 */

public class GrammaticalStructureConversionUtils {


  public static final String DEFAULT_PARSER_FILE = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

  /**
   * Prints typed dependencies to standard output, either in the Stanford
   * dependency representation or in CoNLL-X format.
   *
   * @param gs Grammatical structure the dependencies were derived from
   * @param deps Typed dependencies to print
   * @param tree Tree corresponding to typed dependencies (only necessary if conllx
   *          == true)
   * @param conllx If true use conllx format, otherwise use Stanford representation
   * @param extraSep If true, in the Stanford representation, the extra dependencies
   *          (which do not preserve the tree structure) are printed after the
   *          basic dependencies
   * @param convertToUPOS If true convert the POS tags to universal POS tags and output
   *                      them along the original POS tags.
   */
  public static void printDependencies(GrammaticalStructure gs, Collection deps, Tree tree,
                                       boolean conllx, boolean extraSep, boolean convertToUPOS) {
    // All formatting is delegated to dependenciesToString; this method only emits the result.
    String formatted = dependenciesToString(gs, deps, tree, conllx, extraSep, convertToUPOS);
    System.out.println(formatted);
  }


  /**
   * Converts the basic typed dependencies of a grammatical structure into a
   * CoNLL-X formatted string for the given sentence.
   *
   * (see {@link #dependenciesToCoNLLXString(Collection, CoreMap)})
   *
   * @param gs Grammatical structure supplying the basic typed dependencies
   * @param sentence The corresponding CoreMap for the sentence
   * @return Dependency tree in CoNLL-X format
   */
  public static String dependenciesToCoNLLXString(GrammaticalStructure gs, CoreMap sentence) {
    Collection basicDeps = gs.typedDependencies();
    return dependenciesToCoNLLXString(basicDeps, sentence);
  }


  /**
   *
   * Returns a dependency tree in CoNNL-X format.
   * It requires a CoreMap for the sentence with a TokensAnnotation.
   * Each token has to contain a word and a POS tag.
   *
   * @param deps The list of TypedDependency relations.
   * @param sentence The corresponding CoreMap for the sentence.
   * @return Dependency tree in CoNLL-X format.
   * @throws RuntimeException if the sentence has no TokensAnnotation.
   */
  public static String dependenciesToCoNLLXString(Collection<TypedDependency> deps, CoreMap sentence) {
    StringBuilder bf = new StringBuilder();

    // Index the dependencies by their dependent token's index so that each
    // token's governor and relation can be looked up in O(1) below.
    // (Generic type parameters restored: the scraped raw types would not compile
    // with the enhanced-for loops in this method.)
    Map<Integer, TypedDependency> indexedDeps = new HashMap<>(deps.size());
    for (TypedDependency dep : deps) {
      indexedDeps.put(dep.dep().index(), dep);
    }

    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens == null) {
      throw new RuntimeException("dependenciesToCoNLLXString: CoreMap does not have required TokensAnnotation.");
    }
    int idx = 1;  // CoNLL-X token ids are 1-based

    for (CoreLabel token : tokens) {
      String word = token.value();
      String pos = token.tag();
      // Coarse POS column falls back to the fine-grained tag when absent.
      String cPos = (token.get(CoreAnnotations.CoarseTagAnnotation.class) != null) ?
          token.get(CoreAnnotations.CoarseTagAnnotation.class) : pos;
      String lemma = token.lemma() != null ? token.lemma() : "_";
      // Single map lookup instead of repeated containsKey()/get() pairs.
      TypedDependency dep = indexedDeps.get(idx);
      // Tokens with no incoming dependency are attached to the root (0) with
      // the pseudo-relation "erased".
      int gov = (dep != null) ? dep.gov().index() : 0;
      String reln = (dep != null) ? dep.reln().toString() : "erased";
      String out = String.format("%d\t%s\t%s\t%s\t%s\t_\t%d\t%s\t_\t_\n", idx, word, lemma, cPos, pos, gov, reln);
      bf.append(out);
      idx++;
    }
    return bf.toString();
  }

  public static String dependenciesToString(GrammaticalStructure gs, Collection deps, Tree tree,
                                            boolean conllx, boolean extraSep, boolean convertToUPOS) {
    StringBuilder bf = new StringBuilder();

    Map indexToPos = Generics.newHashMap();
    indexToPos.put(0,0); // to deal with the special node "ROOT"
    List gsLeaves = gs.root.getLeaves();
    for (int i = 0; i < gsLeaves.size(); i++) {
      TreeGraphNode leaf = (TreeGraphNode) gsLeaves.get(i);
      indexToPos.put(leaf.label.index(), i + 1);
    }

    if (conllx) {

      List leaves = tree.getLeaves();
      List

/*
 * Usage:
 * java edu.stanford.nlp.trees.GrammaticalStructure [-treeFile FILE | -sentFile FILE | -conllxFile FILE | -filter]
 * [-collapsed -basic -CCprocessed -test -generateOriginalDependencies]
 */
  /**
   * Given sentences or trees, generates the typed dependencies and prints
   * them according to the command-line options.
   *
   * Usage:
   * java edu.stanford.nlp.trees.GrammaticalStructure [-treeFile FILE | -sentFile FILE | -conllxFile FILE | -filter]
   * [-collapsed -basic -CCprocessed -test -generateOriginalDependencies]
   *
   * @param args Command-line arguments, as above
   * @param defaultLang Language assumed when no -language option is supplied
   */
  @SuppressWarnings("unchecked")
  public static void convertTrees(String[] args, String defaultLang) {
    // NOTE(review): the generic type parameters in this method (raw Iterable,
    // List, Function, ...) appear to have been stripped by the HTML extraction;
    // restore them from the upstream CoreNLP sources before compiling.
    // NOTE(review): 'log' is referenced below but its declaration is not visible
    // in this extract — presumably a Redwood channel declared elsewhere in the
    // class; confirm against the original file.
    /* Use a tree normalizer that removes all empty nodes.
       This prevents wrong indexing of the nodes in the dependency relations. */
    Iterable gsBank = null;
    Properties props = StringUtils.argsToProperties(args);
    String language = props.getProperty("language", defaultLang);
    ConverterOptions opts = ConverterOptions.getConverterOptions(language);
    MemoryTreebank tb = new MemoryTreebank(opts.treeNormalizer);
    Iterable trees = tb;

    // Make stdout honor the requested encoding so non-ASCII output survives.
    String encoding = props.getProperty("encoding", "utf-8");
    try {
      System.setOut(new PrintStream(System.out, true, encoding));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    String treeFileName = props.getProperty("treeFile");
    String sentFileName = props.getProperty("sentFile");
    String conllXFileName = props.getProperty("conllxFile");
    String altDepPrinterName = props.getProperty("altprinter");
    String altDepReaderName = props.getProperty("altreader");
    String altDepReaderFilename = props.getProperty("altreaderfile");
    String filter = props.getProperty("filter");

    boolean makeCopulaHead = props.getProperty("makeCopulaHead") != null;
    boolean generateOriginalDependencies = props.getProperty("originalDependencies") != null || opts.stanfordDependencies;

    // TODO: if a parser is specified, load this from the parser
    // instead of ever loading it from this way
    String tLPP = props.getProperty("tLPP", opts.tlPPClassName);
    TreebankLangParserParams params = ReflectionLoading.loadByReflection(tLPP);
    params.setGenerateOriginalDependencies(generateOriginalDependencies);

    if (makeCopulaHead) {
      // TODO: generalize and allow for more options
      String[] options = { "-makeCopulaHead" };
      params.setOptionFlag(options, 0);
    }

    // No input source specified at all: print usage and run on a demo tree.
    if (sentFileName == null && (altDepReaderName == null || altDepReaderFilename == null) && treeFileName == null && conllXFileName == null && filter == null) {
      try {
        System.err.printf("Usage: java %s%n", GrammaticalStructure.class.getCanonicalName());
        System.err.println("Options:");
        System.err.println(" Dependency representation:");
        System.err.println(" -basic:\t\tGenerate basic dependencies.");
        System.err.println(" -enhanced:\t\tGenerate enhanced dependencies, currently only implemented for English UD.");
        System.err.println(" -enhanced++:\tGenerate enhanced++ dependencies (default), currently only implemented for English UD.");
        System.err.println(" -collapsed:\t\tGenerate collapsed dependencies, deprecated.");
        System.err.println(" -CCprocessed:\tGenerate CC-processed dependencies, deprecated.");
        System.err.println(" -collapsedTree:\tGenerate collapsed-tree dependencies, deprecated.");
        System.err.println("");
        System.err.println(" Input:");
        System.err.println(" -treeFile :\tConvert from constituency trees in ");
        System.err.println(" -sentFile :\tParse and convert sentences from . Only implemented for English.");
        System.err.println("");
        System.err.println(" Output:");
        System.err.println(" -conllx:\t\tOutput dependencies in CoNLL format.");
        System.err.println("");
        System.err.println(" Language:");
        System.err.println(" -language [en|zh|en-sd|zh-sd]:\t (Universal English Dependencies, Universal Chinese Dependencies, English Stanford Dependencies, Chinese Stanford Dependencies)");
        System.err.println("");
        System.err.println("");
        System.err.println("");
        System.err.println("Example:");
        TreeReader tr = new PennTreeReader(new StringReader("((S (NP (NNP Sam)) (VP (VBD died) (NP-TMP (NN today)))))"));
        tb.add(tr.readTree());
      } catch (Exception e) {
        log.info("Horrible error: " + e);
        e.printStackTrace();
      }
    } else if (altDepReaderName != null && altDepReaderFilename != null) {
      // Read dependencies with a user-supplied alternate reader implementation.
      DependencyReader altDepReader = loadAlternateDependencyReader(altDepReaderName);
      try {
        gsBank = altDepReader.readDependencies(altDepReaderFilename);
      } catch (IOException e) {
        log.info("Error reading " + altDepReaderFilename);
        return;
      }
    } else if (treeFileName != null) {
      tb.loadPath(treeFileName);
    } else if (filter != null) {
      // Filter mode: trees arrive on stdin.
      tb.load(IOUtils.readerFromStdin());
    } else if (conllXFileName != null) {
      try {
        gsBank = params.readGrammaticalStructureFromFile(conllXFileName);
      } catch (RuntimeIOException e) {
        log.info("Error reading " + conllXFileName);
        return;
      }
    } else {
      // Sentence input: parse lazily with a loaded parser.
      String parserFile = props.getProperty("parserFile");
      String parserOpts = props.getProperty("parserOpts");
      boolean tokenized = props.getProperty("tokenized") != null;
      Function, Tree> lp = loadParser(parserFile, parserOpts, makeCopulaHead);
      trees = new LazyLoadTreesByParsing(sentFileName, encoding, tokenized, lp);

      // Instead of getting this directly from the LP, use reflection
      // so that a package which uses GrammaticalStructure doesn't
      // necessarily have to use LexicalizedParser
      try {
        Method method = lp.getClass().getMethod("getTLPParams");
        params = (TreebankLangParserParams) method.invoke(lp);
        params.setGenerateOriginalDependencies(generateOriginalDependencies);
      } catch (Exception cnfe) {
        throw new RuntimeException(cnfe);
      }
    }

    // treats the output according to the options passed
    boolean basic = props.getProperty("basic") != null;
    boolean collapsed = props.getProperty("collapsed") != null;
    boolean CCprocessed = props.getProperty("CCprocessed") != null;
    boolean collapsedTree = props.getProperty("collapsedTree") != null;
    boolean nonCollapsed = props.getProperty("nonCollapsed") != null;
    boolean extraSep = props.getProperty("extraSep") != null;
    boolean parseTree = props.getProperty("parseTree") != null;
    boolean test = props.getProperty("test") != null;
    boolean keepPunct = true; //always keep punctuation marks
    boolean conllx = props.getProperty("conllx") != null;
    // todo: Support checkConnected on more options (including basic)
    boolean checkConnected = props.getProperty("checkConnected") != null;
    boolean portray = props.getProperty("portray") != null;
    boolean enhanced = props.getProperty("enhanced") != null;
    boolean enhancedPlusPlus = props.getProperty("enhanced++") != null;

    // If requested load alternative printer
    DependencyPrinter altDepPrinter = null;
    if (altDepPrinterName != null) {
      altDepPrinter = loadAlternateDependencyPrinter(altDepPrinterName);
    }

    // log.info("First tree in tb is");
    // log.info(((MemoryTreebank) tb).get(0));

    Method m = null;
    if (test) {
      // see if we can use SemanticGraph(Factory) to check for being a DAG
      // Do this by reflection to avoid this becoming a dependency when we distribute the parser
      try {
        Class sgf = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphFactory");
        m = sgf.getDeclaredMethod("makeFromTree", GrammaticalStructure.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, boolean.class, Predicate.class, String.class, int.class);
      } catch (Exception e) {
        log.info("Test cannot check for cycles in tree format (classes not available)");
      }
    }

    if (gsBank == null) {
      gsBank = new TreeBankGrammaticalStructureWrapper(trees, keepPunct, params);
    }

    for (GrammaticalStructure gs : gsBank) {

      Tree tree;
      if (gsBank instanceof TreeBankGrammaticalStructureWrapper) {
        // log.info("Using TreeBankGrammaticalStructureWrapper branch");
        tree = ((TreeBankGrammaticalStructureWrapper) gsBank).getOriginalTree(gs);
        // log.info("Tree is: ");
        // log.info(t);
      } else {
        // log.info("Using gs.root() branch");
        tree = gs.root(); // recover tree
        // log.info("Tree from gs is");
        // log.info(t);
      }

      if (test) {
        // print the grammatical structure, the basic, collapsed and CCprocessed
        System.out.println("============= parse tree =======================");
        tree.pennPrint();
        System.out.println();

        System.out.println("------------- GrammaticalStructure -------------");
        System.out.println(gs);

        boolean allConnected = true;
        boolean connected;
        Collection bungRoots = null;

        System.out.println("------------- basic dependencies ---------------");
        List gsb = gs.typedDependencies(GrammaticalStructure.Extras.NONE);
        System.out.println(StringUtils.join(gsb, "\n"));
        connected = GrammaticalStructure.isConnected(gsb);
        if ( ! connected && bungRoots == null) {
          bungRoots = GrammaticalStructure.getRoots(gsb);
        }
        allConnected = connected && allConnected;

        System.out.println("------------- non-collapsed dependencies (basic + extra) ---------------");
        List gse = gs.typedDependencies(GrammaticalStructure.Extras.MAXIMAL);
        System.out.println(StringUtils.join(gse, "\n"));
        connected = GrammaticalStructure.isConnected(gse);
        if ( ! connected && bungRoots == null) {
          bungRoots = GrammaticalStructure.getRoots(gse);
        }
        allConnected = connected && allConnected;

        System.out.println("------------- collapsed dependencies -----------");
        System.out.println(StringUtils.join(gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), "\n"));

        System.out.println("------------- collapsed dependencies tree -----------");
        System.out.println(StringUtils.join(gs.typedDependenciesCollapsedTree(), "\n"));

        System.out.println("------------- CCprocessed dependencies --------");
        List gscc = gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL);
        System.out.println(StringUtils.join(gscc, "\n"));

        System.out.println("-----------------------------------------------");
        // connectivity tests
        connected = GrammaticalStructure.isConnected(gscc);
        if ( ! connected && bungRoots == null) {
          bungRoots = GrammaticalStructure.getRoots(gscc);
        }
        allConnected = connected && allConnected;
        if (allConnected) {
          System.out.println("dependencies form connected graphs.");
        } else {
          System.out.println("dependency graph NOT connected! possible offending nodes: " + bungRoots);
        }

        // test for collapsed dependencies being a tree:
        // make sure at least it doesn't contain cycles (i.e., is a DAG)
        // Do this by reflection so parser doesn't need SemanticGraph and its
        // libraries
        if (m != null) {
          try {
            // the first arg is null because it's a static method....
            Object semGraph = m.invoke(null, gs, false, true, false, false, false, false, null, null, 0);
            Class sg = Class.forName("edu.stanford.nlp.semgraph.SemanticGraph");
            Method mDag = sg.getDeclaredMethod("isDag");
            boolean isDag = (Boolean) mDag.invoke(semGraph);
            System.out.println("tree dependencies form a DAG: " + isDag);
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      } // end of "test" output
      else {
        if (parseTree) {
          System.out.println("============= parse tree =======================");
          tree.pennPrint();
          System.out.println();
        }

        if (basic) {
          if (collapsed || CCprocessed || collapsedTree || nonCollapsed || enhanced || enhancedPlusPlus) {
            System.out.println("------------- basic dependencies ---------------");
          }
          if (altDepPrinter == null) {
            printDependencies(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree, conllx, false, opts.convertToUPOS);
          } else {
            System.out.println(altDepPrinter.dependenciesToString(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree));
          }
        }

        if (nonCollapsed) {
          if (basic || CCprocessed || collapsed || collapsedTree) {
            System.out.println("----------- non-collapsed dependencies (basic + extra) -----------");
          }
          printDependencies(gs, gs.allTypedDependencies(), tree, conllx, extraSep, opts.convertToUPOS);
        }

        if (collapsed) {
          if (basic || CCprocessed || collapsedTree || nonCollapsed) {
            System.out.println("----------- collapsed dependencies -----------");
          }
          printDependencies(gs, gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
        }

        if (CCprocessed) {
          if (basic || collapsed || collapsedTree || nonCollapsed) {
            System.out.println("---------- CCprocessed dependencies ----------");
          }
          List deps = gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL);
          if (checkConnected) {
            if (!GrammaticalStructure.isConnected(deps)) {
              log.info("Graph is not connected for:");
              log.info(tree);
              log.info("possible offending nodes: " + GrammaticalStructure.getRoots(deps));
            }
          }
          printDependencies(gs, deps, tree, conllx, false, opts.convertToUPOS);
        }

        if (collapsedTree) {
          if (basic || CCprocessed || collapsed || nonCollapsed) {
            System.out.println("----------- collapsed dependencies tree -----------");
          }
          printDependencies(gs, gs.typedDependenciesCollapsedTree(), tree, conllx, false, opts.convertToUPOS);
        }

        if (enhanced) {
          if (basic || enhancedPlusPlus) {
            System.out.println("----------- enhanced dependencies tree -----------");
          }
          printDependencies(gs, gs.typedDependenciesEnhanced(), tree, conllx, false, opts.convertToUPOS);
        }

        if (enhancedPlusPlus) {
          if (basic || enhanced) {
            System.out.println("----------- enhanced++ dependencies tree -----------");
          }
          printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
        }

        // default use: enhanced++ for UD, CCprocessed for SD (to parallel what happens within the parser)
        if (!basic && !collapsed && !CCprocessed && !collapsedTree && !nonCollapsed && !enhanced && !enhancedPlusPlus) {
          // System.out.println("----------- CCprocessed dependencies -----------");
          if (generateOriginalDependencies) {
            printDependencies(gs, gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
          } else {
            printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
          }
        }
      }

      if (portray) {
        try {
          // put up a window showing it
          Class sgu = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphUtils");
          Method mRender = sgu.getDeclaredMethod("render", GrammaticalStructure.class, String.class);
          // the first arg is null because it's a static method....
          mRender.invoke(null, gs, "Collapsed, CC processed deps");
        } catch (Exception e) {
          throw new RuntimeException("Couldn't use swing to portray semantic graph", e);
        }
      }

    } // end for
  } // end convertTrees

  // todo [cdm 2013]: Take this out and make it a trees class: TreeIterableByParsing
  /**
   * Lazily parses sentences from a file (or a supplied Reader), one tree per
   * line, so the whole corpus never has to be held in memory.
   */
  static class LazyLoadTreesByParsing implements Iterable {

    final Reader reader;      // alternative input source; null when reading from filename
    final String filename;    // sentence file, one sentence per line
    final boolean tokenized;  // if true, split on whitespace instead of PTB-tokenizing
    final String encoding;
    final Function, Tree> lp;  // NOTE(review): generics stripped by extraction; the parser function

    public LazyLoadTreesByParsing(String filename, String encoding, boolean tokenized, Function, Tree> lp) {
      this.filename = filename;
      this.encoding = encoding;
      this.reader = null;
      this.tokenized = tokenized;
      this.lp = lp;
    }

    @Override
    public Iterator iterator() {
      final BufferedReader iReader;
      if (reader != null) {
        iReader = new BufferedReader(reader);
      } else {
        try {
          iReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding));
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }

      return new Iterator() {

        String line; // = null;

        @Override
        public boolean hasNext() {
          if (line != null) {
            return true;
          } else {
            // Pull the next line eagerly so hasNext() is accurate; the line is
            // cached in 'line' until next() consumes it.
            try {
              line = iReader.readLine();
            } catch (IOException e) {
              throw new RuntimeException(e);
            }
            if (line == null) {
              // End of input: close the reader only if we opened it ourselves.
              try {
                if (reader == null) iReader.close();
              } catch (Exception e) {
                throw new RuntimeException(e);
              }
              return false;
            }
            return true;
          }
        }

        @Override
        public Tree next() {
          if (line == null) {
            throw new NoSuchElementException();
          }
          Reader lineReader = new StringReader(line);
          line = null;
          List words;
          if (tokenized) {
            words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize();
          } else {
            words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize();
          }
          if (!words.isEmpty()) {
            // the parser throws an exception if told to parse an empty sentence.
            Tree parseTree = lp.apply(words);
            return parseTree;
          } else {
            return new SimpleTree();
          }
        }

        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }

      };
    }

  } // end static class LazyLoadTreesByParsing

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy