All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.trees.TreePrint Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.trees.international.pennchinese.ChineseEnglishWordMap;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.XMLUtils;

import java.io.*;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;


/**
 * A class for customizing the print method(s) for a
 * edu.stanford.nlp.trees.Tree as the output of the
 * parser.  This class supports printing in multiple ways and altering
 * behavior via properties specified at construction.
 *
 * @author Roger Levy
 * @author Christopher Manning
 * @author Galen Andrew
 */
public class TreePrint {

  // TODO: Add support for makeCopulaHead as an outputFormatOption here.

  public static final String rootLabelOnlyFormat = "rootSymbolOnly";
  public static final String headMark = "=H";

  /** The legal output tree formats. */
  public static final String[] outputTreeFormats = {
    "penn",
    "oneline",
    rootLabelOnlyFormat,
    "words",
    "wordsAndTags",
    "dependencies",
    "typedDependencies",
    "typedDependenciesCollapsed",
    "latexTree",
    "xmlTree",
    "collocations",
    "semanticGraph",
    "conllStyleDependencies",
    "conll2007"
  };

  private final Properties formats;
  private final Properties options;

  private final boolean markHeadNodes; // = false;
  private final boolean lexicalize; // = false;
  private final boolean removeEmpty;
  private final boolean ptb2text;
  private final boolean transChinese; // = false;
  private final boolean basicDependencies;
  private final boolean collapsedDependencies;
  private final boolean nonCollapsedDependencies;
  private final boolean nonCollapsedDependenciesSeparated;
  private final boolean CCPropagatedDependencies;
  private final boolean treeDependencies;

  private final boolean includeTags;

  private final HeadFinder hf;
  private final TreebankLanguagePack tlp;
  private final WordStemmer stemmer;
  private final Predicate> dependencyFilter;
  private final Predicate> dependencyWordFilter;
  private final GrammaticalStructureFactory gsf;

  /** Pool use of one WordNetConnection.  I don't really know if
   *  Dan Bikel's WordNet code is thread safe, but it definitely doesn't
   *  close its files, and too much of our code makes TreePrint objects and
   *  then drops them on the floor, and so we run out of file handles.
   *  That is, if this variable isn't static, code crashes.
   *  Maybe we should change this code to use jwnl(x)?
   *  CDM July 2006.
   */
  private static WordNetConnection wnc;


  /** This PrintWriter is used iff the user doesn't pass one in to a
   *  call to printTree().  It prints to System.out.
   */
  private final PrintWriter pw = new PrintWriter(System.out, true);


  /** Construct a new TreePrint that will print the given formats.
   *  Warning! This is the anglocentric constructor.
   *  It will work correctly only for English.
   *
   *  @param formats The formats to print the tree in.
   */
  public TreePrint(String formats) {
    this(formats, "", new PennTreebankLanguagePack());
  }

  /** Make a TreePrint instance with no options specified. */
  public TreePrint(String formats, TreebankLanguagePack tlp) {
    this(formats, "", tlp);
  }

  /** Make a TreePrint instance. This one uses the default tlp headFinder. */
  public TreePrint(String formats, String options, TreebankLanguagePack tlp) {
    this(formats, options, tlp, tlp.headFinder(), tlp.typedDependencyHeadFinder());
  }

  /**
   * Make a TreePrint instance.
   *
   * @param formatString A comma separated list of ways to print each Tree.
   *                For instance, "penn" or "words,typedDependencies".
   *                Known formats are: oneline, penn, latexTree, xmlTree, words,
   *                wordsAndTags, rootSymbolOnly, dependencies,
   *                typedDependencies, typedDependenciesCollapsed,
   *                collocations, semanticGraph, conllStyleDependencies,
   *                conll2007.  The last two are both tab-separated values
   *                formats.  The latter has a lot more columns filled with
   *                underscores. All of them print a blank line after
   *                the output except for oneline.  oneline is also not
   *                meaningful in XML output (it is ignored: use penn instead).
   *                (Use of typedDependenciesCollapsed is deprecated.  It
   *                works but we recommend instead selecting a type of
   *                dependencies using the optionsString argument.  Note in
   *                particular that typedDependenciesCollapsed does not do
   *                CC propagation, which we generally recommend.)
   * @param optionsString Options that additionally specify how trees are to
   *                be printed (for instance, whether stemming should be done).
   *                Known options are: stem, lexicalize, markHeadNodes,
   *                xml, removeTopBracket, transChinese,
   *                includePunctuationDependencies, basicDependencies, treeDependencies,
   *                CCPropagatedDependencies, collapsedDependencies, nonCollapsedDependencies,
   *                nonCollapsedDependenciesSeparated, includeTags
   *                .
   * @param tlp     The TreebankLanguagePack used to do things like delete
   *                or ignore punctuation in output
   * @param hf      The HeadFinder used in printing output
   */
  public TreePrint(String formatString, String optionsString, TreebankLanguagePack tlp, HeadFinder hf, HeadFinder typedDependencyHF) {
    formats = StringUtils.stringToProperties(formatString);
    options = StringUtils.stringToProperties(optionsString);
    List okOutputs = Arrays.asList(outputTreeFormats);
    for (Object formObj : formats.keySet()) {
      String format = (String) formObj;
      if ( ! okOutputs.contains(format)) {
        throw new RuntimeException("Error: output tree format " + format + " not supported. Known formats are: " + okOutputs);
      }
    }

    this.hf = hf;
    this.tlp = tlp;
    boolean includePunctuationDependencies;
    includePunctuationDependencies = propertyToBoolean(this.options,
                                                       "includePunctuationDependencies");

    boolean generateOriginalDependencies = tlp.generateOriginalDependencies();

    Predicate puncFilter;
    if (includePunctuationDependencies) {
      dependencyFilter = Filters.acceptFilter();
      dependencyWordFilter = Filters.acceptFilter();
      puncFilter = Filters.acceptFilter();
    } else {
      dependencyFilter = new Dependencies.DependentPuncTagRejectFilter(tlp.punctuationTagRejectFilter());
      dependencyWordFilter = new Dependencies.DependentPuncWordRejectFilter(tlp.punctuationWordRejectFilter());
      //Universal dependencies filter punction by tags
      puncFilter = generateOriginalDependencies ? tlp.punctuationWordRejectFilter() : tlp.punctuationTagRejectFilter();
    }
    if (propertyToBoolean(this.options, "stem")) {
      stemmer = new WordStemmer();
    } else {
      stemmer = null;
    }
    if (formats.containsKey("typedDependenciesCollapsed") ||
        formats.containsKey("typedDependencies") ||
        (formats.containsKey("conll2007") && tlp.supportsGrammaticalStructures())) {
      gsf = tlp.grammaticalStructureFactory(puncFilter, typedDependencyHF);
    } else {
      gsf = null;
    }

    lexicalize = propertyToBoolean(this.options, "lexicalize");
    markHeadNodes = propertyToBoolean(this.options, "markHeadNodes");
    transChinese = propertyToBoolean(this.options, "transChinese");
    ptb2text = propertyToBoolean(this.options, "ptb2text");
    removeEmpty = propertyToBoolean(this.options, "noempty") || ptb2text;

    basicDependencies =  propertyToBoolean(this.options, "basicDependencies");
    collapsedDependencies = propertyToBoolean(this.options, "collapsedDependencies");
    nonCollapsedDependencies = propertyToBoolean(this.options, "nonCollapsedDependencies");
    nonCollapsedDependenciesSeparated = propertyToBoolean(this.options, "nonCollapsedDependenciesSeparated");
    treeDependencies = propertyToBoolean(this.options, "treeDependencies");

    includeTags = propertyToBoolean(this.options, "includeTags");

    // if no option format for the dependencies is specified, CCPropagated is the default
    if ( ! basicDependencies && ! collapsedDependencies && ! nonCollapsedDependencies && ! nonCollapsedDependenciesSeparated && ! treeDependencies) {
      CCPropagatedDependencies = true;
    } else {
      CCPropagatedDependencies = propertyToBoolean(this.options, "CCPropagatedDependencies");
    }
  }


  private static boolean propertyToBoolean(Properties prop, String key) {
    return Boolean.parseBoolean(prop.getProperty(key));
  }

  /**
   * Prints the tree to the default PrintWriter.
   * @param t The tree to display
   */
  public void printTree(Tree t) {
    printTree(t, pw);
  }

  /**
   * Prints the tree, with an empty ID.
   * @param t The tree to display
   * @param pw The PrintWriter to print it to
   */
  public void printTree(final Tree t, PrintWriter pw) {
    printTree(t, "", pw);
  }


  /**
   * Prints the tree according to the options specified for this instance.
   * If the tree t is null, then the code prints
   * a line indicating a skipped tree.  Under the XML option this is
   * an s element with the skipped attribute having
   * value true, and, otherwise, it is the token
   * SENTENCE_SKIPPED_OR_UNPARSABLE.
   *
   * @param t The tree to display
   * @param id A name for this sentence
   * @param pw Where to display the tree
   */
  public void printTree(final Tree t, final String id, final PrintWriter pw) {
    final boolean inXml = propertyToBoolean(options, "xml");
    if (t == null) {
      // Parsing didn't succeed.
      if (inXml) {
        pw.print("");
        pw.println();
      } else {
        pw.println("SENTENCE_SKIPPED_OR_UNPARSABLE");
      }
    } else {
      if (inXml) {
        pw.print("");
      }
      printTreeInternal(t, pw, inXml);
      if (inXml) {
        pw.println("");
        pw.println();
      }
    }
  }


  /**
   * Prints the trees according to the options specified for this instance.
   * If the tree t is null, then the code prints
   * a line indicating a skipped tree.  Under the XML option this is
   * an s element with the skipped attribute having
   * value true, and, otherwise, it is the token
   * SENTENCE_SKIPPED_OR_UNPARSABLE.
   *
   * @param trees The list of trees to display
   * @param id A name for this sentence
   * @param pw Where to dislay the tree
   */
  public void printTrees(final List> trees, final String id, final PrintWriter pw) {
    final boolean inXml = propertyToBoolean(options, "xml");
    int ii = 0;  // incremented before used, so first tree is numbered 1
    for (ScoredObject tp : trees) {
      ii++;
      Tree t = tp.object();
      double score = tp.score();

      if (t == null) {
        // Parsing didn't succeed.
        if (inXml) {
          pw.print("");
          pw.println();
        } else {
          pw.println("SENTENCE_SKIPPED_OR_UNPARSABLE Parse #" + ii + " with score " + score);
        }
      } else {
        if (inXml) {
          pw.print("");
        } else {
          pw.print("# Parse ");
          pw.print(ii);
          pw.print(" with score ");
          pw.println(score);
        }
        printTreeInternal(t, pw, inXml);
        if (inXml) {
          pw.println("");
          pw.println();
        }
      }
    }
  }


  /** Print the internal part of a tree having already identified it.
   *  The ID and outer XML element is printed wrapping this method, but none
   *  of the internal content.
   *
   * @param t The tree to print. Now known to be non-null
   * @param pw Where to print it to
   * @param inXml Whether to use XML style printing
   */
  private void printTreeInternal(final Tree t, final PrintWriter pw, final boolean inXml) {
    Tree outputTree = t;

    if (formats.containsKey("conll2007") || removeEmpty) {
      outputTree = outputTree.prune(new BobChrisTreeNormalizer.EmptyFilter());
    }

    if (formats.containsKey("words")) {
      if (inXml) {
        ArrayList




© 2015 - 2024 Weber Informatics LLC | Privacy Policy