All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.trees.Tree Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!
package edu.stanford.nlp.trees;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasIndex;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.LabelFactory;
import edu.stanford.nlp.ling.LabeledWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;


/**
 * The abstract class {@code Tree} is used to collect all of the
 * tree types, and acts as a generic extensible type.  This is the
 * standard implementation of inheritance-based polymorphism.
 * All {@code Tree} objects support accessors for their children (a
 * {@code Tree[]}), their label (a {@code Label}), and their
 * score (a {@code double}).  However, different concrete
 * implementations may or may not include the latter two, in which
 * case a default value is returned.  The class Tree defines no data
 * fields.  The two abstract methods that must be implemented are:
 * {@code children()}, and {@code treeFactory()}.  Note
 * that {@code setChildren(Tree[])} is now an optional
 * operation, whereas it was previously required to be
 * implemented. There is now support for finding the parent of a
 * tree.  This may be done by search from a tree root, or via a
 * directly stored parent.  The {@code Tree} class now
 * implements the {@code Collection} interface: in terms of
 * this, each node of the tree is an element of the
 * collection; hence one can explore the tree by using the methods of
 * this interface.  A {@code Tree} is regarded as a read-only
 * {@code Collection} (even though the {@code Tree} class
 * has various methods that modify trees).  Moreover, the
 * implementation is not thread-safe: no attempt is made to
 * detect and report concurrent modifications.
 *
 * @author Christopher Manning
 * @author Dan Klein
 * @author Sarah Spikes ([email protected]) - filled in types
 */
public abstract class Tree extends AbstractCollection implements Label, Labeled, Scored, Serializable  {

  /** Redwood logging channels obtained for {@code Tree.class}. */
  private static Redwood.RedwoodChannels log = Redwood.channels(Tree.class);

  /** Serialization version identifier for this {@code Serializable} class. */
  private static final long serialVersionUID = 5441849457648722744L;

  /**
   * A shared zero-length {@code Tree[]}.
   * A leaf node should have a zero-length array for its children, and
   * for efficiency classes can return this array from {@code children()}
   * for leaf nodes if desired.  It can also be used anywhere else an
   * empty {@code Tree} array is wanted.
   */
  public static final Tree[] EMPTY_TREE_ARRAY = new Tree[0];

  /** Creates a tree node.  The constructor body is empty. */
  public Tree() {
  }

  /**
   * Tests whether this node is a leaf, i.e. whether it has no children.
   * Works on any {@code Tree}; implementations must report a leaf by
   * returning a zero-length array from {@code children()}.
   *
   * @return {@code true} if {@link #numChildren()} is zero
   */
  public boolean isLeaf() {
    final int childCount = numChildren();
    return childCount == 0;
  }


  /**
   * Counts the direct children of this node in its local tree.
   * Works on any {@code Tree}; a leaf reports zero.
   *
   * @return The length of the array returned by {@code children()}
   */
  public int numChildren() {
    Tree[] daughters = children();
    return daughters.length;
  }


  /**
   * Tests whether this node has exactly one child.
   * Works on any {@code Tree}.
   *
   * @return {@code true} if the node heads a unary rewrite
   */
  public boolean isUnaryRewrite() {
    final int childCount = numChildren();
    return childCount == 1;
  }


  /**
   * Tests whether this node is a preterminal: a node with exactly one
   * child, where that child is a leaf.
   *
   * @return {@code true} if the node is a preterminal; {@code false} otherwise
   */
  public boolean isPreTerminal() {
    Tree[] daughters = children();
    if (daughters.length != 1) {
      return false;
    }
    return daughters[0].isLeaf();
  }


  /**
   * Tests whether every child of this node is a preterminal (a node with
   * one child which is itself a leaf).  A node without children is never
   * a prepreterminal.
   *
   * @return {@code true} if the node has at least one child and all of its
   *         children are preterminals; {@code false} otherwise
   */
  public boolean isPrePreTerminal() {
    Tree[] daughters = children();
    final int count = daughters.length;
    for (int i = 0; i < count; i++) {
      if (!daughters[i].isPreTerminal()) {
        return false;
      }
    }
    // An empty child array never entered the loop; it must still yield false.
    return count > 0;
  }


  /**
   * Tests whether this node is phrasal, meaning it is neither a leaf nor a
   * preterminal.  Equivalently: it has two or more children, or a single
   * child that is not a leaf.
   *
   * @return {@code true} if the node is phrasal; {@code false} otherwise
   */
  public boolean isPhrasal() {
    Tree[] daughters = children();
    if (daughters == null || daughters.length == 0) {
      return false;
    }
    // With several children the node is phrasal outright; a lone child
    // makes it phrasal only when that child is not a leaf.
    return daughters.length > 1 || !daughters[0].isLeaf();
  }


  /**
   * Structural equality for trees.  Two {@code Tree} objects are equal when
   * their {@link #value}s are equal (or both {@code null}), they have the
   * same number of children, and corresponding children are equal.
   *
   * @param o The object to compare with
   * @return Whether the two objects are equal
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof Tree)) {
      return false;
    }
    Tree other = (Tree) o;

    String mine = this.value();
    String theirs = other.value();
    boolean sameValue = (mine == null) ? (theirs == null) : mine.equals(theirs);
    if (!sameValue) {
      return false;
    }

    Tree[] myKids = children();
    Tree[] theirKids = other.children();
    if (myKids.length != theirKids.length) {
      return false;
    }
    int idx = 0;
    for (Tree kid : myKids) {
      if (!kid.equals(theirKids[idx])) {
        return false;
      }
      idx++;
    }
    return true;
  }


  /**
   * Hash code consistent with {@link #equals(Object)}.  The hash combines
   * this node's label value with the label values of its direct children,
   * each child's contribution being shifted left by its position.
   *
   * @return The hash code
   */
  @Override
  public int hashCode() {
    String text = this.value();
    int result = (text == null) ? 1 : text.hashCode();
    int position = 0;
    for (Tree kid : children()) {
      String kidText = kid.value();
      // A child with no value contributes its position instead of a string hash.
      int part = (kidText == null) ? position : kidText.hashCode();
      result ^= (part << position);
      position++;
    }
    return result;
  }


  /**
   * Finds the position of a given tree among this node's children, using
   * object identity ({@code ==}) rather than {@code equals()}.  Identity is
   * almost always what is wanted here: the usual question is which child
   * slot a known node occupies, and identity is both faster and immune to
   * confusion between two structurally identical subtrees.
   *
   * @param tree The tree to look for in the children list
   * @return Its index in the children array, or -1 if it is not a child
   */
  public int objectIndexOf(Tree tree) {
    int position = 0;
    for (Tree kid : children()) {
      if (kid == tree) {
        return position;
      }
      position++;
    }
    return -1;
  }


  /**
   * Returns an array of children for the current node.  If there
   * are no children (if the node is a leaf), this must return a
   * {@code Tree[]} of length 0; {@code null} is not a supported return
   * value (it was in the past, but no longer is).
   * A caller may assume that either {@code isLeaf()} returns
   * true, or this node has a nonzero number of children.
   *
   * @return The children of the node; never {@code null}
   * @see #getChildrenAsList()
   */
  public abstract Tree[] children();


  /**
   * Returns a List of children for the current node.  If there are no
   * children, then a (non-null) {@code List} of size 0 will
   * be returned.  The list is a fresh {@code ArrayList} holding references
   * to, not copies of, the children.  That is, the returned list is mutable,
   * and simply adding to or deleting items from it is safe, but beware
   * changing the contents of the children.
   *
   * @return A new mutable list containing the children of the node
   */
  public List<Tree> getChildrenAsList() {
    return new ArrayList<>(Arrays.asList(children()));
  }


  /**
   * Set the children of this node to be the children given in the
   * array.  This is an optional operation; this default implementation
   * always throws {@code UnsupportedOperationException}.  Note for
   * subclasses that if there are no children, the children() method must
   * return a Tree[] array of length 0.  This class provides a
   * {@code EMPTY_TREE_ARRAY} canonical zero-length Tree[] array
   * to represent zero children, but it is not required that
   * leaf nodes use this particular zero-length array to represent
   * a leaf node.
   *
   * @param children The array of children, each a {@code Tree}
   * @throws UnsupportedOperationException always, unless overridden
   * @see #setChildren(List)
   */
  public void setChildren(Tree[] children) {
    throw new UnsupportedOperationException();
  }


  /**
   * Set the children of this tree node to the given list.  This
   * method is implemented in the {@code Tree} class by
   * converting the {@code List} into a tree array and calling
   * {@link #setChildren(Tree[])}.  Subclasses which use a
   * {@code List}-based representation of tree children should
   * override this method.  A {@code null} or empty list is allowed:
   * it yields a node with no children (the canonical
   * {@code EMPTY_TREE_ARRAY} is passed to the array-based method).
   *
   * @param childTreesList A list of trees to become children of the node.
   *          This method does not retain the List that you pass it (copying
   *          is done), but it will retain the individual children (they are
   *          not copied).
   * @see #setChildren(Tree[])
   */
  public void setChildren(List<? extends Tree> childTreesList) {
    if (childTreesList == null || childTreesList.isEmpty()) {
      setChildren(EMPTY_TREE_ARRAY);
    } else {
      // Copy into a fresh array so the caller's list is not retained.
      setChildren(childTreesList.toArray(new Tree[childTreesList.size()]));
    }
  }


  /**
   * Returns the label associated with the current node, or {@code null}
   * if there is no label.  This default implementation always
   * returns {@code null}.
   *
   * @return The label of the node, or {@code null}
   */
  @Override
  public Label label() {
    return null;
  }


  /**
   * Sets the label associated with the current node, if there is one.
   * This default implementation ignores the argument and does nothing.
   *
   * @param label The label
   */
  @Override
  public void setLabel(Label label) {
    // intentionally empty: the default implementation does not store a label
  }


  /**
   * Returns the score associated with the current node, or NaN
   * if there is no score.  This default implementation always returns
   * {@code Double.NaN}.
   *
   * @return The score
   */
  @Override
  public double score() {
    return Double.NaN;
  }


  /**
   * Sets the score associated with the current node, if the tree type
   * supports scores.  This default implementation does not, and always throws.
   *
   * @param score The score
   * @throws UnsupportedOperationException always, unless overridden
   */
  public void setScore(double score) {
    final String complaint = "You must use a tree type that implements scoring in order call setScore()";
    throw new UnsupportedOperationException(complaint);
  }


  /**
   * Returns the leftmost child of this node.
   *
   * @return The first child, or {@code null} if the node has no children
   */
  public Tree firstChild() {
    Tree[] daughters = children();
    return (daughters.length > 0) ? daughters[0] : null;
  }


  /**
   * Returns the rightmost child of this node.
   *
   * @return The last child, or {@code null} if the node has no children
   */
  public Tree lastChild() {
    Tree[] daughters = children();
    final int count = daughters.length;
    return (count > 0) ? daughters[count - 1] : null;
  }

  /**
   * Returns the highest node of the (perhaps trivial) unary chain that
   * this node is part of.  While the current node is the only child of its
   * parent, the search moves up to the parent; it stops at the node whose
   * parent has several children, or at the node that has no parent under
   * {@code root}.
   *
   * @param root The root of the tree that contains this subtree
   * @return The uppermost node of the unary chain, if this node is in a unary
   *         chain, or else the current node
   */
  public Tree upperMostUnary(Tree root) {
    Tree mother = parent(root);
    boolean topOfChain = (mother == null) || (mother.numChildren() > 1);
    return topOfChain ? this : mother.upperMostUnary(root);
  }

  /**
   * Assigns a SpanAnnotation to every node of this tree, numbering the
   * leaves from zero.
   */
  public void setSpans() {
    final int firstLeafIndex = 0;
    constituentsNodes(firstLeafIndex);
  }

  /**
   * Returns the SpanAnnotation stored on this node's label.
   * Use {@code setSpans()} to assign SpanAnnotations to a tree.
   *
   * @return an IntPair holding the SpanAnnotation of this node, or
   *         {@code null} if the label is not a {@code CoreMap} or carries
   *         no such annotation
   */
  public IntPair getSpan() {
    if (!(label() instanceof CoreMap)) {
      return null;
    }
    CoreMap annotations = (CoreMap) label();
    if (!annotations.containsKey(CoreAnnotations.SpanAnnotation.class)) {
      return null;
    }
    return annotations.get(CoreAnnotations.SpanAnnotation.class);
  }

  /**
   * Returns the Constituents generated by the parse tree, built with a
   * {@code SimpleConstituentFactory}.  Constituents are computed with
   * respect to whitespace (e.g., at the word level).
   *
   * @return a Set of the constituents as constituents of
   *         type {@code Constituent}
   */
  public Set constituents() {
    ConstituentFactory factory = new SimpleConstituentFactory();
    return constituents(factory);
  }


  /**
   * Returns the Constituents generated by the parse tree, computed at the
   * word level.  The Constituents of a sentence include the preterminal
   * categories but not the leaves.
   *
   * @param cf ConstituentFactory used to build the Constituent objects
   * @return a Set of the constituents as SimpleConstituent type
   *         (in the current implementation, a {@code HashSet})
   */
  public Set constituents(ConstituentFactory cf) {
    final boolean charLevel = false;
    return constituents(cf, charLevel);
  }

  /**
   * Returns the Constituents generated by the parse tree, limited to a
   * maximum depth.  The Constituents of a sentence include the preterminal
   * categories but not the leaves.
   *
   * @param cf ConstituentFactory used to build the Constituent objects
   * @param maxDepth The maximum depth at which to add constituents,
   *                 where 0 is the root level.  Negative maxDepth
   *                 indicates no maximum.
   * @return a Set of the constituents as SimpleConstituent type
   *         (in the current implementation, a {@code HashSet})
   */
  public Set constituents(ConstituentFactory cf, int maxDepth) {
    final int startPosition = 0;
    final int rootDepth = 0;
    Set collected = Generics.newHashSet();
    // Word-level spans, no filter.
    constituents(collected, startPosition, cf, false, null, maxDepth, rootDepth);
    return collected;
  }

  /**
   * Returns the Constituents generated by the parse tree, at either word
   * or character granularity.  The Constituents of a sentence include the
   * preterminal categories but not the leaves.
   *
   * @param cf ConstituentFactory used to build the Constituent objects
   * @param charLevel If true, compute bracketings irrespective of whitespace boundaries.
   * @return a Set of the constituents as SimpleConstituent type
   *         (in the current implementation, a {@code HashSet})
   */
  public Set constituents(ConstituentFactory cf, boolean charLevel) {
    final int startPosition = 0;
    final int rootDepth = 0;
    final int noDepthLimit = -1;
    Set collected = Generics.newHashSet();
    constituents(collected, startPosition, cf, charLevel, null, noDepthLimit, rootDepth);
    return collected;
  }

  /**
   * Returns the Constituents generated by the parse tree, keeping only
   * nodes accepted by a filter.  No depth limit is applied.
   *
   * @param cf ConstituentFactory used to build the Constituent objects
   * @param charLevel If true, compute bracketings irrespective of whitespace boundaries.
   * @param filter Decides whether a tree node is added as a constituent;
   *               {@code null} accepts every node
   * @return a Set of the constituents
   */
  public Set constituents(ConstituentFactory cf, boolean charLevel, Predicate filter) {
    final int startPosition = 0;
    final int rootDepth = 0;
    final int noDepthLimit = -1;
    Set collected = Generics.newHashSet();
    constituents(collected, startPosition, cf, charLevel, filter, noDepthLimit, rootDepth);
    return collected;
  }

  /**
   * Walks the tree and stores each node's span as an {@code IntPair} under
   * {@code SpanAnnotation} in the node's {@code CoreLabel}.  A leaf starting
   * at {@code left} gets the span (left, left); an internal node gets the
   * span from {@code left} to the index of its last leaf.  Children are
   * labelled before their parent.
   *
   * @param left The left position to begin labeling from
   * @return The index of the right frontier of the constituent (one past
   *         the last leaf covered)
   * @throws UnsupportedOperationException if a node's label is not a {@code CoreLabel}
   */
  private int constituentsNodes(int left) {
    int right;
    if (isLeaf()) {
      // A leaf occupies exactly one position.
      right = left + 1;
    } else {
      right = left;
      // Thread the running position through the daughters, left to right.
      for (Tree kid : children()) {
        right = kid.constituentsNodes(right);
      }
    }

    if (!(label() instanceof CoreLabel)) {
      throw new UnsupportedOperationException("Can only set spans on trees which use CoreLabel");
    }
    ((CoreLabel) label()).set(CoreAnnotations.SpanAnnotation.class, new IntPair(left, right - 1));
    return right;
  }

  /**
   * Adds the constituents derived from {@code this} tree to the given
   * {@code Set}, numbering from {@code left} and returning the position of
   * the right edge.  The right frontier is returned so that bracketings can
   * be produced recursively by threading the position through the daughters
   * of a tree.  A preterminal adds nothing; it only advances the position,
   * by one or, at character level, by the length of its child's value.
   *
   * @param constituentsSet set of constituents to add results of bracketing
   *                        this tree to
   * @param left            left position to begin labeling the bracketings with
   * @param cf              ConstituentFactory used to build the Constituent objects
   * @param charLevel       If true, compute constituents without respect to whitespace. Otherwise, preserve whitespace boundaries.
   * @param filter          A filter to use to decide whether or not to add a tree as a constituent; {@code null} accepts all.
   * @param maxDepth        The maximum depth at which to allow constituents.  Set to negative to indicate all depths allowed.
   * @param depth           The current depth
   * @return Index of right frontier of Constituent
   */
  private int constituents(Set constituentsSet, int left, ConstituentFactory cf, boolean charLevel, Predicate filter, int maxDepth, int depth) {
    if (isPreTerminal()) {
      int width = charLevel ? firstChild().value().length() : 1;
      return left + width;
    }

    int right = left;
    for (Tree kid : children()) {
      right = kid.constituents(constituentsSet, right, cf, charLevel, filter, maxDepth, depth + 1);
    }

    boolean accepted = (filter == null) || filter.test(this);
    boolean withinDepth = (maxDepth < 0) || (depth <= maxDepth);
    if (accepted && withinDepth) {
      // The span of the whole subtree is only known once all daughters are done.
      constituentsSet.add(cf.newConstituent(left, right - 1, label(), score()));
    }
    return right;
  }


  /**
   * Builds a new one-level tree for this node.  The result has a fresh
   * mother node and fresh daughter nodes created through
   * {@code treeFactory()}; the Labels are shared, not copied, and each
   * daughter is created with an empty child list.
   *
   * @return A local tree
   */
  public Tree localTree() {
    Tree[] daughters = children();
    TreeFactory factory = treeFactory();
    Tree[] stubs = new Tree[daughters.length];
    int slot = 0;
    for (Tree daughter : daughters) {
      stubs[slot] = factory.newTreeNode(daughter.label(), Arrays.asList(EMPTY_TREE_ARRAY));
      slot++;
    }
    return factory.newTreeNode(label(), Arrays.asList(stubs));
  }


  /**
   * Returns a set of one-level {@code Tree}s that are the local trees
   * of the tree.  For each phrasal node encountered when iterating over
   * this tree, {@link #localTree()} is called and its result collected.
   *
   * @return A set of local trees
   */
  public Set<Tree> localTrees() {
    Set<Tree> set = Generics.newHashSet();
    for (Tree st : this) {
      if (st.isPhrasal()) {
        set.add(st.localTree());
      }
    }
    return set;
  }


  /**
   * Most instances of {@code Tree} will take a lot more
   * than the default {@code StringBuilder} capacity of 16 to print
   * as a bracketed string of the whole tree, so {@code toString()}
   * starts from this larger initial capacity.
   */
  private static final int initialPrintStringBuilderSize = 500;

  /**
   * Appends the printed form of a parse tree (as a bracketed String)
   * to a {@code StringBuilder}, printing each label as its
   * {@code value()}, or as the empty string when the value is {@code null}.
   * The implementation of this may be more efficient than for
   * {@code toString()} on complex trees.
   *
   * @param sb The {@code StringBuilder} to which the tree will be appended
   * @return Returns the {@code StringBuilder} passed in with extra stuff in it
   */
  public StringBuilder toStringBuilder(StringBuilder sb) {
    // Bind the lambda to an explicitly parameterized type so that
    // label.value() type-checks however the overload declares its parameter.
    Function<Label, String> valueOrEmpty = label -> (label.value() == null) ? "" : label.value();
    return toStringBuilder(sb, valueOrEmpty);
  }

  /**
   * Appends the printed form of a parse tree (as a bracketed String)
   * to a {@code StringBuilder}.  A leaf is printed as its formatted label
   * alone; any other node is printed as an opening parenthesis, its
   * formatted label, each child preceded by a space, and a closing
   * parenthesis.  A {@code null} label is skipped and never passed to the
   * formatter.
   * The implementation of this may be more efficient than for
   * {@code toString()} on complex trees.
   *
   * @param sb The {@code StringBuilder} to which the tree will be appended
   * @param labelFormatter Formatting routine for how to print a Label
   * @return Returns the {@code StringBuilder} passed in with extra stuff in it
   */
  public StringBuilder toStringBuilder(StringBuilder sb, Function<Label, String> labelFormatter) {
    if (isLeaf()) {
      if (label() != null) {
        sb.append(labelFormatter.apply(label()));
      }
      return sb;
    }
    sb.append('(');
    if (label() != null) {
      sb.append(labelFormatter.apply(label()));
    }
    Tree[] kids = children();
    if (kids != null) {
      for (Tree kid : kids) {
        sb.append(' ');
        kid.toStringBuilder(sb, labelFormatter);
      }
    }
    return sb.append(')');
  }


  /**
   * Converts the parse tree to a string in Penn Treebank bracketed format.
   *
   * Implementation note: a single pre-sized {@code StringBuilder} is
   * chained through all of the printing for efficiency.
   *
   * @return the tree as a bracketed list on one line
   */
  @Override
  public String toString() {
    StringBuilder rendered = toStringBuilder(new StringBuilder(Tree.initialPrintStringBuilderSize));
    return rendered.toString();
  }


  /**
   * Width handed to {@code makeIndentString} when the indented list and
   * indented XML printers build the padding added per tree level.
   */
  private static final int indentIncr = 2;


  /**
   * Builds a string consisting of {@code indent} space characters.
   *
   * @param indent The number of spaces wanted; zero or a negative value
   *               yields the empty string
   * @return A string of {@code indent} spaces
   */
  private static String makeIndentString(int indent) {
    StringBuilder sb = new StringBuilder(Math.max(indent, 0));
    // The bound is the requested width, not a fixed constant.
    for (int i = 0; i < indent; i++) {
      sb.append(' ');
    }
    return sb.toString();
  }


  /**
   * Prints the local tree structure of this node to {@code System.out}
   * through an auto-flushing {@code PrintWriter}.
   */
  public void printLocalTree() {
    PrintWriter stdout = new PrintWriter(System.out, true);
    printLocalTree(stdout);
  }

  /**
   * Prints only the local tree structure: this node's label followed by
   * the parenthesised label of each direct child.  Does not recurse.
   *
   * @param pw The PrintWriter to print to
   */
  public void printLocalTree(PrintWriter pw) {
    pw.print("(");
    pw.print(label());
    pw.print(' ');
    Tree[] daughters = children();
    for (int i = 0; i < daughters.length; i++) {
      pw.print("(" + daughters[i].label() + ") ");
    }
    pw.println(")");
  }


  /**
   * Indented list printing of a tree to {@code System.out}.  The tree is
   * printed in an indented list notation; scores are not printed.
   */
  public void indentedListPrint() {
    PrintWriter stdout = new PrintWriter(System.out, true);
    final boolean printScores = false;
    indentedListPrint(stdout, printScores);
  }


  /**
   * Indented list printing of a tree.  The tree is printed in an
   * indented list notation, with node labels optionally followed by node
   * scores.  Printing starts with no indentation.
   *
   * @param pw The PrintWriter to print the tree to
   * @param printScores Whether to print the scores (log probs) of tree nodes
   */
  public void indentedListPrint(PrintWriter pw, boolean printScores) {
    final String baseIndent = "";
    final String perLevelPad = makeIndentString(indentIncr);
    indentedListPrint(baseIndent, perLevelPad, pw, printScores);
  }


  /**
   * Recursive worker for indented list printing.  Prints one line for this
   * node (indent, label if present, then two spaces and the score if
   * requested) and then each child with a deeper indent.
   * String parameters are used rather than integer levels for efficiency.
   *
   * @param indent The base {@code String} (normally just spaces)
   *               to print before each line of tree
   * @param pad    The additional {@code String} (normally just more
   *               spaces) to add when going to a deeper level of {@code Tree}.
   * @param pw     The PrintWriter to print the tree to
   * @param printScores Whether to print the scores (log probs) of tree nodes
   */
  private void indentedListPrint(String indent, String pad, PrintWriter pw, boolean printScores) {
    StringBuilder line = new StringBuilder(indent);
    Label nodeLabel = label();
    if (nodeLabel != null) {
      line.append(nodeLabel);
    }
    if (printScores) {
      line.append("  ").append(score());
    }
    pw.println(line);

    Tree[] daughters = children();
    final String deeperIndent = indent + pad;
    for (int i = 0; i < daughters.length; i++) {
      daughters[i].indentedListPrint(deeperIndent, pad, pw, printScores);
    }
  }

  /**
   * Indented xml printing of a tree to {@code System.out}.  The tree is
   * printed in an indented xml notation; scores are not printed.
   */
  public void indentedXMLPrint() {
    PrintWriter stdout = new PrintWriter(System.out, true);
    final boolean printScores = false;
    indentedXMLPrint(stdout, printScores);
  }


  /**
   * Indented xml printing of a tree.  The tree is printed in an
   * indented xml notation, with node labels optionally followed by node
   * scores.  Printing starts with no indentation.
   *
   * @param pw The PrintWriter to print the tree to
   * @param printScores Whether to print the scores (log probs) of tree nodes
   */
  public void indentedXMLPrint(PrintWriter pw, boolean printScores) {
    final String baseIndent = "";
    final String perLevelPad = makeIndentString(indentIncr);
    indentedXMLPrint(baseIndent, perLevelPad, pw, printScores);
  }


  /**
   * Recursive worker for indented xml printing.  A node with children is
   * written as a {@code <node ...>} element that wraps its children and is
   * closed with {@code </node>}; a node without children is written as an
   * empty {@code <leaf .../>} element.  When the node has a label, its
   * escaped string form is written as the {@code value} attribute and, if
   * requested, the score as a quoted {@code score} attribute.  A node
   * without a label is written as a bare {@code <node>} or {@code <leaf/>}.
   * String parameters are used rather than integer levels for efficiency.
   *
   * @param indent The base {@code String} (normally just spaces)
   *               to print before each line of tree
   * @param pad    The additional {@code String} (normally just more
   *               spaces) to add when going to a deeper level of {@code Tree}.
   * @param pw     The PrintWriter to print the tree to
   * @param printScores Whether to print the scores (log probs) of tree nodes
   */
  private void indentedXMLPrint(String indent, String pad,
                                PrintWriter pw, boolean printScores) {
    StringBuilder sb = new StringBuilder(indent);
    Tree[] children = children();
    boolean hasChildren = children.length > 0;
    Label label = label();
    if (label != null) {
      sb.append(hasChildren ? "<node value=\"" : "<leaf value=\"");
      sb.append(XMLUtils.escapeXML(SentenceUtils.wordToString(label, true)));
      sb.append('"');
      if (printScores) {
        // XML attribute values must be quoted.
        sb.append(" score=\"").append(score()).append('"');
      }
      sb.append(hasChildren ? ">" : "/>");
    } else {
      // Unlabeled nodes still need an element so the output stays well-formed.
      sb.append(hasChildren ? "<node>" : "<leaf/>");
    }
    pw.println(sb);
    if (hasChildren) {
      String newIndent = indent + pad;
      for (Tree child : children) {
        child.indentedXMLPrint(newIndent, pad, pw, printScores);
      }
      // Close the element opened above for this node.
      pw.println(indent + "</node>");
    }
  }


  /**
   * Displays a run of sibling trees in Penn Treebank layout, passing each
   * one the information {@code display} needs to decide whether it stays
   * on the current line: whether it is the first sibling and whether its
   * left sibling was a preterminal.
   *
   * @param trChildren      The sibling trees to print, in order
   * @param indent          The indentation level for these siblings
   * @param parentLabelNull Whether the parent's label (or its value) is null
   * @param labelFormatter  Formatting routine for how to print a Label
   * @param pw              The PrintWriter to print to
   */
  private static void displayChildren(Tree[] trChildren, int indent, boolean parentLabelNull,
                                      Function<Label, String> labelFormatter, PrintWriter pw) {
    boolean firstSibling = true;
    boolean leftSibIsPreTerm = true;  // counts as true at beginning
    for (Tree currentTree : trChildren) {
      currentTree.display(indent, parentLabelNull, firstSibling, leftSibIsPreTerm, false, labelFormatter, pw);
      leftSibIsPreTerm = currentTree.isPreTerminal();
      // CC is a special case for English, but leave it in so we can exactly match PTB3 tree formatting
      if (currentTree.value() != null && currentTree.value().startsWith("CC")) {
        leftSibIsPreTerm = false;
      }
      firstSibling = false;
    }
  }

  /**
   *  Returns the value of the node's label as a String: the result of
   *  {@code value()} if it is non-null, otherwise the empty string.
   *
   *  @return The label of a tree node as a String, never {@code null}
   */
  public String nodeString() {
    String text = value();
    return (text != null) ? text : "";
  }

  /**
   * Displays a node, implementing Penn Treebank style layout.  The node
   * stays on the current line (preceded by one space) when its parent is
   * unlabeled, when it is a first-sibling preterminal, or when it is a
   * preterminal following a preterminal and its own value does not start
   * with "CC".  Otherwise it starts a new line (except at top level) and
   * is indented by two spaces per level.  Leaves and preterminals are
   * printed through {@code toStringBuilder}; other nodes print their
   * formatted label and then their children.  A {@code null} label is
   * skipped rather than handed to the formatter, matching
   * {@code toStringBuilder}.
   *
   * @param indent                 Indentation level of this node
   * @param parentLabelNull        Whether the parent's label (or its value) is null
   * @param firstSibling           Whether this node is the first child of its parent
   * @param leftSiblingPreTerminal Whether the left sibling counts as a preterminal
   * @param topLevel               Whether this is the outermost call (no leading newline)
   * @param labelFormatter         Formatting routine for how to print a Label
   * @param pw                     The PrintWriter to print to
   */
  private void display(int indent, boolean parentLabelNull, boolean firstSibling, boolean leftSiblingPreTerminal, boolean topLevel, Function<Label, String> labelFormatter, PrintWriter pw) {
    Label nodeLabel = label();
    String nodeValue = (nodeLabel == null) ? null : nodeLabel.value();
    // A label without a value cannot be a CC tag; testing the value directly avoids an NPE.
    boolean isConjunction = nodeValue != null && nodeValue.startsWith("CC");
    // the condition for staying on the same line in Penn Treebank
    boolean suppressIndent = (parentLabelNull || (firstSibling && isPreTerminal()) || (leftSiblingPreTerminal && isPreTerminal() && !isConjunction));
    if (suppressIndent) {
      pw.print(" ");
    } else {
      if (!topLevel) {
        pw.println();
      }
      for (int i = 0; i < indent; i++) {
        pw.print("  ");
      }
    }
    if (isLeaf() || isPreTerminal()) {
      String terminalString = toStringBuilder(new StringBuilder(), labelFormatter).toString();
      pw.print(terminalString);
      pw.flush();
      return;
    }
    pw.print("(");
    if (nodeLabel != null) {
      pw.print(labelFormatter.apply(nodeLabel));
    }
    boolean parentIsNull = nodeValue == null;
    displayChildren(children(), indent + 1, parentIsNull, labelFormatter, pw);
    pw.print(")");
    pw.flush();
  }

  /**
   * Print the tree as done in Penn Treebank merged files, printing each
   * label as its {@code value()} (or the empty string when that is null).
   * The formatting should be exactly the same, but we don't print the
   * trailing whitespace found in Penn Treebank trees.
   * The basic deviation from a bracketed indented tree is to in general
   * collapse the printing of adjacent preterminals onto one line of
   * tags and words.  Additional complexities are that conjunctions
   * (tag CC) are not collapsed in this way, and that the unlabeled
   * outer brackets are collapsed onto the same line as the next
   * bracket down.
   *
   * @param pw The tree is printed to this {@code PrintWriter}
   */
  public void pennPrint(PrintWriter pw) {
    // Bind the lambda to an explicitly parameterized type so that
    // label.value() type-checks however the overload declares its parameter.
    Function<Label, String> valueOrEmpty = label -> (label.value() == null) ? "" : label.value();
    pennPrint(pw, valueOrEmpty);
  }

  /**
   * Print the tree in Penn Treebank layout using a caller-supplied label
   * formatter, then end the line and flush the writer.
   *
   * @param pw The tree is printed to this {@code PrintWriter}
   * @param labelFormatter Formatting routine for how to print a Label
   */
  public void pennPrint(PrintWriter pw, Function<Label, String> labelFormatter) {
    display(0, false, false, false, true, labelFormatter, pw);
    pw.println();
    pw.flush();
  }


  /**
   * Print the tree as done in Penn Treebank merged files, writing to a
   * {@code PrintStream} by wrapping it in an auto-flushing
   * {@code PrintWriter}.
   * The formatting should be exactly the same, but we don't print the
   * trailing whitespace found in Penn Treebank trees.
   * The basic deviation from a bracketed indented tree is to in general
   * collapse the printing of adjacent preterminals onto one line of
   * tags and words.  Additional complexities are that conjunctions
   * (tag CC) are not collapsed in this way, and that the unlabeled
   * outer brackets are collapsed onto the same line as the next
   * bracket down.
   *
   * @param ps The tree is printed to this {@code PrintStream}
   */
  public void pennPrint(PrintStream ps) {
    PrintWriter wrapped = new PrintWriter(new OutputStreamWriter(ps), true);
    pennPrint(wrapped);
  }

  /**
   * Print the tree in Penn Treebank layout to a {@code PrintStream} using a
   * caller-supplied label formatter.  The stream is wrapped in an
   * auto-flushing {@code PrintWriter}.
   *
   * @param ps The tree is printed to this {@code PrintStream}
   * @param labelFormatter Formatting routine for how to print a Label
   */
  public void pennPrint(PrintStream ps, Function<Label, String> labelFormatter) {
    pennPrint(new PrintWriter(new OutputStreamWriter(ps), true), labelFormatter);
  }

  /**
   * Calls {@code pennPrint()} on an in-memory writer and returns what was
   * written.
   *
   * @return The indented S-expression representation of a Tree
   */
  public String pennString() {
    StringWriter buffer = new StringWriter();
    PrintWriter sink = new PrintWriter(buffer);
    pennPrint(sink);
    return buffer.toString();
  }

  /**
   * Prints the tree to {@code System.out} in the layout used by Penn
   * Treebank merged files, minus the trailing whitespace those files
   * contain.  Compared with a plain bracketed, indented tree, runs of
   * adjacent preterminals are generally collapsed onto a single line of
   * tags and words.  Conjunctions (tag CC) are exempt from that
   * collapsing, and the unlabeled outer brackets share a line with the
   * next bracket down.
   */
  public void pennPrint() {
    final PrintStream stdout = System.out;
    pennPrint(stdout);
  }


  /**
   * Computes the depth of this tree: the number of edges on the longest
   * path from this node down to a leaf.  A leaf therefore has depth 0,
   * a POS tag node has depth 1, and phrasal nodes have depth
   * {@code >= 2}.
   *
   * @return the depth
   */
  public int depth() {
    if (isLeaf()) {
      return 0;
    }
    int deepestChild = 0;
    for (Tree child : children()) {
      deepestChild = Math.max(deepestChild, child.depth());
    }
    return deepestChild + 1;
  }

  /**
   * Finds the distance (number of parent links) from this node down to
   * the specified node.
   *
   * @param node A subtree contained in this tree
   * @return 0 if {@code node} is this tree itself, the number of parent
   *         links between {@code node} and this tree if this tree is an
   *         ancestor of {@code node}, or -1 if no parent of {@code node}
   *         can be found within this tree
   */
  public int depth(Tree node) {
    // Check identity first so the trivial case does not pay for a parent search.
    if (this == node) {
      return 0;
    }
    Tree ancestor = node.parent(this);
    if (ancestor == null) {
      return -1;
    }
    int depth = 1;
    // Walk upward one parent at a time until we arrive at this tree.
    while (this != ancestor) {
      ancestor = ancestor.parent(this);
      depth++;
    }
    return depth;
  }


  /**
   * Follows head children downward until a leaf is reached and returns
   * that leaf.
   *
   * @param hf The head-finding algorithm to use
   * @param parent  The parent of this tree; the same value is forwarded
   *                unchanged on every recursive step
   * @return The head tree leaf if any, else {@code null} (in which case a
   *         message is logged)
   */
  public Tree headTerminal(HeadFinder hf, Tree parent) {
    if (isLeaf()) {
      return this;
    }
    final Tree headChild = hf.determineHead(this, parent);
    if (headChild == null) {
      log.info("Head is null: " + this);
      return null;
    }
    return headChild.headTerminal(hf, parent);
  }

  /**
   * Returns the tree leaf that is the head of the tree, searching with
   * no parent context (a {@code null} parent is supplied).
   *
   * @param hf The headfinding algorithm to use
   * @return The head tree leaf if any, else {@code null}
   */
  public Tree headTerminal(HeadFinder hf) {
    final Tree noParent = null;
    return headTerminal(hf, noParent);
  }


  /**
   * Follows head children downward until a preterminal is reached and
   * returns that preterminal.  See {@link #isPreTerminal()} for
   * the definition of a preterminal node. Beware that some tree nodes may
   * have no preterminal head; a message is logged when that happens.
   *
   * @param hf The headfinding algorithm to use
   * @return The head preterminal tree, if any, else {@code null}
   * @throws IllegalArgumentException if called on a leaf node
   */
  public Tree headPreTerminal(HeadFinder hf) {
    if (isPreTerminal()) {
      return this;
    }
    if (isLeaf()) {
      throw new IllegalArgumentException("Called headPreTerminal on a leaf: " + this);
    }
    final Tree headChild = hf.determineHead(this);
    if (headChild == null) {
      log.info("Head preterminal is null: " + this);
      return null;
    }
    return headChild.headPreTerminal(hf);
  }

  /**
   * Determines the head of every non-leaf node in this tree and records
   * it on the node's label as HeadWordLabelAnnotation and
   * HeadTagLabelAnnotation.  Leaves are left untouched.  A preterminal
   * takes its child's label as head word and its own label as head tag.
   * Any other node first processes its children and then copies the
   * annotations from the child chosen by the head finder.
   *
   * @param hf The headfinding algorithm to use
   * @throws IllegalArgumentException if this node's label is not a CoreLabel
   * @throws NullPointerException if the head finder returns {@code null}
   *         for a node above the preterminal level
   */
  public void percolateHeadAnnotations(HeadFinder hf) {
    final Label rawLabel = label();
    if (!(rawLabel instanceof CoreLabel)) {
      throw new IllegalArgumentException("Expected CoreLabels in the trees");
    }
    final CoreLabel self = (CoreLabel) rawLabel;

    if (isLeaf()) {
      return;
    }

    if (isPreTerminal()) {
      final CoreLabel wordLabel = (CoreLabel) children()[0].label();
      self.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, wordLabel);
      self.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, self);
      return;
    }

    // Children must be annotated first: the phrasal case below reads their annotations.
    for (Tree child : children()) {
      child.percolateHeadAnnotations(hf);
    }

    final Tree headChild = hf.determineHead(this);
    if (headChild == null) {
      throw new NullPointerException("HeadFinder " + hf + " returned null for " + this);
    }

    if (headChild.isLeaf()) {
      // The head finder handed back a word; its tag is the label of its parent node.
      self.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, (CoreLabel) headChild.label());
      self.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, (CoreLabel) headChild.parent(this).label());
      return;
    }

    if (headChild.isPreTerminal()) {
      self.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, (CoreLabel) headChild.children()[0].label());
      self.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, (CoreLabel) headChild.label());
      return;
    }

    // Phrasal head: inherit whatever the recursion stored on it.
    if (!(headChild.label() instanceof CoreLabel)) {
      throw new AssertionError("Horrible bug");
    }
    final CoreLabel inherited = (CoreLabel) headChild.label();
    self.set(TreeCoreAnnotations.HeadWordLabelAnnotation.class, inherited.get(TreeCoreAnnotations.HeadWordLabelAnnotation.class));
    self.set(TreeCoreAnnotations.HeadTagLabelAnnotation.class, inherited.get(TreeCoreAnnotations.HeadTagLabelAnnotation.class));
  }


  /**
   * Finds the heads of the tree and stores them on the node labels; the
   * tree is modified.  Each non-leaf node receives the word, tag and
   * (when non-negative) index of the child selected by the head finder,
   * provided the node's label supports {@code HasWord}, {@code HasTag}
   * and {@code HasIndex} respectively; labels supporting none of these
   * are left unchanged.  The routine assumes the Tree has word leaves
   * and tag preterminals, and copies their category to word and tag
   * respectively, if they have a null value.  When the head finder
   * returns {@code null} for a node, a message is logged and that node
   * is left as it was.
   *
   * @param hf The headfinding algorithm to use
   */
  public void percolateHeads(HeadFinder hf) {
    final Label self = label();

    if (isLeaf()) {
      // Sanity check: word() is usually set by the TreeReader.
      if (self instanceof HasWord) {
        final HasWord wordHolder = (HasWord) self;
        if (wordHolder.word() == null) {
          wordHolder.setWord(self.value());
        }
      }
      return;
    }

    for (Tree child : children()) {
      child.percolateHeads(hf);
    }

    final Tree headChild = hf.determineHead(this);
    if (headChild == null) {
      log.info("Head is null: " + this);
      return;
    }
    final Label headLabel = headChild.label();

    // Head tag: taken from the head's label, or from this node's own
    // category when the head is a bare leaf without a tag.
    String tag = null;
    if (headLabel instanceof HasTag) {
      tag = ((HasTag) headLabel).tag();
    }
    if (tag == null && headChild.isLeaf()) {
      tag = self.value();
    }

    // Head word: taken from the head's label, or from the leaf's value.
    // The fallback matters when the leaf label type doesn't support word().
    String word = null;
    if (headLabel instanceof HasWord) {
      word = ((HasWord) headLabel).word();
    }
    if (word == null && headChild.isLeaf()) {
      word = headLabel.value();
    }

    // Head index: -1 marks "not available".
    int index = -1;
    if (headLabel instanceof HasIndex) {
      index = ((HasIndex) headLabel).index();
    }

    if (self instanceof HasWord) {
      ((HasWord) self).setWord(word);
    }
    if (self instanceof HasTag) {
      ((HasTag) self).setTag(tag);
    }
    if (self instanceof HasIndex && index >= 0) {
      ((HasIndex) self).setIndex(index);
    }
  }

  /**
   * Return a Set of TaggedWord-TaggedWord dependencies, represented as
   * Dependency objects, for the Tree.  This will only give
   * useful results if the internal tree node labels support HasWord and
   * HasTag, and head percolation has already been done (see
   * percolateHeads()).
   *
   * @return Set of dependencies (each a Dependency)
   */
  public Set> dependencies() {
    return dependencies(Filters.acceptFilter());
  }

  public Set> dependencies(Predicate> f) {
    return dependencies(f, true, true, false);
  }

  /**
   * Convert a constituency label to a dependency label. Options are provided for selecting
   * annotations to copy.
   *
   * @param oldLabel The constituency label to convert
   * @param copyLabel If {@code false}, {@code oldLabel} itself is returned and nothing is copied;
   *                  if {@code true}, a new label is created from {@code oldLabel}'s label factory
   * @param copyIndex Whether to copy the index (only when both labels implement {@code HasIndex})
   * @param copyPosTag Whether to copy the tag (only when both labels implement {@code HasTag})
   * @return {@code oldLabel} when {@code copyLabel} is {@code false}, otherwise a new label
   *         carrying the word form (the old label's word, or its value when it is not a
   *         {@code HasWord}) plus the requested annotations
   */
  private static Label makeDependencyLabel(Label oldLabel, boolean copyLabel, boolean copyIndex, boolean copyPosTag) {
    if (!copyLabel) {
      return oldLabel;
    }

    final String surface;
    if (oldLabel instanceof HasWord) {
      surface = ((HasWord) oldLabel).word();
    } else {
      surface = oldLabel.value();
    }

    final Label copy = oldLabel.labelFactory().newLabel(surface);
    if (copy instanceof HasWord) {
      ((HasWord) copy).setWord(surface);
    }

    if (copyPosTag && copy instanceof HasTag && oldLabel instanceof HasTag) {
      ((HasTag) copy).setTag(((HasTag) oldLabel).tag());
    }

    if (copyIndex && copy instanceof HasIndex && oldLabel instanceof HasIndex) {
      ((HasIndex) copy).setIndex(((HasIndex) oldLabel).index());
    }

    return copy;
  }

  /**
   * Return a set of TaggedWord-TaggedWord dependencies, represented as
   * Dependency objects, for the Tree.  This will only give
   * useful results if the internal tree node labels support HasWord and
   * head percolation has already been done (see percolateHeads()).
   *
   * @param f Dependencies are excluded for which the Dependency is not
   *          accepted by the Filter
   * @return Set of dependencies (each a Dependency)
   */
  public Set> dependencies(Predicate> f, boolean isConcrete, boolean copyLabel, boolean copyPosTag) {
    Set> deps = Generics.newHashSet();
    for (Tree node : this) {
      // Skip leaves and unary re-writes
      if (node.isLeaf() || node.children().length < 2) {
        continue;
      }
      // Create the head label (percolateHeads has already been executed)
      Label headLabel = makeDependencyLabel(node.label(), copyLabel, isConcrete, copyPosTag);
      String headWord = ((HasWord) headLabel).word();
      if (headWord == null) {
        headWord = headLabel.value();
      }
      int headIndex = (isConcrete && (headLabel instanceof HasIndex)) ? ((HasIndex) headLabel).index() : -1;

      // every child with a different (or repeated) head is an argument
      boolean seenHead = false;
      for (Tree child : node.children()) {
        Label depLabel = makeDependencyLabel(child.label(), copyLabel, isConcrete, copyPosTag);
        String depWord = ((HasWord) depLabel).word();
        if (depWord == null) {
          depWord = depLabel.value();
        }
        int depIndex = (isConcrete && (depLabel instanceof HasIndex)) ? ((HasIndex) depLabel).index() : -1;

        if (!seenHead && headIndex == depIndex && headWord.equals(depWord)) {
          seenHead = true;
        } else {
          Dependency dependency = (isConcrete && depIndex != headIndex) ?
              new UnnamedConcreteDependency(headLabel, depLabel) :
              new UnnamedDependency(headLabel, depLabel);

          if (f.test(dependency)) {
            deps.add(dependency);
          }
        }
      }
    }
    return deps;
  }

  /**
   * Return a set of Label-Label dependencies, represented as
   * Dependency objects, for the Tree.  The Labels are the ones of the leaf
   * nodes of the tree, without mucking with them.
   *
   * @param f  Dependencies are excluded for which the Dependency is not
   *           accepted by the Filter
   * @param hf The HeadFinder to use to identify the head of constituents.
   *           The code assumes
   *           that it can use {@code headPreTerminal(hf)} to find a
   *           tag and word to make a CoreLabel.
   * @return Set of dependencies (each a {@code Dependency} between two
   *           {@code CoreLabel}s, which each contain a tag(), word(),
   *           and value(), the last two of which are identical).
   */
  public Set> mapDependencies(Predicate> f, HeadFinder hf) {
    if (hf == null) {
      throw new IllegalArgumentException("mapDependencies: need HeadFinder");
    }
    Set> deps = Generics.newHashSet();
    for (Tree node : this) {
      if (node.isLeaf() || node.children().length < 2) {
        continue;
      }
      // Label l = node.label();
      // log.info("doing kids of label: " + l);
      //Tree hwt = node.headPreTerminal(hf);
      Tree hwt = node.headTerminal(hf);
      // log.info("have hf, found head preterm: " + hwt);
      if (hwt == null) {
        throw new IllegalStateException("mapDependencies: HeadFinder failed!");
      }

      for (Tree child : node.children()) {
        // Label dl = child.label();
        // Tree dwt = child.headPreTerminal(hf);
        Tree dwt = child.headTerminal(hf);
        if (dwt == null) {
          throw new IllegalStateException("mapDependencies: HeadFinder failed!");
        }
        //log.info("kid is " + dl);
         //log.info("transformed to " + dml.toString("value{map}"));
        if (dwt != hwt) {
          Dependency p = new UnnamedDependency(hwt.label(), dwt.label());
          if (f.test(p)) {
            deps.add(p);
          }
        }
      }
    }
    return deps;
  }

  /**
   * Return a set of Label-Label dependencies, represented as
   * Dependency objects, for the Tree.  The Labels are the ones of the leaf
   * nodes of the tree, without mucking with them. The head of the sentence is a
   * dependent of a synthetic "root" label.
   *
   * @param f  Dependencies are excluded for which the Dependency is not
   *           accepted by the Filter
   * @param hf The HeadFinder to use to identify the head of constituents.
   *           The code assumes
   *           that it can use {@code headPreTerminal(hf)} to find a
   *           tag and word to make a CoreLabel.
   * @param    rootName Name of the root node.
   * @return   Set of dependencies (each a {@code Dependency} between two
   *           {@code CoreLabel}s, which each contain a tag(), word(),
   *           and value(), the last two of which are identical).
   */
  public Set> mapDependencies(Predicate> f, HeadFinder hf, String rootName) {
    Set> deps = mapDependencies(f, hf);
    if(rootName != null) {
      Label hl = headTerminal(hf).label();
      CoreLabel rl = new CoreLabel();
      rl.set(CoreAnnotations.TextAnnotation.class, rootName);
      rl.set(CoreAnnotations.IndexAnnotation.class, 0);
      deps.add(new NamedDependency(rl, hl, rootName));
    }
    return deps;
  }

  /**
   * Gets the yield of the tree.  The {@code Label} of all leaf nodes
   * is returned
   * as a list ordered by the natural left to right order of the
   * leaves.  Null values, if any, are inserted into the list like any
   * other value.
   *
   * @return a {@code List} of the data in the tree's leaves.
   */
  public ArrayList

/**
 * Copies indices upward through the tree.  A preterminal takes the index
 * of its first child's label.  Any other node first recurses into each
 * child and then takes the index of the first child whose word (or value,
 * when the word is null) equals the node's own word (or value).
 * Labels are cast to {@code HasWord} and {@code HasIndex} unconditionally.
 *
 * This method assumes CoreLabels!
 */
  public void percolateHeadIndices() {
    if (isPreTerminal()) {
      int nodeIndex = ((HasIndex) firstChild().label()).index();
      ((HasIndex) label()).setIndex(nodeIndex);
      return;
    }

    // Assign the head index to the first child that we encounter with a matching
    // surface form. Obviously a head can have the same surface form as its dependent,
    // and in this case the head index is ambiguous.
    String wordAnnotation = ((HasWord) label()).word();
    if (wordAnnotation == null) {
      wordAnnotation = value();
    }

    boolean seenHead = false;
    for (Tree child : children()) {
      child.percolateHeadIndices();
      String childWordAnnotation = ((HasWord) child.label()).word();
      if (childWordAnnotation == null) {
        childWordAnnotation = child.value();
      }
      if (!seenHead && wordAnnotation.equals(childWordAnnotation)) {
        seenHead = true;
        int nodeIndex = ((HasIndex) child.label()).index();
        ((HasIndex) label()).setIndex(nodeIndex);
      }
    }
  }

  /**
   * Index all spans (constituents) in the tree.
   * For this, spans uses 0-based indexing and the span records the fencepost
   * to the left of the first word and after the last word of the span.
   * The spans are only recorded if the Tree has labels of a class which
   * extends CoreMap.
   */
  public void indexSpans() {
    indexSpans(0);
  }

  /**
   * Index all spans (constituents) in the tree, numbering the first leaf
   * with the given value instead of 0.
   *
   * @param startIndex Begin indexing at this value
   */
  public void indexSpans(int startIndex) {
    indexSpans(new MutableInteger(startIndex));
  }

  /**
   * Assigns span indices (BeginIndexAnnotation and EndIndexAnnotation) to all nodes in a tree.
   * The beginning index is equivalent to the IndexAnnotation of the first leaf in the constituent.
   * The end index is equivalent to the first integer after the IndexAnnotation of the last leaf in the constituent.
   * The counter is advanced by one for every leaf visited.
   *
   * @param startIndex Begin indexing at this value
   * @return The (begin, end) span of this node
   */
  public Pair<Integer, Integer> indexSpans(MutableInteger startIndex) {
    int start = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;

    if (isLeaf()) {
      start = startIndex.intValue();
      end = startIndex.intValue() + 1;
      startIndex.incValue(1);
    } else {
      // A constituent spans from its leftmost child's start to its rightmost child's end.
      for (Tree kid : children()) {
        Pair<Integer, Integer> span = kid.indexSpans(startIndex);
        if (span.first < start) {
          start = span.first;
        }
        if (span.second > end) {
          end = span.second;
        }
      }
    }

    Label label = label();
    if (label instanceof CoreMap) {
      CoreMap afl = (CoreMap) label;
      afl.set(CoreAnnotations.BeginIndexAnnotation.class, start);
      afl.set(CoreAnnotations.EndIndexAnnotation.class, end);
    }
    return new Pair<>(start, end);
  }

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy