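// NOTE(review): the lines below appear to be web-page residue from an artifact-repository
// listing rather than part of the original Java source; they are kept as comments so that
// the file can be parsed.
//
// edu.stanford.nlp.trees.GrammaticalRelation Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of stanford-parser Show documentation
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
// There is a newer version: 3.9.2
// Show newest version
//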
// Stanford Dependencies - Code for producing and using Stanford dependencies.
// Copyright © 2005-2014 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    [email protected]
//    http://nlp.stanford.edu/software/stanford-dependencies.shtml

package edu.stanford.nlp.trees;

import edu.stanford.nlp.international.Languages;
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalRelations;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.TregexPatternCompiler;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;

import java.io.ObjectStreamException;
import java.io.Serializable;
import java.lang.ref.SoftReference;
import java.util.*;
import java.util.regex.Pattern;


/**
 * {@code GrammaticalRelation} is used to define a
 * standardized, hierarchical set of grammatical relations,
 * together with patterns for identifying them in
 * parse trees.
 *
 * Each GrammaticalRelation has:
 * 

 *   A String short name, which should be a lowercase
 *       abbreviation of some kind (in the fure mainly Universal Dependency names).
 *   A String long name, which should be descriptive.
 *   A parent in the GrammaticalRelation hierarchy.
 *   A {@link Pattern Pattern} called
 *   sourcePattern which matches (parent) nodes from which
 *   this GrammaticalRelation could hold.  (Note: this is done
 *   with the Java regex Pattern matches() predicate. The pattern
 *   must match the
 *   whole node name, and ^ or $ aren't needed.
 *   Tregex constructions like __ do not work. Use ".*" to be applicable
 *   at all nodes. This prefiltering is used for efficiency.)
 *   A list of zero or more {@link TregexPattern
 *   TregexPatterns} called targetPatterns,
 *   which describe the local tree structure which must hold between
 *   the source node and a target node for the
 *   GrammaticalRelation to apply. (Note: {@code tregex}
 *   regular expressions match with the {@code find()} method, while
 *   literal string label descriptions that are not regular expressions must
 *   be {@code equals()}.)
 * 
 *
 * The targetPatterns associated
 * with a GrammaticalRelation are designed as follows.
 * In order to recognize a grammatical relation X holding between
 * nodes A and B in a parse tree, we want to associate with
 * GrammaticalRelation X a {@link TregexPattern
 * TregexPattern} such that:
 * 
 *   the root of the pattern matches A, and
 *   the pattern includes a node labeled "target", which matches B.
 * 
 * For example, for the grammatical relation PREDICATE
 * which holds between a clause and its primary verb phrase, we might
 * want to use the pattern {@code "S < VP=target"}, in which the
 * root will match a clause and the node labeled "target"
 * will match the verb phrase.
 *
 * For a given grammatical relation, the method {@link
 * GrammaticalRelation#getRelatedNodes getRelatedNodes()}
 * takes a Tree node as an argument and attempts to
 * return other nodes which have this grammatical relation to the
 * argument node.  By default, this method operates as follows: it
 * steps through the patterns in the pattern list, trying to match
 * each pattern against the argument node, until it finds some
 * matches.  If a pattern matches, all matching nodes (that is, each
 * node which corresponds to node label "target" in some match) are
 * returned as a list; otherwise the next pattern is tried.

 *
 * For some grammatical relations, we need more sophisticated logic to
 * identify related nodes.  In such cases, {@link
 * GrammaticalRelation#getRelatedNodes getRelatedNodes()}
 * can be overridden on a per-relation basis using anonymous subclassing.

 *
 * @see GrammaticalStructure
 * @see EnglishGrammaticalStructure
 * @see EnglishGrammaticalRelations
 * @see edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalRelations
 *
 * @author Bill MacCartney
 * @author Galen Andrew (refactoring English-specific stuff)
 * @author Ilya Sherman (refactoring annotation-relation pairing, which is now gone)
 */
public class GrammaticalRelation implements Comparable, Serializable {

  private static final long serialVersionUID = 892618003417550128L;

  private static final boolean DEBUG = System.getProperty("GrammaticalRelation", null) != null;

  private static final EnumMap>
    stringsToRelations = new EnumMap>(Language.class);

  /**
   * The "governor" grammatical relation, which is the inverse of "dependent".

   * 

   * Example: "the red car" → gov(red, car)
   */
  public static final GrammaticalRelation GOVERNOR =
    new GrammaticalRelation(Language.Any, "gov", "governor", null);


  /**
   * The "dependent" grammatical relation, which is the inverse of "governor".

   * 
   * Example: "the red car" → dep(car, red)
   */
  public static final GrammaticalRelation DEPENDENT =
    new GrammaticalRelation(Language.Any, "dep", "dependent", null);


  /**
   *  The "root" grammatical relation between a faked "ROOT" node, and the root of the sentence.
   */
  public static final GrammaticalRelation ROOT =
    new GrammaticalRelation(Language.Any, "root", "root", null);


  /**
   * Dummy relation, used while collapsing relations, e.g., in English & Chinese GrammaticalStructure
   */
  public static final GrammaticalRelation KILL =
    new GrammaticalRelation(Language.Any, "KILL", "dummy relation kill", null);


  /**
   * Returns the GrammaticalRelation having the given string
   * representation (e.g. "nsubj"), or null if no such is found.
   *
   * @param s The short name of the GrammaticalRelation
   * @param values The set of GrammaticalRelations to look for it among.
   * @return The GrammaticalRelation with that name
   */
  public static GrammaticalRelation valueOf(String s, Collection values) {
    for (GrammaticalRelation reln : values) {
      if (reln.toString().equals(s)) return reln;
    }

    return null;
  }

  /** Convert from a String representation of a GrammaticalRelation to a
   *  GrammaticalRelation.  Where possible, you should avoid using this
   *  method and simply work with true GrammaticalRelations rather than
   *  String representations.  Correct behavior of this method depends
   *  on the underlying data structure resources used being kept in sync
   *  with the toString() and equals() methods.  However, there is really
   *  no choice but to use this method when storing GrammaticalRelations
   *  to text files and then reading them back in, so this method is not
   *  deprecated.
   *
   *  @param s The String representation of a GrammaticalRelation
   *  @return The grammatical relation represented by this String
   */
  public static GrammaticalRelation valueOf(Language language, String s) {
    GrammaticalRelation reln = (stringsToRelations.get(language) != null ? valueOf(s, stringsToRelations.get(language).values()) : null);
    if (reln == null) {
      // TODO this breaks the hierarchical structure of the classes,
      //      but it makes English relations that much likelier to work.
      reln = EnglishGrammaticalRelations.valueOf(s);
    }
    if (reln == null) {
      // the block below fails when 'specific' includes underscores.
      // this is possible on weird web text, which generates relations such as prep______
      /*
      String[] names = s.split("_");
      String specific = names.length > 1? names[1] : null;
      reln = new GrammaticalRelation(language, names[0], null, null, null, specific);
      */
      String name;
      String specific;
      int underscorePosition = s.indexOf('_');
      if (underscorePosition > 0) {
        name = s.substring(0, underscorePosition);
        specific = s.substring(underscorePosition + 1);
      } else {
        name = s;
        specific = null;
      }
      reln = new GrammaticalRelation(language, name, null, null, specific);

    }
    return reln;
  }

  private static Map> valueOfCache = new HashMap>();

  public static GrammaticalRelation valueOf(String s, Languages.Language language) {
    GrammaticalRelation value = null;
    SoftReference possiblyCachedValue = valueOfCache.get(s);
    if (possiblyCachedValue != null) { value = possiblyCachedValue.get(); }
    if (value == null) {  // TODO(gabor) we have the language conversion going on again...
      Language depLanguage = Language.Any;
      switch (language) {
        case Arabic:
          break;
        case Chinese:
          depLanguage = Language.Chinese;
          break;
        case English:
          depLanguage = Language.English;
          break;
        case German:
          break;
        case French:
          break;
        case Hebrew:
          break;
        case Spanish:
          break;
        case Unknown:
          depLanguage = Language.Any;
          break;
      }
      value = valueOf(depLanguage, s);
      valueOfCache.put(s, new SoftReference<>(value));
    }
    return value;
  }

  public static GrammaticalRelation valueOf(String s) {
    return valueOf(s, Languages.Language.English);
  }

  /**
   * This function is used to determine whether the GrammaticalRelation in
   * question is one that was created to be a thin wrapper around a String
   * representation by valueOf(String), or whether it is a full-fledged
   * GrammaticalRelation created by direct invocation of the constructor.
   *
   * @return Whether this relation is just a wrapper created by valueOf(String)
   */
  public boolean isFromString() {
    return longName == null;
  }


  public static enum Language { Any, English, Chinese }


  /* Non-static stuff */
  private final Language language;
  private final String shortName;
  private final String longName;
  private final GrammaticalRelation parent;
  private final List children = new ArrayList();
  // a regexp for node values at which this relation can hold
  private final Pattern sourcePattern;
  private final List targetPatterns = new ArrayList();
  private final String specific; // to hold the specific prep or conjunction associated with the grammatical relation

  // TODO document constructor
  // TODO change to put specificString after longName, and then use String... for targetPatterns
  private GrammaticalRelation(Language language,
                             String shortName,
                             String longName,
                             GrammaticalRelation parent,
                             String sourcePattern,
                             TregexPatternCompiler tregexCompiler,
                             String[] targetPatterns,
                             String specificString) {
    this.language = language;
    this.shortName = shortName;
    this.longName = longName;
    this.parent = parent;
    this.specific = specificString; // this can be null!

    if (parent != null) {
      parent.addChild(this);
    }

    if (sourcePattern != null) {
      try {
        this.sourcePattern = Pattern.compile(sourcePattern);
      } catch (java.util.regex.PatternSyntaxException e) {
        throw new RuntimeException("Bad pattern: " + sourcePattern);
      }
    } else {
      this.sourcePattern = null;
    }

    for (String pattern : targetPatterns) {
      try {
        TregexPattern p = tregexCompiler.compile(pattern);
        this.targetPatterns.add(p);
      } catch (edu.stanford.nlp.trees.tregex.TregexParseException pe) {
        throw new RuntimeException("Bad pattern: " + pattern, pe);
      }
    }

    Map sToR = stringsToRelations.get(language);
    if (sToR == null) {
      sToR = Generics.newHashMap();
      stringsToRelations.put(language, sToR);
    }
    GrammaticalRelation previous = sToR.put(toString(), this);
    if (previous != null) {
      if (!previous.isFromString() && !isFromString()) {
        throw new IllegalArgumentException("There is already a relation named " + toString() + '!');
      } else {
        /* We get here if we previously just built a fake relation from a string
         * we previously read in from a file.
         */
        // TODO is it worth copying all of the information from this real
        //      relation into the old fake one?
      }
    }
  }

  // This is the main constructor used
  public GrammaticalRelation(Language language,
                             String shortName,
                             String longName,
                             GrammaticalRelation parent,
                             String sourcePattern,
                             TregexPatternCompiler tregexCompiler,
                             String... targetPatterns) {
    this(language, shortName, longName, parent, sourcePattern, tregexCompiler, targetPatterns, null);
  }

  // Used for non-leaf relations with no patterns
  public GrammaticalRelation(Language language,
                             String shortName,
                             String longName,
                             GrammaticalRelation parent) {
    this(language, shortName, longName, parent, null, null, StringUtils.EMPTY_STRING_ARRAY, null);
  }

  // used to create collapsed relations with specificString
  public GrammaticalRelation(Language language,
                             String shortName,
                             String longName,
                             GrammaticalRelation parent,
                             String specificString) {
    this(language, shortName, longName, parent, null, null, StringUtils.EMPTY_STRING_ARRAY, specificString);
  }

  private void addChild(GrammaticalRelation child) {
    children.add(child);
  }

  /** Given a {@code Tree} node {@code t}, attempts to
   *  return a list of nodes to which node {@code t} has this
   *  grammatical relation, with {@code t} as the governor.
   *
   *  @param t Target for finding dependents of t related by this GR
   *  @param root The root of the Tree
   *  @return A Collection of dependent nodes to which t bears this GR
   */
  public Collection getRelatedNodes(TreeGraphNode t, TreeGraphNode root, HeadFinder headFinder) {
    Set nodeList = new ArraySet();
    for (TregexPattern p : targetPatterns) {    // cdm: I deleted: && nodeList.isEmpty()
      // Initialize the TregexMatcher with the HeadFinder so that we
      // can use the same HeadFinder through the entire process of
      // building the dependencies
      TregexMatcher m = p.matcher(root, headFinder);
      while (m.findAt(t)) {
        TreeGraphNode target = (TreeGraphNode) m.getNode("target");
        if (target == null) {
          throw new AssertionError("Expression has no target: " + p);
        }
        nodeList.add(target);
        if (DEBUG) {
          System.err.println("found " + this + "(" + t + "-" + t.headWordNode() + ", " + m.getNode("target") + "-" + ((TreeGraphNode) m.getNode("target")).headWordNode() + ") using pattern " + p);
          for (String nodeName : m.getNodeNames()) {
            if (nodeName.equals("target"))
              continue;
            System.err.println("  node " + nodeName + ": " + m.getNode(nodeName));
          }
        }
      }
    }
    return nodeList;
  }

  /** Returns true iff the value of Tree
   *  node t matches the sourcePattern for
   *  this GrammaticalRelation, indicating that this
   *  GrammaticalRelation is one that could hold between
   *  Tree node t and some other node.
   */
  public boolean isApplicable(Tree t) {
    // System.err.println("Testing whether " + sourcePattern + " matches " + ((TreeGraphNode) t).toOneLineString());
    return (sourcePattern != null) && (t.value() != null) &&
             sourcePattern.matcher(t.value()).matches();
  }

  /** Returns whether this is equal to or an ancestor of gr in the grammatical relations hierarchy. */
  public boolean isAncestor(GrammaticalRelation gr) {
    while (gr != null) {
      // Changed this test from this == gr (mrsmith)
      if (this.equals(gr)) { return true; }
      gr = gr.parent;
    }
    return false;
  }

  /**
   * Returns short name (abbreviation) for this
   * GrammaticalRelation.  toString() for collapsed
   * relations will include the word that was collapsed.
   * 

   * Implementation note: Note that this method must be synced with
   * the equals() and valueOf(String) methods
   */
  @Override
  public final String toString() {
    if (specific == null) {
      return shortName;
    } else {
      return shortName + '_' + specific;
    }
  }

  /**
   * Returns a String representation of this
   * GrammaticalRelation and the hierarchy below
   * it, with one node per line, indented according to level.
   *
   * @return String representation of this
   *         GrammaticalRelation
   */
  public String toPrettyString() {
    StringBuilder buf = new StringBuilder("\n");
    toPrettyString(0, buf);
    return buf.toString();
  }

  /**
   * Returns a String representation of this
   * GrammaticalRelation and the hierarchy below
   * it, with one node per line, indented according to
   * indentLevel.
   *
   * @param indentLevel how many levels to indent (0 for root node)
   */
  private void toPrettyString(int indentLevel, StringBuilder buf) {
    for (int i = 0; i < indentLevel; i++) {
      buf.append("  ");
    }
    buf.append(shortName).append(" (").append(longName).append("): ").append(targetPatterns);
    for (GrammaticalRelation child : children) {
      buf.append('\n');
      child.toPrettyString(indentLevel + 1, buf);
    }
  }

  /** Grammatical relations are equal with other grammatical relations if they
   *  have the same shortName and specific (if present).
   *  Implementation note: Note that this method must be synced with
   *  the toString() and valueOf(String) methods
   *
   *  @param o Object to be compared
   *  @return Whether equal
   */
  @SuppressWarnings({"StringEquality", "ThrowableInstanceNeverThrown"})
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o instanceof String) {
      // TODO: Remove this. It's broken but was meant to cover legacy code. It would be correct to just return false.
      new Throwable("Warning: comparing GrammaticalRelation to String").printStackTrace();
      return this.toString().equals(o);
    }
    if (!(o instanceof GrammaticalRelation)) return false;

    final GrammaticalRelation gr = (GrammaticalRelation) o;
    // == okay for language as enum!
    // TODO(gabor) perhaps Language.Any shouldn't be equal to any language? This is a bit of a hack around some dependencies caring about language and others not.
    return (this.language == Language.Any || gr.language == Language.Any || this.language == gr.language) &&
             this.shortName.equals(gr.shortName) &&
             (this.specific == gr.specific ||
              (this.specific != null && this.specific.equals(gr.specific)));
  }

  @Override
  public int hashCode() {
    int result = 17;
    result = 29 * result + (language != null ? language.toString().hashCode() : 0);
    result = 29 * result + (shortName != null ? shortName.hashCode() : 0);
    result = 29 * result + (specific != null ? specific.hashCode() : 0);
    return result;
  }

  @Override
  public int compareTo(GrammaticalRelation o) {
    String thisN = this.toString();
    String oN = o.toString();
    return thisN.compareTo(oN);
  }

  public String getLongName() {
    return longName;
  }

  public String getShortName() {
    return shortName;
  }

  // TODO(gabor) this is nontrivially brittle. I guess in the long term we should only have one "Language" enum?
  /**
   * Get the language of the grammatical relation.
   */
  public Languages.Language getLanguage() {
    switch (this.language) {
      case Any: return Languages.Language.Unknown;
      case English: return Languages.Language.English;
      case Chinese: return Languages.Language.Chinese;
      default:
        throw new IllegalStateException("Unknown language: " + this.language);
    }
  }

  public String getSpecific() {
    return specific;
  }

  /**
   * When deserializing a GrammaticalRelation, it needs to be matched
   * up with the existing singleton relation of the same type.
   *
   * TODO: there are a bunch of things wrong with this.  For one
   * thing, it's crazy slow, since it goes through all the existing
   * relations in an array.  For another, it would be cleaner to have
   * subclasses for the English and Chinese relations
   */
  protected Object readResolve() throws ObjectStreamException {
    switch (language) {
    case Any: {
      if (shortName.equals(GOVERNOR.shortName)) {
        return GOVERNOR;
      } else if (shortName.equals(DEPENDENT.shortName)) {
        return DEPENDENT;
      } else if (shortName.equals(ROOT.shortName)) {
        return ROOT;
      } else if (shortName.equals(KILL.shortName)) {
        return KILL;
      } else {
        throw new RuntimeException("Unknown general relation " + shortName);
      }
    }
    case English: {
      GrammaticalRelation rel = EnglishGrammaticalRelations.valueOf(toString());
      if (rel == null) {
        switch (shortName) {
          case "conj":
            return EnglishGrammaticalRelations.getConj(specific);
          case "prep":
            return EnglishGrammaticalRelations.getPrep(specific);
          case "prepc":
            return EnglishGrammaticalRelations.getPrepC(specific);
          default:
            // TODO: we need to figure out what to do with relations
            // which were serialized and then deprecated.  Perhaps there
            // is a good way to make them singletons
            return this;
          //throw new RuntimeException("Unknown English relation " + this);
        }
      } else {
        return rel;
      }
    }
    case Chinese: {
      GrammaticalRelation rel = ChineseGrammaticalRelations.valueOf(toString());
      if (rel == null) {
        // TODO: we need to figure out what to do with relations
        // which were serialized and then deprecated.  Perhaps there
        // is a good way to make them singletons
        return this;
        //throw new RuntimeException("Unknown Chinese relation " + this);
      }
      return rel;
    }
    default: {
      throw new RuntimeException("Unknown language " + language);
    }
    }
  }

  /**
   * Returns the parent of this GrammaticalRelation.
   */
  public GrammaticalRelation getParent() {
    return parent;
  }

  public static void main(String[] args) {
    final String[] names = {"dep", "pred", "prep_to","rcmod"};
    for (String name : names) {
      GrammaticalRelation reln = valueOf(Language.English, name);
      System.out.println("Data for GrammaticalRelation loaded as valueOf(\"" + name + "\"):");
      System.out.println("\tShort name:    " + reln.getShortName());
      System.out.println("\tLong name:     " + reln.getLongName());
      System.out.println("\tSpecific name: " + reln.getSpecific());
    }
  }

}