edu.stanford.nlp.trees.AbstractCollinsHeadFinder Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Map;

/**
 * A base class for a HeadFinder similar to the one described in
 * Michael Collins' 1999 thesis.  For a given constituent we perform operations
 * like (this is for "left" or "right"):
 * <pre>
 * for categoryList in categoryLists
 *   for index = 1 to n [or n to 1 if R-&gt;L]
 *     for category in categoryList
 *       if category equals daughter[index] choose it.
 * </pre>
 * <p>
 * with a final default that goes with the direction (L-&gt;R or R-&gt;L).
 * For most constituents, there will be only one category in the list,
 * the exception being, in Collins' original version, NP.
 * </p>
 * <p>
 * It is up to the overriding subclass to initialize the map
 * from constituent type to categoryLists, "nonTerminalInfo",
 * in its constructor.
 * Entries are presumed to be of type String[][].  Each String[] is a list of
 * categories, except for the first entry, which specifies direction of
 * traversal and must be one of the following:
 * </p>
 * <ul>
 * <li> "left" means search left-to-right by category and then by position
 * <li> "leftdis" means search left-to-right by position and then by category
 * <li> "right" means search right-to-left by category and then by position
 * <li> "rightdis" means search right-to-left by position and then by category
 * <li> "leftexcept" means to take the first thing from the left that isn't in the list
 * <li> "rightexcept" means to take the first thing from the right that isn't on the list
 * </ul>
 * <p>
 * Changes:
 * </p>
 * <ul>
 * <li> 2002/10/28 -- Category label identity checking now uses the
 * equals() method instead of ==, so not interning category labels
 * shouldn't break things anymore.  (Roger Levy)
 * <li> 2003/02/10 -- Changed to use TreebankLanguagePack and to cut on
 * characters that set off annotations, so this should work even if
 * functional tags are still on nodes.
 * <li> 2004/03/30 -- Made abstract base class and subclasses for CollinsHeadFinder,
 * ModCollinsHeadFinder, SemanticHeadFinder, ChineseHeadFinder
 * (and trees.icegb.ICEGBHeadFinder, trees.international.negra.NegraHeadFinder,
 * and movetrees.EnglishPennMaxProjectionHeadFinder)
 * <li> 2011/01/13 -- Add support for categoriesToAvoid (which can be set to ensure that
 * punctuation is not the head if there are other options)
 * </ul>
 *
 * @author Christopher Manning
 * @author Galen Andrew
 */
public abstract class AbstractCollinsHeadFinder implements HeadFinder /* Serializable */, CopulaHeadFinder  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AbstractCollinsHeadFinder.class);

  /** Trace logging is on iff the system property "HeadFinder" is set (to any value). */
  private static final boolean DEBUG = System.getProperty("HeadFinder", null) != null;

  /** Used to reduce node labels to their basic category before rule lookup and matching. */
  protected final TreebankLanguagePack tlp;

  /**
   * Head rules, keyed by the basic category of the mother node.
   * Each value is a list of rules that are tried in order; in every rule,
   * element 0 is the direction keyword ("left", "leftdis", "leftexcept",
   * "right", "rightdis" or "rightexcept") and the remaining elements are
   * the categories the rule refers to.
   * Must be created and filled by the subclass; {@link #determineHead(Tree, Tree)}
   * throws an IllegalStateException while it is still null.
   */
  protected Map<String, String[][]> nonTerminalInfo;

  /** Default direction if no rule is found for category (the head/parent).
   *  Subclasses can turn it on if they like.
   *  If they don't it is an error if no rule is defined for a category
   *  (an IllegalArgumentException is thrown by
   *  {@link #determineNonTrivialHead(Tree, Tree)}).
   */
  protected String[] defaultRule; // = null;

  /** These are built automatically from categoriesToAvoid and used in a fairly
   *  different fashion from defaultRule (above).  These are used for categories
   *  that do have defined rules but where none of them have matched.  Rather
   *  than picking the rightmost or leftmost child, we will use these to pick
   *  the rightmost or leftmost child which isn't in categoriesToAvoid.
   */
  protected String[] defaultLeftRule;
  protected String[] defaultRightRule;

  /**
   * Construct a HeadFinder.
   * The TreebankLanguagePack is used to get basic categories. The remaining arguments
   * set categories which, if it comes to last resort processing (i.e., none of
   * the rules matched), will be avoided as heads. In last resort processing,
   * it will attempt to match the leftmost or rightmost constituent not in this
   * set but will fall back to the left or rightmost constituent if necessary.
   *
   * @param tlp TreebankLanguagePack used to determine basic category
   * @param categoriesToAvoid Constituent types to avoid as head
   */
  protected AbstractCollinsHeadFinder(TreebankLanguagePack tlp, String... categoriesToAvoid) {
    this.tlp = tlp;
    // automatically build defaultLeftRule, defaultRightRule:
    // slot 0 holds the direction keyword, the rest the categories to avoid
    defaultLeftRule = new String[categoriesToAvoid.length + 1];
    defaultRightRule = new String[categoriesToAvoid.length + 1];
    if (categoriesToAvoid.length > 0) {
      defaultLeftRule[0] = "leftexcept";
      defaultRightRule[0] = "rightexcept";
      System.arraycopy(categoriesToAvoid, 0, defaultLeftRule, 1, categoriesToAvoid.length);
      System.arraycopy(categoriesToAvoid, 0, defaultRightRule, 1, categoriesToAvoid.length);
    } else {
      // nothing to avoid: a plain directional rule with an empty category list,
      // which never matches and so leaves the choice to the caller's fallback
      defaultLeftRule[0] = "left";
      defaultRightRule[0] = "right";
    }
  }

  /**
   * Generally will be false, except for SemanticHeadFinder
   */
  @Override
  public boolean makesCopulaHead() {
    return false;
  }

  /**
   * A way for subclasses for corpora with explicit head markings
   * to return the explicitly marked head.
   * This default implementation always returns null.
   *
   * @param t a tree to find the head of
   * @return the marked head-- null if no marked head
   */
  protected Tree findMarkedHead(Tree t) {
    return null;
  }

  /**
   * Determine which daughter of the current parse tree is the head.
   * Equivalent to {@code determineHead(t, null)}.
   *
   * @param t The parse tree to examine the daughters of.
   * @return The daughter parse tree that is the head of t
   * @throws IllegalArgumentException if t is null or a leaf
   * @throws IllegalStateException if the subclass has not set nonTerminalInfo
   * @see Tree#percolateHeads(HeadFinder)
   *      for a routine to call this and spread heads throughout a tree
   */
  @Override
  public Tree determineHead(Tree t) {
    return determineHead(t, null);
  }

  /**
   * Determine which daughter of the current parse tree is the head.
   * An explicitly marked head (see {@link #findMarkedHead(Tree)}) wins;
   * otherwise the only child of a unary node is its head; otherwise the
   * decision is delegated to {@link #determineNonTrivialHead(Tree, Tree)}.
   *
   * @param t The parse tree to examine the daughters of.
   * @param parent The parent of t
   * @return The daughter parse tree that is the head of t.
   * @throws IllegalArgumentException if t is null or a leaf
   * @throws IllegalStateException if the subclass has not set nonTerminalInfo
   * @see Tree#percolateHeads(HeadFinder)
   *      for a routine to call this and spread heads throughout a tree
   */
  @Override
  public Tree determineHead(Tree t, Tree parent) {
    if (nonTerminalInfo == null) {
      throw new IllegalStateException("Classes derived from AbstractCollinsHeadFinder must create and fill HashMap nonTerminalInfo.");
    }
    if (t == null || t.isLeaf()) {
      throw new IllegalArgumentException("Can't return head of null or leaf Tree.");
    }
    if (DEBUG) {
      log.info("determineHead for " + t.value());
    }

    Tree[] kids = t.children();

    // first check if subclass found explicitly marked head
    Tree theHead = findMarkedHead(t);
    if (theHead != null) {
      if (DEBUG) {
        log.info("Find marked head method returned " +
                           theHead.label() + " as head of " + t.label());
      }
      return theHead;
    }

    // if the node is a unary, then that kid must be the head
    // it used to special case preterminal and ROOT/TOP case
    // but that seemed bad (especially hardcoding string "ROOT")
    if (kids.length == 1) {
      if (DEBUG) {
        log.info("Only one child determines " +
                           kids[0].label() + " as head of " + t.label());
      }
      return kids[0];
    }

    return determineNonTrivialHead(t, parent);
  }

  /** Called by determineHead and may be overridden in subclasses
   *  if special treatment is necessary for particular categories.
   *  Looks up the rules for the basic category of t (with a leading '@'
   *  stripped) and tries them in order; only the last rule is applied in
   *  last-resort mode.  If the category has no rules, defaultRule is used
   *  when it is set.
   *
   *  @param t The tree to determine the head daughter of
   *  @param parent The parent of t (or may be null); not used by this implementation
   *  @return The head daughter of t (null only if the rule list registered
   *          for the category is empty)
   *  @throws IllegalArgumentException if no rule is defined for the category
   *          of t and defaultRule is null
   */
  protected Tree determineNonTrivialHead(Tree t, Tree parent) {
    Tree theHead = null;
    String motherCat = tlp.basicCategory(t.label().value());
    if (motherCat.startsWith("@")) {
      motherCat = motherCat.substring(1);
    }
    if (DEBUG) {
      log.info("Looking for head of " + t.label() +
                         "; value is |" + t.label().value() + "|, " +
                         " baseCat is |" + motherCat + '|');
    }
    // We know we have nonterminals underneath
    // (a bit of a Penn Treebank assumption, but).

    // A special case that picked a trailing POS daughter used to be considered
    // here; it appears to be redundant in the Collins case since the rule
    // already would do that.
    String[][] how = nonTerminalInfo.get(motherCat);
    Tree[] kids = t.children();
    if (how == null) {
      if (DEBUG) {
        // guard against an empty category so that tracing itself cannot throw
        String firstChar = motherCat.isEmpty() ? "" : String.valueOf(motherCat.charAt(0));
        log.info("Warning: No rule found for " + motherCat +
                           " (first char: " + firstChar + ')');
        log.info("Known nonterms are: " + nonTerminalInfo.keySet());
      }
      if (defaultRule != null) {
        if (DEBUG) {
          log.info("  Using defaultRule");
        }
        return traverseLocate(kids, defaultRule, true);
      } else {
        // TreePrint because TreeGraphNode only prints the node number,
        // doesn't print the tree structure
        TreePrint printer = new TreePrint("penn");
        StringWriter buffer = new StringWriter();
        printer.printTree(t, new PrintWriter(buffer));
        // TODO: we could get really fancy and define our own
        // exception class to represent this
        throw new IllegalArgumentException("No head rule defined for " + motherCat + " using " + this.getClass() + " in " + buffer.toString());
      }
    }
    for (int i = 0; i < how.length; i++) {
      // only the final rule is allowed to fall back to a default choice
      boolean lastResort = (i == how.length - 1);
      theHead = traverseLocate(kids, how[i], lastResort);
      if (theHead != null) {
        break;
      }
    }
    if (DEBUG) {
      // theHead is still null when the rule list was empty
      log.info("  Chose " + (theHead == null ? null : theHead.label()));
    }
    return theHead;
  }

  /**
   * Attempt to locate head daughter tree from among daughters.
   * Go through daughterTrees looking for things from or not in a set given by
   * the contents of the array how, and if
   * you do not find one, take leftmost or rightmost perhaps matching thing iff
   * lastResort is true, otherwise return null.
   *
   * @param daughterTrees The daughters to choose among; must be non-empty
   *                      when lastResort is true
   * @param how The rule: direction keyword at index 0, categories after it
   * @param lastResort Whether to fall back to a default choice when the
   *                   rule matches nothing
   * @return The chosen daughter, or null if nothing matched and lastResort is false
   * @throws IllegalStateException if how[0] is not a known direction keyword
   */
  protected Tree traverseLocate(Tree[] daughterTrees, String[] how, boolean lastResort) {
    int headIdx;
    switch (how[0]) {
      case "left":
        headIdx = findLeftHead(daughterTrees, how);
        break;
      case "leftdis":
        headIdx = findLeftDisHead(daughterTrees, how);
        break;
      case "leftexcept":
        headIdx = findLeftExceptHead(daughterTrees, how);
        break;
      case "right":
        headIdx = findRightHead(daughterTrees, how);
        break;
      case "rightdis":
        headIdx = findRightDisHead(daughterTrees, how);
        break;
      case "rightexcept":
        headIdx = findRightExceptHead(daughterTrees, how);
        break;
      default:
        throw new IllegalStateException("ERROR: invalid direction type " + how[0] + " to nonTerminalInfo map in AbstractCollinsHeadFinder.");
    }

    // what happens if our rule didn't match anything
    if (headIdx < 0) {
      if (!lastResort) {
        // if we're not the last resort, we can return null to let the next rule try to match
        return null;
      }
      // use the default rule to try to match anything except categoriesToAvoid
      // if that doesn't match, we'll return the left or rightmost child.
      // We want to be careful to ensure that postOperationFix runs at most once:
      // it runs inside the recursive call when the default rule matches, and
      // is not applied to the plain leftmost/rightmost fallback.
      final String[] rule;
      final int fallbackIdx;
      if (how[0].startsWith("left")) {
        fallbackIdx = 0;
        rule = defaultLeftRule;
      } else {
        fallbackIdx = daughterTrees.length - 1;
        rule = defaultRightRule;
      }
      Tree child = traverseLocate(daughterTrees, rule, false);
      return (child != null) ? child : daughterTrees[fallbackIdx];
    }

    headIdx = postOperationFix(headIdx, daughterTrees);

    return daughterTrees[headIdx];
  }

  /** Returns the basic category (per tlp) of the label of the given daughter. */
  private String basicCategoryOf(Tree daughter) {
    return tlp.basicCategory(daughter.label().value());
  }

  /**
   * Returns true iff childCat equals one of the categories of the rule,
   * i.e. one of how[1..]; how[0] is the direction keyword and is skipped.
   */
  private static boolean isListed(String[] how, String childCat) {
    for (int i = 1; i < how.length; i++) {
      if (how[i].equals(childCat)) {
        return true;
      }
    }
    return false;
  }

  /** "left": by category first, then left-to-right by position; -1 if no match. */
  private int findLeftHead(Tree[] daughterTrees, String[] how) {
    for (int i = 1; i < how.length; i++) {
      for (int headIdx = 0; headIdx < daughterTrees.length; headIdx++) {
        if (how[i].equals(basicCategoryOf(daughterTrees[headIdx]))) {
          return headIdx;
        }
      }
    }
    return -1;
  }

  /** "leftdis": leftmost daughter whose category is any of those listed; -1 if none. */
  private int findLeftDisHead(Tree[] daughterTrees, String[] how) {
    for (int headIdx = 0; headIdx < daughterTrees.length; headIdx++) {
      if (isListed(how, basicCategoryOf(daughterTrees[headIdx]))) {
        return headIdx;
      }
    }
    return -1;
  }

  /** "leftexcept": leftmost daughter whose category is NOT listed; -1 if all are listed. */
  private int findLeftExceptHead(Tree[] daughterTrees, String[] how) {
    for (int headIdx = 0; headIdx < daughterTrees.length; headIdx++) {
      if (!isListed(how, basicCategoryOf(daughterTrees[headIdx]))) {
        return headIdx;
      }
    }
    return -1;
  }

  /** "right": by category first, then right-to-left by position; -1 if no match. */
  private int findRightHead(Tree[] daughterTrees, String[] how) {
    for (int i = 1; i < how.length; i++) {
      for (int headIdx = daughterTrees.length - 1; headIdx >= 0; headIdx--) {
        if (how[i].equals(basicCategoryOf(daughterTrees[headIdx]))) {
          return headIdx;
        }
      }
    }
    return -1;
  }

  /** "rightdis": from right, but search for any of the categories, not by category in turn. */
  private int findRightDisHead(Tree[] daughterTrees, String[] how) {
    for (int headIdx = daughterTrees.length - 1; headIdx >= 0; headIdx--) {
      if (isListed(how, basicCategoryOf(daughterTrees[headIdx]))) {
        return headIdx;
      }
    }
    return -1;
  }

  /** "rightexcept": rightmost daughter whose category is NOT listed; -1 if all are listed. */
  private int findRightExceptHead(Tree[] daughterTrees, String[] how) {
    for (int headIdx = daughterTrees.length - 1; headIdx >= 0; headIdx--) {
      if (!isListed(how, basicCategoryOf(daughterTrees[headIdx]))) {
        return headIdx;
      }
    }
    return -1;
  }

  /**
   * A way for subclasses to fix any heads under special conditions.
   * The default does nothing.
   *
   * @param headIdx       The index of the proposed head
   * @param daughterTrees The array of daughter trees
   * @return The new headIndex
   */
  protected int postOperationFix(int headIdx, Tree[] daughterTrees) {
    return headIdx;
  }

  private static final long serialVersionUID = -6540278059442931087L;

}