All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.trees.AbstractCollinsHeadFinder Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.trees;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Map;

/**
 * A base class for a HeadFinder similar to the one described in
 * Michael Collins' 1999 thesis.  For a given constituent we perform operations
 * like (this is for "left" or "right":
 * 
 * for categoryList in categoryLists
 *   for index = 1 to n [or n to 1 if R->L]
 *     for category in categoryList
 *       if category equals daughter[index] choose it.
 * 
*

* with a final default that goes with the direction (L->R or R->L) * For most constituents, there will be only one category in the list, * the exception being, in Collins' original version, NP. *

*

* It is up to the overriding base class to initialize the map * from constituent type to categoryLists, "nonTerminalInfo", * in its constructor. * Entries are presumed to be of type String[][]. Each String[] is a list of * categories, except for the first entry, which specifies direction of * traversal and must be one of the following: *

*
    *
  • "left" means search left-to-right by category and then by position *
  • "leftdis" means search left-to-right by position and then by category *
  • "right" means search right-to-left by category and then by position *
  • "rightdis" means search right-to-left by position and then by category *
  • "leftexcept" means to take the first thing from the left that isn't in the list *
  • "rightexcept" means to take the first thing from the right that isn't on the list *
*

* Changes: *

*
    *
  • 2002/10/28 -- Category label identity checking now uses the * equals() method instead of ==, so not interning category labels * shouldn't break things anymore. (Roger Levy)
    *
  • 2003/02/10 -- Changed to use TreebankLanguagePack and to cut on * characters that set off annotations, so this should work even if * functional tags are still on nodes.
    *
  • 2004/03/30 -- Made abstract base class and subclasses for CollinsHeadFinder, * ModCollinsHeadFinder, SemanticHeadFinder, ChineseHeadFinder * (and trees.icegb.ICEGBHeadFinder, trees.international.negra.NegraHeadFinder, * and movetrees.EnglishPennMaxProjectionHeadFinder) *
  • 2011/01/13 -- Add support for categoriesToAvoid (which can be set to ensure that * punctuation is not the head if there are other options) *
* * @author Christopher Manning * @author Galen Andrew */ public abstract class AbstractCollinsHeadFinder implements HeadFinder /* Serializable */, CopulaHeadFinder { private static final boolean DEBUG = System.getProperty("HeadFinder", null) != null; protected final TreebankLanguagePack tlp; protected Map nonTerminalInfo; /** Default direction if no rule is found for category (the head/parent). * Subclasses can turn it on if they like. * If they don't it is an error if no rule is defined for a category * (null is returned). */ protected String[] defaultRule; // = null; /** These are built automatically from categoriesToAvoid and used in a fairly * different fashion from defaultRule (above). These are used for categories * that do have defined rules but where none of them have matched. Rather * than picking the rightmost or leftmost child, we will use these to pick * the the rightmost or leftmost child which isn't in categoriesToAvoid. */ protected String[] defaultLeftRule; protected String[] defaultRightRule; /** * Construct a HeadFinder. * The TreebankLanguagePack is used to get basic categories. The remaining arguments * set categories which, if it comes to last resort processing (i.e., none of * the rules matched), will be avoided as heads. In last resort processing, * it will attempt to match the leftmost or rightmost constituent not in this * set but will fall back to the left or rightmost constituent if necessary. * * @param tlp TreebankLanguagePack used to determine basic category * @param categoriesToAvoid Constituent types to avoid as head */ protected AbstractCollinsHeadFinder(TreebankLanguagePack tlp, String... 
categoriesToAvoid) { this.tlp = tlp; // automatically build defaultLeftRule, defaultRightRule defaultLeftRule = new String[categoriesToAvoid.length + 1]; defaultRightRule = new String[categoriesToAvoid.length + 1]; if (categoriesToAvoid.length > 0) { defaultLeftRule[0] = "leftexcept"; defaultRightRule[0] = "rightexcept"; System.arraycopy(categoriesToAvoid, 0, defaultLeftRule, 1, categoriesToAvoid.length); System.arraycopy(categoriesToAvoid, 0, defaultRightRule, 1, categoriesToAvoid.length); } else { defaultLeftRule[0] = "left"; defaultRightRule[0] = "right"; } } /** * Generally will be false, except for SemanticHeadFinder */ @Override public boolean makesCopulaHead() { return false; } /** * A way for subclasses for corpora with explicit head markings * to return the explicitly marked head * * @param t a tree to find the head of * @return the marked head-- null if no marked head */ // to be overridden in subclasses for corpora // protected Tree findMarkedHead(Tree t) { return null; } /** * Determine which daughter of the current parse tree is the head. * * @param t The parse tree to examine the daughters of. * If this is a leaf, null is returned * @return The daughter parse tree that is the head of t * @see Tree#percolateHeads(HeadFinder) * for a routine to call this and spread heads throughout a tree */ @Override public Tree determineHead(Tree t) { return determineHead(t, null); } /** * Determine which daughter of the current parse tree is the head. * * @param t The parse tree to examine the daughters of. * If this is a leaf, null is returned * @param parent The parent of t * @return The daughter parse tree that is the head of t. * Returns null for leaf nodes. 
* @see Tree#percolateHeads(HeadFinder) * for a routine to call this and spread heads throughout a tree */ @Override public Tree determineHead(Tree t, Tree parent) { if (nonTerminalInfo == null) { throw new IllegalStateException("Classes derived from AbstractCollinsHeadFinder must create and fill HashMap nonTerminalInfo."); } if (t == null || t.isLeaf()) { throw new IllegalArgumentException("Can't return head of null or leaf Tree."); } if (DEBUG) { System.err.println("determineHead for " + t.value()); } Tree[] kids = t.children(); Tree theHead; // first check if subclass found explicitly marked head if ((theHead = findMarkedHead(t)) != null) { if (DEBUG) { System.err.println("Find marked head method returned " + theHead.label() + " as head of " + t.label()); } return theHead; } // if the node is a unary, then that kid must be the head // it used to special case preterminal and ROOT/TOP case // but that seemed bad (especially hardcoding string "ROOT") if (kids.length == 1) { if (DEBUG) { System.err.println("Only one child determines " + kids[0].label() + " as head of " + t.label()); } return kids[0]; } return determineNonTrivialHead(t, parent); } /** Called by determineHead and may be overridden in subclasses * if special treatment is necessary for particular categories. * * @param t The tre to determine the head daughter of * @param parent The parent of t (or may be null) * @return The head daughter of t */ protected Tree determineNonTrivialHead(Tree t, Tree parent) { Tree theHead = null; String motherCat = tlp.basicCategory(t.label().value()); if (motherCat.startsWith("@")) { motherCat = motherCat.substring(1); } if (DEBUG) { System.err.println("Looking for head of " + t.label() + "; value is |" + t.label().value() + "|, " + " baseCat is |" + motherCat + '|'); } // We know we have nonterminals underneath // (a bit of a Penn Treebank assumption, but). // Look at label. // a total special case.... 
// first look for POS tag at end // this appears to be redundant in the Collins case since the rule already would do that // Tree lastDtr = t.lastChild(); // if (tlp.basicCategory(lastDtr.label().value()).equals("POS")) { // theHead = lastDtr; // } else { String[][] how = nonTerminalInfo.get(motherCat); Tree[] kids = t.children(); if (how == null) { if (DEBUG) { System.err.println("Warning: No rule found for " + motherCat + " (first char: " + motherCat.charAt(0) + ')'); System.err.println("Known nonterms are: " + nonTerminalInfo.keySet()); } if (defaultRule != null) { if (DEBUG) { System.err.println(" Using defaultRule"); } return traverseLocate(kids, defaultRule, true); } else { // TreePrint because TreeGraphNode only prints the node number, // doesn't print the tree structure TreePrint printer = new TreePrint("penn"); StringWriter buffer = new StringWriter(); printer.printTree(t, new PrintWriter(buffer)); // TODO: we could get really fancy and define our own // exception class to represent this throw new IllegalArgumentException("No head rule defined for " + motherCat + " using " + this.getClass() + " in " + buffer.toString()); } } for (int i = 0; i < how.length; i++) { boolean lastResort = (i == how.length - 1); theHead = traverseLocate(kids, how[i], lastResort); if (theHead != null) { break; } } if (DEBUG) { System.err.println(" Chose " + theHead.label()); } return theHead; } /** * Attempt to locate head daughter tree from among daughters. * Go through daughterTrees looking for things from or not in a set given by * the contents of the array how, and if * you do not find one, take leftmost or rightmost perhaps matching thing iff * lastResort is true, otherwise return null. 
*/ protected Tree traverseLocate(Tree[] daughterTrees, String[] how, boolean lastResort) { int headIdx; switch (how[0]) { case "left": headIdx = findLeftHead(daughterTrees, how); break; case "leftdis": headIdx = findLeftDisHead(daughterTrees, how); break; case "leftexcept": headIdx = findLeftExceptHead(daughterTrees, how); break; case "right": headIdx = findRightHead(daughterTrees, how); break; case "rightdis": headIdx = findRightDisHead(daughterTrees, how); break; case "rightexcept": headIdx = findRightExceptHead(daughterTrees, how); break; default: throw new IllegalStateException("ERROR: invalid direction type " + how[0] + " to nonTerminalInfo map in AbstractCollinsHeadFinder."); } // what happens if our rule didn't match anything if (headIdx < 0) { if (lastResort) { // use the default rule to try to match anything except categoriesToAvoid // if that doesn't match, we'll return the left or rightmost child (by // setting headIdx). We want to be careful to ensure that postOperationFix // runs exactly once. 
String[] rule; if (how[0].startsWith("left")) { headIdx = 0; rule = defaultLeftRule; } else { headIdx = daughterTrees.length - 1; rule = defaultRightRule; } Tree child = traverseLocate(daughterTrees, rule, false); if (child != null) { return child; } else { return daughterTrees[headIdx]; } } else { // if we're not the last resort, we can return null to let the next rule try to match return null; } } headIdx = postOperationFix(headIdx, daughterTrees); return daughterTrees[headIdx]; } private int findLeftHead(Tree[] daughterTrees, String[] how) { for (int i = 1; i < how.length; i++) { for (int headIdx = 0; headIdx < daughterTrees.length; headIdx++) { String childCat = tlp.basicCategory(daughterTrees[headIdx].label().value()); if (how[i].equals(childCat)) { return headIdx; } } } return -1; } private int findLeftDisHead(Tree[] daughterTrees, String[] how) { for (int headIdx = 0; headIdx < daughterTrees.length; headIdx++) { String childCat = tlp.basicCategory(daughterTrees[headIdx].label().value()); for (int i = 1; i < how.length; i++) { if (how[i].equals(childCat)) { return headIdx; } } } return -1; } private int findLeftExceptHead(Tree[] daughterTrees, String[] how) { for (int headIdx = 0; headIdx < daughterTrees.length; headIdx++) { String childCat = tlp.basicCategory(daughterTrees[headIdx].label().value()); boolean found = true; for (int i = 1; i < how.length; i++) { if (how[i].equals(childCat)) { found = false; } } if (found) { return headIdx; } } return -1; } private int findRightHead(Tree[] daughterTrees, String[] how) { for (int i = 1; i < how.length; i++) { for (int headIdx = daughterTrees.length - 1; headIdx >= 0; headIdx--) { String childCat = tlp.basicCategory(daughterTrees[headIdx].label().value()); if (how[i].equals(childCat)) { return headIdx; } } } return -1; } // from right, but search for any of the categories, not by category in turn private int findRightDisHead(Tree[] daughterTrees, String[] how) { for (int headIdx = daughterTrees.length - 1; headIdx 
>= 0; headIdx--) { String childCat = tlp.basicCategory(daughterTrees[headIdx].label().value()); for (int i = 1; i < how.length; i++) { if (how[i].equals(childCat)) { return headIdx; } } } return -1; } private int findRightExceptHead(Tree[] daughterTrees, String[] how) { for (int headIdx = daughterTrees.length - 1; headIdx >= 0; headIdx--) { String childCat = tlp.basicCategory(daughterTrees[headIdx].label().value()); boolean found = true; for (int i = 1; i < how.length; i++) { if (how[i].equals(childCat)) { found = false; } } if (found) { return headIdx; } } return -1; } /** * A way for subclasses to fix any heads under special conditions. * The default does nothing. * * @param headIdx The index of the proposed head * @param daughterTrees The array of daughter trees * @return The new headIndex */ protected int postOperationFix(int headIdx, Tree[] daughterTrees) { return headIdx; } private static final long serialVersionUID = -6540278059442931087L; }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy