/*
 * edu.stanford.nlp.trees.AbstractCollinsHeadFinder (from the stanford-parser artifact; Maven / Gradle / Ivy).
 *
 * Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
 * and extracts constituency parse trees.
 */
package edu.stanford.nlp.trees;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Map;
/**
* A base class for a HeadFinder similar to the one described in
* Michael Collins' 1999 thesis. For a given constituent we perform operations
* like the following (this is for "left" or "right"):
*
* for categoryList in categoryLists
* for index = 1 to n [or n to 1 if R->L]
* for category in categoryList
* if category equals daughter[index] choose it.
*
*
* with a final default that goes with the direction (L->R or R->L)
* For most constituents, there will be only one category in the list,
* the exception being, in Collins' original version, NP.
*
*
* It is up to the subclass to initialize the map
* from constituent type to categoryLists, "nonTerminalInfo",
* in its constructor.
* Entries are presumed to be of type String[][]. Each String[] is a list of
* categories, except for the first entry, which specifies direction of
* traversal and must be one of the following:
*
*
* - "left" means search left-to-right by category and then by position
* - "leftdis" means search left-to-right by position and then by category
* - "right" means search right-to-left by category and then by position
* - "rightdis" means search right-to-left by position and then by category
* - "leftexcept" means to take the first thing from the left that isn't in the list
* - "rightexcept" means to take the first thing from the right that isn't in the list
*
*
* Changes:
*
*
* - 2002/10/28 -- Category label identity checking now uses the
* equals() method instead of ==, so not interning category labels
* shouldn't break things anymore. (Roger Levy)
* - 2003/02/10 -- Changed to use TreebankLanguagePack and to cut on
* characters that set off annotations, so this should work even if
* functional tags are still on nodes.
* - 2004/03/30 -- Made abstract base class and subclasses for CollinsHeadFinder,
* ModCollinsHeadFinder, SemanticHeadFinder, ChineseHeadFinder
* (and trees.icegb.ICEGBHeadFinder, trees.international.negra.NegraHeadFinder,
* and movetrees.EnglishPennMaxProjectionHeadFinder)
*
* - 2011/01/13 -- Added support for categoriesToAvoid (which can be set to ensure that
* punctuation is not the head if there are other options)
*
*
* @author Christopher Manning
* @author Galen Andrew
*/
public abstract class AbstractCollinsHeadFinder implements HeadFinder /* Serializable */, CopulaHeadFinder {
/** Debug tracing to stderr is enabled when the system property "HeadFinder" is set (to any value). */
private static final boolean DEBUG = System.getProperty("HeadFinder", null) != null;

/** Language pack used to reduce node labels to their basic categories. */
protected final TreebankLanguagePack tlp;

/**
 * Head rules, keyed by the basic category of the mother node.
 * Each value is a list of rules; each rule is a String[] whose first element
 * is the direction keyword ("left", "leftdis", "leftexcept", "right",
 * "rightdis" or "rightexcept") and whose remaining elements are categories.
 * Subclasses must create and fill this map: determineHead throws an
 * IllegalStateException while it is still null.
 */
protected Map<String, String[][]> nonTerminalInfo;

/** Default rule to use if no rule is found for a category (the head/parent).
 *  Subclasses can turn it on if they like.
 *  If they don't, it is an error if no rule is defined for a category:
 *  determineNonTrivialHead then throws an IllegalArgumentException.
 */
protected String[] defaultRule; // = null;

/** These are built automatically from categoriesToAvoid and used in a fairly
 *  different fashion from defaultRule (above). These are used for categories
 *  that do have defined rules but where none of them have matched. Rather
 *  than picking the rightmost or leftmost child, we will use these to pick
 *  the rightmost or leftmost child which isn't in categoriesToAvoid.
 */
protected String[] defaultLeftRule;
protected String[] defaultRightRule;
/**
* Construct a HeadFinder.
* The TreebankLanguagePack is used to get basic categories. The remaining arguments
* set categories which, if it comes to last resort processing (i.e., none of
* the rules matched), will be avoided as heads. In last resort processing,
* it will attempt to match the leftmost or rightmost constituent not in this
* set but will fall back to the left or rightmost constituent if necessary.
*
* @param tlp TreebankLanguagePack used to determine basic category
* @param categoriesToAvoid Constituent types to avoid as head
*/
protected AbstractCollinsHeadFinder(TreebankLanguagePack tlp, String... categoriesToAvoid) {
this.tlp = tlp;
// automatically build defaultLeftRule, defaultRightRule
defaultLeftRule = new String[categoriesToAvoid.length + 1];
defaultRightRule = new String[categoriesToAvoid.length + 1];
if (categoriesToAvoid.length > 0) {
defaultLeftRule[0] = "leftexcept";
defaultRightRule[0] = "rightexcept";
System.arraycopy(categoriesToAvoid, 0, defaultLeftRule, 1, categoriesToAvoid.length);
System.arraycopy(categoriesToAvoid, 0, defaultRightRule, 1, categoriesToAvoid.length);
} else {
defaultLeftRule[0] = "left";
defaultRightRule[0] = "right";
}
}
/**
 * Always returns false in this base class.
 * Per the original note, the answer will generally be false, except for
 * SemanticHeadFinder (which presumably overrides this method -- confirm).
 *
 * @return false
 */
@Override
public boolean makesCopulaHead() {
return false;
}
/**
 * A hook for subclasses handling corpora with explicit head markings
 * to return the explicitly marked head.
 * determineHead consults this method before applying any rule and returns
 * a non-null result directly. This default implementation finds no marked
 * head; it is meant to be overridden in subclasses for such corpora.
 *
 * @param t a tree to find the head of
 * @return the marked head, or null if there is no marked head
 *         (always null in this base class)
 */
protected Tree findMarkedHead(Tree t) {
return null;
}
/**
 * Determine which daughter of the current parse tree is the head.
 * Delegates to the two-argument determineHead with a null parent.
 *
 * @param t The parse tree to examine the daughters of.
 *          Must not be null or a leaf: the two-argument overload rejects
 *          both with an IllegalArgumentException rather than returning null.
 * @return The daughter parse tree that is the head of t
 * @see Tree#percolateHeads(HeadFinder)
 *      for a routine to call this and spread heads throughout a tree
 */
@Override
public Tree determineHead(Tree t) {
return determineHead(t, null);
}
/**
 * Determine which daughter of the current parse tree is the head.
 * The decision is made in this order:
 * an explicitly marked head reported by findMarkedHead wins;
 * otherwise the only child of a unary node is the head;
 * otherwise the decision is left to determineNonTrivialHead.
 *
 * @param t The parse tree to examine the daughters of.
 *          Must be neither null nor a leaf.
 * @param parent The parent of t; only passed on to determineNonTrivialHead
 * @return The daughter parse tree that is the head of t
 * @throws IllegalStateException if nonTerminalInfo has not been set up
 * @throws IllegalArgumentException if t is null or a leaf
 * @see Tree#percolateHeads(HeadFinder)
 *      for a routine to call this and spread heads throughout a tree
 */
@Override
public Tree determineHead(Tree t, Tree parent) {
  if (nonTerminalInfo == null) {
    throw new IllegalStateException("Classes derived from AbstractCollinsHeadFinder must create and fill HashMap nonTerminalInfo.");
  }
  if (t == null || t.isLeaf()) {
    throw new IllegalArgumentException("Can't return head of null or leaf Tree.");
  }
  if (DEBUG) {
    System.err.println("determineHead for " + t.value());
  }

  final Tree[] daughters = t.children();

  // An explicitly marked head (if the subclass finds one) takes precedence.
  final Tree markedHead = findMarkedHead(t);
  if (markedHead != null) {
    if (DEBUG) {
      System.err.println("Find marked head method returned " +
          markedHead.label() + " as head of " + t.label());
    }
    return markedHead;
  }

  // Anything that is not unary needs the rule-based search.
  if (daughters.length != 1) {
    return determineNonTrivialHead(t, parent);
  }

  // If the node is a unary, then that kid must be the head.
  // (This used to special-case preterminals and ROOT/TOP, but that seemed
  // bad, especially hardcoding the string "ROOT".)
  final Tree onlyChild = daughters[0];
  if (DEBUG) {
    System.err.println("Only one child determines " +
        onlyChild.label() + " as head of " + t.label());
  }
  return onlyChild;
}
/** Called by determineHead and may be overridden in subclasses
 *  if special treatment is necessary for particular categories.
 *  <p>
 *  The mother category is the basic category of t's label with one leading
 *  "@" removed. Its rules are looked up in nonTerminalInfo and tried in
 *  order; only the final rule is applied as a last resort. If the category
 *  has no rules, defaultRule is used when it is set.
 *
 *  @param t The tree to determine the head daughter of
 *  @param parent The parent of t (or may be null); not used by this implementation
 *  @return The head daughter of t; null if the category's rule list is empty
 *          or no rule produced a head
 *  @throws IllegalArgumentException if no rule is defined for the mother
 *          category and defaultRule is null
 */
protected Tree determineNonTrivialHead(Tree t, Tree parent) {
  Tree theHead = null;
  String motherCat = tlp.basicCategory(t.label().value());
  if (motherCat.startsWith("@")) {
    motherCat = motherCat.substring(1);
  }
  if (DEBUG) {
    System.err.println("Looking for head of " + t.label() +
        "; value is |" + t.label().value() + "|, " +
        " baseCat is |" + motherCat + '|');
  }
  // We know we have nonterminals underneath
  // (a bit of a Penn Treebank assumption, but).
  // Note: an earlier version special-cased a final POS daughter here; that
  // appears to be redundant in the Collins case since the rule already does it.

  String[][] how = nonTerminalInfo.get(motherCat);
  Tree[] kids = t.children();

  if (how == null) {
    if (DEBUG) {
      // Guard charAt(0): an empty category would otherwise crash the debug
      // trace before the real problem is reported below.
      String firstChar = motherCat.isEmpty() ? "none, category is empty" : String.valueOf(motherCat.charAt(0));
      System.err.println("Warning: No rule found for " + motherCat +
          " (first char: " + firstChar + ')');
      System.err.println("Known nonterms are: " + nonTerminalInfo.keySet());
    }
    if (defaultRule != null) {
      if (DEBUG) {
        System.err.println("  Using defaultRule");
      }
      return traverseLocate(kids, defaultRule, true);
    } else {
      // TreePrint because TreeGraphNode only prints the node number,
      // doesn't print the tree structure
      TreePrint printer = new TreePrint("penn");
      StringWriter buffer = new StringWriter();
      printer.printTree(t, new PrintWriter(buffer));
      // TODO: we could get really fancy and define our own
      // exception class to represent this
      throw new IllegalArgumentException("No head rule defined for " + motherCat + " using " + this.getClass() + " in " + buffer.toString());
    }
  }

  for (int i = 0; i < how.length; i++) {
    // Only the final rule may fall back to a positional default.
    boolean lastResort = (i == how.length - 1);
    theHead = traverseLocate(kids, how[i], lastResort);
    if (theHead != null) {
      break;
    }
  }
  if (DEBUG) {
    // theHead stays null when the rule list is empty; don't let the debug
    // trace throw a NullPointerException in that case.
    if (theHead == null) {
      System.err.println("  No head chosen for " + motherCat);
    } else {
      System.err.println("  Chose " + theHead.label());
    }
  }
  return theHead;
}
/**
 * Attempt to locate the head daughter tree from among the daughters.
 * The first element of how selects the search strategy ("left", "leftdis",
 * "leftexcept", "right", "rightdis" or "rightexcept"); the remaining
 * elements are the categories to look for (or, for the "except" strategies,
 * to skip).
 * <p>
 * If the rule matches a daughter, its index is passed through
 * postOperationFix and the daughter at the resulting index is returned.
 * If nothing matches and lastResort is false, null is returned so that the
 * next rule can be tried. If nothing matches and lastResort is true, the
 * default left or right rule (chosen by the direction of how) is tried, and
 * failing that the leftmost or rightmost daughter is returned as is.
 *
 * @param daughterTrees The daughters to choose among
 * @param how The rule: direction keyword followed by categories
 * @param lastResort Whether to fall back to a positional default when the rule matches nothing
 * @return The chosen daughter, or null if nothing matched and lastResort is false
 * @throws IllegalStateException if how[0] is not a known direction keyword
 */
protected Tree traverseLocate(Tree[] daughterTrees, String[] how, boolean lastResort) {
  final String direction = how[0];
  final int matchIdx;
  switch (direction) {
    case "left":
      matchIdx = findLeftHead(daughterTrees, how);
      break;
    case "leftdis":
      matchIdx = findLeftDisHead(daughterTrees, how);
      break;
    case "leftexcept":
      matchIdx = findLeftExceptHead(daughterTrees, how);
      break;
    case "right":
      matchIdx = findRightHead(daughterTrees, how);
      break;
    case "rightdis":
      matchIdx = findRightDisHead(daughterTrees, how);
      break;
    case "rightexcept":
      matchIdx = findRightExceptHead(daughterTrees, how);
      break;
    default:
      throw new IllegalStateException("ERROR: invalid direction type " + direction + " to nonTerminalInfo map in AbstractCollinsHeadFinder.");
  }

  // The rule matched: give subclasses their chance to adjust the index.
  if (matchIdx >= 0) {
    final int fixedIdx = postOperationFix(matchIdx, daughterTrees);
    return daughterTrees[fixedIdx];
  }

  // No match, and we're not the last resort: return null to let the next rule try.
  if (!lastResort) {
    return null;
  }

  // Last resort: use the default rule to try to match anything except
  // categoriesToAvoid; if that doesn't match either, take the leftmost or
  // rightmost child. postOperationFix is applied only by the invocation
  // that obtains a direct match, so it never runs twice for one result.
  final boolean fromLeft = direction.startsWith("left");
  final int edgeIdx = fromLeft ? 0 : daughterTrees.length - 1;
  final String[] fallbackRule = fromLeft ? defaultLeftRule : defaultRightRule;

  final Tree preferred = traverseLocate(daughterTrees, fallbackRule, false);
  return (preferred != null) ? preferred : daughterTrees[edgeIdx];
}
/**
 * "left" strategy: take the categories of the rule in order and, for each,
 * scan the daughters left-to-right for one whose basic category equals it.
 *
 * @param daughterTrees The daughters to search
 * @param how The rule; categories start at index 1
 * @return Index of the first match, or -1 if no category matches any daughter
 */
private int findLeftHead(Tree[] daughterTrees, String[] how) {
  for (int catIdx = 1; catIdx < how.length; catIdx++) {
    final String wanted = how[catIdx];
    for (int pos = 0; pos < daughterTrees.length; pos++) {
      final String daughterCat = tlp.basicCategory(daughterTrees[pos].label().value());
      if (wanted.equals(daughterCat)) {
        return pos;
      }
    }
  }
  return -1;
}
/**
 * "leftdis" strategy: scan the daughters left-to-right and return the first
 * one whose basic category equals any category of the rule.
 *
 * @param daughterTrees The daughters to search
 * @param how The rule; categories start at index 1
 * @return Index of the first matching daughter, or -1 if there is none
 */
private int findLeftDisHead(Tree[] daughterTrees, String[] how) {
  for (int pos = 0; pos < daughterTrees.length; pos++) {
    final String daughterCat = tlp.basicCategory(daughterTrees[pos].label().value());
    for (int catIdx = 1; catIdx < how.length; catIdx++) {
      final String candidate = how[catIdx];
      if (candidate.equals(daughterCat)) {
        return pos;
      }
    }
  }
  return -1;
}
/**
 * "leftexcept" strategy: scan the daughters left-to-right and return the
 * first one whose basic category equals none of the categories of the rule.
 *
 * @param daughterTrees The daughters to search
 * @param how The rule; the categories to avoid start at index 1
 * @return Index of the first acceptable daughter, or -1 if every daughter is avoided
 */
private int findLeftExceptHead(Tree[] daughterTrees, String[] how) {
  for (int pos = 0; pos < daughterTrees.length; pos++) {
    final String daughterCat = tlp.basicCategory(daughterTrees[pos].label().value());
    boolean avoided = false;
    for (int catIdx = 1; catIdx < how.length; catIdx++) {
      // |= always evaluates its right-hand side, so the whole avoid list is
      // compared for every daughter.
      avoided |= how[catIdx].equals(daughterCat);
    }
    if (!avoided) {
      return pos;
    }
  }
  return -1;
}
/**
 * "right" strategy: take the categories of the rule in order and, for each,
 * scan the daughters right-to-left for one whose basic category equals it.
 *
 * @param daughterTrees The daughters to search
 * @param how The rule; categories start at index 1
 * @return Index of the first match, or -1 if no category matches any daughter
 */
private int findRightHead(Tree[] daughterTrees, String[] how) {
  for (int catIdx = 1; catIdx < how.length; catIdx++) {
    final String wanted = how[catIdx];
    for (int pos = daughterTrees.length - 1; pos >= 0; pos--) {
      final String daughterCat = tlp.basicCategory(daughterTrees[pos].label().value());
      if (wanted.equals(daughterCat)) {
        return pos;
      }
    }
  }
  return -1;
}
/**
 * "rightdis" strategy: search from the right, but for any of the categories
 * at once rather than category by category. Returns the rightmost daughter
 * whose basic category equals any category of the rule.
 *
 * @param daughterTrees The daughters to search
 * @param how The rule; categories start at index 1
 * @return Index of the rightmost matching daughter, or -1 if there is none
 */
private int findRightDisHead(Tree[] daughterTrees, String[] how) {
  for (int pos = daughterTrees.length - 1; pos >= 0; pos--) {
    final String daughterCat = tlp.basicCategory(daughterTrees[pos].label().value());
    for (int catIdx = 1; catIdx < how.length; catIdx++) {
      final String candidate = how[catIdx];
      if (candidate.equals(daughterCat)) {
        return pos;
      }
    }
  }
  return -1;
}
/**
 * "rightexcept" strategy: scan the daughters right-to-left and return the
 * first one whose basic category equals none of the categories of the rule.
 *
 * @param daughterTrees The daughters to search
 * @param how The rule; the categories to avoid start at index 1
 * @return Index of the first acceptable daughter from the right, or -1 if every daughter is avoided
 */
private int findRightExceptHead(Tree[] daughterTrees, String[] how) {
  for (int pos = daughterTrees.length - 1; pos >= 0; pos--) {
    final String daughterCat = tlp.basicCategory(daughterTrees[pos].label().value());
    boolean avoided = false;
    for (int catIdx = 1; catIdx < how.length; catIdx++) {
      // |= always evaluates its right-hand side, so the whole avoid list is
      // compared for every daughter.
      avoided |= how[catIdx].equals(daughterCat);
    }
    if (!avoided) {
      return pos;
    }
  }
  return -1;
}
/**
 * A way for subclasses to fix any heads under special conditions.
 * traverseLocate calls this with the index its rule matched and returns the
 * daughter at whatever index comes back. The default does nothing: it
 * returns the proposed index unchanged.
 *
 * @param headIdx The index of the proposed head
 * @param daughterTrees The array of daughter trees
 * @return The new headIndex (here always headIdx itself)
 */
protected int postOperationFix(int headIdx, Tree[] daughterTrees) {
return headIdx;
}
// Fixed serialization version id. NOTE(review): presumably needed because
// HeadFinder is Serializable, as the comment in the class declaration hints -- confirm.
private static final long serialVersionUID = -6540278059442931087L;
}