edu.stanford.nlp.trees.UniversalSemanticHeadFinder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.trees;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import edu.stanford.nlp.ling.HasCategory;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.ArrayUtils;
import edu.stanford.nlp.util.Generics;
/**
* Implements a 'semantic head' variant of the HeadFinder found
* in Michael Collins' 1999 thesis.
* This version chooses the semantic head verb rather than the verb form
* for cases with verbs. And it makes similar themed changes to other
* categories: e.g., in question phrases, like "Which Brazilian game", the
* head is made "game" not "Which" as in common PTB head rules.
*
* By default the SemanticHeadFinder uses a treatment of copula where the
* complement of the copula is taken as the head. That is, a sentence like
* "Bill is big" will be analyzed as
*
* nsubj(big, Bill)
* cop(big, is)
*
* This analysis is used for questions and declaratives for adjective
* complements and declarative nominal complements. However Wh-sentences
* with nominal complements do not receive this treatment.
* "Who is the president?" is analyzed with "the president" as nsubj and "who"
* as "attr" of the copula:
* nsubj(is, president)
* attr(is, Who)
*
* (Such nominal copula sentences are complex: arguably, depending on the
* circumstances, several analyses are possible, with either the overt NP able
* to be any of the subject, the predicate, or one of two referential entities
* connected by an equational copula. These uses aren't differentiated.)
*
* Existential sentences are treated as follows:
* "There is a man"
* expl(is, There)
* det(man-4, a-3)
* nsubj(is-2, man-4)
*
* @author John Rappaport
* @author Marie-Catherine de Marneffe
* @author Anna Rafferty
* @author Sebastian Schuster
*/
public class UniversalSemanticHeadFinder extends ModCollinsHeadFinder {
/** True when the "SemanticHeadFinder" system property is set to any value; enables tracing to System.err. */
private static final boolean DEBUG = System.getProperty("SemanticHeadFinder", null) != null;

/* A few times the apostrophe is missing on "'s", so we have "s" */
/* Tricky auxiliaries: "a", "na" is from "(gon|wan)na", "ve" from "Weve", etc. "of" as non-standard for "have" */
/* "as" is "has" with missing first letter. "to" is rendered "the" once in EWT. */
private static final String[] auxiliaries = {
    "will", "wo", "shall", "sha", "may", "might", "should", "would", "can", "could", "ca", "must", "'ll", "ll", "-ll", "cold",
    "has", "have", "had", "having", "'ve", "ve", "v", "of", "hav", "hvae", "as",
    "get", "gets", "getting", "got", "gotten", "do", "does", "did", "'d", "d", "du",
    "to", "2", "na", "a", "ot", "ta", "the", "too" };

// include Charniak tags (AUX, AUXG) so can do BLLIP right
private static final String[] verbTags = {"TO", "MD", "VB", "VBD", "VBP", "VBZ", "VBG", "VBN", "AUX", "AUXG"};

// These ones are always auxiliaries, even if the word is "too", "my", or whatever else appears in web text.
private static final String[] unambiguousAuxTags = {"TO", "MD", "AUX", "AUXG"};

// The sets below are typed as Set<String> (not raw Set): each one is filled
// from a String[] in the constructor, so element access needs no casts.

/** Word forms from {@link #auxiliaries}. */
private final Set<String> verbalAuxiliaries;

/** Copular verb forms; left empty when the constructor's noCopulaHead flag is false. */
private final Set<String> copulars;

/** Word forms taken from EnglishPatterns.beGetVerbs. */
private final Set<String> passiveAuxiliaries;

/** POS tags from {@link #verbTags}. */
private final Set<String> verbalTags;

/** POS tags from {@link #unambiguousAuxTags}. */
private final Set<String> unambiguousAuxiliaryTags;

/** Negation of the constructor's noCopulaHead argument. */
private final boolean makeCopulaHead;
/**
 * Creates a head finder using a new PennTreebankLanguagePack and passing
 * {@code true} for {@code noCopulaHead}.
 */
public UniversalSemanticHeadFinder() {
this(new PennTreebankLanguagePack(), true);
}
/**
 * Creates a head finder using a new PennTreebankLanguagePack.
 *
 * @param noCopulaHead forwarded unchanged to the two-argument constructor
 */
public UniversalSemanticHeadFinder(boolean noCopulaHead) {
this(new PennTreebankLanguagePack(), noCopulaHead);
}
/**
 * Builds a semantic head finder on top of the given language pack.
 *
 * @param tlp The TreebankLanguagePack, used by the superclass to get basic
 *     category of constituents.
 * @param noCopulaHead If true, a copular verb (a form of be)
 *     is not treated as head when it has an AdjP or NP complement. If false,
 *     a copula verb is still always treated as a head. But it will still
 *     be treated as an auxiliary in periphrastic tenses with a VP complement.
 */
public UniversalSemanticHeadFinder(TreebankLanguagePack tlp, boolean noCopulaHead) {
  super(tlp);

  // TODO: reverse the polarity of noCopulaHead
  makeCopulaHead = !noCopulaHead;

  // Needs makeCopulaHead to be assigned already: the SQ rule is chosen from it.
  ruleChanges();

  // Tag inventories.
  verbalTags = Generics.newHashSet(Arrays.asList(verbTags));
  unambiguousAuxiliaryTags = Generics.newHashSet(Arrays.asList(unambiguousAuxTags));

  // Auxiliaries are kept apart from copula verbs so that the NP can become the
  // semantic head in sentences like "Bill is an honest man".
  // ("sha" was added for "shan't" in May 2009.)
  verbalAuxiliaries = Generics.newHashSet(Arrays.asList(auxiliaries));
  passiveAuxiliaries = Generics.newHashSet(Arrays.asList(EnglishPatterns.beGetVerbs));

  // Copula verbs having an NP complement; the set stays empty when the copula
  // itself is supposed to be the head.
  copulars = Generics.newHashSet();
  if (!makeCopulaHead) {
    copulars.addAll(Arrays.asList(EnglishPatterns.copularVerbs));
  }
}
/**
 * Reports the copula setting this instance was built with.
 *
 * @return the value of {@code makeCopulaHead}, i.e. the negation of the
 *     {@code noCopulaHead} flag given to the constructor
 */
@Override
public boolean makesCopulaHead() {
return makeCopulaHead;
}
// Makes modifications of Collins' rules to better fit with semantic notions of heads.
// Each entry maps a category to a list of rules; a rule is a direction keyword
// ("left", "right", "rightdis") followed by the child categories it looks for.
// NOTE(review): the exact meaning of the direction keywords is defined by the
// superclass and is not visible in this file.
// Reads makeCopulaHead, so it must be called after that field is assigned.
private void ruleChanges() {
// NP: don't want a POS to be the head
nonTerminalInfo.put("NP", new String[][]{{"rightdis", "NN", "NNP", "NNPS", "NNS", "NX", "NML", "JJR", "WP" }, {"left", "NP", "PRP"}, {"rightdis", "$", "ADJP", "FW", "CD", "JJ", "QP"}, {"rightdis", "JJS", "DT", "WDT", "NML", "PRN", "RB", "RBR", "ADVP"}, {"left", "POS"}});
// NX and NML reuse the very same rule array that was just stored for NP
nonTerminalInfo.put("NX", nonTerminalInfo.get("NP"));
nonTerminalInfo.put("NML", nonTerminalInfo.get("NP"));
// WHNP clauses should have the same sort of head as an NP
// but if a WHNP has a NP and a WHNP under it, the WHNP should be the head. E.g., (WHNP (WHNP (WP$ whose) (JJ chief) (JJ executive) (NN officer))(, ,) (NP (NNP James) (NNP Gatward))(, ,))
nonTerminalInfo.put("WHNP", new String[][]{{"rightdis", "NN", "NNP", "NNPS", "NNS", "NX", "NML", "JJR", "WP"}, {"left", "WHNP", "NP"}, {"rightdis", "$", "ADJP", "PRN", "FW"}, {"right", "CD"}, {"rightdis", "JJ", "JJS", "RB", "QP"}, {"left", "WHPP", "WHADJP", "WP$", "WDT"}});
// WHADJP
nonTerminalInfo.put("WHADJP", new String[][]{{"left", "ADJP", "JJ", "JJR", "WP"}, {"right", "RB"}, {"right"}});
// WHADVP
nonTerminalInfo.put("WHADVP", new String[][]{{"rightdis", "WRB", "WHADVP", "RB", "JJ"}}); // if not WRB or WHADVP, probably has flat NP structure, allow JJ for "how long" constructions
// QP: we don't want the first CD to be the semantic head (e.g., "three billion": head should be "billion"), so we go from right to left
nonTerminalInfo.put("QP", new String[][]{{"right", "$", "NNS", "NN", "CD", "JJ", "PDT", "DT", "IN", "RB", "NCD", "QP", "JJR", "JJS"}});
// S, SBAR and SQ clauses should prefer the main verb as the head
// S: "He considered him a friend" -> we want a friend to be the head
nonTerminalInfo.put("S", new String[][]{{"left", "VP", "S", "FRAG", "SBAR", "ADJP", "UCP", "TO"}, {"right", "NP"}});
nonTerminalInfo.put("SBAR", new String[][]{{"left", "S", "SQ", "SINV", "SBAR", "FRAG", "VP", "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT"}});
// VP shouldn't be needed in SBAR, but occurs in one buggy tree in PTB3 wsj_1457 and otherwise does no harm
// SQ: the two variants list the same categories and differ only in where ADJP
// sits: after the verb tags when the copula is the head, before them otherwise.
if (makeCopulaHead) {
nonTerminalInfo.put("SQ", new String[][]{{"left", "VP", "SQ", "VB", "VBZ", "VBD", "VBP", "MD", "AUX", "AUXG", "ADJP"}});
} else {
nonTerminalInfo.put("SQ", new String[][]{{"left", "VP", "SQ", "ADJP", "VB", "VBZ", "VBD", "VBP", "MD", "AUX", "AUXG"}});
}
// UCP take the first element as head
nonTerminalInfo.put("UCP", new String[][]{{"left"}});
// CONJP: We generally want the rightmost particle or the leftmost conjunction as head
// JJ is for weird tagging of "not only" in PTB
nonTerminalInfo.put("CONJP", new String[][]{{"right", "JJ", "RB"}, {"left", "CC", "IN"}, {"right", "VB"}});
// FRAG: this is a rough rule that needs to be changed if you want to parse
// glosses; but it is correct to have ADJP and ADVP before S
// because of weird parses of reduced sentences.
nonTerminalInfo.put("FRAG", new String[][]{{"left", "IN"}, {"right", "RB"}, {"left", "NP"}, {"left", "ADJP", "ADVP", "FRAG", "S", "SBAR", "VP"}});
// PRN: sentence first
nonTerminalInfo.put("PRN", new String[][]{{"left", "VP", "SQ", "S", "SINV", "SBAR", "NP", "ADJP", "PP", "ADVP", "INTJ", "WHNP", "NAC", "VBP", "JJ", "NN", "NNP"}});
// add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer)
nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}});
// add a rule to deal with the CoNLL data
nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}});
// USD: NP is head of PP
nonTerminalInfo.put("PP", new String[][]{{"left", "NP", "S", "SBAR", "SBARQ", "ADVP", "PP", "VP", "ADJP", "FRAG", "UCP", "PRN"}, {"right"}});
// WHPP reuses the very same rule array that was just stored for PP
nonTerminalInfo.put("WHPP", nonTerminalInfo.get("PP"));
// Special constituent for multi-word expressions
nonTerminalInfo.put("MWE", new String[][]{{"left"}});
nonTerminalInfo.put("PCONJP", new String[][]{{"left"}});
nonTerminalInfo.put("ADJP", new String[][]{{"left", "$"}, {"rightdis", "NNS", "NN", "NNP", "JJ", "QP", "VBN", "VBG"}, {"left", "ADJP"}, {"rightdis", "JJP", "JJR", "JJS", "DT", "RB", "RBR", "CD", "IN", "VBD"}, {"left", "ADVP", "NP"}});
nonTerminalInfo.put("INTJ", new String[][]{{"rightdis", "NNS", "NN", "NNP"}, {"left"}});
nonTerminalInfo.put("ADVP", new String[][]{{"rightdis", "RB", "RBR", "RBS", "JJ", "JJR", "JJS"},
{"rightdis", "RP", "DT", "NN", "CD", "NP", "VBN", "NNP", "CC", "FW", "NNS", "ADJP", "NML"}, {"left"}});
}
/**
 * Decides whether a daughter should be stepped over while walking left from a
 * conjunction in search of the first conjunct.
 *
 * @param t the daughter under consideration
 * @param origWasInterjection whether the originally chosen head was tagged UH;
 *     when true, interjections are not skipped
 * @return true for punctuation preterminals, and (unless the original head was
 *     an interjection) for UH preterminals and INTJ nodes
 */
private boolean shouldSkip(Tree t, boolean origWasInterjection) {
  String nodeValue = t.value();
  if (t.isPreTerminal()) {
    if (tlp.isPunctuationTag(nodeValue)) {
      return true;
    }
    if (!origWasInterjection && "UH".equals(nodeValue)) {
      return true;
    }
  }
  // Interjection phrases are only acceptable conjuncts next to another interjection.
  return !origWasInterjection && "INTJ".equals(nodeValue);
}
/**
 * Scans leftwards from {@code headIdx} for an earlier conjunct that is
 * separated from it by a "," or ":" daughter.
 *
 * @param headIdx index of the current head candidate; scanning starts just left of it
 * @param daughterTrees the daughters of the node being processed
 * @param origWasInterjection whether the originally chosen head was tagged UH;
 *     when true, interjections are treated as real candidates instead of being skipped
 * @return the index of the first non-skippable daughter to the left, provided a
 *     "," or ":" was crossed on the way; -1 if no separator was crossed or the
 *     start of the array was reached; {@code headIdx} itself if it is negative
 */
private int findPreviousHead(int headIdx, Tree[] daughterTrees, boolean origWasInterjection) {
  if (headIdx < 0) {
    // Nothing to scan; the index is handed back untouched.
    return headIdx;
  }
  boolean crossedSeparator = false;
  for (int idx = headIdx - 1; idx >= 0; idx--) {
    Tree candidate = daughterTrees[idx];
    String cat = tlp.basicCategory(candidate.value());
    if (",".equals(cat) || ":".equals(cat)) {
      crossedSeparator = true;
      continue;
    }
    boolean skippablePreterminal = candidate.isPreTerminal()
        && (tlp.isPunctuationTag(cat) || (!origWasInterjection && "UH".equals(cat)));
    boolean skippableInterjection = "INTJ".equals(cat) && !origWasInterjection;
    if (skippablePreterminal || skippableInterjection) {
      continue;
    }
    // First real constituent: it only counts if a separator lies between it and the start point.
    return crossedSeparator ? idx : -1;
  }
  // Ran off the left edge without finding a candidate.
  return -1;
}
/**
 * Overrides the postOperationFix method. For "a, b and c" or similar: we want "a" to be the head.
 *
 * @param headIdx index of the head chosen so far
 * @param daughterTrees the daughters of the node being processed
 * @return the index of the leftmost conjunct when the chosen head directly
 *     follows a CC or CONJP and such a conjunct exists; otherwise {@code headIdx}
 */
@Override
protected int postOperationFix(int headIdx, Tree[] daughterTrees) {
  // A coordination needs at least "conjunct CC head" to the left of and including the head.
  if (headIdx < 2) {
    return headIdx;
  }
  String beforeHead = tlp.basicCategory(daughterTrees[headIdx - 1].value());
  boolean followsConjunction = beforeHead.equals("CC") || beforeHead.equals("CONJP");
  if (!followsConjunction) {
    return headIdx;
  }

  boolean origWasInterjection = "UH".equals(tlp.basicCategory(daughterTrees[headIdx].value()));

  // Start just left of the conjunction and step over commas, etc. (3+ conjuncts).
  // Don't allow INTJ unless conjoined with INTJ - important in informal genres "Oh and don't forget to call!"
  int candidate = headIdx - 2;
  while (candidate >= 0 && shouldSkip(daughterTrees[candidate], origWasInterjection)) {
    candidate--;
  }

  // candidate is now either negative or sits on a left conjunct. Keep hopping
  // further left as long as an earlier conjunct exists beyond punctuation that
  // includes a "," or ":" tagged thing.
  while (candidate >= 2) {
    int earlier = findPreviousHead(candidate, daughterTrees, origWasInterjection);
    if (earlier < 0) {
      break;
    }
    candidate = earlier;
  }

  return candidate >= 0 ? candidate : headIdx;
}
// Note: The first two SBARQ patterns only work when the SQ
// structure has already been removed in CoordinationTransformer.
static final TregexPattern[] headOfCopulaTregex = {
// Matches phrases such as "what is wrong"
TregexPattern.compile("SBARQ < (WHNP $++ (/^VB/ < " + EnglishPatterns.copularWordRegex + " $++ ADJP=head))"),
// matches WHNP $+ VB REMOVE_TMP_AND_ADV = tree -> {
if (tree == null)
return false;
Label label = tree.label();
if (label == null)
return false;
if (label.value().contains("-TMP") || label.value().contains("-ADV"))
return false;
if (label.value().startsWith("VP") && noVerbOverTempTregex.matcher(tree).matches()) {
return false;
}
return true;
};
/**
 * Determine which daughter of the current parse tree is the
 * head. It assumes that the daughters already have had their
 * heads determined. Special handling is applied, in this order, to CONJP,
 * to SBARQ/SINV (only when the copula is not the head), and to VP/SQ/SINV
 * containing auxiliaries or copular verbs; whenever no special case yields
 * a head, the superclass implementation decides.
 *
 * @param t The parse tree to examine the daughters of.
 * This is assumed to never be a leaf
 * @param parent The parent of t; used by the existential and WH-question
 * checks and handed on to the superclass
 * @return The parse tree that is the head
 */
@Override
protected Tree determineNonTrivialHead(Tree t, Tree parent) {
String motherCat = tlp.basicCategory(t.label().value());
if (DEBUG) {
System.err.println("At " + motherCat + ", my parent is " + parent);
}
// Some conj expressions seem to make more sense with the "not" or
// other key words as the head. For example, "and not" means
// something completely different than "and". Furthermore,
// downstream code was written assuming "not" would be the head...
if (motherCat.equals("CONJP")) {
// the first pattern that matches at t wins; its node named "head" is returned
for (TregexPattern pattern : headOfConjpTregex) {
TregexMatcher matcher = pattern.matcher(t);
if (matcher.matchesAt(t)) {
return matcher.getNode("head");
}
}
// if none of the above patterns match, use the standard method
}
if (motherCat.equals("SBARQ") || motherCat.equals("SINV")) {
// the copula patterns are only consulted when the copula is NOT the head
if (!makeCopulaHead) {
for (TregexPattern pattern : headOfCopulaTregex) {
TregexMatcher matcher = pattern.matcher(t);
if (matcher.matchesAt(t)) {
return matcher.getNode("head");
}
}
}
// if none of the above patterns match, use the standard method
}
// do VPs with auxiliary as special case
if ((motherCat.equals("VP") || motherCat.equals("SQ") || motherCat.equals("SINV"))) {
Tree[] kids = t.children();
// try to find if there is an auxiliary verb
if (DEBUG) {
System.err.println("Semantic head finder: at VP");
System.err.println("Class is " + t.getClass().getName());
t.pennPrint(System.err);
//System.err.println("hasVerbalAuxiliary = " + hasVerbalAuxiliary(kids, verbalAuxiliaries));
}
// looks for auxiliaries
// children with undesirable candidates removed; computed at most once and shared by case 1 and case 2
Tree[] tmpFilteredChildren = null;
// NOTE(review): the meaning of the boolean third argument of hasVerbalAuxiliary
// is defined outside this view - confirm before relying on it
if (hasVerbalAuxiliary(kids, verbalAuxiliaries, true) || hasPassiveProgressiveAuxiliary(kids)) {
// String[] how = new String[] {"left", "VP", "ADJP", "NP"};
// Including NP etc seems okay for copular sentences but is
// problematic for other auxiliaries, like 'he has an answer'
String[] how ;
if (hasVerbalAuxiliary(kids, copulars, true)) {
// Only allow ADJP in copular constructions
// In constructions like "It gets cold", "get" should be the head
how = new String[]{ "left", "VP", "ADJP" };
} else {
how = new String[]{ "left", "VP" };
}
// always null at this point, since it was initialised to null just above
if (tmpFilteredChildren == null) {
tmpFilteredChildren = ArrayUtils.filter(kids, REMOVE_TMP_AND_ADV);
}
Tree pti = traverseLocate(tmpFilteredChildren, how, false);
if (DEBUG) {
System.err.println("Determined head (case 1) for " + t.value() + " is: " + pti);
}
// when nothing was found, fall through to the copula check below
if (pti != null) {
return pti;
// } else {
// System.err.println("------");
// System.err.println("SemanticHeadFinder failed to reassign head for");
// t.pennPrint(System.err);
// System.err.println("------");
}
}
// looks for copular verbs
// existential constructions and WH-questions are excluded from the copula treatment
if (hasVerbalAuxiliary(kids, copulars, false) && ! isExistential(t, parent) && ! isWHQ(t, parent)) {
String[][] how;
//TODO: also allow ADVP to be heads
// same candidate categories in both branches; only the search direction differs for SQ
if (motherCat.equals("SQ")) {
how = new String[][]{{"right", "VP", "ADJP", "NP", "UCP", "PP", "WHADJP", "WHNP"}};
} else {
how = new String[][]{{"left", "VP", "ADJP", "NP", "UCP", "PP", "WHADJP", "WHNP"}};
}
// Avoid undesirable heads by filtering them from the list of potential children
if (tmpFilteredChildren == null) {
tmpFilteredChildren = ArrayUtils.filter(kids, REMOVE_TMP_AND_ADV);
}
Tree pti = null;
// try each rule in turn and stop at the first one that finds a head
for (int i = 0; i < how.length && pti == null; i++) {
pti = traverseLocate(tmpFilteredChildren, how[i], false);
}
// In SQ, only allow an NP to become head if there is another one to the left (then it's probably predicative)
if (motherCat.equals("SQ") && pti != null && pti.label() != null && pti.label().value().startsWith("NP")) {
boolean foundAnotherNp = false;
// scan the unfiltered children left to right, stopping at the candidate itself
for (Tree kid : kids) {
if (kid == pti) {
break;
} else if (kid.label() != null && kid.label().value().startsWith("NP")) {
foundAnotherNp = true;
break;
}
}
if ( ! foundAnotherNp) {
pti = null;
}
}
if (DEBUG) {
System.err.println("Determined head (case 2) for " + t.value() + " is: " + pti);
}
if (pti != null) {
return pti;
} else {
if (DEBUG) {
System.err.println("------");
System.err.println("SemanticHeadFinder failed to reassign head for");
t.pennPrint(System.err);
System.err.println("------");
}
}
}
}
// no special case produced a head: defer to the superclass rules
Tree hd = super.determineNonTrivialHead(t, parent);
/* ----
// This should now be handled at the AbstractCollinsHeadFinder level, so see if we can comment this out
// Heuristically repair punctuation heads
Tree[] hdChildren = hd.children();
if (hdChildren != null && hdChildren.length > 0 &&
hdChildren[0].isLeaf()) {
if (tlp.isPunctuationWord(hdChildren[0].label().value())) {
Tree[] tChildren = t.children();
if (DEBUG) {
System.err.printf("head is punct: %s\n", hdChildren[0].label());
}
for (int i = tChildren.length - 1; i >= 0; i--) {
if (!tlp.isPunctuationWord(tChildren[i].children()[0].label().value())) {
hd = tChildren[i];
if (DEBUG) {
System.err.printf("New head of %s is %s%n", hd.label(), hd.children()[0].label());
}
break;
}
}
}
}
*/
if (DEBUG) {
System.err.println("Determined head (case 3) for " + t.value() + " is: " + hd);
}
return hd;
}
/* Checks whether the tree t is an existential constituent
* There are two cases:
* -- affirmative sentences in which "there" is a left sister of the VP
* -- questions in which "there" is a daughter of the SQ.
*
*/
private boolean isExistential(Tree t, Tree parent) {
if (DEBUG) {
System.err.println("isExistential: " + t + ' ' + parent);
}
boolean toReturn = false;
String motherCat = tlp.basicCategory(t.label().value());
// affirmative case
if (motherCat.equals("VP") && parent != null) {
//take t and the sisters
Tree[] kids = parent.children();
// iterate over the sisters before t and checks if existential
for (Tree kid : kids) {
if (!kid.value().equals("VP")) {
List