All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.trees.SemanticHeadFinder Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.HasCategory;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.ArrayUtils;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Generics;

import java.util.Arrays;
import java.util.List;
import java.util.Set;


/**
 * Implements a 'semantic head' variant of the HeadFinder found
 * in Michael Collins' 1999 thesis.
 * This version chooses the semantic head verb rather than the verb form
 * for cases with verbs.  And it makes similar themed changes to other
 * categories: e.g., in question phrases, like "Which Brazilian game", the
 * head is made "game" not "Which" as in common PTB head rules.

*

 * By default the SemanticHeadFinder uses a treatment of copula where the
 * complement of the copula is taken as the head.  That is, a sentence like
 * "Bill is big" will be analyzed as

*

* nsubj(big, Bill)
* cop(big, is)

*

 * This analysis is used for questions and declaratives for adjective
 * complements and declarative nominal complements.  However, Wh-sentences
 * with nominal complements do not receive this treatment.
 * "Who is the president?" is analyzed with "the president" as nsubj and "who"
 * as "attr" of the copula:

* nsubj(is, president)
* attr(is, Who)

*

 * (Such nominal copula sentences are complex: arguably, depending on the
 * circumstances, several analyses are possible, with either the overt NP able
 * to be any of the subject, the predicate, or one of two referential entities
 * connected by an equational copula.  These uses aren't differentiated.)
 *

* Existential sentences are treated as follows:
* "There is a man"
* expl(is, There)
* det(man-4, a-3)
* nsubj(is-2, man-4)
* * @author John Rappaport * @author Marie-Catherine de Marneffe * @author Anna Rafferty */ public class SemanticHeadFinder extends ModCollinsHeadFinder { private static final boolean DEBUG = System.getProperty("SemanticHeadFinder", null) != null; /* A few times the apostrophe is missing on "'s", so we have "s" */ /* Tricky auxiliaries: "a", "na" is from "(gon|wan)na", "ve" from "Weve", etc. "of" as non-standard for "have" */ /* "as" is "has" with missing first letter. "to" is rendered "the" once in EWT. */ private static final String[] auxiliaries = { "will", "wo", "shall", "sha", "may", "might", "should", "would", "can", "could", "ca", "must", "'ll", "ll", "-ll", "cold", "has", "have", "had", "having", "'ve", "ve", "v", "of", "hav", "hvae", "as", "get", "gets", "getting", "got", "gotten", "do", "does", "did", "'d", "d", "du", "to", "2", "na", "a", "ot", "ta", "the", "too" }; // include Charniak tags (AUX, AUXG) so can do BLLIP right private static final String[] verbTags = {"TO", "MD", "VB", "VBD", "VBP", "VBZ", "VBG", "VBN", "AUX", "AUXG"}; // These ones are always auxiliaries, even if the word is "too", "my", or whatever else appears in web text. private static final String[] unambiguousAuxTags = {"TO", "MD", "AUX", "AUXG"}; private final Set verbalAuxiliaries; private final Set copulars; private final Set passiveAuxiliaries; private final Set verbalTags; private final Set unambiguousAuxiliaryTags; private final boolean makeCopulaHead; public SemanticHeadFinder() { this(new PennTreebankLanguagePack(), true); } public SemanticHeadFinder(boolean noCopulaHead) { this(new PennTreebankLanguagePack(), noCopulaHead); } /** Create a SemanticHeadFinder. * * @param tlp The TreebankLanguagePack, used by the superclass to get basic * category of constituents. * @param noCopulaHead If true, a copular verb (a form of be) * is not treated as head when it has an AdjP or NP complement. If false, * a copula verb is still always treated as a head. 
But it will still * be treated as an auxiliary in periphrastic tenses with a VP complement. */ public SemanticHeadFinder(TreebankLanguagePack tlp, boolean noCopulaHead) { super(tlp); ruleChanges(); // make a distinction between auxiliaries and copula verbs to // get the NP has semantic head in sentences like "Bill is an honest man". (Added "sha" for "shan't" May 2009 verbalAuxiliaries = Generics.newHashSet(Arrays.asList(auxiliaries)); passiveAuxiliaries = Generics.newHashSet(Arrays.asList(EnglishPatterns.beGetVerbs)); //copula verbs having an NP complement copulars = Generics.newHashSet(); if (noCopulaHead) { copulars.addAll(Arrays.asList(EnglishPatterns.copularVerbs)); } // TODO: reverse the polarity of noCopulaHead this.makeCopulaHead = !noCopulaHead; verbalTags = Generics.newHashSet(Arrays.asList(verbTags)); unambiguousAuxiliaryTags = Generics.newHashSet(Arrays.asList(unambiguousAuxTags)); } @Override public boolean makesCopulaHead() { return makeCopulaHead; } //makes modifications of Collins' rules to better fit with semantic notions of heads private void ruleChanges() { // NP: don't want a POS to be the head // verbs are here so that POS isn't favored in the case of bad parses nonTerminalInfo.put("NP", new String[][]{{"rightdis", "NN", "NNP", "NNPS", "NNS", "NX", "NML", "JJR", "WP" }, {"left", "NP", "PRP"}, {"rightdis", "$", "ADJP", "FW"}, {"right", "CD"}, {"rightdis", "JJ", "JJS", "QP", "DT", "WDT", "NML", "PRN", "RB", "RBR", "ADVP"}, {"rightdis", "VP", "VB", "VBZ", "VBD", "VBP"}, {"left", "POS"}}); nonTerminalInfo.put("NX", nonTerminalInfo.get("NP")); nonTerminalInfo.put("NML", nonTerminalInfo.get("NP")); // WHNP clauses should have the same sort of head as an NP // but it a WHNP has a NP and a WHNP under it, the WHNP should be the head. 
E.g., (WHNP (WHNP (WP$ whose) (JJ chief) (JJ executive) (NN officer))(, ,) (NP (NNP James) (NNP Gatward))(, ,)) nonTerminalInfo.put("WHNP", new String[][]{{"rightdis", "NN", "NNP", "NNPS", "NNS", "NX", "NML", "JJR", "WP"}, {"left", "WHNP", "NP"}, {"rightdis", "$", "ADJP", "PRN", "FW"}, {"right", "CD"}, {"rightdis", "JJ", "JJS", "RB", "QP"}, {"left", "WHPP", "WHADJP", "WP$", "WDT"}}); //WHADJP nonTerminalInfo.put("WHADJP", new String[][]{{"left", "ADJP", "JJ", "JJR", "WP"}, {"right", "RB"}, {"right"}}); //WHADJP nonTerminalInfo.put("WHADVP", new String[][]{{"rightdis", "WRB", "WHADVP", "RB", "JJ"}}); // if not WRB or WHADVP, probably has flat NP structure, allow JJ for "how long" constructions // QP: we don't want the first CD to be the semantic head (e.g., "three billion": head should be "billion"), so we go from right to left nonTerminalInfo.put("QP", new String[][]{{"right", "$", "NNS", "NN", "CD", "JJ", "PDT", "DT", "IN", "RB", "NCD", "QP", "JJR", "JJS"}}); // S, SBAR and SQ clauses should prefer the main verb as the head // S: "He considered him a friend" -> we want a friend to be the head nonTerminalInfo.put("S", new String[][]{{"left", "VP", "S", "FRAG", "SBAR", "ADJP", "UCP", "TO"}, {"right", "NP"}}); nonTerminalInfo.put("SBAR", new String[][]{{"left", "S", "SQ", "SINV", "SBAR", "FRAG", "VP", "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT"}}); // VP shouldn't be needed in SBAR, but occurs in one buggy tree in PTB3 wsj_1457 and otherwise does no harm nonTerminalInfo.put("SQ", new String[][]{{"left", "VP", "SQ", "ADJP", "VB", "VBZ", "VBD", "VBP", "MD", "AUX", "AUXG"}}); // UCP take the first element as head nonTerminalInfo.put("UCP", new String[][]{{"left"}}); // CONJP: we want different heads for "but also" and "but not" and we don't want "not" to be the head in "not to mention"; now make "mention" head of "not to mention" nonTerminalInfo.put("CONJP", new String[][]{{"right", "CC", "VB", "JJ", "RB", "IN" }}); // FRAG: crap rule needs to be change if you want 
to parse // glosses; but it is correct to have ADJP and ADVP before S // because of weird parses of reduced sentences. nonTerminalInfo.put("FRAG", new String[][]{{"left", "IN"}, {"right", "RB"}, {"left", "NP"}, {"left", "ADJP", "ADVP", "FRAG", "S", "SBAR", "VP"}}); // PRN: sentence first nonTerminalInfo.put("PRN", new String[][]{{"left", "VP", "SQ", "S", "SINV", "SBAR", "NP", "ADJP", "PP", "ADVP", "INTJ", "WHNP", "NAC", "VBP", "JJ", "NN", "NNP"}}); // add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer) nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}}); // add a rule to deal with the CoNLL data nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}}); } private boolean shouldSkip(Tree t, boolean origWasInterjection) { return t.isPreTerminal() && (tlp.isPunctuationTag(t.value()) || ! origWasInterjection && "UH".equals(t.value())) || "INTJ".equals(t.value()) && ! origWasInterjection; } private int findPreviousHead(int headIdx, Tree[] daughterTrees, boolean origWasInterjection) { boolean seenSeparator = false; int newHeadIdx = headIdx; while (newHeadIdx >= 0) { newHeadIdx = newHeadIdx - 1; if (newHeadIdx < 0) { return newHeadIdx; } String label = tlp.basicCategory(daughterTrees[newHeadIdx].value()); if (",".equals(label) || ":".equals(label)) { seenSeparator = true; } else if (daughterTrees[newHeadIdx].isPreTerminal() && (tlp.isPunctuationTag(label) || ! origWasInterjection && "UH".equals(label)) || "INTJ".equals(label) && ! origWasInterjection) { // keep looping } else { if ( ! seenSeparator) { newHeadIdx = -1; } break; } } return newHeadIdx; } /** * Overwrite the postOperationFix method. For "a, b and c" or similar: we want "a" to be the head. 
*/ @Override protected int postOperationFix(int headIdx, Tree[] daughterTrees) { if (headIdx >= 2) { String prevLab = tlp.basicCategory(daughterTrees[headIdx - 1].value()); if (prevLab.equals("CC") || prevLab.equals("CONJP")) { boolean origWasInterjection = "UH".equals(tlp.basicCategory(daughterTrees[headIdx].value())); int newHeadIdx = headIdx - 2; // newHeadIdx is now left of conjunction. Now try going back over commas, etc. for 3+ conjuncts // Don't allow INTJ unless conjoined with INTJ - important in informal genres "Oh and don't forget to call!" while (newHeadIdx >= 0 && shouldSkip(daughterTrees[newHeadIdx], origWasInterjection)) { newHeadIdx--; } // We're now at newHeadIdx < 0 or have found a left head // Now consider going back some number of punct that includes a , or : tagged thing and then find non-punct while (newHeadIdx >= 2) { int nextHead = findPreviousHead(newHeadIdx, daughterTrees, origWasInterjection); if (nextHead < 0) { break; } newHeadIdx = nextHead; } if (newHeadIdx >= 0) { headIdx = newHeadIdx; } } } return headIdx; } // Note: The first two SBARQ patterns only work when the SQ // structure has already been removed in CoordinationTransformer. static final TregexPattern[] headOfCopulaTregex = { // Matches phrases such as "what is wrong" TregexPattern.compile("SBARQ < (WHNP $++ (/^VB/ < " + EnglishPatterns.copularWordRegex + " $++ ADJP=head))"), // matches WHNP $+ VB REMOVE_TMP_AND_ADV = tree -> { if (tree == null) return false; Label label = tree.label(); if (label == null) return false; if (label.value().contains("-TMP") || label.value().contains("-ADV")) return false; if (label.value().startsWith("VP") && noVerbOverTempTregex.matcher(tree).matches()) { return false; } return true; }; /** * Determine which daughter of the current parse tree is the * head. It assumes that the daughters already have had their * heads determined. Uses special rule for VP heads * * @param t The parse tree to examine the daughters of. 
* This is assumed to never be a leaf * @return The parse tree that is the head */ @Override protected Tree determineNonTrivialHead(Tree t, Tree parent) { String motherCat = tlp.basicCategory(t.label().value()); if (DEBUG) { System.err.println("At " + motherCat + ", my parent is " + parent); } // Some conj expressions seem to make more sense with the "not" or // other key words as the head. For example, "and not" means // something completely different than "and". Furthermore, // downstream code was written assuming "not" would be the head... if (motherCat.equals("CONJP")) { for (TregexPattern pattern : headOfConjpTregex) { TregexMatcher matcher = pattern.matcher(t); if (matcher.matchesAt(t)) { return matcher.getNode("head"); } } // if none of the above patterns match, use the standard method } if (motherCat.equals("SBARQ") || motherCat.equals("SINV")) { if (!makeCopulaHead) { for (TregexPattern pattern : headOfCopulaTregex) { TregexMatcher matcher = pattern.matcher(t); if (matcher.matchesAt(t)) { return matcher.getNode("head"); } } } // if none of the above patterns match, use the standard method } // do VPs with auxiliary as special case if ((motherCat.equals("VP") || motherCat.equals("SQ") || motherCat.equals("SINV"))) { Tree[] kids = t.children(); // try to find if there is an auxiliary verb if (DEBUG) { System.err.println("Semantic head finder: at VP"); System.err.println("Class is " + t.getClass().getName()); t.pennPrint(System.err); //System.err.println("hasVerbalAuxiliary = " + hasVerbalAuxiliary(kids, verbalAuxiliaries)); } // looks for auxiliaries Tree[] tmpFilteredChildren = null; if (hasVerbalAuxiliary(kids, verbalAuxiliaries, true) || hasPassiveProgressiveAuxiliary(kids)) { // String[] how = new String[] {"left", "VP", "ADJP", "NP"}; // Including NP etc seems okay for copular sentences but is // problematic for other auxiliaries, like 'he has an answer' // But maybe doing ADJP is fine! 
String[] how = { "left", "VP", "ADJP" }; if (tmpFilteredChildren == null) { tmpFilteredChildren = ArrayUtils.filter(kids, REMOVE_TMP_AND_ADV); } Tree pti = traverseLocate(tmpFilteredChildren, how, false); if (DEBUG) { System.err.println("Determined head (case 1) for " + t.value() + " is: " + pti); } if (pti != null) { return pti; // } else { // System.err.println("------"); // System.err.println("SemanticHeadFinder failed to reassign head for"); // t.pennPrint(System.err); // System.err.println("------"); } } // looks for copular verbs if (hasVerbalAuxiliary(kids, copulars, false) && ! isExistential(t, parent) && ! isWHQ(t, parent)) { String[] how; if (motherCat.equals("SQ")) { how = new String[]{"right", "VP", "ADJP", "NP", "WHADJP", "WHNP"}; } else { how = new String[]{"left", "VP", "ADJP", "NP", "WHADJP", "WHNP"}; } // Avoid undesirable heads by filtering them from the list of potential children if (tmpFilteredChildren == null) { tmpFilteredChildren = ArrayUtils.filter(kids, REMOVE_TMP_AND_ADV); } Tree pti = traverseLocate(tmpFilteredChildren, how, false); // In SQ, only allow an NP to become head if there is another one to the left (then it's probably predicative) if (motherCat.equals("SQ") && pti != null && pti.label() != null && pti.label().value().startsWith("NP")) { boolean foundAnotherNp = false; for (Tree kid : kids) { if (kid == pti) { break; } else if (kid.label() != null && kid.label().value().startsWith("NP")) { foundAnotherNp = true; break; } } if ( ! 
foundAnotherNp) { pti = null; } } if (DEBUG) { System.err.println("Determined head (case 2) for " + t.value() + " is: " + pti); } if (pti != null) { return pti; } else { if (DEBUG) { System.err.println("------"); System.err.println("SemanticHeadFinder failed to reassign head for"); t.pennPrint(System.err); System.err.println("------"); } } } } Tree hd = super.determineNonTrivialHead(t, parent); /* ---- // This should now be handled at the AbstractCollinsHeadFinder level, so see if we can comment this out // Heuristically repair punctuation heads Tree[] hdChildren = hd.children(); if (hdChildren != null && hdChildren.length > 0 && hdChildren[0].isLeaf()) { if (tlp.isPunctuationWord(hdChildren[0].label().value())) { Tree[] tChildren = t.children(); if (DEBUG) { System.err.printf("head is punct: %s\n", hdChildren[0].label()); } for (int i = tChildren.length - 1; i >= 0; i--) { if (!tlp.isPunctuationWord(tChildren[i].children()[0].label().value())) { hd = tChildren[i]; if (DEBUG) { System.err.printf("New head of %s is %s%n", hd.label(), hd.children()[0].label()); } break; } } } } */ if (DEBUG) { System.err.println("Determined head (case 3) for " + t.value() + " is: " + hd); } return hd; } /* Checks whether the tree t is an existential constituent * There are two cases: * -- affirmative sentences in which "there" is a left sister of the VP * -- questions in which "there" is a daughter of the SQ. * */ private boolean isExistential(Tree t, Tree parent) { if (DEBUG) { System.err.println("isExistential: " + t + ' ' + parent); } boolean toReturn = false; String motherCat = tlp.basicCategory(t.label().value()); // affirmative case if (motherCat.equals("VP") && parent != null) { //take t and the sisters Tree[] kids = parent.children(); // iterate over the sisters before t and checks if existential for (Tree kid : kids) { if (!kid.value().equals("VP")) { List





© 2015 - 2024 Weber Informatics LLC | Privacy Policy