edu.stanford.nlp.trees.ModCollinsHeadFinder Maven / Gradle / Ivy

Go to download
package edu.stanford.nlp.trees;

import edu.stanford.nlp.util.Generics;

/**
 * Implements a variant on the HeadFinder found in Michael Collins' 1999
 * thesis. This starts with
 * Collins' head finder. As in {@code CollinsHeadFinder}, we've
 * added a head rule for NX.
 *
 * Changes:
 * <ul>
 * <li> The PRN rule used to just take the leftmost thing; we now have it
 * choose the leftmost lexical category (not the common punctuation etc.)</li>
 * <li> Delete IN as a possible head of S, and add FRAG (low priority)</li>
 * <li> Place NN before QP in ADJP head rules (more to do for ADJP!)</li>
 * <li> Place PDT before RB and after CD in QP rules.  Also prefer CD to
 * DT or RB.  And DT to RB.</li>
 * <li> Add DT, WDT as low priority choice for head of NP. Add PRP before PRN.
 * Add RBR as low priority choice of head for NP.</li>
 * <li> Prefer NP or NX as head of NX, and otherwise default to rightmost not
 * leftmost (NP-like headedness)</li>
 * <li> VP: add JJ and NNP as low priority heads (many tagging errors).
 *   Place JJ above NP in priority, as it is to be preferred to NP object.</li>
 * <li> PP: add PP as a possible head (rare conjunctions)</li>
 * <li> Added rule for POSSP (can be introduced by parser)</li>
 * <li> Added a sensible-ish rule for X.</li>
 * <li> Added NML head rules, which are the same as for NP.</li>
 * <li> NP head rule: NP and NML are treated almost identically (NP has precedence)</li>
 * <li> NAC head rule: NML comes after NN/NNS but before NNP/NNPS</li>
 * <li> PP head rule: JJ added</li>
 * <li> Added JJP (appearing in David Vadas's annotation), which seems to play
 * the same role as ADJP.</li>
 * </ul>
 * These rules are suitable for the Penn Treebank.
 * 
 * A case that you apparently just can't handle well in this framework is
 * (NP (NP ... NP)).  If this is a conjunction, apposition or similar, then
 * the leftmost NP is the head, but if the first is a measure phrase like
 * (NP $ 38) (NP a share) then the second should probably be the head.
 *
 * @author Christopher Manning
 * @author Michel Galley
 */
public class ModCollinsHeadFinder extends CollinsHeadFinder {

  private static final long serialVersionUID = -5870387458902637256L;

  /** Creates a head finder backed by a {@code PennTreebankLanguagePack}. */
  public ModCollinsHeadFinder() {
    this(new PennTreebankLanguagePack());
  }

  /**
   * Creates a head finder for the given language pack and fills
   * {@code nonTerminalInfo} with the modified Collins head rules.
   *
   * @param tlp the treebank language pack; its punctuation tags are handed to
   *            the superclass constructor
   */
  public ModCollinsHeadFinder(TreebankLanguagePack tlp) {
    // Handing over the punctuation tags avoids punctuation as head in the final default rule.
    super(tlp, tlp.punctuationTags());

    nonTerminalInfo = Generics.newHashMap();

    // The helpers run in the same order in which the rules were historically registered.
    installAdjunctAndMinorPhraseRules();
    installClauseAndVerbPhraseRules();
    installWhPhraseAndUnknownRules();
    installNominalRules();
    installCorpusSpecificRules();

    // todo: To always take the leftmost daughter when no head rule is defined for the
    // mother category, uncomment the following line.
    // defaultRule = defaultLeftRule; // Don't exception, take leftmost if no rule defined for a certain parent category
  }

  /**
   * Head rule shared (as separate copies) by VP and by the stray "VB" phrasal
   * label; AUX and AUXG are included to work with Charniak tags.
   *
   * @return a freshly allocated rule table on every call
   */
  private static String[][] verbalHeadRules() {
    return new String[][]{
        {"left", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "AUX", "AUXG",
         "ADJP", "JJP", "NN", "NNS", "JJ", "NP", "NNP"}};
  }

  /** Registers the rules for ADJP, JJP, ADVP, CONJP, FRAG, INTJ, LST, NAC, PP, PRN, PRT, QP and RRC. */
  private void installAdjunctAndMinorPhraseRules() {
    // Notes on the version in Collins' diss (1999: 236-238):
    //  - NNS, NN is actually sensible (money, etc.)!
    //  - QP early isn't; should prefer JJR NN RB
    //  - remove ADVP; it just shouldn't be there.
    //  - if two JJ, should take right one (e.g. South Korean)
    // Earlier form of the rule, kept for reference:
    // nonTerminalInfo.put("ADJP", new String[][]{{"left", "NNS", "NN", "$", "QP"}, {"right", "JJ"}, {"left", "VBN", "VBG", "ADJP", "JJP", "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB"}});
    nonTerminalInfo.put("ADJP", new String[][]{
        {"left", "$"},
        {"rightdis", "NNS", "NN", "JJ", "QP", "VBN", "VBG"},
        {"left", "ADJP"},
        {"rightdis", "JJP", "JJR", "JJS", "DT", "RB", "RBR", "CD", "IN", "VBD"},
        {"left", "ADVP", "NP"}});

    // JJP is introduced for NML-like adjective phrases in Vadas' treebank;
    // Chris wishes he hadn't used JJP, which should be a POS-tag.
    nonTerminalInfo.put("JJP", new String[][]{
        {"left", "NNS", "NN", "$", "QP", "JJ", "VBN", "VBG", "ADJP", "JJP", "JJR", "NP", "JJS",
         "DT", "FW", "RBR", "RBS", "SBAR", "RB"}});

    // ADVP was rewritten by Chris in Nov 2010 to be rightdis.  This is right!
    // JJ.* is often head and rightmost.
    nonTerminalInfo.put("ADVP", new String[][]{
        {"left", "ADVP", "IN"},
        {"rightdis", "RB", "RBR", "RBS", "JJ", "JJR", "JJS"},
        {"rightdis", "RP", "DT", "NN", "CD", "NP", "VBN", "NNP", "CC", "FW", "NNS", "ADJP", "NML"}});

    nonTerminalInfo.put("CONJP", new String[][]{
        {"right", "CC", "RB", "IN"}});
    // FRAG: crude placeholder rule.
    nonTerminalInfo.put("FRAG", new String[][]{
        {"right"}});
    nonTerminalInfo.put("INTJ", new String[][]{
        {"left"}});
    nonTerminalInfo.put("LST", new String[][]{
        {"right", "LS", ":"}});

    // NML is head in: (NAC-LOC (NML San Antonio) (, ,) (NNP Texas))
    // TODO: NNP should be head (rare cases, could be ignored):
    //   (NAC (NML New York) (NNP Court) (PP of Appeals))
    //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
    // Chris: This could maybe still do with more thought, but NAC is rare.
    nonTerminalInfo.put("NAC", new String[][]{
        {"left", "NN", "NNS", "NML", "NNP", "NNPS", "NP", "NAC", "EX", "$", "CD", "QP", "PRP",
         "VBG", "JJ", "JJS", "JJR", "ADJP", "JJP", "FW"}});

    // JJ is in the PP head table because it is a head in several cases, e.g.:
    //   (PP (JJ next) (PP to them))
    // When both JJ and IN daughters are present it is invariably "such as" -- not so clear
    // which should be head, but leave as IN.  Should prefer JJ?
    //   (PP (JJ such) (IN as) (NP (NN crocidolite)))
    // Michel thinks we should make JJ a head of PP.
    // SYM was added since new treebanks use it for symbols filling the role of IN.
    // The PP search was changed to left -- just what you want for conjunction
    // (and consistent with SemanticHeadFinder).
    nonTerminalInfo.put("PP", new String[][]{
        {"right", "IN", "TO", "VBG", "VBN", "RP", "FW", "JJ", "SYM"},
        {"left", "PP"}});

    nonTerminalInfo.put("PRN", new String[][]{
        {"left", "VP", "NP", "PP", "SQ", "S", "SINV", "SBAR", "ADJP", "JJP", "ADVP", "INTJ",
         "WHNP", "NAC", "VBP", "JJ", "NN", "NNP"}});
    nonTerminalInfo.put("PRT", new String[][]{
        {"right", "RP"}});

    // add '#' for pounds!!
    nonTerminalInfo.put("QP", new String[][]{
        {"left", "$", "IN", "NNS", "NN", "JJ", "CD", "PDT", "DT", "RB", "NCD", "QP", "JJR", "JJS"}});

    // A reduced relative clause can be any predicate VP, ADJP, NP, PP.
    // For choosing between NP and PP, one really needs to know which one is temporal
    // and to choose the other.
    // It's not clear ADVP needs to be in the list at all (delete?).
    nonTerminalInfo.put("RRC", new String[][]{
        {"left", "RRC"},
        {"right", "VP", "ADJP", "JJP", "NP", "PP", "ADVP"}});
  }

  /** Registers the rules for S, SBAR, SBARQ, SINV, SQ, UCP and VP. */
  private void installClauseAndVerbPhraseRules() {
    // S: IN deleted -- go for main part of sentence; FRAG added.
    nonTerminalInfo.put("S", new String[][]{
        {"left", "TO", "VP", "S", "FRAG", "SBAR", "ADJP", "JJP", "UCP", "NP"}});
    nonTerminalInfo.put("SBAR", new String[][]{
        {"left", "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT", "S", "SQ", "SINV", "SBAR", "FRAG"}});
    nonTerminalInfo.put("SBARQ", new String[][]{
        {"left", "SQ", "S", "SINV", "SBARQ", "FRAG", "SBAR"}});

    // cdm: if you have 2 VP under an SINV, you should really take the 2nd as syntactic head,
    // because the first is a topicalized VP complement of the second, but for now I didn't
    // change this, since it didn't help parsing.  (If it were changed, it'd need to be also
    // changed to the opposite in SemanticHeadFinder.)
    nonTerminalInfo.put("SINV", new String[][]{
        {"left", "VBZ", "VBD", "VBP", "VB", "MD", "VBN", "VP", "S", "SINV", "ADJP", "JJP", "NP"}});

    // TODO: Should maybe put S before SQ for tag questions. Check.
    nonTerminalInfo.put("SQ", new String[][]{
        {"left", "VBZ", "VBD", "VBP", "VB", "MD", "AUX", "AUXG", "VP", "SQ"}});
    nonTerminalInfo.put("UCP", new String[][]{
        {"right"}});

    // The VP rule is weird!! Make 2 lists, one for good and one for bad heads??
    nonTerminalInfo.put("VP", verbalHeadRules());
  }

  /** Registers the rules for WHADJP, WHADVP, WHNP, WHPP and X. */
  private void installWhPhraseAndUnknownRules() {
    nonTerminalInfo.put("WHADJP", new String[][]{
        {"left", "WRB", "WHADVP", "RB", "JJ", "ADJP", "JJP", "JJR"}});
    nonTerminalInfo.put("WHADVP", new String[][]{
        {"right", "WRB", "WHADVP"}});
    nonTerminalInfo.put("WHNP", new String[][]{
        {"left", "WDT", "WP", "WP$", "WHADJP", "WHPP", "WHNP"}});
    nonTerminalInfo.put("WHPP", new String[][]{
        {"right", "IN", "TO", "FW"}});
    nonTerminalInfo.put("X", new String[][]{
        {"right", "S", "VP", "ADJP", "JJP", "NP", "SBAR", "PP", "X"}});
  }

  /** Registers the rules for NP, NX, NML and POSSP; NX and NML reuse the very same table object as NP. */
  private void installNominalRules() {
    final String[][] nounPhraseRules = {
        {"rightdis", "NN", "NNP", "NNPS", "NNS", "NML", "NX", "POS", "JJR"},
        {"left", "NP", "PRP"},
        {"rightdis", "$", "ADJP", "JJP", "PRN", "FW"},
        {"right", "CD"},
        {"rightdis", "JJ", "JJS", "RB", "QP", "DT", "WDT", "RBR", "ADVP"}};
    nonTerminalInfo.put("NP", nounPhraseRules);
    nonTerminalInfo.put("NX", nounPhraseRules);
    // TODO: seems JJ should be head of NML in this case:
    // (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
    // (although JJ great is tagged wrong)
    nonTerminalInfo.put("NML", nounPhraseRules);

    nonTerminalInfo.put("POSSP", new String[][]{
        {"right", "POS"}});
  }

  /** Registers the rules for ROOT, TOP, TYPO, ADV, EDITED, VB, META and XS. */
  private void installCorpusSpecificRules() {
    // HJT: these deal with oddly formed data in (for example) the Brown corpus.
    final String[][] rootRules = {
        {"left", "S", "SQ", "SINV", "SBAR", "FRAG"}};
    nonTerminalInfo.put("ROOT", rootRules);
    // Some trees have TOP instead of ROOT at the root; both labels share one table.
    nonTerminalInfo.put("TOP", rootRules);
    // TYPO: for Brown (Roger)
    nonTerminalInfo.put("TYPO", new String[][]{
        {"left", "NN", "NP", "NML", "NNP", "NNPS", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG",
         "VBP", "VP", "ADJP", "JJP", "FRAG"}});
    nonTerminalInfo.put("ADV", new String[][]{
        {"right", "RB", "RBR", "RBS", "FW", "ADVP", "TO", "CD", "JJR", "JJ", "IN", "NP", "NML",
         "JJS", "NN"}});

    // SWBD
    // EDITED: crude rule for Switchboard (if EDITED nodes are not deleted).
    nonTerminalInfo.put("EDITED", new String[][]{
        {"left"}});
    // In sw2756 there is a "VB" phrasal node; it gets its own copy of the VP rule,
    // though this should really be fixed on reading.
    nonTerminalInfo.put("VB", verbalHeadRules());

    // META: rule for OntoNotes, but maybe it should just be deleted in TreeReader??
    nonTerminalInfo.put("META", new String[][]{
        {"left"}});
    // XS: rule for new structure in QP, introduced by Stanford in QPTreeTransformer.
    nonTerminalInfo.put("XS", new String[][]{
        {"right", "IN"}});
    // nonTerminalInfo.put(null, new String[][] {{"left"}});  // rule for OntoNotes from Michel, but it would be better to fix this in TreeReader or to use a default rule?
  }

}