/*
 * edu.berkeley.nlp.ling.CollinsHeadFinder (from the berkeleyparser artifact).
 *
 * The Berkeley parser analyzes the grammatical structure of natural language
 * using probabilistic context-free grammars (PCFGs).
 */
package edu.berkeley.nlp.ling;

//import edu.berkeley.nlp.ling.CategoryWordTag;

import java.io.StringReader;
import java.util.List;
import java.util.HashMap;

import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.syntax.Trees;
import edu.berkeley.nlp.treebank.PennTreebankLanguagePack;
import edu.berkeley.nlp.treebank.TreebankLanguagePack;

/**
 * Implements the HeadFinder found in Michael Collins' 1999 thesis. Except:
 * we've added a head rule for NX, which returns the leftmost item. No rule for
 * the head of NX is found in any of the versions of Collins' head table that we
 * have (did he perhaps use the NP rules for NX? -- no Bikel, CL, 2005 says it
 * defaults to leftmost). These rules are suitable for the Penn Treebank.
 * 
 * May 2004: Added support for AUX and AUXG to the VP rules; these cause no
 * interference in Penn Treebank parsing, but means that these rules also work
 * for the BLLIP corpus (or Charniak parser output in general). Feb 2005: Fixes
 * to coordination reheading so that punctuation cannot become head.
 *
 * @author Christopher Manning
 */

public class CollinsHeadFinder extends AbstractCollinsHeadFinder {

	public CollinsHeadFinder() {
		this(new PennTreebankLanguagePack());
	}

	protected int postOperationFix(int headIdx, List> daughterTrees) {
		if (headIdx >= 2) {
			String prevLab = daughterTrees.get(headIdx - 1).getLabel();
			if (prevLab.equals("CC")) {
				int newHeadIdx = headIdx - 2;
				Tree t = daughterTrees.get(newHeadIdx);
				while (newHeadIdx >= 0 && t.isPreTerminal()
						&& tlp.isPunctuationTag(t.getLabel())) {
					newHeadIdx--;
				}
				if (newHeadIdx >= 0) {
					headIdx = newHeadIdx;
				}
			}
		}
		return headIdx;
	}

	@SuppressWarnings("unchecked")
	public CollinsHeadFinder(TreebankLanguagePack tlp) {
		super(tlp);

		nonTerminalInfo = new HashMap();
		// This version from Collins' diss (1999: 236-238)
		nonTerminalInfo.put("ADJP", new String[][] { { "left", "NNS", "QP", "NN",
			"$", "ADVP", "JJ", "VBN", "VBG", "ADJP", "JJR", "NP", "JJS", "DT",
			"FW", "RBR", "RBS", "SBAR", "RB" } });
		nonTerminalInfo.put("ADVP", new String[][] { { "right", "RB", "RBR", "RBS",
			"FW", "ADVP", "TO", "CD", "JJR", "JJ", "IN", "NP", "JJS", "NN" } });
		nonTerminalInfo.put("CONJP",
				new String[][] { { "right", "CC", "RB", "IN" } });
		nonTerminalInfo.put("FRAG", new String[][] { { "right" } }); // crap
		nonTerminalInfo.put("INTJ", new String[][] { { "left" } });
		nonTerminalInfo.put("LST", new String[][] { { "right", "LS", ":" } });
		nonTerminalInfo.put("NAC", new String[][] { { "left", "NN", "NNS", "NNP",
			"NNPS", "NP", "NAC", "EX", "$", "CD", "QP", "PRP", "VBG", "JJ", "JJS",
			"JJR", "ADJP", "FW" } });
		nonTerminalInfo.put("NX", new String[][] { { "left" } }); // crap
		nonTerminalInfo.put("PP", new String[][] { { "right", "IN", "TO", "VBG",
			"VBN", "RP", "FW" } });
		// should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite)))
		nonTerminalInfo.put("PRN", new String[][] { { "left" } });
		nonTerminalInfo.put("PRT", new String[][] { { "right", "RP" } });
		nonTerminalInfo.put("QP", new String[][] { { "left", "$", "IN", "NNS",
			"NN", "JJ", "RB", "DT", "CD", "NCD", "QP", "JJR", "JJS" } });
		nonTerminalInfo.put("RRC", new String[][] { { "right", "VP", "NP", "ADVP",
			"ADJP", "PP" } });
		nonTerminalInfo.put("S", new String[][] { { "left", "TO", "IN", "VP", "S",
			"SBAR", "ADJP", "UCP", "NP" } });
		nonTerminalInfo.put("SBAR", new String[][] { { "left", "WHNP", "WHPP",
			"WHADVP", "WHADJP", "IN", "DT", "S", "SQ", "SINV", "SBAR", "FRAG" } });
		nonTerminalInfo.put("SBARQ", new String[][] { { "left", "SQ", "S", "SINV",
			"SBARQ", "FRAG" } });
		nonTerminalInfo.put("SINV", new String[][] { { "left", "VBZ", "VBD", "VBP",
			"VB", "MD", "VP", "S", "SINV", "ADJP", "NP" } });
		nonTerminalInfo.put("SQ", new String[][] { { "left", "VBZ", "VBD", "VBP",
			"VB", "MD", "VP", "SQ" } });
		nonTerminalInfo.put("UCP", new String[][] { { "right" } });
		nonTerminalInfo.put("VP", new String[][] { { "left", "TO", "VBD", "VBN",
			"MD", "VBZ", "VB", "VBG", "VBP", "AUX", "AUXG", "VP", "ADJP", "NN",
			"NNS", "NP" } });
		nonTerminalInfo.put("WHADJP", new String[][] { { "left", "CC", "WRB", "JJ",
		"ADJP" } });
		nonTerminalInfo.put("WHADVP", new String[][] { { "right", "CC", "WRB" } });
		nonTerminalInfo.put("WHNP", new String[][] { { "left", "WDT", "WP", "WP$",
			"WHADJP", "WHPP", "WHNP" } });
		nonTerminalInfo.put("WHPP",
				new String[][] { { "right", "IN", "TO", "FW" } });
		nonTerminalInfo.put("X", new String[][] { { "right" } }); // crap rule
		nonTerminalInfo.put("NP", new String[][] {
				{ "rightdis", "NN", "NNP", "NNPS", "NNS", "NX", "POS", "JJR" },
				{ "left", "NP" }, { "rightdis", "$", "ADJP", "PRN" },
				{ "right", "CD" }, { "rightdis", "JJ", "JJS", "RB", "QP" } });
		nonTerminalInfo.put("TYPO", new String[][] { { "left" } }); // another crap
		// rule, for
		// Switchboard
		// (Roger)
	}

	/**
	 * Go through trees and determine their heads and print them. Just for
	 * debuggin'. 

	 * Usage: 
	 * java edu.stanford.nlp.trees.CollinsHeadFinder treebankFilePath
	 * 
	 *
	 * @param args
	 *          The treebankFilePath
	 */

	public static void main(String[] args) {
		Trees.PennTreeReader reader = new Trees.PennTreeReader(new StringReader("((S (NP (DT the) (JJ quick) (JJ (AA (BB (CC brown)))) (NN fox)) (VP (VBD jumped) (PP (IN over) (NP (DT the) (JJ lazy) (NN dog)))) (. .)))"));
		Tree tree = reader.next();
		System.out.println("tree "+tree);

		CollinsHeadFinder headFinder = new CollinsHeadFinder();
		while (!tree.isLeaf()) {
			Tree head = headFinder.determineHead(tree);
			System.out.println("head "+head);
			tree=head;
		}
	}

	private static final long serialVersionUID = -8747319554557223437L;

}