All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.trees.CollinsHeadFinder Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.util.Generics;


/**
 * Implements the HeadFinder found in Michael Collins' 1999 thesis.
 * Except: we've added a head rule for NX, which returns the leftmost item.
 * No rule for the head of NX is found in any of the versions of
 * Collins' head table that we have (did he perhaps use the NP rules
 * for NX? -- no Bikel, CL, 2005 says it defaults to leftmost).
 * These rules are suitable for the Penn Treebank.
 * 

* May 2004: Added support for AUX and AUXG to the VP rules; these cause * no interference in Penn Treebank parsing, but means that these rules * also work for the BLLIP corpus (or Charniak parser output in general). * Feb 2005: Fixes to coordination reheading so that punctuation cannot * become head. * * @author Christopher Manning */ public class CollinsHeadFinder extends AbstractCollinsHeadFinder { private static final String[] EMPTY_STRING_ARRAY = {}; public CollinsHeadFinder() { this(new PennTreebankLanguagePack()); } /** This constructor provides the traditional behavior, where there is * no special avoidance of punctuation categories. * * @param tlp TreebankLanguagePack used for basic category function */ public CollinsHeadFinder(TreebankLanguagePack tlp) { this(tlp, EMPTY_STRING_ARRAY); } public CollinsHeadFinder(TreebankLanguagePack tlp, String... categoriesToAvoid) { super(tlp, categoriesToAvoid); nonTerminalInfo = Generics.newHashMap(); // This version from Collins' diss (1999: 236-238) nonTerminalInfo.put("ADJP", new String[][]{{"left", "NNS", "QP", "NN", "$", "ADVP", "JJ", "VBN", "VBG", "ADJP", "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB"}}); nonTerminalInfo.put("ADVP", new String[][]{{"right", "RB", "RBR", "RBS", "FW", "ADVP", "TO", "CD", "JJR", "JJ", "IN", "NP", "JJS", "NN"}}); nonTerminalInfo.put("CONJP", new String[][]{{"right", "CC", "RB", "IN"}}); nonTerminalInfo.put("FRAG", new String[][]{{"right"}}); // crap nonTerminalInfo.put("INTJ", new String[][]{{"left"}}); nonTerminalInfo.put("LST", new String[][]{{"right", "LS", ":"}}); nonTerminalInfo.put("NAC", new String[][]{{"left", "NN", "NNS", "NNP", "NNPS", "NP", "NAC", "EX", "$", "CD", "QP", "PRP", "VBG", "JJ", "JJS", "JJR", "ADJP", "FW"}}); nonTerminalInfo.put("NX", new String[][]{{"left"}}); // crap nonTerminalInfo.put("PP", new String[][]{{"right", "IN", "TO", "VBG", "VBN", "RP", "FW"}}); // should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite))) nonTerminalInfo.put("PRN", new String[][]{{"left"}}); nonTerminalInfo.put("PRT", new String[][]{{"right", "RP"}}); nonTerminalInfo.put("QP", new String[][]{{"left", "$", "IN", "NNS", "NN", "JJ", "RB", "DT", "CD", "NCD", "QP", "JJR", "JJS"}}); nonTerminalInfo.put("RRC", new String[][]{{"right", "VP", "NP", "ADVP", "ADJP", "PP"}}); nonTerminalInfo.put("S", new String[][]{{"left", "TO", "IN", "VP", "S", "SBAR", "ADJP", "UCP", "NP"}}); nonTerminalInfo.put("SBAR", new String[][]{{"left", "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT", "S", "SQ", "SINV", "SBAR", "FRAG"}}); nonTerminalInfo.put("SBARQ", new String[][]{{"left", "SQ", "S", "SINV", "SBARQ", "FRAG"}}); nonTerminalInfo.put("SINV", new String[][]{{"left", "VBZ", "VBD", "VBP", "VB", "MD", "VP", "S", "SINV", "ADJP", "NP"}}); nonTerminalInfo.put("SQ", new String[][]{{"left", "VBZ", "VBD", "VBP", "VB", "MD", "VP", "SQ"}}); nonTerminalInfo.put("UCP", new String[][]{{"right"}}); nonTerminalInfo.put("VP", new String[][]{{"left", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "AUX", "AUXG", "VP", "ADJP", "NN", "NNS", "NP"}}); nonTerminalInfo.put("WHADJP", new String[][]{{"left", "CC", "WRB", "JJ", "ADJP"}}); nonTerminalInfo.put("WHADVP", new String[][]{{"right", "CC", "WRB"}}); nonTerminalInfo.put("WHNP", new String[][]{{"left", "WDT", "WP", "WP$", "WHADJP", "WHPP", "WHNP"}}); nonTerminalInfo.put("WHPP", new String[][]{{"right", "IN", "TO", "FW"}}); nonTerminalInfo.put("X", new String[][]{{"right"}}); // crap rule nonTerminalInfo.put("NP", new String[][]{{"rightdis", "NN", "NNP", "NNPS", "NNS", "NX", "POS", "JJR"}, {"left", "NP"}, {"rightdis", "$", "ADJP", "PRN"}, {"right", "CD"}, {"rightdis", "JJ", "JJS", "RB", "QP"}}); nonTerminalInfo.put("TYPO", new String[][] {{"left"}}); // another crap rule, for Brown (Roger) nonTerminalInfo.put("EDITED", new String[][] {{"left"}}); // crap rule for Switchboard (if don't delete EDITED nodes) nonTerminalInfo.put("XS", new String[][] {{"right", "IN"}}); // rule for new structure in QP } @Override protected int postOperationFix(int headIdx, Tree[] daughterTrees) { if (headIdx >= 2) { String prevLab = tlp.basicCategory(daughterTrees[headIdx - 1].value()); if (prevLab.equals("CC") || prevLab.equals("CONJP")) { int newHeadIdx = headIdx - 2; Tree t = daughterTrees[newHeadIdx]; while (newHeadIdx >= 0 && t.isPreTerminal() && tlp.isPunctuationTag(t.value())) { newHeadIdx--; } if (newHeadIdx >= 0) { headIdx = newHeadIdx; } } } return headIdx; } /** * Go through trees and determine their heads and print them. * Just for debuggin'.
* Usage: * java edu.stanford.nlp.trees.CollinsHeadFinder treebankFilePath * * * @param args The treebankFilePath */ public static void main(String[] args) { Treebank treebank = new DiskTreebank(); CategoryWordTag.suppressTerminalDetails = true; treebank.loadPath(args[0]); final HeadFinder chf = new CollinsHeadFinder(); treebank.apply(pt -> { pt.percolateHeads(chf); pt.pennPrint(); System.out.println(); }); } private static final long serialVersionUID = -8747319554557223437L; }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy