edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

// Stanford Parser -- a probabilistic lexicalized NL CFG parser
// Copyright (c) 2002, 2003, 2004, 2005 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    [email protected]
//    http://nlp.stanford.edu/downloads/lex-parser.shtml

package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasContext;
import edu.stanford.nlp.ling.HasOffset;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.math.SloppyMath;
import edu.stanford.nlp.parser.KBestViterbiParser;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.PriorityQueue;

import java.util.*;
import java.util.regex.Matcher;

/** An exhaustive generalized CKY PCFG parser.
 *  Fairly carefully optimized to be fast.
 *
 *  If reusing this object for multiple parses, remember to correctly
 *  set any options such as the constraints field.
 *
 *  @author Dan Klein
 *  @author Christopher Manning (I seem to maintain it....)
 *  @author Jenny Finkel (N-best and sampling code, former from Liang/Chiang)
 */
public class ExhaustivePCFGParser implements Scorer, KBestViterbiParser {

  // public static long insideTime = 0;  // for profiling
  // public static long outsideTime = 0;

  protected final String goalStr;
  protected final Index<String> stateIndex;
  protected final Index<String> wordIndex;
  protected final Index<String> tagIndex;

  protected final TreeFactory tf;

  protected final BinaryGrammar bg;
  protected final UnaryGrammar ug;
  protected final Lexicon lex;
  protected final Options op;
  protected final TreebankLanguagePack tlp;

  protected OutsideRuleFilter orf;

  // inside scores
  protected float[][][] iScore;  // start idx, end idx, state -> logProb (ragged; null for end <= start)
  // outside scores
  protected float[][][] oScore;  // start idx, end idx, state -> logProb
  protected float bestScore;

  protected int[][][] wordsInSpan; // number of words in span with this state

  protected boolean[][] oFilteredStart; // [start][state]; only used by unused outsideRuleFilter
  protected boolean[][] oFilteredEnd;   // [end][state]; only used by unused outsideRuleFilter

  protected boolean[][] iPossibleByL; // [start][state]
  protected boolean[][] iPossibleByR; // [end][state]
  protected boolean[][] oPossibleByL; // [start][state]
  protected boolean[][] oPossibleByR; // [end][state]

  protected int[] words;  // words of sentence being parsed as word Numberer ints
  private int[] beginOffsets;
  private int[] endOffsets;
  private CoreLabel[] originalCoreLabels;
  private HasTag[] originalTags;
  protected int length; // one larger than true length of sentence; includes boundary symbol in count

  protected boolean[][] tags;
  protected int myMaxLength = -0xDEADBEEF;

  protected final int numStates;
  protected int arraySize = 0;

  /**
   * When you want to force the parser to parse a particular
   * subsequence into a particular state.  Parses will only be made
   * where there is a constituent over the given span which matches
   * (as regular expression) the state Pattern given.  See the
   * documentation of the ParserConstraint class for information on
   * specifying a ParserConstraint.
   *
   * Implementation note: It would be cleaner to make this a
   * Collections.emptyList, but that actually significantly slows down
   * the processing in the case of empty lists.  Checking for null
   * saves quite a bit of time.
   */
  protected List<ParserConstraint> constraints = null;
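  /* Illustrative usage sketch for the constraints field (not part of the
   * original source).  This assumes ParserConstraint exposes a
   * (start, end, regex) constructor; see that class for the authoritative API.
   *
   *   // require words 2-4 (span [2,5)) to be parsed as an NP (or NP-* subcategory)
   *   List<ParserConstraint> cs = new ArrayList<ParserConstraint>();
   *   cs.add(new ParserConstraint(2, 5, "NP.*"));
   *   parser.constraints = cs;   // parser is an ExhaustivePCFGParser
   *   parser.parse(sentence);    // chart cells crossing [2,5) are now pruned
   */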
  private CoreLabel getCoreLabel(int labelIndex) {
    if (originalCoreLabels[labelIndex] != null) {
      CoreLabel terminalLabel = originalCoreLabels[labelIndex];
      if (terminalLabel.value() == null && terminalLabel.word() != null) {
        terminalLabel.setValue(terminalLabel.word());
      }
      return terminalLabel;
    }

    String wordStr = wordIndex.get(words[labelIndex]);
    CoreLabel terminalLabel = new CoreLabel();
    terminalLabel.setValue(wordStr);
    terminalLabel.setWord(wordStr);
    terminalLabel.setBeginPosition(beginOffsets[labelIndex]);
    terminalLabel.setEndPosition(endOffsets[labelIndex]);
    if (originalTags[labelIndex] != null) {
      terminalLabel.setTag(originalTags[labelIndex].tag());
    }
    return terminalLabel;
  }

  @Override
  public double oScore(Edge edge) {
    double oS = oScore[edge.start][edge.end][edge.state];
    if (op.testOptions.pcfgThreshold) {
      double iS = iScore[edge.start][edge.end][edge.state];
      if (iS + oS - bestScore < op.testOptions.pcfgThresholdValue) {
        return Double.NEGATIVE_INFINITY;
      }
    }
    return oS;
  }

  @Override
  public double iScore(Edge edge) {
    return iScore[edge.start][edge.end][edge.state];
  }

  @Override
  public boolean oPossible(Hook hook) {
    return (hook.isPreHook() ? oPossibleByR[hook.end][hook.state] : oPossibleByL[hook.start][hook.state]);
  }

  @Override
  public boolean iPossible(Hook hook) {
    return (hook.isPreHook() ? iPossibleByR[hook.start][hook.subState] : iPossibleByL[hook.end][hook.subState]);
  }

  public boolean oPossibleL(int state, int start) { return oPossibleByL[start][state]; }

  public boolean oPossibleR(int state, int end) { return oPossibleByR[end][state]; }

  public boolean iPossibleL(int state, int start) { return iPossibleByL[start][state]; }

  public boolean iPossibleR(int state, int end) { return iPossibleByR[end][state]; }

  protected void buildOFilter() {
    oFilteredStart = new boolean[length][numStates];
    oFilteredEnd = new boolean[length + 1][numStates];
    orf.init();
    for (int start = 0; start < length; start++) {
      orf.leftAccepting(oFilteredStart[start]);
      orf.advanceRight(tags[start]);
    }
    for (int end = length; end > 0; end--) {
      orf.rightAccepting(oFilteredEnd[end]);
      orf.advanceLeft(tags[end - 1]);
    }
  }

  public double validateBinarizedTree(Tree tree, int start) {
    if (tree.isLeaf()) {
      return 0.0;
    }
    float epsilon = 0.0001f;
    if (tree.isPreTerminal()) {
      String wordStr = tree.children()[0].label().value();
      int tag = tagIndex.indexOf(tree.label().value());
      int word = wordIndex.indexOf(wordStr);
      IntTaggedWord iTW = new IntTaggedWord(word, tag);
      float score = lex.score(iTW, start, wordStr, null);
      float bound = iScore[start][start + 1][stateIndex.indexOf(tree.label().value())];
      if (score > bound + epsilon) {
        System.out.println("Invalid tagging:");
        System.out.println("  Tag: " + tree.label().value());
        System.out.println("  Word: " + tree.children()[0].label().value());
        System.out.println("  Score: " + score);
        System.out.println("  Bound: " + bound);
      }
      return score;
    }
    int parent = stateIndex.indexOf(tree.label().value());
    int firstChild = stateIndex.indexOf(tree.children()[0].label().value());
    if (tree.numChildren() == 1) {
      UnaryRule ur = new UnaryRule(parent, firstChild);
      double score = SloppyMath.max(ug.scoreRule(ur), -10000.0) + validateBinarizedTree(tree.children()[0], start);
      double bound = iScore[start][start + tree.yield().size()][parent];
      if (score > bound + epsilon) {
System.out.println("Invalid unary:"); System.out.println(" Parent: " + tree.label().value()); System.out.println(" Child: " + tree.children()[0].label().value()); System.out.println(" Start: " + start); System.out.println(" End: " + (start + tree.yield().size())); System.out.println(" Score: " + score); System.out.println(" Bound: " + bound); } return score; } int secondChild = stateIndex.indexOf(tree.children()[1].label().value()); BinaryRule br = new BinaryRule(parent, firstChild, secondChild); double score = SloppyMath.max(bg.scoreRule(br), -10000.0) + validateBinarizedTree(tree.children()[0], start) + validateBinarizedTree(tree.children()[1], start + tree.children()[0].yield().size()); double bound = iScore[start][start + tree.yield().size()][parent]; if (score > bound + epsilon) { System.out.println("Invalid binary:"); System.out.println(" Parent: " + tree.label().value()); System.out.println(" LChild: " + tree.children()[0].label().value()); System.out.println(" RChild: " + tree.children()[1].label().value()); System.out.println(" Start: " + start); System.out.println(" End: " + (start + tree.yield().size())); System.out.println(" Score: " + score); System.out.println(" Bound: " + bound); } return score; } // needs to be set up so that uses same Train options... public Tree scoreNonBinarizedTree(Tree tree) { TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op); tree = binarizer.transformTree(tree); scoreBinarizedTree(tree, 0); return op.tlpParams.subcategoryStripper().transformTree(new Debinarizer(op.forceCNF).transformTree(tree)); // return debinarizer.transformTree(t); } // public double scoreBinarizedTree(Tree tree, int start) { if (tree.isLeaf()) { return 0.0; } if (tree.isPreTerminal()) { String wordStr = tree.children()[0].label().value(); int tag = tagIndex.indexOf(tree.label().value()); int word = wordIndex.indexOf(wordStr); IntTaggedWord iTW = new IntTaggedWord(word, tag); // if (lex.score(iTW,(leftmost ? 
  public double scoreBinarizedTree(Tree tree, int start) {
    if (tree.isLeaf()) {
      return 0.0;
    }
    if (tree.isPreTerminal()) {
      String wordStr = tree.children()[0].label().value();
      int tag = tagIndex.indexOf(tree.label().value());
      int word = wordIndex.indexOf(wordStr);
      IntTaggedWord iTW = new IntTaggedWord(word, tag);
      // if (lex.score(iTW, (leftmost ? 0 : 1)) == Double.NEGATIVE_INFINITY) {
      //   System.out.println("NO SCORE FOR: " + iTW);
      // }
      float score = lex.score(iTW, start, wordStr, null);
      tree.setScore(score);
      return score;
    }
    int parent = stateIndex.indexOf(tree.label().value());
    int firstChild = stateIndex.indexOf(tree.children()[0].label().value());
    if (tree.numChildren() == 1) {
      UnaryRule ur = new UnaryRule(parent, firstChild);
      //+ DEBUG
      // if (ug.scoreRule(ur) < -10000) {
      //   System.out.println("Grammar doesn't have rule: " + ur);
      // }
      // return SloppyMath.max(ug.scoreRule(ur), -10000.0) + scoreBinarizedTree(tree.children()[0], leftmost);
      double score = ug.scoreRule(ur) + scoreBinarizedTree(tree.children()[0], start);
      tree.setScore(score);
      return score;
    }
    int secondChild = stateIndex.indexOf(tree.children()[1].label().value());
    BinaryRule br = new BinaryRule(parent, firstChild, secondChild);
    //+ DEBUG
    // if (bg.scoreRule(br) < -10000) {
    //   System.out.println("Grammar doesn't have rule: " + br);
    // }
    // return SloppyMath.max(bg.scoreRule(br), -10000.0) +
    //        scoreBinarizedTree(tree.children()[0], leftmost) +
    //        scoreBinarizedTree(tree.children()[1], false);
    double score = bg.scoreRule(br) + scoreBinarizedTree(tree.children()[0], start) + scoreBinarizedTree(tree.children()[1], start + tree.children()[0].yield().size());
    tree.setScore(score);
    return score;
  }

  static final boolean spillGuts = false;
  static final boolean dumpTagging = false;

  private long time = System.currentTimeMillis();

  protected void tick(String str) {
    long time2 = System.currentTimeMillis();
    long diff = time2 - time;
    time = time2;
    System.err.print("done. " + diff + "\n" + str);
  }

  protected boolean floodTags = false;
  protected List<? extends HasWord> sentence = null;
  protected Lattice lr = null;

  protected int[][] narrowLExtent; // = null; // [end][state]: the rightmost left extent of state s ending at position i
  protected int[][] wideLExtent;   // = null; // [end][state]: the leftmost left extent of state s ending at position i
  protected int[][] narrowRExtent; // = null; // [start][state]: the leftmost right extent of state s starting at position i
  protected int[][] wideRExtent;   // = null; // [start][state]: the rightmost right extent of state s starting at position i

  protected final boolean[] isTag; // this records whether grammar states (stateIndex) correspond to POS tags
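  /* How the extent arrays prune CKY split points (explanatory sketch, not in
   * the original source).  For a binary rule P -> L R over [start,end), only
   * split points where L can actually end and R can actually begin need to be
   * tried.  The four arrays bracket those possibilities, so the inner loop
   * becomes, schematically:
   *
   *   int min = Math.max(narrowRExtent[start][L], wideLExtent[end][R]);
   *   int max = Math.min(wideRExtent[start][L], narrowLExtent[end][R]);
   *   for (int split = min; split <= max; split++) { ... }
   *
   * If min > max the rule is skipped outright.  This is a large part of why
   * the parser runs much faster than a naive O(n^3 * |grammar|) CKY loop.
   */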
  public boolean parse(List<? extends HasWord> sentence) {
    lr = null; // better nullPointer exception than silent error
    // System.out.println("is it a taggedword? " + (sentence.get(0) instanceof TaggedWord)); // debugging
    if (sentence != this.sentence) {
      this.sentence = sentence;
      floodTags = false;
    }
    if (op.testOptions.verbose) {
      Timing.tick("Starting pcfg parse.");
    }
    if (spillGuts) {
      tick("Starting PCFG parse...");
    }
    length = sentence.size();
    if (length > arraySize) {
      considerCreatingArrays(length);
    }
    int goal = stateIndex.indexOf(goalStr);
    if (op.testOptions.verbose) {
      // System.out.println(numStates + " states, " + goal + " is the goal state.");
      // System.err.println(new ArrayList(ug.coreRules.keySet()));
      System.err.print("Initializing PCFG...");
    }
    // map input words to words array (wordIndex ints)
    words = new int[length];
    beginOffsets = new int[length];
    endOffsets = new int[length];
    originalCoreLabels = new CoreLabel[length];
    originalTags = new HasTag[length];
    int unk = 0;
    StringBuilder unkWords = new StringBuilder("[");
    // int unkIndex = wordIndex.size();
    for (int i = 0; i < length; i++) {
      String s = sentence.get(i).word();
      if (sentence.get(i) instanceof HasOffset) {
        HasOffset word = (HasOffset) sentence.get(i);
        beginOffsets[i] = word.beginPosition();
        endOffsets[i] = word.endPosition();
      } else {
        // Storing the positions of the word interstices
        // Account for single space between words
        beginOffsets[i] = ((i == 0) ? 0 : endOffsets[i - 1] + 1);
        endOffsets[i] = beginOffsets[i] + s.length();
      }
      if (sentence.get(i) instanceof CoreLabel) {
        originalCoreLabels[i] = (CoreLabel) sentence.get(i);
      }
      if (sentence.get(i) instanceof HasTag) {
        HasTag tag = (HasTag) sentence.get(i);
        if (tag.tag() != null) {
          originalTags[i] = tag;
        }
      }
      if (op.testOptions.verbose && (!wordIndex.contains(s) || !lex.isKnown(wordIndex.indexOf(s)))) {
        unk++;
        unkWords.append(' ');
        unkWords.append(s);
        unkWords.append(" { ");
        for (int jj = 0; jj < s.length(); jj++) {
          char ch = s.charAt(jj);
          unkWords.append(Character.getType(ch)).append(" ");
        }
        unkWords.append("}");
      }
      // TODO: really, add a new word?
      // words[i] = wordIndex.indexOf(s, unkIndex);
      // if (words[i] == unkIndex) {
      //   ++unkIndex;
      // }
      words[i] = wordIndex.addToIndex(s);
      // if (wordIndex.contains(s)) {
      //   words[i] = wordIndex.indexOf(s);
      // } else {
      //   words[i] = wordIndex.indexOf(Lexicon.UNKNOWN_WORD);
      // }
    }

    // initialize inside and outside score arrays
    if (spillGuts) {
      tick("Wiping arrays...");
    }
    if (Thread.interrupted()) {
      throw new RuntimeInterruptedException();
    }
    for (int start = 0; start < length; start++) {
      for (int end = start + 1; end <= length; end++) {
        Arrays.fill(iScore[start][end], Float.NEGATIVE_INFINITY);
        if (op.doDep && ! op.testOptions.useFastFactored) {
          Arrays.fill(oScore[start][end], Float.NEGATIVE_INFINITY);
        }
        if (op.testOptions.lengthNormalization) {
          Arrays.fill(wordsInSpan[start][end], 1);
        }
      }
    }
    if (Thread.interrupted()) {
      throw new RuntimeInterruptedException();
    }
    for (int loc = 0; loc <= length; loc++) {
      Arrays.fill(narrowLExtent[loc], -1); // the rightmost left with state s ending at i that we can get is the beginning
      Arrays.fill(wideLExtent[loc], length + 1); // the leftmost left with state s ending at i that we can get is the end
    }
    for (int loc = 0; loc < length; loc++) {
      Arrays.fill(narrowRExtent[loc], length + 1); // the leftmost right with state s starting at i that we can get is the end
      Arrays.fill(wideRExtent[loc], -1); // the rightmost right with state s starting at i that we can get is the beginning
    }
    // int puncTag = stateIndex.indexOf(".");
    // boolean lastIsPunc = false;
    if (op.testOptions.verbose) {
      Timing.tick("done.");
      unkWords.append(" ]");
      op.tlpParams.pw(System.err).println("Unknown words: " + unk + " " + unkWords);
      System.err.print("Starting filters...");
    }
    if (Thread.interrupted()) {
      throw new RuntimeInterruptedException();
    }
    // do tags
    if (spillGuts) {
      tick("Tagging...");
    }
    initializeChart(sentence);
    // if (op.testOptions.outsideFilter)
    //   buildOFilter();
    if (op.testOptions.verbose) {
      Timing.tick("done.");
      System.err.print("Starting insides...");
    }
    // do the inside probabilities
    doInsideScores();
    if (op.testOptions.verbose) {
      // insideTime += Timing.tick("done.");
      Timing.tick("done.");
      System.out.println("PCFG parsing " + length + " words (incl. stop): insideScore = " + iScore[0][length][goal]);
    }
    bestScore = iScore[0][length][goal];
    boolean succeeded = hasParse();
    if (op.testOptions.doRecovery && !succeeded && !floodTags) {
      floodTags = true; // sentence will try to reparse
      // ms: disabled message. this is annoying and it doesn't really provide much information
      // System.err.println("Trying recovery parse...");
      return parse(sentence);
    }
    if ( ! op.doDep || op.testOptions.useFastFactored) {
      return succeeded;
    }
    if (op.testOptions.verbose) {
      System.err.print("Starting outsides...");
    }
    // outside scores
    oScore[0][length][goal] = 0.0f;
    doOutsideScores();
    // System.out.println("State rate: " + ((int) (1000 * ohits / otries)) / 10.0);
    // System.out.println("Traversals: " + ohits);
    if (op.testOptions.verbose) {
      // outsideTime += Timing.tick("Done.");
      Timing.tick("done.");
    }
    if (op.doDep) {
      initializePossibles();
    }
    if (Thread.interrupted()) {
      throw new RuntimeInterruptedException();
    }
    return succeeded;
  }
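  /* Minimal end-to-end usage sketch (illustrative; not in the original
   * source).  In practice this class is usually driven through
   * LexicalizedParser, which owns the grammar, lexicon, and options; the
   * direct calls below assume you already hold a constructed
   * ExhaustivePCFGParser.
   *
   *   List<HasWord> toks = SentenceUtils.toWordList("The", "dog", "barked", ".");
   *   if (parser.parse(toks)) {                  // fills iScore (and oScore if needed)
   *     double logProb = parser.getBestScore();  // Viterbi log probability of goal
   *     Tree best = parser.getBestParse();       // reconstructed without backpointers
   *   }
   *
   * SentenceUtils.toWordList is from edu.stanford.nlp.ling in newer releases;
   * substitute any way of building a List of HasWord tokens.
   */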
  public boolean parse(HTKLatticeReader lr) {
    // TODO wsg 20-jan-2010
    // There are presently 2 issues with HTK lattice parsing:
    //  (1) The initializeChart() method present in rev. 19820 did not properly initialize
    //      lattices (or sub-lattices) like this (where A,B,C are nodes, and NN is the POS tag arc label):
    //
    //        --NN--> B --NN--
    //       /                \
    //      A ------NN-------> C
    //
    //  (2) extractBestParse() was not implemented properly.
    //
    // To re-implement support for HTKLatticeReader it is necessary to create an interface
    // for the two different lattice implementations and then modify initializeChart() and
    // extractBestParse() as appropriate. Another solution would be to duplicate these two
    // methods and make the necessary changes for HTKLatticeReader. In both cases, the
    // acoustic model score provided by the HTK lattices should be included in the weighting.
    //
    // Note that I never actually tested HTKLatticeReader, so I am uncertain if this facility
    // actually worked in the first place.
    // System.err.printf("%s: HTK lattice parsing presently disabled.\n", this.getClass().getName());
    return false;
  }

  public boolean parse(Lattice lr) {
    sentence = null; // better nullPointer exception than silent error
    if (lr != this.lr) {
      this.lr = lr;
      floodTags = false;
    }
    if (op.testOptions.verbose) Timing.tick("Doing lattice PCFG parse...");

    // The number of whitespace nodes in the lattice
    length = lr.getNumNodes() - 1; // Subtract 1 since considerCreatingArrays will add the final interstice
    if (length > arraySize) considerCreatingArrays(length);
    int goal = stateIndex.indexOf(goalStr);

    // if (op.testOptions.verbose) {
    //   System.err.println("Unaries: " + ug.rules());
    //   System.err.println("Binaries: " + bg.rules());
    //   System.err.println("Initializing PCFG...");
    //   System.err.println("   " + numStates + " states, " + goal + " is the goal state.");
    // }

    // System.err.println("Tagging states");
    // for (int i = 0; i < numStates; i++) {
    //   if (isTag[i]) {
    //     int tagId = Numberer.translate(stateSpace, "tags", i);
    //     String tag = (String) tagNumberer.object(tagId);
    //     System.err.printf("  %d: %s\n", i, tag);
    //   }
    // }

    // Create a map of all words in the lattice
    //
    // int numEdges = lr.getNumEdges();
    // words = new int[numEdges];
    // offsets = new IntPair[numEdges];
    //
    // int unk = 0;
    // int i = 0;
    // StringBuilder unkWords = new StringBuilder("[");
    // for (LatticeEdge edge : lr) {
    //   String s = edge.word;
    //   if (op.testOptions.verbose && !lex.isKnown(wordNumberer.number(s))) {
    //     unk++;
    //     unkWords.append(" " + s);
    //   }
    //   words[i++] = wordNumberer.number(s);
    // }

    for (int start = 0; start < length; start++) {
      for (int end = start + 1; end <= length; end++) {
        Arrays.fill(iScore[start][end], Float.NEGATIVE_INFINITY);
        if (op.doDep) Arrays.fill(oScore[start][end], Float.NEGATIVE_INFINITY);
      }
    }
    for (int loc = 0; loc <= length; loc++) {
      Arrays.fill(narrowLExtent[loc], -1); // the rightmost left with state s ending at i that we can get is the beginning
      Arrays.fill(wideLExtent[loc], length + 1); // the leftmost left with state s ending at i that we can get is the end
    }
    for (int loc = 0; loc < length; loc++) {
      Arrays.fill(narrowRExtent[loc], length + 1); // the leftmost right with state s starting at i that we can get is the end
      Arrays.fill(wideRExtent[loc], -1); // the rightmost right with state s starting at i that we can get is the beginning
    }

    initializeChart(lr);
    doInsideScores();
    bestScore = iScore[0][length][goal];
    if (op.testOptions.verbose) {
      Timing.tick("done.");
      System.err.println("PCFG " + length + " words (incl. stop) iScore " + bestScore);
    }
    boolean succeeded = hasParse();

    // Try a recovery parse
    if (!succeeded && op.testOptions.doRecovery && !floodTags) {
      floodTags = true;
      System.err.printf(this.getClass().getName() + ": Parse failed. Trying recovery parse...");
      succeeded = parse(lr);
      if (!succeeded) return false;
    }

    oScore[0][length][goal] = 0.0f;
    doOutsideScores();
    if (op.testOptions.verbose) {
      Timing.tick("done.");
    }
    if (op.doDep) {
      initializePossibles();
    }
    return succeeded;
  }

  /** These arrays are used by the factored parser (only) during edge combination.
   *  The method assumes that the iScore and oScore arrays have been initialized.
   */
  protected void initializePossibles() {
    for (int loc = 0; loc < length; loc++) {
      Arrays.fill(iPossibleByL[loc], false);
      Arrays.fill(oPossibleByL[loc], false);
    }
    for (int loc = 0; loc <= length; loc++) {
      Arrays.fill(iPossibleByR[loc], false);
      Arrays.fill(oPossibleByR[loc], false);
    }
    for (int start = 0; start < length; start++) {
      for (int end = start + 1; end <= length; end++) {
        for (int state = 0; state < numStates; state++) {
          if (iScore[start][end][state] > Float.NEGATIVE_INFINITY && oScore[start][end][state] > Float.NEGATIVE_INFINITY) {
            iPossibleByL[start][state] = true;
            iPossibleByR[end][state] = true;
            oPossibleByL[start][state] = true;
            oPossibleByR[end][state] = true;
          }
        }
      }
    }
  }

  private void doOutsideScores() {
    for (int diff = length; diff >= 1; diff--) {
      if (Thread.interrupted()) {
        throw new RuntimeInterruptedException();
      }
      for (int start = 0; start + diff <= length; start++) {
        int end = start + diff;
        // do unaries
        for (int s = 0; s < numStates; s++) {
          float oS = oScore[start][end][s];
          if (oS == Float.NEGATIVE_INFINITY) {
            continue;
          }
          UnaryRule[] rules = ug.closedRulesByParent(s);
          for (UnaryRule ur : rules) {
            float pS = ur.score;
            float tot = oS + pS;
            if (tot > oScore[start][end][ur.child] && iScore[start][end][ur.child] > Float.NEGATIVE_INFINITY) {
              oScore[start][end][ur.child] = tot;
            }
          }
        }
        // do binaries
        for (int s = 0; s < numStates; s++) {
          int min1 = narrowRExtent[start][s];
          if (end < min1) {
            continue;
          }
          BinaryRule[] rules = bg.splitRulesWithLC(s);
          for (BinaryRule br : rules) {
            float oS = oScore[start][end][br.parent];
            if (oS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            int max1 = narrowLExtent[end][br.rightChild];
            if (max1 < min1) {
              continue;
            }
            int min = min1;
            int max = max1;
            if (max - min > 2) {
              int min2 = wideLExtent[end][br.rightChild];
              min = (min1 > min2 ? min1 : min2);
              if (max1 < min) {
                continue;
              }
              int max2 = wideRExtent[start][br.leftChild];
              max = (max1 < max2 ? max1 : max2);
              if (max < min) {
                continue;
              }
            }
            float pS = br.score;
            for (int split = min; split <= max; split++) {
              float lS = iScore[start][split][br.leftChild];
              if (lS == Float.NEGATIVE_INFINITY) {
                continue;
              }
              float rS = iScore[split][end][br.rightChild];
              if (rS == Float.NEGATIVE_INFINITY) {
                continue;
              }
              float totL = pS + rS + oS;
              if (totL > oScore[start][split][br.leftChild]) {
                oScore[start][split][br.leftChild] = totL;
              }
              float totR = pS + lS + oS;
              if (totR > oScore[split][end][br.rightChild]) {
                oScore[split][end][br.rightChild] = totR;
              }
            }
          }
        }
        for (int s = 0; s < numStates; s++) {
          int max1 = narrowLExtent[end][s];
          if (max1 < start) {
            continue;
          }
          BinaryRule[] rules = bg.splitRulesWithRC(s);
          for (BinaryRule br : rules) {
            float oS = oScore[start][end][br.parent];
            if (oS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            int min1 = narrowRExtent[start][br.leftChild];
            if (max1 < min1) {
              continue;
            }
            int min = min1;
            int max = max1;
            if (max - min > 2) {
              int min2 = wideLExtent[end][br.rightChild];
              min = (min1 > min2 ? min1 : min2);
              if (max1 < min) {
                continue;
              }
              int max2 = wideRExtent[start][br.leftChild];
              max = (max1 < max2 ? max1 : max2);
              if (max < min) {
                continue;
              }
            }
            float pS = br.score;
            for (int split = min; split <= max; split++) {
              float lS = iScore[start][split][br.leftChild];
              if (lS == Float.NEGATIVE_INFINITY) {
                continue;
              }
              float rS = iScore[split][end][br.rightChild];
              if (rS == Float.NEGATIVE_INFINITY) {
                continue;
              }
              float totL = pS + rS + oS;
              if (totL > oScore[start][split][br.leftChild]) {
                oScore[start][split][br.leftChild] = totL;
              }
              float totR = pS + lS + oS;
              if (totR > oScore[split][end][br.rightChild]) {
                oScore[split][end][br.rightChild] = totR;
              }
            }
          }
        }
        /*
        for (int s = 0; s < numStates; s++) {
          float oS = oScore[start][end][s];
          // if (iScore[start][end][s] == Float.NEGATIVE_INFINITY ||
          //     oS == Float.NEGATIVE_INFINITY)
          if (oS == Float.NEGATIVE_INFINITY)
            continue;
          BinaryRule[] rules = bg.splitRulesWithParent(s);
          for (int r = 0; r < rules.length; r++) {
            BinaryRule br = rules[r];
            int min1 = narrowRExtent[start][br.leftChild];
            if (end < min1)
              continue;
            int max1 = narrowLExtent[end][br.rightChild];
            if (max1 < min1)
              continue;
            int min2 = wideLExtent[end][br.rightChild];
            int min = (min1 > min2 ? min1 : min2);
            if (max1 < min)
              continue;
            int max2 = wideRExtent[start][br.leftChild];
            int max = (max1 < max2 ? max1 : max2);
            if (max < min)
              continue;
            float pS = (float) br.score;
            for (int split = min; split <= max; split++) {
              float lS = iScore[start][split][br.leftChild];
              if (lS == Float.NEGATIVE_INFINITY)
                continue;
              float rS = iScore[split][end][br.rightChild];
              if (rS == Float.NEGATIVE_INFINITY)
                continue;
              float totL = pS + rS + oS;
              if (totL > oScore[start][split][br.leftChild]) {
                oScore[start][split][br.leftChild] = totL;
              }
              float totR = pS + lS + oS;
              if (totR > oScore[split][end][br.rightChild]) {
                oScore[split][end][br.rightChild] = totR;
              }
            }
          }
        }
        */
      }
    }
  }
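  /* Outside-score recurrence implemented above (explanatory note, not in the
   * original source).  With all scores as log probabilities and max in place
   * of sum (Viterbi), for a binary rule P -> L R over [start,end) split at s:
   *
   *   oScore[start][s][L]  >=  ruleScore + iScore[s][end][R] + oScore[start][end][P]
   *   oScore[s][end][R]    >=  ruleScore + iScore[start][s][L] + oScore[start][end][P]
   *
   * and for a unary rule P -> C over the same span:
   *
   *   oScore[start][end][C]  >=  ruleScore + oScore[start][end][P]
   *
   * Spans are processed from longest to shortest, so parent scores are final
   * before children are updated.  iScore + oScore then bounds the best parse
   * through a constituent, which is what oScore(Edge) uses for thresholding.
   */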
  /** Fills in the iScore array of each category over each span
   *  of length 2 or more.
   */
  void doInsideScores() {
    for (int diff = 2; diff <= length; diff++) {
      if (Thread.interrupted()) {
        throw new RuntimeInterruptedException();
      }
      // usually stop one short because boundary symbol only combines
      // with whole sentence span. So for 3 word sentence + boundary = 4,
      // length == 4, and do [0,2], [1,3]; [0,3]; [0,4]
      for (int start = 0; start < ((diff == length) ? 1 : length - diff); start++) {
        doInsideChartCell(diff, start);
      } // for start
    } // for diff (i.e., span)
  } // end doInsideScores()


  private void doInsideChartCell(final int diff, final int start) {
    final boolean lengthNormalization = op.testOptions.lengthNormalization;
    if (spillGuts) {
      tick("Binaries for span " + diff + " start " + start + " ...");
    }
    int end = start + diff;

    final List<ParserConstraint> constraints = getConstraints();
    if (constraints != null) {
      for (ParserConstraint c : constraints) {
        if ((start > c.start && start < c.end && end > c.end) || (end > c.start && end < c.end && start < c.start)) {
          return;
        }
      }
    }

    // 2011-11-26 jdk1.6: caching/hoisting a bunch of variables gives you about 15% speed up!
    // caching this saves a bit of time in the inner loop, maybe 1.8%
    int[] narrowRExtent_start = narrowRExtent[start];
    // caching this saved 2% in the inner loop
    int[] wideRExtent_start = wideRExtent[start];
    int[] narrowLExtent_end = narrowLExtent[end];
    int[] wideLExtent_end = wideLExtent[end];
    float[][] iScore_start = iScore[start];
    float[] iScore_start_end = iScore_start[end];

    for (int leftState = 0; leftState < numStates; leftState++) {
      int narrowR = narrowRExtent_start[leftState];
      if (narrowR >= end) { // can this left constituent leave space for a right constituent?
        continue;
      }
      BinaryRule[] leftRules = bg.splitRulesWithLC(leftState);
      // if (spillGuts) System.out.println("Found " + leftRules.length + " left rules for state " + stateIndex.get(leftState));
      for (BinaryRule rule : leftRules) {
        int rightChild = rule.rightChild;
        int narrowL = narrowLExtent_end[rightChild];
        if (narrowL < narrowR) { // can this right constituent fit next to the left constituent?
          continue;
        }
        int min2 = wideLExtent_end[rightChild];
        int min = (narrowR > min2 ? narrowR : min2);
        // Erik Frey 2009-12-17: This is unnecessary: narrowR is <= narrowL (established in previous check) and wideLExtent[e][r] is always <= narrowLExtent[e][r] by design, so the check will never evaluate true.
        // if (min > narrowL) { // can this right constituent stretch far enough to reach the left constituent?
        //   continue;
        // }
        int max1 = wideRExtent_start[leftState];
        int max = (max1 < narrowL ? max1 : narrowL);
        if (min > max) { // can this left constituent stretch far enough to reach the right constituent?
          continue;
        }
        float pS = rule.score;
        int parentState = rule.parent;
        float oldIScore = iScore_start_end[parentState];
        float bestIScore = oldIScore;
        boolean foundBetter; // always set below for this rule
        // System.out.println("Min " + min + " max " + max + " start " + start + " end " + end);

        if ( ! lengthNormalization) {
          // find the split that can use this rule to make the max score
          for (int split = min; split <= max; split++) {
            if (constraints != null) {
              boolean skip = false;
              for (ParserConstraint c : constraints) {
                if (((start < c.start && end >= c.end) || (start <= c.start && end > c.end)) && split > c.start && split < c.end) {
                  skip = true;
                  break;
                }
                if ((start == c.start && split == c.end)) {
                  String tag = stateIndex.get(leftState);
                  Matcher m = c.state.matcher(tag);
                  if (!m.matches()) {
                    skip = true;
                    break;
                  }
                }
                if ((split == c.start && end == c.end)) {
                  String tag = stateIndex.get(rightChild);
                  Matcher m = c.state.matcher(tag);
                  if (!m.matches()) {
                    skip = true;
                    break;
                  }
                }
              }
              if (skip) {
                continue;
              }
            }
            float lS = iScore_start[split][leftState];
            if (lS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float rS = iScore[split][end][rightChild];
            if (rS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float tot = pS + lS + rS;
            if (spillGuts) {
              System.err.println("Rule " + rule + " over [" + start + "," + end + ") has log score " + tot + " from L[" + stateIndex.get(leftState) + "=" + leftState + "] = " + lS + " R[" + stateIndex.get(rightChild) + "=" + rightChild + "] = " + rS);
            }
            if (tot > bestIScore) {
              bestIScore = tot;
            }
          } // for split point
          foundBetter = bestIScore > oldIScore;
        } else {
          // find split that uses this rule to make the max *length normalized* score
          int bestWordsInSpan = wordsInSpan[start][end][parentState];
          float oldNormIScore = oldIScore / bestWordsInSpan;
          float bestNormIScore = oldNormIScore;

          for (int split = min; split <= max; split++) {
            float lS = iScore_start[split][leftState];
            if (lS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float rS = iScore[split][end][rightChild];
            if (rS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float tot = pS + lS + rS;
            int newWordsInSpan = wordsInSpan[start][split][leftState] + wordsInSpan[split][end][rightChild];
            float normTot = tot / newWordsInSpan;
            if (normTot > bestNormIScore) {
              bestIScore = tot;
              bestNormIScore = normTot;
              bestWordsInSpan = newWordsInSpan;
            }
          } // for split point
          foundBetter = bestNormIScore > oldNormIScore;
          if (foundBetter) {
            wordsInSpan[start][end][parentState] = bestWordsInSpan;
          }
        } // fi op.testOptions.lengthNormalization
        if (foundBetter) { // this way of making "parentState" is better than previous
          iScore_start_end[parentState] = bestIScore;
          if (spillGuts) System.err.println("Could build " + stateIndex.get(parentState) + " from " + start + " to " + end + " score " + bestIScore);
          if (oldIScore == Float.NEGATIVE_INFINITY) {
            if (start > narrowLExtent_end[parentState]) {
              narrowLExtent_end[parentState] = wideLExtent_end[parentState] = start;
            } else if (start < wideLExtent_end[parentState]) {
              wideLExtent_end[parentState] = start;
            }
            if (end < narrowRExtent_start[parentState]) {
              narrowRExtent_start[parentState] = wideRExtent_start[parentState] = end;
            } else if (end > wideRExtent_start[parentState]) {
              wideRExtent_start[parentState] = end;
            }
          }
        } // end if foundBetter
      } // end for leftRules
    } // end for leftState

    // do right restricted rules
    for (int rightState = 0; rightState < numStates; rightState++) {
      int narrowL = narrowLExtent_end[rightState];
      if (narrowL <= start) {
        continue;
      }
      BinaryRule[] rightRules = bg.splitRulesWithRC(rightState);
      // if (spillGuts) System.out.println("Found " + rightRules.length + " right rules for state " + stateIndex.get(rightState));
      for (BinaryRule rule : rightRules) {
        // if (spillGuts) System.out.println("Considering rule for " + start + " to " + end + ": " + rightRules[i]);
        int leftChild = rule.leftChild;
        int narrowR = narrowRExtent_start[leftChild];
        if (narrowR > narrowL) {
          continue;
        }
        int min2 = wideLExtent_end[rightState];
        int min = (narrowR > min2 ? narrowR : min2);
        // Erik Frey 2009-12-17: This is unnecessary: narrowR is <= narrowL (established in previous check) and wideLExtent[e][r] is always <= narrowLExtent[e][r] by design, so the check will never evaluate true.
        // if (min > narrowL) {
        //   continue;
        // }
        int max1 = wideRExtent_start[leftChild];
        int max = (max1 < narrowL ? max1 : narrowL);
        if (min > max) {
          continue;
        }
        float pS = rule.score;
        int parentState = rule.parent;
        float oldIScore = iScore_start_end[parentState];
        float bestIScore = oldIScore;
        boolean foundBetter; // always initialized below
        // System.out.println("Start " + start + " end " + end + " min " + min + " max " + max);
        if ( ! lengthNormalization) {
          // find the split that can use this rule to make the max score
          for (int split = min; split <= max; split++) {
            if (constraints != null) {
              boolean skip = false;
              for (ParserConstraint c : constraints) {
                if (((start < c.start && end >= c.end) || (start <= c.start && end > c.end)) && split > c.start && split < c.end) {
                  skip = true;
                  break;
                }
                if ((start == c.start && split == c.end)) {
                  String tag = stateIndex.get(leftChild);
                  Matcher m = c.state.matcher(tag);
                  if (!m.matches()) {
                    // if (!tag.startsWith(c.state + "^")) {
                    skip = true;
                    break;
                  }
                }
                if ((split == c.start && end == c.end)) {
                  String tag = stateIndex.get(rightState);
                  Matcher m = c.state.matcher(tag);
                  if (!m.matches()) {
                    // if (!tag.startsWith(c.state + "^")) {
                    skip = true;
                    break;
                  }
                }
              }
              if (skip) {
                continue;
              }
            }
            float lS = iScore_start[split][leftChild];
            // cdm [2012]: Test whether removing these 2 tests might speed things up because less branching?
            // jab [2014]: oddly enough, removing these tests helps the chinese parser but not the english parser.
            if (lS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float rS = iScore[split][end][rightState];
            if (rS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float tot = pS + lS + rS;
            if (tot > bestIScore) {
              bestIScore = tot;
            }
          } // end for split
          foundBetter = bestIScore > oldIScore;
        } else {
          // find split that uses this rule to make the max *length normalized* score
          int bestWordsInSpan = wordsInSpan[start][end][parentState];
          float oldNormIScore = oldIScore / bestWordsInSpan;
          float bestNormIScore = oldNormIScore;
          for (int split = min; split <= max; split++) {
            float lS = iScore_start[split][leftChild];
            if (lS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float rS = iScore[split][end][rightState];
            if (rS == Float.NEGATIVE_INFINITY) {
              continue;
            }
            float tot = pS + lS + rS;
            int newWordsInSpan = wordsInSpan[start][split][leftChild] + wordsInSpan[split][end][rightState];
            float normTot = tot / newWordsInSpan;
            if (normTot > bestNormIScore) {
              bestIScore = tot;
              bestNormIScore = normTot;
              bestWordsInSpan = newWordsInSpan;
            }
          } // end for split
          foundBetter = bestNormIScore > oldNormIScore;
          if (foundBetter) {
            wordsInSpan[start][end][parentState] = bestWordsInSpan;
          }
        } // end if lengthNormalization
        if (foundBetter) { // this way of making "parentState" is better than previous
          iScore_start_end[parentState] = bestIScore;
          if (spillGuts) System.err.println("Could build " + stateIndex.get(parentState) + " from " + start + " to " + end + " with score " + bestIScore);
          if (oldIScore == Float.NEGATIVE_INFINITY) {
            if (start > narrowLExtent_end[parentState]) {
              narrowLExtent_end[parentState] = wideLExtent_end[parentState] = start;
            } else if (start < wideLExtent_end[parentState]) {
              wideLExtent_end[parentState] = start;
            }
            if (end < narrowRExtent_start[parentState]) {
              narrowRExtent_start[parentState] = wideRExtent_start[parentState] = end;
            } else if (end > wideRExtent_start[parentState]) {
              wideRExtent_start[parentState] = end;
            }
          }
        } // end if foundBetter
      } // for rightRules
    } // for rightState

    if (spillGuts) {
      tick("Unaries for span " + diff + "...");
    }
    // do unary rules -- one could promote this loop and put start inside
    for (int state = 0; state < numStates; state++) {
      float iS = iScore_start_end[state];
      if (iS == Float.NEGATIVE_INFINITY) {
        continue;
      }
      UnaryRule[] unaries = ug.closedRulesByChild(state);
      for (UnaryRule ur : unaries) {
        if (constraints != null) {
          boolean skip = false;
          for (ParserConstraint c : constraints) {
            if ((start == c.start && end == c.end)) {
              String tag = stateIndex.get(ur.parent);
              Matcher m = c.state.matcher(tag);
              if (!m.matches()) {
                // if (!tag.startsWith(c.state + "^")) {
                skip = true;
                break;
              }
            }
          }
          if (skip) {
            continue;
          }
        }
        int parentState = ur.parent;
        float pS = ur.score;
        float tot = iS + pS;
        float cur = iScore_start_end[parentState];
        boolean foundBetter; // always set below
        if (lengthNormalization) {
          int totWordsInSpan = wordsInSpan[start][end][state];
          float normTot = tot / totWordsInSpan;
          int curWordsInSpan = wordsInSpan[start][end][parentState];
          float normCur = cur / curWordsInSpan;
          foundBetter = normTot > normCur;
          if (foundBetter) {
            wordsInSpan[start][end][parentState] = wordsInSpan[start][end][state];
          }
        } else {
          foundBetter = (tot > cur);
        }
        if (foundBetter) {
          if (spillGuts) System.err.println("Could build " + stateIndex.get(parentState) + " from " + start + " to " + end + " with score " + tot);
          iScore_start_end[parentState] = tot;
          if (cur == Float.NEGATIVE_INFINITY) {
            if (start > narrowLExtent_end[parentState]) {
              narrowLExtent_end[parentState] = wideLExtent_end[parentState] = start;
            } else if (start < wideLExtent_end[parentState]) {
              wideLExtent_end[parentState] = start;
            }
            if (end < narrowRExtent_start[parentState]) {
              narrowRExtent_start[parentState] = wideRExtent_start[parentState] = end;
            } else if (end > wideRExtent_start[parentState]) {
              wideRExtent_start[parentState] = end;
            }
          }
        } // end if foundBetter
      } // for UnaryRule r
    } // for unary rules
  }

  private void initializeChart(Lattice lr) {
    for (LatticeEdge edge : lr) {
      int start = edge.start;
      int end = edge.end;
      String word = edge.word;

      // Add pre-terminals, augmented with edge weights
      for (int state = 0; state < numStates; state++) {
        if (isTag[state]) {
          IntTaggedWord itw = new IntTaggedWord(word, stateIndex.get(state), wordIndex, tagIndex);
          float newScore = lex.score(itw, start, word, null) + (float) edge.weight;
          if (newScore > iScore[start][end][state]) {
            iScore[start][end][state] = newScore;
            narrowRExtent[start][state] = Math.min(end, narrowRExtent[start][state]);
            narrowLExtent[end][state] = Math.max(start, narrowLExtent[end][state]);
            wideRExtent[start][state] = Math.max(end, wideRExtent[start][state]);
            wideLExtent[end][state] = Math.min(start, wideLExtent[end][state]);
          }
        }
      }

      // Give scores to all tags if the parse fails (more flexible tagging)
      if (floodTags && (!op.testOptions.noRecoveryTagging)) {
        for (int state = 0; state < numStates; state++) {
          float iS = iScore[start][end][state];
          if (isTag[state] && iS == Float.NEGATIVE_INFINITY) {
            iScore[start][end][state] = -1000.0f + (float) edge.weight;
            narrowRExtent[start][state] = end;
            narrowLExtent[end][state] = start;
            wideRExtent[start][state] = end;
            wideLExtent[end][state] = start;
          }
        }
      }

      // Add unary rules (possibly chains) that terminate in POS tags
      for (int state = 0; state < numStates; state++) {
        float iS = iScore[start][end][state];
        if (iS == Float.NEGATIVE_INFINITY) {
          continue;
        }
        UnaryRule[] unaries = ug.closedRulesByChild(state);
        for (UnaryRule ur : unaries) {
          int parentState = ur.parent;
          float pS = ur.score;
          float tot = iS + pS;
          if (tot > iScore[start][end][parentState]) {
            iScore[start][end][parentState] = tot;
            narrowRExtent[start][parentState] = Math.min(end, narrowRExtent[start][parentState]);
            narrowLExtent[end][parentState] = Math.max(start, narrowLExtent[end][parentState]);
            wideRExtent[start][parentState] = Math.max(end, wideRExtent[start][parentState]);
            wideLExtent[end][parentState] = Math.min(start, wideLExtent[end][parentState]);
            // narrowRExtent[start][parentState] = start + 1; // end
            // narrowLExtent[end][parentState] = end - 1; // start
            // wideRExtent[start][parentState] = start + 1; // end
            // wideLExtent[end][parentState] = end - 1; // start
          }
        }
      }
    }
  }
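  /* Note on lattice initialization (explanatory, not in the original source):
   * each lattice edge seeds the chart cell for its own [start,end) span, and
   * the lexicon score is augmented with the edge's weight, so alternative
   * paths through the lattice compete inside one CKY chart.  Schematically,
   * for an edge e carrying word w and a tag state t:
   *
   *   iScore[e.start][e.end][t] = max(iScore[e.start][e.end][t],
   *                                   lex.score(w, t) + e.weight);
   */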
  private void initializeChart(List<? extends HasWord> sentence) {
    int boundary = wordIndex.indexOf(Lexicon.BOUNDARY);

    for (int start = 0; start < length; start++) {
      if (op.testOptions.maxSpanForTags > 1) {
        // only relevant for parsing single words as multiple input tokens.
        // todo [cdm 2012]: This case seems buggy in never doing unaries over span 1 items
        // note we don't look for "words" including the end symbol!
        for (int end = start + 1; (end < length - 1 && end - start <= op.testOptions.maxSpanForTags) || (start + 1 == end); end++) {
          StringBuilder word = new StringBuilder();
          // wsg: Feb 2010 - Appears to support character-level parsing
          for (int i = start; i < end; i++) {
            if (sentence.get(i) instanceof HasWord) {
              HasWord cl = sentence.get(i);
              word.append(cl.word());
            } else {
              word.append(sentence.get(i).toString());
            }
          }
          for (int state = 0; state < numStates; state++) {
            float iS = iScore[start][end][state];
            if (iS == Float.NEGATIVE_INFINITY && isTag[state]) {
              IntTaggedWord itw = new IntTaggedWord(word.toString(), stateIndex.get(state), wordIndex, tagIndex);
              iScore[start][end][state] = lex.score(itw, start, word.toString(), null);
              if (iScore[start][end][state] > Float.NEGATIVE_INFINITY) {
                narrowRExtent[start][state] = start + 1;
                narrowLExtent[end][state] = end - 1;
                wideRExtent[start][state] = start + 1;
                wideLExtent[end][state] = end - 1;
              }
            }
          }
        }
      } else { // "normal" chart initialization of the [start,start+1] cell
        int word = words[start];
        int end = start + 1;
        Arrays.fill(tags[start], false);

        float[] iScore_start_end = iScore[start][end];
        int[] narrowRExtent_start = narrowRExtent[start];
        int[] narrowLExtent_end = narrowLExtent[end];
        int[] wideRExtent_start = wideRExtent[start];
        int[] wideLExtent_end = wideLExtent[end];

        // Force tags
        String trueTagStr = null;
        if (sentence.get(start) instanceof HasTag) {
          trueTagStr = ((HasTag) sentence.get(start)).tag();
          if ("".equals(trueTagStr)) {
            trueTagStr = null;
          }
        }

        // Another option for forcing tags: supply a regex
        String candidateTagRegex = null;
        if (sentence.get(start) instanceof CoreLabel) {
          candidateTagRegex = ((CoreLabel) sentence.get(start)).get(ParserAnnotations.CandidatePartOfSpeechAnnotation.class);
          if ("".equals(candidateTagRegex)) {
            candidateTagRegex = null;
          }
        }

        // Word context (e.g., morphosyntactic info)
        String wordContextStr = null;
        if (sentence.get(start) instanceof HasContext) {
          wordContextStr = ((HasContext) sentence.get(start)).originalText();
          if ("".equals(wordContextStr)) wordContextStr = null;
        }
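        /* Illustrative sketch of constraining the tagger from the input
         * tokens (not in the original source).  A CoreLabel token can either
         * force a tag outright (HasTag) or restrict it to a regex of
         * candidate tags via ParserAnnotations.CandidatePartOfSpeechAnnotation,
         * both of which are read just above:
         *
         *   CoreLabel tok = new CoreLabel();
         *   tok.setWord("can");
         *   tok.setValue("can");
         *   tok.setTag("MD");  // hard-force the modal reading
         *   // or, softer: allow only modal or noun readings
         *   // tok.set(ParserAnnotations.CandidatePartOfSpeechAnnotation.class, "MD|NN");
         */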
        boolean assignedSomeTag = false;
        if ( ! floodTags || word == boundary) {
          // in this case we generate the taggings in the lexicon,
          // which may itself be tagging flexibly or using a strict lexicon.
          if (dumpTagging) {
            EncodingPrintWriter.err.println("Normal tagging " + wordIndex.get(word) + " [" + word + "]", "UTF-8");
          }
          for (Iterator<IntTaggedWord> taggingI = lex.ruleIteratorByWord(word, start, wordContextStr); taggingI.hasNext(); ) {
            IntTaggedWord tagging = taggingI.next();
            int state = stateIndex.indexOf(tagIndex.get(tagging.tag));
            // if word was supplied with a POS tag, skip all taggings
            // not basicCategory() compatible with supplied tag.
            if (trueTagStr != null) {
              if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).equals(trueTagStr)) ||
                  (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).startsWith(trueTagStr))) {
                if (dumpTagging) {
                  EncodingPrintWriter.err.println("  Skipping " + tagging + " as it doesn't match trueTagStr: " + trueTagStr, "UTF-8");
                }
                continue;
              }
            }
            if (candidateTagRegex != null) {
              if ((!op.testOptions.forceTagBeginnings && !tlp.basicCategory(tagging.tagString(tagIndex)).matches(candidateTagRegex)) ||
                  (op.testOptions.forceTagBeginnings && !tagging.tagString(tagIndex).matches(candidateTagRegex))) {
                if (dumpTagging) {
                  EncodingPrintWriter.err.println("  Skipping " + tagging + " as it doesn't match candidateTagRegex: " + candidateTagRegex, "UTF-8");
                }
                continue;
              }
            }
            // try {
            float lexScore = lex.score(tagging, start, wordIndex.get(tagging.word), wordContextStr); // score the cell according to P(word|tag) in the lexicon
            if (lexScore > Float.NEGATIVE_INFINITY) {
              assignedSomeTag = true;
              iScore_start_end[state] = lexScore;
              narrowRExtent_start[state] = end;
              narrowLExtent_end[state] = start;
              wideRExtent_start[state] = end;
              wideLExtent_end[state] = start;
            }
            // } catch (Exception e) {
            //   e.printStackTrace();
            //   System.out.println("State: " + state + " tags " + Numberer.getGlobalNumberer("tags").object(tagging.tag));
            // }
            int tag = tagging.tag;
            tags[start][tag] = true;
            if (dumpTagging) {
              EncodingPrintWriter.err.println("Word pos " + start + " tagging " + tagging + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
            }
            // if (start == length - 2 && tagging.parent == puncTag)
            //   lastIsPunc = true;
          }
        } // end if ( ! floodTags || word == boundary)

        if ( ! assignedSomeTag) {
          // If you got here, either you were using forceTags (gold tags)
          // and the gold tag was not seen with that word in the training data
          // or we are in floodTags=true (recovery parse) mode.
          // Here, we give words all tags for
          // which the lexicon score is not -Inf, not just seen or
          // specified taggings
          if (dumpTagging) {
            EncodingPrintWriter.err.println("Forced FlexiTagging " + wordIndex.get(word), "UTF-8");
          }
          for (int state = 0; state < numStates; state++) {
            if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
              if (trueTagStr != null) {
                String tagString = stateIndex.get(state);
                if ( ! tlp.basicCategory(tagString).equals(trueTagStr)) {
                  continue;
                }
              }
              float lexScore = lex.score(new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state))), start, wordIndex.get(word), wordContextStr);
              if (candidateTagRegex != null) {
                String tagString = stateIndex.get(state);
                if (!tlp.basicCategory(tagString).matches(candidateTagRegex)) {
                  continue;
                }
              }
              if (lexScore > Float.NEGATIVE_INFINITY) {
                iScore_start_end[state] = lexScore;
                narrowRExtent_start[state] = end;
                narrowLExtent_end[state] = start;
                wideRExtent_start[state] = end;
                wideLExtent_end[state] = start;
              }
              if (dumpTagging) {
                EncodingPrintWriter.err.println("Word pos " + start + " tagging " + (new IntTaggedWord(word, tagIndex.indexOf(stateIndex.get(state)))) + " score " + iScore_start_end[state] + " [state " + stateIndex.get(state) + " = " + state + "]", "UTF-8");
              }
            }
          }
        } // end if ! assignedSomeTag

        // tag multi-counting
        if (op.dcTags) {
          for (int state = 0; state < numStates; state++) {
            if (isTag[state]) {
              iScore_start_end[state] *= (1.0 + op.testOptions.depWeight);
            }
          }
        }
        if (floodTags && (!op.testOptions.noRecoveryTagging) && !(word == boundary)) {
          // if parse failed because of tag coverage, we put in all tags with
          // a score of -1000, by fiat.  You get here from the invocation of
          // parse(ls) inside parse(ls) *after* floodTags has been turned on.
          // Search above for "floodTags = true".
          if (dumpTagging) {
            EncodingPrintWriter.err.println("Flooding tags for " + wordIndex.get(word), "UTF-8");
          }
          for (int state = 0; state < numStates; state++) {
            if (isTag[state] && iScore_start_end[state] == Float.NEGATIVE_INFINITY) {
              iScore_start_end[state] = -1000.0f;
              narrowRExtent_start[state] = end;
              narrowLExtent_end[state] = start;
              wideRExtent_start[state] = end;
              wideLExtent_end[state] = start;
            }
          }
        }

        // Apply unary rules in diagonal cells of chart
        if (spillGuts) {
          tick("Terminal Unary...");
        }
        for (int state = 0; state < numStates; state++) {
          float iS = iScore_start_end[state];
          if (iS == Float.NEGATIVE_INFINITY) {
            continue;
          }
          UnaryRule[] unaries = ug.closedRulesByChild(state);
          for (UnaryRule ur : unaries) {
            int parentState = ur.parent;
            float pS = ur.score;
            float tot = iS + pS;
            if (tot > iScore_start_end[parentState]) {
              iScore_start_end[parentState] = tot;
              narrowRExtent_start[parentState] = end;
              narrowLExtent_end[parentState] = start;
              wideRExtent_start[parentState] = end;
              wideLExtent_end[parentState] = start;
            }
          }
        }
        if (spillGuts) {
          tick("Next word...");
        }
      }
    } // end for start
  } // end initializeChart(List sentence)

  @Override
  public boolean hasParse() {
    return getBestScore() > Double.NEGATIVE_INFINITY;
  }

  private static final double TOL = 1e-5;

  protected static boolean matches(double x, double y) {
    return (Math.abs(x - y) / (Math.abs(x) + Math.abs(y) + 1e-10) < TOL);
  }

  @Override
  public double getBestScore() {
    return getBestScore(goalStr);
  }

  public double getBestScore(String stateName) {
    if (length > arraySize) {
      return Double.NEGATIVE_INFINITY;
    }
    if (!stateIndex.contains(stateName)) {
      return Double.NEGATIVE_INFINITY;
    }
    int goal = stateIndex.indexOf(stateName);
    if (iScore == null || iScore.length == 0 || iScore[0].length <= length || iScore[0][length].length <= goal) {
      return Double.NEGATIVE_INFINITY;
    }
    return iScore[0][length][goal];
  }

  @Override
  public Tree getBestParse() {
    Tree internalTree = extractBestParse(goalStr, 0, length);
    // System.out.println("Got internal best parse...");
    if (internalTree == null) {
      System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse");
    }
    // else {
    //   restoreUnaries(internalTree);
    // }
    // System.out.println("Restored unaries...");
    return internalTree;
    // TreeTransformer debinarizer = BinarizerFactory.getDebinarizer();
    // return debinarizer.transformTree(internalTree);
  }
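  /* How best-parse extraction works without backpointers (explanatory note,
   * not in the original source).  The chart stores only scores, so
   * extractBestParse re-derives the tree: at each span it looks for a rule
   * and split point whose component scores re-add to the stored chart score,
   * using the relative-tolerance comparison in matches() to absorb float
   * rounding.  Schematically, a binary backtrace step checks:
   *
   *   matches(br.score + iScore[start][split][br.leftChild]
   *                    + iScore[split][end][br.rightChild],
   *           iScore[start][end][goal])
   *
   * Skipping backpointer bookkeeping keeps the inner CKY loops lean at the
   * cost of this small re-search at extraction time.
   */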
  /** Return the best parse of some category/state over a certain span. */
  protected Tree extractBestParse(String goalStr, int start, int end) {
    return extractBestParse(stateIndex.indexOf(goalStr), start, end);
  }

  private Tree extractBestParse(int goal, int start, int end) {
    // find source of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal]) : bestScore;
    String goalStr = stateIndex.get(goal);

    // check tags
    if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
      if (op.testOptions.maxSpanForTags > 1) {
        Tree wordNode = null;
        if (sentence != null) {
          StringBuilder word = new StringBuilder();
          for (int i = start; i < end; i++) {
            if (sentence.get(i) instanceof HasWord) {
              HasWord cl = (HasWord) sentence.get(i);
              word.append(cl.word());
            } else {
              word.append(sentence.get(i).toString());
            }
          }
          wordNode = tf.newLeaf(word.toString());
        } else if (lr != null) {
          List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
          for (LatticeEdge edge : latticeEdges) {
            IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);
            float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
            if (matches(bestScore, tagScore + (float) edge.weight)) {
              wordNode = tf.newLeaf(edge.word);
              if (wordNode.label() instanceof CoreLabel) {
                CoreLabel cl = (CoreLabel) wordNode.label();
                cl.setBeginPosition(start);
                cl.setEndPosition(end);
              }
              break;
            }
          }
          if (wordNode == null) {
            throw new RuntimeException("could not find matching word from lattice in parse reconstruction");
          }
        } else {
          throw new RuntimeException("attempt to get word when sentence and lattice are null!");
        }
        Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
        tagNode.setScore(bestScore);
        if (originalTags[start] != null) {
          tagNode.label().setValue(originalTags[start].tag());
        }
        return tagNode;
      } else { // normal lexicon is single words case
        IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
        String contextStr = getCoreLabel(start).originalText();
        float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
        if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
          // return a pre-terminal tree
          CoreLabel terminalLabel = getCoreLabel(start);
          Tree wordNode = tf.newLeaf(terminalLabel);
          Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
          tagNode.setScore(bestScore);
          if (terminalLabel.tag() != null) {
            tagNode.label().setValue(terminalLabel.tag());
          }
          if (tagNode.label() instanceof HasTag) {
            ((HasTag) tagNode.label()).setTag(tagNode.label().value());
          }
          return tagNode;
        }
      }
    }
    // check binaries first
    for (int split = start + 1; split < end; split++) {
      for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext(); ) {
        BinaryRule br = binaryI.next();
        double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild];
        boolean matches;
        if (op.testOptions.lengthNormalization) {
          double normScore = score / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]);
          matches = matches(normScore, normBestScore);
        } else {
          matches = matches(score, bestScore);
        }
        if (matches) {
          // build binary split
          Tree leftChildTree = extractBestParse(br.leftChild, start, split);
          Tree rightChildTree = extractBestParse(br.rightChild, split, end);
          List<Tree> children = new ArrayList<Tree>();
          children.add(leftChildTree);
          children.add(rightChildTree);
          Tree result = tf.newTreeNode(goalStr, children);
          result.setScore(score);
          // System.err.println("  Found Binary node: " + result);
          return result;
        }
      }
    }
    // check unaries
    // note that even though we parse with the unary-closed grammar, we can
    // extract the best parse with the non-unary-closed grammar, since all
    // the intermediate states in the chain must have been built, and hence
    // we can exploit the sparser space and reconstruct the full tree as we
    // go.
    // for (Iterator unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext(); ) {
      UnaryRule ur = unaryI.next();
      // System.err.println("  Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
      double score = ur.score + iScore[start][end][ur.child];
      boolean matches;
      if (op.testOptions.lengthNormalization) {
        double normScore = score / wordsInSpan[start][end][ur.child];
        matches = matches(normScore, normBestScore);
      } else {
        matches = matches(score, bestScore);
      }
      if (ur.child != ur.parent && matches) {
        // build unary
        Tree childTree = extractBestParse(ur.child, start, end);
        Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
        // System.err.println("  Matched!  Unary node: " + result);
        result.setScore(score);
        return result;
      }
    }
    System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start + ", " + end + "] looking for " + goalStr);
    return null;
  }

  /* -----------------------
  // No longer needed: extractBestParse restores unaries as it goes
  protected void restoreUnaries(Tree t) {
    // System.out.println("In restoreUnaries...");
    for (Tree node : t) {
      System.err.println("Doing node: " + node.label());
      if (node.isLeaf() || node.isPreTerminal() || node.numChildren() != 1) {
        // System.out.println("Skipping node: " + node.label());
        continue;
      }
      // System.out.println("Not skipping node: " + node.label());
      Tree parent = node;
      Tree child = node.children()[0];
      List path = ug.getBestPath(stateIndex.indexOf(parent.label().value()), stateIndex.indexOf(child.label().value()));
      System.err.println("Got path: " + path);
      int pos = 1;
      while (pos < path.size() - 1) {
        int interState = ((Integer) path.get(pos)).intValue();
        Tree intermediate = tf.newTreeNode(new StringLabel(stateIndex.get(interState)), parent.getChildrenAsList());
        parent.setChildren(Collections.singletonList(intermediate));
        pos++;
      }
      // System.out.println("Done with node: " + node.label());
    }
  }
  ---------------------- */

  /**
   * Return all best parses (except no ties allowed on POS tags?).
   * Even though we parse with the unary-closed grammar, since all the
   * intermediate states in a chain must have been built, we can
   * reconstruct the unary chain as we go using the non-unary-closed grammar.
   */
  protected List<Tree> extractBestParses(int goal, int start, int end) {
    // find sources of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    String goalStr = stateIndex.get(goal);
    // System.out.println("Searching for " + goalStr + " from " + start + " to " + end + " scored " + bestScore);
    // check tags
    if (end - start == 1 && tagIndex.contains(goalStr)) {
      IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
      String contextStr = getCoreLabel(start).originalText();
      float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
      if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
        // return a pre-terminal tree
        String wordStr = wordIndex.get(words[start]);
        Tree wordNode = tf.newLeaf(wordStr);
        Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
        if (originalTags[start] != null) {
          tagNode.label().setValue(originalTags[start].tag());
        }
        // System.out.println("Tag node: " + tagNode);
        return Collections.singletonList(tagNode);
      }
    }
    // check binaries first
    List<Tree> bestTrees = new ArrayList<Tree>();
    for (int split = start + 1; split < end; split++) {
      for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext(); ) {
        BinaryRule br = binaryI.next();
        double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild];
        if (matches(score, bestScore)) {
          // build binary split
          List<Tree> leftChildTrees = extractBestParses(br.leftChild, start, split);
          List<Tree> rightChildTrees = extractBestParses(br.rightChild, split, end);
          // System.out.println("Found a best way to build " + goalStr + "(" +
          //                    start + "," + end + ") with " +
          //                    leftChildTrees.size() + "x" +
          //                    rightChildTrees.size() + " ways to build.");
          for (Tree leftChildTree : leftChildTrees) {
            for (Tree rightChildTree : rightChildTrees) {
              List<Tree> children = new ArrayList<Tree>();
              children.add(leftChildTree);
              children.add(rightChildTree);
              Tree result = tf.newTreeNode(goalStr, children);
              // System.out.println("Binary node: " + result);
              bestTrees.add(result);
            }
          }
        }
      }
    }
    // check unaries
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext(); ) {
      UnaryRule ur = unaryI.next();
      double score = ur.score + iScore[start][end][ur.child];
      if (ur.child != ur.parent && matches(score, bestScore)) {
        // build unary
        List<Tree> childTrees = extractBestParses(ur.child, start, end);
        for (Tree childTree : childTrees) {
          Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
          // System.out.println("Unary node: " + result);
          bestTrees.add(result);
        }
      }
    }
    if (bestTrees.isEmpty()) {
      System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start + ", " + end + "] looking for " + goalStr);
    }
    return bestTrees;
  }

  /** Get k good parses for the sentence.  It is expected that the
   *  parses returned approximate the k best parses, but without any
   *  guarantee that the exact list of k best parses has been produced.
   *
   *  @param k The number of good parses to return
   *  @return A list of k good parses for the sentence, with
   *          each accompanied by its score
   */
  @Override
  public List<ScoredObject<Tree>> getKGoodParses(int k) {
    return getKBestParses(k);
  }

  /** Get k parse samples for the sentence.  It is expected that the
   *  parses are sampled based on their relative probability.
   *
   *  @param k The number of sampled parses to return
   *  @return A list of k parse samples for the sentence, with
   *          each accompanied by its score
   */
  @Override
  public List<ScoredObject<Tree>> getKSampledParses(int k) {
    throw new UnsupportedOperationException("ExhaustivePCFGParser doesn't sample.");
  }
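  /* Usage sketch for exact k-best parsing (illustrative, not in the original
   * source).  After a successful parse() call, the Huang & Chiang lazy
   * k-best procedure below can enumerate the top parses with their scores:
   *
   *   for (ScoredObject<Tree> p : parser.getKBestParses(10)) {
   *     System.out.println(p.score() + "\t" + p.object());
   *   }
   *
   * Derivations are expanded lazily, so asking for a small k does only a
   * little work beyond the 1-best parse.
   */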
   *
   *  @param k The number of sampled parses to return
   *  @return A list of k parse samples for the sentence, with
   *          each accompanied by its score
   */
  @Override
  public List<ScoredObject<Tree>> getKSampledParses(int k) {
    throw new UnsupportedOperationException("ExhaustivePCFGParser doesn't sample.");
  }


  //
  // BEGIN K-BEST STUFF
  // taken straight out of "Better k-best Parsing" by Liang Huang and David
  // Chiang
  //

  /** Get the exact k best parses for the sentence.
   *
   *  @param k The number of best parses to return
   *  @return The exact k best parses for the sentence, with
   *          each accompanied by its score (typically a
   *          negative log probability).
   */
  @Override
  public List<ScoredObject<Tree>> getKBestParses(int k) {
    cand = Generics.newHashMap();
    dHat = Generics.newHashMap();

    int start = 0;
    int end = length;
    int goal = stateIndex.indexOf(goalStr);

    Vertex v = new Vertex(goal, start, end);
    List<ScoredObject<Tree>> kBestTrees = new ArrayList<ScoredObject<Tree>>();
    for (int i = 1; i <= k; i++) {
      Tree internalTree = getTree(v, i, k);
      if (internalTree == null) {
        break;
      }
      // restoreUnaries(internalTree);
      kBestTrees.add(new ScoredObject<Tree>(internalTree, dHat.get(v).get(i - 1).score));
    }
    return kBestTrees;
  }

  /** Get the kth best tree while calculating the kPrime best overall (e.g., the 2nd best of 5). */
  private Tree getTree(Vertex v, int k, int kPrime) {
    lazyKthBest(v, k, kPrime);
    String goalStr = stateIndex.get(v.goal);
    int start = v.start;
    // int end = v.end;
    List<Derivation> dHatV = dHat.get(v);
    if (isTag[v.goal] && v.start + 1 == v.end) {
      IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
      String contextStr = getCoreLabel(start).originalText();
      float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
      if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
        // return a pre-terminal tree
        CoreLabel terminalLabel = getCoreLabel(start);
        Tree wordNode = tf.newLeaf(terminalLabel);
        Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
        if (originalTags[start] != null) {
          tagNode.label().setValue(originalTags[start].tag());
        }
        if (tagNode.label() instanceof HasTag) {
          ((HasTag) tagNode.label()).setTag(tagNode.label().value());
        }
        return tagNode;
      } else {
        assert false;
      }
    }
    if (k - 1 >= dHatV.size()) {
      return null;
    }
    Derivation d = dHatV.get(k - 1);
    List<Tree> children = new ArrayList<Tree>();
    for (int i = 0; i < d.arc.size(); i++) {
      Vertex child = d.arc.tails.get(i);
      Tree t = getTree(child, d.j.get(i), kPrime);
      assert (t != null);
      children.add(t);
    }
    return tf.newTreeNode(goalStr, children);
  }

  private static class Vertex {
    public final int goal;
    public final int start;
    public final int end;

    public Vertex(int goal, int start, int end) {
      this.goal = goal;
      this.start = start;
      this.end = end;
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof Vertex)) {
        return false;
      }
      Vertex v = (Vertex) o;
      return (v.goal == goal && v.start == start && v.end == end);
    }

    private int hc = -1;

    @Override
    public int hashCode() {
      if (hc == -1) {
        hc = goal + (17 * (start + (17 * end)));
      }
      return hc;
    }

    @Override
    public String toString() {
      return goal + "[" + start + "," + end + "]";
    }
  }

  private static class Arc {
    public final List<Vertex> tails;
    public final Vertex head;
    public final double ruleScore; // for convenience

    public Arc(List<Vertex> tails, Vertex head, double ruleScore) {
      this.tails = Collections.unmodifiableList(tails);
      this.head = head;
      this.ruleScore = ruleScore;
      // TODO: add check that rule is compatible with head and tails!
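      // (An Arc is a hyperedge in the packed parse forest: one rule
      // application linking the child Vertex spans in tails to head.)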
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof Arc)) {
        return false;
      }
      Arc a = (Arc) o;
      return a.head.equals(head) && a.tails.equals(tails);
    }

    private int hc = -1;

    @Override
    public int hashCode() {
      if (hc == -1) {
        hc = head.hashCode() + (17 * tails.hashCode());
      }
      return hc;
    }

    public int size() {
      return tails.size();
    }
  }

  private static class Derivation {
    public final Arc arc;
    public final List<Integer> j;
    public final double score; // score does not affect equality (?)
    public final List<Double> childrenScores;

    public Derivation(Arc arc, List<Integer> j, double score, List<Double> childrenScores) {
      this.arc = arc;
      this.j = Collections.unmodifiableList(j);
      this.score = score;
      this.childrenScores = Collections.unmodifiableList(childrenScores);
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof Derivation)) {
        return false;
      }
      Derivation d = (Derivation) o;
      if (arc == null && d.arc != null || arc != null && d.arc == null) {
        return false;
      }
      return ((arc == null && d.arc == null || d.arc.equals(arc)) && d.j.equals(j));
    }

    private int hc = -1;

    @Override
    public int hashCode() {
      if (hc == -1) {
        hc = (arc == null ? 0 : arc.hashCode()) + (17 * j.hashCode());
      }
      return hc;
    }
  }

  private List<Arc> getBackwardsStar(Vertex v) {
    List<Arc> bs = new ArrayList<Arc>();
    // pre-terminal??
    if (isTag[v.goal] && v.start + 1 == v.end) {
      List<Vertex> tails = new ArrayList<Vertex>();
      double score = iScore[v.start][v.end][v.goal];
      Arc arc = new Arc(tails, v, score);
      bs.add(arc);
    }
    // check binaries
    for (int split = v.start + 1; split < v.end; split++) {
      for (BinaryRule br : bg.ruleListByParent(v.goal)) {
        Vertex lChild = new Vertex(br.leftChild, v.start, split);
        Vertex rChild = new Vertex(br.rightChild, split, v.end);
        List<Vertex> tails = new ArrayList<Vertex>();
        tails.add(lChild);
        tails.add(rChild);
        Arc arc = new Arc(tails, v, br.score);
        bs.add(arc);
      }
    }
    // check unaries
    for (UnaryRule ur : ug.rulesByParent(v.goal)) {
      Vertex child = new Vertex(ur.child, v.start, v.end);
      List<Vertex> tails = new ArrayList<Vertex>();
      tails.add(child);
      Arc arc = new Arc(tails, v, ur.score);
      bs.add(arc);
    }
    return bs;
  }

  private Map<Vertex,PriorityQueue<Derivation>> cand = Generics.newHashMap();
  private Map<Vertex,LinkedList<Derivation>> dHat = Generics.newHashMap();

  private PriorityQueue<Derivation> getCandidates(Vertex v, int k) {
    PriorityQueue<Derivation> candV = cand.get(v);
    if (candV == null) {
      candV = new BinaryHeapPriorityQueue<Derivation>();
      List<Arc> bsV = getBackwardsStar(v);

      for (Arc arc : bsV) {
        int size = arc.size();
        double score = arc.ruleScore;
        List<Double> childrenScores = new ArrayList<Double>();
        for (int i = 0; i < size; i++) {
          Vertex child = arc.tails.get(i);
          double s = iScore[child.start][child.end][child.goal];
          childrenScores.add(s);
          score += s;
        }
        if (score == Double.NEGATIVE_INFINITY) {
          continue;
        }
        List<Integer> j = new ArrayList<Integer>();
        for (int i = 0; i < size; i++) {
          j.add(1);
        }
        Derivation d = new Derivation(arc, j, score, childrenScores);
        candV.add(d, score);
      }
      PriorityQueue<Derivation> tmp = new BinaryHeapPriorityQueue<Derivation>();
      for (int i = 0; i < k; i++) {
        if (candV.isEmpty()) {
          break;
        }
        Derivation d = candV.removeFirst();
        tmp.add(d, d.score);
      }
      candV = tmp;
      cand.put(v, candV);
    }
    return candV;
  }

  // note: kPrime is the original k
  private void lazyKthBest(Vertex v, int k, int kPrime) {
    PriorityQueue<Derivation> candV = getCandidates(v, kPrime);

    LinkedList<Derivation> dHatV = dHat.get(v);
    if (dHatV == null) {
      dHatV = new LinkedList<Derivation>();
      dHat.put(v, dHatV);
    }
    while (dHatV.size() < k) {
      if (!dHatV.isEmpty()) {
        Derivation derivation = dHatV.getLast();
        lazyNext(candV, derivation, kPrime);
      }
      if (!candV.isEmpty()) {
        Derivation d = candV.removeFirst();
        dHatV.add(d);
      } else {
        break;
      }
    }
  }

  private void lazyNext(PriorityQueue<Derivation> candV, Derivation derivation, int kPrime) {
    List<Vertex> tails = derivation.arc.tails;
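    // Advance the frontier: for each tail, bump that child's rank j[i] by
    // one and push the resulting sibling derivation onto the candidate heap.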
    for (int i = 0, sz = derivation.arc.size(); i < sz; i++) {
      List<Integer> j = new ArrayList<Integer>(derivation.j);
      j.set(i, j.get(i) + 1);
      Vertex Ti = tails.get(i);
      lazyKthBest(Ti, j.get(i), kPrime);
      LinkedList<Derivation> dHatTi = dHat.get(Ti);
      // compute score for this derivation
      if (j.get(i) - 1 >= dHatTi.size()) {
        continue;
      }
      Derivation d = dHatTi.get(j.get(i) - 1);
      double newScore = derivation.score - derivation.childrenScores.get(i) + d.score;
      List<Double> childrenScores = new ArrayList<Double>(derivation.childrenScores);
      childrenScores.set(i, d.score);
      Derivation newDerivation = new Derivation(derivation.arc, j, newScore, childrenScores);
      if (!candV.contains(newDerivation) && newScore > Double.NEGATIVE_INFINITY) {
        candV.add(newDerivation, newScore);
      }
    }
  }

  //
  // END K-BEST STUFF
  //

  /** Get a complete set of the maximally scoring parses for a sentence,
   *  rather than one chosen at random.  This set may be of size 1 or larger.
   *
   *  @return All the equal best parses for a sentence, with each
   *          accompanied by its score
   */
  @Override
  public List<ScoredObject<Tree>> getBestParses() {
    int start = 0;
    int end = length;
    int goal = stateIndex.indexOf(goalStr);
    double bestScore = iScore[start][end][goal];
    List<Tree> internalTrees = extractBestParses(goal, start, end);
    // System.out.println("Got internal best parse...");
    // for (Tree internalTree : internalTrees) {
    //   restoreUnaries(internalTree);
    // }
    // System.out.println("Restored unaries...");
    List<ScoredObject<Tree>> scoredTrees = new ArrayList<ScoredObject<Tree>>(internalTrees.size());
    for (Tree tr : internalTrees) {
      scoredTrees.add(new ScoredObject<Tree>(tr, bestScore));
    }
    return scoredTrees;
    // TreeTransformer debinarizer = BinarizerFactory.getDebinarizer();
    // return debinarizer.transformTree(internalTree);
  }

  protected List<ParserConstraint> getConstraints() {
    return constraints;
  }

  void setConstraints(List<ParserConstraint> constraints) {
    if (constraints == null) {
      this.constraints = Collections.emptyList();
    } else {
      this.constraints = constraints;
    }
  }

  public ExhaustivePCFGParser(BinaryGrammar bg, UnaryGrammar ug, Lexicon lex, Options op, Index<String> stateIndex, Index<String> wordIndex, Index<String> tagIndex) {
    // System.out.println("ExhaustivePCFGParser constructor called.");
    this.bg = bg;
    this.ug = ug;
    this.lex = lex;
    this.op = op;
    this.tlp = op.langpack();
    goalStr = tlp.startSymbol();
    this.stateIndex = stateIndex;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    tf = new LabeledScoredTreeFactory();

    numStates = stateIndex.size();
    isTag = new boolean[numStates];
    // tag index is smaller, so we fill by iterating over the tag index
    // rather than over the state index
    for (String tag : tagIndex.objectsList()) {
      int state = stateIndex.indexOf(tag);
      if (state < 0) {
        continue;
      }
      isTag[state] = true;
    }
  }

  public void nudgeDownArraySize() {
    try {
      if (arraySize > 2) {
        considerCreatingArrays(arraySize - 2);
      }
    } catch (OutOfMemoryError oome) {
      oome.printStackTrace();
    }
  }

  private void considerCreatingArrays(int length) {
    if (length > op.testOptions.maxLength + 1 || length >= myMaxLength) {
      throw new OutOfMemoryError("Refusal to create such large arrays.");
    } else {
      try {
        createArrays(length + 1);
      } catch (OutOfMemoryError e) {
        myMaxLength = length;
        if (arraySize > 0) {
          try {
            createArrays(arraySize);
          } catch (OutOfMemoryError e2) {
            throw new RuntimeException("CANNOT EVEN CREATE ARRAYS OF ORIGINAL SIZE!!");
          }
        }
        throw e;
      }
      arraySize = length + 1;
      if (op.testOptions.verbose) {
        System.err.println("Created PCFG parser arrays of size " + arraySize);
      }
    }
  }

  protected void createArrays(int length) {
    // zero out some stuff first in case we recently ran out of memory and are reallocating
    clearArrays();

    int numTags = tagIndex.size();
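    // (The chart arrays below are ragged: iScore[start][end] is allocated
    // only for end > start, which roughly halves the chart memory needed.)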
    // allocate just the parts of iScore and oScore used (end > start, etc.)
    // todo: with some modifications to doInsideScores, we wouldn't need to allocate iScore[i][length] for i != 0 and i != length
    // System.out.println("initializing iScore arrays with length " + length + " and numStates " + numStates);
    iScore = new float[length][length + 1][];
    for (int start = 0; start < length; start++) {
      for (int end = start + 1; end <= length; end++) {
        iScore[start][end] = new float[numStates];
      }
    }
    // System.out.println("finished initializing iScore arrays");
    if (op.doDep && !op.testOptions.useFastFactored) {
      // System.out.println("initializing oScore arrays with length " + length + " and numStates " + numStates);
      oScore = new float[length][length + 1][];
      for (int start = 0; start < length; start++) {
        for (int end = start + 1; end <= length; end++) {
          oScore[start][end] = new float[numStates];
        }
      }
      // System.out.println("finished initializing oScore arrays");
    }
    narrowRExtent = new int[length][numStates];
    wideRExtent = new int[length][numStates];
    narrowLExtent = new int[length + 1][numStates];
    wideLExtent = new int[length + 1][numStates];
    if (op.doDep && !op.testOptions.useFastFactored) {
      iPossibleByL = new boolean[length][numStates];
      iPossibleByR = new boolean[length + 1][numStates];
      oPossibleByL = new boolean[length][numStates];
      oPossibleByR = new boolean[length + 1][numStates];
    }
    tags = new boolean[length][numTags];
    if (op.testOptions.lengthNormalization) {
      wordsInSpan = new int[length][length + 1][];
      for (int start = 0; start < length; start++) {
        for (int end = start + 1; end <= length; end++) {
          wordsInSpan[start][end] = new int[numStates];
        }
      }
    }
    // System.out.println("ExhaustivePCFGParser constructor finished.");
  }

  private void clearArrays() {
    iScore = oScore = null;
    iPossibleByL = iPossibleByR = oPossibleByL = oPossibleByR = null;
    oFilteredEnd = oFilteredStart = null;
    tags = null;
    narrowRExtent = wideRExtent = narrowLExtent = wideLExtent = null;
  }

} // end class ExhaustivePCFGParser
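For reference, application code normally reaches this class through the higher-level LexicalizedParser API rather than constructing it directly. The sketch below shows one plausible way to obtain k-best PCFG parses (which, for a plain PCFG query, bottom out in getKBestParses above). It is a minimal sketch, not a canonical recipe: the model path is illustrative, and it assumes the standard ParserQuery.getKBestPCFGParses entry point from the distribution.

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.ScoredObject;

public class KBestParseDemo {

  public static void main(String[] args) {
    // Model path is illustrative; any serialized lexparser model should work.
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    List<HasWord> sentence = Sentence.toWordList("The", "old", "man", "the", "boat", ".");
    ParserQuery pq = lp.parserQuery();
    if (pq.parse(sentence)) {
      // Each ScoredObject pairs a parse tree with its (log-probability) score.
      for (ScoredObject<Tree> scored : pq.getKBestPCFGParses(5)) {
        System.out.println(scored.score() + "\t" + scored.object());
      }
    }
  }
}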