All downloads are free. The search and download functionality uses the official Maven repository.

edu.berkeley.nlp.PCFGLA.GrammarStatistics Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
package edu.berkeley.nlp.PCFGLA;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.berkeley.nlp.PCFGLA.ConditionalTrainer.Options;
import edu.berkeley.nlp.PCFGLA.Corpus.TreeBankType;
import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentBits;
import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentSubstate;
import edu.berkeley.nlp.discPCFG.HiearchicalAdaptiveLinearizer;
import edu.berkeley.nlp.discPCFG.Linearizer;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.math.DoubleArrays;
import edu.berkeley.nlp.math.SloppyMath;
import edu.berkeley.nlp.util.*;

/**
 * A grammar symbol paired with one of its substates.
 *
 * Two instances are considered equal when both {@link #state} and
 * {@link #substate} match; {@link #score} is deliberately ignored.
 */
class FullState {
	/** Index of the symbol, resolved to a name through a Numberer when printed. */
	public short state;
	/** Index of the substate within {@link #state}. */
	public short substate;
	/** A hack to make getting P(parent|child) easier.*/
	public double score;

	public FullState(short state, short substate) {
		this.state = state;
		this.substate = substate;
	}

	/**
	 * Renders this state as {@code <name>-<substate>} followed by a single space.
	 *
	 * @param tagNumberer numberer used to look up the name of {@link #state}
	 * @return the printable label, including the trailing space
	 */
	public String toString(Numberer tagNumberer) {
		String name = tagNumberer.object(state)+"-"+substate;
		return ""+name+" ";
	}

	/**
	 * Renders this state exactly like {@link #toString(Numberer)}.
	 *
	 * @param tagNumberer numberer used to look up the name of {@link #state}
	 * @param childFullName not used by the current implementation
	 * @return the printable label, including the trailing space
	 */
	public String toString(Numberer tagNumberer, String childFullName) {
		return toString(tagNumberer);
	}

	/**
	 * Compares symbol and substate only.
	 *
	 * @param s the state to compare with; {@code null} is never equal
	 * @return true when both state and substate match
	 */
	public boolean equals(FullState s) {
		return s!=null && state==s.state && substate==s.substate;
	}

	/**
	 * Overrides {@link Object#equals(Object)} so that collections agree with
	 * {@link #equals(FullState)} instead of falling back to identity.
	 */
	@Override
	public boolean equals(Object o) {
		return (o instanceof FullState) && equals((FullState) o);
	}

	/** Hash consistent with {@link #equals(Object)}: depends on state and substate only. */
	@Override
	public int hashCode() {
		return 31 * state + substate;
	}
}

/**
 * One node of the production search run by GrammarStatistics: the symbols
 * produced so far, the state that still has to be expanded, and the
 * accumulated score.
 */
class SearchState {
	/** Symbols produced so far, in output order. */
	public ArrayList<FullState> produced = new ArrayList<FullState>();
	/** State that still has to be expanded; null once nothing is left to expand. */
	public FullState danglingState;
	/** Accumulated score of this partial production. */
	public double score;
	/** Index in {@link #produced} at which {@link #extend} inserts the next symbol. */
	public int insertPosition = 0;
	/** Parent of the finished production; printed as a prefix by toString when set. */
	FullState parent = null;
	/** Set by {@link #extendUp}; true once at least one upward step was taken. */
	public boolean extended = false;

	public SearchState (FullState danglingState, double score) {
		this.danglingState = danglingState;
		this.score = score;
	}

	public SearchState (FullState danglingState, FullState firstProduction, double score) {
		this.danglingState = danglingState;
		produced.add(firstProduction);
		this.score = score;
	}

	/**
	 * Creates a successor in which {@code newProd} is inserted at the current
	 * insert position. This state is left unchanged.
	 *
	 * @param newProd symbol to add to the produced list
	 * @param newDangling state the successor still has to expand (may be null)
	 * @param scorePenalty amount added to the score
	 * @param left when true the insert position stays put, otherwise it moves
	 *             one step past the inserted symbol
	 * @return the new search state
	 */
	public SearchState extend (FullState newProd, FullState newDangling, double scorePenalty, boolean left) {
		SearchState s = new SearchState(newDangling,score + scorePenalty);
		s.produced = new ArrayList<FullState>(produced);
		s.produced.add(insertPosition,newProd);
		s.insertPosition = insertPosition + (left ? 0 : 1);
		return s;
	}

	/**
	 * Renders the production as {@code parent -> child child ...}; the parent
	 * prefix is omitted while {@link #parent} is null.
	 *
	 * @param tagNumberer numberer used to look up symbol names
	 * @return the printable production
	 */
	public String toString(Numberer tagNumberer) {
		StringBuilder sb = new StringBuilder();
		if (parent!=null) {
			sb.append(tagNumberer.object(parent.state)).append("-").append(parent.substate).append(" -> ");
		}

		for (FullState s : produced) {
			sb.append(tagNumberer.object(s.state)).append("-").append(s.substate).append(" ");
		}
		return sb.toString();
	}

	/**
	 * Creates a successor one level further up: {@code ps} becomes the new
	 * dangling state and the optional sibling {@code cs} is added at the left
	 * or right edge of the produced symbols. This state is left unchanged.
	 *
	 * @param cs sibling symbol to add, or null when there is none (unary step)
	 * @param ps parent state that becomes the new dangling state
	 * @param rscore amount added to the score
	 * @param thisChildOnLeft when true {@code cs} is prepended, otherwise appended
	 * @return the new search state, flagged as extended
	 */
	public SearchState extendUp(FullState cs, FullState ps, double rscore, boolean thisChildOnLeft) {
		SearchState s = new SearchState(ps,score + rscore);
		s.produced = new ArrayList<FullState>(produced);
		if (cs!=null) {
			if (thisChildOnLeft)
				s.produced.add(0,cs);
			else
				s.produced.add(cs);
		}
		s.extended = true;
		return s;
	}
}


public class GrammarStatistics {
	/** Number of top entries to report; starts out as 10. */
	private static int topN = 10;

	/** Grammar whose rules are searched. */
	public Grammar grammar;
	/** Maps state indices to their tag names. */
	public Numberer tagNumberer;
	/** Maximum number of results the production searches keep. */
	public int nScores;

	/**
	 * Stores the three collaborators as-is; no copies are made.
	 *
	 * @param grammar grammar whose rules are searched
	 * @param tagNumberer numberer used to resolve tag names
	 * @param nScores maximum number of results kept per search
	 */
	public GrammarStatistics (Grammar grammar, Numberer tagNumberer, int nScores) {
		this.nScores = nScores;
		this.tagNumberer = tagNumberer;
		this.grammar = grammar;
	}

	/**
	 * Finds up to {@link #nScores} best-scoring productions of the given
	 * parent substate. Partial productions are kept in a priority queue and
	 * the highest-scoring one is expanded first; symbols for which
	 * {@link #continues(short)} is true are expanded further instead of
	 * being emitted.
	 *
	 * @param p parent state and substate whose productions are wanted
	 * @return queue of finished productions, prioritised by negated score so
	 *         that the worst kept result is removed first
	 */
	PriorityQueue<SearchState> getTopProductions(FullState p) {
		PriorityQueue<SearchState> results = new PriorityQueue<SearchState>(nScores+1);
		PriorityQueue<SearchState> unExpanded = new PriorityQueue<SearchState>();

		unExpanded.add(new SearchState(p,0),0);
		// Same stopping rule as getTopParentRuleProductions: keep going while
		// results are missing or the best open state beats the worst result.
		while (unExpanded.size() != 0
				&& (results.size() < nScores || unExpanded.peek().score > -results
						.peek().score)) {
			//expand best-looking SearchState so far
			SearchState state = unExpanded.next();
			//accept complete productions
			if (state.danglingState==null || (state.produced.size()!=0 && !continues(state.danglingState.state))) {
				// a non-continuing dangling symbol is itself the last produced symbol
				if (state.danglingState!=null)
					state = state.extend(state.danglingState,null,0,false);
				results.add(state,-state.score);
				// keep at most nScores results by dropping the worst one
				if (results.size()>nScores)
					results.next();
			}
			//try to complete partial productions
			else {
				FullState dangling = state.danglingState;
				for (UnaryRule rule: grammar.getUnaryRulesByParent(dangling.state)) {
					double[][] scores = rule.getScores2();
					for (short cSubState = 0; cSubState < grammar.numSubStates[rule.getChildState()]; cSubState++) {
						if (scores[cSubState]==null) continue;
						double rscore = scores[cSubState][dangling.substate];
						FullState s = new FullState(rule.getChildState(),cSubState);
						SearchState newState = state.extend(s,null,rscore,false);
						unExpanded.add(newState,newState.score);
					}
				}
				for (BinaryRule rule : grammar.splitRulesWithP(dangling.state)) {
					double[][][] scores = rule.getScores2();
					for (short lSubState = 0; lSubState < grammar.numSubStates[rule.getLeftChildState()]; lSubState++) {
						FullState ls = new FullState(rule.getLeftChildState(),lSubState);
						for (short rSubState = 0; rSubState < grammar.numSubStates[rule.getRightChildState()]; rSubState++) {
							if (scores[lSubState][rSubState]==null) continue;
							FullState rs = new FullState(rule.getRightChildState(),rSubState);
							SearchState newState;
							double rscore = scores[lSubState][rSubState][dangling.substate];
							if (continues(ls.state)) {
								// left child continues: emit the right child, keep expanding the left
								newState = state.extend(rs,ls,rscore,true);
							} else {
								// otherwise emit the left child and keep expanding the right
								newState = state.extend(ls,rs,rscore,false);
							}
							unExpanded.add(newState,newState.score);
						}
					}
				}
			}
		}
		return results;
	}

	/**
	 * Finds up to {@link #nScores} best-scoring rules in which the given
	 * child substate occurs, searching upwards from the child through unary
	 * and binary rules. The highest-scoring open state is expanded first, and
	 * the search also stops once the open queue reaches 10000 entries.
	 *
	 * @param c child state and substate to start from
	 * @param probState per-state values, indexed by state; presumably
	 *        log-probabilities (the visible caller exponentiates the final
	 *        scores) - TODO confirm
	 * @param probSubGivenState per-substate values, indexed by state and
	 *        substate; presumably log-probabilities as well - TODO confirm
	 * @return queue of finished rules, prioritised by negated score so that
	 *         the worst kept result is removed first
	 */
	PriorityQueue<SearchState> getTopParentRuleProductions(FullState c,
			double[] probState, double[][] probSubGivenState) {
		PriorityQueue<SearchState> results = new PriorityQueue<SearchState>(nScores+1);
		PriorityQueue<SearchState> unExpanded = new PriorityQueue<SearchState>();

		// The child's own terms are subtracted up front; the parent's terms are
		// added when a rule is accepted below.
		double score = -(probState[c.state]+probSubGivenState[c.state][c.substate]);
		// the priority of this first entry is irrelevant: it is alone in the queue
		unExpanded.add(new SearchState(c,c,score),-score);
		int maxSize = 10000;
		while (unExpanded.size() != 0
				&& unExpanded.size() < maxSize
				&& (results.size() < nScores || unExpanded.peek().score > -results
						.peek().score)) {
			//expand best-looking SearchState so far
			SearchState state = unExpanded.next();
			//accept complete productions
			if (state.danglingState==null || (state.extended && !continues(state.danglingState.state))) {
				if (state.danglingState!=null)
					state.parent = state.danglingState;
				state.score += probState[state.parent.state]
				                         + probSubGivenState[state.parent.state][state.parent.substate];
				results.add(state,-state.score);
				// keep at most nScores results by dropping the worst one
				if (results.size()>nScores)
					results.next();
			}
			//try to complete partial productions
			else {
				FullState dangling = state.danglingState;
				for (UnaryRule rule: grammar.getUnaryRulesByChild(dangling.state)) {
					double[][] scores = rule.getScores2();
					if (scores[dangling.substate]==null) continue;
					for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
						double rscore = scores[dangling.substate][pSubState];
						FullState s = new FullState(rule.getParentState(),pSubState);
						SearchState newState = state.extendUp(null,s,rscore,false);
						unExpanded.add(newState,newState.score);
					}
				}
				// rules in which the dangling state is the left child
				for (BinaryRule rule : grammar.splitRulesWithLC(dangling.state)) {
					double[][][] scores = rule.getScores2();
					for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
						FullState ps = new FullState(rule.getParentState(),pSubState);
						for (short rSubState = 0; rSubState < grammar.numSubStates[rule.getRightChildState()]; rSubState++) {
							if (scores[dangling.substate][rSubState]==null) continue;
							FullState rs = new FullState(rule.getRightChildState(),rSubState);
							double rscore = scores[dangling.substate][rSubState][pSubState];
							SearchState newState = state.extendUp(rs,ps,rscore,false);
							unExpanded.add(newState,newState.score);
						}
					}
				}
				// rules in which the dangling state is the right child
				for (BinaryRule rule : grammar.splitRulesWithRC(dangling.state)) {
					double[][][] scores = rule.getScores2();
					for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
						FullState ps = new FullState(rule.getParentState(),pSubState);
						for (short lSubState = 0; lSubState < grammar.numSubStates[rule.getLeftChildState()]; lSubState++) {
							if (scores[lSubState][dangling.substate]==null) continue;
							FullState ls = new FullState(rule.getLeftChildState(),lSubState);
							double rscore = scores[lSubState][dangling.substate][pSubState];
							SearchState newState = state.extendUp(ls,ps,rscore,true);
							unExpanded.add(newState,newState.score);
						}
					}
				}
			}
		}
		return results;
	}

	/**
	 * Tells whether the tag name of the given state starts with {@code '@'}.
	 * The searches in this class expand such states further instead of
	 * emitting them as finished symbols.
	 *
	 * @param state index of the state to check
	 * @return true when the tag name begins with '@'; false otherwise,
	 *         including for an empty tag name
	 */
	public boolean continues(short state) {
		// startsWith avoids the StringIndexOutOfBoundsException charAt(0) throws on ""
		return ((String)tagNumberer.object(state)).startsWith("@");
	}

	public static String pad(String s, int width, char c) {
		StringBuffer sb = new StringBuffer(s);
		for (int i=s.length(); i");
		System.out.println("

Links

"); System.out.println(""); Corpus corpus = new Corpus(wsjLoc,opts.treebank,1.0,false); List> trainTrees = Corpus.binarizeAndFilterTrees(corpus .getTrainTrees(), pData.getV_markov(), pData.getH_markov(), opts.maxL, pData.getBinarization(), false, false); trainTrees = Corpus.filterTreesForConditional(trainTrees, false,false,false); StateSetTreeList trainStateSetTrees = new StateSetTreeList(trainTrees, nonLogGrammar.numSubStates, false, tagNumberer); int padding = 3; topN = 30; printLexiconStatistics(lexicon, tagNumberer,grammar.isGrammarTag,grammar, trainStateSetTrees, opts); GrammarStatistics gs = new GrammarStatistics(grammar,tagNumberer, topN); // determine which tags need to be examined. // Continuation tags and lexical tags are excluded Set noContinueTags = new HashSet(); Set continueTags = new HashSet(); for (short i=0; i"); Set allRealTags = new HashSet(noContinueTags); for (short i=0; i tree : trainStateSetTrees) { // System.out.println("adding probs for tree "+nTree+" / "+trainStateSetTrees.size()); parser.doInsideOutsideScores(tree,false,true); tallyProbState(tree,probState,allRealTags); tallyProbSubState(tree,probSubGivenState,allRealTags); } for (int state=0; state"); } private static void tallyProbSubState(Tree tree, double[][] probSubGivenState, Set noContinueTags) { tallyProbSubStateHelper(tree,tree.getLabel().getIScore(0), probSubGivenState,noContinueTags); } /** * @param tree * @param probSubGivenState */ private static void tallyProbSubStateHelper(Tree tree, double treeProb, double[][] probSubGivenState, Set tags) { if (tree.isLeaf()) return; StateSet label = tree.getLabel(); short state = label.getState(); if (tags.contains(state)) { double[] iScores = label.getIScores(); double[] oScores = label.getOScores(); double[] scores = new double[iScores.length]; double sum = 0; for (int substate=0; substate child : tree.getChildren()) tallyProbSubStateHelper(child,treeProb,probSubGivenState,tags); } /** * Count occurrences of each state. 
Ignore states that start with "@". * * @param tree * @param probState */ private static void tallyProbState(Tree tree, double[] probState, Set tags) { if (tree.isLeaf()) return; short state = tree.getLabel().getState(); if (tags.contains(state)) probState[state] += 1; for (Tree child : tree.getChildren()) tallyProbState(child,probState,tags); } /** * @param columnOutput * @param grammar * @param tagNumberer * @param nonLogGrammar * @param nonLogLexicon * @param topN * @param gs * @param trainTrees */ private static FullState[][] printParentStatistics(boolean columnOutput, Grammar grammar, Numberer tagNumberer, Grammar nonLogGrammar, Lexicon nonLogLexicon, int topN, GrammarStatistics gs, List> trainTrees, ArrayParser parser) { System.out.println("

Parents

"); System.out.println(""); for (short childState=0; childState results = new PriorityQueue(topN+1); for (short parentState=0; parentStatetopN) results.next(); } } ArrayList resultsA = new ArrayList(topN); while (results.size()!=0) { resultsA.add(0,results.next()); } parents[childState] = new FullState[resultsA.size()]; for (short j = 0; j < topN; j++){ String o=""; double p=-1; if (resultsA.size()>j) { parents[childState][j] = resultsA.get(j); p = resultsA.get(j).score; String w = resultsA.get(j).toString(tagNumberer,childFullName); o = f.format(p)+" "+w; } outputMatrix[j+1][cS] = o; } } printRules("Parent", "parent", columnOutput, outputMatrix); } return parents; } /** * @param columnOutput * @param tagNumberer * @param padding * @param topN * @param gs * @param continueTags */ private static void printTrunkStatistics(boolean columnOutput, Numberer tagNumberer, int padding, int topN, GrammarStatistics gs, Set continueTags) { System.out.println("

Trunks

"); //output trunk rule probabilities for (short tag : continueTags) { String tagS = ((String)tagNumberer.object(tag)).substring(1); short parentTag = (short)tagNumberer.number(tagS); gs.printTopRules(parentTag, topN, columnOutput, padding); gs.printTopRules(tag, topN, columnOutput, padding); System.out.println(""); } } /** * @param columnOutput * @param pData * @param tagNumberer * @param topN * @param gs * @param noContinueTags */ private static void printGrammarStatistics(boolean columnOutput, ParserData pData, Numberer tagNumberer, int topN, GrammarStatistics gs, Set noContinueTags) { System.out.println("

Grammar

"); System.out.println("
"); // print rule probabilities for (short curTag : noContinueTags){ int nSubStates = pData.numSubStatesArray[curTag]; ArrayList[] results = new ArrayList[nSubStates]; for (short i = 0; i < nSubStates; i++) { //do heavy computation PriorityQueue pq = gs.getTopProductions(new FullState(curTag,i)); //convert pq to array results[i] = new ArrayList(topN); while (pq.size()!=0) { pq.peek().score = Math.exp(pq.peek().score); results[i].add(0,pq.next()); } } String[][] outputMatrix = new String[topN+1][nSubStates]; String tagName = (String) tagNumberer.object(curTag); for (int i = 0; i < nSubStates; i++) { outputMatrix[0][i] = tagName + "-" + i; } for (int j = 0; j < topN; j++){ for (int i = 0; i < nSubStates; i++) { String o=""; double p=-1; if (results[i].size()>j) { p = results[i].get(j).score; String w = results[i].get(j).toString(tagNumberer); o = f.format(p)+" "+w; } outputMatrix[j+1][i] = o; } } printRules("Grammar","productions", columnOutput, outputMatrix); } System.out.println("
"); } /** * @param columnOutput * @param pData * @param tagNumberer * @param topN * @param gs * @param noContinueTags */ private static void printParentRuleStatistics(boolean columnOutput, ParserData pData, Numberer tagNumberer, int topN, GrammarStatistics gs, Set noContinueTags, double[] probState, double[][] probSubGivenState) { System.out.println("

Parent Rules

"); // print rule probabilities for (short curTag : noContinueTags){ int nSubStates = pData.numSubStatesArray[curTag]; ArrayList[] results = new ArrayList[nSubStates]; for (short i = 0; i < nSubStates; i++) { //do heavy computation PriorityQueue pq = gs.getTopParentRuleProductions(new FullState(curTag,i),probState,probSubGivenState); //convert pq to array results[i] = new ArrayList(topN); while (pq.size()!=0) { pq.peek().score = Math.exp(pq.peek().score); results[i].add(0,pq.next()); } } String[][] outputMatrix = new String[topN+1][nSubStates]; String tagName = (String) tagNumberer.object(curTag); for (int i = 0; i < nSubStates; i++) { outputMatrix[0][i] = tagName + "-" + i; } for (int j = 0; j < topN; j++){ for (int i = 0; i < nSubStates; i++) { String o=""; double p=-1; if (results[i].size()>j) { p = results[i].get(j).score; String w = results[i].get(j).toString(tagNumberer); o = f.format(p)+" "+w; } outputMatrix[j+1][i] = o; } } printRules("Parent Rules","parentrules", columnOutput, outputMatrix); } } /** * @param tree */ private static void logarithmModeTree(Tree tree) { if (tree.isLeaf()) return; double[] iScores = tree.getLabel().getIScores(); int iScale = tree.getLabel().getIScale(); double[] oScores = tree.getLabel().getOScores(); int oScale = tree.getLabel().getOScale(); for (int i=0; i tree, Grammar g, double[][][][] parentProbs, double[][] normFactors, double treeScore) { int nSubStates = tree.getLabel().numSubStates(); double[][] viterbiProbs = new double[nSubStates][nSubStates]; for (int i=0; i tree, Grammar g, double[][][][] parentProbs, double[][] normFactor, double[][] viterbiProbs, double treeScore) { if (tree.isPreTerminal() || tree.isLeaf()) return; short pState = tree.getLabel().getState(); int nParentStates = tree.getLabel().numSubStates(); List> children = tree.getChildren(); switch(children.size()) { case 1: Tree child = children.get(0); short cState = child.getLabel().getState(); double[][] scores = g.getUnaryScore(pState,cState); int 
nChildStates = child.getLabel().numSubStates(); double[][] newViterbiProbs = new double[viterbiProbs.length][nChildStates]; for (int gpS=0; gpS lChild = children.get(0); Tree rChild = children.get(1); short lcState = lChild.getLabel().getState(); short rcState = rChild.getLabel().getState(); double[][][] scoresB = g.getBinaryScore(pState,lcState,rcState); int nLChildStates = lChild.getLabel().numSubStates(); int nRChildStates = rChild.getLabel().numSubStates(); double[][] newLViterbiProbs = new double[viterbiProbs.length][nLChildStates]; double[][] newRViterbiProbs = new double[viterbiProbs.length][nRChildStates]; for (int gpS=0; gpS child, short gpState, short cState, double[][][][] parentProbs, double[][] normFactor, double[][] viterbiProbs) { for (int gpS=0; gpS topRules = new PriorityQueue(); for (BinaryRule r : grammar.splitRulesWithP(tag)){//getBinaryRulesByParent(tag)) { for (int lSubState = 0; lSubState < grammar.numSubStates[r.getLeftChildState()]; lSubState++) { for (int rSubState = 0; rSubState < grammar.numSubStates[r.getRightChildState()]; rSubState++) { double score = r.getScore(subState,lSubState,rSubState); topRules.add(new RuleStruct(r,score,subState,lSubState,rSubState),-score); if (topRules.size() > topN) //remove worst rule topRules.next(); } } } for (UnaryRule r : grammar.getUnaryRulesByParent(tag)) { for (int cSubState = 0; cSubState < grammar.numSubStates[r.getChildState()]; cSubState++) { double score = r.getScore(subState,cSubState); topRules.add(new RuleStruct(r,score,subState,cSubState),-score); if (topRules.size() > topN) //remove worst rule topRules.next(); } } ArrayList r = new ArrayList(); while (topRules.hasNext()) { RuleStruct s = topRules.next(); r.add(0,s); } for (int i=0; i"+leftName+" "); sB.append(""+rightName+" "); } else { UnaryRule u = (UnaryRule)r.r; String childName = tagNumberer.object(u.childState)+"-"+r.lS; sB.append(""+childName+" "); } return sB.toString(); } /** * @param columnOutput * @param padding * @param 
outputMatrix */ private static void printRules(String typeName, String ruleTypeName, boolean columnOutput, String[][] outputMatrix) { System.out.println("

"+typeName+"

"); if (columnOutput) { for (int i = 0; i < outputMatrix.length; i++){ System.out.println(""); for (int j = 0; j < outputMatrix[0].length; j++) { if (i==0) { System.out.println(""); } else System.out.print(""); } System.out.println(""); } } else { for (int j = 0; j < outputMatrix[0].length; j++) { System.out.println(""); for (int i = 0; i < outputMatrix.length; i++){ if (j==0) { System.out.println(""); } else System.out.print(""); } System.out.println(""); } } System.out.println("
"); System.out.print(outputMatrix[i][j]); System.out.println(" (p)"+sanitize(outputMatrix[i][j])+"
"); System.out.print(outputMatrix[i][j]); System.out.println(""+sanitize(outputMatrix[i][j])+"

"); } public static int maxWidthInRow(String[][] m,int row) { int l=0; for (int c=0; c=3 letters long, ends with s, and not 'is' or 'us'\n" + // " The rest capture endings:\n" + // " -ed\n" + // " -ing\n" + // " -ion\n" + // " -er\n" + // " -est\n" + // " -ly\n" + // " -ity\n" + // " -y\n" + // " -al\n"); // Map unk = lexicon.getUnseenScores(); // for (String sig : unk.keySet()) { // System.out.println(); // System.out.println("signature "+sig); // double[][] scores = unk.get(sig); // int maxWidth = 0; // int count = 0; // for (int tag=0; tag= scores[tag].length) // out[tagIdx][substate] = ""; // else // out[tagIdx][substate] = f.format(scores[tag][substate]); // } // tagIdx++; // } // printRules("nothing","not ready",false,out); // } } public static void printLexiconStatistics(Lexicon lexicon, Numberer tagNumberer, boolean[] grammarTags, Grammar grammar, StateSetTreeList trainStateSetTrees, Options opts){ //printLexiconUnknownStatistics(lexicon, tagNumberer); System.out.println("

Lexicon

"); System.out.println("
"); double[][][] counts = null; double[][] posteriors = new double[grammar.numStates][(int)ArrayUtil.max(grammar.numSubStates)]; if (lexicon instanceof SimpleLexicon){ counts = new double[grammar.numStates][((SimpleLexicon)lexicon).nWords][grammar.numSubStates[1]]; ParserData pDataNoLog = ParserData.Load(opts.in); if (pDataNoLog == null) { System.exit(1); } Grammar nonLogGrammar = pDataNoLog.getGrammar(); nonLogGrammar.splitRules(); SimpleLexicon nonLogLexicon = (SimpleLexicon)pDataNoLog.getLexicon(); nonLogLexicon.explicitlyComputeScores(nonLogGrammar.finalLevel); SpanPredictor spanPredictor = pDataNoLog.getSpanPredictor(); // SophisticatedLexicon newLex = new SophisticatedLexicon(grammar.numSubStates, SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF, new double[]{0.5, 0.1}, new SmoothAcrossParentSubstate(0.1), 1.0e-30); if (opts.unkT<0) { System.out.println("Replacing rare words"); Corpus.replaceRareWords(trainStateSetTrees,new SimpleLexicon(grammar.numSubStates,-1), Math.abs(opts.unkT)); } nonLogLexicon.labelTrees(trainStateSetTrees); ConstrainedHierarchicalTwoChartParser parser = new ConstrainedHierarchicalTwoChartParser(nonLogGrammar, nonLogLexicon, spanPredictor, grammar.finalLevel); // HiearchicalAdaptiveLinearizer linearizer = new HiearchicalAdaptiveLinearizer(nonLogGrammar, nonLogLexicon, spanPredictor, grammar.finalLevel); // double[] counts = new double[linearizer.dimension()]; // int nTrees = trainStateSetTrees.size(); // boolean secondHalf; // int n=0; for (Tree stateSetTree : trainStateSetTrees) { // secondHalf = (n++>nTrees/2.0); boolean noSmoothing = true, debugOutput = false; parser.doInsideOutsideScores(stateSetTree,false,false); grammar.tallyMergeWeights(stateSetTree, posteriors); double tree_score = stateSetTree.getLabel().getIScore(0); int tree_scale = stateSetTree.getLabel().getIScale(); List yield = stateSetTree.getYield(); int i =0; for (StateSet stateSet : stateSetTree.getPreTerminalYield()){ double scalingFactor = 
ScalingTools.calcScaleFactor(stateSet.getOScale()+stateSet.getIScale()-tree_scale); StateSet child = yield.get(i++); for (short substate=0; substate stateSetTree : trainStateSetTrees) { // parser.doInsideOutsideScores(stateSetTree,true,false); // grammar.tallyMergeWeights(stateSetTree, posteriors); // } } // System.out.println("Entropies"); // for (short curTag=0; curTag[] wordToTagCounters = lexicon.wordToTagCounters; for (short curTag=0; curTag[] pQs = new PriorityQueue[nSubStates]; for (int i = 0; i < nSubStates; i++) { pQs[i] = new PriorityQueue(); } double[] sum = new double[grammar.numSubStates[curTag]]; if (lexicon instanceof SophisticatedLexicon){ sum = posteriors[curTag]; SophisticatedLexicon lex = (SophisticatedLexicon)lexicon; HashMap tagMap = lex.wordToTagCounters[curTag]; for (String word : tagMap.keySet()) { double[] lexiconScores = lexicon.score(word,curTag,0,false,false); // double[] counts = tagMap.get(word); for (int i = 0; i < nSubStates; i++) { pQs[i].add(word, lexiconScores[i]);//counts[i]); } } } else { sum = new double[grammar.numSubStates[curTag]]; SimpleLexicon lex = (SimpleLexicon)lexicon; for (int w=0; w=lex.wordCounter.length||lex.wordCounter[k]<=51) continue; String word = (String)lex.wordIndexer.get(w); // System.out.println(word + " " +lex.wordCounter[k]+" "); // double[] lexiconScores = lexicon.score(word,curTag,0,true,word.startsWith("UNK")); double[] lexiconScores = counts[curTag][w]; boolean allZero=true; for (int i=0; iLexicon"); System.out.println(""); System.out.println(""); for (int i = 0; i < nSubStates; i++) { System.out.println(""); } System.out.println(""); for (int j = 0; j < topN; j++){ System.out.println(""); /* System.out.println("The top " + topN + " words for the tag " + (String) tagNumberer.object(curTag) + "-" + i + " are:"); System.out.println(pQs[i].toString(topN)); } */ for (int i = 0; i < nSubStates; i++) { if (i==0){ System.out.print("\n"); } String w=""; double p=-1; if (pQs[i].hasNext()) { p = 
pQs[i].getPriority(); w = pQs[i].next(); String tmp = sanitize(w)+" "+f.format(p); if (tmp.length()<8) tmp = tmp.concat("\t"); System.out.print(""); } } System.out.println(""); } System.out.println("
"); System.out.println(" "); System.out.print(sanitize(tagName) + "-" + i); System.out.println(" (p)"); System.out.println("
"+sum[i]/s); System.out.println("
"+tmp+"

"); } System.out.println("
"); } /** * @param tagName * @return */ static String lexiconLabel(String tagName) { return "\"productions-"+tagName+"\""; } /** * @param ruleTypeName * @param tagName * @return */ static String label(String ruleTypeName, String tagName) { return "\""+ruleTypeName+"-"+tagName+"\""; } static String reflabel(String ruleTypeName, String tagName) { return "\"#"+ruleTypeName+"-"+tagName+"\""; } static String parentLabel(String tagName) { return label("parentrules",tagName); } static String parentRefLabel(String tagName) { return reflabel("parentrules",tagName); } static String sanitize(String s) { return s.replaceAll("&","&"); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy