![JAR search and dependency download from the Maven repository](/logo.png)
edu.berkeley.nlp.PCFGLA.GrammarStatistics Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
package edu.berkeley.nlp.PCFGLA;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import edu.berkeley.nlp.PCFGLA.ConditionalTrainer.Options;
import edu.berkeley.nlp.PCFGLA.Corpus.TreeBankType;
import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentBits;
import edu.berkeley.nlp.PCFGLA.smoothing.SmoothAcrossParentSubstate;
import edu.berkeley.nlp.discPCFG.HiearchicalAdaptiveLinearizer;
import edu.berkeley.nlp.discPCFG.Linearizer;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.math.DoubleArrays;
import edu.berkeley.nlp.math.SloppyMath;
import edu.berkeley.nlp.util.*;
/**
 * A fully annotated grammar symbol: a coarse state id plus the id of its
 * split refinement (substate). Instances are used as payloads of search
 * states while enumerating likely productions.
 */
class FullState {
public short state;
public short substate;
/** A hack to make getting P(parent|child) easier. */
public double score;

public FullState(short state, short substate) {
this.state = state;
this.substate = substate;
}

/**
 * Renders this symbol as "NAME-substate " (note the trailing space).
 *
 * @param tagNumberer maps state ids to symbol names
 * @return human-readable form of this annotated symbol
 */
public String toString(Numberer tagNumberer) {
return tagNumberer.object(state) + "-" + substate + " ";
}

/**
 * Same rendering as {@link #toString(Numberer)}; childFullName is
 * accepted for call-site symmetry but is not used.
 *
 * @param tagNumberer maps state ids to symbol names
 * @param childFullName ignored
 * @return human-readable form of this annotated symbol
 */
public String toString(Numberer tagNumberer, String childFullName) {
return toString(tagNumberer);
}

/** Type-specific equality on (state, substate); score is ignored. */
public boolean equals(FullState s) {
return (state == s.state && substate == s.substate);
}

/**
 * Proper {@link Object#equals(Object)} override so FullState behaves
 * correctly in hashed collections; delegates to the typed overload.
 */
@Override
public boolean equals(Object o) {
return (o instanceof FullState) && equals((FullState) o);
}

@Override
public int hashCode() {
// Pack the two shorts into one int; score is intentionally excluded
// to stay consistent with equals.
return (state << 16) ^ (substate & 0xFFFF);
}
}
/**
 * A node in the best-first search over productions. A SearchState carries
 * the symbols produced so far, one "dangling" symbol still to be expanded,
 * and an accumulated (log) score.
 *
 * NOTE(review): raw collection types are an artifact of the scraped source;
 * the generic parameters were stripped by HTML extraction.
 */
class SearchState {
// Symbols emitted so far, left-to-right. Raw type in this copy.
public ArrayList produced = new ArrayList();
// The symbol still awaiting expansion; null once the production is closed.
public FullState danglingState;
// Accumulated score of the partial production.
public double score;
// Index in 'produced' at which the next downward extension is inserted.
public int insertPosition = 0;
// Set by the upward search (getTopParentRuleProductions) once a root is found.
FullState parent = null;
// Marks states that have been extended at least once in the upward search.
public boolean extended = false;
public SearchState (FullState danglingState, double score) {
this.danglingState = danglingState;
this.score = score;
}
public SearchState (FullState danglingState, FullState firstProduction, double score) {
this.danglingState = danglingState;
produced.add(firstProduction);
this.score = score;
}
/**
 * Downward extension: emit newProd at the current insert position and
 * continue with newDangling. Note: the copy does NOT carry over 'parent'
 * or 'extended' — only 'produced' and the adjusted insertPosition.
 *
 * @param newProd the completed symbol to record
 * @param newDangling the next symbol to expand (may be null)
 * @param scorePenalty score delta added to this state's score
 * @param left if true the insert position stays put (dangling symbol is
 *        to the left); otherwise it advances past newProd
 */
public SearchState extend (FullState newProd, FullState newDangling, double scorePenalty, boolean left) {
SearchState s = new SearchState(newDangling,score + scorePenalty);
s.produced = new ArrayList(produced);
s.produced.add(insertPosition,newProd);
s.insertPosition = insertPosition + (left ? 0 : 1);
return s;
}
/** Renders "parent -> produced..." using the numberer for symbol names. */
public String toString(Numberer tagNumberer) {
String w="";
if (parent!=null) {
String name = tagNumberer.object(parent.state)+"-"+parent.substate;
w += ""+name+" -> ";
}
for (FullState s : produced) {
String name = tagNumberer.object(s.state)+"-"+s.substate;
w += ""+name+" ";
}
return w;
}
/**
 * Upward extension: move to parent symbol ps, optionally recording the
 * sibling cs on the left or right edge of 'produced'. Marks the copy as
 * extended; 'parent' is still not propagated.
 *
 * @param cs sibling child to record, or null for a unary step
 * @param ps the new dangling (parent) symbol
 * @param rscore score delta added to this state's score
 * @param thisChildOnLeft if true cs is prepended, else appended
 * @return the extended copy of this search state
 */
public SearchState extendUp(FullState cs, FullState ps, double rscore, boolean thisChildOnLeft) {
SearchState s = new SearchState(ps,score + rscore);
s.produced = new ArrayList(produced);
if (cs!=null) {
if (thisChildOnLeft)
s.produced.add(0,cs);
else
s.produced.add(produced.size(),cs);
}
s.extended = true;
return s;
}
}
public class GrammarStatistics {
private static int topN = 10;
/**
 * Builds a statistics reporter over the given grammar.
 *
 * @param grammar the (split) grammar whose rules are ranked
 * @param tagNumberer maps state ids to tag names for display
 * @param nScores cap on the number of ranked results kept per query
 */
public GrammarStatistics (Grammar grammar, Numberer tagNumberer, int nScores) {
this.nScores = nScores;
this.tagNumberer = tagNumberer;
this.grammar = grammar;
}
/** The grammar whose rules are being ranked and printed. */
public Grammar grammar;
/** Maps state ids to tag names for display. */
public Numberer tagNumberer;
/** Maximum number of ranked results kept per query. */
public int nScores;
/** Find the best nScores productions by doing best-first search.
 *
 * Search states carry a "dangling" symbol still to be expanded; states
 * whose production is complete (no dangling symbol, or one that is not a
 * binarization continuation) are moved to the bounded result queue.
 *
 * NOTE(review): the while-condition in this scraped copy was garbled
 * ("results.size() -results.peek().score" — the span
 * "&lt; nScores || unExpanded.peek().score &gt;" was consumed as an HTML
 * tag). Reconstructed to match the intact, structurally identical loop in
 * getTopParentRuleProductions below.
 *
 * @param p the annotated parent symbol whose productions are ranked
 * @return up to nScores best complete productions (added with priority
 *         -score, so the worst is dequeued first when trimming)
 */
PriorityQueue getTopProductions(FullState p) {
PriorityQueue results = new PriorityQueue(nScores+1);
PriorityQueue unExpanded = new PriorityQueue();
unExpanded.add(new SearchState(p,0),0);
while (unExpanded.size() != 0
&& (results.size() < nScores || unExpanded.peek().score > -results.peek().score)) {
//expand best-looking SearchState so far
SearchState state = unExpanded.next();
//accept complete productions
if (state.danglingState==null || (state.produced.size()!=0 && !continues(state.danglingState.state))) {
if (state.danglingState!=null)
state = state.extend(state.danglingState,null,0,false);
results.add(state,-state.score);
// keep only the nScores best completed productions
if (results.size()>nScores)
results.next();
}
//try to complete partial productions
else {
// unary expansions of the dangling symbol
for (UnaryRule rule: grammar.getUnaryRulesByParent(state.danglingState.state)) {
double[][] scores = rule.getScores2();
for (short cSubState = 0; cSubState < grammar.numSubStates[rule.getChildState()]; cSubState++) {
if (scores[cSubState]==null) continue;
double rscore = scores[cSubState][state.danglingState.substate];
FullState s = new FullState(rule.getChildState(),cSubState);
SearchState newState = state.extend(s,null,rscore,false);
unExpanded.add(newState,newState.score);
}
}
// binary expansions; a continuation left child remains dangling
for (BinaryRule rule : grammar.splitRulesWithP(state.danglingState.state)){//getBinaryRulesByParent(state.danglingState.state)) {
double[][][] scores = rule.getScores2();
for (short lSubState = 0; lSubState < grammar.numSubStates[rule.getLeftChildState()]; lSubState++) {
FullState ls = new FullState(rule.getLeftChildState(),lSubState);
for (short rSubState = 0; rSubState < grammar.numSubStates[rule.getRightChildState()]; rSubState++) {
if (scores[lSubState][rSubState]==null) continue;
FullState rs = new FullState(rule.getRightChildState(),rSubState);
SearchState newState;
double rscore = scores[lSubState][rSubState][state.danglingState.substate];
if (continues(ls.state)) {
newState = state.extend(rs,ls,rscore,true);
} else {
newState = state.extend(ls,rs,rscore,false);
}
unExpanded.add(newState,newState.score);
}
}
}
}
}
return results;
}
/** Find the best nScores parent rules by best-first search upward from a
 * child symbol.
 *
 * Mirrors getTopProductions but walks rules child-to-parent; scores are
 * adjusted by the prior terms probState / probSubGivenState so results are
 * comparable across parents. Search is additionally bounded by maxSize.
 *
 * @param c the annotated child symbol to search upward from
 * @param probState per-state prior (log) weights
 * @param probSubGivenState per-substate prior (log) weights given state
 * @return up to nScores best parent productions (priority = -score)
 */
PriorityQueue getTopParentRuleProductions(FullState c,
double[] probState, double[][] probSubGivenState) {
PriorityQueue results = new PriorityQueue(nScores+1);
PriorityQueue unExpanded = new PriorityQueue();
// Start below the child's prior so completed states can add the
// parent's prior back in (see the acceptance branch).
double score = -(probState[c.state]+probSubGivenState[c.state][c.substate]);
unExpanded.add(new SearchState(c,c,score),-score);
// Hard cap on the frontier to keep the search from blowing up.
int maxSize = 10000;
while (unExpanded.size() != 0
&& unExpanded.size() < maxSize
&& (results.size() < nScores || unExpanded.peek().score > -results
.peek().score)) {
//expand best-looking SearchState so far
SearchState state = unExpanded.next();
//accept complete productions
if (state.danglingState==null || (state.extended && !continues(state.danglingState.state))) {
if (state.danglingState!=null)
state.parent = state.danglingState;
// Re-add the parent's prior so scores are P(parent|child)-like.
state.score += probState[state.parent.state]
+ probSubGivenState[state.parent.state][state.parent.substate];
results.add(state,-state.score);
if (results.size()>nScores)
results.next();
}
//try to complete partial productions
else {
// unary steps: child -> parent
for (UnaryRule rule: grammar.getUnaryRulesByChild(state.danglingState.state)) {
double[][] scores = rule.getScores2();
if (scores[state.danglingState.substate]==null) continue;
for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
double rscore = scores[state.danglingState.substate][pSubState];
FullState s = new FullState(rule.getParentState(),pSubState);
SearchState newState = state.extendUp(null,s,rscore,false);
unExpanded.add(newState,newState.score);
}
}
// binary steps where the dangling symbol is the left child
for (BinaryRule rule : grammar.splitRulesWithLC(state.danglingState.state)){//getBinaryRulesByLeftChild(state.danglingState.state)) {
double[][][] scores = rule.getScores2();
for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
FullState ps = new FullState(rule.getParentState(),pSubState);
for (short rSubState = 0; rSubState < grammar.numSubStates[rule.getRightChildState()]; rSubState++) {
if (scores[state.danglingState.substate][rSubState]==null) continue;
FullState rs = new FullState(rule.getRightChildState(),rSubState);
SearchState newState;
double rscore = scores[state.danglingState.substate][rSubState][pSubState];
newState = state.extendUp(rs,ps,rscore,false);
unExpanded.add(newState,newState.score);
}
}
}
// binary steps where the dangling symbol is the right child
for (BinaryRule rule : grammar.splitRulesWithRC(state.danglingState.state)){//getBinaryRulesByRightChild(state.danglingState.state)) {
double[][][] scores = rule.getScores2();
for (short pSubState = 0; pSubState < grammar.numSubStates[rule.getParentState()]; pSubState++) {
FullState ps = new FullState(rule.getParentState(),pSubState);
for (short lSubState = 0; lSubState < grammar.numSubStates[rule.getLeftChildState()]; lSubState++) {
if (scores[lSubState][state.danglingState.substate]==null) continue;
// NOTE(review): 'rs' here actually holds the LEFT sibling.
FullState rs = new FullState(rule.getLeftChildState(),lSubState);
SearchState newState;
double rscore = scores[lSubState][state.danglingState.substate][pSubState];
newState = state.extendUp(rs,ps,rscore,true);
unExpanded.add(newState,newState.score);
}
}
}
}
}
return results;
}
/**
 * True when the symbol named by {@code state} is a binarization
 * continuation tag, i.e. its display name begins with '@'.
 *
 * @param state the state id to look up
 * @return whether the state's name starts with '@'
 */
public boolean continues(short state) {
String name = (String) tagNumberer.object(state);
return name.charAt(0) == '@';
}
// NOTE(review): this whole region is corrupted by the HTML scrape. The body
// of pad() and the declaration of the enclosing driver method (presumably
// main) were swallowed where a '<'-expression was parsed as an HTML tag,
// and the printed string literals lost their HTML tags, leaving them
// unterminated across lines. Kept byte-identical; recover from upstream
// before editing logic.
public static String pad(String s, int width, char c) {
StringBuffer sb = new StringBuffer(s);
for (int i=s.length(); i");
System.out.println("Links
");
System.out.println("- Lexicon
");
System.out.println("- Grammar
");
System.out.println("- Trunks
");
System.out.println("- Parents
");
System.out.println("- Parent Rules
");
System.out.println("
");
System.out.println("");
// Load and binarize the training corpus with the model's markovization.
Corpus corpus = new Corpus(wsjLoc,opts.treebank,1.0,false);
List> trainTrees = Corpus.binarizeAndFilterTrees(corpus
.getTrainTrees(), pData.getV_markov(), pData.getH_markov(),
opts.maxL, pData.getBinarization(), false, false);
trainTrees = Corpus.filterTreesForConditional(trainTrees, false,false,false);
StateSetTreeList trainStateSetTrees = new StateSetTreeList(trainTrees, nonLogGrammar.numSubStates, false, tagNumberer);
int padding = 3;
topN = 30;
printLexiconStatistics(lexicon, tagNumberer,grammar.isGrammarTag,grammar, trainStateSetTrees, opts);
GrammarStatistics gs = new GrammarStatistics(grammar,tagNumberer, topN);
// determine which tags need to be examined.
// Continuation tags and lexical tags are excluded
Set noContinueTags = new HashSet();
Set continueTags = new HashSet();
for (short i=0; i");
Set allRealTags = new HashSet(noContinueTags);
// Tally per-state and per-substate posteriors over the training trees.
for (short i=0; i tree : trainStateSetTrees) {
// System.out.println("adding probs for tree "+nTree+" / "+trainStateSetTrees.size());
parser.doInsideOutsideScores(tree,false,true);
tallyProbState(tree,probState,allRealTags);
tallyProbSubState(tree,probSubGivenState,allRealTags);
}
for (int state=0; state");
}
/**
 * Accumulates, into probSubGivenState, the posterior substate mass for
 * every node of the tree, normalizing by the tree's root inside score.
 *
 * @param tree a tree already scored by doInsideOutsideScores
 * @param probSubGivenState [state][substate] accumulator, updated in place
 * @param noContinueTags only states in this set are tallied
 */
private static void tallyProbSubState(Tree tree,
double[][] probSubGivenState, Set noContinueTags) {
tallyProbSubStateHelper(tree,tree.getLabel().getIScore(0),
probSubGivenState,noContinueTags);
}
/**
 * Recursive worker for tallyProbSubState: reads inside/outside scores off
 * each labeled node and folds them into probSubGivenState.
 *
 * NOTE(review): the accumulation loop body below was swallowed by the HTML
 * scrape ("for (int substate=0; substate child : ..." fuses the tally loop
 * with the child recursion). Kept byte-identical; recover from upstream.
 *
 * @param tree current subtree
 * @param treeProb root inside score used for normalization
 * @param probSubGivenState [state][substate] accumulator
 * @param tags only states in this set are tallied
 */
private static void tallyProbSubStateHelper(Tree tree,
double treeProb, double[][] probSubGivenState,
Set tags) {
if (tree.isLeaf())
return;
StateSet label = tree.getLabel();
short state = label.getState();
if (tags.contains(state)) {
double[] iScores = label.getIScores();
double[] oScores = label.getOScores();
double[] scores = new double[iScores.length];
double sum = 0;
for (int substate=0; substate child : tree.getChildren())
tallyProbSubStateHelper(child,treeProb,probSubGivenState,tags);
}
/**
 * Count occurrences of each state over the whole tree. Only states in the
 * given set are counted (continuation "@" states are excluded upstream).
 *
 * @param tree tree to tally; leaves contribute nothing
 * @param probState per-state occurrence counts, incremented in place
 * @param tags the set of state ids that should be counted
 */
private static void tallyProbState(Tree tree, double[] probState, Set tags) {
if (tree.isLeaf()) {
return;
}
short observed = tree.getLabel().getState();
if (tags.contains(observed)) {
probState[observed] += 1;
}
for (Tree subtree : tree.getChildren()) {
tallyProbState(subtree, probState, tags);
}
}
/**
 * For every child state, ranks its most likely annotated parents and
 * prints them as a table; returns the ranked parents per child state.
 *
 * NOTE(review): the two header loops below were fused by the HTML scrape
 * (the spans "<grammar.numStates; ...>" were consumed as tags), and the
 * printed string literals lost their HTML tags. Kept byte-identical.
 *
 * @param columnOutput whether tables are emitted column-major
 * @return parents[childState][rank], best first
 */
private static FullState[][] printParentStatistics(boolean columnOutput, Grammar grammar, Numberer tagNumberer, Grammar nonLogGrammar, Lexicon nonLogLexicon, int topN, GrammarStatistics gs, List> trainTrees, ArrayParser parser) {
System.out.println("Parents
");
System.out.println("");
for (short childState=0; childState results = new PriorityQueue(topN+1);
for (short parentState=0; parentStatetopN)
results.next();
}
}
// Drain the bounded queue into best-first order.
ArrayList resultsA = new ArrayList(topN);
while (results.size()!=0) {
resultsA.add(0,results.next());
}
parents[childState] = new FullState[resultsA.size()];
for (short j = 0; j < topN; j++){
String o="";
double p=-1;
if (resultsA.size()>j) {
parents[childState][j] = resultsA.get(j);
p = resultsA.get(j).score;
String w = resultsA.get(j).toString(tagNumberer,childFullName);
o = f.format(p)+" "+w;
}
outputMatrix[j+1][cS] = o;
}
}
printRules("Parent", "parent", columnOutput, outputMatrix);
}
return parents;
}
/**
 * Prints the top rules for every binarization-continuation ("trunk") tag
 * alongside the rules of its un-prefixed parent tag.
 *
 * NOTE(review): the printed string literals lost their HTML tags in the
 * scrape and now span lines. Kept byte-identical.
 *
 * @param columnOutput whether tables are emitted column-major
 * @param continueTags the set of '@'-prefixed continuation tag ids
 */
private static void printTrunkStatistics(boolean columnOutput, Numberer tagNumberer, int padding, int topN, GrammarStatistics gs, Set continueTags) {
System.out.println("Trunks
");
//output trunk rule probabilities
for (short tag : continueTags) {
// Strip the leading '@' to find the originating parent tag.
String tagS = ((String)tagNumberer.object(tag)).substring(1);
short parentTag = (short)tagNumberer.number(tagS);
gs.printTopRules(parentTag, topN, columnOutput, padding);
gs.printTopRules(tag, topN, columnOutput, padding);
System.out.println("");
}
}
/**
 * For every non-continuation tag, ranks the top productions of each of its
 * substates (via getTopProductions) and prints them as a table.
 *
 * NOTE(review): the printed string literals lost their HTML tags in the
 * scrape. Kept byte-identical.
 *
 * @param columnOutput whether tables are emitted column-major
 * @param noContinueTags tags to report on (continuation tags excluded)
 */
private static void printGrammarStatistics(boolean columnOutput, ParserData pData, Numberer tagNumberer, int topN, GrammarStatistics gs, Set noContinueTags) {
System.out.println("Grammar
");
System.out.println("");
// print rule probabilities
for (short curTag : noContinueTags){
int nSubStates = pData.numSubStatesArray[curTag];
ArrayList[] results = new ArrayList[nSubStates];
for (short i = 0; i < nSubStates; i++) {
//do heavy computation
PriorityQueue pq = gs.getTopProductions(new FullState(curTag,i));
//convert pq to array
results[i] = new ArrayList(topN);
while (pq.size()!=0) {
// Convert log scores to probabilities for display.
pq.peek().score = Math.exp(pq.peek().score);
results[i].add(0,pq.next());
}
}
String[][] outputMatrix = new String[topN+1][nSubStates];
String tagName = (String) tagNumberer.object(curTag);
for (int i = 0; i < nSubStates; i++) {
outputMatrix[0][i] = tagName + "-" + i;
}
for (int j = 0; j < topN; j++){
for (int i = 0; i < nSubStates; i++) {
String o="";
double p=-1;
if (results[i].size()>j) {
p = results[i].get(j).score;
String w = results[i].get(j).toString(tagNumberer);
o = f.format(p)+" "+w;
}
outputMatrix[j+1][i] = o;
}
}
printRules("Grammar","productions", columnOutput, outputMatrix);
}
System.out.println(" ");
}
/**
 * For every non-continuation tag, ranks the most likely parent rules of
 * each substate (via getTopParentRuleProductions) and prints them.
 *
 * NOTE(review): the printed string literals lost their HTML tags in the
 * scrape. Kept byte-identical.
 *
 * @param columnOutput whether tables are emitted column-major
 * @param probState / probSubGivenState priors passed through to the search
 */
private static void printParentRuleStatistics(boolean columnOutput, ParserData pData, Numberer tagNumberer, int topN, GrammarStatistics gs, Set noContinueTags,
double[] probState, double[][] probSubGivenState) {
System.out.println("Parent Rules
");
// print rule probabilities
for (short curTag : noContinueTags){
int nSubStates = pData.numSubStatesArray[curTag];
ArrayList[] results = new ArrayList[nSubStates];
for (short i = 0; i < nSubStates; i++) {
//do heavy computation
PriorityQueue pq = gs.getTopParentRuleProductions(new FullState(curTag,i),probState,probSubGivenState);
//convert pq to array
results[i] = new ArrayList(topN);
while (pq.size()!=0) {
// Convert log scores to probabilities for display.
pq.peek().score = Math.exp(pq.peek().score);
results[i].add(0,pq.next());
}
}
String[][] outputMatrix = new String[topN+1][nSubStates];
String tagName = (String) tagNumberer.object(curTag);
for (int i = 0; i < nSubStates; i++) {
outputMatrix[0][i] = tagName + "-" + i;
}
for (int j = 0; j < topN; j++){
for (int i = 0; i < nSubStates; i++) {
String o="";
double p=-1;
if (results[i].size()>j) {
p = results[i].get(j).score;
String w = results[i].get(j).toString(tagNumberer);
o = f.format(p)+" "+w;
}
outputMatrix[j+1][i] = o;
}
}
printRules("Parent Rules","parentrules", columnOutput, outputMatrix);
}
}
/**
 * NOTE(review): from here through ruleString(), several method bodies were
 * fused by the HTML scrape — logarithmModeTree's loop, the Viterbi parent
 * probability helpers, printTopRules's declaration, and ruleString's head
 * were each cut where a '<'-expression was consumed as an HTML tag. Kept
 * byte-identical; recover the original file before editing logic here.
 */
private static void logarithmModeTree(Tree tree) {
if (tree.isLeaf())
return;
double[] iScores = tree.getLabel().getIScores();
int iScale = tree.getLabel().getIScale();
double[] oScores = tree.getLabel().getOScores();
int oScale = tree.getLabel().getOScale();
for (int i=0; i tree, Grammar g,
double[][][][] parentProbs, double[][] normFactors,
double treeScore) {
int nSubStates = tree.getLabel().numSubStates();
double[][] viterbiProbs = new double[nSubStates][nSubStates];
for (int i=0; i tree, Grammar g,
double[][][][] parentProbs, double[][] normFactor, double[][] viterbiProbs, double treeScore) {
if (tree.isPreTerminal() || tree.isLeaf())
return;
short pState = tree.getLabel().getState();
int nParentStates = tree.getLabel().numSubStates();
List> children = tree.getChildren();
switch(children.size()) {
case 1:
Tree child = children.get(0);
short cState = child.getLabel().getState();
double[][] scores = g.getUnaryScore(pState,cState);
int nChildStates = child.getLabel().numSubStates();
double[][] newViterbiProbs = new double[viterbiProbs.length][nChildStates];
for (int gpS=0; gpS lChild = children.get(0);
Tree rChild = children.get(1);
short lcState = lChild.getLabel().getState();
short rcState = rChild.getLabel().getState();
double[][][] scoresB = g.getBinaryScore(pState,lcState,rcState);
int nLChildStates = lChild.getLabel().numSubStates();
int nRChildStates = rChild.getLabel().numSubStates();
double[][] newLViterbiProbs = new double[viterbiProbs.length][nLChildStates];
double[][] newRViterbiProbs = new double[viterbiProbs.length][nRChildStates];
for (int gpS=0; gpS child, short gpState, short cState,
double[][][][] parentProbs, double[][] normFactor, double[][] viterbiProbs) {
for (int gpS=0; gpS topRules = new PriorityQueue();
for (BinaryRule r : grammar.splitRulesWithP(tag)){//getBinaryRulesByParent(tag)) {
for (int lSubState = 0; lSubState < grammar.numSubStates[r.getLeftChildState()]; lSubState++) {
for (int rSubState = 0; rSubState < grammar.numSubStates[r.getRightChildState()]; rSubState++) {
double score = r.getScore(subState,lSubState,rSubState);
topRules.add(new RuleStruct(r,score,subState,lSubState,rSubState),-score);
if (topRules.size() > topN)
//remove worst rule
topRules.next();
}
}
}
for (UnaryRule r : grammar.getUnaryRulesByParent(tag)) {
for (int cSubState = 0; cSubState < grammar.numSubStates[r.getChildState()]; cSubState++) {
double score = r.getScore(subState,cSubState);
topRules.add(new RuleStruct(r,score,subState,cSubState),-score);
if (topRules.size() > topN)
//remove worst rule
topRules.next();
}
}
ArrayList r = new ArrayList();
while (topRules.hasNext()) {
RuleStruct s = topRules.next();
r.add(0,s);
}
for (int i=0; i"+leftName+" ");
sB.append(""+rightName+" ");
} else {
UnaryRule u = (UnaryRule)r.r;
String childName = tagNumberer.object(u.childState)+"-"+r.lS;
sB.append(""+childName+" ");
}
return sB.toString();
}
/**
 * Emits one table of ranked rules, either column-major (one column per
 * substate) or row-major. Cell text is passed through sanitize().
 *
 * NOTE(review): the printed string literals lost their HTML table tags in
 * the scrape and now span lines. Kept byte-identical.
 *
 * @param typeName heading for the table
 * @param ruleTypeName used for anchor labels
 * @param columnOutput layout selector
 * @param outputMatrix [rank+1][substate] cells; row 0 holds headers
 */
private static void printRules(String typeName, String ruleTypeName,
boolean columnOutput, String[][] outputMatrix) {
System.out.println(""+typeName+"
");
if (columnOutput) {
for (int i = 0; i < outputMatrix.length; i++){
System.out.println("");
for (int j = 0; j < outputMatrix[0].length; j++) {
if (i==0) {
System.out.println(" ");
System.out.print(outputMatrix[i][j]);
System.out.println(" (p) ");
} else
System.out.print(""+sanitize(outputMatrix[i][j])+" ");
}
System.out.println(" ");
}
} else {
for (int j = 0; j < outputMatrix[0].length; j++) {
System.out.println("");
for (int i = 0; i < outputMatrix.length; i++){
if (j==0) {
System.out.println(" ");
System.out.print(outputMatrix[i][j]);
System.out.println(" ");
} else
System.out.print(""+sanitize(outputMatrix[i][j])+" ");
}
System.out.println(" ");
}
}
System.out.println("
");
}
/**
 * NOTE(review): maxWidthInRow's loop body was swallowed by the HTML scrape
 * (the "<m[0].length; ...>" span was consumed as a tag), fusing it with a
 * long commented-out printLexiconUnknownStatistics block. Kept
 * byte-identical; recover from upstream before editing.
 */
public static int maxWidthInRow(String[][] m,int row) {
int l=0;
for (int c=0; c=3 letters long, ends with s, and not 'is' or 'us'\n" +
// " The rest capture endings:\n" +
// " -ed\n" +
// " -ing\n" +
// " -ion\n" +
// " -er\n" +
// " -est\n" +
// " -ly\n" +
// " -ity\n" +
// " -y\n" +
// " -al\n");
// Map unk = lexicon.getUnseenScores();
// for (String sig : unk.keySet()) {
// System.out.println();
// System.out.println("signature "+sig);
// double[][] scores = unk.get(sig);
// int maxWidth = 0;
// int count = 0;
// for (int tag=0; tag= scores[tag].length)
// out[tagIdx][substate] = "";
// else
// out[tagIdx][substate] = f.format(scores[tag][substate]);
// }
// tagIdx++;
// }
// printRules("nothing","not ready",false,out);
// }
}
/**
 * Tallies expected word/tag counts over the training trees and prints, for
 * every grammar tag, the topN highest-scoring words per substate as an
 * HTML table.
 *
 * NOTE(review): this method is heavily garbled by the HTML scrape — the
 * per-substate tally loop, the entropy block, and the per-tag loop heads
 * were each cut where a '<'-expression was consumed as a tag, and printed
 * string literals lost their HTML tags. Kept byte-identical.
 */
public static void printLexiconStatistics(Lexicon lexicon, Numberer tagNumberer, boolean[] grammarTags, Grammar grammar, StateSetTreeList trainStateSetTrees, Options opts){
//printLexiconUnknownStatistics(lexicon, tagNumberer);
System.out.println("Lexicon
");
System.out.println("");
double[][][] counts = null;
double[][] posteriors = new double[grammar.numStates][(int)ArrayUtil.max(grammar.numSubStates)];
// SimpleLexicon path: recompute expected counts with inside/outside.
if (lexicon instanceof SimpleLexicon){
counts = new double[grammar.numStates][((SimpleLexicon)lexicon).nWords][grammar.numSubStates[1]];
ParserData pDataNoLog = ParserData.Load(opts.in);
if (pDataNoLog == null) {
System.exit(1);
}
Grammar nonLogGrammar = pDataNoLog.getGrammar();
nonLogGrammar.splitRules();
SimpleLexicon nonLogLexicon = (SimpleLexicon)pDataNoLog.getLexicon();
nonLogLexicon.explicitlyComputeScores(nonLogGrammar.finalLevel);
SpanPredictor spanPredictor = pDataNoLog.getSpanPredictor();
// SophisticatedLexicon newLex = new SophisticatedLexicon(grammar.numSubStates, SophisticatedLexicon.DEFAULT_SMOOTHING_CUTOFF, new double[]{0.5, 0.1}, new SmoothAcrossParentSubstate(0.1), 1.0e-30);
if (opts.unkT<0) {
System.out.println("Replacing rare words");
Corpus.replaceRareWords(trainStateSetTrees,new SimpleLexicon(grammar.numSubStates,-1), Math.abs(opts.unkT));
}
nonLogLexicon.labelTrees(trainStateSetTrees);
ConstrainedHierarchicalTwoChartParser parser = new ConstrainedHierarchicalTwoChartParser(nonLogGrammar, nonLogLexicon, spanPredictor, grammar.finalLevel);
// HiearchicalAdaptiveLinearizer linearizer = new HiearchicalAdaptiveLinearizer(nonLogGrammar, nonLogLexicon, spanPredictor, grammar.finalLevel);
// double[] counts = new double[linearizer.dimension()];
// int nTrees = trainStateSetTrees.size();
// boolean secondHalf;
// int n=0;
for (Tree stateSetTree : trainStateSetTrees) {
// secondHalf = (n++>nTrees/2.0);
boolean noSmoothing = true, debugOutput = false;
parser.doInsideOutsideScores(stateSetTree,false,false);
grammar.tallyMergeWeights(stateSetTree, posteriors);
double tree_score = stateSetTree.getLabel().getIScore(0);
int tree_scale = stateSetTree.getLabel().getIScale();
List yield = stateSetTree.getYield();
int i =0;
for (StateSet stateSet : stateSetTree.getPreTerminalYield()){
double scalingFactor = ScalingTools.calcScaleFactor(stateSet.getOScale()+stateSet.getIScale()-tree_scale);
StateSet child = yield.get(i++);
for (short substate=0; substate stateSetTree : trainStateSetTrees) {
// parser.doInsideOutsideScores(stateSetTree,true,false);
// grammar.tallyMergeWeights(stateSetTree, posteriors);
// }
}
// System.out.println("Entropies");
// for (short curTag=0; curTag[] wordToTagCounters = lexicon.wordToTagCounters;
for (short curTag=0; curTag[] pQs = new PriorityQueue[nSubStates];
for (int i = 0; i < nSubStates; i++) {
pQs[i] = new PriorityQueue();
}
double[] sum = new double[grammar.numSubStates[curTag]];
// SophisticatedLexicon path: rank by the lexicon's own scores.
if (lexicon instanceof SophisticatedLexicon){
sum = posteriors[curTag];
SophisticatedLexicon lex = (SophisticatedLexicon)lexicon;
HashMap tagMap = lex.wordToTagCounters[curTag];
for (String word : tagMap.keySet()) {
double[] lexiconScores = lexicon.score(word,curTag,0,false,false);
// double[] counts = tagMap.get(word);
for (int i = 0; i < nSubStates; i++) {
pQs[i].add(word, lexiconScores[i]);//counts[i]);
}
}
}
else {
sum = new double[grammar.numSubStates[curTag]];
SimpleLexicon lex = (SimpleLexicon)lexicon;
for (int w=0; w=lex.wordCounter.length||lex.wordCounter[k]<=51) continue;
String word = (String)lex.wordIndexer.get(w);
// System.out.println(word + " " +lex.wordCounter[k]+" ");
// double[] lexiconScores = lexicon.score(word,curTag,0,true,word.startsWith("UNK"));
double[] lexiconScores = counts[curTag][w];
boolean allZero=true;
for (int i=0; iLexicon");
System.out.println("");
System.out.println("");
for (int i = 0; i < nSubStates; i++) {
System.out.println("");
System.out.println(" ");
System.out.print(sanitize(tagName) + "-" + i);
System.out.println(" (p)");
System.out.println("
"+sum[i]/s);
System.out.println(" ");
}
System.out.println(" ");
for (int j = 0; j < topN; j++){
System.out.println("");
/* System.out.println("The top " + topN + " words for the tag "
+ (String) tagNumberer.object(curTag) + "-" + i + " are:");
System.out.println(pQs[i].toString(topN));
}
*/ for (int i = 0; i < nSubStates; i++) {
if (i==0){ System.out.print("\n"); }
String w="";
double p=-1;
if (pQs[i].hasNext()) {
p = pQs[i].getPriority();
w = pQs[i].next();
String tmp = sanitize(w)+" "+f.format(p);
if (tmp.length()<8) tmp = tmp.concat("\t");
System.out.print(""+tmp+" ");
}
}
System.out.println(" ");
}
System.out.println("
");
}
System.out.println(" ");
}
/**
 * Anchor label for a tag's lexicon table.
 *
 * @param tagName the tag's display name
 * @return the quoted label string, e.g. "\"productions-NP\""
 */
static String lexiconLabel(String tagName) {
StringBuilder out = new StringBuilder("\"productions-");
out.append(tagName).append('"');
return out.toString();
}
/**
 * Builds the quoted anchor label "ruleTypeName-tagName".
 *
 * @param ruleTypeName the table/rule kind (e.g. "parentrules")
 * @param tagName the tag's display name
 * @return the quoted label string
 */
static String label(String ruleTypeName, String tagName) {
return String.format("\"%s-%s\"", ruleTypeName, tagName);
}
/** Like label(), but prefixed with '#' for use as a hyperlink target. */
static String reflabel(String ruleTypeName, String tagName) {
StringBuilder sb = new StringBuilder();
sb.append('"').append('#').append(ruleTypeName).append('-').append(tagName).append('"');
return sb.toString();
}
/** Anchor label for a tag's parent-rules table (inlines label("parentrules", tagName)). */
static String parentLabel(String tagName) {
return "\"parentrules-" + tagName + "\"";
}
/** Hyperlink reference to a tag's parent-rules table (inlines reflabel("parentrules", tagName)). */
static String parentRefLabel(String tagName) {
return "\"#parentrules-" + tagName + "\"";
}
/**
 * Escapes ampersands for the HTML tables this class prints.
 *
 * NOTE(review): the scraped source read replaceAll("&","&"), a no-op; the
 * replacement string was almost certainly the entity "&amp;", which the
 * HTML extraction collapsed back to "&". Restored here so cell text is
 * valid HTML.
 *
 * @param s raw cell text
 * @return s with every '&' replaced by "&amp;"
 */
static String sanitize(String s) {
return s.replaceAll("&", "&amp;");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy