![JAR search and dependency download from the Maven repository](/logo.png)
edu.berkeley.nlp.PCFGLA.CorpusStatistics Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
*
*/
package edu.berkeley.nlp.PCFGLA;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.*;
import edu.berkeley.nlp.util.PriorityQueue;
/**
* CorpusStatistics calculates symbol counts for a corpus.
*
* @author leon
*
*/
public class CorpusStatistics {
// i dont know how to initialize shorts...
short zero = 0, one = 1;
int[] counts;
Collection> trees;
Counter unaryRuleCounter;
Counter binaryRuleCounter;
int[] contexts;
CounterMap posCounter;
/**
* Count statistics for a collection of StateSet trees.
*/
public CorpusStatistics(Numberer tagNumberer, Collection> trees) {
counts = new int[tagNumberer.objects().size()];
this.trees = trees;
unaryRuleCounter = new Counter();
binaryRuleCounter = new Counter();
contexts = new int[tagNumberer.objects().size()];
posCounter = new CounterMap();
}
/**
 * Adds, for every tree in the corpus, one occurrence per node (from the root
 * down to and including the preterminals) to the per-state counts.
 * <p>
 * The counts are accumulated, never reset: calling this method again adds
 * the same totals a second time.
 */
public void countSymbols() {
    for (Tree<StateSet> tree : trees) {
        addCount(tree);
    }
}
/**
 * Recursively counts the label state of {@code tree} and of all its
 * descendants down to the preterminal level.
 *
 * @param tree subtree whose states are added to the counts
 */
private void addCount(Tree<StateSet> tree) {
    counts[tree.getLabel().getState()]++;
    // The children of a preterminal are not counted.
    if (tree.isPreTerminal()) {
        return;
    }
    for (Tree<StateSet> child : tree.getChildren()) {
        addCount(child);
    }
}
/*
* Counts how many different 'things' (non-terminals or terminals for the POS)
* appear under a given nonterminal symbol.
* Currently POS and other nonterminals are handled the same way.
* We might to change that.
*/
public void countRuleParents(){
for (Tree tree : trees) {
addParent(tree);
}
for (BinaryRule br : binaryRuleCounter.keySet()){
contexts[br.parentState]++;
contexts[br.leftChildState]++;
contexts[br.rightChildState]++;
}
for (UnaryRule ur : unaryRuleCounter.keySet()){
contexts[ur.parentState]++;
contexts[ur.childState]++;
}
for (int i=0; i tempC = posCounter.getCounter(i);
contexts[i] += tempC.size();
}
}
/**
 * Returns the per-state context counts.
 * The array is only filled in by {@link #countRuleParents()}; this method
 * does not trigger any counting itself.
 *
 * @return the internal context-count array (not a copy)
 */
public int[] getContextCounts() {
    return this.contexts;
}
/**
 * Recursively records the rule rooted at {@code tree} and at each of its
 * descendants, and counts the label state of every visited node.
 * <p>
 * For a preterminal, the word of its first child is recorded under the
 * preterminal's state. For any other node, a unary rule is recorded when it
 * has exactly one child; otherwise a binary rule is built from its first two
 * children.
 * NOTE(review): the child count is not checked beyond "exactly one", so this
 * assumes non-preterminal nodes have one or two children - confirm that
 * callers only pass binarized trees.
 *
 * @param tree subtree to collect rule and word statistics from
 */
private void addParent(Tree<StateSet> tree) {
    short parentState = tree.getLabel().getState();
    counts[parentState]++;
    if (tree.isPreTerminal()) {
        // The state is widened to int so it is stored under an Integer key.
        posCounter.incrementCount((int) parentState,
                tree.getChildren().get(0).getLabel().getWord(), 1.0);
        return;
    }
    if (tree.getChildren().size() == 1) {
        // The score array is a placeholder; only the states identify the rule here.
        UnaryRule r = new UnaryRule(parentState,
                tree.getChildren().get(0).getLabel().getState(),
                new double[1][1]);
        unaryRuleCounter.incrementCount(r, 1.0);
    } else {
        BinaryRule r = new BinaryRule(parentState,
                tree.getChildren().get(0).getLabel().getState(),
                tree.getChildren().get(1).getLabel().getState(),
                new double[1][1][1]);
        binaryRuleCounter.incrementCount(r, 1.0);
    }
    for (Tree<StateSet> child : tree.getChildren()) {
        addParent(child);
    }
}
/**
 * Gets the number of times each state appeared.
 * <p>
 * Every call runs {@link #countSymbols()} first, so the counts grow with
 * each invocation instead of being recomputed from scratch.
 *
 * @return the internal per-state count array (not a copy), indexed by state
 *         number
 */
public int[] getSymbolCounts() {
    this.countSymbols();
    return this.counts;
}
public void printStateCountArray(Numberer tagNumberer, int[] array){
PriorityQueue pq = new PriorityQueue(array.length);
for (int i=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy