All downloads are free. The search and download features use the official Maven repository.

edu.berkeley.nlp.PCFGLA.CorpusStatistics Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.PCFGLA;

import java.util.Collection;
import java.util.List;
import java.util.Set;

import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
import edu.berkeley.nlp.util.*;
import edu.berkeley.nlp.util.PriorityQueue;

/**
 * CorpusStatistics calculates symbol counts for a corpus.
 * 
 * @author leon
 * 
 */
public class CorpusStatistics {
	// i dont know how to initialize shorts...
	short zero = 0, one = 1;

	int[] counts;
	Collection> trees;
  Counter unaryRuleCounter;
  Counter binaryRuleCounter;
  
  int[] contexts;
  CounterMap posCounter;
	
	/**
	 * Prepares statistics collection for a corpus of StateSet trees.
	 *
	 * The constructor only allocates the count tables; nothing is counted until
	 * {@link #countSymbols()}, {@link #countRuleParents()} or
	 * {@link #getSymbolCounts()} is called.
	 *
	 * @param tagNumberer numberer whose object count fixes the size of the
	 *                    per-state arrays
	 * @param trees       the corpus to analyse; stored by reference, not copied
	 */
	public CorpusStatistics(Numberer tagNumberer, Collection<Tree<StateSet>> trees) {
		// Both per-state arrays are indexed by state number, so they share a size.
		int numStates = tagNumberer.objects().size();
		counts = new int[numStates];
		contexts = new int[numStates];
		this.trees = trees;
		unaryRuleCounter = new Counter<UnaryRule>();
		binaryRuleCounter = new Counter<BinaryRule>();
		posCounter = new CounterMap<Integer, String>();
	}	
	
	/**
	 * Adds the occurrences of every state in every corpus tree to the symbol
	 * counts. The counts are not reset first, so calling this more than once
	 * accumulates.
	 */
	public void countSymbols() {
		for (Tree<StateSet> tree : trees) {
			addCount(tree);
		}
	}

	/**
	 * Increments the count of the state at the root of {@code tree} and recurses
	 * into its children. Recursion stops at preterminal nodes, so the nodes
	 * below a preterminal are never counted.
	 *
	 * @param tree the subtree to count
	 */
	private void addCount(Tree<StateSet> tree) {
		// Plain integer increment; the former "+= 1.0" went through a hidden
		// double-to-int narrowing conversion.
		counts[tree.getLabel().getState()]++;
		if (tree.isPreTerminal()) {
			return;
		}
		for (Tree<StateSet> child : tree.getChildren()) {
			addCount(child);
		}
	}

	/*
	 * Counts how many different 'things' (non-terminals or terminals for the POS)
	 * appear under a given nonterminal symbol.
	 * Currently POS and other nonterminals are handled the same way.
	 * We might to change that.
	 */
	
	public void countRuleParents(){	
		for (Tree tree : trees) {
			addParent(tree);
		}
		for (BinaryRule br : binaryRuleCounter.keySet()){
			contexts[br.parentState]++;
			contexts[br.leftChildState]++;
			contexts[br.rightChildState]++;
		}
		for (UnaryRule ur : unaryRuleCounter.keySet()){
			contexts[ur.parentState]++;
			contexts[ur.childState]++;
		}
		for (int i=0; i tempC = posCounter.getCounter(i);
			contexts[i] += tempC.size();
			
		}
	}

	/**
	 * Returns the per-state context counts.
	 *
	 * The internal array itself is returned, not a copy, and this method does
	 * not trigger any counting.
	 *
	 * @return the context count array, indexed by state number
	 */
	public int[] getContextCounts() {
		return this.contexts;
	}
	
	/**
	 * Recursively records the rule applied at the root of {@code tree} and at
	 * every node below it, and increments the symbol count of each visited state.
	 *
	 * At a preterminal node the pair (state, word of the first child) is added
	 * to the POS counter. At any other node a unary rule is recorded when there
	 * is exactly one child; otherwise a binary rule is built from the first two
	 * children.
	 * NOTE(review): this presumably relies on the trees being binarized; a node
	 * with more than two children contributes only its first two to the rule.
	 *
	 * @param tree the subtree to process
	 */
	private void addParent(Tree<StateSet> tree) {
		short parentState = tree.getLabel().getState();
		counts[parentState]++;
		List<Tree<StateSet>> children = tree.getChildren();
		if (tree.isPreTerminal()) {
			posCounter.incrementCount((int) parentState, children.get(0).getLabel().getWord(), 1.0);
			return;
		}
		// The score arrays are placeholders; only the rule's states are used here.
		if (children.size() == 1) {
			UnaryRule r = new UnaryRule(parentState,
					children.get(0).getLabel().getState(), new double[1][1]);
			unaryRuleCounter.incrementCount(r, 1.0);
		} else {
			BinaryRule r = new BinaryRule(parentState,
					children.get(0).getLabel().getState(),
					children.get(1).getLabel().getState(), new double[1][1][1]);
			binaryRuleCounter.incrementCount(r, 1.0);
		}
		for (Tree<StateSet> child : children) {
			addParent(child);
		}
	}
	
	
	/**
	 * Counts the symbols of the whole corpus and returns the result.
	 *
	 * Every call runs {@link #countSymbols()} again, which adds to the existing
	 * counts rather than starting from zero. The internal array is returned
	 * directly, not a copy.
	 *
	 * @return the number of times each state appeared, indexed by state number
	 */
	public int[] getSymbolCounts() {
		this.countSymbols();
		return this.counts;
	}
	
	public void printStateCountArray(Numberer tagNumberer, int[] array){
  	PriorityQueue pq = new PriorityQueue(array.length);
  	for (int i=0; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy