edu.berkeley.nlp.PCFGLA.HierarchicalFullyConnectedLexicon Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
 * 
 */
package edu.berkeley.nlp.PCFGLA;

import java.util.Arrays;
import java.util.List;

import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;

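/**
 * A {@link HierarchicalLexicon} that tracks how often each training word occurs
 * and, for words seen at most {@code knownWordCount} times, additionally indexes
 * a signature string obtained from {@code getSignature}.
 *
 * @author petrov
 */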
public class HierarchicalFullyConnectedLexicon extends HierarchicalLexicon {

	private static final long serialVersionUID = 1L;
	protected int knownWordCount;


	/**
	 * Creates a lexicon without reading any training data.
	 *
	 * @param numSubStates forwarded unchanged to the superclass constructor
	 *        (presumably the number of sub-states per tag; confirm against HierarchicalLexicon)
	 * @param knownWordCount occurrence threshold stored on this instance; words counted
	 *        more often than this are treated as known and get no signature index
	 */
	public HierarchicalFullyConnectedLexicon(short[] numSubStates, int knownWordCount) {
		// NOTE(review): the meaning of the literal 0 is defined by the superclass constructor, not visible here.
		super(numSubStates, 0);
		this.knownWordCount = knownWordCount;
	}
	
	/**
	 * Creates a lexicon and immediately initialises it from the given training trees.
	 *
	 * @param numSubStates forwarded to the two-argument constructor
	 * @param smoothingCutoff not used by this constructor
	 * @param smoothParam not used by this constructor
	 * @param smoother not used by this constructor
	 * @param trainTrees training trees handed to {@code init}
	 * @param knownWordCount occurrence threshold forwarded to the two-argument constructor
	 */
	public HierarchicalFullyConnectedLexicon(short[] numSubStates, int smoothingCutoff, double[] smoothParam, 
			Smoother smoother, StateSetTreeList trainTrees, int knownWordCount) {
  	this(numSubStates, knownWordCount);
  	// NOTE(review): init is public and non-final, so a subclass override would run before that subclass is constructed.
  	init(trainTrees);
  }

	
  /**
	 * Creates a lexicon from an existing one.
	 *
	 * @param previousLexicon lexicon forwarded to the superclass constructor
	 *        (what is copied from it is decided there; not visible in this class)
	 * @param knownWordCount occurrence threshold stored on this instance
	 */
	public HierarchicalFullyConnectedLexicon(SimpleLexicon previousLexicon, int knownWordCount) {
		super(previousLexicon);
		this.knownWordCount = knownWordCount;
	}
	
	/**
	 * Builds a new lexicon configured with this instance's sub-state array and
	 * known-word threshold, using the two-argument constructor.
	 *
	 * @return the newly constructed lexicon
	 */
	public HierarchicalFullyConnectedLexicon newInstance() {
		HierarchicalFullyConnectedLexicon fresh =
				new HierarchicalFullyConnectedLexicon(numSubStates, knownWordCount);
		return fresh;
	}

  // Initialises the word index, word counts and per-tag word indexers from the training trees.
  // NOTE(review): this listing is damaged. Generic type arguments are missing and text has been
  // lost in at least three places (marked below), so init runs straight into the tail of what
  // looks like a separate scoring method. Restore from the original source before compiling.
  public void init(StateSetTreeList trainTrees){
  	// Pass 1: register every word string that occurs in a tree yield.
  	for (Tree tree : trainTrees){
  		List words = tree.getYield();
  		for (StateSet word : words){
				String sig = word.getWord();
				wordIndexer.add(sig);
  		}
  	}
  	// Sized before any signature is added, so only the word strings from pass 1 have a counter slot.
  	wordCounter = new int[wordIndexer.size()];
  	// Pass 2: count occurrences of each word and register the signature for each position.
  	for (Tree tree : trainTrees){
  		List words = tree.getYield();
  		int ind = 0;  		
  		for (StateSet word : words){
  			String wordString = word.getWord();
  			wordCounter[wordIndexer.indexOf(wordString)]++;
  			
				String sig = getSignature(word.getWord(), ind++);
				wordIndexer.add(sig);
  		}  		
  	}
  	
  	tagWordIndexer = new IntegerIndexer[numStates];
  	// NOTE(review): text is missing inside the next line; a loop over tags and the start of a
  	// loop over trees have been fused. The declaration of lexTag was presumably lost here too.
  	for (int tag=0; tag tree : trainTrees){
  		List words = tree.getYield();
  		List tags = tree.getPreTerminalYield();
  		int ind = 0;
  		for (StateSet word : words){
  			int tag = tags.get(ind).getState();
				// Record both the word index and the signature index under this word's tag.
				tagWordIndexer[tag].add(new Integer(word.wordIndex));
				tagWordIndexer[tag].add(new Integer(word.sigIndex));
				lexTag[tag] = true;
				ind++;
  		}  		
  	}


  	expectedCounts = new double[numStates][][];
  	scores = new double[numStates][][];
  	// NOTE(review): text is missing inside the next line; the rest of init and the start of a
  	// scoring method (which uses globalWordIndex, globalSigIndex and res) are gone.
  	for (int tag=0; tag=0 && (wordCounter[globalWordIndex]>knownWordCount)) {
//			if (globalSigIndex!=-1) System.out.println("Problem: frequent word has signature!");
			return res;
		}
		if (globalSigIndex!=-1) {
			int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalSigIndex);
			if (tagSpecificWordIndex!=-1){
				// NOTE(review): text is missing inside the next line; the loop body, the end of this
				// method and the start of a method taking stateSet/word/noSmoothing/isSignature are gone.
				for (int i=0; i wordCounter.length){
//					System.out.println("no count for this word: "+(String)wordIndexer.get(tagWordIndexer[tag].get(stateSet.wordIndex)));
//					stateSet.sigIndex = -1;
//				} else {
				// A word counted more than knownWordCount times, or any word when noSmoothing is set, gets no signature.
				if ((stateSet.wordIndex>=0 && (wordCounter[stateSet.wordIndex]>knownWordCount)) || noSmoothing)
					stateSet.sigIndex = -1;
				// Otherwise, with a positive threshold, look up the signature index alongside the word index.
				else if (knownWordCount > 0)
					stateSet.sigIndex = wordIndexer.indexOf(getSignature(word,stateSet.from));
				// With a non-positive threshold the signature index replaces the word index itself.
				else 
					stateSet.wordIndex = wordIndexer.indexOf(getSignature(word,stateSet.from));
			}
//			}
		}
		// Delegate to the index-based score overload.
		return score(stateSet.wordIndex, stateSet.sigIndex, tag, stateSet.from, noSmoothing, isSignature);
	}
	
	
	/**
	 * Writes word and signature indices onto every leaf of the given trees.
	 *
	 * For each leaf, {@code wordIndex} is set to the word's position in
	 * {@code wordIndexer}. Then one of three things happens:
	 * <ul>
	 * <li>the index is negative or has no slot in {@code wordCounter}: a diagnostic
	 *     and the tree are printed, and {@code sigIndex} is left unchanged;</li>
	 * <li>the word was counted at most {@code knownWordCount} times: its signature is
	 *     added to {@code wordIndexer}, stored as {@code sigIndex}, and registered in
	 *     the indexer of the leaf's tag;</li>
	 * <li>otherwise: {@code sigIndex} is set to -1.</li>
	 * </ul>
	 *
	 * @param trainTrees trees whose leaves are labelled in place
	 */
	public void labelTrees(StateSetTreeList trainTrees){
		for (Tree<StateSet> tree : trainTrees){
			List<StateSet> words = tree.getYield();
			List<StateSet> tags = tree.getPreTerminalYield();
			int ind = 0;
			for (StateSet word : words){
				word.wordIndex = wordIndexer.indexOf(word.getWord());
				if (word.wordIndex<0 || word.wordIndex>=wordCounter.length){
					System.out.println("Have never seen this word before: "+word.getWord()+" "+word.wordIndex);
					System.out.println(tree);
				} else if (wordCounter[word.wordIndex]<=knownWordCount){
					short tag = tags.get(ind).getState();
					String sig = getSignature(word.getWord(), ind);
					wordIndexer.add(sig);
					// Look the signature up once and reuse the result for both stores.
					int sigIndex = wordIndexer.indexOf(sig);
					word.sigIndex = sigIndex;
					tagWordIndexer[tag].add(sigIndex);
				} else {
					word.sigIndex = -1;
				}
				// The position advances for every leaf, whichever branch ran.
				ind++;
			}
		}
	}
	
	


}