All downloads are free. The search and download functionality uses the official Maven repository.

edu.berkeley.nlp.PCFGLA.HierarchicalFullyConnectedLexicon Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.PCFGLA;

import java.util.Arrays;
import java.util.List;

import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;

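/**
 * Hierarchical lexicon in which words whose training count does not exceed
 * {@code knownWordCount} are additionally indexed by a word signature.
 *
 * @author petrov
 */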
public class HierarchicalFullyConnectedLexicon extends HierarchicalLexicon {

	/** Serialization version identifier. */
	private static final long serialVersionUID = 1L;
	/**
	 * Training-count cutoff. {@code labelTrees} assigns a signature index to every word
	 * whose entry in {@code wordCounter} is at most this value; words seen more often
	 * get {@code sigIndex = -1}.
	 */
	protected int knownWordCount;


	/**
	 * Creates a lexicon for the given substate counts. The superclass constructor is
	 * invoked with {@code 0} as its second argument.
	 *
	 * @param numSubStates substate counts, forwarded to the superclass constructor
	 *                     (presumably one entry per state -- confirm against the superclass)
	 * @param knownWordCount count cutoff stored in the {@code knownWordCount} field
	 */
	public HierarchicalFullyConnectedLexicon(short[] numSubStates, int knownWordCount) {
		super(numSubStates, 0);
		this.knownWordCount = knownWordCount;
	}
	
	/**
	 * Creates a lexicon via the two-argument constructor and then indexes the given
	 * training trees by calling {@code init(trainTrees)}.
	 * <p>
	 * NOTE(review): {@code smoothingCutoff}, {@code smoothParam} and {@code smoother} are
	 * not referenced anywhere in this constructor's body.
	 * <p>
	 * NOTE(review): {@code init} is a public, non-final method invoked from a constructor,
	 * so an overriding subclass would see it run before its own fields are initialised.
	 *
	 * @param numSubStates substate counts, forwarded to the two-argument constructor
	 * @param smoothingCutoff unused here
	 * @param smoothParam unused here
	 * @param smoother unused here
	 * @param trainTrees training trees handed to {@code init}
	 * @param knownWordCount count cutoff, forwarded to the two-argument constructor
	 */
	public HierarchicalFullyConnectedLexicon(short[] numSubStates, int smoothingCutoff, double[] smoothParam, 
			Smoother smoother, StateSetTreeList trainTrees, int knownWordCount) {
  	this(numSubStates, knownWordCount);
  	// Build the word/tag indexes and count tables from the training data.
  	init(trainTrees);
  }

	
  /**
	 * Creates a lexicon from an existing one by delegating to the superclass
	 * constructor that accepts a {@code SimpleLexicon}.
	 *
	 * @param previousLexicon lexicon passed on to the superclass constructor
	 * @param knownWordCount count cutoff stored in the {@code knownWordCount} field
	 */
	public HierarchicalFullyConnectedLexicon(SimpleLexicon previousLexicon, int knownWordCount) {
		super(previousLexicon);
		this.knownWordCount = knownWordCount;
	}
	
	/**
	 * Builds a new lexicon with the two-argument constructor, passing this instance's
	 * {@code numSubStates} and {@code knownWordCount}.
	 *
	 * @return the newly constructed lexicon
	 */
	public HierarchicalFullyConnectedLexicon newInstance() {
		return new HierarchicalFullyConnectedLexicon(this.numSubStates,this.knownWordCount);
	}

  /**
   * Indexes the words of the training trees and allocates the per-tag tables.
   * <p>
   * NOTE(review): this span is damaged. Three lines below are syntactically incomplete
   * (each is marked with its own note); text between a {@code <} and a later {@code >}
   * appears to have been lost, together with the generic type arguments of
   * {@code Tree} and {@code List}. As a result the body of {@code init} runs straight
   * into the tail of what looks like a separate scoring method whose header is
   * missing. Restore the span from the upstream source before changing any logic.
   *
   * @param trainTrees training trees whose yields are indexed
   */
  public void init(StateSetTreeList trainTrees){
  	// Pass 1: register every surface word string in wordIndexer.
  	for (Tree tree : trainTrees){
  		List words = tree.getYield();
  		for (StateSet word : words){
				String sig = word.getWord();
				wordIndexer.add(sig);
  		}
  	}
  	// One counter per string indexed so far; signatures are added only after this
  	// allocation, so a newly added signature's index may lie beyond wordCounter.length.
  	wordCounter = new int[wordIndexer.size()];
  	// Pass 2: count word occurrences and register each word's signature
  	// (getSignature receives the word and its position in the sentence).
  	for (Tree tree : trainTrees){
  		List words = tree.getYield();
  		int ind = 0;  		
  		for (StateSet word : words){
  			String wordString = word.getWord();
  			wordCounter[wordIndexer.indexOf(wordString)]++;
  			
				String sig = getSignature(word.getWord(), ind++);
				wordIndexer.add(sig);
  		}  		
  	}
  	
  	tagWordIndexer = new IntegerIndexer[numStates];
  	// NOTE(review): the next line is truncated -- the header of a loop over tags is
  	// fused with the header of a loop over trees; the code in between is missing.
  	for (int tag=0; tag tree : trainTrees){
  		List words = tree.getYield();
  		List tags = tree.getPreTerminalYield();
  		int ind = 0;
  		for (StateSet word : words){
  			// State of the preterminal above the word at this position.
  			int tag = tags.get(ind).getState();
				// Record both the word index and the signature index under this tag.
				tagWordIndexer[tag].add(new Integer(word.wordIndex));
				tagWordIndexer[tag].add(new Integer(word.sigIndex));
				lexTag[tag] = true;
				ind++;
  		}  		
  	}


  	expectedCounts = new double[numStates][][];
  	scores = new double[numStates][][];
  	// NOTE(review): the next line is truncated -- a loop header over tags is fused with
  	// a condition on globalWordIndex; the identifiers res, globalWordIndex and
  	// globalSigIndex used below are declared in the missing text.
  	for (int tag=0; tag=0 && (wordCounter[globalWordIndex]>knownWordCount)) {
//			if (globalSigIndex!=-1) System.out.println("Problem: frequent word has signature!");
			return res;
		}
		if (globalSigIndex!=-1) {
			int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalSigIndex);
			if (tagSpecificWordIndex!=-1){
				// NOTE(review): the next line is truncated -- a loop header over i is fused
				// with a comparison against wordCounter.length.
				for (int i=0; i wordCounter.length){
//					System.out.println("no count for this word: "+(String)wordIndexer.get(tagWordIndexer[tag].get(stateSet.wordIndex)));
//					stateSet.sigIndex = -1;
//				} else {
				// A word counted more than knownWordCount times, or any word when
				// noSmoothing is set, gets no signature. Otherwise the signature index is
				// stored in sigIndex when knownWordCount > 0, and replaces wordIndex when not.
				if ((stateSet.wordIndex>=0 && (wordCounter[stateSet.wordIndex]>knownWordCount)) || noSmoothing)
					stateSet.sigIndex = -1;
				else if (knownWordCount > 0)
					stateSet.sigIndex = wordIndexer.indexOf(getSignature(word,stateSet.from));
				else 
					stateSet.wordIndex = wordIndexer.indexOf(getSignature(word,stateSet.from));
			}
//			}
		}
		// Delegate to the index-based score overload with the resolved indices.
		return score(stateSet.wordIndex, stateSet.sigIndex, tag, stateSet.from, noSmoothing, isSignature);
	}
	
	
	/**
	 * Annotates every word of the given trees with its index in {@code wordIndexer}
	 * and, for rare words, with the index of its signature.
	 * <p>
	 * For each word in a tree's yield:
	 * <ul>
	 * <li>if its index is negative or not below {@code wordCounter.length}, a diagnostic
	 *     and the tree are printed to standard output and {@code sigIndex} is left
	 *     untouched;</li>
	 * <li>if its count is at most {@code knownWordCount}, its signature is added to
	 *     {@code wordIndexer}, stored in {@code sigIndex}, and registered in the
	 *     {@code tagWordIndexer} entry of the word's preterminal state;</li>
	 * <li>otherwise {@code sigIndex} is set to {@code -1}.</li>
	 * </ul>
	 *
	 * @param trainTrees trees whose leaf {@code StateSet}s are updated in place
	 */
	public void labelTrees(StateSetTreeList trainTrees){
		// Type arguments are spelled out: with raw Tree/List the element type degrades to
		// Object, and neither the StateSet for-each nor tags.get(ind).getState() compiles.
		for (Tree<StateSet> tree : trainTrees){
			List<StateSet> words = tree.getYield();
			List<StateSet> tags = tree.getPreTerminalYield();
			int ind = 0;
			for (StateSet word : words){
				word.wordIndex = wordIndexer.indexOf(word.getWord());
				if (word.wordIndex<0 || word.wordIndex>=wordCounter.length){
					System.out.println("Have never seen this word before: "+word.getWord()+" "+word.wordIndex);
					System.out.println(tree);
				} else if (wordCounter[word.wordIndex]<=knownWordCount){
					short tag = tags.get(ind).getState();
					String sig = getSignature(word.getWord(), ind);
					wordIndexer.add(sig);
					// Look the signature up once and reuse the index for both targets.
					int sigIndex = wordIndexer.indexOf(sig);
					word.sigIndex = sigIndex;
					tagWordIndexer[tag].add(sigIndex);
				} else {
					word.sigIndex = -1;
				}
				// Advance the sentence position for every word, whichever branch ran.
				ind++;
			}
		}
	}
	
	


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy