![JAR search and dependency download from the Maven repository](/logo.png)
edu.berkeley.nlp.PCFGLA.HierarchicalFullyConnectedLexicon Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
*
*/
package edu.berkeley.nlp.PCFGLA;
import java.util.Arrays;
import java.util.List;
import edu.berkeley.nlp.PCFGLA.smoothing.Smoother;
import edu.berkeley.nlp.syntax.StateSet;
import edu.berkeley.nlp.syntax.Tree;
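/**
* Lexicon built on {@link HierarchicalLexicon} that separates frequent from rare words with a
* count cutoff ({@code knownWordCount}): a word counted more than the cutoff gets no signature
* index ({@code sigIndex = -1}), any other word is additionally indexed by its signature.
*
* @author petrov
*/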
public class HierarchicalFullyConnectedLexicon extends HierarchicalLexicon {
private static final long serialVersionUID = 1L;
protected int knownWordCount;
/**
* Creates a lexicon for the given sub-state counts and stores the known-word cutoff.
* Only the superclass constructor is run; {@code init} is not called here.
*
* @param numSubStates passed through unchanged to the superclass constructor
* @param knownWordCount count cutoff stored in {@code this.knownWordCount}; elsewhere in this
*        class a word whose count exceeds this value gets {@code sigIndex = -1}
*/
public HierarchicalFullyConnectedLexicon(short[] numSubStates, int knownWordCount) {
// NOTE(review): the literal 0 is the superclass constructor's second argument; its meaning is defined in HierarchicalLexicon - confirm there.
super(numSubStates, 0);
this.knownWordCount = knownWordCount;
}
/**
* Creates a lexicon and immediately initializes it from training trees: delegates to the
* two-argument constructor and then calls {@code init(trainTrees)}.
*
* @param numSubStates forwarded to the two-argument constructor
* @param smoothingCutoff not used by this constructor
* @param smoothParam not used by this constructor
* @param smoother not used by this constructor
* @param trainTrees training trees handed to {@code init}
* @param knownWordCount forwarded to the two-argument constructor
*/
public HierarchicalFullyConnectedLexicon(short[] numSubStates, int smoothingCutoff, double[] smoothParam,
Smoother smoother, StateSetTreeList trainTrees, int knownWordCount) {
this(numSubStates, knownWordCount);
// NOTE(review): init is public and non-final, so a subclass override would run here before that subclass's own constructor body.
init(trainTrees);
}
/**
* Creates a lexicon by passing an existing lexicon to the superclass constructor, then
* stores the known-word cutoff.
*
* @param previousLexicon handed unchanged to the superclass constructor
* @param knownWordCount value stored in {@code this.knownWordCount}
*/
public HierarchicalFullyConnectedLexicon(SimpleLexicon previousLexicon, int knownWordCount) {
super(previousLexicon);
this.knownWordCount = knownWordCount;
}
/**
* Builds a new lexicon configured with this instance's sub-state counts and known-word
* cutoff. The two-argument constructor is used, so {@code init} is not run on the result.
*
* @return the newly constructed lexicon
*/
public HierarchicalFullyConnectedLexicon newInstance() {
    final HierarchicalFullyConnectedLexicon fresh =
            new HierarchicalFullyConnectedLexicon(numSubStates, knownWordCount);
    return fresh;
}
/**
* Builds the word index, the per-word counts and the per-tag word indexers from the
* training trees, then allocates the {@code expectedCounts} and {@code scores} arrays.
*
* NOTE(review): this listing is damaged. Text is missing in three places below (each one is
* marked), so the end of this method and the headers of the two {@code score} methods that
* follow it are not visible. Restore the missing text from the upstream source before
* compiling or modifying this region.
*
* @param trainTrees training trees whose yields supply the words
*/
public void init(StateSetTreeList trainTrees){
// First pass: register every surface word, so the indexer size is available for sizing wordCounter.
for (Tree tree : trainTrees){
List words = tree.getYield();
for (StateSet word : words){
String sig = word.getWord();
wordIndexer.add(sig);
}
}
// Sized from the indexer as it stands after the first pass, i.e. before any signature is added below.
wordCounter = new int[wordIndexer.size()];
// Second pass: count each word occurrence and register the signature computed from the
// word and its position in the tree's yield.
for (Tree tree : trainTrees){
List words = tree.getYield();
int ind = 0;
for (StateSet word : words){
String wordString = word.getWord();
wordCounter[wordIndexer.indexOf(wordString)]++;
String sig = getSignature(word.getWord(), ind++);
wordIndexer.add(sig);
}
}
tagWordIndexer = new IntegerIndexer[numStates];
// NOTE(review): the next line is not valid Java - text between "tag" and "tree" is missing.
// lexTag, written below, has no declaration in the visible text either.
for (int tag=0; tag tree : trainTrees){
List words = tree.getYield();
List tags = tree.getPreTerminalYield();
int ind = 0;
for (StateSet word : words){
int tag = tags.get(ind).getState();
// NOTE(review): wordIndex and sigIndex are read here but never assigned in the visible part
// of init; presumably they are set beforehand in the missing text - confirm.
tagWordIndexer[tag].add(new Integer(word.wordIndex));
tagWordIndexer[tag].add(new Integer(word.sigIndex));
lexTag[tag] = true;
ind++;
}
}
expectedCounts = new double[numStates][][];
scores = new double[numStates][][];
// NOTE(review): the next line is malformed. It begins a loop over tags but ends in a condition
// on globalWordIndex, and the statements after it return res and read globalWordIndex,
// globalSigIndex and tag, none of which are declared in the visible text. The end of init and
// the header of a score method appear to have been lost here.
for (int tag=0; tag=0 && (wordCounter[globalWordIndex]>knownWordCount)) {
// if (globalSigIndex!=-1) System.out.println("Problem: frequent word has signature!");
return res;
}
// When a signature index is present, look it up in this tag's indexer.
if (globalSigIndex!=-1) {
int tagSpecificWordIndex = tagWordIndexer[tag].indexOf(globalSigIndex);
if (tagSpecificWordIndex!=-1){
// NOTE(review): the next line is malformed - a loop over i runs straight into what looks like
// a bounds check on wordCounter.length. The loop body, the remainder of that method and the
// header of the StateSet-based score method appear to be missing.
for (int i=0; i wordCounter.length){
// System.out.println("no count for this word: "+(String)wordIndexer.get(tagWordIndexer[tag].get(stateSet.wordIndex)));
// stateSet.sigIndex = -1;
// } else {
// A word counted more than knownWordCount times (or any word when noSmoothing is set) gets no
// signature. Otherwise, with a positive cutoff the signature's index is stored in sigIndex;
// with a non-positive cutoff the signature's index replaces wordIndex instead.
if ((stateSet.wordIndex>=0 && (wordCounter[stateSet.wordIndex]>knownWordCount)) || noSmoothing)
stateSet.sigIndex = -1;
else if (knownWordCount > 0)
stateSet.sigIndex = wordIndexer.indexOf(getSignature(word,stateSet.from));
else
stateSet.wordIndex = wordIndexer.indexOf(getSignature(word,stateSet.from));
}
// }
}
// Delegates to the score overload that takes the word and signature indices directly.
return score(stateSet.wordIndex, stateSet.sigIndex, tag, stateSet.from, noSmoothing, isSignature);
}
/**
* Assigns {@code wordIndex} and {@code sigIndex} to every leaf of the given trees.
*
* For each word in a tree's yield:
* - {@code wordIndex} is set to the word's index in {@code wordIndexer};
* - if that index is negative or not below {@code wordCounter.length}, the word and its tree
*   are printed to standard output and {@code sigIndex} is left untouched;
* - if the word's count is at most {@code knownWordCount}, its signature (computed from the
*   word and its position in the yield) is added to {@code wordIndexer}, stored in
*   {@code sigIndex}, and registered in the indexer of the word's preterminal tag;
* - otherwise {@code sigIndex} is set to -1.
*
* The element type {@code StateSet} is spelled out on {@code Tree} and {@code List}: with raw
* types the yields would be lists of {@code Object}, and neither the {@code StateSet} loop
* variable nor the {@code getState()} call on a tag would compile.
*
* @param trainTrees trees whose leaves are labelled in place
*/
public void labelTrees(StateSetTreeList trainTrees){
    for (Tree<StateSet> tree : trainTrees){
        List<StateSet> words = tree.getYield();
        List<StateSet> tags = tree.getPreTerminalYield();
        // Position of the current word in the yield; also selects the matching preterminal tag.
        int ind = 0;
        for (StateSet word : words){
            word.wordIndex = wordIndexer.indexOf(word.getWord());
            if (word.wordIndex<0 || word.wordIndex>=wordCounter.length){
                System.out.println("Have never seen this word before: "+word.getWord()+" "+word.wordIndex);
                System.out.println(tree);
            }
            else if (wordCounter[word.wordIndex]<=knownWordCount){
                short tag = tags.get(ind).getState();
                String sig = getSignature(word.getWord(), ind);
                wordIndexer.add(sig);
                word.sigIndex = wordIndexer.indexOf(sig);
                tagWordIndexer[tag].add(wordIndexer.indexOf(sig));
            }
            else {
                // Counted more often than the cutoff: no signature is attached.
                word.sigIndex = -1;
            }
            ind++;
        }
    }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy