/*
 * edu.stanford.nlp.parser.lexparser.UnknownGTTrainer -- Maven / Gradle / Ivy artifact "stanford-parser".
 *
 * (Residual navigation text from the artifact download page: "Go to download",
 * "Show more of this group", "Show more artifacts with this name",
 * "Show all versions of stanford-parser", "Show documentation".)
 *
 * Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
 * and extracts constituency parse trees.
 */
package edu.stanford.nlp.parser.lexparser;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
/**
* This class trains a Good-Turing model for unknown words from a
* collection of trees. It builds up a map of statistics which can be
* used by any UnknownWordModel which wants to use the GT model.
*
* Authors:
*
* @author Roger Levy
* @author Greg Donaker (corrections and modeling improvements)
* @author Christopher Manning (generalized and improved what Greg did)
* @author Anna Rafferty
* @author John Bauer (refactored into a separate training class)
*/
public class UnknownGTTrainer {
ClassicCounter> wtCount = new ClassicCounter>();
ClassicCounter tagCount = new ClassicCounter();
ClassicCounter r1 = new ClassicCounter(); // for each tag, # of words seen once
ClassicCounter r0 = new ClassicCounter(); // for each tag, # of words not seen
Set seenWords = Generics.newHashSet();
double tokens = 0;
Map unknownGT = Generics.newHashMap();
public void train(Collection trees) {
train(trees, 1.0);
}
public void train(Collection trees, double weight) {
for (Tree t : trees) {
train(t, weight);
}
}
public void train(Tree tree, double weight) {
/* get TaggedWord and total tag counts, and get set of all
* words attested in training
*/
for (TaggedWord word : tree.taggedYield()) {
train(word, weight);
}
}
public void train(TaggedWord tw, double weight) {
tokens = tokens + weight;
String word = tw.word();
String tag = tw.tag();
// TaggedWord has crummy equality conditions
Pair wt = new Pair(word, tag);
wtCount.incrementCount(wt, weight);
tagCount.incrementCount(tag, weight);
seenWords.add(word);
}
public void finishTraining() {
// testing: get some stats here
System.err.println("Total tokens: " + tokens);
System.err.println("Total WordTag types: " + wtCount.keySet().size());
System.err.println("Total tag types: " + tagCount.keySet().size());
System.err.println("Total word types: " + seenWords.size());
/* find # of once-seen words for each tag */
for (Pair wt : wtCount.keySet()) {
if (wtCount.getCount(wt) == 1) {
r1.incrementCount(wt.second());
}
}
/* find # of unseen words for each tag */
for (String tag : tagCount.keySet()) {
for (String word : seenWords) {
Pair wt = new Pair(word, tag);
if (!(wtCount.keySet().contains(wt))) {
r0.incrementCount(tag);
}
}
}
/* set unseen word probability for each tag */
for (String tag : tagCount.keySet()) {
float logprob = (float) Math.log(r1.getCount(tag) / (tagCount.getCount(tag) * r0.getCount(tag)));
unknownGT.put(tag, Float.valueOf(logprob));
}
}
}