edu.stanford.nlp.parser.lexparser.BaseLexicon Maven / Gradle / Ivy
Show all versions of stanford-parser Show documentation
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This is the default concrete instantiation of the Lexicon interface. It was
* originally built for Penn Treebank English.
*
* @author Dan Klein
* @author Galen Andrew
* @author Christopher Manning
*/
public class BaseLexicon implements Lexicon {
protected UnknownWordModel uwModel;
protected final String uwModelTrainerClass;
protected transient UnknownWordModelTrainer uwModelTrainer;
protected static final boolean DEBUG_LEXICON = false;
protected static final boolean DEBUG_LEXICON_SCORE = false;
protected static final int nullWord = -1;
protected static final short nullTag = -1;
protected static final IntTaggedWord NULL_ITW = new IntTaggedWord(nullWord, nullTag);
protected final TrainOptions trainOptions;
protected final TestOptions testOptions;
protected final Options op;
/**
* If a word has been seen more than this many times, then relative
* frequencies of tags are used for POS assignment; if not, they are smoothed
* with tag priors.
*/
protected int smoothInUnknownsThreshold;
/**
* Have tags changeable based on statistics on word types having various
* taggings.
*/
protected boolean smartMutation;
protected final Index wordIndex;
protected final Index tagIndex;
/** An array of Lists of rules (IntTaggedWord), indexed by word. */
public transient List[] rulesWithWord;
// protected transient Set rules = new
// HashSet();
// When it existed, rules somehow held a few less things than rulesWithWord
// I never figured out why [cdm, Dec 2004]
/** Set of all tags as IntTaggedWord. Alive in both train and runtime
* phases, but transient.
*/
protected transient Set tags = Generics.newHashSet();
protected transient Set words = Generics.newHashSet();
// protected transient Set sigs=Generics.newHashSet();
/** Records the number of times word/tag pair was seen in training data.
* Includes word/tag pairs where one is a wildcard not a real word/tag.
*/
public ClassicCounter seenCounter = new ClassicCounter();
double[] smooth = { 1.0, 1.0 };
// these next two are used for smartMutation calculation
transient double[][] m_TT; // = null;
transient double[] m_T; // = null;
protected boolean flexiTag;
protected boolean useSignatureForKnownSmoothing;
/**
* Only used when training, specifically when training on sentences
* that weren't part of annotated (e.g., markovized, etc.) data.
*/
private Map> baseTagCounts = Generics.newHashMap();
public BaseLexicon(Index wordIndex, Index tagIndex) {
this(new Options(), wordIndex, tagIndex);
}
public BaseLexicon(Options op, Index wordIndex, Index tagIndex) {
this.wordIndex = wordIndex;
this.tagIndex = tagIndex;
flexiTag = op.lexOptions.flexiTag;
useSignatureForKnownSmoothing = op.lexOptions.useSignatureForKnownSmoothing;
this.smoothInUnknownsThreshold = op.lexOptions.smoothInUnknownsThreshold;
this.smartMutation = op.lexOptions.smartMutation;
this.trainOptions = op.trainOptions;
this.testOptions = op.testOptions;
this.op = op;
// Construct UnknownWordModel by reflection -- a right pain
// Lexicons and UnknownWordModels aren't very well encapsulated
// from each other!
if (op.lexOptions.uwModelTrainer == null) {
this.uwModelTrainerClass = "edu.stanford.nlp.parser.lexparser.BaseUnknownWordModelTrainer";
} else {
this.uwModelTrainerClass = op.lexOptions.uwModelTrainer;
}
}
/**
* Checks whether a word is in the lexicon. This version will compile the
* lexicon into the rulesWithWord array, if that hasn't already happened
*
* @param word The word as an int index to an Index
* @return Whether the word is in the lexicon
*/
@Override
public boolean isKnown(int word) {
return (word < rulesWithWord.length && word >= 0 && !rulesWithWord[word].isEmpty());
}
/**
* Checks whether a word is in the lexicon. This version works even while
* compiling lexicon with current counters (rather than using the compiled
* rulesWithWord array).
*
* TODO: The previous version would insert rules into the
* wordNumberer. Is that the desired behavior? Why not test in
* some way that doesn't affect the index? For example, start by
* testing wordIndex.contains(word).
*
* @param word The word as a String
* @return Whether the word is in the lexicon
*/
@Override
public boolean isKnown(String word) {
if (!wordIndex.contains(word))
return false;
IntTaggedWord iW = new IntTaggedWord(wordIndex.indexOf(word), nullTag);
return seenCounter.getCount(iW) > 0.0;
}
/** {@inheritDoc} */
@Override
public Set tagSet(Function basicCategoryFunction) {
Set tagSet = new HashSet();
for (String tag : tagIndex.objectsList()) {
tagSet.add(basicCategoryFunction.apply(tag));
}
return tagSet;
}
/**
* Returns the possible POS taggings for a word.
*
* @param word The word, represented as an integer in wordIndex
* @param loc The position of the word in the sentence (counting from 0).
* Implementation note: The BaseLexicon class doesn't actually
* make use of this position information.
* @return An Iterator over a List ofIntTaggedWords, which pair the word with
* possible taggings as integer pairs. (Each can be thought of as a
* tag -> word rule.)
*/
public Iterator ruleIteratorByWord(String word, int loc) {
return ruleIteratorByWord(wordIndex.addToIndex(word), loc, null);
}
/** Generate the possible taggings for a word at a sentence position.
* This may either be based on a strict lexicon or an expanded generous
* set of possible taggings.
* Implementation note: Expanded sets of possible taggings are
* calculated dynamically at runtime, so as to reduce the memory used by
* the lexicon (a space/time tradeoff).
*
* @param word The word (as an int)
* @param loc Its index in the sentence (usually only relevant for unknown words)
* @return A list of possible taggings
*/
@Override
public Iterator ruleIteratorByWord(int word, int loc, String featureSpec) {
// if (rulesWithWord == null) { // tested in isKnown already
// initRulesWithWord();
// }
List wordTaggings;
if (isKnown(word)) {
if ( ! flexiTag) {
// Strict lexical tagging for seen items
wordTaggings = rulesWithWord[word];
} else {
/* Allow all tags with same basicCategory */
/* Allow all scored taggings, unless very common */
IntTaggedWord iW = new IntTaggedWord(word, nullTag);
if (seenCounter.getCount(iW) > smoothInUnknownsThreshold) {
return rulesWithWord[word].iterator();
} else {
// give it flexible tagging not just lexicon
wordTaggings = new ArrayList(40);
for (IntTaggedWord iTW2 : tags) {
IntTaggedWord iTW = new IntTaggedWord(word, iTW2.tag);
if (score(iTW, loc, wordIndex.get(word), null) > Float.NEGATIVE_INFINITY) {
wordTaggings.add(iTW);
}
}
}
}
} else {
// we copy list so we can insert correct word in each item
wordTaggings = new ArrayList(40);
for (IntTaggedWord iTW : rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)]) {
wordTaggings.add(new IntTaggedWord(word, iTW.tag));
}
}
if (DEBUG_LEXICON) {
EncodingPrintWriter.err.println("Lexicon: " + wordIndex.get(word) + " (" +
(isKnown(word) ? "known": "unknown") + ", loc=" + loc + ", n=" +
(isKnown(word) ? word: wordIndex.indexOf(UNKNOWN_WORD)) + ") " +
(flexiTag ? "flexi": "lexicon") + " taggings: " + wordTaggings, "UTF-8");
}
return wordTaggings.iterator();
}
@Override
public Iterator ruleIteratorByWord(String word, int loc, String featureSpec) {
return ruleIteratorByWord(wordIndex.addToIndex(word), loc, featureSpec);
}
protected void initRulesWithWord() {
if (testOptions.verbose || DEBUG_LEXICON) {
System.err.print("\nInitializing lexicon scores ... ");
}
// int numWords = words.size()+sigs.size()+1;
int unkWord = wordIndex.addToIndex(UNKNOWN_WORD);
int numWords = wordIndex.size();
rulesWithWord = new List[numWords];
for (int w = 0; w < numWords; w++) {
rulesWithWord[w] = new ArrayList(1); // most have 1 or 2
// items in them
}
// for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) {
tags = Generics.newHashSet();
for (IntTaggedWord iTW : seenCounter.keySet()) {
if (iTW.word() == nullWord && iTW.tag() != nullTag) {
tags.add(iTW);
}
}
// tags for unknown words
if (DEBUG_LEXICON) {
System.err.println("Lexicon initializing tags for UNKNOWN WORD (" +
Lexicon.UNKNOWN_WORD + ", " + unkWord + ')');
}
if (DEBUG_LEXICON) System.err.println("unSeenCounter is: " + uwModel.unSeenCounter());
if (DEBUG_LEXICON) System.err.println("Train.openClassTypesThreshold is " + trainOptions.openClassTypesThreshold);
for (IntTaggedWord iT : tags) {
if (DEBUG_LEXICON) System.err.println("Entry for " + iT + " is " + uwModel.unSeenCounter().getCount(iT));
double types = uwModel.unSeenCounter().getCount(iT);
if (types > trainOptions.openClassTypesThreshold) {
// Number of types before it's treated as open class
IntTaggedWord iTW = new IntTaggedWord(unkWord, iT.tag);
rulesWithWord[iTW.word].add(iTW);
}
}
if (testOptions.verbose || DEBUG_LEXICON) {
System.err.print("The " + rulesWithWord[unkWord].size() + " open class tags are: [");
for (IntTaggedWord item : rulesWithWord[unkWord]) {
System.err.print(" " + tagIndex.get(item.tag()));
if (DEBUG_LEXICON) {
IntTaggedWord iTprint = new IntTaggedWord(nullWord, item.tag);
System.err.print(" (tag " + item.tag() + ", type count is " +
uwModel.unSeenCounter().getCount(iTprint) + ')');
}
}
System.err.println(" ] ");
}
for (IntTaggedWord iTW : seenCounter.keySet()) {
if (iTW.tag() != nullTag && iTW.word() != nullWord) {
rulesWithWord[iTW.word].add(iTW);
}
}
}
protected List treeToEvents(Tree tree) {
List taggedWords = tree.taggedYield();
return listToEvents(taggedWords);
}
protected List listToEvents(List taggedWords) {
List itwList = new ArrayList();
for (TaggedWord tw : taggedWords) {
IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
itwList.add(iTW);
}
return itwList;
}
/** Not yet implemented. */
public void addAll(List tagWords) {
addAll(tagWords, 1.0);
}
/** Not yet implemented. */
public void addAll(List taggedWords, double weight) {
List tagWords = listToEvents(taggedWords);
}
/** Not yet implemented. */
public void trainWithExpansion(Collection taggedWords) {
}
@Override
public void initializeTraining(double numTrees) {
this.uwModelTrainer =
ReflectionLoading.loadByReflection(uwModelTrainerClass);
uwModelTrainer.initializeTraining(op, this, wordIndex, tagIndex,
numTrees);
}
/**
* Trains this lexicon on the Collection of trees.
*/
@Override
public void train(Collection trees) {
train(trees, 1.0);
}
/**
* Trains this lexicon on the Collection of trees.
* Also trains the unknown word model pointed to by this lexicon.
*/
@Override
public void train(Collection trees, double weight) {
// scan data
for (Tree tree : trees) {
train(tree, weight);
}
}
@Override
public void train(Tree tree, double weight) {
train(tree.taggedYield(), weight);
}
@Override
public final void train(List sentence, double weight) {
uwModelTrainer.incrementTreesRead(weight);
int loc = 0;
for (TaggedWord tw : sentence) {
train(tw, loc, weight);
++loc;
}
}
@Override
public final void incrementTreesRead(double weight) {
uwModelTrainer.incrementTreesRead(weight);
}
@Override
public final void trainUnannotated(List sentence,
double weight) {
uwModelTrainer.incrementTreesRead(weight);
int loc = 0;
for (TaggedWord tw : sentence) {
String baseTag = op.langpack().basicCategory(tw.tag());
Counter counts = baseTagCounts.get(baseTag);
if (counts == null) {
++loc;
continue;
}
double totalCount = counts.totalCount();
if (totalCount == 0) {
++loc;
continue;
}
for (String tag : counts.keySet()) {
TaggedWord newTW = new TaggedWord(tw.word(), tag);
train(newTW, loc, weight * counts.getCount(tag) / totalCount);
}
++loc;
}
}
@Override
public void train(TaggedWord tw, int loc, double weight) {
uwModelTrainer.train(tw, loc, weight);
IntTaggedWord iTW =
new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
seenCounter.incrementCount(iTW, weight);
IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
seenCounter.incrementCount(iT, weight);
IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
seenCounter.incrementCount(iW, weight);
IntTaggedWord i = new IntTaggedWord(nullWord, nullTag);
seenCounter.incrementCount(i, weight);
// rules.add(iTW);
tags.add(iT);
words.add(iW);
String tag = tw.tag();
String baseTag = op.langpack().basicCategory(tag);
Counter counts = baseTagCounts.get(baseTag);
if (counts == null) {
counts = new ClassicCounter();
baseTagCounts.put(baseTag, counts);
}
counts.incrementCount(tag, weight);
}
@Override
public void finishTraining() {
uwModel = uwModelTrainer.finishTraining();
tune();
// index the possible tags for each word
initRulesWithWord();
if (DEBUG_LEXICON) {
printLexStats();
}
}
/**
* Adds the tagging with count to the data structures in this Lexicon.
*/
protected void addTagging(boolean seen, IntTaggedWord itw, double count) {
if (seen) {
seenCounter.incrementCount(itw, count);
if (itw.tag() == nullTag) {
words.add(itw);
} else if (itw.word() == nullWord) {
tags.add(itw);
} else {
// rules.add(itw);
}
} else {
uwModel.addTagging(seen, itw, count);
// if (itw.tag() == nullTag) {
// sigs.add(itw);
// }
}
}
/**
* This records how likely it is for a word with one tag to also have another
* tag. This won't work after serialization/deserialization, but that is how
* it is currently called....
*/
void buildPT_T() {
int numTags = tagIndex.size();
m_TT = new double[numTags][numTags];
m_T = new double[numTags];
double[] tmp = new double[numTags];
for (IntTaggedWord word : words) {
double tot = 0.0;
for (int t = 0; t < numTags; t++) {
IntTaggedWord iTW = new IntTaggedWord(word.word, t);
tmp[t] = seenCounter.getCount(iTW);
tot += tmp[t];
}
if (tot < 10) {
continue;
}
for (int t = 0; t < numTags; t++) {
for (int t2 = 0; t2 < numTags; t2++) {
if (tmp[t2] > 0.0) {
double c = tmp[t] / tot;
m_T[t] += c;
m_TT[t2][t] += c;
}
}
}
}
}
/**
* Get the score of this word with this tag (as an IntTaggedWord) at this
* location. (Presumably an estimate of P(word | tag).)
*
* Implementation documentation:
* Seen:
* c_W = count(W) c_TW = count(T,W)
* c_T = count(T) c_Tunseen = count(T) among new words in 2nd half
* total = count(seen words) totalUnseen = count("unseen" words)
* p_T_U = Pmle(T|"unseen")
* pb_T_W = P(T|W). If (c_W > smoothInUnknownsThreshold) = c_TW/c_W
* Else (if not smart mutation) pb_T_W = bayes prior smooth[1] with p_T_U
* p_T= Pmle(T) p_W = Pmle(W)
* pb_W_T = log(pb_T_W * p_W / p_T) [Bayes rule]
* Note that this doesn't really properly reserve mass to unknowns.
*
* Unseen:
* c_TS = count(T,Sig|Unseen) c_S = count(Sig) c_T = count(T|Unseen)
* c_U = totalUnseen above
* p_T_U = Pmle(T|Unseen)
* pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]]
* pb_W_T = log(P(W|T)) inverted
*
* @param iTW An IntTaggedWord pairing a word and POS tag
* @param loc The position in the sentence. In the default implementation
* this is used only for unknown words to change their probability
* distribution when sentence initial
* @return A float score, usually, log P(word|tag)
*/
@Override
public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) {
// both actual
double c_TW = seenCounter.getCount(iTW);
// double x_TW = xferCounter.getCount(iTW);
IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
// word counts
double c_W = seenCounter.getCount(temp);
// double x_W = xferCounter.getCount(temp);
// totals
double total = seenCounter.getCount(NULL_ITW);
double totalUnseen = uwModel.unSeenCounter().getCount(NULL_ITW);
temp = new IntTaggedWord(nullWord, iTW.tag);
// tag counts
double c_T = seenCounter.getCount(temp);
double c_Tunseen = uwModel.unSeenCounter().getCount(temp);
double pb_W_T; // always set below
if (DEBUG_LEXICON) {
// dump info about last word
if (iTW.word != debugLastWord) {
if (debugLastWord >= 0 && debugPrefix != null) {
// the 2nd conjunct in test above handles older serialized files
EncodingPrintWriter.err.println(debugPrefix + debugProbs + debugNoProbs, "UTF-8");
}
}
}
boolean seen = (c_W > 0.0);
if (seen) {
// known word model for P(T|W)
if (DEBUG_LEXICON_SCORE) {
System.err.println("Lexicon.score " + wordIndex.get(iTW.word) + "/" + tagIndex.get(iTW.tag) + " as known word.");
}
// c_TW = Math.sqrt(c_TW); [cdm: funny math scaling? dunno who played with this]
// c_TW += 0.5;
double p_T_U;
if (useSignatureForKnownSmoothing) { // only works for English currently
p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
if (DEBUG_LEXICON_SCORE) System.err.println("With useSignatureForKnownSmoothing, P(T|U) is " + p_T_U + " rather than " + (c_Tunseen / totalUnseen));
} else {
p_T_U = c_Tunseen / totalUnseen;
}
double pb_T_W; // always set below
if (DEBUG_LEXICON_SCORE) {
System.err.println("c_W is " + c_W + " mle = " + (c_TW/c_W)+ " smoothInUnknownsThresh is " +
smoothInUnknownsThreshold + " base p_T_U is " + c_Tunseen + "/" + totalUnseen + " = " + p_T_U);
}
if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0) {
// we've seen the word enough times to have confidence in its tagging
pb_T_W = c_TW / c_W;
} else {
// we haven't seen the word enough times to have confidence in its
// tagging
if (smartMutation) {
int numTags = tagIndex.size();
if (m_TT == null || numTags != m_T.length) {
buildPT_T();
}
p_T_U *= 0.1;
// System.out.println("Checking "+iTW);
for (int t = 0; t < numTags; t++) {
IntTaggedWord iTW2 = new IntTaggedWord(iTW.word, t);
double p_T_W2 = seenCounter.getCount(iTW2) / c_W;
if (p_T_W2 > 0) {
// System.out.println(" Observation of "+tagIndex.get(t)+"
// ("+seenCounter.getCount(iTW2)+") mutated to
// "+tagIndex.get(iTW.tag)+" at rate
// "+(m_TT[tag][t]/m_T[t]));
p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
}
}
}
if (DEBUG_LEXICON_SCORE) {
System.err.println("c_TW = " + c_TW + " c_W = " + c_W +
" p_T_U = " + p_T_U);
}
// double pb_T_W = (c_TW+smooth[1]*x_TW)/(c_W+smooth[1]*x_W);
pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
}
double p_T = (c_T / total);
double p_W = (c_W / total);
pb_W_T = Math.log(pb_T_W * p_W / p_T);
if (DEBUG_LEXICON) {
if (iTW.word != debugLastWord) {
debugLastWord = iTW.word;
debugLoc = loc;
debugProbs = new StringBuilder();
debugNoProbs = new StringBuilder("impossible: ");
debugPrefix = "Lexicon: " + wordIndex.get(debugLastWord) + " (known): ";
}
if (pb_W_T > Double.NEGATIVE_INFINITY) {
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(3);
debugProbs.append(tagIndex.get(iTW.tag) + ": cTW=" + c_TW + " c_T=" + c_T
+ " pb_T_W=" + nf.format(pb_T_W) + " log pb_W_T=" + nf.format(pb_W_T)
+ ", ");
// debugProbs.append("\n" + "smartMutation=" + smartMutation + "
// smoothInUnknownsThreshold=" + smoothInUnknownsThreshold + "
// smooth0=" + smooth[0] + "smooth1=" + smooth[1] + " p_T_U=" + p_T_U
// + " c_W=" + c_W);
} else {
debugNoProbs.append(tagIndex.get(iTW.tag)).append(' ');
}
} // end if (DEBUG_LEXICON)
} else { // when unseen
if (loc >= 0) {
pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0], word);
} else {
// For negative we now do a weighted average for the dependency grammar :-)
double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0], word);
double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0], word);
pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T))/3);
}
}
String tag = tagIndex.get(iTW.tag());
// Categorical cutoff if score is too low
if (pb_W_T > -100.0) {
return (float) pb_W_T;
}
return Float.NEGATIVE_INFINITY;
} // end score()
private transient int debugLastWord = -1;
private transient int debugLoc = -1;
private transient StringBuilder debugProbs;
private transient StringBuilder debugNoProbs;
private transient String debugPrefix;
/**
* TODO: this used to actually score things based on the original trees
*/
public final void tune() {
double bestScore = Double.NEGATIVE_INFINITY;
double[] bestSmooth = { 0.0, 0.0 };
for (smooth[0] = 1; smooth[0] <= 1; smooth[0] *= 2.0) {// 64
for (smooth[1] = 0.2; smooth[1] <= 0.2; smooth[1] *= 2.0) {// 3
// for (smooth[0]=0.5; smooth[0]<=64; smooth[0] *= 2.0) {//64
// for (smooth[1]=0.1; smooth[1]<=12.8; smooth[1] *= 2.0) {//3
double score = 0.0;
// score = scoreAll(trees);
if (testOptions.verbose) {
System.err.println("Tuning lexicon: s0 " + smooth[0] +
" s1 " + smooth[1] + " is " + score);
}
if (score > bestScore) {
System.arraycopy(smooth, 0, bestSmooth, 0, smooth.length);
bestScore = score;
}
}
}
System.arraycopy(bestSmooth, 0, smooth, 0, bestSmooth.length);
if (smartMutation) {
smooth[0] = 8.0;
// smooth[1] = 1.6;
// smooth[0] = 0.5;
smooth[1] = 0.1;
}
if (testOptions.unseenSmooth > 0.0) {
smooth[0] = testOptions.unseenSmooth;
}
if (testOptions.verbose) {
System.err.println("Tuning selected smoothUnseen " + smooth[0] + " smoothSeen " + smooth[1]
+ " at " + bestScore);
}
}
private void readObject(ObjectInputStream ois)
throws IOException, ClassNotFoundException
{
ois.defaultReadObject();
// Reinitialize the transient objects. This must be done here
// rather than lazily so that there is no race condition to
// reinitialize them later.
initRulesWithWord();
}
/**
* Populates data in this Lexicon from the character stream given by the
* Reader r.
* TODO: this doesn't appear to correctly read in the
* UnknownWordModel in the case of a model more complicated than the
* unSeenCounter
*/
@Override
public void readData(BufferedReader in) throws IOException {
final String SEEN = "SEEN";
String line;
int lineNum = 1;
// all lines have one tagging with raw count per line
line = in.readLine();
Pattern p = Pattern.compile("^smooth\\[([0-9])\\] = (.*)$");
while (line != null && line.length() > 0) {
try {
Matcher m = p.matcher(line);
if (m.matches()) {
int i = Integer.parseInt(m.group(1));
smooth[i] = Double.parseDouble(m.group(2));
} else {
// split on spaces, quote with doublequote, and escape with backslash
String[] fields = StringUtils.splitOnCharWithQuoting(line, ' ', '\"', '\\');
// System.out.println("fields:\n" + fields[0] + "\n" + fields[1] +
// "\n" + fields[2] + "\n" + fields[3] + "\n" + fields[4]);
boolean seen = fields[3].equals(SEEN);
addTagging(seen, new IntTaggedWord(fields[2], fields[0], wordIndex, tagIndex), Double.parseDouble(fields[4]));
}
} catch (RuntimeException e) {
throw new IOException("Error on line " + lineNum + ": " + line, e);
}
lineNum++;
line = in.readLine();
}
initRulesWithWord();
}
/**
* Writes out data from this Object to the Writer w. Rules are separated by
* newline, and rule elements are delimited by \t.
*/
@Override
public void writeData(Writer w) throws IOException {
PrintWriter out = new PrintWriter(w);
for (IntTaggedWord itw : seenCounter.keySet()) {
out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " SEEN " + seenCounter.getCount(itw));
}
for (IntTaggedWord itw : getUnknownWordModel().unSeenCounter().keySet()) {
out.println(itw.toLexicalEntry(wordIndex, tagIndex) + " UNSEEN " + getUnknownWordModel().unSeenCounter().getCount(itw));
}
for (int i = 0; i < smooth.length; i++) {
out.println("smooth[" + i + "] = " + smooth[i]);
}
out.flush();
}
/** Returns the number of rules (tag rewrites as word) in the Lexicon.
* This method assumes that the lexicon has been initialized.
*/
@Override
public int numRules() {
int accumulated = 0;
for (List lis : rulesWithWord) {
accumulated += lis.size();
}
return accumulated;
}
private static final int STATS_BINS = 15;
protected static void examineIntersection(Set s1, Set s2) {
Set knownTypes = Generics.newHashSet(s1);
knownTypes.retainAll(s2);
if (knownTypes.size() != 0) {
System.err.printf("|intersect|: %d%n", knownTypes.size());
for (String word : knownTypes) {
System.err.print(word + " ");
}
System.err.println();
}
}
/** Print some statistics about this lexicon. */
public void printLexStats() {
System.out.println("BaseLexicon statistics");
System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel());
// System.out.println("Rules size: " + rules.size());
System.out.println("Sum of rulesWithWord: " + numRules());
System.out.println("Tags size: " + tags.size());
int wsize = words.size();
System.out.println("Words size: " + wsize);
// System.out.println("Unseen Sigs size: " + sigs.size() +
// " [number of unknown equivalence classes]");
System.out.println("rulesWithWord length: " + rulesWithWord.length
+ " [should be sum of words + unknown sigs]");
int[] lengths = new int[STATS_BINS];
ArrayList[] wArr = new ArrayList[STATS_BINS];
for (int j = 0; j < STATS_BINS; j++) {
wArr[j] = new ArrayList();
}
for (int i = 0; i < rulesWithWord.length; i++) {
int num = rulesWithWord[i].size();
if (num > STATS_BINS - 1) {
num = STATS_BINS - 1;
}
lengths[num]++;
if (wsize <= 20 || num >= STATS_BINS / 2) {
wArr[num].add(wordIndex.get(i));
}
}
System.out.println("Stats on how many taggings for how many words");
for (int j = 0; j < STATS_BINS; j++) {
System.out.print(j + " taggings: " + lengths[j] + " words ");
if (wsize <= 20 || j >= STATS_BINS / 2) {
System.out.print(wArr[j]);
}
System.out.println();
}
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(0);
System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));
if (wsize < 50 && tags.size() < 10) {
nf.setMaximumFractionDigits(3);
StringWriter sw = new StringWriter();
PrintWriter pw = new PrintWriter(sw);
pw.println("Tagging probabilities log P(word|tag)");
for (int t = 0; t < tags.size(); t++) {
pw.print('\t');
pw.print(tagIndex.get(t));
}
pw.println();
for (int w = 0; w < wsize; w++) {
pw.print(wordIndex.get(w));
pw.print('\t');
for (int t = 0; t < tags.size(); t++) {
IntTaggedWord iTW = new IntTaggedWord(w, t);
pw.print(nf.format(score(iTW, 1, wordIndex.get(w), null)));
if (t == tags.size() -1) {
pw.println();
} else
pw.print('\t');
}
}
pw.close();
System.out.println(sw.toString());
}
}
/**
* Evaluates how many words (= terminals) in a collection of trees are
* covered by the lexicon. First arg is the collection of trees; second
* through fourth args get the results. Currently unused; this probably
* only works if train and test at same time so tags and words variables
* are initialized.
*/
public double evaluateCoverage(Collection trees, Set missingWords,
Set missingTags, Set missingTW) {
List iTW1 = new ArrayList();
for (Tree t : trees) {
iTW1.addAll(treeToEvents(t));
}
int total = 0;
int unseen = 0;
for (IntTaggedWord itw : iTW1) {
total++;
if (!words.contains(new IntTaggedWord(itw.word(), nullTag))) {
missingWords.add(wordIndex.get(itw.word()));
}
if (!tags.contains(new IntTaggedWord(nullWord, itw.tag()))) {
missingTags.add(tagIndex.get(itw.tag()));
}
// if (!rules.contains(itw)) {
if (seenCounter.getCount(itw) == 0.0) {
unseen++;
missingTW.add(itw);
}
}
return (double) unseen / total;
}
int[] tagsToBaseTags = null;
public int getBaseTag(int tag, TreebankLanguagePack tlp) {
if (tagsToBaseTags == null) {
populateTagsToBaseTags(tlp);
}
return tagsToBaseTags[tag];
}
private void populateTagsToBaseTags(TreebankLanguagePack tlp) {
int total = tagIndex.size();
tagsToBaseTags = new int[total];
for (int i = 0; i < total; i++) {
String tag = tagIndex.get(i);
String baseTag = tlp.basicCategory(tag);
int j = tagIndex.addToIndex(baseTag);
tagsToBaseTags[i] = j;
}
}
/** Provides some testing and opportunities for exploration of the
* probabilities of a BaseLexicon. What's here currently probably
* only works for the English Penn Treeebank, as it uses default
* constructors. Of the words given to test on,
* the first is treated as sentence initial, and the rest as not
* sentence initial.
*
* @param args The command line arguments:
* java BaseLexicon treebankPath fileRange unknownWordModel words*
*/
public static void main(String[] args) {
if (args.length < 3) {
System.err.println("java BaseLexicon treebankPath fileRange unknownWordModel words*");
return;
}
System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
Treebank tb = new DiskTreebank();
tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
// TODO: change this interface so the lexicon creates its own indices?
Index wordIndex = new HashIndex();
Index tagIndex = new HashIndex();
Options op = new Options();
op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
lex.initializeTraining(tb.size());
lex.train(tb);
lex.finishTraining();
System.out.println("done.");
System.out.println();
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(4);
List impos = new ArrayList();
for (int i = 3; i < args.length; i++) {
if (lex.isKnown(args[i])) {
System.out.println(args[i] + " is a known word. Log probabilities [log P(w|t)] for its taggings are:");
for (Iterator it = lex.ruleIteratorByWord(wordIndex.addToIndex(args[i]), i - 3, null); it.hasNext(); ) {
IntTaggedWord iTW = it.next();
System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word), null)));
}
} else {
String sig = lex.getUnknownWordModel().getSignature(args[i], i-3);
System.out.println(args[i] + " is an unknown word. Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init": "non-init") + " is: " + sig);
impos.clear();
List lis = new ArrayList(tagIndex.objectsList());
Collections.sort(lis);
for (String tStr : lis) {
IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
double score = lex.score(iTW, 1, args[i], null);
if (score == Float.NEGATIVE_INFINITY) {
impos.add(tStr);
} else {
System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
}
}
if (impos.size() > 0) {
System.out.println(args[i] + " impossible tags: " + impos);
}
}
System.out.println();
}
}
@Override
public UnknownWordModel getUnknownWordModel() {
return uwModel;
}
@Override
public final void setUnknownWordModel(UnknownWordModel uwm) {
this.uwModel = uwm;
}
// TODO(spenceg): Debug method for getting a treebank with CoreLabels. This is for training
// the FactoredLexicon.
@Override
public void train(Collection trees, Collection rawTrees) {
train(trees);
}
private static final long serialVersionUID = 40L;
}