edu.stanford.nlp.parser.lexparser.FactoredLexicon Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalIntCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
/**
 * A lexicon that factors the word score into surface-form, lemma, and
 * morphological-tag components: {@code score()} combines log p(word|tag),
 * log p(lemma|tag) (currently disabled there), and log p(morph|tag).
 * Morphological analyses are reduced via a language-specific
 * {@link MorphoFeatureSpecification}.
 *
 * @author Spence Green
 *
 */
public class FactoredLexicon extends BaseLexicon {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(FactoredLexicon.class);
private static final long serialVersionUID = -744693222804176489L;
// Enables verbose stderr tracing in score()
private static final boolean DEBUG = false;
// Language-specific mapping from rich morphological strings to reduced feature tags
private MorphoFeatureSpecification morphoSpec;
// Sentinel stored when a token has no morphological analysis
private static final String NO_MORPH_ANALYSIS = "xXxNONExXx";
// Index over reduced morphological tag strings (ids used as keys into morphTag)
private Index morphIndex = new HashIndex<>();
// Joint (word, tag) counts, plus the tag distribution used to smooth rare words
private TwoDimensionalIntCounter wordTag = new TwoDimensionalIntCounter<>(40000);
private Counter wordTagUnseen = new ClassicCounter<>(500);
// Joint (lemma, tag) counts, plus the tag distribution used to smooth rare lemmas
private TwoDimensionalIntCounter lemmaTag = new TwoDimensionalIntCounter<>(40000);
private Counter lemmaTagUnseen = new ClassicCounter<>(500);
// Joint (morph tag, POS tag) counts, plus the tag distribution for rare analyses
private TwoDimensionalIntCounter morphTag = new TwoDimensionalIntCounter<>(500);
private Counter morphTagUnseen = new ClassicCounter<>(500);
// Marginal POS tag counts
private Counter tagCounter = new ClassicCounter<>(300);
/**
 * Builds a factored lexicon backed by the given shared indices.
 *
 * @param morphoSpec language-specific morphological feature specification
 * @param wordIndex shared index mapping word strings to ids
 * @param tagIndex shared index mapping tag strings to ids
 */
public FactoredLexicon(MorphoFeatureSpecification morphoSpec, Index wordIndex, Index tagIndex) {
super(wordIndex, tagIndex);
this.morphoSpec = morphoSpec;
}
/**
 * Builds a factored lexicon with explicit parser options.
 *
 * @param op parser options passed through to the base lexicon
 * @param morphoSpec language-specific morphological feature specification
 * @param wordIndex shared index mapping word strings to ids
 * @param tagIndex shared index mapping tag strings to ids
 */
public FactoredLexicon(Options op, MorphoFeatureSpecification morphoSpec, Index wordIndex, Index tagIndex) {
super(op, wordIndex, tagIndex);
this.morphoSpec = morphoSpec;
}
/**
 * Returns the taggings for a word. The rule table is keyed by lemmas, so
 * isKnown() is slightly trickier: known words (and the boundary symbol)
 * take their stored rules directly, while unknown words borrow the rules
 * of the unknown-word signature, re-keyed to this word's id.
 */
@Override
public Iterator ruleIteratorByWord(int word, int loc, String featureSpec) {
  // Boundary symbol tags deterministically; seen types use strict lexical
  // tagging. Both cases read the stored rule list directly.
  if (word == wordIndex.indexOf(BOUNDARY) || isKnown(word)) {
    return rulesWithWord[word].iterator();
  }
  if (DEBUG) log.info("UNKNOWN WORD");
  // Unknown word: copy the unknown-word signature's rules, substituting
  // this word's id for the signature id.
  Set taggings = Generics.newHashSet(10);
  List signatureRules = rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)];
  for (IntTaggedWord sigRule : signatureRules) {
    taggings.add(new IntTaggedWord(word, sigRule.tag));
  }
  return taggings.iterator();
}
/**
 * Scores a (word, tag) pair as a sum of log factors:
 * log p(word|tag) + log p(lemma|tag) + log p(morph|tag).
 * The lemma factor is currently disabled (fixed at 0.0).
 *
 * @param iTW the word/tag pair to score
 * @param loc token position in the sentence (passed to the unknown-word model)
 * @param word surface form
 * @param featureSpec rich morphological string attached to the token
 * @return the log score, or {@code Float.NEGATIVE_INFINITY} for very
 *         low-probability taggings
 */
@Override
public float score(IntTaggedWord iTW, int loc, String word, String featureSpec) {
  final int wId = iTW.word();
  final int tId = iTW.tag();
  // The boundary symbol scores log prob 0 so the 1-best path is forced
  // through it (deterministic tagging).
  if (wId == wordIndex.indexOf(BOUNDARY) && tId == tagIndex.indexOf(BOUNDARY_TAG)) {
    return 0.0f;
  }
  // Split the token into lemma and rich morph tag, then reduce the morph
  // tag through the language-specific feature specification.
  String tag = tagIndex.get(iTW.tag());
  Pair lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, featureSpec);
  String lemma = lemmaMorph.first();
  int lemmaId = wordIndex.indexOf(lemma);
  String richMorphTag = lemmaMorph.second();
  String reducedMorphTag = morphoSpec.strToFeatures(richMorphTag).toString().trim();
  if (reducedMorphTag.length() == 0) {
    reducedMorphTag = NO_MORPH_ANALYSIS;
  }
  int morphId = morphIndex.addToIndex(reducedMorphTag);
  // Combine the factors in log space.
  double logPWordGivenTag = Math.log(probWordTag(word, loc, wId, tId));
  // Lemma factor disabled; probLemmaTag(word, loc, tId, lemmaId) would supply it.
  double logPLemmaGivenTag = 0.0;
  double logPMorphGivenTag = Math.log(probMorphTag(tId, morphId));
  double logScore = logPWordGivenTag + logPLemmaGivenTag + logPMorphGivenTag;
  if (DEBUG) {
    System.err.printf("WSGDEBUG: %s --> %s %s %s || %.10f (%.5f / %.5f / %.5f)%n", tag, word, lemma,
        reducedMorphTag, logScore, logPWordGivenTag, logPLemmaGivenTag, logPMorphGivenTag);
  }
  // Filter low probability taggings
  if (logScore > -100.0) {
    return (float) logScore;
  }
  return Float.NEGATIVE_INFINITY;
}
/**
 * p(word | tag), computed by Bayes' rule from counts for seen words and by
 * the unknown-word model (over surface signatures) for unseen ones.
 *
 * @param word surface form (used only by the unknown-word model)
 * @param loc token position (used only by the unknown-word model)
 * @param wordId id of the word in wordIndex
 * @param tagId id of the tag in tagIndex
 */
private double probWordTag(String word, int loc, int wordId, int tagId) {
  double wordCount = wordTag.totalCount(wordId);
  double wordTagCount = wordTag.getCount(wordId, tagId);
  // Marginal p(word)
  double pWord = wordCount / wordTag.totalCount();
  // Marginal p(tag)
  double pTag = tagCounter.getCount(tagId) / tagCounter.totalCount();
  if (wordCount > 0.0) {
    // Seen word: estimate p(tag|word) and invert with Bayes' rule.
    double pTagGivenWord;
    if (wordCount > 100.0 && wordTagCount > 0.0) {
      // Frequent word: trust the MLE.
      pTagGivenWord = wordTagCount / wordCount;
    } else {
      // Rare word: smooth with the tag distribution over unseen words.
      // NOTE(review): wordTagUnseen may be empty, making pTagUnseen 0 (original TODO).
      double pTagUnseen = wordTagUnseen.getCount(tagId) / wordTagUnseen.totalCount();
      pTagGivenWord = (wordTagCount + smooth[1] * pTagUnseen) / (wordCount + smooth[1]);
    }
    return pTagGivenWord * pWord / pTag;
  }
  // Unseen word: score via the word-signature unknown-word model.
  IntTaggedWord itw = new IntTaggedWord(wordId, tagId);
  double tagCount = tagCounter.getCount(tagId);
  return Math.exp(getUnknownWordModel().score(itw, loc, tagCount, tagCounter.totalCount(), smooth[0], word));
}
/**
 * p(lemma | tag), computed by Bayes' rule from counts for seen lemmas and
 * by a crude unseen-lemma back-off otherwise. Currently not used by
 * score(), which fixes the lemma factor at log prob 0.
 * This method should never return 0!!
 *
 * @param word surface form (unused in the active code path)
 * @param loc token position (unused in the active code path)
 * @param tagId id of the tag in tagIndex
 * @param lemmaId id of the lemma in wordIndex
 */
private double probLemmaTag(String word, int loc, int tagId, int lemmaId) {
  double lemmaCount = lemmaTag.totalCount(lemmaId);
  double lemmaTagCount = lemmaTag.getCount(lemmaId, tagId);
  // Marginal p(lemma)
  double pLemma = lemmaCount / lemmaTag.totalCount();
  // Marginal p(tag)
  double pTag = tagCounter.getCount(tagId) / tagCounter.totalCount();
  if (lemmaCount > 0.0) {
    // Seen lemma: estimate p(tag|lemma) and invert with Bayes' rule.
    double pTagGivenLemma;
    if (lemmaCount > 100.0 && lemmaTagCount > 0.0) {
      // Frequent lemma: trust the MLE.
      pTagGivenLemma = lemmaTagCount / lemmaCount;
    } else {
      // Rare lemma: smooth with the tag distribution over unseen lemmas.
      // NOTE(review): lemmaTagUnseen may be empty, making pTagUnseen 0 (original TODO).
      double pTagUnseen = lemmaTagUnseen.getCount(tagId) / lemmaTagUnseen.totalCount();
      pTagGivenLemma = (lemmaTagCount + smooth[1] * pTagUnseen) / (lemmaCount + smooth[1]);
    }
    return pTagGivenLemma * pLemma / pTag;
  }
  // Unseen lemma: hack back-off based on the unseen-lemma tag counts.
  return lemmaTagUnseen.getCount(tagId) / tagCounter.totalCount();
}
/**
 * p(morph | tag), computed by Bayes' rule from counts for frequent
 * morphological analyses and by an add-one style floor otherwise.
 * This method should never return 0!
 *
 * @param tagId id of the tag in tagIndex
 * @param morphId id of the reduced morphological tag in morphIndex
 */
private double probMorphTag(int tagId, int morphId) {
  double morphCount = morphTag.totalCount(morphId);
  double morphTagCount = morphTag.getCount(morphId, tagId);
  // Marginal p(morph)
  double pMorph = morphCount / morphTag.totalCount();
  // Marginal p(tag)
  double pTag = tagCounter.getCount(tagId) / tagCounter.totalCount();
  if (morphCount > 100.0 && morphTagCount > 0.0) {
    // Frequent analysis: invert p(tag|morph) with Bayes' rule.
    double pTagGivenMorph = morphTagCount / morphCount;
    return pTagGivenMorph * pMorph / pTag;
  }
  // Unseen/rare analysis (extremely rare in practice): add-one style floor.
  return 1.0 / (morphTag.totalCount() + tagIndex.size() + 1.0);
}
/**
* This method should populate wordIndex, tagIndex, and morphIndex.
*/
@Override
public void train(Collection trees, Collection rawTrees) {
double weight = 1.0;
// Train uw model on words
uwModelTrainer.train(trees, weight);
final double numTrees = trees.size();
Iterator rawTreesItr = rawTrees == null ? null : rawTrees.iterator();
Iterator treeItr = trees.iterator();
// Train factored lexicon on lemmas and morph tags
int treeId = 0;
while (treeItr.hasNext()) {
Tree tree = treeItr.next();
// CoreLabels, with morph analysis in the originalText annotation
List