// edu.stanford.nlp.parser.lexparser.Lexicon -- from the stanford-parser artifact (Maven / Gradle / Ivy).
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
// and extracts constituency parse trees.
package edu.stanford.nlp.parser.lexparser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.io.Writer;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.trees.Tree;
import java.util.function.Function;
/**
 * An interface for lexicons interfacing to lexparser. Its primary
 * responsibility is to provide a conditional probability
 * P(word|tag), which is fulfilled by the {@link #score} method.
 * Inside the lexparser,
 * Strings are interned and tags and words are usually represented as integers.
 *
 * @author Galen Andrew
 */
public interface Lexicon extends Serializable {

  /** Unknown-word token; if UNK were a word, counts would merge. */
  String UNKNOWN_WORD = "UNK";

  /** Boundary word -- assumed not a real word. */
  String BOUNDARY = ".$.";

  /** Boundary tag -- assumed not a real tag. */
  String BOUNDARY_TAG = ".$$.";

  // Disabled declaration, kept for reference:
  //   Set the model via which unknown words should be scored by this lexicon.
  //void setUnknownWordModel(UnknownWordModel uwModel);

  // Disabled declaration, kept for reference:
  //   Returns the number of times this word/tag pair has been seen;
  //   0 returned if never previously seen.
  // double getCount(IntTaggedWord w);

  /**
   * Checks whether a word is in the lexicon.
   *
   * @param word The word as an int
   * @return Whether the word is in the lexicon
   */
  boolean isKnown(int word);

  /**
   * Checks whether a word is in the lexicon.
   *
   * @param word The word as a String
   * @return Whether the word is in the lexicon
   */
  boolean isKnown(String word);

  /**
   * Returns the Set of tags used by this lexicon (available after training).
   *
   * @param basicCategoryFunction A function applied to tags; presumably it
   *                              maps each tag to its basic category --
   *                              TODO confirm against the implementations
   * @return The Set of tags used by this lexicon
   */
  Set<String> tagSet(Function<String,String> basicCategoryFunction);

  /**
   * Get an iterator over all rules (pairs of (word, POS)) for this word.
   *
   * @param word The word, represented as an integer in Index
   * @param loc  The position of the word in the sentence (counting from 0).
   *             <i>Implementation note: The BaseLexicon class doesn't
   *             actually make use of this position information.</i>
   * @param featureSpec Additional word features like morphosyntactic information.
   * @return An Iterator over a List of IntTaggedWords, which pair the word
   *         with possible taggings as integer pairs.  (Each can be
   *         thought of as a <code>tag -&gt; word</code> rule.)
   */
  Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec);

  /**
   * Same as {@link #ruleIteratorByWord(int, int, String)}, but with a string
   * that needs to be translated by the lexicon's word index.
   *
   * @param word The word as a String
   * @param loc  The position of the word in the sentence (counting from 0)
   * @param featureSpec Additional word features like morphosyntactic information.
   * @return An Iterator over the IntTaggedWords pairing the word with its
   *         possible taggings
   */
  Iterator<IntTaggedWord> ruleIteratorByWord(String word, int loc, String featureSpec);

  /**
   * Returns the number of rules (tag rewrites as word) in the Lexicon.
   * This method assumes that the lexicon has been initialized.
   *
   * @return The number of rules (tag rewrites as word) in the Lexicon.
   */
  int numRules();

  /**
   * Start training this lexicon on the expected number of trees.
   * (Some UnknownWordModels use the number of trees to know when to
   * start counting statistics.)
   *
   * @param numTrees The expected number of trees
   */
  void initializeTraining(double numTrees);

  /**
   * Trains this lexicon on the Collection of trees.
   * Can be called more than once with different collections of trees.
   *
   * @param trees Trees to train on
   */
  void train(Collection<Tree> trees);

  /**
   * Trains this lexicon on the Collection of trees with a weight.
   *
   * @param trees  Trees to train on
   * @param weight Weight for these trees (presumably scales their counts --
   *               TODO confirm against the implementations)
   */
  void train(Collection<Tree> trees, double weight);

  // WSGDEBUG
  // Binarizer converts everything to CategoryWordTag, so we lose additional
  // lexical annotations. RawTrees should be the same size as trees.
  /**
   * Trains this lexicon on trees together with their raw counterparts.
   *
   * @param trees    Trees to train on
   * @param rawTrees The raw trees; should be the same size as {@code trees}
   */
  void train(Collection<Tree> trees, Collection<Tree> rawTrees);

  /**
   * Trains this lexicon on a single tree.
   *
   * @param tree   Tree to train on
   * @param weight Weight for this tree
   */
  void train(Tree tree, double weight);

  /**
   * Trains this lexicon on a sentence of tagged words.
   * Not all subclasses support this particular method.  Those that
   * don't will fail when it is called.
   *
   * @param sentence The tagged words to train on
   * @param weight   Weight for this sentence
   */
  void train(List<TaggedWord> sentence, double weight);

  /**
   * Trains this lexicon on a single tagged word.
   * Not all subclasses support this particular method.  Those that
   * don't will fail when it is called.
   *
   * @param tw     The tagged word to train on
   * @param loc    The position of the word in the sentence
   * @param weight Weight for this word
   */
  void train(TaggedWord tw, int loc, double weight);

  /**
   * If training on a per-word basis instead of on a per-tree basis,
   * we will want to increment the tree count as this happens.
   *
   * @param weight The amount by which to increment the tree count
   */
  void incrementTreesRead(double weight);

  /**
   * Sometimes we might have a sentence of tagged words which we would
   * like to add to the lexicon, but they weren't part of a binarized,
   * markovized, or otherwise annotated tree.
   *
   * @param sentence The tagged words to add
   * @param weight   Weight for this sentence
   */
  void trainUnannotated(List<TaggedWord> sentence, double weight);

  /**
   * Done collecting statistics for the lexicon.
   */
  void finishTraining();

  // Disabled declaration, kept for reference:
  //   Add additional words with expansion of subcategories.
  // void trainWithExpansion(Collection taggedWords);

  /**
   * Get the score of this word with this tag (as an IntTaggedWord) at this
   * loc.
   * (Presumably an estimate of P(word | tag).)
   *
   * @param iTW An IntTaggedWord pairing a word and POS tag
   * @param loc The position in the sentence.  <i>In the default implementation
   *               this is used only for unknown words to change their
   *               probability distribution when sentence initial.</i>
   * @param word The word itself; useful so we don't have to look it
   *               up in an index
   * @param featureSpec Presumably the same additional word features as are
   *               passed to {@link #ruleIteratorByWord(int, int, String)} --
   *               TODO confirm
   * @return A score, usually, log P(word|tag)
   */
  float score(IntTaggedWord iTW, int loc, String word, String featureSpec);

  /**
   * Write the lexicon in human-readable format to the Writer.
   * (An optional operation.)
   *
   * @param w The writer to output to
   * @throws IOException If any I/O problem
   */
  void writeData(Writer w) throws IOException;

  /**
   * Read the lexicon from the BufferedReader in the format written by
   * writeData.
   * (An optional operation.)
   *
   * @param in The BufferedReader to read from
   * @throws IOException If any I/O problem
   */
  void readData(BufferedReader in) throws IOException;

  /**
   * Returns the unknown word model of this lexicon.
   *
   * @return The UnknownWordModel of this lexicon
   */
  UnknownWordModel getUnknownWordModel();

  // todo [cdm Sep 2013]: It seems like we could easily remove this from the interface
  /**
   * Sets the unknown word model of this lexicon.
   *
   * @param uwm The UnknownWordModel to use
   */
  void setUnknownWordModel(UnknownWordModel uwm);

}