// edu.stanford.nlp.parser.lexparser.TreebankLangParserParams Maven / Gradle / Ivy
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.parser.metrics.AbstractEval;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.ling.*;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.List;
/**
* Contains language-specific methods commonly necessary to get a parser
* to parse an arbitrary treebank.
*
* @author Roger Levy
* @version 03/05/2003
*/
public interface TreebankLangParserParams extends TreebankFactory, Serializable {
HeadFinder headFinder();
HeadFinder typedDependencyHeadFinder();
/**
* Allows language specific processing (e.g., stemming) of head words.
*
* @param headWord An {@link edu.stanford.nlp.ling.Label} that minimally implements the
* {@link edu.stanford.nlp.ling.HasWord} and {@link edu.stanford.nlp.ling.HasTag} interfaces.
* @return A processed {@link edu.stanford.nlp.ling.Label}
*/
Label processHeadWord(Label headWord);
void setInputEncoding(String encoding);
void setOutputEncoding(String encoding);
/**
* If evalGFs = true, then the evaluation of parse trees will include evaluation on grammatical functions.
* Otherwise, evaluation will strip the grammatical functions.
*/
void setEvaluateGrammaticalFunctions(boolean evalGFs);
/**
* Returns the output encoding being used.
* @return The output encoding being used.
*/
String getOutputEncoding();
/**
* Returns the input encoding being used.
* @return The input encoding being used.
*/
String getInputEncoding();
/**
* Returns a factory for reading in trees from the source you want. It's
* the responsibility of trf to deal properly with character-set encoding
* of the input. It also is the responsibility of trf to properly
* normalize trees.
*
* @return A factory that vends an appropriate TreeReader
*/
TreeReaderFactory treeReaderFactory();
/**
* Vends a {@link Lexicon} object suitable to the particular language/treebank combination of interest.
* @param op Options as to how the Lexicon behaves
* @return A Lexicon, constructed based on the given option
*/
Lexicon lex(Options op, Index wordIndex, Index tagIndex);
/**
* The tree transformer applied to trees prior to evaluation.
* For instance, it might delete punctuation nodes. This method will
* be applied both to the parse output tree and to the gold
* tree. The exact specification depends on "standard practice" for
* various treebanks.
*
* @return A TreeTransformer that performs adjustments to trees to delete
* or equivalence class things not evaluated in the parser performance
* evaluation.
*/
TreeTransformer collinizer();
/**
* the tree transformer used to produce trees for evaluation. Will
* be applied both to the parse output tree and to the gold
* tree. Should strip punctuation and maybe do some other
* things. The evalb version should strip some more stuff
* off. (finish this doc!)
*/
TreeTransformer collinizerEvalb();
/**
* returns a MemoryTreebank appropriate to the treebank source
*/
MemoryTreebank memoryTreebank();
/**
* returns a DiskTreebank appropriate to the treebank source
*/
DiskTreebank diskTreebank();
/**
* returns a MemoryTreebank appropriate to the testing treebank source
*/
MemoryTreebank testMemoryTreebank();
/**
* Required to extend TreebankFactory
*/
Treebank treebank();
/**
* returns a TreebankLanguagePack containing Treebank-specific (but
* not parser-specific) info such as what is punctuation, and also
* information about the structure of labels
*/
TreebankLanguagePack treebankLanguagePack();
/**
* returns a PrintWriter used to print output. It's the
* responsibility of the returned PrintWriter to deal properly with
* character encodings for the relevant treebank
*/
PrintWriter pw();
/**
* returns a PrintWriter used to print output to the OutputStream
* o. It's the responsibility of the returned PrintWriter to deal
* properly with character encodings for the relevant treebank
*/
PrintWriter pw(OutputStream o);
/**
* Returns the splitting strings used for selective splits.
*
* @return An array containing ancestor-annotated Strings: categories
* should be split according to these ancestor annotations.
*/
String[] sisterSplitters();
/**
* Returns a TreeTransformer appropriate to the Treebank which
* can be used to remove functional tags (such as "-TMP") from
* categories.
*/
TreeTransformer subcategoryStripper();
/**
* This method does language-specific tree transformations such
* as annotating particular nodes with language-relevant features.
* Such parameterizations should be inside the specific
* TreebankLangParserParams class. This method is recursively
* applied to each node in the tree (depth first, left-to-right),
* so you shouldn't write this method to apply recursively to tree
* members. This method is allowed to (and in some cases does)
* destructively change the input tree {@code t}. It changes both
* labels and the tree shape.
*
* @param t The input tree (with non-language specific annotation already
* done, so you need to strip back to basic categories)
* @param root The root of the current tree (can be null for words)
* @return The fully annotated tree node (with daughters still as you
* want them in the final result)
*/
Tree transformTree(Tree t, Tree root);
/**
* display language-specific settings
*/
void display();
/**
* Set a language-specific option according to command-line flags.
* This routine should try to process the option starting at args[i] (which
* might potentially be several arguments long if it takes arguments).
* It should return the index after the last index it consumed in
* processing. In particular, if it cannot process the current option,
* the return value should be i.
*
* @param args Array of command line arguments
* @param i Index in command line arguments to try to process as an option
* @return The index of the item after arguments processed as part of this
* command line option.
*/
int setOptionFlag(String[] args, int i);
/**
* Return a default sentence of the language (for testing).
* @return A default sentence of the language
*/
List extends HasWord> defaultTestSentence();
TokenizerFactory treeTokenizerFactory();
Extractor dependencyGrammarExtractor(Options op, Index wordIndex, Index tagIndex);
/**
* Give the parameters for smoothing in the MLEDependencyGrammar.
* @return an array of doubles with smooth_aT_hTWd, smooth_aTW_hTWd, smooth_stop, and interp
*/
double[] MLEDependencyGrammarSmoothingParams();
/**
* Returns a language specific object for evaluating PP attachment
*
* @return An object that implements {@link AbstractEval}
*/
AbstractEval ppAttachmentEval();
/**
* Returns a function which reads the given filename and turns its
* content in a list of GrammaticalStructures. Will throw
* UnsupportedOperationException if the language doesn't support
* dependencies or GrammaticalStructures.
*/
List readGrammaticalStructureFromFile(String filename);
/**
* Build a GrammaticalStructure from a Tree. Throws
* UnsupportedOperationException if the language doesn't support
* dependencies or GrammaticalStructures.
*/
GrammaticalStructure getGrammaticalStructure(Tree t, Predicate filter,
HeadFinder hf);
/** Whether our code provides support for converting phrase structure
* (constituency) parses to (basic) dependency parses.
* @return Whether dependencies are supported for a language
*
*/
boolean supportsBasicDependencies();
/** Set whether to generate original Stanford Dependencies or the newer
* Universal Dependencies.
*
* @param originalDependencies Whether to generate SD
*/
void setGenerateOriginalDependencies(boolean originalDependencies);
/** Whether to generate original Stanford Dependencies or the newer
* Universal Dependencies.
*
* @return Whether to generate SD
*/
boolean generateOriginalDependencies();
/** When run inside StanfordCoreNLP, which flags should be used by default.
* E.g., the current use is that for English, we want it to run with the
* option to retain "-TMP" functional tags but not to impose that on
* other languages.
*/
String[] defaultCoreNLPFlags();
}