edu.stanford.nlp.parser.lexparser.TreebankLangParserParams Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.parser.metrics.AbstractEval;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.ling.*;

import java.util.Properties;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;

import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.List;


/**
 * Contains language-specific methods commonly necessary to get a parser
 * to parse an arbitrary treebank.
 *
 * @author Roger Levy
 * @version 03/05/2003
 */
public interface TreebankLangParserParams extends TreebankFactory, Serializable {

  HeadFinder headFinder();

  HeadFinder typedDependencyHeadFinder();

  /**
   * Allows language specific processing (e.g., stemming) of head words.
   *
   * @param headWord An {@link edu.stanford.nlp.ling.Label} that minimally implements the
   * {@link edu.stanford.nlp.ling.HasWord} and {@link edu.stanford.nlp.ling.HasTag} interfaces.
   * @return A processed {@link edu.stanford.nlp.ling.Label}
   */
  Label processHeadWord(Label headWord);

  void setInputEncoding(String encoding);

  void setOutputEncoding(String encoding);

  /**
   * If evalGFs = true, then the evaluation of parse trees will include evaluation on grammatical functions.
   * Otherwise, evaluation will strip the grammatical functions.
   */
  void setEvaluateGrammaticalFunctions(boolean evalGFs);

  /**
   * Returns the output encoding being used.
   * @return The output encoding being used.
   */
  String getOutputEncoding();

  /**
   * Returns the input encoding being used.
   * @return The input encoding being used.
   */
  String getInputEncoding();


  /**
   * Returns a factory for reading in trees from the source you want.  It's
   * the responsibility of trf to deal properly with character-set encoding
   * of the input.  It also is the responsibility of trf to properly
   * normalize trees.
   *
   * @return A factory that vends an appropriate TreeReader
   */
  TreeReaderFactory treeReaderFactory();


  /**
   * Vends a {@link Lexicon} object suitable to the particular language/treebank combination of interest.
   * @param op Options as to how the Lexicon behaves
   * @return A Lexicon, constructed based on the given option
   */
  Lexicon lex(Options op, Index wordIndex, Index tagIndex);


  /**
   * The tree transformer applied to trees prior to evaluation.
   * For instance, it might delete punctuation nodes.  This method will
   * be applied both to the parse output tree and to the gold
   * tree.  The exact specification depends on "standard practice" for
   * various treebanks.
   *
   * @return A TreeTransformer that performs adjustments to trees to delete
   *     or equivalence class things not evaluated in the parser performance
   *     evaluation.
   */
  TreeTransformer collinizer();


  /**
   * the tree transformer used to produce trees for evaluation.  Will
   * be applied both to the parse output tree and to the gold
   * tree. Should strip punctuation and maybe do some other
   * things. The evalb version should strip some more stuff
   * off. (finish this doc!)
   */
  TreeTransformer collinizerEvalb();

  /**
   * returns a MemoryTreebank appropriate to the treebank source
   */
  MemoryTreebank memoryTreebank();

  /**
   * returns a DiskTreebank appropriate to the treebank source
   */
  DiskTreebank diskTreebank();

  /**
   * returns a MemoryTreebank appropriate to the testing treebank source
   */
  MemoryTreebank testMemoryTreebank();

  /**
   * Required to extend TreebankFactory
   */
  Treebank treebank();

  /**
   * returns a TreebankLanguagePack containing Treebank-specific (but
   * not parser-specific) info such as what is punctuation, and also
   * information about the structure of labels
   */
  TreebankLanguagePack treebankLanguagePack();

  /**
   * returns a PrintWriter used to print output. It's the
   * responsibility of the returned PrintWriter to deal properly with
   * character encodings for the relevant treebank
   */
  PrintWriter pw();

  /**
   * returns a PrintWriter used to print output to the OutputStream
   * o. It's the responsibility of the returned PrintWriter to deal
   * properly with character encodings for the relevant treebank
   */
  PrintWriter pw(OutputStream o);


  /**
   * Returns the splitting strings used for selective splits.
   *
   * @return An array containing ancestor-annotated Strings: categories
   *         should be split according to these ancestor annotations.
   */
  String[] sisterSplitters();


  /**
   * Returns a TreeTransformer appropriate to the Treebank which
   * can be used to remove functional tags (such as "-TMP") from
   * categories.
   */
  TreeTransformer subcategoryStripper();

  /**
   * This method does language-specific tree transformations such
   * as annotating particular nodes with language-relevant features.
   * Such parameterizations should be inside the specific
   * TreebankLangParserParams class.  This method is recursively
   * applied to each node in the tree (depth first, left-to-right),
   * so you shouldn't write this method to apply recursively to tree
   * members.  This method is allowed to (and in some cases does)
   * destructively change the input tree {@code t}. It changes both
   * labels and the tree shape.
   *
   * @param t The input tree (with non-language specific annotation already
   *           done, so you need to strip back to basic categories)
   * @param root The root of the current tree (can be null for words)
   * @return The fully annotated tree node (with daughters still as you
   *           want them in the final result)
   */
  Tree transformTree(Tree t, Tree root);

  /**
   * display language-specific settings
   */
  void display();

  /**
   * Set a language-specific option according to command-line flags.
   * This routine should try to process the option starting at args[i] (which
   * might potentially be several arguments long if it takes arguments).
   * It should return the index after the last index it consumed in
   * processing.  In particular, if it cannot process the current option,
   * the return value should be i.
   *
   * @param args Array of command line arguments
   * @param i    Index in command line arguments to try to process as an option
   * @return The index of the item after arguments processed as part of this
   *         command line option.
   */
  int setOptionFlag(String[] args, int i);


  /**
   * Return a default sentence of the language (for testing).
   * @return A default sentence of the language
   */
  List defaultTestSentence();

  TokenizerFactory treeTokenizerFactory();

  Extractor dependencyGrammarExtractor(Options op, Index wordIndex, Index tagIndex);

  /**
   * Give the parameters for smoothing in the MLEDependencyGrammar.
   * @return an array of doubles with smooth_aT_hTWd, smooth_aTW_hTWd, smooth_stop, and interp
   */
  double[] MLEDependencyGrammarSmoothingParams();

  /**
   * Returns a language specific object for evaluating PP attachment
   *
   * @return An object that implements {@link AbstractEval}
   */
  AbstractEval ppAttachmentEval();

  /**
   * Returns a function which reads the given filename and turns its
   * content in a list of GrammaticalStructures.  Will throw
   * UnsupportedOperationException if the language doesn't support
   * dependencies or GrammaticalStructures.
   */
  List readGrammaticalStructureFromFile(String filename);

  /**
   * Build a GrammaticalStructure from a Tree.  Throws
   * UnsupportedOperationException if the language doesn't support
   * dependencies or GrammaticalStructures.
   */
  GrammaticalStructure getGrammaticalStructure(Tree t, Predicate filter,
                                               HeadFinder hf);

  /** Whether our code provides support for converting phrase structure
   *  (constituency) parses to (basic) dependency parses.
   *  @return Whether dependencies are supported for a language
   *
   */
  boolean supportsBasicDependencies();

  /** Set whether to generate original Stanford Dependencies or the newer
   *  Universal Dependencies.
   *
   *  @param originalDependencies Whether to generate SD
   */
  void setGenerateOriginalDependencies(boolean originalDependencies);

  /** Whether to generate original Stanford Dependencies or the newer
   *  Universal Dependencies.
   *
   *  @return Whether to generate SD
   */
  boolean generateOriginalDependencies();


  /** When run inside StanfordCoreNLP, which flags should be used by default.
   *  E.g., the current use is that for English, we want it to run with the
   *  option to retain "-TMP" functional tags but not to impose that on
   *  other languages.
   */
  String[] defaultCoreNLPFlags();

}