edu.stanford.nlp.trees.PennTreebankLanguagePack Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import java.util.function.Predicate;


/**
 * Specifies the treebank/language specific components needed for
 * parsing the English Penn Treebank.
 *
 * @author Christopher Manning
 * @version 1.2
 */
public class PennTreebankLanguagePack extends AbstractTreebankLanguagePack {

  /**
   * Gives a handle to the TreebankLanguagePack
   */
  public PennTreebankLanguagePack() {
  }


  private static final String[] pennPunctTags = {"''", "``", "-LRB-", "-RRB-", ".", ":", ","};

  private static final String[] pennSFPunctTags = {"."};

  private static final String[] collinsPunctTags = {"''", "``", ".", ":", ","};

  private static final String[] pennPunctWords = {"''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?", "!", ",", ":", "-", "--", "...", ";"};

  private static final String[] pennSFPunctWords = {".", "!", "?"};


  /**
   * The first 3 are used by the Penn Treebank; # is used by the
   * BLLIP corpus, and ^ and ~ are used by Klein's lexparser.
   * Teg added _ (let me know if it hurts).
   * John Bauer added [ on account of category annotations added when
   * printing out lexicalized dependencies.  Note that ] ought to be
   * unnecessary, since it would end the annotation, not start it.
   */
  private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~', '_', '['};

  /**
   * This is valid for "BobChrisTreeNormalizer" conventions only.
   */
  private static final String[] pennStartSymbols = {"ROOT", "TOP"};


  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags
   */
  @Override
  public String[] punctuationTags() {
    return pennPunctTags;
  }


  /**
   * Returns a String array of punctuation words for this treebank/language.
   *
   * @return The punctuation words
   */
  @Override
  public String[] punctuationWords() {
    return pennPunctWords;
  }


  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  @Override
  public String[] sentenceFinalPunctuationTags() {
    return pennSFPunctTags;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  @Override
  public String[] sentenceFinalPunctuationWords() {
    return pennSFPunctWords;
  }

  /**
   * Returns a String array of punctuation tags that EVALB-style evaluation
   * should ignore for this treebank/language.
   * Traditionally, EVALB has ignored a subset of the total set of
   * punctuation tags in the English Penn Treebank (quotes and
   * period, comma, colon, etc., but not brackets)
   *
   * @return Whether this is a EVALB-ignored punctuation tag
   */
  @Override
  public String[] evalBIgnoredPunctuationTags() {
    return collinsPunctTags;
  }


  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SBJ=1").  This would
   * be truncated to "NP" by the array containing '-' and "=".
   *
   * @return An array of characters that set off label name suffixes
   */
  @Override
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }


  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols
   */
  @Override
  public String[] startSymbols() {
    return pennStartSymbols;
  }

  /**
   * Returns a factory for {@link PTBTokenizer}.
   *
   * @return A tokenizer
   */
  @Override
  public TokenizerFactory getTokenizerFactory() {
    return PTBTokenizer.coreLabelFactory();
  }

  /**
   * Returns the extension of treebank files for this treebank.
   * This is "mrg".
   */
  @Override
  public String treebankFileExtension() {
    return "mrg";
  }

  /**
   * Return a GrammaticalStructure suitable for this language/treebank.
   *
   * @return A GrammaticalStructure suitable for this language/treebank.
   */
  @Override
  public GrammaticalStructureFactory grammaticalStructureFactory() {
    return new EnglishGrammaticalStructureFactory();
  }

  /**
   * Return a GrammaticalStructure suitable for this language/treebank.
   * 
   * Note: This is loaded by reflection so basic treebank use does not require all the Stanford Dependencies code.
   *
   * @return A GrammaticalStructure suitable for this language/treebank.
   */
  @Override
  public GrammaticalStructureFactory grammaticalStructureFactory(Predicate puncFilter) {
    return new EnglishGrammaticalStructureFactory(puncFilter);
  }

  @Override
  public GrammaticalStructureFactory grammaticalStructureFactory(Predicate puncFilter, HeadFinder hf) {
    return new EnglishGrammaticalStructureFactory(puncFilter, hf);
  }

  @Override
  public boolean supportsGrammaticalStructures() {
    return true;
  }

  /** {@inheritDoc} */
  @Override
  public HeadFinder headFinder() {
    return new ModCollinsHeadFinder(this);
  }

  /** {@inheritDoc} */
  @Override
  public HeadFinder typedDependencyHeadFinder() {
    return new SemanticHeadFinder(this, true);
  }


  /** Prints a few aspects of the TreebankLanguagePack, just for debugging.
   */
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    System.out.println("Start symbol: " + tlp.startSymbol());
    String start = tlp.startSymbol();
    System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
    String[] strs = {"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3"};
    for (String str : strs) {
      System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str));
    }
  }

  private static final long serialVersionUID = 9081305982861675328L;

}