edu.berkeley.nlp.treebank.ChineseTreebankLanguagePack Maven / Gradle / Ivy

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

package edu.berkeley.nlp.treebank;

import java.io.Serializable;

import edu.berkeley.nlp.tokenizer.TokenizerFactory;
import edu.berkeley.nlp.util.Filter;
import edu.berkeley.nlp.util.Filters;

/**
 * Language pack for Chinese treebank.  (Look into using native2ascii
 * to edit this file as a GB file)
 *
 * @author Roger Levy
 */

public class ChineseTreebankLanguagePack extends AbstractTreebankLanguagePack implements Serializable {

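  // Tokenizer factory for Chinese text; it is not created here but installed
  // by callers via setTokenizerFactory(TokenizerFactory).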
  private static TokenizerFactory tf;

  public static void setTokenizerFactory(TokenizerFactory tf) {
    ChineseTreebankLanguagePack.tf = tf;
  }

  public static final String ENCODING = "GB18030";

  /**
   * Return the input Charset encoding for the Treebank.
   * See documentation for the Charset class.
   *
   * @return Name of Charset
   */
  public String getEncoding() {
    return ENCODING;
  }

  /**
   * Accepts a String that is a punctuation
   * tag name, and rejects everything else.
   *
   * @return Whether this is a punctuation tag
   */
  public boolean isPunctuationTag(String str) {
    return str.equals("PU");
  }


  /**
   * Accepts a String that is a punctuation
   * word, and rejects everything else.
   * If one can't tell for sure (as for ' in the Penn Treebank), it
   * makes the best guess that it can.
   *
   * @return Whether this is a punctuation word
   */
  public boolean isPunctuationWord(String str) {
    return chineseCommaAcceptFilter().accept(str)
        || chineseEndSentenceAcceptFilter().accept(str)
        || chineseDouHaoAcceptFilter().accept(str)
        || chineseQuoteMarkAcceptFilter().accept(str)
        || chineseParenthesisAcceptFilter().accept(str)
        || chineseColonAcceptFilter().accept(str)
        || chineseDashAcceptFilter().accept(str)
        || chineseOtherAcceptFilter().accept(str);
  }


  /**
   * Accepts a String that is a sentence end
   * punctuation tag, and rejects everything else.
   *
   * @return Whether this is a sentence final punctuation tag
   */
  public boolean isSentenceFinalPunctuationTag(String str) {
    return chineseEndSentenceAcceptFilter().accept(str);
  }


  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags
   */
  public String[] punctuationTags() {
    return tags;
  }


  /**
   * Returns a String array of punctuation words for this treebank/language.
   *
   * @return The punctuation words
   */
  public String[] punctuationWords() {
    return punctWords;
  }


  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  public String[] sentenceFinalPunctuationTags() {
    return tags;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation words
   */
  public String[] sentenceFinalPunctuationWords() {
    return endSentence;
  }

  /**
   * Accepts a String that is a punctuation
   * tag that should be ignored by EVALB-style evaluation,
   * and rejects everything else.
   * Traditionally, EVALB has ignored a subset of the total set of
   * punctuation tags in the English Penn Treebank (quotes and
   * period, comma, colon, etc., but not brackets).
   *
   * @return Whether this is an EVALB-ignored punctuation tag
   */
  public boolean isEvalBIgnoredPunctuationTag(String str) {
    return Filters.collectionAcceptFilter(tags).accept(str);
  }


  /**
   * The first 3 are used by the Penn Treebank; # is used by the
   * BLLIP corpus, and ^ and ~ are used by Klein's
   * lexparser. Identical to PennTreebankLanguagePack.
   */
  private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'};


  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SBJ=1").  This would
   * be truncated to "NP" by the array containing '-' and "=".
   *
   * @return An array of characters that set off label name suffixes
   */
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }


  /**
   * This is valid for "BobChrisTreeNormalizer" conventions
   * only. Again, identical to PennTreebankLanguagePack.
   */
  private static final String[] startSymbols = {"ROOT"};

  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols
   */
  public String[] startSymbols() {
    return startSymbols;
  }


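  // Punctuation inventory for the Chinese Treebank, grouped by class.  Most
  // entries are full-width (CJK) characters, with ASCII variants and the
  // bracket tokens -LRB-/-RRB- listed alongside them where they occur.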
  private static final String[] tags = {"PU"};
  private static final String[] comma = {",", "\uff0c", "\u3000"};  // \u3000 is an "ideographic space"...?
  private static final String[] endSentence = {"\u3002", "\uff0e", "\uff01", "\uff1f", "?", "!", "."};
  private static final String[] douHao = {"\u3001"};
  private static final String[] quoteMark = {"\u201c", "\u201d", "\u2018", "\u2019", "\u300a", "\u300b", "\u300e", "\u300f", "\u3008", "\u3009", "\u300c", "\u300d", "\uff02", "\uff1c", "\uff1e", "`", "\uff07"};
  private static final String[] parenthesis = {"\uff08", "\uff09", "-LRB-", "-RRB-", "\u3010", "\u3011"};
  private static final String[] colon = {"\uff1a", "\uff1b", "\u2236", ":"};
  private static final String[] dash = {"\u2026", "\u2014", "\u2014\u2014", "\u2014\u2014\u2014", "\uff0d", "\uff0d\uff0d", "\u2500\u2500", "\u2501", "\u2501\u2501", "\u2014\uff0d", "-", "----", "~", "\u2026\u2026", "\uff5e"};
  private static final String[] other = {"\u00b7", "\uff0f", "\uff0f", "\uff0a", "\uff06", "/", "//", "*" };  // slashes are used in urls

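  // Directional subsets of the quote and parenthesis classes above, exposed
  // through the left/right accept filters further down.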
  private static String[] leftQuoteMark = {"\u201c", "\u2018", "\u300a", "\u300e", "\u3008", "\u300c", "\uff1c", "`"};
  private static String[] rightQuoteMark = {"\u201d", "\u2019", "\u300b", "\u300f", "\u3009", "\u300d", "\uff1e", "\uff07"};
  private static String[] leftParenthesis = {"\uff08", "-LRB-", "\u3010"};
  private static String[] rightParenthesis = {"\uff09", "-RRB-", "\u3011"};


  private static final String[] punctWords;

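  // punctWords is the flat concatenation of the tag array and every
  // punctuation word class above, assembled once when the class is loaded.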
  static {
    int n = tags.length + comma.length + endSentence.length + douHao.length + quoteMark.length + parenthesis.length + colon.length + dash.length + other.length;
    punctWords = new String[n];
    int m = 0;
    System.arraycopy(tags, 0, punctWords, m, tags.length);
    m += tags.length;
    System.arraycopy(comma, 0, punctWords, m, comma.length);
    m += comma.length;
    System.arraycopy(endSentence, 0, punctWords, m, endSentence.length);
    m += endSentence.length;
    System.arraycopy(douHao, 0, punctWords, m, douHao.length);
    m += douHao.length;
    System.arraycopy(quoteMark, 0, punctWords, m, quoteMark.length);
    m += quoteMark.length;
    System.arraycopy(parenthesis, 0, punctWords, m, parenthesis.length);
    m += parenthesis.length;
    System.arraycopy(colon, 0, punctWords, m, colon.length);
    m += colon.length;
    System.arraycopy(dash, 0, punctWords, m, dash.length);
    m += dash.length;
    System.arraycopy(other, 0, punctWords, m, other.length);
  }

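  // Accept filters for the individual punctuation classes: each returns a
  // Filter that accepts exactly the strings in the corresponding array above.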
  public static Filter chineseCommaAcceptFilter() {
    return Filters.collectionAcceptFilter(comma);
  }

  public static Filter chineseEndSentenceAcceptFilter() {
    return Filters.collectionAcceptFilter(endSentence);
  }

  public static Filter chineseDouHaoAcceptFilter() {
    return Filters.collectionAcceptFilter(douHao);
  }

  public static Filter chineseQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(quoteMark);
  }

  public static Filter chineseParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(parenthesis);
  }

  public static Filter chineseColonAcceptFilter() {
    return Filters.collectionAcceptFilter(colon);
  }

  public static Filter chineseDashAcceptFilter() {
    return Filters.collectionAcceptFilter(dash);
  }

  public static Filter chineseOtherAcceptFilter() {
    return Filters.collectionAcceptFilter(other);
  }


  public static Filter chineseLeftParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(leftParenthesis);
  }

  public static Filter chineseRightParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(rightParenthesis);
  }

  public static Filter chineseLeftQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(leftQuoteMark);
  }

  public static Filter chineseRightQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(rightQuoteMark);
  }

  /**
   * Returns the extension of treebank files for this treebank.
   * This is "fid".
   */
  public String treebankFileExtension() {
    return "fid";
  }

  private static final long serialVersionUID = 5757403475523638802L;

}
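
A minimal usage sketch for the class above. The demo class name (CtbLanguagePackDemo) and its main method are illustrative only; every call comes from the source listing, and the sketch assumes the implicit no-argument constructor inherited from AbstractTreebankLanguagePack is usable.

import edu.berkeley.nlp.treebank.ChineseTreebankLanguagePack;

public class CtbLanguagePackDemo {
  public static void main(String[] args) {
    // No explicit constructor is declared, so the default constructor is used.
    ChineseTreebankLanguagePack tlp = new ChineseTreebankLanguagePack();

    System.out.println(tlp.getEncoding());               // "GB18030"
    System.out.println(tlp.treebankFileExtension());     // "fid"
    System.out.println(tlp.isPunctuationTag("PU"));      // true: the CTB's only punctuation tag
    System.out.println(tlp.isPunctuationWord("\u3002")); // true: ideographic full stop
    // Note: per the source, this check is delegated to the sentence-final word filter.
    System.out.println(tlp.isSentenceFinalPunctuationTag("\u3002")); // true
  }
}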