edu.berkeley.nlp.treebank.ChineseTreebankLanguagePack Maven / Gradle / Ivy

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

package edu.berkeley.nlp.treebank;

import java.io.Serializable;

import edu.berkeley.nlp.tokenizer.TokenizerFactory;
import edu.berkeley.nlp.util.Filter;
import edu.berkeley.nlp.util.Filters;

/**
 * Language pack for Chinese treebank.  (Look into using native2ascii
 * to edit this file as a GB file)
 *
 * @author Roger Levy
 */

public class ChineseTreebankLanguagePack extends AbstractTreebankLanguagePack implements Serializable {

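  // Tokenizer factory for Chinese text; it is not created here but installed
  // by callers via setTokenizerFactory(TokenizerFactory).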
  private static TokenizerFactory tf;

  public static void setTokenizerFactory(TokenizerFactory tf) {
    ChineseTreebankLanguagePack.tf = tf;
  }

  public static final String ENCODING = "GB18030";

  /**
   * Return the input Charset encoding for the Treebank.
   * See documentation for the Charset class.
   *
   * @return Name of Charset
   */
  public String getEncoding() {
    return ENCODING;
  }

  /**
   * Accepts a String that is a punctuation
   * tag name, and rejects everything else.
   *
   * @return Whether this is a punctuation tag
   */
  public boolean isPunctuationTag(String str) {
    return str.equals("PU");
  }


  /**
   * Accepts a String that is a punctuation
   * word, and rejects everything else.
   * If one can't tell for sure (as for ' in the Penn Treebank), it
   * makes the best guess that it can.
   *
   * @return Whether this is a punctuation word
   */
  public boolean isPunctuationWord(String str) {
    return chineseCommaAcceptFilter().accept(str)
        || chineseEndSentenceAcceptFilter().accept(str)
        || chineseDouHaoAcceptFilter().accept(str)
        || chineseQuoteMarkAcceptFilter().accept(str)
        || chineseParenthesisAcceptFilter().accept(str)
        || chineseColonAcceptFilter().accept(str)
        || chineseDashAcceptFilter().accept(str)
        || chineseOtherAcceptFilter().accept(str);
  }


  /**
   * Accepts a String that is a sentence end
   * punctuation tag, and rejects everything else.
   *
   * @return Whether this is a sentence final punctuation tag
   */
  public boolean isSentenceFinalPunctuationTag(String str) {
    return chineseEndSentenceAcceptFilter().accept(str);
  }


  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags
   */
  public String[] punctuationTags() {
    return tags;
  }


  /**
   * Returns a String array of punctuation words for this treebank/language.
   *
   * @return The punctuation words
   */
  public String[] punctuationWords() {
    return punctWords;
  }


  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  public String[] sentenceFinalPunctuationTags() {
    return tags;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation words
   */
  public String[] sentenceFinalPunctuationWords() {
    return endSentence;
  }

  /**
   * Accepts a String that is a punctuation
   * tag that should be ignored by EVALB-style evaluation,
   * and rejects everything else.
   * Traditionally, EVALB has ignored a subset of the total set of
   * punctuation tags in the English Penn Treebank (quotes and
   * period, comma, colon, etc., but not brackets).
   *
   * @return Whether this is an EVALB-ignored punctuation tag
   */
  public boolean isEvalBIgnoredPunctuationTag(String str) {
    return Filters.collectionAcceptFilter(tags).accept(str);
  }


  /**
   * The first 3 are used by the Penn Treebank; # is used by the
   * BLLIP corpus, and ^ and ~ are used by Klein's
   * lexparser. Identical to PennTreebankLanguagePack.
   */
  private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'};


  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SBJ=1").  This would
   * be truncated to "NP" by the array containing '-' and "=".
   *
   * @return An array of characters that set off label name suffixes
   */
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }


  /**
   * This is valid for "BobChrisTreeNormalizer" conventions
   * only. Again, identical to PennTreebankLanguagePack.
   */
  private static final String[] startSymbols = {"ROOT"};

  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols
   */
  public String[] startSymbols() {
    return startSymbols;
  }


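  // Punctuation inventory for the Chinese Treebank, grouped by class.  Most
  // entries are full-width (CJK) characters, with ASCII variants and the
  // bracket tokens -LRB-/-RRB- listed alongside them where they occur.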
  private static final String[] tags = {"PU"};
  private static final String[] comma = {",", "\uff0c", "\u3000"};  // \u3000 is an "ideographic space"...?
  private static final String[] endSentence = {"\u3002", "\uff0e", "\uff01", "\uff1f", "?", "!", "."};
  private static final String[] douHao = {"\u3001"};
  private static final String[] quoteMark = {"\u201c", "\u201d", "\u2018", "\u2019", "\u300a", "\u300b", "\u300e", "\u300f", "\u3008", "\u3009", "\u300c", "\u300d", "\uff02", "\uff1c", "\uff1e", "`", "\uff07"};
  private static final String[] parenthesis = {"\uff08", "\uff09", "-LRB-", "-RRB-", "\u3010", "\u3011"};
  private static final String[] colon = {"\uff1a", "\uff1b", "\u2236", ":"};
  private static final String[] dash = {"\u2026", "\u2014", "\u2014\u2014", "\u2014\u2014\u2014", "\uff0d", "\uff0d\uff0d", "\u2500\u2500", "\u2501", "\u2501\u2501", "\u2014\uff0d", "-", "----", "~", "\u2026\u2026", "\uff5e"};
  private static final String[] other = {"\u00b7", "\uff0f", "\uff0f", "\uff0a", "\uff06", "/", "//", "*" };  // slashes are used in urls

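  // Directional subsets of the quote and parenthesis classes above, exposed
  // through the left/right accept filters further down.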
  private static String[] leftQuoteMark = {"\u201c", "\u2018", "\u300a", "\u300e", "\u3008", "\u300c", "\uff1c", "`"};
  private static String[] rightQuoteMark = {"\u201d", "\u2019", "\u300b", "\u300f", "\u3009", "\u300d", "\uff1e", "\uff07"};
  private static String[] leftParenthesis = {"\uff08", "-LRB-", "\u3010"};
  private static String[] rightParenthesis = {"\uff09", "-RRB-", "\u3011"};


  private static final String[] punctWords;

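  // punctWords is the flat concatenation of the tag array and every
  // punctuation word class above, assembled once when the class is loaded.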
  static {
    int n = tags.length + comma.length + endSentence.length + douHao.length + quoteMark.length + parenthesis.length + colon.length + dash.length + other.length;
    punctWords = new String[n];
    int m = 0;
    System.arraycopy(tags, 0, punctWords, m, tags.length);
    m += tags.length;
    System.arraycopy(comma, 0, punctWords, m, comma.length);
    m += comma.length;
    System.arraycopy(endSentence, 0, punctWords, m, endSentence.length);
    m += endSentence.length;
    System.arraycopy(douHao, 0, punctWords, m, douHao.length);
    m += douHao.length;
    System.arraycopy(quoteMark, 0, punctWords, m, quoteMark.length);
    m += quoteMark.length;
    System.arraycopy(parenthesis, 0, punctWords, m, parenthesis.length);
    m += parenthesis.length;
    System.arraycopy(colon, 0, punctWords, m, colon.length);
    m += colon.length;
    System.arraycopy(dash, 0, punctWords, m, dash.length);
    m += dash.length;
    System.arraycopy(other, 0, punctWords, m, other.length);
  }

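  // Accept filters for the individual punctuation classes: each returns a
  // Filter that accepts exactly the strings in the corresponding array above.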
  public static Filter chineseCommaAcceptFilter() {
    return Filters.collectionAcceptFilter(comma);
  }

  public static Filter chineseEndSentenceAcceptFilter() {
    return Filters.collectionAcceptFilter(endSentence);
  }

  public static Filter chineseDouHaoAcceptFilter() {
    return Filters.collectionAcceptFilter(douHao);
  }

  public static Filter chineseQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(quoteMark);
  }

  public static Filter chineseParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(parenthesis);
  }

  public static Filter chineseColonAcceptFilter() {
    return Filters.collectionAcceptFilter(colon);
  }

  public static Filter chineseDashAcceptFilter() {
    return Filters.collectionAcceptFilter(dash);
  }

  public static Filter chineseOtherAcceptFilter() {
    return Filters.collectionAcceptFilter(other);
  }


  public static Filter chineseLeftParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(leftParenthesis);
  }

  public static Filter chineseRightParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(rightParenthesis);
  }

  public static Filter chineseLeftQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(leftQuoteMark);
  }

  public static Filter chineseRightQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(rightQuoteMark);
  }

  /**
   * Returns the extension of treebank files for this treebank.
   * This is "fid".
   */
  public String treebankFileExtension() {
    return "fid";
  }

  private static final long serialVersionUID = 5757403475523638802L;

}
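
A minimal usage sketch for the class above. The demo class name (CtbLanguagePackDemo) and its main method are illustrative only; every call comes from the source listing, and the sketch assumes the implicit no-argument constructor inherited from AbstractTreebankLanguagePack is usable.

import edu.berkeley.nlp.treebank.ChineseTreebankLanguagePack;

public class CtbLanguagePackDemo {
  public static void main(String[] args) {
    // No explicit constructor is declared, so the default constructor is used.
    ChineseTreebankLanguagePack tlp = new ChineseTreebankLanguagePack();

    System.out.println(tlp.getEncoding());               // "GB18030"
    System.out.println(tlp.treebankFileExtension());     // "fid"
    System.out.println(tlp.isPunctuationTag("PU"));      // true: the CTB's only punctuation tag
    System.out.println(tlp.isPunctuationWord("\u3002")); // true: ideographic full stop
    // Note: per the source, this check is delegated to the sentence-final word filter.
    System.out.println(tlp.isSentenceFinalPunctuationTag("\u3002")); // true
  }
}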