edu.stanford.nlp.trees.international.arabic.ArabicTreebankLanguagePack Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.trees.international.arabic;

import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.arabic.process.ArabicTokenizer;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.ling.HasWord;

/**
 * Specifies the treebank/language specific components needed for
 * parsing the Penn Arabic Treebank (ATB). This language pack has been updated for
 * ATB1v4, ATB2v3, and ATB3v3.2
 * 
 * The encoding for the ATB is the default UTF-8 specified in AbstractTreebankLanguagePack.
 *
 * @author Christopher Manning
 * @author Mona Diab
 * @author Roger Levy
 * @author Spence Green
 *
 */
public class ArabicTreebankLanguagePack extends AbstractTreebankLanguagePack {

  private static final long serialVersionUID = 9081305982861675328L;

  private static final String[] collinsPunctTags = {"PUNC"};

  private static final String[] pennPunctTags = {"PUNC"};

  private static final String[] pennPunctWords = {".","\"",",","-LRB-","-RRB-","-",":","/","?","_","*","%","!",">","-PLUS-","...",";","..","&","=","ر","'","\\","`","......"};

  private static final String[] pennSFPunctTags = {"PUNC"};

  private static final String[] pennSFPunctWords = {".", "!", "?"};

  /**
   * The first 3 are used by the Penn Treebank; # is used by the
   * BLLIP corpus, and ^ and ~ are used by Klein's lexparser.
   * Chris deleted '_' for Arabic as it appears in tags (NO_FUNC).
   * June 2006: CDM tested _ again with true (new) Treebank tags to see if it
   * was useful for densening up the tag space, but the results were negative.
   * Roger added + for Arabic but Chris deleted it again, since unless you've
   * recoded determiners, it screws up DET+NOUN, etc.  (That is, it would only be useful if
   * you always wanted to cut at the first '+', but in practice that is not viable, certainly
   * not with the IBM ATB processing either.)
   */
  private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'};

  /**
   * This is valid for "BobChrisTreeNormalizer" conventions only.
   * wsg: "ROOT" should always be the first value. See {@link #startSymbol} in
   * the parent class.
   */
  private static final String[] pennStartSymbols = {"ROOT"};


  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags
   */
  @Override
  public String[] punctuationTags() {
    return pennPunctTags;
  }


  /**
   * Returns a String array of punctuation words for this treebank/language.
   *
   * @return The punctuation words
   */
  @Override
  public String[] punctuationWords() {
    return pennPunctWords;
  }


  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  @Override
  public String[] sentenceFinalPunctuationTags() {
    return pennSFPunctTags;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  public String[] sentenceFinalPunctuationWords() {
    return pennSFPunctWords;
  }

  /**
   * Returns a String array of punctuation tags that EVALB-style evaluation
   * should ignore for this treebank/language.
   * Traditionally, EVALB has ignored a subset of the total set of
   * punctuation tags in the English Penn Treebank (quotes and
   * period, comma, colon, etc., but not brackets)
   *
   * @return Whether this is a EVALB-ignored punctuation tag
   */
  @Override
  public String[] evalBIgnoredPunctuationTags() {
    return collinsPunctTags;
  }


  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SBJ=1").  This would
   * be truncated to "NP" by the array containing '-' and "=".
   *
   * @return An array of characters that set off label name suffixes
   */
  @Override
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }


  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols
   */
  @Override
  public String[] startSymbols() {
    return pennStartSymbols;
  }

  /**
   * TODO: there is no way to change this using options.
   */
  private TokenizerFactory tf =
    ArabicTokenizer.atbFactory();

  /**
   * Return a tokenizer which might be suitable for tokenizing text
   * that will be used with this Treebank/Language pair.  We tokenize
   * the Arabic using the ArabicTokenizer class.
   *
   * @return A tokenizer
   */
  @Override
  public TokenizerFactory getTokenizerFactory() {
    return tf;
  }

  /**
   * Returns the extension of treebank files for this treebank.
   * This is "tree".
   */
  public String treebankFileExtension() {
    return "tree";
  }

  @Override
  public TreeReaderFactory treeReaderFactory() {
    return new ArabicTreeReaderFactory();
  }

  @Override
  public String toString() {
    return "ArabicTreebankLanguagePack";
  }

  /** {@inheritDoc} */
  public HeadFinder headFinder() {
    return new ArabicHeadFinder(this);
  }

  /** {@inheritDoc} */
  public HeadFinder typedDependencyHeadFinder() {
    return new ArabicHeadFinder(this);
  }

  @Override
  public MorphoFeatureSpecification morphFeatureSpec() {
    return new ArabicMorphoFeatureSpecification();
  }


  /**
   *
   * @param args
   */
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    System.out.println("Start symbol: " + tlp.startSymbol());
    String start = tlp.startSymbol();
    System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
    String[] strs = new String[]{"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3"};
    for (String str : strs) {
      System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str));
    }
  }
}