All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.trees.international.tuebadz.TueBaDZLanguagePack Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees.international.tuebadz;

import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.HeadFinder;


/**
 * Language pack for the Tuebingen Treebank of Written German (TueBa-D/Z).
 * http://www.sfs.nphil.uni-tuebingen.de/en_tuebadz.shtml
 *
 * <p>On top of the behavior inherited from {@link AbstractTreebankLanguagePack}, this pack
 * controls how grammatical-function (GF) suffixes on category labels are treated:
 * they can be stripped entirely, left in place, or stripped except for a small set of
 * functions (see {@link #setLeaveGF(boolean)} and {@link #setLimitedGF(boolean)}).
 *
 * <p>NOTE(review): this comment used to state that the treebank is in utf-8, while
 * {@link #getEncoding()} returns {@code "iso-8859-15"}. One of the two is stale; confirm
 * against the treebank release actually in use before changing either.
 *
 * @author Roger Levy
 */
public class TueBaDZLanguagePack extends AbstractTreebankLanguagePack {

  private static final long serialVersionUID = 2697418320262700673L;

  /** Grammatical functions that are kept on a label when {@link #limitedGF} is enabled. */
  private static final String[] gfToKeepArray = {"ON", "OA", "OD"};

  private static final String[] tuebadzPunctTags = {"$.","$,","$-LRB"};

  private static final String[] tuebadzSFPunctTags = {"$."};

  private static final String[] tuebadzPunctWords = { "`", "-", ",", ";", ":", "!", "?", "/", ".", "...","'", "\"", "[", "]", "*"};

  private static final String[] tuebadzSFPunctWords = {".", "!", "?"};

  /**
   * The first one is used by the TueBaDZ Treebank, and the rest are used by Klein's lexparser.
   */
  private static final char[] annotationIntroducingChars = {':', '^', '~', '%', '#', '='};

  /**
   * When {@code true}, {@link #stripGF(String)} keeps a GF suffix that is exactly one of
   * {@link #gfToKeepArray} and strips every other one.
   */
  private boolean limitedGF = false;

  /** When {@code true}, {@link #basicCategory(String)} does not strip the GF suffix. */
  private boolean leaveGF = false;


  /**
   * Makes a language pack that strips grammatical functions in
   * {@link #basicCategory(String)} and uses the default GF character.
   */
  public TueBaDZLanguagePack() {
    this(false);
  }

  /**
   * Make a new language pack with grammatical functions used based on the value of leaveGF
   *
   * @param leaveGF if {@code true}, {@link #basicCategory(String)} leaves the GF suffix in place
   */
  public TueBaDZLanguagePack(boolean leaveGF) {
    this(leaveGF, AbstractTreebankLanguagePack.DEFAULT_GF_CHAR);
  }

  /**
   * Make a new language pack with grammatical functions used based on the value of leaveGF
   * and marked with the character gfChar.  gfChar should *not* be an annotation introducing character.
   *
   * @param leaveGF if {@code true}, {@link #basicCategory(String)} leaves the GF suffix in place
   * @param gfChar  the character that separates a category from its grammatical function
   */
  public TueBaDZLanguagePack(boolean leaveGF, char gfChar) {
    this(false, leaveGF, gfChar);
  }

  /**
   * Make a new language pack with grammatical functions used based on the values of
   * useLimitedGF and leaveGF, and marked with the character gfChar.
   * gfChar should *not* be an annotation introducing character.
   *
   * @param useLimitedGF if {@code true}, {@link #stripGF(String)} keeps the functions listed in
   *                     {@link #gfToKeepArray} and strips all others
   * @param leaveGF      if {@code true}, {@link #basicCategory(String)} leaves the GF suffix in place
   * @param gfChar       the character that separates a category from its grammatical function
   */
  public TueBaDZLanguagePack(boolean useLimitedGF, boolean leaveGF, char gfChar) {
    super(gfChar);
    this.leaveGF = leaveGF;
    this.limitedGF = useLimitedGF;
  }


  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea is that treebank labels follow a syntactic category with
   * various functional and crossreferencing information introduced by
   * special characters.  Note that the array returned here does not
   * contain '-'; the grammatical-function suffix is removed separately
   * by {@link #stripGF(String)}.
   *
   * <p>The returned array is the shared internal array; callers must not modify it.
   *
   * @return An array of characters that set off label name suffixes
   */
  @Override
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }

  /**
   * Returns the punctuation tags of this treebank.
   * The returned array is the shared internal array; callers must not modify it.
   */
  @Override
  public String[] punctuationTags() {
    return tuebadzPunctTags;
  }

  /**
   * Returns the punctuation word forms of this treebank.
   * The returned array is the shared internal array; callers must not modify it.
   */
  @Override
  public String[] punctuationWords() {
    return tuebadzPunctWords;
  }

  /**
   * Returns the sentence-final punctuation tags of this treebank.
   * The returned array is the shared internal array; callers must not modify it.
   */
  @Override
  public String[] sentenceFinalPunctuationTags() {
    return tuebadzSFPunctTags;
  }

  /** Returns a fresh one-element array holding the start symbol {@code "TOP"}. */
  @Override
  public String[] startSymbols() {
    return new String[] {"TOP"};
  }

  /**
   * Returns the sentence-final punctuation word forms of this treebank.
   * The returned array is the shared internal array; callers must not modify it.
   */
  @Override
  public String[] sentenceFinalPunctuationWords() {
    return tuebadzSFPunctWords;
  }

  /** Returns the file extension of treebank files, {@code ".penn"}. */
  @Override
  public String treebankFileExtension() {
    return ".penn";
  }

  /**
   * Returns the basic category as computed by the superclass, additionally passed through
   * {@link #stripGF(String)} unless this pack was configured to leave grammatical functions.
   *
   * @param category the full category label
   * @return the basic category, with or without GF suffix depending on {@link #isLeaveGF()}
   */
  @Override
  public String basicCategory(String category) {
    String basicCat = super.basicCategory(category);
    return leaveGF ? basicCat : stripGF(basicCat);
  }

  /**
   * Removes the grammatical-function suffix, i.e. everything from the last occurrence of the
   * GF character onwards.  When limited GF mode is on, a suffix that is one of the kept
   * functions is left in place.
   *
   * @param category the label to strip; may be {@code null}
   * @return the stripped label, the unchanged label if nothing is to be stripped,
   *         or {@code null} if {@code category} is {@code null}
   */
  @Override
  public String stripGF(String category) {
    if (category == null) {
      return null;
    }
    int index = category.lastIndexOf(gfCharacter);
    // index > 0 rather than >= 0: a label that begins with the GF character would be
    // reduced to the empty string, so such labels are returned unchanged.
    if (index > 0 && !(limitedGF && containsKeptGF(category, index))) {
      return category.substring(0, index);
    }
    return category;
  }

  /**
   * Helper method for determining if the gf in category
   * is one of those in the array gfToKeepArray.  Index is the
   * index where the gfCharacter appears.  The whole text after the
   * gfCharacter must equal a kept function; a mere prefix match does not count.
   *
   * @param category the label being examined
   * @param index    position of the gfCharacter within {@code category}
   * @return {@code true} if the text following {@code index} is a kept grammatical function
   */
  private static boolean containsKeptGF(String category, int index) {
    String gf = category.substring(index + 1);
    for (String kept : gfToKeepArray) {
      if (kept.equals(gf)) {
        return true;
      }
    }
    return false;
  }

  /** Returns whether {@link #basicCategory(String)} leaves the GF suffix in place. */
  public boolean isLeaveGF() {
    return leaveGF;
  }

  /** Sets whether {@link #basicCategory(String)} leaves the GF suffix in place. */
  public void setLeaveGF(boolean leaveGF) {
    this.leaveGF = leaveGF;
  }

  /** Returns whether {@link #stripGF(String)} keeps the functions in {@link #gfToKeepArray}. */
  public boolean isLimitedGF() {
    return limitedGF;
  }

  /** Sets whether {@link #stripGF(String)} keeps the functions in {@link #gfToKeepArray}. */
  public void setLimitedGF(boolean limitedGF) {
    this.limitedGF = limitedGF;
  }


  /**
   * Return the input Charset encoding for the Treebank.
   * See documentation for the Charset class.
   *
   * <p>NOTE(review): the class comment historically claimed the treebank is utf-8, which
   * conflicts with the value returned here; confirm which is correct.
   *
   * @return Name of Charset
   */
  @Override
  public String getEncoding() {
    return "iso-8859-15";
  }

  /** Returns a tree reader factory bound to this language pack. */
  @Override
  public TreeReaderFactory treeReaderFactory() {
    return new TueBaDZTreeReaderFactory(this);
  }

  /** {@inheritDoc} */
  @Override
  public HeadFinder headFinder() {
    return new TueBaDZHeadFinder();
  }

  /** {@inheritDoc} */
  @Override
  public HeadFinder typedDependencyHeadFinder() {
    return new TueBaDZHeadFinder();
  }


  /** Prints a few aspects of the TreebankLanguagePack, just for debugging.
   */
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new TueBaDZLanguagePack();
    String start = tlp.startSymbol();
    System.out.println("Start symbol: " + start);
    System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
    String[] strs = new String[]{"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3", "CARD-HD"};
    for (String str : strs) {
      System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str));
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy