edu.stanford.nlp.trees.international.tuebadz.TueBaDZLanguagePack Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.trees.international.tuebadz;
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.HeadFinder;
/** Language pack for the Tuebingen Treebank of Written German (TueBa-D/Z).
* http://www.sfs.nphil.uni-tuebingen.de/en_tuebadz.shtml
* This treebank is in utf-8.
*
* @author Roger Levy ([email protected])
*/
public class TueBaDZLanguagePack extends AbstractTreebankLanguagePack {

  private static final long serialVersionUID = 2697418320262700673L;

  /** Grammatical functions that are kept on a category when limited-GF mode is enabled. */
  private static final String[] GF_TO_KEEP = {"ON", "OA", "OD"};

  private static final String[] PUNCT_TAGS = {"$.", "$,", "$-LRB"};

  private static final String[] SENTENCE_FINAL_PUNCT_TAGS = {"$."};

  private static final String[] PUNCT_WORDS =
      {"`", "-", ",", ";", ":", "!", "?", "/", ".", "...", "'", "\"", "[", "]", "*"};

  private static final String[] SENTENCE_FINAL_PUNCT_WORDS = {".", "!", "?"};

  /**
   * The first one is used by the TueBaDZ Treebank, and the rest are used by Klein's lexparser.
   */
  private static final char[] ANNOTATION_INTRODUCING_CHARS = {':', '^', '~', '%', '#', '='};

  /** When true, {@link #stripGF(String)} keeps the functions listed in {@code GF_TO_KEEP}. */
  private boolean limitedGF;

  /** When true, {@link #basicCategory(String)} does not strip grammatical functions. */
  private boolean leaveGF;

  /**
   * Makes a language pack with {@code leaveGF} and {@code limitedGF} both off and the
   * default grammatical function character.
   */
  public TueBaDZLanguagePack() {
    this(false);
  }

  /**
   * Makes a language pack that uses the default grammatical function character.
   *
   * @param leaveGF whether {@link #basicCategory(String)} leaves grammatical functions
   *                on the category
   */
  public TueBaDZLanguagePack(boolean leaveGF) {
    this(leaveGF, AbstractTreebankLanguagePack.DEFAULT_GF_CHAR);
  }

  /**
   * Makes a language pack with limited-GF mode off.
   *
   * @param leaveGF whether {@link #basicCategory(String)} leaves grammatical functions
   *                on the category
   * @param gfChar  the character that marks a grammatical function; it should *not* be
   *                an annotation introducing character
   */
  public TueBaDZLanguagePack(boolean leaveGF, char gfChar) {
    this(false, leaveGF, gfChar);
  }

  /**
   * Makes a language pack with every option given explicitly.
   *
   * @param useLimitedGF whether {@link #stripGF(String)} keeps the grammatical functions
   *                     ON, OA and OD instead of stripping every function
   * @param leaveGF      whether {@link #basicCategory(String)} leaves grammatical
   *                     functions on the category
   * @param gfChar       the character that marks a grammatical function; it should *not*
   *                     be an annotation introducing character
   */
  public TueBaDZLanguagePack(boolean useLimitedGF, boolean leaveGF, char gfChar) {
    super(gfChar);
    this.leaveGF = leaveGF;
    this.limitedGF = useLimitedGF;
  }

  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * For this language pack the characters are ':', '^', '~', '%', '#' and '='.
   * The returned array is the shared internal array, not a copy, so callers
   * must not modify it.
   *
   * @return An array of characters that set off label name suffixes
   */
  @Override
  public char[] labelAnnotationIntroducingCharacters() {
    return ANNOTATION_INTRODUCING_CHARS;
  }

  /**
   * Returns the tags this pack lists as punctuation tags.
   * The returned array is the shared internal array, not a copy.
   *
   * @return the punctuation tags
   */
  @Override
  public String[] punctuationTags() {
    return PUNCT_TAGS;
  }

  /**
   * Returns the word forms this pack lists as punctuation.
   * The returned array is the shared internal array, not a copy.
   *
   * @return the punctuation words
   */
  @Override
  public String[] punctuationWords() {
    return PUNCT_WORDS;
  }

  /**
   * Returns the tags this pack lists as sentence-final punctuation tags.
   * The returned array is the shared internal array, not a copy.
   *
   * @return the sentence-final punctuation tags
   */
  @Override
  public String[] sentenceFinalPunctuationTags() {
    return SENTENCE_FINAL_PUNCT_TAGS;
  }

  /**
   * Returns the start symbols of this treebank; a fresh array is built on every call.
   *
   * @return a one-element array containing "TOP"
   */
  @Override
  public String[] startSymbols() {
    return new String[] {"TOP"};
  }

  /**
   * Returns the word forms this pack lists as sentence-final punctuation.
   * The returned array is the shared internal array, not a copy.
   *
   * @return the sentence-final punctuation words
   */
  public String[] sentenceFinalPunctuationWords() {
    return SENTENCE_FINAL_PUNCT_WORDS;
  }

  /**
   * Returns the file extension of the treebank files.
   *
   * @return the string ".penn"
   */
  public String treebankFileExtension() {
    return ".penn";
  }

  /**
   * Returns the basic category as computed by the superclass and, unless
   * {@code leaveGF} is set, with the grammatical function removed by
   * {@link #stripGF(String)}.
   *
   * @param category the full category label
   * @return the basic category
   */
  @Override
  public String basicCategory(String category) {
    String basicCat = super.basicCategory(category);
    return leaveGF ? basicCat : stripGF(basicCat);
  }

  /**
   * Cuts the grammatical function off a category. The function is taken to start at
   * the last occurrence of the grammatical function character; an occurrence at
   * position 0 is not treated as a function marker, so the category is returned whole.
   * In limited-GF mode the category is also returned whole when its function is one
   * of ON, OA or OD.
   *
   * @param category the category to strip; may be null
   * @return the category without its grammatical function, the unchanged category
   *         when nothing is stripped, or null if {@code category} is null
   */
  @Override
  public String stripGF(String category) {
    if (category == null) {
      return null;
    }
    int index = category.lastIndexOf(gfCharacter);
    if (index <= 0) {
      // No marker, or the marker is the first character: nothing to strip.
      return category;
    }
    if (limitedGF && containsKeptGF(category, index)) {
      return category;
    }
    return category.substring(0, index);
  }

  /**
   * Helper method for determining if the gf in category
   * is one of those in the array of kept grammatical functions. Index is the
   * index where the gfCharacter appears; everything after it is taken as the gf.
   */
  private static boolean containsKeptGF(String category, int index) {
    String gf = category.substring(index + 1);
    for (String kept : GF_TO_KEEP) {
      if (kept.equals(gf)) {
        return true;
      }
    }
    return false;
  }

  /**
   * @return whether {@link #basicCategory(String)} leaves grammatical functions in place
   */
  public boolean isLeaveGF() {
    return leaveGF;
  }

  /**
   * @param leaveGF whether {@link #basicCategory(String)} should leave grammatical
   *                functions in place
   */
  public void setLeaveGF(boolean leaveGF) {
    this.leaveGF = leaveGF;
  }

  /**
   * @return whether {@link #stripGF(String)} keeps the functions ON, OA and OD
   */
  public boolean isLimitedGF() {
    return limitedGF;
  }

  /**
   * @param limitedGF whether {@link #stripGF(String)} should keep the functions
   *                  ON, OA and OD
   */
  public void setLimitedGF(boolean limitedGF) {
    this.limitedGF = limitedGF;
  }

  /**
   * Return the input Charset encoding for the Treebank.
   * See documentation for the {@code Charset} class.
   * NOTE(review): the class comment says the treebank is in utf-8, but this
   * method returns "iso-8859-15" - confirm which one is correct.
   *
   * @return Name of Charset
   */
  @Override
  public String getEncoding() {
    return "iso-8859-15";
  }

  /**
   * Creates a tree reader factory configured with this language pack.
   *
   * @return a new {@code TueBaDZTreeReaderFactory} built from this pack
   */
  @Override
  public TreeReaderFactory treeReaderFactory() {
    return new TueBaDZTreeReaderFactory(this);
  }

  /** {@inheritDoc} */
  public HeadFinder headFinder() {
    return new TueBaDZHeadFinder();
  }

  /** {@inheritDoc} */
  public HeadFinder typedDependencyHeadFinder() {
    return new TueBaDZHeadFinder();
  }

  /** Prints a few aspects of the TreebankLanguagePack, just for debugging.
   */
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new TueBaDZLanguagePack();
    String start = tlp.startSymbol();
    System.out.println("Start symbol: " + start);
    System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
    String[] strs = {"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3", "CARD-HD"};
    for (String str : strs) {
      System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str));
    }
  }

}