// Source listing: edu.stanford.nlp.trees.AbstractTreebankLanguagePack (Maven / Gradle / Ivy artifact page).
// Artifact: stanford-parser ("Show all versions" and "Show documentation" were page navigation links).
package edu.stanford.nlp.trees;
import java.io.Serializable;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Filters;
import java.util.function.Function;
/**
* This provides an implementation of parts of the TreebankLanguagePack
* API to reduce the load on fresh implementations. Only the abstract
* methods below need to be implemented to give a reasonable solution for
* a new language.
*
* @author Christopher Manning
* @version 1.1
*/
public abstract class AbstractTreebankLanguagePack implements TreebankLanguagePack {
/**
 * Serialization version identifier, pinned explicitly so that changed
 * versions of this class still deserialize correctly.
 */
private static final long serialVersionUID = -6506749780512708352L;
// Grammatical function parameters
/**
 * Character indicating that what follows in a label is a grammatical
 * function. It is assigned by the constructors and by
 * {@link #setGfCharacter(char)}, and it is the character that
 * {@link #stripGF(String)} searches for.
 */
protected char gfCharacter;
/**
 * Default grammatical function character, used by the no-argument
 * constructor; it probably should be overridden by language specific ones.
 */
protected static final char DEFAULT_GF_CHAR = '-';
/**
 * Use this as the default encoding for Readers and Writers of
 * Treebank data. It is also the value returned by {@link #getEncoding()}
 * in this class.
 */
public static final String DEFAULT_ENCODING = "UTF-8";
/**
 * For languages where a Universal Dependency converter
 * exists this variable determines whether the original
 * or the Universal converter will be used.
 * This class only stores the flag: it is written by
 * {@link #setGenerateOriginalDependencies(boolean)} and read by
 * {@link #generateOriginalDependencies()}. No initial value is assigned
 * here, so it starts out as {@code false}.
 * NOTE(review): presumably {@code true} selects the original converter —
 * confirm against the code that consumes the flag.
 */
protected boolean generateOriginalDependencies;
/**
 * Gives a handle to the TreebankLanguagePack, using
 * {@link #DEFAULT_GF_CHAR} as the grammatical function character.
 */
public AbstractTreebankLanguagePack() {
this(DEFAULT_GF_CHAR);
}
/**
 * Gives a handle to the TreebankLanguagePack.
 *
 * @param gfChar The character that sets off grammatical functions in node labels.
 */
public AbstractTreebankLanguagePack(char gfChar) {
this.gfCharacter = gfChar;
}
/**
 * Returns a String array of punctuation tags for this treebank/language.
 * <p>
 * Note: this class calls this method from a field initializer (to build
 * its punctuation tag filter), i.e. while the instance is still being
 * constructed, so implementations should not depend on instance state
 * set up by the subclass.
 *
 * @return The punctuation tags
 */
@Override
public abstract String[] punctuationTags();
/**
 * Returns a String array of punctuation words for this treebank/language.
 * <p>
 * Note: this class calls this method from a field initializer (to build
 * its punctuation word filter), i.e. while the instance is still being
 * constructed, so implementations should not depend on instance state
 * set up by the subclass.
 *
 * @return The punctuation words
 */
@Override
public abstract String[] punctuationWords();
/**
 * Returns a String array of sentence final punctuation tags for this
 * treebank/language.
 * <p>
 * Note: this class calls this method from a field initializer (to build
 * its sentence final punctuation tag filter), i.e. while the instance is
 * still being constructed, so implementations should not depend on
 * instance state set up by the subclass.
 *
 * @return The sentence final punctuation tags
 */
@Override
public abstract String[] sentenceFinalPunctuationTags();
/**
 * Returns a String array of punctuation tags that EVALB-style evaluation
 * should ignore for this treebank/language.
 * Traditionally, EVALB has ignored a subset of the total set of
 * punctuation tags in the English Penn Treebank (quotes and
 * period, comma, colon, etc., but not brackets)
 * <p>
 * This default implementation does not compute a subset: it returns
 * whatever {@link #punctuationTags()} returns.
 *
 * @return The punctuation tags that EVALB-style evaluation should ignore
 */
@Override
public String[] evalBIgnoredPunctuationTags() {
return punctuationTags();
}
/**
 * Accepts a String that is a punctuation
 * tag name, and rejects everything else.
 * The decision is delegated to the filter built from
 * {@link #punctuationTags()}.
 *
 * @param str The String to check
 * @return Whether this is a punctuation tag
 */
@Override
public boolean isPunctuationTag(String str) {
return punctTagStringAcceptFilter.test(str);
}
/**
 * Accepts a String that is a punctuation
 * word, and rejects everything else.
 * If one can't tell for sure (as for ' in the Penn Treebank), it
 * makes the best guess that it can.
 * The decision is delegated to the filter built from
 * {@link #punctuationWords()}.
 *
 * @param str The String to check
 * @return Whether this is a punctuation word
 */
@Override
public boolean isPunctuationWord(String str) {
return punctWordStringAcceptFilter.test(str);
}
/**
 * Accepts a String that is a sentence end
 * punctuation tag, and rejects everything else.
 * The decision is delegated to the filter built from
 * {@link #sentenceFinalPunctuationTags()}.
 *
 * @param str The String to check
 * @return Whether this is a sentence final punctuation tag
 */
@Override
public boolean isSentenceFinalPunctuationTag(String str) {
return sFPunctTagStringAcceptFilter.test(str);
}
/**
 * Accepts a String that is a punctuation
 * tag that should be ignored by EVALB-style evaluation,
 * and rejects everything else.
 * Traditionally, EVALB has ignored a subset of the total set of
 * punctuation tags in the English Penn Treebank (quotes and
 * period, comma, colon, etc., but not brackets)
 * The decision is delegated to the filter built from
 * {@link #evalBIgnoredPunctuationTags()}.
 *
 * @param str The String to check
 * @return Whether this is a EVALB-ignored punctuation tag
 */
@Override
public boolean isEvalBIgnoredPunctuationTag(String str) {
return eIPunctTagStringAcceptFilter.test(str);
}
/**
 * Return a filter that accepts a String that is a punctuation
 * tag name, and rejects everything else.
 * The same filter instance, built from {@link #punctuationTags()} when this
 * object was created, is returned on every call.
 *
 * @return The filter
 */
@Override
public Predicate<String> punctuationTagAcceptFilter() {
  return punctTagStringAcceptFilter;
}
/**
 * Return a filter that rejects a String that is a punctuation
 * tag name, and accepts everything else.
 * The filter is the negation (via {@code Filters.notFilter}) of the
 * punctuation tag accept filter; a new negating filter is created on
 * each call.
 *
 * @return The filter
 */
@Override
public Predicate<String> punctuationTagRejectFilter() {
  return Filters.notFilter(punctTagStringAcceptFilter);
}
/**
 * Returns a filter that accepts a String that is a punctuation
 * word, and rejects everything else.
 * If one can't tell for sure (as for ' in the Penn Treebank), it
 * makes the best guess that it can.
 * The same filter instance, built from {@link #punctuationWords()} when
 * this object was created, is returned on every call.
 *
 * @return The Filter
 */
@Override
public Predicate<String> punctuationWordAcceptFilter() {
  return punctWordStringAcceptFilter;
}
/**
 * Returns a filter that accepts a String that is not a punctuation
 * word, and rejects punctuation.
 * If one can't tell for sure (as for ' in the Penn Treebank), it
 * makes the best guess that it can.
 * The filter is the negation (via {@code Filters.notFilter}) of the
 * punctuation word accept filter; a new negating filter is created on
 * each call.
 *
 * @return The Filter
 */
@Override
public Predicate<String> punctuationWordRejectFilter() {
  return Filters.notFilter(punctWordStringAcceptFilter);
}
/**
 * Returns a filter that accepts a String that is a sentence end
 * punctuation tag, and rejects everything else.
 * The same filter instance, built from
 * {@link #sentenceFinalPunctuationTags()} when this object was created,
 * is returned on every call.
 *
 * @return The Filter
 */
@Override
public Predicate<String> sentenceFinalPunctuationTagAcceptFilter() {
  return sFPunctTagStringAcceptFilter;
}
/**
 * Returns a filter that accepts a String that is a punctuation
 * tag that should be ignored by EVALB-style evaluation,
 * and rejects everything else.
 * Traditionally, EVALB has ignored a subset of the total set of
 * punctuation tags in the English Penn Treebank (quotes and
 * period, comma, colon, etc., but not brackets)
 * The same filter instance, built from
 * {@link #evalBIgnoredPunctuationTags()} when this object was created,
 * is returned on every call.
 *
 * @return The Filter
 */
@Override
public Predicate<String> evalBIgnoredPunctuationTagAcceptFilter() {
  return eIPunctTagStringAcceptFilter;
}
/**
 * Returns a filter that accepts everything except a String that is a
 * punctuation tag that should be ignored by EVALB-style evaluation.
 * Traditionally, EVALB has ignored a subset of the total set of
 * punctuation tags in the English Penn Treebank (quotes and
 * period, comma, colon, etc., but not brackets)
 * The filter is the negation (via {@code Filters.notFilter}) of the
 * EVALB-ignored tag accept filter; a new negating filter is created on
 * each call.
 *
 * @return The Filter
 */
@Override
public Predicate<String> evalBIgnoredPunctuationTagRejectFilter() {
  return Filters.notFilter(eIPunctTagStringAcceptFilter);
}
/**
 * Return the input Charset encoding for the Treebank.
 * See documentation for the {@code Charset} class.
 *
 * @return Name of Charset; this implementation returns
 *         {@link #DEFAULT_ENCODING}
 */
@Override
public String getEncoding() {
return DEFAULT_ENCODING;
}
/** Shared empty array returned by {@link #labelAnnotationIntroducingCharacters()}. */
private static final char[] EMPTY_CHAR_ARRAY = new char[0];
/**
 * Return an array of characters at which a String should be
 * truncated to give the basic syntactic category of a label.
 * The idea here is that Penn treebank style labels follow a syntactic
 * category with various functional and crossreferencing information
 * introduced by special characters (such as "NP-SBJ=1"). This would
 * be truncated to "NP" by the array containing '-' and '='.
 * <p>
 * This base implementation returns an empty array, so
 * {@link #isLabelAnnotationIntroducingCharacter(char)} matches no
 * character unless a subclass overrides this method.
 *
 * @return An array of characters that set off label name suffixes
 */
@Override
public char[] labelAnnotationIntroducingCharacters() {
return EMPTY_CHAR_ARRAY;
}
/**
 * Finds where the basic label ends inside a category String, i.e. the
 * index of the first character that no longer belongs to the basic label.
 * For "NP-LGS" (with '-' as an annotation introducing character) the
 * result is 2.
 * <ul>
 * <li>The category must not be null.</li>
 * <li>The result is 0 iff the String is empty.</li>
 * <li>The result is never greater than {@code category.length()}, so it
 *     can be passed straight to {@code category.substring(0, ...)}.</li>
 * </ul>
 *
 * Special handling of a leading annotation introducing character: the
 * character at index 0 is never treated as the cut point, because in the
 * Penn Treebank "-" is both a valid tag and the character that sets off
 * functional and co-indexing annotations. When the label starts with such
 * a character, one later occurrence of the same character is also skipped
 * (for -LRB- etc.), but only if at least one other character lies in
 * between (so --PU becomes -).
 *
 * @param category Phrasal category
 * @return The index of the first character that is after the basic label
 */
private int postBasicCategoryIndex(String category) {
  final int length = category.length();
  // True while a leading introducer has been seen and its matching twin has not yet been consumed.
  boolean awaitingTwin = false;
  char leadingIntroducer = '\u0000';
  int pos = 0;
  while (pos < length) {
    final char current = category.charAt(pos);
    if (isLabelAnnotationIntroducingCharacter(current)) {
      if (pos == 0) {
        // A leading introducer is part of the label itself; remember it.
        awaitingTwin = true;
        leadingIntroducer = current;
      } else if (awaitingTwin && pos > 1 && current == leadingIntroducer) {
        // Closing twin of the leading introducer (e.g. the second '-' of -LRB-); skip it exactly once.
        awaitingTwin = false;
      } else {
        // Any other introducer marks the end of the basic label.
        break;
      }
    }
    pos++;
  }
  return pos;
}
/**
 * Returns the basic syntactic category of a String.
 * This implementation essentially cuts the String off at the first
 * occurrence of one of the
 * {@code labelAnnotationIntroducingCharacters()}.
 * Two special cases keep such characters when they are part of the
 * category label itself:
 * (i) if the first char is in this set, it's never truncated
 * (e.g., '-' or '=' as a token), and (ii) if the String starts with
 * one of this set, a second instance of the same item from this set is
 * also excluded (to deal with '-LRB-', '-RCB-', etc.).
 *
 * @param category The whole String name of the label
 * @return The basic category of the String, or null if category is null
 */
@Override
public String basicCategory(String category) {
  return (category == null)
      ? null
      : category.substring(0, postBasicCategoryIndex(category));
}
/**
 * Strips a trailing grammatical function from a category String.
 * Everything from the last occurrence of the grammatical function
 * character ({@code gfCharacter}) onwards is removed. The String is
 * returned unchanged when that character does not occur or occurs only
 * at index 0.
 *
 * @param category The whole String name of the label
 * @return The category without its last grammatical function part, or
 *         null if category is null
 */
@Override
public String stripGF(String category) {
  if (category == null) {
    return null;
  }
  final int cut = category.lastIndexOf(gfCharacter);
  return (cut > 0) ? category.substring(0, cut) : category;
}
/**
 * Returns a {@link Function Function} object that maps Strings to Strings according
 * to this TreebankLanguagePack's basicCategory() method.
 * A new function object wrapping this language pack is created on each call.
 *
 * @return The String->String Function object
 */
@Override
public Function<String, String> getBasicCategoryFunction() {
  return new BasicCategoryStringFunction(this);
}
/**
 * Serializable String-to-String function that maps a label to its basic
 * category by delegating to {@link TreebankLanguagePack#basicCategory(String)}.
 */
private static class BasicCategoryStringFunction implements Function<String, String>, Serializable {

  private static final long serialVersionUID = 1L;

  /** The language pack whose basicCategory() method is applied. */
  private final TreebankLanguagePack tlp;

  /**
   * @param tlp The language pack to delegate to
   */
  BasicCategoryStringFunction(TreebankLanguagePack tlp) {
    this.tlp = tlp;
  }

  /**
   * @param in The whole String name of a label
   * @return The result of {@code tlp.basicCategory(in)}
   */
  @Override
  public String apply(String in) {
    return tlp.basicCategory(in);
  }
}
/**
 * Serializable String-to-String function that maps a label to its
 * category and function by delegating to
 * {@link TreebankLanguagePack#categoryAndFunction(String)}.
 */
private static class CategoryAndFunctionStringFunction implements Function<String, String>, Serializable {

  private static final long serialVersionUID = 1L;

  /** The language pack whose categoryAndFunction() method is applied. */
  private final TreebankLanguagePack tlp;

  /**
   * @param tlp The language pack to delegate to
   */
  CategoryAndFunctionStringFunction(TreebankLanguagePack tlp) {
    this.tlp = tlp;
  }

  /**
   * @param in The whole String name of a label
   * @return The result of {@code tlp.categoryAndFunction(in)}
   */
  @Override
  public String apply(String in) {
    return tlp.categoryAndFunction(in);
  }
}
/**
 * Returns the syntactic category and 'function' of a String.
 * This normally involves truncating numerical coindexation
 * showing coreference, etc. By 'function', this means
 * keeping, say, Penn Treebank functional tags or ICE phrasal functions,
 * perhaps returning them as {@code category-function}.
 *
 * This implementation repeatedly removes a trailing numeric tag (a label
 * annotation introducing character followed only by digits) until none
 * is left, on the assumption that non-numeric things are functional tags.
 *
 * @param category The whole String name of the label
 * @return A String giving the category and function, or null if
 *         category is null
 */
@Override
public String categoryAndFunction(String category) {
  if (category == null) {
    return null;
  }
  String remaining = category;
  // Peel numeric tags off the end one at a time; stripping one may expose another.
  for (int cut = lastIndexOfNumericTag(remaining); cut >= 0; cut = lastIndexOfNumericTag(remaining)) {
    remaining = remaining.substring(0, cut);
  }
  return remaining;
}
/**
 * Locates a numeric tag at the end of a category String: a position
 * holding a label annotation introducing character (see
 * {@link #isLabelAnnotationIntroducingCharacter(char)}) that is followed,
 * up to the end of the String, by one or more digits and nothing else.
 * Example: with '-' as an introducing character,
 * {@code lastIndexOfNumericTag("NP-TMP-1")} returns {@code 6}.
 * <p>
 * The String is scanned from right to left and every qualifying position
 * overwrites the previous one, so if several positions qualify the
 * smallest such index is returned.
 *
 * @param category A String category
 * @return The index of the introducing character of the trailing numeric
 *         tag, or -1 if category is null or has no such tag
 */
private int lastIndexOfNumericTag(String category) {
  if (category == null) {
    return -1;
  }
  final int length = category.length();
  int found = -1;
  for (int pos = length - 1; pos >= 0; pos--) {
    if (!isLabelAnnotationIntroducingCharacter(category.charAt(pos))) {
      continue;
    }
    // A numeric tag needs at least one character after the introducer ...
    boolean digitsOnly = pos + 1 < length;
    // ... and every one of those characters must be a digit.
    for (int k = pos + 1; digitsOnly && k < length; k++) {
      digitsOnly = Character.isDigit(category.charAt(k));
    }
    if (digitsOnly) {
      found = pos;
    }
  }
  return found;
}
/**
 * Returns a {@link Function Function} object that maps Strings to Strings according
 * to this TreebankLanguagePack's categoryAndFunction() method.
 * A new function object wrapping this language pack is created on each call.
 *
 * @return The String->String Function object
 */
@Override
public Function<String, String> getCategoryAndFunctionFunction() {
  return new CategoryAndFunctionStringFunction(this);
}
/**
 * Tells whether the given character is one of the characters returned by
 * {@link #labelAnnotationIntroducingCharacters()}, i.e. a character that
 * introduces annotation in a label.
 *
 * @param ch The character to check
 * @return Whether it is an annotation introducing character
 */
@Override
public boolean isLabelAnnotationIntroducingCharacter(char ch) {
  final char[] introducers = labelAnnotationIntroducingCharacters();
  for (int k = 0; k < introducers.length; k++) {
    if (introducers[k] == ch) {
      return true;
    }
  }
  return false;
}
/**
 * Accepts a String that is a start symbol of the treebank.
 * The decision is delegated to the filter built from
 * {@link #startSymbols()}.
 *
 * @param str The String to check
 * @return Whether this is a start symbol
 */
@Override
public boolean isStartSymbol(String str) {
return startSymbolAcceptFilter.test(str);
}
/**
 * Return a filter that accepts a String that is a start symbol
 * of the treebank, and rejects everything else.
 * The same filter instance, built from {@link #startSymbols()} when this
 * object was created, is returned on every call.
 *
 * @return The filter
 */
@Override
public Predicate<String> startSymbolAcceptFilter() {
  return startSymbolAcceptFilter;
}
/**
 * Returns a String array of treebank start symbols.
 * <p>
 * Note: this class calls this method from a field initializer (to build
 * its start symbol filter), i.e. while the instance is still being
 * constructed, so implementations should not depend on instance state
 * set up by the subclass.
 *
 * @return The start symbols
 */
@Override
public abstract String[] startSymbols();
/**
 * Returns the first (perhaps unique) start symbol of the treebank, i.e.
 * element 0 of {@link #startSymbols()}.
 *
 * @return The start symbol, or null if {@code startSymbols()} returns
 *         null or an empty array
 */
@Override
public String startSymbol() {
  final String[] symbols = startSymbols();
  final boolean noneDefined = (symbols == null) || (symbols.length == 0);
  return noneDefined ? null : symbols[0];
}
// Accept filters over the tag/word/symbol arrays supplied by the subclass.
// NOTE: these initializers call overridable methods (punctuationTags(), punctuationWords(),
// sentenceFinalPunctuationTags(), evalBIgnoredPunctuationTags(), startSymbols()) while this
// object is still being constructed, i.e. before any subclass field initializer or
// constructor body has run. Each filter is built once and reused for the object's lifetime.
private final Predicate<String> punctTagStringAcceptFilter =
    Filters.collectionAcceptFilter(punctuationTags());
private final Predicate<String> punctWordStringAcceptFilter =
    Filters.collectionAcceptFilter(punctuationWords());
private final Predicate<String> sFPunctTagStringAcceptFilter =
    Filters.collectionAcceptFilter(sentenceFinalPunctuationTags());
private final Predicate<String> eIPunctTagStringAcceptFilter =
    Filters.collectionAcceptFilter(evalBIgnoredPunctuationTags());
private final Predicate<String> startSymbolAcceptFilter =
    Filters.collectionAcceptFilter(startSymbols());
/**
* Return a tokenizer which might be suitable for tokenizing text that
* will be used with this Treebank/Language pair, without tokenizing carriage returns (i.e., treating them as white space). The implementation in AbstractTreebankLanguagePack
* returns a factory for {@link WhitespaceTokenizer}.
*
* @return A tokenizer
*/
@Override
public TokenizerFactory extends HasWord> getTokenizerFactory() {
return WhitespaceTokenizer.factory(false);
}
/**
 * Return a GrammaticalStructureFactory suitable for this language/treebank.
 * (To be overridden in subclasses.)
 * This base implementation has no factory to offer and always throws.
 *
 * @return A GrammaticalStructureFactory suitable for this language/treebank
 * @throws UnsupportedOperationException always, naming the runtime class
 *         of this language pack
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory() {
  final String languagePackName = getClass().getName();
  throw new UnsupportedOperationException(
      "No GrammaticalStructureFactory (typed dependencies) available for language/treebank "
          + languagePackName);
}
/**
 * Return a GrammaticalStructureFactory suitable for this language/treebank.
 * (To be overridden in subclasses.)
 * This base implementation ignores {@code puncFilt} and simply delegates
 * to {@link #grammaticalStructureFactory()}.
 *
 * @param puncFilt A filter over Strings (presumably selecting punctuation);
 *        unused in this implementation
 * @return A GrammaticalStructureFactory suitable for this language/treebank
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory(Predicate<String> puncFilt) {
  return grammaticalStructureFactory();
}
/**
 * Return a GrammaticalStructureFactory suitable for this language/treebank.
 * (To be overridden in subclasses.)
 * This base implementation ignores both arguments and simply delegates
 * to {@link #grammaticalStructureFactory()}.
 *
 * @param puncFilt A filter over Strings (presumably selecting punctuation);
 *        unused in this implementation
 * @param typedDependencyHeadFinder A head finder; unused in this implementation
 * @return A GrammaticalStructureFactory suitable for this language/treebank
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory(Predicate<String> puncFilt, HeadFinder typedDependencyHeadFinder) {
  return grammaticalStructureFactory();
}
/**
 * Says whether this language pack supports grammatical structures.
 *
 * @return Always false in this base implementation
 */
@Override
public boolean supportsGrammaticalStructures() {
return false;
}
/**
 * Returns the character currently used to mark grammatical functions.
 *
 * @return The grammatical function character
 */
public char getGfCharacter() {
return gfCharacter;
}
/**
 * Sets the character used to mark grammatical functions; this is the
 * character that {@link #stripGF(String)} searches for.
 *
 * @param gfCharacter The new grammatical function character
 */
@Override
public void setGfCharacter(char gfCharacter) {
this.gfCharacter = gfCharacter;
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation returns a new {@code PennTreeReaderFactory} on
 * each call.
 */
@Override
public TreeReaderFactory treeReaderFactory() {
return new PennTreeReaderFactory();
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation wraps the factory obtained from
 * {@link #treeReaderFactory()} in a new {@code TreeTokenizerFactory}.
 */
@Override
public TokenizerFactory treeTokenizerFactory() {
// NOTE(review): the return type is a raw TokenizerFactory here; it presumably
// carries a type argument in the interface — confirm against TreebankLanguagePack.
return new TreeTokenizerFactory(treeReaderFactory());
}
/**
 * Returns a morphological feature specification for words in this language.
 *
 * @return Always null in this base implementation; callers must be
 *         prepared for a null result unless a subclass overrides this
 */
@Override
public MorphoFeatureSpecification morphFeatureSpec() {
return null;
}
/**
 * Stores the flag that, for languages where a Universal Dependency
 * converter exists, determines whether the original or the Universal
 * converter will be used.
 *
 * @param generateOriginalDependencies The new value of the flag
 */
@Override
public void setGenerateOriginalDependencies(boolean generateOriginalDependencies) {
this.generateOriginalDependencies = generateOriginalDependencies;
}
/**
 * Returns the flag that, for languages where a Universal Dependency
 * converter exists, determines whether the original or the Universal
 * converter will be used.
 *
 * @return The current value of the flag
 */
@Override
public boolean generateOriginalDependencies() {
return this.generateOriginalDependencies;
}
}