// Source listing of edu.stanford.nlp.parser.lexparser.TregexPoweredTreebankParserParams
// from the stanford-parser artifact (available via Maven / Gradle / Ivy).
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
// and extracts constituency parse trees.
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexParseException;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.TregexPatternCompiler;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
/**
* An extension of
* {@link edu.stanford.nlp.parser.lexparser.AbstractTreebankParserParams}
* which provides support for Tregex-powered annotations.
*
* Subclasses of this class provide collections of features
* which are associated with annotation behaviors that seek out
* and label matching trees in some way. For example, a coord
* feature might have an annotation behavior which searches for
* coordinating noun phrases and labels the associated constituent
* with a suffix -coordinating.
*
* The "search" in this process is conducted via Tregex, and the
* actual annotation is done through execution of an arbitrary
* {@link java.util.function.Function} provided by the user.
* This class carries, as inner classes, several useful common
* annotation functions.
*
* @see #annotations
* @see SimpleStringFunction
*
* @author Jon Gauthier
* @author Spence Green
*/
public abstract class TregexPoweredTreebankParserParams extends AbstractTreebankParserParams {
/** A logger for this class. Declared final so the shared channel cannot be reassigned. */
private static final Redwood.RedwoodChannels log = Redwood.channels(TregexPoweredTreebankParserParams.class);

/** Serialization version identifier for this class. */
private static final long serialVersionUID = -1985603901694682420L;
/**
* This data structure dictates how an arbitrary tree should be
* annotated. Subclasses should fill out the related member
* {@link #annotations}.
*
* It is a collection of features: a map from feature name
* to behavior, where each behavior is a tuple (t, f).
* t is a Tregex pattern which matches subtrees
* corresponding to the feature, and f is a function which
* accepts such matches and generates an annotation which the matched
* subtree should be given.
*
* @see #annotations
*/
private final Map>> annotationPatterns
= Generics.newHashMap();
/**
* This data structure dictates how an arbitrary tree should be
* annotated.
*
* It is a collection of features: a map from feature name
* to behavior, where each behavior is a tuple (t, f).
* t is a string form of a TregexPattern which matches
* subtrees corresponding to the feature, and f is a
* function which accepts such matches and generates an annotation
* which the matched subtree should be given.
*
* @see #annotationPatterns
* @see SimpleStringFunction
*/
protected final Map>> annotations
= Generics.newHashMap();
/**
 * Features which should be enabled by default.
 *
 * This method is invoked from the constructor of this class, and its
 * result seeds the set of enabled features. Implementations therefore
 * run before the subclass's own constructor body and should not depend
 * on subclass state that is initialized there.
 *
 * The returned names are later looked up among the compiled annotation
 * patterns when trees are annotated, so each one is presumably expected
 * to be a key of {@link #annotations} (NOTE(review): not validated at
 * construction time; confirm in subclasses).
 *
 * @return The names of the features enabled by default
 */
protected abstract String[] baselineAnnotationFeatures();
/**
 * Names of the features which are currently enabled. The collection
 * is seeded in the constructor with the result of
 * {@link #baselineAnnotationFeatures()}; use
 * {@link #addFeature(String)} and {@link #removeFeature(String)} to
 * change it afterwards.
 */
private final Collection<String> features;
public TregexPoweredTreebankParserParams(TreebankLanguagePack tlp) {
super(tlp);
features = CollectionUtils.asSet(baselineAnnotationFeatures());
}
/**
* Compile the {@link #annotations} collection given a
* particular head finder. Subclasses should call this method at
* least once before the class is used, and whenever the head finder
* is changed.
*/
protected void compileAnnotations(HeadFinder hf) {
TregexPatternCompiler compiler = new TregexPatternCompiler(hf);
annotationPatterns.clear();
for (Map.Entry>> annotation : annotations.entrySet()) {
TregexPattern compiled;
try {
compiled = compiler.compile(annotation.getValue().first());
} catch (TregexParseException e) {
int nth = annotationPatterns.size() + 1;
log.info("Parse exception on annotation pattern #" + nth + " initialization: " + e);
continue;
}
Pair> behavior =
new Pair<>(compiled, annotation.getValue().second());
annotationPatterns.put(annotation.getKey(), behavior);
}
}
/**
 * Enable an annotation feature. If the provided feature has already
 * been enabled, this method does nothing.
 *
 * @param featureName The name of the feature to enable; must be a key
 *     of the {@link #annotations} collection
 * @throws java.lang.IllegalArgumentException If the provided feature
 *     name is unknown (i.e., if there is no entry in the
 *     {@link #annotations} collection with the same name)
 * @throws java.lang.IllegalStateException If the feature is known but
 *     has no compiled pattern, e.g. because
 *     {@link #compileAnnotations(HeadFinder)} has not been called since
 *     the feature was declared, or because its pattern failed to parse
 */
protected void addFeature(String featureName) {
  if (!annotations.containsKey(featureName)) {
    throw new IllegalArgumentException("Invalid feature name '" + featureName + "'");
  }
  if (!annotationPatterns.containsKey(featureName)) {
    // IllegalStateException is a RuntimeException, so existing catch blocks keep working.
    throw new IllegalStateException("Compiled patterns out of sync with annotations data structure; " +
        "did you call compileAnnotations?");
  }
  features.add(featureName);
}
/**
 * Turn off an annotation feature. Asking to remove a feature which
 * is not currently enabled is harmless and leaves the enabled set
 * untouched.
 *
 * @param featureName The name of the feature to disable
 */
protected void removeFeature(String featureName) {
  this.features.remove(featureName);
}
/**
 * This method does language-specific tree transformations such as annotating particular nodes with
 * language-relevant features. Such parameterizations should be inside the specific
 * TreebankLangParserParams class. This method is recursively applied to each node in the tree
 * (depth first, left-to-right), so you shouldn't write this method to apply recursively to tree
 * members. This method is allowed to (and in some cases does) destructively change the input tree
 * {@code t}. It changes both labels and the tree shape.
 *
 * In this implementation the node's value is extended with the string produced by
 * {@link #getAnnotationString(Tree, Tree)}; for a preterminal whose label implements
 * {@link HasTag}, the tag is set to the same extended value.
 *
 * @param t The input tree (with non-language specific annotation already done, so you need to
 *     strip back to basic categories)
 * @param root The root of the current tree (can be null for words)
 * @return The fully annotated tree node (with daughters still as you want them in the final
 *     result); this is the same object as {@code t}
 */
@Override
public Tree transformTree(Tree t, Tree root) {
  final String annotatedCategory = t.value() + getAnnotationString(t, root);
  t.setValue(annotatedCategory);

  // Keep the tag in step with the category on tagged preterminals.
  final boolean carriesTag = t.isPreTerminal() && t.label() instanceof HasTag;
  if (carriesTag) {
    HasTag tagged = (HasTag) t.label();
    tagged.setTag(annotatedCategory);
  }
  return t;
}
/**
 * Build a string of annotations for the given tree. Every enabled
 * feature whose compiled pattern matches at {@code t} contributes the
 * output of its annotation function; contributions are concatenated
 * in the iteration order of the enabled-feature collection.
 *
 * @param t The input tree (with non-language specific annotation
 *     already done, so you need to strip back to basic categories)
 * @param root The root of the current tree (can be null for words)
 * @return A (possibly empty) string of annotations to add to the
 *     given tree
 * @throws java.lang.IllegalStateException If an enabled feature has
 *     no compiled pattern (e.g. {@link #compileAnnotations(HeadFinder)}
 *     was never called, or the feature's pattern failed to parse)
 */
protected String getAnnotationString(Tree t, Tree root) {
  // Accumulate all annotations in this string
  StringBuilder annotationStr = new StringBuilder();

  for (String featureName : features) {
    Pair<TregexPattern, Function<TregexMatcher, String>> behavior = annotationPatterns.get(featureName);
    if (behavior == null) {
      // Fail with a message naming the feature instead of a bare NullPointerException.
      throw new IllegalStateException("No compiled pattern for enabled feature '" + featureName
          + "'; did you call compileAnnotations?");
    }

    TregexMatcher m = behavior.first().matcher(root);
    if (m.matchesAt(t)) {
      annotationStr.append(behavior.second().apply(m));
    }
  }

  return annotationStr.toString();
}
/**
 * Describe the current annotation configuration: each enabled feature
 * name, followed by a space, is written to standard error, after
 * which an empty message is sent through the class logger.
 */
@Override
public void display() {
  // NOTE(review): the names go to System.err but the terminating call goes through the
  // logger; consider routing both through one channel.
  features.forEach(feature -> System.err.printf("%s ", feature));
  log.info();
}
/**
* Annotates all nodes that match the tregex query with some string.
*/
protected static class SimpleStringFunction implements SerializableFunction {
private static final long serialVersionUID = 6958776731059724396L;
private String annotationMark;
public SimpleStringFunction(String annotationMark) {
this.annotationMark = annotationMark;
}
public String apply(TregexMatcher matcher) {
return annotationMark;
}
@Override
public String toString() {
return "SimpleStringFunction[" + annotationMark + ']';
}
}
/**
* Annotate a tree constituent with its lexical head.
*/
protected static class AnnotateHeadFunction implements SerializableFunction {
private static final long serialVersionUID = -4213299755069618322L;
private final HeadFinder headFinder;
private boolean lowerCase;
public AnnotateHeadFunction(HeadFinder hf) {
this(hf, true);
}
public AnnotateHeadFunction(HeadFinder hf, boolean lowerCase) {
headFinder = hf;
this.lowerCase = lowerCase;
}
public String apply(TregexMatcher matcher) {
Tree matchedTree = matcher.getMatch();
Tree head = headFinder.determineHead(matchedTree);
if (!head.isPrePreTerminal())
return "";
Tree lexicalHead = head.firstChild().firstChild();
String headValue = lexicalHead.value();
if (headValue != null) {
if (lowerCase) headValue = headValue.toLowerCase();
return '[' + headValue + ']';
} else {
return "";
}
}
@Override
public String toString() {
return "AnnotateHeadFunction[" + headFinder.getClass().getName() + ']';
}
}
}