/*
 * edu.stanford.nlp.parser.lexparser.TregexPoweredTreebankParserParams Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of stanford-parser Show documentation
 * Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
 * There is a newer version: 3.9.2
 * Show newest version
 */
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexParseException;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.TregexPatternCompiler;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;

/**
 * An extension of
 * {@link edu.stanford.nlp.parser.lexparser.AbstractTreebankParserParams}
 * which provides support for Tregex-powered annotations.
 *
 * Subclasses of this class provide collections of features
 * which are associated with annotation behaviors that seek out
 * and label matching trees in some way. For example, a coord
 * feature might have an annotation behavior which searches for
 * coordinating noun phrases and labels the associated constituent
 * with a suffix -coordinating.
 *
 * The "search" in this process is conducted via Tregex, and the
 * actual annotation is done through execution of an arbitrary
 * {@link java.util.function.Function} provided by the user.
 * This class carries as inner several classes several useful common
 * annotation functions.
 *
 * @see #annotations
 * @see SimpleStringFunction
 *
 * @author Jon Gauthier
 * @author Spence Green
 */
public abstract class TregexPoweredTreebankParserParams extends AbstractTreebankParserParams {

  private static final long serialVersionUID = -1985603901694682420L;

  /**
   * This data structure dictates how an arbitrary tree should be
   * annotated. Subclasses should fill out the related member
   * {@link #annotations}.
   *
   * It is a collection of features: a map from feature name
   * to behavior, where each behavior is a tuple (t, f).
   * t is a Tregex pattern which matches subtrees
   * corresponding to the feature, and f is a function which
   * accepts such matches and generates an annotation which the matched
   * subtree should be given.
   *
   * @see #annotations
   */
  private final Map>> annotationPatterns
    = Generics.newHashMap();

  /**
   * This data structure dictates how an arbitrary tree should be
   * annotated.
   *
   * It is a collection of features: a map from feature name
   * to behavior, where each behavior is a tuple (t, f).
   * t is a string form of a TregexPattern which matches
   * subtrees corresponding to the feature, and f is a
   * function which accepts such matches and generates an annotation
   * which the matched subtree should be given.
   *
   * @see #annotationPatterns
   * @see SimpleStringFunction
   */
  protected final Map>> annotations
    = Generics.newHashMap();

  /**
   * Features which should be enabled by default.
   */
  protected abstract String[] baselineAnnotationFeatures();

  /**
   * Extra features which have been requested. Use
   * {@link #addFeature(String)} to add features.
   */
  private final Collection features;

  public TregexPoweredTreebankParserParams(TreebankLanguagePack tlp) {
    super(tlp);

    features = CollectionUtils.asSet(baselineAnnotationFeatures());
  }

  /**
   * Compile the {@link #annotations} collection given a
   * particular head finder. Subclasses should call this method at
   * least once before the class is used, and whenever the head finder
   * is changed.
   */
  protected void compileAnnotations(HeadFinder hf) {
    TregexPatternCompiler compiler = new TregexPatternCompiler(hf);

    annotationPatterns.clear();
    for (Map.Entry>> annotation : annotations.entrySet()) {
      TregexPattern compiled;
      try {
        compiled = compiler.compile(annotation.getValue().first());
      } catch (TregexParseException e) {
        int nth = annotationPatterns.size() + 1;
        System.err.println("Parse exception on annotation pattern #" + nth + " initialization: " + e);
        continue;
      }

      Pair> behavior =
        new Pair>(compiled, annotation.getValue().second());

      annotationPatterns.put(annotation.getKey(), behavior);
    }
  }

  /**
   * Enable an annotation feature. If the provided feature has already
   * been enabled, this method does nothing.
   *
   * @param featureName
   * @throws java.lang.IllegalArgumentException If the provided feature
   *           name is unknown (i.e., if there is no entry in the
   *           {@link #annotations} collection with the same name)
   */
  protected void addFeature(String featureName) {
    if (!annotations.containsKey(featureName))
      throw new IllegalArgumentException("Invalid feature name '" + featureName + "'");
    if (!annotationPatterns.containsKey(featureName))
      throw new RuntimeException("Compiled patterns out of sync with annotations data structure;" +
        "did you call compileAnnotations?");

    features.add(featureName);
  }

  /**
   * Disable a feature. If the feature was never enabled, this method
   * returns without error.
   *
   * @param featureName
   */
  protected void removeFeature(String featureName) {
    features.remove(featureName);
  }

  /**
   * This method does language-specific tree transformations such as annotating particular nodes with language-relevant
   * features. Such parameterizations should be inside the specific TreebankLangParserParams class.  This method is
   * recursively applied to each node in the tree (depth first, left-to-right), so you shouldn't write this method to
   * apply recursively to tree members.  This method is allowed to (and in some cases does) destructively change the
   * input tree t. It changes both labels and the tree shape.
   *
   * @param t    The input tree (with non-language specific annotation already done, so you need to strip back to basic
   *             categories)
   * @param root The root of the current tree (can be null for words)
   * @return The fully annotated tree node (with daughters still as you want them in the final result)
   */
  @Override
  public Tree transformTree(Tree t, Tree root) {
    String newCat = t.value() + getAnnotationString(t, root);
    t.setValue(newCat);
    if (t.isPreTerminal() && t.label() instanceof HasTag)
      ((HasTag) t.label()).setTag(newCat);

    return t;
  }

  /**
   * Build a string of annotations for the given tree.
   *
   * @param t The input tree (with non-language specific annotation
   *          already done, so you need to strip back to basic categories)
   * @param root The root of the current tree (can be null for words)
   * @return A (possibly empty) string of annotations to add to the
   *         given tree
   */
  protected String getAnnotationString(Tree t, Tree root) {
    // Accumulate all annotations in this string
    StringBuilder annotationStr = new StringBuilder();

    for (String featureName : features) {
      Pair> behavior = annotationPatterns.get(featureName);
      TregexMatcher m = behavior.first().matcher(root);
      if (m.matchesAt(t))
        annotationStr.append(behavior.second().apply(m));
    }

    return annotationStr.toString();
  }

  /**
   * Output a description of the current annotation configuration to
   * standard error.
   */
  @Override
  public void display() {
    for (String feature : features)
      System.err.printf("%s ", feature);
    System.err.println();
  }

  /**
   * Annotates all nodes that match the tregex query with some string.
   */
  protected static class SimpleStringFunction implements SerializableFunction {

    private static final long serialVersionUID = 6958776731059724396L;
    private String annotationMark;

    public SimpleStringFunction(String annotationMark) {
      this.annotationMark = annotationMark;
    }

    public String apply(TregexMatcher matcher) {
      return annotationMark;
    }

    @Override
    public String toString() {
      return "SimpleStringFunction[" + annotationMark + ']';
    }

  }

  /**
   * Annotate a tree constituent with its lexical head.
   */
  protected static class AnnotateHeadFunction implements SerializableFunction {

    private static final long serialVersionUID = -4213299755069618322L;

    private final HeadFinder headFinder;
    private boolean lowerCase;

    public AnnotateHeadFunction(HeadFinder hf) {
      this(hf, true);
    }

    public AnnotateHeadFunction(HeadFinder hf, boolean lowerCase) {
      headFinder = hf;
      this.lowerCase = lowerCase;
    }

    public String apply(TregexMatcher matcher) {
      Tree matchedTree = matcher.getMatch();

      Tree head = headFinder.determineHead(matchedTree);
      if (!head.isPrePreTerminal())
        return "";

      Tree lexicalHead = head.firstChild().firstChild();
      String headValue = lexicalHead.value();

      if (headValue != null) {
        if (lowerCase) headValue = headValue.toLowerCase();
        return '[' + headValue + ']';
      } else {
        return "";
      }
    }

    @Override
    public String toString() {
      return "AnnotateHeadFunction[" + headFinder.getClass().getName() + ']';
    }

  }

}