edu.stanford.nlp.parser.lexparser.TreeCollinizer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2

Show newest version

package edu.stanford.nlp.parser.lexparser;

import java.util.List;
import java.util.ArrayList;

import edu.stanford.nlp.trees.*;

/**
 * Does detransformations to a parsed sentence to map it back to the
 * standard treebank form for output or evaluation.
 * This version has Penn-Treebank-English-specific details, but can probably
 * be used without harm on other treebanks.
 * Returns labels to their basic category, removes punctuation (should be with
 * respect to a gold tree, but currently isn't), deletes the boundary symbol,
 * changes PRT labels to ADVP.
 *
 * @author Dan Klein
 * @author Christopher Manning
 */
public class TreeCollinizer implements TreeTransformer {

  private final TreebankLanguagePack tlp;
  private final boolean deletePunct;
  private final boolean fixCollinsBaseNP;
  /** whOption: 0 = do nothing, 1 = also collapse WH phrasal categories in gold tree,
      2 = also collapse WH tags in gold tree,
      4 = attempt to restore WH categories in parse trees (not yet implemented) */
  private final int whOption;

  public TreeCollinizer(TreebankLanguagePack tlp) {
    this(tlp, true, false);
  }

  public TreeCollinizer(TreebankLanguagePack tlp, boolean deletePunct,
                        boolean fixCollinsBaseNP) {
    this(tlp, deletePunct, fixCollinsBaseNP, 0);
  }

  public TreeCollinizer(TreebankLanguagePack tlp, boolean deletePunct,
                        boolean fixCollinsBaseNP, int whOption) {
    this.tlp = tlp;
    this.deletePunct = deletePunct;
    this.fixCollinsBaseNP = fixCollinsBaseNP;
    this.whOption = whOption;
  }

  public Tree transformTree(Tree tree) {
    if (tree == null) return null;
    TreeFactory tf = tree.treeFactory();

    String s = tree.value();
    if (tlp.isStartSymbol(s))
      return transformTree(tree.firstChild());

    if (tree.isLeaf()) {
      return tf.newLeaf(tree.label());
    }
    s = tlp.basicCategory(s);
    if (((whOption & 1) != 0) && s.startsWith("WH")) {
      s = s.substring(2);
    }
    if ((whOption & 2) != 0) {
      s = s.replaceAll("^WP", "PRP"); // does both WP and WP$ !!
      s = s.replaceAll("^WDT", "DT");
      s = s.replaceAll("^WRB", "RB");
    }
    if (((whOption & 4) != 0) && s.startsWith("WH")) {
      s = s.substring(2);
    }

    // wsg2010: Might need a better way to deal with tag ambiguity. This still doesn't handle the
    // case where the GOLD tree does not label a punctuation mark as such (common in French), and
    // the guess tree does.
    if (deletePunct && tree.isPreTerminal() &&
        (tlp.isEvalBIgnoredPunctuationTag(s) ||
        tlp.isPunctuationWord(tree.firstChild().value()))) {
      return null;
    }

    // remove the extra NPs inserted in the collinsBaseNP option
    if (fixCollinsBaseNP && s.equals("NP")) {
      Tree[] kids = tree.children();
      if (kids.length == 1 && tlp.basicCategory(kids[0].value()).equals("NP")) {
        return transformTree(kids[0]);
      }
    }
    // Magerman erased this distinction, and everyone else has followed like sheep...
    if (s.equals("PRT")) {
      s = "ADVP";
    }
    List children = new ArrayList();
    for (int cNum = 0, numKids = tree.numChildren(); cNum < numKids; cNum++) {
      Tree child = tree.children()[cNum];
      Tree newChild = transformTree(child);
      if (newChild != null) {
        children.add(newChild);
      }
    }
    if (children.isEmpty()) {
      return null;
    }

    Tree node = tf.newTreeNode(tree.label(), children);
    node.setValue(s);

    return node;
  }

} // end class TreeCollinizer