edu.stanford.nlp.trees.BobChrisTreeNormalizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.Label;

import java.io.Serializable;
import java.util.function.Predicate;


/**
 * Normalizes trees in the way used in Manning and Carpenter 1997.
 * NB: This implementation is still incomplete!
 * The normalizations performed are:
 * <ol>
 * <li> terminals are interned,
 * <li> nonterminals are stripped of alternants, functional tags and
 *      cross-reference codes, and then interned,
 * <li> empty elements (ones with nonterminal label "-NONE-") are deleted
 *      from the tree,
 * <li> the null label at the root node is replaced with the label "ROOT".
 * </ol>
 *
 * <p>17 Apr 2001: This was fixed to work with different kinds of labels,
 * by making proper use of the Label interface, after it was moved into
 * the trees module.
 *
 * <p>The normalizations of the original (Prolog) BobChrisNormalize were:
 * <ol>
 * <li> Remap the root node to be called 'ROOT'
 * <li> Truncate all nonterminal labels before characters introducing
 *      annotations according to TreebankLanguagePack
 *      (traditionally, -, =, | or # (last for BLLIP))
 * <li> Remap the representation of certain leaf symbols (brackets etc.)
 * <li> Map to lowercase all leaf nodes
 * <li> Delete empty/trace nodes (ones marked '-NONE-')
 * <li> Recursively delete any nodes that do not dominate any words
 * <li> Delete A over A nodes where the top A dominates nothing else
 * <li> Remove backslashes from lexical items
 *      (the Treebank inserts them to escape slashes (/) and stars (*)).
 * </ol>
 * 4 is deliberately omitted, and a few things are purely aesthetic.
 *
 * <p>14 June 2002: It now deletes unary A over A if both nodes' labels are equal
 * (7), and (6) was always part of the Tree.prune() functionality...
 *
 * <p>30 June 2005: Also splice out an EDITED node, just in case you're parsing
 * the Brown corpus.
 *
 * @author Christopher Manning
 */
public class BobChrisTreeNormalizer extends TreeNormalizer implements TreeTransformer {

  /** Language pack used to reduce an annotated label to its basic category. */
  protected final TreebankLanguagePack tlp;


  /** Creates a normalizer backed by a {@link PennTreebankLanguagePack}. */
  public BobChrisTreeNormalizer() {
    this(new PennTreebankLanguagePack());
  }

  /**
   * Creates a normalizer that uses the given language pack to clean up
   * nonterminal labels.
   *
   * @param tlp The language pack whose {@code basicCategory} is applied to
   *            nonterminal labels
   */
  public BobChrisTreeNormalizer(TreebankLanguagePack tlp) {
    this.tlp = tlp;
  }


  /**
   * Normalizes a leaf contents.
   * This implementation interns the leaf.
   *
   * @param leaf The leaf string from the treebank
   * @return The interned leaf string
   */
  @Override
  public String normalizeTerminal(String leaf) {
    // We could unquote * and / with backslash \ in front of them
    return leaf.intern();
  }


  /**
   * Normalizes a nonterminal contents.
   * This implementation strips functional tags, etc. and interns the
   * nonterminal.
   *
   * @param category The nonterminal label from the treebank
   * @return The interned result of {@link #cleanUpLabel(String)}
   */
  @Override
  public String normalizeNonterminal(String category) {
    return cleanUpLabel(category).intern();
  }


  /**
   * Remove things like hyphened functional tags and equals from the
   * end of a node label.  This version always just returns the phrase
   * structure category, or "ROOT" if the label was null or empty.
   *
   * @param label The label from the treebank
   * @return The cleaned up label (phrase structure category)
   */
  protected String cleanUpLabel(final String label) {
    if (label == null || label.isEmpty()) {
      // String constants are always interned
      return "ROOT";
    }
    return tlp.basicCategory(label);
  }


  /**
   * Normalize a whole tree -- one can assume that this is the
   * root.  This implementation deletes empty elements (ones with
   * nonterminal tag label '-NONE-') from the tree, and splices out
   * unary A over A nodes.  It assumes that it is not given a
   * null tree, but it may return one if there are no real words.
   *
   * @param tree The tree to normalize; must not be null
   * @param tf The tree factory passed on to the prune and splice operations
   * @return The normalized tree, or null if pruning left nothing
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    Tree pruned = tree.prune(emptyFilter, tf);
    if (pruned == null) {
      return null;
    }
    return pruned.spliceOut(aOverAFilter, tf);
  }

  /**
   * Normalizes the given tree using the tree's own factory.
   *
   * @param tree The tree to normalize; must not be null
   * @return The result of {@link #normalizeWholeTree(Tree, TreeFactory)}
   */
  @Override
  public Tree transformTree(Tree tree) {
    return normalizeWholeTree(tree, tree.treeFactory());
  }


  /** Filter handed to {@code Tree.prune}; rejects empty/trace nodes. */
  protected Predicate<Tree> emptyFilter = new EmptyFilter();

  /** Filter handed to {@code Tree.spliceOut}; rejects A over A and EDITED/CODE nodes. */
  protected Predicate<Tree> aOverAFilter = new AOverAFilter();

  private static final long serialVersionUID = -1005188028979810143L;


  /**
   * Rejects nodes labeled "-NONE-" whose only child is a leaf, i.e.
   * empty/trace elements.
   */
  public static class EmptyFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 8914098359495987617L;

    /**
     * Doesn't accept nodes that only cover an empty.
     *
     * @param t The node to examine
     * @return false for a non-leaf node labeled "-NONE-" with exactly one
     *         child that is a leaf; true otherwise
     */
    @Override
    public boolean test(Tree t) {
      Tree[] kids = t.children();
      Label l = t.label();
      // Delete (return false for) empty/trace nodes (ones marked '-NONE-')
      boolean isEmptyElement = l != null
          && "-NONE-".equals(l.value())
          && !t.isLeaf()
          && kids.length == 1
          && kids[0].isLeaf();
      return !isEmptyElement;
    }

  } // end class EmptyFilter


  /**
   * Rejects unary nodes whose label value equals their only child's label
   * value, as well as nodes labeled "EDITED" or "CODE".
   */
  public static class AOverAFilter implements Predicate<Tree>, Serializable {

    /** Doesn't accept nodes that are A over A nodes (perhaps due to
     *  empty removal or are EDITED nodes).
     *
     *  @param t The node to examine
     *  @return true for leaves and preterminals; false for "EDITED"/"CODE"
     *          nodes and for unary nodes whose non-null label value equals
     *          the child's label value; true otherwise
     */
    @Override
    public boolean test(Tree t) {
      if (t.isLeaf() || t.isPreTerminal()) {
        return true;
      }
      // Read the label once, null-safely: an unlabeled internal node must
      // not throw here, it simply cannot match any of the clauses below.
      String value = valueOf(t.label());
      // The special switchboard non-terminals clause
      if ("EDITED".equals(value) || "CODE".equals(value)) {
        return false;
      }
      if (t.numChildren() != 1) {
        return true;
      }
      // A null value on either side never counts as an A over A match.
      String childValue = valueOf(t.getChild(0).label());
      return !(value != null && value.equals(childValue));
    }

    /** Returns the label's value, or null when the label itself is null. */
    private static String valueOf(Label label) {
      return (label == null) ? null : label.value();
    }

    private static final long serialVersionUID = 1L;

  } // end static class AOverAFilter

}