/*
 * edu.stanford.nlp.trees.BobChrisTreeNormalizer Maven / Gradle / Ivy
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of stanford-parser Show documentation
 * Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
 * and extracts constituency parse trees.
 */
package edu.stanford.nlp.trees;
import edu.stanford.nlp.ling.Label;
import java.io.Serializable;
import java.util.function.Predicate;
/**
* Normalizes trees in the way used in Manning and Carpenter 1997.
* NB: This implementation is still incomplete!
* The normalizations performed are: (i) terminals are interned, (ii)
* nonterminals are stripped of alternants, functional tags and
* cross-reference codes, and then interned, (iii) empty
* elements (ones with nonterminal label "-NONE-") are deleted from the
* tree, (iv) the null label at the root node is replaced with the label
* "ROOT".
* 17 Apr 2001: This was fixed to work with different kinds of labels,
* by making proper use of the Label interface, after it was moved into
* the trees module.
*
* The normalizations of the original (Prolog) BobChrisNormalize were:
* 1. Remap the root node to be called 'ROOT'
* 2. Truncate all nonterminal labels before characters introducing
* annotations according to TreebankLanguagePack
* (traditionally, -, =, | or # (last for BLLIP))
* 3. Remap the representation of certain leaf symbols (brackets etc.)
* 4. Map to lowercase all leaf nodes
* 5. Delete empty/trace nodes (ones marked '-NONE-')
* 6. Recursively delete any nodes that do not dominate any words
* 7. Delete A over A nodes where the top A dominates nothing else
* 8. Remove backslashes from lexical items
* (the Treebank inserts them to escape slashes (/) and stars (*)).
* 4 is deliberately omitted, and a few things are purely aesthetic.
*
* 14 June 2002: It now deletes unary A over A if both nodes' labels are equal
* (7), and (6) was always part of the Tree.prune() functionality...
* 30 June 2005: Also splice out an EDITED node, just in case you're parsing
* the Brown corpus.
*
* @author Christopher Manning
*/
public class BobChrisTreeNormalizer extends TreeNormalizer implements TreeTransformer {

  private static final long serialVersionUID = -1005188028979810143L;

  /** Language pack whose {@code basicCategory} is used to strip label annotations. */
  protected final TreebankLanguagePack tlp;

  /** Predicate passed to {@code Tree.prune} by {@link #normalizeWholeTree}. */
  protected Predicate<Tree> emptyFilter = new EmptyFilter();

  /** Predicate passed to {@code Tree.spliceOut} by {@link #normalizeWholeTree}. */
  protected Predicate<Tree> aOverAFilter = new AOverAFilter();

  /** Creates a normalizer backed by a {@link PennTreebankLanguagePack}. */
  public BobChrisTreeNormalizer() {
    this(new PennTreebankLanguagePack());
  }

  /**
   * Creates a normalizer backed by the given language pack.
   *
   * @param tlp The language pack used to reduce labels to their basic category
   */
  public BobChrisTreeNormalizer(TreebankLanguagePack tlp) {
    this.tlp = tlp;
  }

  /**
   * Normalizes a leaf contents.
   * This implementation interns the leaf.
   *
   * @param leaf The leaf string; must not be null
   * @return The interned leaf string
   */
  @Override
  public String normalizeTerminal(String leaf) {
    // We could unquote * and / with backslash \ in front of them
    return leaf.intern();
  }

  /**
   * Normalizes a nonterminal contents.
   * This implementation strips functional tags, etc. and interns the
   * nonterminal.
   *
   * @param category The nonterminal label from the treebank (may be null or empty)
   * @return The interned, cleaned-up label
   */
  @Override
  public String normalizeNonterminal(String category) {
    return cleanUpLabel(category).intern();
  }

  /**
   * Remove things like hyphened functional tags and equals from the
   * end of a node label. This version always just returns the phrase
   * structure category, or "ROOT" if the label was null or empty.
   *
   * @param label The label from the treebank
   * @return The cleaned up label (phrase structure category)
   */
  protected String cleanUpLabel(final String label) {
    if (label == null || label.isEmpty()) {
      // String constants are always interned
      return "ROOT";
    }
    return tlp.basicCategory(label);
  }

  /**
   * Normalize a whole tree -- one can assume that this is the
   * root. This implementation prunes the tree with {@link #emptyFilter}
   * (by default deleting empty elements, i.e. ones with nonterminal tag
   * label '-NONE-') and then splices out the nodes rejected by
   * {@link #aOverAFilter} (by default unary A over A nodes). It assumes
   * that it is not given a null tree, but it returns null if pruning
   * returns null.
   *
   * @param tree The tree to normalize; must not be null
   * @param tf The factory used to build the resulting tree
   * @return The normalized tree, or null if pruning removed everything
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    final Tree pruned = tree.prune(emptyFilter, tf);
    if (pruned == null) {
      return null;
    }
    return pruned.spliceOut(aOverAFilter, tf);
  }

  /**
   * Normalizes the tree using the tree's own {@code TreeFactory}.
   *
   * @param tree The tree to normalize; must not be null
   * @return The result of {@link #normalizeWholeTree(Tree, TreeFactory)}
   */
  @Override
  public Tree transformTree(Tree tree) {
    return normalizeWholeTree(tree, tree.treeFactory());
  }


  /** Rejects preterminal nodes labeled {@code -NONE-} (empty/trace elements). */
  public static class EmptyFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 8914098359495987617L;

    /**
     * Doesn't accept nodes that only cover an empty.
     *
     * @param t The node to test
     * @return false iff the node is labeled "-NONE-", is not a leaf, and has
     *         exactly one child which is a leaf; true otherwise
     */
    @Override
    public boolean test(Tree t) {
      final Tree[] kids = t.children();
      final Label l = t.label();
      // Delete (return false for) empty/trace nodes (ones marked '-NONE-')
      final boolean isEmptyElement = l != null
          && "-NONE-".equals(l.value())
          && !t.isLeaf()
          && kids.length == 1
          && kids[0].isLeaf();
      return !isEmptyElement;
    }

  } // end class EmptyFilter


  /** Rejects EDITED/CODE nodes and unary nodes whose only child has the same label value. */
  public static class AOverAFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Doesn't accept nodes that are A over A nodes (perhaps due to
     * empty removal or are EDITED nodes).
     * Leaves and preterminals are always accepted. A node whose label
     * (or label value) is null is never treated as A over A.
     *
     * @param t The node to test
     * @return false if the node should be spliced out, true otherwise
     */
    @Override
    public boolean test(Tree t) {
      if (t.isLeaf() || t.isPreTerminal()) {
        return true;
      }
      // Read the value null-safely: a missing label must not throw here.
      final String category = labelValue(t.label());
      // The special switchboard non-terminals clause
      if ("EDITED".equals(category) || "CODE".equals(category)) {
        return false;
      }
      if (t.numChildren() != 1) {
        return true;
      }
      // Unary node: reject only when parent and child carry the same non-null value.
      return category == null || !category.equals(labelValue(t.getChild(0).label()));
    }

    /** Returns the label's value, or null when the label itself is null. */
    private static String labelValue(Label label) {
      return (label == null) ? null : label.value();
    }

  } // end static class AOverAFilter

}