edu.stanford.nlp.parser.lexparser.NegraPennCollinizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
The newest version!
package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.StringLabel;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeTransformer;
/**
 * A {@link TreeTransformer} that rebuilds a tree with simplified node labels.
 *
 * <p>For every non-leaf node the label value is reduced with
 * {@code tlpp.treebankLanguagePack().basicCategory(...)} and a fresh node carrying a
 * {@link StringLabel} is created through {@link #tf}. In addition:
 * <ul>
 *   <li>when {@code deletePunct} is set, preterminals whose tag is reported by
 *       {@code isEvalBIgnoredPunctuationTag} are removed;</li>
 *   <li>a first child labeled {@code TOPP} is spliced out (its children take its place);</li>
 *   <li>a start-symbol node with exactly one child is replaced by that child;</li>
 *   <li>any node left without children is removed.</li>
 * </ul>
 */
public class NegraPennCollinizer implements TreeTransformer {

  /** A logger for this class */
  Redwood.RedwoodChannels log = Redwood.channels(NegraPennCollinizer.class);

  /** Language-specific parameters; supplies the treebank language pack used for label handling. */
  private final TreebankLangParserParams tlpp;

  /** Whether punctuation preterminals are dropped from the transformed tree. */
  private final boolean deletePunct;

  /** Factory used to create every leaf and node of the transformed tree. */
  protected TreeFactory tf = new LabeledScoredTreeFactory();

  /**
   * Creates a collinizer that deletes punctuation preterminals.
   *
   * @param tlpp language-specific parser parameters
   */
  public NegraPennCollinizer(TreebankLangParserParams tlpp) {
    this(tlpp, true);
  }

  /**
   * Creates a collinizer.
   *
   * @param tlpp language-specific parser parameters
   * @param deletePunct if {@code true}, preterminals whose tag is EVALB-ignored punctuation
   *     are removed from the transformed tree
   */
  public NegraPennCollinizer(TreebankLangParserParams tlpp, boolean deletePunct) {
    this.tlpp = tlpp;
    this.deletePunct = deletePunct;
  }

  /**
   * Recursively builds the transformed version of {@code tree}.
   *
   * <p>Side effect: if the first child of a visited node is labeled {@code TOPP}, that node's
   * children are replaced in place (via {@code setChildren}) with the children of the
   * {@code TOPP} node, so the input tree itself is modified in that case.
   *
   * @param tree the (sub)tree to transform
   * @return the transformed tree, or {@code null} if the node was deleted (a punctuation
   *     preterminal while {@code deletePunct} is set, or a node all of whose children
   *     were deleted)
   */
  public Tree transformTree(Tree tree) {
    Label l = tree.label();
    if (tree.isLeaf()) {
      // Leaves are copied with their original label object.
      return tf.newLeaf(l);
    }
    String s = l.value();
    s = tlpp.treebankLanguagePack().basicCategory(s);
    if (deletePunct) {
      // this is broken as it's not the right thing to do when there
      // is any tag ambiguity -- and there is for ' (POS/''). Sentences
      // can then have more or less words. It's also unnecessary for EVALB,
      // since it ignores punctuation anyway
      if (tree.isPreTerminal() && tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(s)) {
        return null;
      }
    }
    // TEMPORARY: eliminate the TOPP constituent
    // Constant-first equals plus the null check keep an unlabeled first child from
    // triggering a NullPointerException here.
    Tree firstChild = tree.children()[0];
    Label firstLabel = firstChild.label();
    if (firstLabel != null && "TOPP".equals(firstLabel.value())) {
      log.info("Found a TOPP");
      tree.setChildren(firstChild.children());
    }
    // Negra has lots of non-unary roots; delete unary roots
    if (tlpp.treebankLanguagePack().isStartSymbol(s) && tree.numChildren() == 1) {
      // NB: This deletes the boundary symbol, which is in the tree!
      return transformTree(tree.getChild(0));
    }
    // Transform the children, keeping only those that were not deleted.
    List<Tree> children = new ArrayList<>();
    for (int cNum = 0, numC = tree.numChildren(); cNum < numC; cNum++) {
      Tree child = tree.getChild(cNum);
      Tree newChild = transformTree(child);
      if (newChild != null) {
        children.add(newChild);
      }
    }
    if (children.isEmpty()) {
      // Every child was removed, so this node is removed as well.
      return null;
    }
    return tf.newTreeNode(new StringLabel(s), children);
  }

}