edu.stanford.nlp.trees.GrammaticalStructure Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
The newest version!
package edu.stanford.nlp.trees;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.*;
import java.util.concurrent.locks.Lock;
import java.util.function.Predicate;
import edu.stanford.nlp.graph.DirectedMultiGraph;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.AbstractCoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.ud.EnhancementOptions;
import edu.stanford.nlp.util.Filters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import static edu.stanford.nlp.trees.GrammaticalRelation.DEPENDENT;
import static edu.stanford.nlp.trees.GrammaticalRelation.ROOT;
/**
* A {@code GrammaticalStructure} stores dependency relations between
* nodes in a tree. A new {@code GrammaticalStructure} is constructed
* from an existing parse tree with the help of {@link
* GrammaticalRelation {@code GrammaticalRelation}}, which
* defines a hierarchy of grammatical relations, along with
* patterns for identifying them in parse trees. The constructor for
* {@code GrammaticalStructure} uses these definitions to
* populate the new {@code GrammaticalStructure} with as many
* labeled grammatical relations as it can. Once constructed, the new
* {@code GrammaticalStructure} can be printed in various
* formats, or interrogated using the interface methods in this
* class. Internally, this uses a representation via a {@code TreeGraphNode},
* that is, a tree with additional labeled
* arcs between nodes, for representing the grammatical relations in a
* parse tree.
*
* @author Bill MacCartney
* @author Galen Andrew (refactoring English-specific stuff)
* @author Ilya Sherman (dependencies)
* @author Daniel Cer
* @see EnglishGrammaticalRelations
* @see GrammaticalRelation
* @see EnglishGrammaticalStructure
*/
public abstract class GrammaticalStructure implements Serializable {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(GrammaticalStructure.class);
/**
 * {@code true} when the JVM system property {@code "GrammaticalStructure"} is set to any value
 * (its content is not inspected), {@code false} when the property is absent.
 * NOTE(review): presumably gates debug output; no use is visible in this part of the file - confirm.
 */
private static final boolean PRINT_DEBUGGING = System.getProperty("GrammaticalStructure", null) != null;
/**
 * Selects which kinds of extra edges are added on top of the dependency tree.
 * When unsure, pick {@link edu.stanford.nlp.trees.GrammaticalStructure.Extras#NONE}.
 */
public enum Extras {

  /**
   * Add no extra edges at all.
   *
   * Note: code from 2014 and earlier expressed "include extras" as a boolean flag;
   * this constant corresponds to passing {@code false}.
   */
  NONE(false, false, false),

  /**
   * Add the extra reference edges only, and keep them as reference edges (no collapsing).
   */
  REF_ONLY_UNCOLLAPSED(true, false, false),

  /**
   * Add the extra reference edges only, collapsing each one so that it takes over the
   * edge type of the referent. For example, in "My dog who eats sausage" a "ref" edge
   * from "who" to "dog" would be deleted and replaced by an "nsubj" edge from "eats" to "dog".
   */
  REF_ONLY_COLLAPSED(true, false, true),

  /**
   * Add the extra subject edges only; none of the other extra edge types.
   */
  SUBJ_ONLY(false, true, false),

  /**
   * Add both the uncollapsed reference edges and the extra subject edges.
   *
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#SUBJ_ONLY
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#REF_ONLY_UNCOLLAPSED
   */
  REF_UNCOLLAPSED_AND_SUBJ(true, true, false),

  /**
   * Add both the collapsed reference edges and the extra subject edges.
   *
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#SUBJ_ONLY
   * @see edu.stanford.nlp.trees.GrammaticalStructure.Extras#REF_ONLY_COLLAPSED
   */
  REF_COLLAPSED_AND_SUBJ(true, true, true),

  /**
   * Perform as much extra processing as is available.
   * At present this has the same flags as
   * {@link edu.stanford.nlp.trees.GrammaticalStructure.Extras#REF_COLLAPSED_AND_SUBJ}.
   *
   * Note: code from 2014 and earlier expressed "include extras" as a boolean flag;
   * this constant corresponds to passing {@code true}.
   */
  MAXIMAL(true, true, true);

  /** Whether "ref" edges are added. */
  public final boolean doRef;

  /** Whether extra subject edges are added. */
  public final boolean doSubj;

  /** Whether the "ref" edges are collapsed. */
  public final boolean collapseRef;

  /** Stores the three flags verbatim. */
  Extras(boolean addRef, boolean addSubj, boolean collapse) {
    doRef = addRef;
    doSubj = addSubj;
    collapseRef = collapse;
  }

} // end enum Extras
/**
 * The typed dependencies of this structure. The tree-analyzing constructor fills this with
 * the result of getDeps (the basic dependencies); the list-based constructor fills it with
 * a copy of the dependencies it is given.
 */
protected final List typedDependencies;
/**
 * In the tree-analyzing constructor: a copy of {@code typedDependencies} that is then
 * augmented by getExtraDeps. In the list-based constructor: the very same list object
 * as {@code typedDependencies}.
 */
protected final List allTypedDependencies;
/** Filter tested against head-word values; a failing word is treated as rejected punctuation. */
protected final Predicate puncFilter;
/** Filter tested against head-word tags, always in conjunction with {@code puncFilter}. */
protected final Predicate tagFilter;
/**
* The root Tree node for this GrammaticalStructure.
*/
private final TreeGraphNode root;
/**
* A map from arbitrary integer indices to nodes.
* Written by addNodeToIndexMap and read by getNodeByIndex.
*/
private final Map indexMap = Generics.newHashMap();
/**
 * Builds a GrammaticalStructure by analyzing a parse tree and labelling it with as many
 * grammatical relation arcs as the supplied relations can identify.
 *
 * @param t A Tree to analyze
 * @param relations A set of GrammaticalRelations to consider
 * @param relationsLock Held while the relations are applied to the tree, to make iterating
 *                      over {@code relations} thread-safe; may be null, in which case no
 *                      locking is done
 * @param transformer A tree transformer to apply to the tree before converting (may be
 *                    null if no transformer is required). It must return a TreeGraphNode.
 * @param hf A HeadFinder for analysis; must not be null
 * @param puncFilter A Filter to reject punctuation. To delete punctuation
 *                   dependencies, this filter should return false on
 *                   punctuation word strings, and true otherwise.
 *                   If punctuation dependencies should be kept, you
 *                   should pass in a {@code Filters.acceptFilter()}.
 * @param tagFilter A filter over tags. It is applied to the tag of a candidate dependent's
 *                  head word, together with {@code puncFilter}; a node failing either
 *                  filter gets no dependency.
 */
public GrammaticalStructure(Tree t, Collection relations,
                            Lock relationsLock, TreeTransformer transformer,
                            HeadFinder hf, Predicate puncFilter,
                            Predicate tagFilter) {
  // TODO: create the tree and reuse the leaf labels in one pass,
  // avoiding a wasteful copy of the labels.
  TreeGraphNode copiedTree = new TreeGraphNode(t, (TreeGraphNode) null);
  Trees.setLeafLabels(copiedTree, t.yield());
  Trees.setLeafTagsIfUnset(copiedTree);

  // The (optionally transformed) copy becomes the root of this structure.
  TreeGraphNode effectiveRoot = copiedTree;
  if (transformer != null) {
    Tree transformed = transformer.transformTree(copiedTree);
    if (!(transformed instanceof TreeGraphNode)) {
      throw new RuntimeException("Transformer did not change TreeGraphNode into another TreeGraphNode: " + transformer);
    }
    effectiveRoot = (TreeGraphNode) transformed;
  }
  this.root = effectiveRoot;

  indexNodes(this.root);

  // add head word and tag to phrase nodes
  if (hf == null) {
    throw new AssertionError("Cannot use null HeadFinder");
  }
  root.percolateHeads(hf);
  if (root.value() == null) {
    root.setValue("ROOT");  // todo: cdm: it doesn't seem like this line should be here
  }

  // add dependencies, using heads
  this.puncFilter = puncFilter;
  this.tagFilter = tagFilter;
  NoPunctTypedDependencyFilter puncTypedDepFilter = new NoPunctTypedDependencyFilter(puncFilter, tagFilter);

  DirectedMultiGraph basicGraph = new DirectedMultiGraph<>();
  DirectedMultiGraph completeGraph = new DirectedMultiGraph<>();

  // analyze the root (and its descendants, recursively), holding the lock if one was given
  final boolean useLock = relationsLock != null;
  if (useLock) {
    relationsLock.lock();
  }
  try {
    analyzeNode(root, root, relations, hf, puncFilter, tagFilter, basicGraph, completeGraph);
  } finally {
    if (useLock) {
      relationsLock.unlock();
    }
  }

  attachStrandedNodes(root, root, false, puncFilter, tagFilter, basicGraph);

  // add typed dependencies: the basic ones first, then a copy extended with the extras
  this.typedDependencies = getDeps(puncTypedDepFilter, basicGraph);
  this.allTypedDependencies = Generics.newArrayList(this.typedDependencies);
  getExtraDeps(this.allTypedDependencies, puncTypedDepFilter, completeGraph);
}
/**
 * Gives every node of the subtree rooted at {@code tree} an integer index.
 * The leaves are handled first, from left to right, counting from 1
 * (a leaf that already carries an index keeps it). The remaining
 * nodes are then numbered in a pre-order traversal, continuing from
 * the first index the leaf pass left unassigned.
 */
private void indexNodes(TreeGraphNode tree) {
  int firstFreeIndex = indexLeaves(tree, 1);
  indexNodes(tree, firstFreeIndex);
}
/**
 * Indexes the leaves of the subtree rooted at {@code tree}, left to right,
 * counting from {@code startIndex}. A leaf whose index is already
 * non-negative keeps that index, and counting continues from it.
 * Every leaf visited is also recorded in the index map.
 *
 * @param tree the subtree whose leaves are to be indexed
 * @param startIndex index for this node
 * @return the next index still unassigned
 */
private int indexLeaves(TreeGraphNode tree, int startIndex) {
  if (!tree.isLeaf()) {
    int next = startIndex;
    for (TreeGraphNode child : tree.children) {
      next = indexLeaves(child, next);
    }
    return next;
  }

  int leafIndex = tree.index();
  if (leafIndex < 0) {
    // not indexed yet: hand out the running index
    leafIndex = startIndex;
    tree.setIndex(leafIndex);
  }
  addNodeToIndexMap(leafIndex, tree);
  return leafIndex + 1;
}
/**
 * Indexes all nodes of the subtree rooted at {@code tree} in pre-order,
 * counting from {@code startIndex}. A node that already has an index is
 * left alone and consumes no number; this is what allows the leaves
 * to be indexed first and everything else afterwards.
 *
 * @param tree the subtree whose nodes are to be indexed
 * @param startIndex index for this node
 * @return the next index still unassigned
 */
private int indexNodes(TreeGraphNode tree, int startIndex) {
  int next = startIndex;
  if (tree.index() < 0) { // this node has no index yet
    addNodeToIndexMap(next, tree);
    tree.setIndex(next);
    next++;
  }
  if (tree.isLeaf()) {
    return next;
  }
  for (TreeGraphNode child : tree.children) {
    next = indexNodes(child, next);
  }
  return next;
}
/**
 * Records {@code node} in the index map under {@code index}, replacing
 * whatever was stored under that index before. The indexing methods of
 * this class call this for every node they visit, so clients normally
 * have no need for it.
 *
 * @param index the arbitrary integer index
 * @param node the {@code TreeGraphNode} to be indexed
 */
private void addNodeToIndexMap(int index, TreeGraphNode node) {
  Integer key = index;
  indexMap.put(key, node);
}
/**
 * Looks up the node of this treegraph stored under the given integer index.
 *
 * @param index the integer index of the node you want
 * @return the {@code TreeGraphNode} having the specified
 *         index (or {@code null} if such does not exist)
 */
private TreeGraphNode getNodeByIndex(int index) {
  Integer key = index;
  return indexMap.get(key);
}
/**
 * Gives access to the root node of the tree underlying this GrammaticalStructure.
 *
 * @return the root Tree of this GrammaticalStructure
 */
public TreeGraphNode root() {
  return this.root;
}
/**
 * Always throws: reports a dependency string that is not of the form
 * {@code type(arg-idx, arg-idx)}.
 *
 * @param dep the offending dependency string, quoted in the message
 * @throws RuntimeException unconditionally
 */
private static void throwDepFormatException(String dep) {
  final String template = "Dependencies should be for the format 'type(arg-idx, arg-idx)'. Could not parse '%s'";
  throw new RuntimeException(String.format(template, dep));
}
/**
 * Create a grammatical structure from its string representation.
 *
 * Like buildCoNLLXGrammaticalStructure,
 * this method fakes up the parts of the tree structure that are not
 * used by the grammatical relation transformation operations: every token
 * becomes a word leaf under its own POS node, and all POS nodes are
 * children of a single ROOT node.
 *
 * Note: Added by daniel cer
 *
 * @param tokens the words of the sentence, in order
 * @param posTags one tag per token, in the same order as {@code tokens}
 * @param deps dependencies written as {@code type(arg-idx, arg-idx)}, governor first.
 *             The two arguments are separated by a comma and a space, and the index is
 *             whatever follows the last '-' of an argument (apostrophes in it are ignored).
 *             Index 0 denotes ROOT and index i the i-th token, counting from 1.
 * @return a GrammaticalStructure holding exactly the given dependencies
 * @throws RuntimeException if {@code tokens} and {@code posTags} differ in size or a
 *                          dependency string does not have the expected shape
 */
public static GrammaticalStructure fromStringReps(List<String> tokens, List<String> posTags, List<String> deps) {
  if (tokens.size() != posTags.size()) {
    throw new RuntimeException(String.format(
        "tokens.size(): %d != pos.size(): %d%n", tokens.size(), posTags
            .size()));
  }

  List<TreeGraphNode> tgPOSNodes = new ArrayList<>(tokens.size());

  CoreLabel rootLabel = new CoreLabel();
  rootLabel.setValue("ROOT");

  // Position 0 holds ROOT, position i the i-th token, matching the indices used in deps.
  List<IndexedWord> nodeWords = new ArrayList<>(tokens.size() + 1);
  nodeWords.add(new IndexedWord(rootLabel));

  UniversalSemanticHeadFinder headFinder = new UniversalSemanticHeadFinder();

  Iterator<String> posIter = posTags.iterator();
  for (String wordString : tokens) {
    String posString = posIter.next();

    CoreLabel wordLabel = new CoreLabel();
    wordLabel.setWord(wordString);
    wordLabel.setValue(wordString);
    wordLabel.setTag(posString);
    TreeGraphNode word = new TreeGraphNode(wordLabel);

    CoreLabel tagLabel = new CoreLabel();
    tagLabel.setValue(posString);
    tagLabel.setWord(posString);
    TreeGraphNode pos = new TreeGraphNode(tagLabel);

    tgPOSNodes.add(pos);
    TreeGraphNode[] childArr = {word};
    pos.setChildren(childArr);
    word.setParent(pos);
    pos.percolateHeads(headFinder);

    // The IndexedWord wraps the same label object as the word leaf.
    nodeWords.add(new IndexedWord(wordLabel));
  }

  TreeGraphNode root = new TreeGraphNode(rootLabel);
  root.setChildren(tgPOSNodes.toArray(new TreeGraphNode[tgPOSNodes.size()]));
  root.setIndex(0);

  // Build list of TypedDependencies
  List<TypedDependency> tdeps = new ArrayList<>(deps.size());
  for (String depString : deps) {
    int firstBracket = depString.indexOf('(');
    if (firstBracket == -1) {
      throwDepFormatException(depString);
    }
    String type = depString.substring(0, firstBracket);

    if (depString.charAt(depString.length() - 1) != ')') {
      throwDepFormatException(depString);
    }
    String args = depString.substring(firstBracket + 1, depString.length() - 1);

    int argSep = args.indexOf(", ");
    if (argSep == -1) {
      throwDepFormatException(depString);
    }
    String parentArg = args.substring(0, argSep);
    String childArg = args.substring(argSep + 2);

    // lastIndexOf: the word part of an argument may itself contain dashes
    int parentDash = parentArg.lastIndexOf('-');
    if (parentDash == -1) {
      throwDepFormatException(depString);
    }
    int childDash = childArg.lastIndexOf('-');
    if (childDash == -1) {
      throwDepFormatException(depString);
    }

    int parentIdx = Integer.parseInt(parentArg.substring(parentDash + 1).replace("'", ""));
    int childIdx = Integer.parseInt(childArg.substring(childDash + 1).replace("'", ""));

    GrammaticalRelation grel = new GrammaticalRelation(Language.Any, type, null, DEPENDENT);
    TypedDependency tdep = new TypedDependency(grel, nodeWords.get(parentIdx), nodeWords.get(childIdx));
    tdeps.add(tdep);
  }

  // TODO add some elegant way to construct language
  // appropriate GrammaticalStructures (e.g., English, Chinese, etc.)
  return new GrammaticalStructure(tdeps, root) {
    private static final long serialVersionUID = 1L;
  };
}
/**
 * Builds a GrammaticalStructure directly from dependencies that have already been
 * computed, without analyzing any tree. The nodes under {@code root} are indexed,
 * both filters are set to accept everything, and a copy of the given list serves
 * as both the basic and the complete set of typed dependencies.
 *
 * @param projectiveDependencies the dependencies of the new structure; the list is
 *                               copied, its elements are not
 * @param root the root node of the tree that goes with the dependencies
 */
public GrammaticalStructure(List projectiveDependencies, TreeGraphNode root) {
  this.root = root;
  indexNodes(this.root);

  this.puncFilter = Filters.acceptFilter();
  this.tagFilter = Filters.acceptFilter();

  // Both fields refer to one and the same list instance.
  this.typedDependencies = new ArrayList<>(projectiveDependencies);
  this.allTypedDependencies = this.typedDependencies;
}
/**
 * Renders the tree (its pretty-printed form minus the first character),
 * followed by a "Typed Dependencies:" line and the list of typed dependencies.
 */
@Override
public String toString() {
  String prettyTree = root.toPrettyString(0).substring(1);
  return prettyTree + "Typed Dependencies:\n" + typedDependencies;
}
/**
 * Walks the subtree rooted at {@code t} and gives otherwise unattached nodes a generic
 * {@code DEPENDENT} edge in {@code basicGraph}. Leaves are ignored. A node is a candidate
 * when {@code attach} is true and its head word passes both filters; it is then linked
 * from the highest node sharing its parent's head, unless that edge already exists or
 * the graph already offers some path between {@code root} and the node.
 *
 * @param t the node currently being examined
 * @param root the root of the whole tree, used for the reachability test
 * @param attach whether {@code t} itself is a candidate; recursive calls pass true
 *               exactly when a child's head word node differs from its parent's
 * @param puncFilter tested against the value of the node's head word
 * @param tagFilter tested against the tag of the node's head word
 * @param basicGraph the basic dependency graph, modified in place
 */
private static void attachStrandedNodes(TreeGraphNode t, TreeGraphNode root, boolean attach,
                                        Predicate<String> puncFilter, Predicate<String> tagFilter,
                                        DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> basicGraph) {
  if (t.isLeaf()) {
    return;
  }

  if (attach && puncFilter.test(t.headWordNode().label().value()) &&
      tagFilter.test(t.headWordNode().label().tag())) {
    // make faster by first looking for links from parent
    // it is necessary to look for paths using all directions
    // because sometimes there are edges created from lower nodes to
    // nodes higher up
    TreeGraphNode parent = t.parent().highestNodeWithSameHead();
    if (!basicGraph.isEdge(parent, t) && basicGraph.getShortestPath(root, t, false) == null) {
      basicGraph.add(parent, t, GrammaticalRelation.DEPENDENT);
    }
  }

  for (TreeGraphNode kid : t.children()) {
    // only a child with a head of its own can be stranded
    boolean kidHasOwnHead = kid.headWordNode() != t.headWordNode();
    attachStrandedNodes(kid, root, kidHasOwnHead, puncFilter, tagFilter, basicGraph);
  }
}
/**
 * Applies every applicable relation to {@code t} and, recursively, to its descendants,
 * recording the dependencies found. Each dependency runs between the highest nodes
 * sharing the head of the governor and of the dependent. All of them go into
 * {@code completeGraph}; {@code basicGraph} only receives those that neither give the
 * dependent a second governor nor run against an existing path.
 *
 * cdm dec 2009: I changed this to automatically fail on preterminal nodes, since they
 * shouldn't match for GR parent patterns. Should speed it up.
 *
 * @param t the node to analyze
 * @param root the root of the whole tree, handed to the relations
 * @param relations the grammatical relations to try
 * @param hf the HeadFinder handed to the relations
 * @param puncFilter tested against the value of a dependent's head word; failures are skipped
 * @param tagFilter tested against the tag of a dependent's head word; failures are skipped
 * @param basicGraph receives the basic dependencies, modified in place
 * @param completeGraph receives all dependencies, modified in place
 */
private static void analyzeNode(TreeGraphNode t, TreeGraphNode root, Collection<GrammaticalRelation> relations,
                                HeadFinder hf, Predicate<String> puncFilter, Predicate<String> tagFilter,
                                DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> basicGraph,
                                DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> completeGraph) {
  if (!t.isPhrasal()) {    // don't do leaves or preterminals!
    return;
  }

  TreeGraphNode tHigh = t.highestNodeWithSameHead();
  for (GrammaticalRelation egr : relations) {
    if (!egr.isApplicable(t)) {
      continue;
    }
    for (TreeGraphNode u : egr.getRelatedNodes(t, root, hf)) {
      TreeGraphNode uHigh = u.highestNodeWithSameHead();
      if (uHigh == tHigh) {
        continue;
      }
      if (!puncFilter.test(uHigh.headWordNode().label().value()) ||
          !tagFilter.test(uHigh.headWordNode().label().tag())) {
        continue;
      }
      completeGraph.add(tHigh, uHigh, egr);

      // If there are two patterns that add dependencies, X --> Z and Y --> Z, and X dominates Y,
      // then the dependency Y --> Z is not added to the basic graph to prevent unwanted duplication.
      // Similarly, if there is already a path from X --> Y, and an expression would
      // trigger Y --> X somehow, we ignore that
      Set<TreeGraphNode> parents = basicGraph.getParents(uHigh);
      boolean noOtherGovernor = parents == null || parents.isEmpty() || parents.contains(tHigh);
      if (noOtherGovernor && basicGraph.getShortestPath(uHigh, tHigh, true) == null) {
        basicGraph.add(tHigh, uHigh, egr);
      }
    }
  }

  // now recurse into children
  for (TreeGraphNode kid : t.children()) {
    analyzeNode(kid, root, relations, hf, puncFilter, tagFilter, basicGraph, completeGraph);
  }
}
/**
 * Extends {@code deps} in place with the extra dependencies and sorts the result:
 * first whatever getExtras contributes, then the tree-based ones that getTreeDeps
 * takes from {@code completeGraph}.
 *
 * @param deps the list to extend and sort
 * @param puncTypedDepFilter passed on to getTreeDeps
 * @param completeGraph the graph of all tree dependencies found earlier
 */
private void getExtraDeps(List deps, Predicate puncTypedDepFilter, DirectedMultiGraph completeGraph) {
  // extras that depend only on the dependencies already present
  getExtras(deps);

  // extras based on the tregex patterns over the tree
  getTreeDeps(deps, completeGraph, puncTypedDepFilter, extraTreeDepFilter());

  Collections.sort(deps);
}
/**
 * Helps the constructor build the list of basic typed dependencies: one dependency per
 * governor/dependent pair of {@code basicGraph}, plus a root dependency, handed to
 * postProcessDependencies and finally sorted.
 *
 * @param puncTypedDepFilter decides whether the root dependency is acceptable; when it is
 *                           rejected, a replacement root is chosen heuristically
 * @param basicGraph the basic dependency graph built from the tree
 * @return the sorted list of basic typed dependencies
 */
private List<TypedDependency> getDeps(Predicate<TypedDependency> puncTypedDepFilter,
                                      DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> basicGraph) {
  List<TypedDependency> basicDep = Generics.newArrayList();

  for (TreeGraphNode gov : basicGraph.getAllVertices()) {
    for (TreeGraphNode dep : basicGraph.getChildren(gov)) {
      GrammaticalRelation reln = getGrammaticalRelationCommonAncestor(gov.headWordNode().label(), gov.label(), dep.headWordNode().label(), dep.label(), basicGraph.getEdges(gov, dep));
      basicDep.add(new TypedDependency(reln, new IndexedWord(gov.headWordNode().label()), new IndexedWord(dep.headWordNode().label())));
    }
  }

  // add the root
  TreeGraphNode dependencyRoot = new TreeGraphNode(new Word("ROOT"));
  dependencyRoot.setIndex(0);

  TreeGraphNode rootDep = root().headWordNode();
  if (rootDep == null) {
    // no head word on the tree root: fall back to the first leaf (or its head word node)
    List<Tree> leaves = Trees.leaves(root());
    if (!leaves.isEmpty()) {
      Tree leaf = leaves.get(0);
      if (!(leaf instanceof TreeGraphNode)) {
        throw new AssertionError("Leaves should be TreeGraphNodes");
      }
      rootDep = (TreeGraphNode) leaf;
      if (rootDep.headWordNode() != null) {
        rootDep = rootDep.headWordNode();
      }
    }
  }

  if (rootDep != null) {
    TypedDependency rootTypedDep = new TypedDependency(ROOT, new IndexedWord(dependencyRoot.label()), new IndexedWord(rootDep.label()));
    if (puncTypedDepFilter.test(rootTypedDep)) {
      basicDep.add(rootTypedDep);
    } else { // Root is a punctuation character
      /* Heuristic to find a root for the graph.
       * Make the first child of the current root the
       * new root and attach all other children to
       * the new root.
       */
      IndexedWord rejectedRoot = rootTypedDep.dep();
      IndexedWord newRoot = null;
      Collections.sort(basicDep);
      for (TypedDependency td : basicDep) {
        if (!td.gov().equals(rejectedRoot)) {
          continue;
        }
        if (newRoot != null) {
          td.setGov(newRoot);
        } else {
          // NOTE(review): this assigns the governor to itself. Presumably the governor of
          // the new root dependency was meant to be the artificial ROOT node
          // (rootTypedDep.gov()) - confirm before changing; kept as is to preserve output.
          td.setGov(td.gov());
          td.setReln(ROOT);
          newRoot = td.dep();
        }
      }
    }
  }

  postProcessDependencies(basicDep);

  Collections.sort(basicDep);

  return basicDep;
}
/**
 * Supplies the Filter that decides which dependencies are useful as
 * extra tree-based dependencies. This default implementation accepts
 * everything. Overriding it is useful, for example, in the
 * English dependencies, where the REL dependency serves as an
 * intermediate and should not be added during the
 * second pass over the trees for missing dependencies.
 *
 * @return the filter that extra tree-based dependencies have to pass
 */
protected Predicate extraTreeDepFilter() {
  // accept everything by default
  return Filters.acceptFilter();
}
/**
 * Hook for language-specific post-processing of the basic dependencies.
 * For example, English might replace "rel" dependencies
 * with either dobj or pobj depending on the surrounding
 * dependencies. This default implementation leaves the list untouched.
 *
 * @param basicDep the basic dependencies; an override may modify the list
 */
protected void postProcessDependencies(List basicDep) {
  // nothing to do by default
}
/**
 * Hook for adding extra dependencies that can be derived from the existing
 * dependency structure alone, without looking at the tree.
 * For example, the English xsubj dependency can be extracted that way.
 * This default implementation adds nothing.
 *
 * @param basicDep the dependencies found so far; an override may add to the list
 */
protected void getExtras(List basicDep) {
  // nothing to add by default
}
/**
 * Adds to {@code deps} the dependencies of {@code completeGraph} which aren't
 * in the list yet and which satisfy both filters. For each governor/dependent
 * pair, the relations considered are those left by removeGrammaticalRelationAncestors.
 * Every dependency added is marked as extra.
 *
 * @param deps The list of dependencies which may be augmented
 * @param completeGraph a graph of all the tree dependencies found earlier
 * @param puncTypedDepFilter The filter that may skip punctuation dependencies
 * @param extraTreeDepFilter Additional dependencies are added only if they pass this filter
 */
protected void getTreeDeps(List<TypedDependency> deps,
                           DirectedMultiGraph<TreeGraphNode, GrammaticalRelation> completeGraph,
                           Predicate<TypedDependency> puncTypedDepFilter,
                           Predicate<TypedDependency> extraTreeDepFilter) {
  for (TreeGraphNode gov : completeGraph.getAllVertices()) {
    for (TreeGraphNode dep : completeGraph.getChildren(gov)) {
      for (GrammaticalRelation rel : removeGrammaticalRelationAncestors(completeGraph.getEdges(gov, dep))) {
        TypedDependency newDep = new TypedDependency(rel, new IndexedWord(gov.headWordNode().label()), new IndexedWord(dep.headWordNode().label()));
        boolean alreadyKnown = deps.contains(newDep);
        if (!alreadyKnown && puncTypedDepFilter.test(newDep) && extraTreeDepFilter.test(newDep)) {
          newDep.setExtra();
          deps.add(newDep);
        }
      }
    }
  }
}
private static class NoPunctFilter implements Predicate>, Serializable {
private Predicate npf;
NoPunctFilter(Predicate f) {
this.npf = f;
}
@Override
public boolean test(Dependency