![JAR search and dependency download from the Maven repository](/logo.png)
edu.stanford.nlp.semgraph.SemanticGraphUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
The newest version!
package edu.stanford.nlp.semgraph;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.LabeledWord;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.StringWriter;
import java.util.*;
import java.util.function.BiPredicate;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
* Generic utilities for dealing with Dependency graphs and other structures, useful for
* text simplification and rewriting.
*
* TODO: Migrate some of the functions (that make sense) into SemanticGraph proper.
* BUT BEWARE: This class has methods that use jgraph (as opposed to jgrapht).
* We don't want our core code to become dependent on jgraph, so methods in
* SemanticGraph shouldn't call methods in this class, and methods that use
* jgraph shouldn't be moved into SemanticGraph.
*
* @author Eric Yeh
*
*/
public class SemanticGraphUtils {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(SemanticGraphUtils.class);
// Utility class: all members are static, so instantiation is disallowed.
private SemanticGraphUtils() {}
/**
 * Builds a new SemanticGraph over the given subset of nodes from srcGraph.
 * The original vertex objects are reused, so the result can be compared
 * against the source graph by equality.
 *
 * @return the subset graph, or null if nodes is empty
 */
public static SemanticGraph makeGraphFromNodes(Collection<IndexedWord> nodes, SemanticGraph srcGraph) {
  if (nodes.isEmpty()) {
    return null;
  }
  if (nodes.size() == 1) {
    SemanticGraph singleton = new SemanticGraph();
    for (IndexedWord node : nodes) {
      singleton.addVertex(node);
    }
    return singleton;
  }
  // TODO: nodes of the subset that touch no edge within the subset are not
  // represented in the edge list, so makeFromEdges leaves them out.
  List<SemanticGraphEdge> edges = new ArrayList<>();
  for (IndexedWord gov : nodes) {
    for (IndexedWord dep : nodes) {
      Collection<SemanticGraphEdge> between = srcGraph.getAllEdges(gov, dep);
      if (between != null) {
        edges.addAll(between);
      }
    }
  }
  return SemanticGraphFactory.makeFromEdges(edges);
}
//----------------------------------------------------------------------------------------
//Query routines (obtaining sets of edges/vertices over predicates, etc)
//----------------------------------------------------------------------------------------
/**
 * Finds the vertex in sg that matches the given node on token index,
 * sentence index, and word text. Returns the first such match, or null
 * if there is none.
 */
public static IndexedWord findMatchingNode(IndexedWord node,
                                           SemanticGraph sg) {
  for (IndexedWord candidate : sg.vertexSet()) {
    boolean sameIndex = candidate.index() == node.index();
    boolean sameSent = candidate.sentIndex() == node.sentIndex();
    if (sameIndex && sameSent && candidate.word().equals(node.word())) {
      return candidate;
    }
  }
  return null;
}
/**
 * Collects the edges of the subtree reachable from vertice, excluding the
 * given edge. The accumulating set doubles as a tabu list, which guards
 * against cyclical relations (such as between an rcmod relative clause and
 * its nsubj).
 */
public static Set<SemanticGraphEdge> getSubTreeEdges(IndexedWord vertice, SemanticGraph sg, SemanticGraphEdge excludedEdge) {
  Set<SemanticGraphEdge> collected = Generics.newHashSet();
  collected.add(excludedEdge);
  getSubTreeEdgesHelper(vertice, sg, collected);
  // The excluded edge was only seeded to block traversal; it is not part of the result.
  collected.remove(excludedEdge);
  return collected;
}
/** Depth-first traversal that adds every not-yet-seen outgoing edge to tabuEdges. */
public static void getSubTreeEdgesHelper(IndexedWord vertice, SemanticGraph sg, Set<SemanticGraphEdge> tabuEdges) {
  for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertice)) {
    if (tabuEdges.contains(edge)) {
      continue; // already visited; also breaks cycles
    }
    tabuEdges.add(edge);
    getSubTreeEdgesHelper(edge.getDependent(), sg, tabuEdges);
  }
}
/**
 * Given a set of nodes from a SemanticGraph, returns the set of
 * edges that are spanned between these nodes.
 */
public static Collection<SemanticGraphEdge> getEdgesSpannedByVertices(Collection<IndexedWord> nodes, SemanticGraph sg) {
  Collection<SemanticGraphEdge> ret = Generics.newHashSet();
  for (IndexedWord n1 : nodes) {
    for (IndexedWord n2 : nodes) {
      // Compare with equals rather than reference identity (!=): callers may
      // pass vertex copies that are value-equal to the graph's own nodes.
      if (!n1.equals(n2)) {
        Collection<SemanticGraphEdge> edges = sg.getAllEdges(n1, n2);
        if (edges != null) {
          ret.addAll(edges);
        }
      }
    }
  }
  return ret;
}
/**
 * Returns a list of all children of vertex bearing a grammatical relation whose
 * name starts with the given string, relnPrefix.
 *
 * @throws IllegalArgumentException if vertex is not in the graph
 */
public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, String relnPrefix) {
  if (vertex.equals(IndexedWord.NO_WORD))
    return new ArrayList<>();
  if (!graph.containsVertex(vertex)) {
    // Include the offending vertex in the message for easier debugging.
    throw new IllegalArgumentException("Vertex " + vertex + " not in graph " + graph);
  }
  List<IndexedWord> childList = new ArrayList<>();
  for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
    if (edge.getRelation().toString().startsWith(relnPrefix)) {
      childList.add(edge.getTarget());
    }
  }
  return childList;
}
/**
 * Returns a list of all children of vertex bearing a grammatical relation whose
 * name starts with any of the given relation prefixes.
 *
 * @throws IllegalArgumentException if vertex is not in the graph
 */
public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, Collection<String> relnPrefixes) {
  if (vertex.equals(IndexedWord.NO_WORD))
    return new ArrayList<>();
  if (!graph.containsVertex(vertex)) {
    // Include the offending vertex in the message for easier debugging.
    throw new IllegalArgumentException("Vertex " + vertex + " not in graph " + graph);
  }
  List<IndexedWord> childList = new ArrayList<>();
  for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
    String edgeString = edge.getRelation().toString();
    for (String relnPrefix : relnPrefixes) {
      if (edgeString.startsWith(relnPrefix)) {
        childList.add(edge.getTarget());
        break; // a child is added at most once, even if several prefixes match
      }
    }
  }
  return childList;
}
/**
 * Since graphs can have preps collapsed, finds all the immediate children of
 * this node that are linked by a collapsed-preposition ("prep*") edge.
 */
public static List<IndexedWord> getChildrenWithPrepC(SemanticGraph sg, IndexedWord vertex) {
  List<IndexedWord> prepChildren = new ArrayList<>();
  for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertex)) {
    if (edge.getRelation().toString().startsWith("prep")) {
      prepChildren.add(edge.getDependent());
    }
  }
  return prepChildren;
}
/**
 * Returns the incoming edges of the given node, in the given graph, whose
 * relation matches reln (see {@code edgesWithReln} for match semantics).
 */
public static List<SemanticGraphEdge> incomingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
  Iterable<SemanticGraphEdge> incoming = sg.incomingEdgeIterable(node);
  return edgesWithReln(incoming, reln);
}
/**
 * Returns the outgoing edges of the given node, in the given graph, whose
 * relation matches reln (see {@code edgesWithReln} for match semantics).
 */
public static List<SemanticGraphEdge> outgoingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
  Iterable<SemanticGraphEdge> outgoing = sg.outgoingEdgeIterable(node);
  return edgesWithReln(outgoing, reln);
}
/**
 * Filters the given edges down to those whose relation equals reln.
 */
public static List<SemanticGraphEdge> edgesWithReln(Iterable<SemanticGraphEdge> edges,
                                                    GrammaticalRelation reln) {
  List<SemanticGraphEdge> matches = Generics.newArrayList();
  for (SemanticGraphEdge candidate : edges) {
    if (candidate.getRelation().equals(reln)) {
      matches.add(candidate);
    }
  }
  return matches;
}
/**
 * Given a semantic graph and a relation prefix, returns every edge whose
 * relation name starts with the given prefix (e.g. prefix "prep" yields all
 * the prep relations: prep_by, prep_in, etc.).
 */
public static List<SemanticGraphEdge> findAllRelnsWithPrefix(SemanticGraph sg, String prefix) {
  List<SemanticGraphEdge> matches = new ArrayList<>();
  for (SemanticGraphEdge edge : sg.edgeIterable()) {
    if (edge.getRelation().toString().startsWith(prefix)) {
      matches.add(edge);
    }
  }
  return matches;
}
/**
 * Finds the descendants of the given node in graph, avoiding the given set of nodes.
 */
public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu) {
  if (!sg.containsVertex(vertex)) {
    throw new IllegalArgumentException();
  }
  Set<IndexedWord> found = Generics.newHashSet();
  // Depth-first search; the null cast selects the Predicate-based helper overload.
  tabuDescendantsHelper(sg, vertex, found, tabu, null, (Predicate<IndexedWord>) null);
  return found;
}
/**
 * Finds the descendants of vertex, skipping the tabu nodes and never crossing
 * an edge whose relation is in tabuRelns. NOTE: edges are encountered in the
 * downward direction, governor to dependent.
 */
public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu,
                                               Collection<GrammaticalRelation> tabuRelns) {
  if (!sg.containsVertex(vertex)) {
    throw new IllegalArgumentException();
  }
  Set<IndexedWord> found = Generics.newHashSet();
  // Depth-first search; the null cast selects the Predicate-based helper overload.
  tabuDescendantsHelper(sg, vertex, found, tabu, tabuRelns, (Predicate<IndexedWord>) null);
  return found;
}
/** Descendants of vertex, never crossing an edge whose relation is in tabuRelns. */
public static Set<IndexedWord> descendantsTabuRelns(SemanticGraph sg, IndexedWord vertex,
                                                    Collection<GrammaticalRelation> tabuRelns) {
  if (!sg.containsVertex(vertex)) {
    throw new IllegalArgumentException();
  }
  Set<IndexedWord> found = Generics.newHashSet();
  // No tabu nodes: pass a fresh empty set.
  tabuDescendantsHelper(sg, vertex, found, Generics.newHashSet(), tabuRelns, (Predicate<IndexedWord>) null);
  return found;
}
/** Descendants of vertex, skipping tabu relations and children rejected by tabuTest. */
public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
                                                           Collection<GrammaticalRelation> tabuRelns, Predicate<IndexedWord> tabuTest) {
  if (!sg.containsVertex(vertex)) {
    throw new IllegalArgumentException();
  }
  Set<IndexedWord> found = Generics.newHashSet();
  // No tabu nodes: pass a fresh empty set.
  tabuDescendantsHelper(sg, vertex, found, Generics.newHashSet(), tabuRelns, tabuTest);
  return found;
}
/** Descendants of vertex, skipping tabu nodes, tabu relations, and children rejected by tabuTest. */
public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
                                                           Collection<IndexedWord> tabuNodes, Collection<GrammaticalRelation> tabuRelns, Predicate<IndexedWord> tabuTest) {
  if (!sg.containsVertex(vertex)) {
    throw new IllegalArgumentException();
  }
  Set<IndexedWord> found = Generics.newHashSet();
  tabuDescendantsHelper(sg, vertex, found, tabuNodes, tabuRelns, tabuTest);
  return found;
}
/**
 * Descendants of vertex, skipping tabu nodes, tabu relations, and children
 * rejected by the (node, graph) BiPredicate test.
 */
public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
                                                           Collection<IndexedWord> tabuNodes, Collection<GrammaticalRelation> tabuRelns,
                                                           BiPredicate<IndexedWord, SemanticGraph> tabuTest) {
  if (!sg.containsVertex(vertex)) {
    throw new IllegalArgumentException();
  }
  Set<IndexedWord> found = Generics.newHashSet();
  tabuDescendantsHelper(sg, vertex, found, tabuNodes, tabuRelns, tabuTest);
  return found;
}
/**
 * Depth-first cull of the descendants of curr: skips tabu nodes, never
 * crosses an edge whose relation is in relnsToAvoid, and never descends to a
 * child rejected by tabuTest. Visited nodes accumulate in descendantSet,
 * which also prevents revisiting on cycles.
 */
private static void tabuDescendantsHelper(SemanticGraph sg, IndexedWord curr, Set<IndexedWord> descendantSet, Collection<IndexedWord> tabu,
                                          Collection<GrammaticalRelation> relnsToAvoid, Predicate<IndexedWord> tabuTest) {
  if (tabu.contains(curr) || descendantSet.contains(curr)) {
    return;
  }
  descendantSet.add(curr);
  for (IndexedWord child : sg.getChildren(curr)) {
    for (SemanticGraphEdge edge : sg.getAllEdges(curr, child)) {
      boolean relnBlocked = relnsToAvoid != null && relnsToAvoid.contains(edge.getRelation());
      boolean childBlocked = tabuTest != null && tabuTest.test(edge.getDependent());
      if (!relnBlocked && !childBlocked) {
        tabuDescendantsHelper(sg, child, descendantSet, tabu, relnsToAvoid, tabuTest);
      }
    }
  }
}
/**
 * Depth-first cull of the descendants of curr: skips tabu nodes, never
 * crosses an edge whose relation is in relnsToAvoid, and never descends to a
 * child rejected by the (node, graph) tabuTest. Visited nodes accumulate in
 * descendantSet, which also prevents revisiting on cycles.
 */
private static void tabuDescendantsHelper(SemanticGraph sg, IndexedWord curr, Set<IndexedWord> descendantSet, Collection<IndexedWord> tabu,
                                          Collection<GrammaticalRelation> relnsToAvoid, BiPredicate<IndexedWord, SemanticGraph> tabuTest) {
  if (tabu.contains(curr) || descendantSet.contains(curr)) {
    return;
  }
  descendantSet.add(curr);
  for (IndexedWord child : sg.getChildren(curr)) {
    for (SemanticGraphEdge edge : sg.getAllEdges(curr, child)) {
      boolean relnBlocked = relnsToAvoid != null && relnsToAvoid.contains(edge.getRelation());
      boolean childBlocked = tabuTest != null && tabuTest.test(edge.getDependent(), sg);
      if (!relnBlocked && !childBlocked) {
        tabuDescendantsHelper(sg, child, descendantSet, tabu, relnsToAvoid, tabuTest);
      }
    }
  }
}
//------------------------------------------------------------------------------------
//"Constituent" extraction and manipulation
//------------------------------------------------------------------------------------
/**
 * Returns the vertex that is "leftmost", i.e. the minimum descendant of
 * startNode under IndexedWord's natural (position-based) ordering. Note this
 * requires that the IndexedWords present actually have ordering information.
 *
 * @throws NoSuchElementException if startNode has no descendants at all
 */
public static IndexedWord leftMostChildVertice(IndexedWord startNode, SemanticGraph sg) {
  // Single O(n) scan instead of building a full TreeSet just to read first().
  return Collections.min(sg.descendants(startNode));
}
/**
 * Returns the vertices that are "leftmost, rightmost": the minimum and
 * maximum descendants of startNode under IndexedWord's natural ordering.
 * Note this requires that the IndexedWords present actually have ordering
 * information.
 *
 * @throws NoSuchElementException if startNode has no descendants at all
 */
public static Pair<IndexedWord, IndexedWord> leftRightMostChildVertices(IndexedWord startNode, SemanticGraph sg) {
  // Two O(n) scans instead of building a full TreeSet just for first()/last().
  Collection<IndexedWord> descendants = sg.descendants(startNode);
  return Pair.makePair(Collections.min(descendants), Collections.max(descendants));
}
/**
 * Given a SemanticGraph and a set of nodes, finds the "blanket" of nodes that
 * are one edge away (in either direction) from the set of nodes passed in.
 * This is similar to the idea of a Markov Blanket, except in the context of a
 * SemanticGraph. Asserted nodes themselves are never part of the result.
 */
public static Collection<IndexedWord> getDependencyBlanket(SemanticGraph sg, Collection<IndexedWord> assertedNodes) {
  Set<IndexedWord> retSet = Generics.newHashSet();
  for (IndexedWord curr : sg.vertexSet()) {
    if (assertedNodes.contains(curr)) {
      continue;
    }
    for (IndexedWord assertedNode : assertedNodes) {
      if (sg.containsEdge(assertedNode, curr) || sg.containsEdge(curr, assertedNode)) {
        retSet.add(curr);
        break; // already in the blanket; no need to test the remaining asserted nodes
      }
    }
  }
  return retSet;
}
/**
 * Resets the indices for the vertices in the graph, using the current
 * ordering returned by vertexList (presumably in order). This is to ensure
 * accesses to the InfoFile word table do not fall off after a SemanticGraph has
 * been edited.
 *
 * NOTE: the vertices will be replaced, as JGraphT does not permit
 * in-place modification of the nodes. (TODO: we no longer use
 * JGraphT, so this should be fixed)
 */
public static SemanticGraph resetVerticeOrdering(SemanticGraph sg) {
SemanticGraph nsg = new SemanticGraph();
List vertices = sg.vertexListSorted();
int index = 1;
Map oldToNewVertices = Generics.newHashMap();
List newVertices = new ArrayList<>();
// Copy each vertex in sorted order, renumbering consecutively from 1.
for (IndexedWord vertex : vertices) {
IndexedWord newVertex = new IndexedWord(vertex);
newVertex.setIndex(index++);
oldToNewVertices.put(vertex, newVertex);
///sg.removeVertex(vertex);
newVertices.add(newVertex);
}
for (IndexedWord nv : newVertices) {
nsg.addVertex(nv);
}
// Roots must be remapped to their renumbered copies.
List newRoots = new ArrayList<>();
for (IndexedWord or : sg.getRoots()) {
newRoots.add(oldToNewVertices.get(or));
}
nsg.setRoots(newRoots);
// Re-create every edge between the corresponding new vertices.
for (SemanticGraphEdge edge : sg.edgeIterable()) {
IndexedWord newGov = oldToNewVertices.get(edge.getGovernor());
IndexedWord newDep = oldToNewVertices.get(edge.getDependent());
nsg.addEdge(newGov, newDep, edge.getRelation(), edge.getWeight(), edge.isExtra());
}
return nsg;
}
/**
 * Given a graph, ensures all edges are EnglishGrammaticalRelations.
 * NOTE: this is English specific.
 * NOTE: currently EnglishGrammaticalRelations does not link collapsed prep string forms
 * back to their object forms, for its valueOf relation. This may need to be repaired if
 * generated edges indeed do have collapsed preps as strings.
 */
private static void enRepairEdges(SemanticGraph sg, boolean verbose) {
  // Snapshot the edges first: we remove/add edges below, and mutating the
  // graph while iterating edgeIterable() risks invalidating the iterator.
  List<SemanticGraphEdge> edges = new ArrayList<>();
  for (SemanticGraphEdge edge : sg.edgeIterable()) {
    edges.add(edge);
  }
  for (SemanticGraphEdge edge : edges) {
    if (!edge.getRelation().isFromString()) {
      continue;
    }
    GrammaticalRelation newReln = EnglishGrammaticalRelations.valueOf(edge.getRelation().toString());
    if (newReln != null) {
      // Swap the string-form edge for an equivalent object-form edge.
      IndexedWord gov = edge.getGovernor();
      IndexedWord dep = edge.getDependent();
      double weight = edge.getWeight();
      boolean isExtra = edge.isExtra();
      sg.removeEdge(edge);
      sg.addEdge(gov, dep, newReln, weight, isExtra);
    } else if (verbose) {
      log.info("Warning, could not find matching GrammaticalRelation for reln=" + edge.getRelation());
    }
  }
}
/** Convenience overload of {@code enRepairEdges(SemanticGraph, boolean)} with verbose = false. */
public static void enRepairEdges(SemanticGraph sg) {
enRepairEdges(sg, false);
}
/**
 * Deletes all nodes that are not reachable from any root (such as dangling
 * vertices left behind after a series of edges have been chopped).
 */
public static void killNonRooted(SemanticGraph sg) {
  // Everything reachable from a root survives.
  Set<IndexedWord> reachable = Generics.newHashSet();
  for (IndexedWord root : sg.getRoots()) {
    reachable.add(root);
    reachable.addAll(sg.descendants(root));
  }
  // Snapshot the vertex set first so removal does not disturb iteration.
  for (IndexedWord node : new ArrayList<>(sg.vertexSet())) {
    if (!reachable.contains(node)) {
      sg.removeVertex(node);
    }
  }
}
/**
 * Replaces a node in the given SemanticGraph with the new node,
 * replacing its position in the node edges. If oldNode is not present in the
 * graph, logs a message and leaves the graph unchanged.
 */
public static void replaceNode(IndexedWord newNode, IndexedWord oldNode, SemanticGraph sg) {
// Obtain the edges where the old node was the governor and the dependent.
// Remove the old node, insert the new, and re-insert the edges.
// Save the edges in a list so that remove operations don't affect
// the iterator or our ability to find the edges in the first place
List govEdges = sg.outgoingEdgeList(oldNode);
List depEdges = sg.incomingEdgeList(oldNode);
boolean oldNodeRemoved = sg.removeVertex(oldNode);
if (oldNodeRemoved) {
// If the new node is not present, be sure to add it in.
if (!sg.containsVertex(newNode)) {
sg.addVertex(newNode);
}
// Re-attach each outgoing edge with newNode as governor.
// NOTE(review): the removeEdge calls below are likely no-ops if removeVertex
// already dropped the incident edges — confirm against SemanticGraph's contract.
for (SemanticGraphEdge govEdge : govEdges) {
sg.removeEdge(govEdge);
sg.addEdge(newNode, govEdge.getDependent(), govEdge.getRelation(), govEdge.getWeight(), govEdge.isExtra());
}
// Re-attach each incoming edge with newNode as dependent.
for (SemanticGraphEdge depEdge : depEdges) {
sg.removeEdge(depEdge);
sg.addEdge(depEdge.getGovernor(), newNode, depEdge.getRelation(), depEdge.getWeight(), depEdge.isExtra());
}
} else {
log.info("SemanticGraphUtils.replaceNode: previous node does not exist");
}
}
/** Token used to mark a wildcard vertex in replaced edges (see makeReplacedEdges). */
public static final String WILDCARD_VERTICE_TOKEN = "WILDCARD";
/** Shared wildcard vertex; its word, value, and original text are all "*". */
public static final IndexedWord WILDCARD_VERTICE = new IndexedWord();
static {
WILDCARD_VERTICE.setWord("*");
WILDCARD_VERTICE.setValue("*");
WILDCARD_VERTICE.setOriginalText("*");
}
/**
 * Given an iterable set of distinct vertices, creates a new mapping from each
 * original vertex to a "generic" copy whose word/value/original text is
 * prefix + running index (starting at 1) and whose lemma is cleared. Used for
 * generalizing tokens in discovered rules.
 *
 * @param verts Vertices to anonymize
 * @param prefix Prefix to assign to this anonymization
 */
public static Map<IndexedWord, IndexedWord> anonymyizeNodes(Iterable<IndexedWord> verts, String prefix) {
  Map<IndexedWord, IndexedWord> anonMap = Generics.newHashMap();
  int counter = 0;
  for (IndexedWord original : verts) {
    counter++;
    IndexedWord generic = new IndexedWord(original);
    generic.set(CoreAnnotations.LemmaAnnotation.class, "");
    String label = prefix + counter;
    generic.setValue(label);
    generic.setWord(label);
    generic.setOriginalText(label);
    anonMap.put(original, generic);
  }
  return anonMap;
}
/** Prefix used when anonymizing nodes shared between two graphs. */
public static final String SHARED_NODE_ANON_PREFIX ="A";
/** Prefix used when anonymizing "blanket" nodes (see getDependencyBlanket). */
public static final String BLANKET_NODE_ANON_PREFIX ="B";
/**
 * Used to make a mapping that lets you create "anonymous" versions of shared nodes between two
 * graphs (given in the arg) using the shared prefix ({@code SHARED_NODE_ANON_PREFIX}).
 */
public static Map makeGenericVertices(Iterable verts) {
return anonymyizeNodes(verts, SHARED_NODE_ANON_PREFIX);
}
/**
 * Used to assign generic labels to the nodes in the "blanket" for a set of vertices in a graph
 * (using {@code BLANKET_NODE_ANON_PREFIX}). Here, a "blanket" node is
 * similar to nodes in a Markov Blanket, i.e. nodes that are one edge away from a set of
 * asserted vertices in a SemanticGraph.
 */
public static Map makeBlanketVertices(Iterable verts) {
return anonymyizeNodes(verts, BLANKET_NODE_ANON_PREFIX);
}
/**
 * Given a set of edges and a mapping from target to replacement vertices,
 * returns a new set of edges with the replacement vertices substituted in.
 * When a vertex has no replacement: if useGenericReplacement is true, a
 * wildcard copy of the vertex (text, original text, and lemma set to
 * WILDCARD_VERTICE_TOKEN) is used in its place (i.e. it can be anything);
 * otherwise the original vertex is kept.
 *
 * Currently used to generate "generic" versions of SemanticGraphs, when
 * given a list of generic vertices to replace with, but can conceivably be
 * used for other purposes where vertices must be replaced.
 */
public static List<SemanticGraphEdge> makeReplacedEdges(Iterable<SemanticGraphEdge> edges, Map<IndexedWord, IndexedWord> vertReplacementMap,
                                                        boolean useGenericReplacement) {
  List<SemanticGraphEdge> replaced = new ArrayList<>();
  for (SemanticGraphEdge edge : edges) {
    IndexedWord newGov = resolveReplacement(edge.getGovernor(), vertReplacementMap, useGenericReplacement);
    IndexedWord newDep = resolveReplacement(edge.getDependent(), vertReplacementMap, useGenericReplacement);
    replaced.add(new SemanticGraphEdge(newGov, newDep, edge.getRelation(), edge.getWeight(), edge.isExtra()));
  }
  return replaced;
}

/** Looks up a replacement for vert, falling back to a wildcard copy or the original vertex. */
private static IndexedWord resolveReplacement(IndexedWord vert, Map<IndexedWord, IndexedWord> map, boolean useGenericReplacement) {
  IndexedWord replacement = map.get(vert);
  if (replacement != null) {
    return replacement;
  }
  if (!useGenericReplacement) {
    return vert;
  }
  IndexedWord wildcard = new IndexedWord(vert);
  wildcard.set(CoreAnnotations.TextAnnotation.class, WILDCARD_VERTICE_TOKEN);
  wildcard.set(CoreAnnotations.OriginalTextAnnotation.class, WILDCARD_VERTICE_TOKEN);
  wildcard.set(CoreAnnotations.LemmaAnnotation.class, WILDCARD_VERTICE_TOKEN);
  return wildcard;
}
/**
 * Given a set of vertices from the same graph, returns every edge incident to
 * any of those vertices — all outgoing and incoming edges of each vertex.
 * Note this therefore includes edges connecting a listed vertex to a vertex
 * outside the given set, not just edges between members of the set.
 */
public static Set allEdgesInSet(Iterable vertices, SemanticGraph sg) {
Set edges = Generics.newHashSet();
for (IndexedWord v1 : vertices) {
for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(v1)) {
edges.add(edge);
}
for (SemanticGraphEdge edge : sg.incomingEdgeIterable(v1)) {
edges.add(edge);
}
}
return edges;
}
/**
 * Given two collections of edges, returns the set of edges found in both (by
 * the supplied equality), the edges only in the first, and the edges only in
 * the second. Matching is one-to-one: each edge in one collection can be
 * paired with at most one edge in the other, so duplicates are accounted for
 * symmetrically.
 */
public static EdgeDiffResult diffEdges(Collection<SemanticGraphEdge> edges1, Collection<SemanticGraphEdge> edges2,
                                       SemanticGraph sg1, SemanticGraph sg2,
                                       ISemanticGraphEdgeEql compareObj) {
  Set<SemanticGraphEdge> remainingEdges1 = Generics.newHashSet();
  Set<SemanticGraphEdge> remainingEdges2 = Generics.newHashSet();
  Set<SemanticGraphEdge> sameEdges = Generics.newHashSet();
  List<SemanticGraphEdge> edges2Cache = new ArrayList<>(edges2);
  edge1Loop:
  for (SemanticGraphEdge edge1 : edges1) {
    for (SemanticGraphEdge edge2 : edges2Cache) {
      if (compareObj.equals(edge1, edge2, sg1, sg2)) {
        sameEdges.add(edge1);
        // Consume the matched edge so it cannot pair with another edge1.
        edges2Cache.remove(edge2);
        continue edge1Loop;
      }
    }
    remainingEdges1.add(edge1);
  }
  List<SemanticGraphEdge> edges1Cache = new ArrayList<>(edges1);
  edge2Loop:
  for (SemanticGraphEdge edge2 : edges2) {
    // BUG FIX: match against the shrinking edges1Cache (previously the full
    // edges1 was scanned and the cache was never read), so that the pairing
    // is one-to-one and symmetric with the first pass: a single edge1 can no
    // longer absorb multiple duplicate matches from edges2.
    for (SemanticGraphEdge edge1 : edges1Cache) {
      if (compareObj.equals(edge1, edge2, sg1, sg2)) {
        edges1Cache.remove(edge1);
        continue edge2Loop;
      }
    }
    remainingEdges2.add(edge2);
  }
  return new EdgeDiffResult(sameEdges, remainingEdges1, remainingEdges2);
}
/**
 * Result holder for {@code diffEdges}: the edges common to both inputs, plus
 * the edges remaining only in the first and only in the second input.
 */
public static class EdgeDiffResult {
// Edges found (by the supplied equality) in both collections.
Set sameEdges;
// Edges present only in the first collection.
Set remaining1;
// Edges present only in the second collection.
Set remaining2;
public EdgeDiffResult(Set sameEdges,
Set remaining1,
Set remaining2) {
this.sameEdges = sameEdges;
this.remaining1 = remaining1;
this.remaining2 = remaining2;
}
public Set getRemaining1() {
return remaining1;
}
public Set getRemaining2() {
return remaining2;
}
public Set getSameEdges() {
return sameEdges;
}
}
// Pretty printers

/**
 * Renders each edge as {@code "\treln(gov, dep)\n"} and returns the
 * concatenation.
 */
public static String printEdges(Iterable<SemanticGraphEdge> edges) {
  StringBuilder out = new StringBuilder();
  for (SemanticGraphEdge edge : edges) {
    out.append('\t')
       .append(edge.getRelation().toString())
       .append('(')
       .append(edge.getGovernor().toString())
       .append(", ")
       .append(edge.getDependent().toString())
       .append(")\n");
  }
  return out.toString();
}
/** Formatting options for {@code printVertices(SemanticGraph, PrintVerticeParams)}. */
public static class PrintVerticeParams {
// Include the word text.
public boolean showWord = true;
// Include the numeric token index (as "N:").
public boolean showIndex = true;
// Include the sentence index (as "sN/").
public boolean showSentIndex = false;
// Include the POS tag (as "TAG/").
public boolean showPOS = false;
// Number of vertices printed per line before wrapping.
public int wrapAt = 8;
}
/** Prints the graph's vertices using the default {@code PrintVerticeParams} options. */
public static String printVertices(SemanticGraph sg) {
return printVertices(sg, new PrintVerticeParams());
}
/**
 * Prints the graph's vertices in sorted order, one space-separated entry per
 * vertex, formatted according to params and wrapped every params.wrapAt
 * entries.
 */
public static String printVertices(SemanticGraph sg, PrintVerticeParams params) {
  StringBuilder out = new StringBuilder();
  int count = 0;
  for (IndexedWord word : sg.vertexListSorted()) {
    count++;
    if (count % params.wrapAt == 0) {
      out.append("\n\t");
    }
    if (params.showIndex) {
      out.append(word.index()).append(':');
    }
    if (params.showSentIndex) {
      out.append('s').append(word.sentIndex()).append('/');
    }
    if (params.showPOS) {
      out.append(word.tag()).append('/');
    }
    if (params.showWord) {
      out.append(word.word());
    }
    out.append(' ');
  }
  return out.toString();
}
/**
 * Given a SemanticGraph, creates a SemgrexPattern string based off of this graph.
 * NOTE: the word() value of the vertice is the name to reference
 * NOTE: currently presumes there is only one root in this graph.
 * TODO: see if Semgrex can allow multiroot patterns
 * @param sg SemanticGraph to base this pattern on.
 * @param matchTag whether each node pattern should match on the POS tag
 * @param matchWord whether each node pattern should match on the word text
 * @param nodeNameMap map from vertex to the name it should receive in the pattern
 */
public static String semgrexFromGraph(SemanticGraph sg, boolean matchTag, boolean matchWord,
Map nodeNameMap) throws Exception {
return semgrexFromGraph(sg, null, matchTag, matchWord, nodeNameMap);
}
/**
 * Creates a Semgrex pattern from the graph, rendering each (non-wildcard)
 * node as an attribute pattern built from its word and/or tag, e.g.
 * {@code {word: /w/; tag: T}}.
 */
public static String semgrexFromGraph(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
                                      boolean useTag, boolean useWord, Map<IndexedWord, String> nodeNameMap) throws Exception {
  Function<IndexedWord, String> transformNode = o -> {
    // BUG FIX: the tag branch previously overwrote the accumulated string
    // ("str = ..." instead of appending), which clobbered the word attribute
    // and dropped the opening brace when only the tag was requested.
    List<String> attrs = new ArrayList<>();
    if (useWord) {
      attrs.add("word: /" + Pattern.quote(o.word()) + "/");
    }
    if (useTag) {
      attrs.add("tag: " + o.tag());
    }
    if (attrs.isEmpty()) {
      return "";
    }
    return "{" + String.join("; ", attrs) + "}";
  };
  return semgrexFromGraph(sg, wildcardNodes, nodeNameMap, transformNode);
}
/**
 * Creates a Semgrex pattern from the graph, starting at its first root.
 * nodeValuesTranformation converts a vertex (IndexedWord) to the node-value
 * string used in the pattern; for an example, see the {@code semgrexFromGraph}
 * overloads (if useWord and useTag is true, the value is
 * "{word: vertex.word; tag: vertex.tag}").
 */
public static String semgrexFromGraph(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
                                      Map<IndexedWord, String> nodeNameMap, Function<IndexedWord, String> wordTransformation) {
  IndexedWord patternRoot = sg.getFirstRoot();
  Set<IndexedWord> tabu = Generics.newHashSet();
  Set<SemanticGraphEdge> seenEdges = Generics.newHashSet();
  return semgrexFromGraphHelper(patternRoot, sg, tabu, seenEdges, true, true, wildcardNodes,
      nodeNameMap, false, wordTransformation);
}
/**
 * Given a set of edges that form a rooted and connected graph, returns a Semgrex pattern
 * corresponding to it (by first assembling the edges into a SemanticGraph).
 * @throws Exception
 */
public static String semgrexFromGraph(Iterable edges, boolean matchTag,
boolean matchWord, Map nodeNameMap) throws Exception {
SemanticGraph sg = SemanticGraphFactory.makeFromEdges(edges);
return semgrexFromGraph(sg, matchTag, matchWord, nodeNameMap);
}
/**
 * Recursive call to generate the Semgrex pattern based off of this SemanticGraph.
 * nodeValuesTranformation is a function that converts a vertex (IndexedWord) to the value. For an example, see {@code semgrexFromGraph}
 * function implementations. Visited vertices accumulate in tabu; visited edges
 * in seenEdges (whose size also numbers the named edges when nameEdges is true).
 */
protected static String semgrexFromGraphHelper(IndexedWord vertice, SemanticGraph sg,
Set tabu, Set seenEdges, boolean useWordAsLabel, boolean nameEdges, Collection wildcardNodes,
Map nodeNameMap, boolean orderedNodes, Function nodeValuesTransformation) {
StringWriter buf = new StringWriter();
// If the node is a wildcarded one, treat it as a {}, meaning any match. Currently these will not
// be labeled, but this may change later.
if (wildcardNodes != null && wildcardNodes.contains(vertice)) {
buf.append("{}");
} else {
// Render this node's attribute pattern via the caller-supplied transformation.
String vertexStr = nodeValuesTransformation.apply(vertice);
if(vertexStr != null && !vertexStr.isEmpty()){
buf.append(vertexStr);
}
// buf.append("{");
// int i = 0;
// for(String corekey: useNodeCoreAnnotations){
// AnnotationLookup.KeyLookup lookup = AnnotationLookup.getCoreKey(corekey);
// assert lookup != null : "Invalid key " + corekey;
// if(i > 0)
// buf.append("; ");
// String value = vertice.containsKey(lookup.coreKey) ? vertice.get(lookup.coreKey).toString() : "null";
// buf.append(corekey+":"+nodeValuesTransformation.apply(value));
// i++;
// }
// if (useTag) {
//
// buf.append("tag:"); buf.append(vertice.tag());
// if (useWord)
// buf.append(";");
// }
// if (useWord) {
// buf.append("word:"); buf.append(wordTransformation.apply(vertice.word()));
// }
// buf.append("}");
}
// Attach a node name: either from the explicit map, or derived from the word.
if (nodeNameMap != null) {
buf.append("=");
buf.append(nodeNameMap.get(vertice));
buf.append(" ");
} else if (useWordAsLabel) {
buf.append("=");
buf.append(sanitizeForSemgrexName(vertice.word()));
buf.append(" ");
}
tabu.add(vertice);
Iterable edgeIter; // = null;
if(!orderedNodes){
edgeIter = sg.outgoingEdgeIterable(vertice);
} else{
// Deterministic output: traverse children sorted by relation name.
edgeIter = CollectionUtils.sorted(sg.outgoingEdgeList(vertice), (arg0, arg1) ->
(arg0.getRelation().toString().compareTo(arg1.getRelation().toString())));
}
// For each edge, record the edge, but do not traverse to the vertice if it is already in the
// tabu list. If it already is, we emit the edge and the target vertice, as
// we will not be continuing in that vertex, but we wish to record the relation.
// If we will proceed down that node, add parens if it will continue recursing down.
for (SemanticGraphEdge edge : edgeIter) {
seenEdges.add(edge);
IndexedWord tgtVert = edge.getDependent();
boolean applyParens =
sg.outDegree(tgtVert) > 0 && !tabu.contains(tgtVert);
buf.append(" >");
buf.append(edge.getRelation().toString());
if (nameEdges) {
buf.append("=E");
buf.write(String.valueOf(seenEdges.size()));
}
buf.append(" ");
if (applyParens)
buf.append("(");
if (tabu.contains(tgtVert)) {
buf.append("{tag:"); buf.append(tgtVert.tag()); buf.append("}");
if (useWordAsLabel) {
buf.append("=");
buf.append(tgtVert.word());
buf.append(" ");
}
} else {
buf.append(semgrexFromGraphHelper(tgtVert, sg, tabu, seenEdges, useWordAsLabel, nameEdges,
wildcardNodes, nodeNameMap, orderedNodes, nodeValuesTransformation));
if (applyParens)
buf.append(")");
}
}
return buf.toString();
}
/**
 * Same as {@code semgrexFromGraph} except that child edges are traversed in
 * sorted relation-name order, giving deterministic output.
 */
public static String semgrexFromGraphOrderedNodes(SemanticGraph sg, Collection<IndexedWord> wildcardNodes,
                                                  Map<IndexedWord, String> nodeNameMap, Function<IndexedWord, String> wordTransformation) {
  IndexedWord patternRoot = sg.getFirstRoot();
  Set<IndexedWord> tabu = Generics.newHashSet();
  Set<SemanticGraphEdge> seenEdges = Generics.newHashSet();
  return semgrexFromGraphHelper(patternRoot, sg, tabu, seenEdges, true, true, wildcardNodes,
      nodeNameMap, true, wordTransformation);
}
/**
 * Sanitizes the given string into a Semgrex-friendly name by replacing
 * punctuation characters with named tokens (e.g. "." becomes "_DOT_").
 *
 * NOTE(review): both "\" and "/" map to "_BSLASH_", so distinct inputs can
 * collide; likewise "@" maps to "_AND_" rather than, say, "_AT_". The
 * mappings are kept as-is for backward compatibility with existing patterns.
 */
public static String sanitizeForSemgrexName(String text) {
  // Every pattern here is a single literal character, so plain replace()
  // (no per-call regex compilation, no escaping pitfalls) is equivalent to
  // the former replaceAll() chain.
  text = text.replace(".", "_DOT_");
  text = text.replace(",", "_COMMA_");
  text = text.replace("\\", "_BSLASH_");
  text = text.replace("/", "_BSLASH_");
  text = text.replace("?", "_QUES_");
  text = text.replace("!", "_BANG_");
  text = text.replace("$", "_DOL_");
  text = text.replace("&", "_AMP_");
  text = text.replace(":", "_COL_");
  text = text.replace(";", "_SCOL_");
  text = text.replace("#", "_PND_");
  text = text.replace("@", "_AND_");
  text = text.replace("%", "_PER_");
  text = text.replace("(", "_LRB_");
  text = text.replace(")", "_RRB_");
  return text;
}
/**
 * Given a {@code SemanticGraph}, sets the lemma on each of its label objects
 * based on that label's word and tag.
 */
public static void lemmatize(SemanticGraph sg) {
  for (IndexedWord vertex : sg.vertexSet()) {
    String lemma = Morphology.lemmaStatic(vertex.word(), vertex.tag());
    vertex.setLemma(lemma);
  }
}
/**
 * Given a graph, returns a new graph with the new sentence index enforced on
 * every vertex.
 * NOTE: new vertices are inserted (each node is copied, renumbered, and
 * swapped in via {@code replaceNode}).
 * TODO: is this ok? rewrite this?
 */
public static SemanticGraph setSentIndex(SemanticGraph sg, int newSentIndex) {
SemanticGraph newGraph = new SemanticGraph(sg);
List prevRoots = new ArrayList<>(newGraph.getRoots());
List newRoots = new ArrayList<>();
// TODO: we are using vertexListSorted here because we're changing
// vertices while iterating. Perhaps there is a better way to do it.
for (IndexedWord node : newGraph.vertexListSorted()) {
IndexedWord newWord = new IndexedWord(node);
newWord.setSentIndex(newSentIndex);
SemanticGraphUtils.replaceNode(newWord, node, newGraph);
// Roots must be remapped to their replacement copies.
if (prevRoots.contains(node))
newRoots.add(newWord);
}
newGraph.setRoots(newRoots);
return newGraph;
}
//-----------------------------------------------------------------------------------------------
// Graph redundancy checks
//-----------------------------------------------------------------------------------------------
/**
 * Removes duplicate graphs from the collection, using the string form of each
 * graph as the key (obviating issues with object equality).
 *
 * @param graphs graphs to de-duplicate
 * @return one representative graph per distinct string form
 */
public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs) {
  Map<String, SemanticGraph> map = Generics.newHashMap();
  for (SemanticGraph sg : graphs) {
    // No intern(): the string is only used as a local map key, and interning
    // large graph strings would needlessly grow the JVM string pool.
    map.put(sg.toString(), sg);
  }
  return map.values();
}
/**
 * Given the set of graphs to remove duplicates from, also removes those on the
 * tabu graphs (and does not include them in the return set). Graphs are keyed
 * by their string form, as in {@code removeDuplicates(Collection)}.
 *
 * @param graphs graphs to de-duplicate
 * @param tabuGraphs graphs to exclude from the result entirely
 * @return one representative graph per distinct, non-tabu string form
 */
public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs,
                                                         Collection<SemanticGraph> tabuGraphs) {
  // A Set of keys suffices here: the previous code built a Map whose values
  // were never read.  intern() is likewise dropped (keys are local only).
  Set<String> tabuKeys = Generics.newHashSet();
  for (SemanticGraph tabuSg : tabuGraphs) {
    tabuKeys.add(tabuSg.toString());
  }
  Map<String, SemanticGraph> map = Generics.newHashMap();
  for (SemanticGraph sg : graphs) {
    String keyVal = sg.toString();
    if (!tabuKeys.contains(keyVal)) {
      map.put(keyVal, sg);
    }
  }
  return map.values();
}
/**
 * Convenience overload: removes duplicates from {@code graphs}, excluding the
 * single tabu graph {@code tabuGraph}.
 *
 * @param graphs graphs to de-duplicate
 * @param tabuGraph a single graph to exclude from the result
 * @return one representative graph per distinct, non-tabu string form
 */
public static Collection<SemanticGraph> removeDuplicates(Collection<SemanticGraph> graphs,
                                                         SemanticGraph tabuGraph) {
  // Immutable singleton is enough; the callee only iterates the tabu set.
  return removeDuplicates(graphs, Collections.singleton(tabuGraph));
}
// -----------------------------------------------------------------------------------------------
// Tree matching code
// -----------------------------------------------------------------------------------------------
/**
* Given a CFG Tree parse, and the equivalent SemanticGraph derived from that Tree, generates a mapping
* from each of the tree terminals to the best-guess SemanticGraph node(s).
* This is performed using lexical matching, finding the nth match.
* NOTE: not all tree nodes may match a Semgraph node, esp. for tokens removed in a collapsed Semgraph,
* such as prepositions.
*/
public static Map mapTreeToSg(Tree tree, SemanticGraph sg) {
// In order to keep track of positions, we store lists, in order encountered, of lex terms.
// e.g. lexToTreeNode.get("the").get(2) should point to the same word as lexToSemNode.get("the").get(2)
// Because IndexedWords may be collapsed together "A B" -> "A_B", we check the value of current(), and
// split on whitespace if present.
MapList lexToTreeNode = new MapList<>();
MapList lexToSemNode = new MapList<>();
for (Tree child : tree.getLeaves()) {
List leafProxies = TreeNodeProxy.create(child, tree);
for (TreeNodeProxy proxy : leafProxies)
lexToTreeNode.add(proxy.lex, proxy);
}
Map depthMap = Generics.newHashMap();
for (IndexedWord node : sg.vertexSet()) {
List path = sg.getPathToRoot(node);
if (path != null)
depthMap.put(node, path.size());
else
depthMap.put(node, 99999); // Use an arbitrarily deep depth value, to trick it into never being used.
List nodeProxies = IndexedWordProxy.create(node);
for (IndexedWordProxy proxy : nodeProxies)
lexToSemNode.add(proxy.lex, proxy);
}
// Now the map-lists (string->position encountered indices) are populated,
// simply go through, finding matches.
// NOTE: we use TreeNodeProxy instead of keying off of Tree, as
// hash codes for Tree nodes do not consider position of the tree
// within a tree: two subtrees with the same layout and child
// labels will be equal.
Map map = Generics.newHashMap();
for (String lex : lexToTreeNode.keySet()) {
for (int i=0;i " + treeNode + ", #=" + treeNode.nodeNumber(root);
}
/**
 * Constructs a proxy tying tree node {@code intree} to one of its lexical
 * tokens {@code lex}, retaining {@code root} so the node's position within
 * the full tree can be recovered (via node numbering) later.
 */
private TreeNodeProxy(Tree intree, String lex, Tree root) {
this.treeNode = intree;
this.lex = lex;
this.root = root;
}
/**
 * Creates one proxy per lexical token covered by {@code intree}: a single
 * proxy for a leaf (keyed by the leaf's label value), otherwise one proxy per
 * word in the subtree's labeled yield, each still referencing {@code intree}.
 *
 * @param intree the (sub)tree to generate proxies for
 * @param root the root of the whole tree, retained for positional identity
 * @return the proxies, in yield order
 */
public static List<TreeNodeProxy> create(Tree intree, Tree root) {
  List<TreeNodeProxy> ret = new ArrayList<>();
  if (intree.isLeaf()) {
    ret.add(new TreeNodeProxy(intree, intree.label().value(), root));
  } else {
    for (LabeledWord lword : intree.labeledYield()) {
      ret.add(new TreeNodeProxy(intree, lword.word(), root));
    }
  }
  return ret;
}
}
/**
 * This is used to uniquely index trees within a Tree, maintaining the
 * position of this subtree within the context of the root: two structurally
 * identical subtrees at different positions compare as distinct because the
 * node number (relative to the root) participates in equality.
 *
 * @author Eric Yeh
 */
public static class PositionedTree {
  final Tree tree;
  final Tree root;
  final int nodeNumber;

  public PositionedTree(Tree tree, Tree root) {
    this.tree = tree;
    this.root = root;
    this.nodeNumber = tree.nodeNumber(root);
  }

  @Override
  public String toString() {
    return tree + "." + nodeNumber;
  }

  @Override
  public boolean equals(Object obj) {
    if (obj instanceof PositionedTree) {
      PositionedTree tgt = (PositionedTree) obj;
      // Cheap int comparison first; all three fields must agree.
      return nodeNumber == tgt.nodeNumber && tree.equals(tgt.tree) && root.equals(tgt.root);
    }
    return false;
  }

  @Override
  public int hashCode() {
    // Combine all fields used by equals().  The previous version mixed in
    // "2 ^ nodeNumber", which is an XOR with the constant 2 (not a power),
    // flagged by the code's own TODO; Objects.hash is correct and clearer.
    return Objects.hash(tree, root, nodeNumber);
  }
}
/**
 * Private helper class for {@code mapTreeToSg}. Acts to map between an
 * IndexedWord (in a SemanticGraph) and a lexical value.
 * @author lumberjack
 */
private static final class IndexedWordProxy {
  final IndexedWord node;
  final String lex;

  private IndexedWordProxy(IndexedWord node, String lex) {
    this.node = node;
    this.lex = lex;
  }

  @Override
  public String toString() {
    return lex + " -> " + node.word() + ":" + node.sentIndex() + "." + node.index();
  }

  /**
   * Generates a set of IndexedWordProxy objects. If the originalText() field
   * is non-empty, splits it on single spaces and, for each token in order
   * encountered, creates a new IndexedWordProxy referencing this node but
   * using the token as the lexical value (this retains attribution to the
   * originating node for collapsed words like "A_B"). Otherwise a single
   * proxy is created from word().
   */
  public static List<IndexedWordProxy> create(IndexedWord node) {
    List<IndexedWordProxy> ret = new ArrayList<>();
    if ( ! node.originalText().isEmpty()) {
      for (String token : node.originalText().split(" ")) {
        ret.add(new IndexedWordProxy(node, token));
      }
    } else {
      ret.add(new IndexedWordProxy(node, node.word()));
    }
    return ret;
  }
}
/**
 * Checks whether a given SemanticGraph is a strict surface syntax tree:
 * exactly one root, no copy nodes, and every vertex reachable from the root
 * by exactly one path (no shared dependents, no cycles).
 *
 * @param sg the graph to test
 * @return true iff the graph forms a tree over all of its vertices
 */
public static boolean isTree(SemanticGraph sg) {
  if (sg.getRoots().size() != 1) {
    return false;
  }
  IndexedWord root = sg.getFirstRoot();
  // Every node that has ever been enqueued.  The previous version only
  // tracked *dequeued* nodes, so a dependent still waiting in the queue
  // could be enqueued a second time, and a diamond-shaped DAG (two paths
  // to the same dependent) was misclassified as a tree.
  Set<IndexedWord> seen = Generics.newHashSet();
  Queue<IndexedWord> queue = Generics.newLinkedList();
  queue.add(root);
  seen.add(root);
  while (!queue.isEmpty()) {
    IndexedWord current = queue.remove();
    for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(current)) {
      IndexedWord dep = edge.getDependent();
      // add() returns false when dep was already seen: second parent or cycle.
      if (!seen.add(dep)) {
        return false;
      }
      if (dep.copyCount() > 0) {
        return false;
      }
      queue.add(dep);
    }
  }
  // A tree must also span every vertex of the graph.
  return seen.size() == sg.size();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy