All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.semgraph.SemanticGraphUtils Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.semgraph;
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.AnnotationLookup;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.LabeledWord;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.MapList;
import edu.stanford.nlp.util.Pair;

import java.io.StringWriter;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Pattern;


/**
 * Generic utilities for dealing with Dependency graphs and other structures, useful for
 * text simplification and rewriting.
 *
 * TODO: Migrate some of the functions (that make sense) into SemanticGraph proper.
 * BUT BEWARE: This class has methods that use jgraph (as opposed to jgrapht).
 * We don't want our core code to become dependent on jgraph, so methods in
 * SemanticGraph shouldn't call methods in this class, and methods that use
 * jgraph shouldn't be moved into SemanticGraph.
 *
 * @author Eric Yeh
 *
 */
public class SemanticGraphUtils  {

  /** A logger for this class; assigned once and never replaced. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(SemanticGraphUtils.class);

  /** Private constructor: prevents instantiation from outside this class. */
  private SemanticGraphUtils() {}

  /**
   * Given a collection of nodes from srcGraph, generates a new
   * SemanticGraph based off the subset represented by those nodes.
   * This uses the same vertices as in the original graph, which
   * allows for equality and comparisons between the two graphs.
   *
   * @param nodes    vertices of {@code srcGraph} to keep
   * @param srcGraph graph the edges are looked up in
   * @return {@code null} if {@code nodes} is empty; a graph holding just the
   *         single vertex if {@code nodes} has one element; otherwise the graph
   *         built by {@code SemanticGraphFactory.makeFromEdges} from every edge of
   *         {@code srcGraph} whose governor and dependent are both in {@code nodes}
   */
  public static SemanticGraph makeGraphFromNodes(Collection<IndexedWord> nodes, SemanticGraph srcGraph) {
    // Callers have historically received null (not an empty graph) for empty input.
    if (nodes.isEmpty()) {
      return null;
    }

    if (nodes.size() == 1) {
      SemanticGraph singleton = new SemanticGraph();
      singleton.addVertex(nodes.iterator().next());
      return singleton;
    }

    // TODO: if any nodes are not connected to edges in the original
    // graph, this will leave them out
    List<SemanticGraphEdge> edges = new ArrayList<>();
    for (IndexedWord gov : nodes) {
      for (IndexedWord dep : nodes) {
        Collection<SemanticGraphEdge> existingEdges = srcGraph.getAllEdges(gov, dep);
        if (existingEdges != null) {
          edges.addAll(existingEdges);
        }
      }
    }
    return SemanticGraphFactory.makeFromEdges(edges);
  }

  //----------------------------------------------------------------------------------------
  //Query routines (obtaining sets of edges/vertices over predicates, etc)
  //----------------------------------------------------------------------------------------

  /**
   * Finds the vertex in the given SemanticGraph that corresponds to the given node.
   * The first vertex (in {@code sg.vertexSet()} iteration order) whose index,
   * sentIndex, and word all equal those of {@code node} is returned.
   * Words are compared null-safely, so two vertices that both lack a word match.
   *
   * @param node the node to look for
   * @param sg   the graph to search
   * @return the matching vertex of {@code sg}, or {@code null} if there is none
   */
  public static IndexedWord findMatchingNode(IndexedWord node,
                                             SemanticGraph sg) {
    for (IndexedWord candidate : sg.vertexSet()) {
      boolean samePosition = candidate.index() == node.index()
          && candidate.sentIndex() == node.sentIndex();
      // Objects.equals avoids an NPE when a vertex has no word set.
      if (samePosition && Objects.equals(candidate.word(), node.word())) {
        return candidate;
      }
    }
    return null;
  }


  /**
   * Given a starting vertice, grabs the subtree encapsulated by portion of the semantic graph, excluding
   * a given edge.  A tabu list is maintained, in order to deal with cyclical relations (such as between a
   * rcmod (relative clause) and its nsubj).
   *
   * @param vertice      vertex whose outgoing edges start the traversal
   * @param sg           graph to traverse
   * @param excludedEdge edge that is neither followed nor returned
   * @return the edges reached by following outgoing edges from {@code vertice},
   *         without {@code excludedEdge}
   */
  public static Set<SemanticGraphEdge> getSubTreeEdges(IndexedWord vertice, SemanticGraph sg, SemanticGraphEdge excludedEdge) {
    Set<SemanticGraphEdge> collected = Generics.newHashSet();
    // Seeding the visited set with the excluded edge stops the traversal from crossing it.
    collected.add(excludedEdge);
    getSubTreeEdgesHelper(vertice, sg, collected);
    collected.remove(excludedEdge); // Do not want this in the returned edges
    return collected;
  }

  /**
   * Recursively follows the outgoing edges of {@code vertice}, adding every edge not
   * already in {@code tabuEdges} to that set and descending into its dependent.
   * Edges already in the set are neither re-added nor followed, which is what
   * terminates the recursion on cyclic graphs.
   *
   * @param vertice   vertex whose outgoing edges are examined
   * @param sg        graph to traverse
   * @param tabuEdges in/out set of edges already visited; updated in place
   */
  public static void getSubTreeEdgesHelper(IndexedWord vertice, SemanticGraph sg, Set<SemanticGraphEdge> tabuEdges) {
    for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertice)) {
      if (tabuEdges.contains(edge)) {
        continue;
      }
      tabuEdges.add(edge);
      getSubTreeEdgesHelper(edge.getDependent(), sg, tabuEdges);
    }
  }


  /**
   * Given a set of nodes from a SemanticGraph, returns the set of
   * edges that are spanned between these nodes.
   * Pairs are skipped when both elements are the very same object, so an edge
   * from a node object to itself is not reported.
   *
   * @param nodes vertices of {@code sg}
   * @param sg    graph the edges are looked up in
   * @return every edge of {@code sg} going from one of the nodes to a different one
   */
  public static Collection<SemanticGraphEdge> getEdgesSpannedByVertices(Collection<IndexedWord> nodes, SemanticGraph sg) {
    Collection<SemanticGraphEdge> spanned = Generics.newHashSet();
    for (IndexedWord gov : nodes) {
      for (IndexedWord dep : nodes) {
        if (gov == dep) {
          continue; // identity comparison, as in the original contract
        }
        Collection<SemanticGraphEdge> edges = sg.getAllEdges(gov, dep);
        if (edges != null) {
          spanned.addAll(edges);
        }
      }
    }
    return spanned;
  }

  /**
   * Returns a list of all children bearing a grammatical relation starting with the given string, relnPrefix.
   * A child appears once per matching outgoing edge.
   *
   * @param graph      graph containing {@code vertex}
   * @param vertex     governor whose outgoing edges are examined
   * @param relnPrefix prefix the relation's string form must start with
   * @return the targets of the matching edges; empty if {@code vertex} equals
   *         {@code IndexedWord.NO_WORD}
   * @throws IllegalArgumentException if {@code vertex} is not in {@code graph}
   */
  public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, String relnPrefix) {
    if (vertex.equals(IndexedWord.NO_WORD)) {
      return new ArrayList<>();
    }
    if (!graph.containsVertex(vertex)) {
      throw new IllegalArgumentException("Vertex is not in the graph: " + vertex);
    }
    List<IndexedWord> childList = new ArrayList<>();
    for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
      String relnName = edge.getRelation().toString();
      if (relnName.startsWith(relnPrefix)) {
        childList.add(edge.getTarget());
      }
    }
    return childList;
  }

  /**
   * Returns a list of all children bearing a grammatical relation starting with
   * any of the given relation prefixes.  A child appears once per matching outgoing
   * edge, even if several prefixes match that edge.
   *
   * @param graph        graph containing {@code vertex}
   * @param vertex       governor whose outgoing edges are examined
   * @param relnPrefixes prefixes to test the relation's string form against
   * @return the targets of the matching edges; empty if {@code vertex} equals
   *         {@code IndexedWord.NO_WORD}
   * @throws IllegalArgumentException if {@code vertex} is not in {@code graph}
   */
  public static List<IndexedWord> getChildrenWithRelnPrefix(SemanticGraph graph, IndexedWord vertex, Collection<String> relnPrefixes) {
    if (vertex.equals(IndexedWord.NO_WORD)) {
      return new ArrayList<>();
    }
    if (!graph.containsVertex(vertex)) {
      throw new IllegalArgumentException("Vertex is not in the graph: " + vertex);
    }
    List<IndexedWord> childList = new ArrayList<>();
    for (SemanticGraphEdge edge : graph.outgoingEdgeIterable(vertex)) {
      String relnName = edge.getRelation().toString();
      for (String relnPrefix : relnPrefixes) {
        if (relnName.startsWith(relnPrefix)) {
          childList.add(edge.getTarget());
          break; // one entry per edge is enough
        }
      }
    }
    return childList;
  }

  /**
   * Since graphs can have preps collapsed, finds all the immediate children of this node
   * that are linked by a collapsed preposition edge.  An edge qualifies when the
   * string form of its relation starts with {@code "prep"}.
   *
   * @param sg     graph containing {@code vertex}
   * @param vertex governor whose outgoing edges are examined
   * @return the dependents of the qualifying edges, one entry per edge
   */
  public static List<IndexedWord> getChildrenWithPrepC(SemanticGraph sg, IndexedWord vertex) {
    List<IndexedWord> prepChildren = new ArrayList<>();
    for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertex)) {
      String relnName = edge.getRelation().toString();
      if (relnName.startsWith("prep")) {
        prepChildren.add(edge.getDependent());
      }
    }
    return prepChildren;
  }

  /**
   * Returns the incoming edges of the given node that have the given relation.
   * Matching is delegated to {@code edgesWithReln}, which compares relations
   * with {@code equals}.
   *
   * @param node vertex whose incoming edges are examined
   * @param sg   graph containing {@code node}
   * @param reln relation to match
   * @return the matching incoming edges, in iteration order
   */
  public static List<SemanticGraphEdge> incomingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
    return edgesWithReln(sg.incomingEdgeIterable(node), reln);
  }

  /**
   * Returns the outgoing edges of the given node that have the given relation.
   * Matching is delegated to {@code edgesWithReln}, which compares relations
   * with {@code equals}.
   *
   * @param node vertex whose outgoing edges are examined
   * @param sg   graph containing {@code node}
   * @param reln relation to match
   * @return the matching outgoing edges, in iteration order
   */
  public static List<SemanticGraphEdge> outgoingEdgesWithReln(IndexedWord node, SemanticGraph sg, GrammaticalRelation reln) {
    return edgesWithReln(sg.outgoingEdgeIterable(node), reln);
  }

  /**
   * Given a sequence of edges, returns those whose relation equals the given relation.
   * The comparison is {@code edge.getRelation().equals(reln)}.
   *
   * @param edges edges to filter
   * @param reln  relation to match
   * @return a new list with the matching edges, in iteration order
   */
  public static List<SemanticGraphEdge> edgesWithReln(Iterable<SemanticGraphEdge> edges,
                                                      GrammaticalRelation reln) {
    List<SemanticGraphEdge> matches = Generics.newArrayList();
    for (SemanticGraphEdge edge : edges) {
      if (edge.getRelation().equals(reln)) {
        matches.add(edge);
      }
    }
    return matches;
  }

  /**
   * Given a semantic graph, and a relation prefix, returns a list of all relations (edges)
   * that start with the given prefix (e.g., prefix "prep" gives you all the prep relations: prep_by, prep_in, etc.)
   *
   * @param sg     graph whose edges are scanned
   * @param prefix prefix the relation's string form must start with
   * @return the matching edges, in {@code sg.edgeIterable()} order
   */
  public static List<SemanticGraphEdge> findAllRelnsWithPrefix(SemanticGraph sg, String prefix) {
    List<SemanticGraphEdge> relns = new ArrayList<>();
    for (SemanticGraphEdge edge : sg.edgeIterable()) {
      String relnName = edge.getRelation().toString();
      if (relnName.startsWith(prefix)) {
        relns.add(edge);
      }
    }
    return relns;
  }

  /**
   * Finds the descendants of the given node in graph, avoiding the given set of nodes.
   * The start vertex itself is part of the result unless it is in {@code tabu}.
   *
   * @param sg     graph to search
   * @param vertex start vertex
   * @param tabu   nodes that are neither returned nor traversed through
   * @return the reachable, non-tabu nodes
   * @throws IllegalArgumentException if {@code vertex} is not in {@code sg}
   */
  public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException("Vertex is not in the graph: " + vertex);
    }
    // Do a depth first search
    Set<IndexedWord> descendantSet = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, descendantSet, tabu, null, null);
    return descendantSet;
  }

  /**
   * Finds the set of descendants for a node in the graph, avoiding the set of nodes and the
   * set of edge relations.  NOTE: these edges are encountered from the downward cull,
   * from governor to dependent.
   * The start vertex itself is part of the result unless it is in {@code tabu}.
   *
   * @param sg        graph to search
   * @param vertex    start vertex
   * @param tabu      nodes that are neither returned nor traversed through
   * @param tabuRelns relations whose edges are not followed
   * @return the reachable, non-tabu nodes
   * @throws IllegalArgumentException if {@code vertex} is not in {@code sg}
   */
  public static Set<IndexedWord> tabuDescendants(SemanticGraph sg, IndexedWord vertex, Collection<IndexedWord> tabu,
                                                 Collection<GrammaticalRelation> tabuRelns) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException("Vertex is not in the graph: " + vertex);
    }
    // Do a depth first search
    Set<IndexedWord> descendantSet = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, descendantSet, tabu, tabuRelns, null);
    return descendantSet;
  }

  /**
   * Finds the descendants of a node without following edges that bear one of
   * the given relations.  No nodes are tabu, so the start vertex is always
   * part of the result.
   *
   * @param sg        graph to search
   * @param vertex    start vertex
   * @param tabuRelns relations whose edges are not followed
   * @return the reachable nodes
   * @throws IllegalArgumentException if {@code vertex} is not in {@code sg}
   */
  public static Set<IndexedWord> descendantsTabuRelns(SemanticGraph sg, IndexedWord vertex,
                                                      Collection<GrammaticalRelation> tabuRelns) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException("Vertex is not in the graph: " + vertex);
    }
    // Do a depth first search
    Set<IndexedWord> descendantSet = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, descendantSet, Generics.<IndexedWord>newHashSet(), tabuRelns, null);
    return descendantSet;
  }

  /**
   * Finds the descendants of a node without following edges that bear one of
   * the given relations, and without following edges whose dependent satisfies
   * {@code tabuTest}.  No nodes are tabu by identity, so the start vertex is
   * always part of the result.
   *
   * @param sg        graph to search
   * @param vertex    start vertex
   * @param tabuRelns relations whose edges are not followed
   * @param tabuTest  predicate applied to each edge's dependent; a {@code true}
   *                  result means the edge is not followed
   * @return the reachable nodes
   * @throws IllegalArgumentException if {@code vertex} is not in {@code sg}
   */
  public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
      Collection<GrammaticalRelation> tabuRelns, IndexedWordUnaryPred tabuTest) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException("Vertex is not in the graph: " + vertex);
    }
    // Do a depth first search
    Set<IndexedWord> descendantSet = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, descendantSet, Generics.<IndexedWord>newHashSet(), tabuRelns, tabuTest);
    return descendantSet;
  }

  /**
   * Finds the descendants of a node, avoiding the given nodes, not following
   * edges that bear one of the given relations, and not following edges whose
   * dependent satisfies {@code tabuTest}.
   * The start vertex itself is part of the result unless it is in {@code tabuNodes}.
   *
   * @param sg        graph to search
   * @param vertex    start vertex
   * @param tabuNodes nodes that are neither returned nor traversed through
   * @param tabuRelns relations whose edges are not followed
   * @param tabuTest  predicate applied to each edge's dependent; a {@code true}
   *                  result means the edge is not followed
   * @return the reachable, non-tabu nodes
   * @throws IllegalArgumentException if {@code vertex} is not in {@code sg}
   */
  public static Set<IndexedWord> descendantsTabuTestAndRelns(SemanticGraph sg, IndexedWord vertex,
      Collection<IndexedWord> tabuNodes, Collection<GrammaticalRelation> tabuRelns, IndexedWordUnaryPred tabuTest) {
    if (!sg.containsVertex(vertex)) {
      throw new IllegalArgumentException("Vertex is not in the graph: " + vertex);
    }
    // Do a depth first search
    Set<IndexedWord> descendantSet = Generics.newHashSet();
    tabuDescendantsHelper(sg, vertex, descendantSet, tabuNodes, tabuRelns, tabuTest);
    return descendantSet;
  }



  /**
   * Performs a cull for the descendants of the given node in the
   * graph, subject to the tabu nodes to avoid, relations to avoid
   * crawling over, and child nodes to avoid traversing to based upon
   * a predicate test.
   *
   * @param sg            graph to search
   * @param curr          node currently being visited
   * @param descendantSet in/out accumulator of visited nodes; also serves as
   *                      the visited set that stops the recursion on cycles
   * @param tabu          nodes that are neither recorded nor expanded
   * @param relnsToAvoid  relations whose edges are not followed; may be {@code null}
   * @param tabuTest      predicate on an edge's dependent that, when true, blocks
   *                      that edge; may be {@code null}
   */
  private static void tabuDescendantsHelper(SemanticGraph sg, IndexedWord curr, Set<IndexedWord> descendantSet, Collection<IndexedWord> tabu,
      Collection<GrammaticalRelation> relnsToAvoid, IndexedWordUnaryPred tabuTest) {
    if (tabu.contains(curr) || descendantSet.contains(curr)) {
      return;
    }

    descendantSet.add(curr);
    for (IndexedWord child : sg.getChildren(curr)) {
      // A child may be linked by several edges; it is reached if any one of them is allowed.
      for (SemanticGraphEdge edge : sg.getAllEdges(curr, child)) {
        boolean relnBlocked = relnsToAvoid != null && relnsToAvoid.contains(edge.getRelation());
        if (relnBlocked) {
          continue;
        }
        if (tabuTest != null && tabuTest.test(edge.getDependent(), sg)) {
          continue;
        }
        tabuDescendantsHelper(sg, child, descendantSet, tabu, relnsToAvoid, tabuTest);
      }
    }
  }


  //------------------------------------------------------------------------------------
  //"Constituent" extraction and manipulation
  //------------------------------------------------------------------------------------


  /**
   * Returns the vertice that is "leftmost" among {@code sg.descendants(startNode)},
   * i.e. the smallest one in the natural ordering of {@code IndexedWord}.
   * Note this requires that the IndexedFeatureLabels present actually have
   * ordering information.
   *
   * @param startNode node whose descendants are considered
   * @param sg        graph containing {@code startNode}
   * @return the smallest descendant
   * @throws NoSuchElementException if the descendant set is empty
   */
  public static IndexedWord leftMostChildVertice(IndexedWord startNode, SemanticGraph sg) {
    // A single linear scan replaces the previous sort through a TreeSet.
    return Collections.min(sg.descendants(startNode));
  }

  /**
   * Returns the vertices that are "leftmost, rightmost" among
   * {@code sg.descendants(startNode)}, i.e. the smallest and largest ones in the
   * natural ordering of {@code IndexedWord}.
   * Note this requires that the IndexedFeatureLabels present actually have
   * ordering information.
   *
   * @param startNode node whose descendants are considered
   * @param sg        graph containing {@code startNode}
   * @return a pair of (smallest, largest) descendant
   * @throws NoSuchElementException if the descendant set is empty
   */
  public static Pair<IndexedWord, IndexedWord> leftRightMostChildVertices(IndexedWord startNode, SemanticGraph sg) {
    // Compute the descendant set once; two linear scans replace the previous TreeSet sort.
    Set<IndexedWord> descendants = sg.descendants(startNode);
    IndexedWord leftMost = Collections.min(descendants);
    IndexedWord rightMost = Collections.max(descendants);
    return Pair.makePair(leftMost, rightMost);
  }

  /**
   * Given a SemanticGraph, and a set of nodes, finds the "blanket" of nodes that are one
   * edge away from the set of nodes passed in.  This is similar to the idea of a Markov
   * Blanket, except in the context of a SemanticGraph.
   * TODO: optimize
   *
   * @param sg            graph to inspect
   * @param assertedNodes nodes whose neighbours are wanted
   * @return the vertices of {@code sg} that are not in {@code assertedNodes} but
   *         share an edge (in either direction) with at least one of them
   */
  public static Collection<IndexedWord> getDependencyBlanket(SemanticGraph sg, Collection<IndexedWord> assertedNodes) {
    Set<IndexedWord> blanket = Generics.newHashSet();
    for (IndexedWord candidate : sg.vertexSet()) {
      if (assertedNodes.contains(candidate)) {
        continue;
      }
      for (IndexedWord assertedNode : assertedNodes) {
        boolean adjacent = sg.containsEdge(assertedNode, candidate)
            || sg.containsEdge(candidate, assertedNode);
        if (adjacent) {
          blanket.add(candidate);
          break; // one adjacent asserted node is enough
        }
      }
    }
    return blanket;
  }

  /**
   * Resets the indices for the vertices in the graph, using the current
   * ordering returned by vertexList (presumably in order).  This is to ensure
   * accesses to the InfoFile word table do not fall off after a SemanticGraph has
   * been edited.
   * 
* NOTE: the vertices will be replaced, as JGraphT does not permit * in-place modification of the nodes. (TODO: we no longer use * JGraphT, so this should be fixed) */ public static SemanticGraph resetVerticeOrdering(SemanticGraph sg) { SemanticGraph nsg = new SemanticGraph(); List vertices = sg.vertexListSorted(); int index = 1; Map oldToNewVertices = Generics.newHashMap(); List newVertices = new ArrayList<>(); for (IndexedWord vertex : vertices) { IndexedWord newVertex = new IndexedWord(vertex); newVertex.setIndex(index++); oldToNewVertices.put(vertex, newVertex); ///sg.removeVertex(vertex); newVertices.add(newVertex); } for (IndexedWord nv : newVertices) { nsg.addVertex(nv); } List newRoots = new ArrayList<>(); for (IndexedWord or : sg.getRoots()) { newRoots.add(oldToNewVertices.get(or)); } nsg.setRoots(newRoots); for (SemanticGraphEdge edge : sg.edgeIterable()) { IndexedWord newGov = oldToNewVertices.get(edge.getGovernor()); IndexedWord newDep = oldToNewVertices.get(edge.getDependent()); nsg.addEdge(newGov, newDep, edge.getRelation(), edge.getWeight(), edge.isExtra()); } return nsg; } /** * Given a graph, ensures all edges are EnglishGrammaticalRelations * NOTE: this is English specific * NOTE: currently EnglishGrammaticalRelations does not link collapsed prep string forms * back to their object forms, for its valueOf relation. This may need to be repaired if * generated edges indeed do have collapsed preps as strings. 
*/ public static void enRepairEdges(SemanticGraph sg, boolean verbose) { for (SemanticGraphEdge edge : sg.edgeIterable()) { if (edge.getRelation().isFromString()) { GrammaticalRelation newReln = EnglishGrammaticalRelations.valueOf(edge.getRelation().toString()); if (newReln != null) { IndexedWord gov = edge.getGovernor(); IndexedWord dep = edge.getDependent(); double weight = edge.getWeight(); boolean isExtra = edge.isExtra(); sg.removeEdge(edge); sg.addEdge(gov, dep, newReln, weight, isExtra); } else { if (verbose) log.info("Warning, could not find matching GrammaticalRelation for reln="+edge.getRelation()); } } } } public static void enRepairEdges(SemanticGraph sg) { enRepairEdges(sg, false); } /** * Deletes all nodes that are not rooted (such as dangling vertices after a series of * edges have been chopped). */ public static void killNonRooted(SemanticGraph sg) { List nodes = new ArrayList<>(sg.vertexSet()); // Hack: store all of the nodes we know are in the rootset Set guaranteed = Generics.newHashSet(); for (IndexedWord root : sg.getRoots()) { guaranteed.add(root); guaranteed.addAll(sg.descendants(root)); } for (IndexedWord node : nodes) { if (!guaranteed.contains(node)) { sg.removeVertex(node); } } } /** * Replaces a node in the given SemanticGraph with the new node, * replacing its position in the node edges. */ public static void replaceNode(IndexedWord newNode, IndexedWord oldNode, SemanticGraph sg) { // Obtain the edges where the old node was the governor and the dependent. // Remove the old node, insert the new, and re-insert the edges. // Save the edges in a list so that remove operations don't affect // the iterator or our ability to find the edges in the first place List govEdges = sg.outgoingEdgeList(oldNode); List depEdges = sg.incomingEdgeList(oldNode); boolean oldNodeRemoved = sg.removeVertex(oldNode); if (oldNodeRemoved) { // If the new node is not present, be sure to add it in. 
if (!sg.containsVertex(newNode)) { sg.addVertex(newNode); } for (SemanticGraphEdge govEdge : govEdges) { sg.removeEdge(govEdge); sg.addEdge(newNode, govEdge.getDependent(), govEdge.getRelation(), govEdge.getWeight(), govEdge.isExtra()); } for (SemanticGraphEdge depEdge : depEdges) { sg.removeEdge(depEdge); sg.addEdge(depEdge.getGovernor(), newNode, depEdge.getRelation(), depEdge.getWeight(), depEdge.isExtra()); } } else { log.info("SemanticGraphUtils.replaceNode: previous node does not exist"); } } public static final String WILDCARD_VERTICE_TOKEN = "WILDCARD"; public static final IndexedWord WILDCARD_VERTICE = new IndexedWord(); static { WILDCARD_VERTICE.setWord("*"); WILDCARD_VERTICE.setValue("*"); WILDCARD_VERTICE.setOriginalText("*"); } /** * GIven an iterable set of distinct vertices, creates a new mapping that maps the * original vertices to a set of "generic" versions. Used for generalizing tokens in discovered rules. * @param verts Vertices to anonymize * @param prefix Prefix to assign to this anonymization */ public static Map anonymyizeNodes(Iterable verts, String prefix) { Map retMap = Generics.newHashMap(); int index = 1; for (IndexedWord orig: verts) { IndexedWord genericVert = new IndexedWord(orig); genericVert.set(CoreAnnotations.LemmaAnnotation.class, ""); String genericValue = prefix+index; genericVert.setValue(genericValue); genericVert.setWord(genericValue); genericVert.setOriginalText(genericValue); index++; retMap.put(orig, genericVert); } return retMap; } public static final String SHARED_NODE_ANON_PREFIX ="A"; public static final String BLANKET_NODE_ANON_PREFIX ="B"; /** * Used to make a mapping that lets you create "anonymous" versions of shared nodes between two * graphs (given in the arg) using the shared prefix. */ public static Map makeGenericVertices(Iterable verts) { return anonymyizeNodes(verts, SHARED_NODE_ANON_PREFIX); } /** * Used to assign generic labels to the nodes in the "blanket" for a set of vertices in a graph. 
Here, a "blanket" node is * similar to nodes in a Markov Blanket, i.e. nodes that are one edge away from a set of asserted vertices in a * SemanticGraph. */ public static Map makeBlanketVertices(Iterable verts) { return anonymyizeNodes(verts, BLANKET_NODE_ANON_PREFIX); } /** * Given a set of edges, and a mapping between the replacement and target vertices that comprise the * vertices of the edges, returns a new set of edges with the replacement vertices. If a replacement * is not present, the WILDCARD_VERTICE is used in its place (i.e. can be anything). * * Currently used to generate "generic" versions of Semantic Graphs, when given a list of generic * vertices to replace with, but can conceivably be used for other purposes where vertices must * be replaced. */ public static List makeReplacedEdges(Iterable edges, Map vertReplacementMap, boolean useGenericReplacement) { List retList = new ArrayList<>(); for (SemanticGraphEdge edge : edges) { IndexedWord gov = edge.getGovernor(); IndexedWord dep = edge.getDependent(); IndexedWord newGov = vertReplacementMap.get(gov); IndexedWord newDep = vertReplacementMap.get(dep); if (useGenericReplacement) { if (newGov == null) { newGov = new IndexedWord(gov); newGov.set(CoreAnnotations.TextAnnotation.class, WILDCARD_VERTICE_TOKEN); newGov.set(CoreAnnotations.OriginalTextAnnotation.class, WILDCARD_VERTICE_TOKEN); newGov.set(CoreAnnotations.LemmaAnnotation.class, WILDCARD_VERTICE_TOKEN); } if (newDep == null) { newDep = new IndexedWord(dep); newDep.set(CoreAnnotations.TextAnnotation.class, WILDCARD_VERTICE_TOKEN); newDep.set(CoreAnnotations.OriginalTextAnnotation.class, WILDCARD_VERTICE_TOKEN); newDep.set(CoreAnnotations.LemmaAnnotation.class,WILDCARD_VERTICE_TOKEN); } } else { if (newGov == null) newGov = edge.getGovernor(); if (newDep == null) newDep = edge.getDependent(); } SemanticGraphEdge newEdge = new SemanticGraphEdge(newGov, newDep, edge.getRelation(), edge.getWeight(), edge.isExtra()); retList.add(newEdge); } return 
retList; } /** * Given a set of vertices from the same graph, returns the set of all edges between these * vertices. */ public static Set allEdgesInSet(Iterable vertices, SemanticGraph sg) { Set edges = Generics.newHashSet(); for (IndexedWord v1 : vertices) { for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(v1)) { edges.add(edge); } for (SemanticGraphEdge edge : sg.incomingEdgeIterable(v1)) { edges.add(edge); } } return edges; } /** * Given two iterable sequences of edges, returns a pair containing the set of * edges in the first graph not in the second, and edges in the second not in the first. * Edge equality is determined using an object that implements ISemanticGraphEdgeEql. * */ public static EdgeDiffResult diffEdges(Collection edges1, Collection edges2, SemanticGraph sg1, SemanticGraph sg2, ISemanticGraphEdgeEql compareObj) { Set remainingEdges1 = Generics.newHashSet(); Set remainingEdges2 = Generics.newHashSet(); Set sameEdges = Generics.newHashSet(); ArrayList edges2Cache = new ArrayList<>(edges2); edge1Loop: for (SemanticGraphEdge edge1 : edges1) { for (SemanticGraphEdge edge2 : edges2Cache) { if (compareObj.equals(edge1, edge2, sg1, sg2)) { sameEdges.add(edge1); edges2Cache.remove(edge2); continue edge1Loop; } } remainingEdges1.add(edge1); } ArrayList edges1Cache = new ArrayList<>(edges1); edge2Loop: for (SemanticGraphEdge edge2 : edges2) { for (SemanticGraphEdge edge1 : edges1) { if (compareObj.equals(edge1, edge2, sg1, sg2)) { edges1Cache.remove(edge1); continue edge2Loop; } } remainingEdges2.add(edge2); } return new EdgeDiffResult(sameEdges, remainingEdges1, remainingEdges2); } public static class EdgeDiffResult { Set sameEdges; Set remaining1; Set remaining2; public EdgeDiffResult(Set sameEdges, Set remaining1, Set remaining2) { this.sameEdges = sameEdges; this.remaining1 = remaining1; this.remaining2 = remaining2; } public Set getRemaining1() { return remaining1; } public Set getRemaining2() { return remaining2; } public Set getSameEdges() { 
return sameEdges; } } /** * Pretty printers */ public static String printEdges(Iterable edges) { StringWriter buf = new StringWriter(); for (SemanticGraphEdge edge : edges) { buf.append("\t"); buf.append(edge.getRelation().toString()); buf.append("("); buf.append(edge.getGovernor().toString()); buf.append(", "); buf.append(edge.getDependent().toString()); buf.append(")\n"); } return buf.toString(); } public static class PrintVerticeParams { public boolean showWord = true; public boolean showIndex = true; public boolean showSentIndex = false; public boolean showPOS = false; public int wrapAt = 8; } public static String printVertices(SemanticGraph sg) { return printVertices(sg, new PrintVerticeParams()); } public static String printVertices(SemanticGraph sg, PrintVerticeParams params) { StringWriter buf = new StringWriter(); int count = 0; for (IndexedWord word : sg.vertexListSorted()) { count++; if (count % params.wrapAt == 0) { buf.write("\n\t"); } if (params.showIndex) { buf.write(String.valueOf(word.index())); buf.write(":"); } if (params.showSentIndex) { buf.write("s"); buf.write(String.valueOf(word.sentIndex())); buf.write("/"); } if (params.showPOS) { buf.write(word.tag()); buf.write("/"); } if (params.showWord) { buf.write(word.word()); } buf.write(" "); } return buf.toString(); } /** * Given a SemanticGraph, creates a SemgrexPattern string based off of this graph. * NOTE: the word() value of the vertice is the name to reference * NOTE: currently presumes there is only one root in this graph. * TODO: see if Semgrex can allow multiroot patterns * @param sg SemanticGraph to base this pattern on. 
*/ public static String semgrexFromGraph(SemanticGraph sg, boolean matchTag, boolean matchWord, Map nodeNameMap) throws Exception { return semgrexFromGraph(sg, null, matchTag, matchWord, nodeNameMap); } public static String semgrexFromGraph(SemanticGraph sg, Collection wildcardNodes, boolean useTag, boolean useWord, Map nodeNameMap) throws Exception { Function transformNode = o ->{ String str = ""; if(useWord) str = "{word: /" + Pattern.quote(o.word()) + "/"; if(useTag){ if(!str.isEmpty()) str += "; "; str = "tag: " + o.tag(); } if(!str.isEmpty()) str += "}"; return str; }; return semgrexFromGraph(sg, wildcardNodes, nodeNameMap, transformNode); } /** * nodeValuesTranformation is a function that converts a vertex (IndexedWord) to the value. * For an example, see {@code semgrexFromGraph} * function implementations (if useWord and useTag is true, the value is "{word: vertex.word; tag: vertex.tag}"). * @throws Exception */ public static String semgrexFromGraph(SemanticGraph sg, Collection wildcardNodes, Map nodeNameMap, Function wordTransformation) throws Exception { IndexedWord patternRoot = sg.getFirstRoot(); StringWriter buf = new StringWriter(); Set tabu = Generics.newHashSet(); Set seenEdges = Generics.newHashSet(); buf.append(semgrexFromGraphHelper(patternRoot, sg, tabu, seenEdges, true, true, wildcardNodes, nodeNameMap, false, wordTransformation)); String patternString = buf.toString(); return patternString; } /** * Given a set of edges that form a rooted and connected graph, returns a Semgrex pattern * corresponding to it. * @throws Exception */ public static String semgrexFromGraph(Iterable edges, boolean matchTag, boolean matchWord, Map nodeNameMap) throws Exception { SemanticGraph sg = SemanticGraphFactory.makeFromEdges(edges); return semgrexFromGraph(sg, matchTag, matchWord, nodeNameMap); } /** * Recursive call to generate the Semgrex pattern based off of this SemanticGraph. 
* nodeValuesTranformation is a function that converts a vertex (IndexedWord) to the value. For an example, see {@code semgrexFromGraph} * function implementations. */ protected static String semgrexFromGraphHelper(IndexedWord vertice, SemanticGraph sg, Set tabu, Set seenEdges, boolean useWordAsLabel, boolean nameEdges, Collection wildcardNodes, Map nodeNameMap, boolean orderedNodes, Function nodeValuesTransformation) { StringWriter buf = new StringWriter(); // If the node is a wildcarded one, treat it as a {}, meaning any match. Currently these will not // be labeled, but this may change later. if (wildcardNodes != null && wildcardNodes.contains(vertice)) { buf.append("{}"); } else { String vertexStr = nodeValuesTransformation.apply(vertice); if(vertexStr != null && !vertexStr.isEmpty()){ buf.append(vertexStr); } // buf.append("{"); // int i = 0; // for(String corekey: useNodeCoreAnnotations){ // AnnotationLookup.KeyLookup lookup = AnnotationLookup.getCoreKey(corekey); // assert lookup != null : "Invalid key " + corekey; // if(i > 0) // buf.append("; "); // String value = vertice.containsKey(lookup.coreKey) ? 
vertice.get(lookup.coreKey).toString() : "null"; // buf.append(corekey+":"+nodeValuesTransformation.apply(value)); // i++; // } // if (useTag) { // // buf.append("tag:"); buf.append(vertice.tag()); // if (useWord) // buf.append(";"); // } // if (useWord) { // buf.append("word:"); buf.append(wordTransformation.apply(vertice.word())); // } // buf.append("}"); } if (nodeNameMap != null) { buf.append("="); buf.append(nodeNameMap.get(vertice)); buf.append(" "); } else if (useWordAsLabel) { buf.append("="); buf.append(sanitizeForSemgrexName(vertice.word())); buf.append(" "); } tabu.add(vertice); Iterable edgeIter = null; if(!orderedNodes){ edgeIter = sg.outgoingEdgeIterable(vertice); } else{ edgeIter = CollectionUtils.sorted(sg.outgoingEdgeList(vertice), (arg0, arg1) -> (arg0.getRelation().toString().compareTo(arg1.getRelation().toString()))); } // For each edge, record the edge, but do not traverse to the vertice if it is already in the // tabu list. If it already is, we emit the edge and the target vertice, as // we will not be continuing in that vertex, but we wish to record the relation. // If we will proceed down that node, add parens if it will continue recursing down. 
for (SemanticGraphEdge edge : edgeIter) { seenEdges.add(edge); IndexedWord tgtVert = edge.getDependent(); boolean applyParens = sg.outDegree(tgtVert) > 0 && !tabu.contains(tgtVert); buf.append(" >"); buf.append(edge.getRelation().toString()); if (nameEdges) { buf.append("=E"); buf.write(String.valueOf(seenEdges.size())); } buf.append(" "); if (applyParens) buf.append("("); if (tabu.contains(tgtVert)) { buf.append("{tag:"); buf.append(tgtVert.tag()); buf.append("}"); if (useWordAsLabel) { buf.append("="); buf.append(tgtVert.word()); buf.append(" "); } } else { buf.append(semgrexFromGraphHelper(tgtVert, sg, tabu, seenEdges, useWordAsLabel, nameEdges, wildcardNodes, nodeNameMap, orderedNodes, nodeValuesTransformation)); if (applyParens) buf.append(")"); } } return buf.toString(); } /** Same as semgrexFromGraph except the node traversal is ordered by sorting */ public static String semgrexFromGraphOrderedNodes(SemanticGraph sg, Collection wildcardNodes, Map nodeNameMap, Function wordTransformation) throws Exception { IndexedWord patternRoot = sg.getFirstRoot(); StringWriter buf = new StringWriter(); Set tabu = Generics.newHashSet(); Set seenEdges = Generics.newHashSet(); buf.append(semgrexFromGraphHelper(patternRoot, sg, tabu, seenEdges, true, true, wildcardNodes, nodeNameMap, true, wordTransformation)); String patternString = buf.toString(); return patternString; } /** * Sanitizes the given string into a Semgrex friendly name */ public static String sanitizeForSemgrexName(String text) { text = text.replaceAll("\\.", "_DOT_"); text = text.replaceAll("\\,", "_COMMA_"); text = text.replaceAll("\\\\", "_BSLASH_"); text = text.replaceAll("\\/", "_BSLASH_"); text = text.replaceAll("\\?", "_QUES_"); text = text.replaceAll("\\!", "_BANG_"); text = text.replaceAll("\\$", "_DOL_"); text = text.replaceAll("\\!", "_BANG_"); text = text.replaceAll("\\&", "_AMP_"); text = text.replaceAll("\\:", "_COL_"); text = text.replaceAll("\\;", "_SCOL_"); text = text.replaceAll("\\#", 
"_PND_"); text = text.replaceAll("\\@", "_AND_"); text = text.replaceAll("\\%", "_PER_"); text = text.replaceAll("\\(","_LRB_"); text = text.replaceAll("\\)", "_RRB_"); return text; } /** * Given a {@code SemanticGraph}, sets the lemmas on its label * objects based on their word and tag. */ public static void lemmatize(SemanticGraph sg) { for (IndexedWord node : sg.vertexSet()) { node.setLemma(Morphology.lemmaStatic(node.word(), node.tag())); } } /** * GIven a graph, returns a new graph with the the new sentence index enforced. * NOTE: new vertices are inserted. * TODO: is this ok? rewrite this? */ public static SemanticGraph setSentIndex(SemanticGraph sg, int newSentIndex) { SemanticGraph newGraph = new SemanticGraph(sg); List prevRoots = new ArrayList<>(newGraph.getRoots()); List newRoots = new ArrayList<>(); // TODO: we are using vertexListSorted here because we're changing // vertices while iterating. Perhaps there is a better way to do it. for (IndexedWord node : newGraph.vertexListSorted()) { IndexedWord newWord = new IndexedWord(node); newWord.setSentIndex(newSentIndex); SemanticGraphUtils.replaceNode(newWord, node, newGraph); if (prevRoots.contains(node)) newRoots.add(newWord); } newGraph.setRoots(newRoots); return newGraph; } //----------------------------------------------------------------------------------------------- // Graph redundancy checks //----------------------------------------------------------------------------------------------- /** * Removes duplicate graphs from the set, using the string form of the graph * as the key (obviating issues with object equality). */ public static Collection removeDuplicates(Collection graphs) { Map map = Generics.newHashMap(); for (SemanticGraph sg : graphs) { String keyVal = sg.toString().intern(); map.put(keyVal, sg); } return map.values(); } /** * Given the set of graphs to remove duplicates from, also removes those on the tabu graphs * (and does not include them in the return set). 
*/ public static Collection removeDuplicates(Collection graphs, Collection tabuGraphs) { Map tabuMap = Generics.newHashMap(); for (SemanticGraph tabuSg : tabuGraphs) { String keyVal = tabuSg.toString().intern(); tabuMap.put(keyVal, tabuSg); } Map map = Generics.newHashMap(); for (SemanticGraph sg : graphs) { String keyVal = sg.toString().intern(); if (tabuMap.containsKey(keyVal)) continue; map.put(keyVal, sg); } return map.values(); } public static Collection removeDuplicates(Collection graphs, SemanticGraph tabuGraph) { Collection tabuSet = Generics.newHashSet(); tabuSet.add(tabuGraph); return removeDuplicates(graphs, tabuSet); } // ----------------------------------------------------------------------------------------------- // Tree matching code // ----------------------------------------------------------------------------------------------- /** * Given a CFG Tree parse, and the equivalent SemanticGraph derived from that Tree, generates a mapping * from each of the tree terminals to the best-guess SemanticGraph node(s). * This is performed using lexical matching, finding the nth match. * NOTE: not all tree nodes may match a Semgraph node, esp. for tokens removed in a collapsed Semgraph, * such as prepositions. */ public static Map mapTreeToSg(Tree tree, SemanticGraph sg) { // In order to keep track of positions, we store lists, in order encountered, of lex terms. // e.g. lexToTreeNode.get("the").get(2) should point to the same word as lexToSemNode.get("the").get(2) // Because IndexedWords may be collapsed together "A B" -> "A_B", we check the value of current(), and // split on whitespace if present. 
MapList lexToTreeNode = new MapList<>(); MapList lexToSemNode = new MapList<>(); for (Tree child : tree.getLeaves()) { List leafProxies = TreeNodeProxy.create(child, tree); for (TreeNodeProxy proxy : leafProxies) lexToTreeNode.add(proxy.lex, proxy); } Map depthMap = Generics.newHashMap(); for (IndexedWord node : sg.vertexSet()) { List path = sg.getPathToRoot(node); if (path != null) depthMap.put(node, path.size()); else depthMap.put(node, 99999); // Use an arbitrarily deep depth value, to trick it into never being used. List nodeProxies = IndexedWordProxy.create(node); for (IndexedWordProxy proxy : nodeProxies) lexToSemNode.add(proxy.lex, proxy); } // Now the map-lists (string->position encountered indices) are populated, // simply go through, finding matches. // NOTE: we use TreeNodeProxy instead of keying off of Tree, as // hash codes for Tree nodes do not consider position of the tree // within a tree: two subtrees with the same layout and child // labels will be equal. Map map = Generics.newHashMap(); for (String lex : lexToTreeNode.keySet()) { for (int i=0;i "+treeNode.toString()+", #="+treeNode.nodeNumber(root); } private TreeNodeProxy(Tree intree, String lex, Tree root) { this.treeNode = intree; this.lex = lex; this.root = root; } public static List create(Tree intree, Tree root) { List ret = new ArrayList<>(); if (intree.isLeaf()) { ret.add(new TreeNodeProxy(intree, intree.label().value(), root)); } else for (LabeledWord lword : intree.labeledYield()) { ret.add(new TreeNodeProxy(intree, lword.word(), root)); } return ret; } } /** * This is used to uniquely index trees within a * Tree, maintaining the position of this subtree * within the context of the root. 
* @author Eric Yeh * */ public static class PositionedTree { Tree tree; Tree root; int nodeNumber; public String toString() { return tree+"."+nodeNumber; } public PositionedTree(Tree tree, Tree root) { this.tree = tree; this.root = root; this.nodeNumber = tree.nodeNumber(root); } public boolean equals(Object obj) { if (obj instanceof PositionedTree) { PositionedTree tgt = (PositionedTree) obj; return tree.equals(tgt.tree) && root.equals(tgt.root) && tgt.nodeNumber == nodeNumber; } return false; } /** * TODO: verify this is correct */ @Override public int hashCode() { int hc = tree.hashCode() ^ (root.hashCode() << 8); hc ^= (2 ^ nodeNumber); return hc; } } /** * Private helper class for {@code mapTreeToSg}. Acts to * map between an IndexedWord (in a SemanticGraph) and a lexical value. * @author lumberjack * */ private static final class IndexedWordProxy { IndexedWord node; String lex; public String toString() { return lex+" -> "+node.word()+":"+node.sentIndex()+"."+node.index(); } private IndexedWordProxy(IndexedWord node, String lex) { this.node = node; this.lex = lex; } /** * Generates a set of IndexedWordProxy objects. If the current() field is present, splits the tokens by * a space, and for each, creates a new IndexedWordProxy, in order encountered, referencing this current * node, but using the lexical value of the current split token. Otherwise just use the value of word(). * This is used to retain attribution to the originating node. */ public static List create(IndexedWord node) { List ret = new ArrayList<>(); if (node.originalText().length() > 0) { for (String token : node.originalText().split(" ")) { ret.add(new IndexedWordProxy(node, token)); } } else { ret.add(new IndexedWordProxy(node, node.word())); } return ret; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy