
edu.stanford.nlp.patterns.dep.ExtractPhraseFromPattern Maven / Gradle / Ivy


Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
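
For orientation, here is a minimal pipeline sketch (illustrative, not part of the artifact page); the annotator names are the standard CoreNLP ones for the analyses listed above:

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineSketch {
  public static void main(String[] args) {
    // tokenize/ssplit: tokens and sentences; pos/lemma: tags and base forms;
    // ner: named entities with date/time/number normalization; parse:
    // constituency and dependency structure; dcoref: coreference chains.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Stanford University is located in California.");
    pipeline.annotate(document);
  }
}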

There is a newer version: 4.5.7
package edu.stanford.nlp.patterns.dep;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.function.Function;

//import org.jdom.Element;
//import org.jdom.Namespace;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

public class ExtractPhraseFromPattern {

  public List<String> cutoffRelations = new ArrayList<>();
  public int maxDepth = Integer.MAX_VALUE;
  public static List<String> ignoreTags = Arrays.asList("PRP", "PRP$", "CD",
      "DT", ".", "..", ",", "SYM");
  boolean ignoreCommonTags = true;
  public static ArrayList<String> cutoffTags = new ArrayList<>();
  public int maxPhraseLength = Integer.MAX_VALUE;
  //Namespace curNS;

  Map<SemgrexPattern, List<Pair<String, SemanticGraph>>> matchedGraphsForPattern = new HashMap<>();

  // Debug verbosity: 0 means none, 1 means partial, 2 shows the sentences and
  // their extracted elements (technique, app, and focus), and 3 means full.
  private static int DEBUG = 1;

  public ExtractPhraseFromPattern() {}
  public ExtractPhraseFromPattern(boolean ignoreCommonTags, int maxPhraseLength) {
    this.maxPhraseLength = maxPhraseLength;
    this.ignoreCommonTags = ignoreCommonTags;
    //this.curNS = null;
  }

  public void setMaxPhraseLength(int maxPhraseLength){
    this.maxPhraseLength = maxPhraseLength;
  }

  //public ExtractPhraseFromPattern(Namespace curNS) {
  //  this.curNS = curNS;
  //}

  private boolean checkIfSatisfiedMaxDepth(SemanticGraph g, IndexedWord parent,
      IndexedWord child, IntPair depths) {
    if (depths.get(0) == Integer.MAX_VALUE)
      return true;

    if (parent.equals(child))
      return true;

    boolean foundInMaxDepth = false;

    for (IndexedWord c : g.getChildren(parent)) {
      if (c.equals(child)) {
        return true;
      }
    }

    depths.set(1, depths.get(1) + 1);

    if (depths.get(1) >= depths.get(0))
      return false;

    for (IndexedWord c : g.getChildren(parent)) {
      foundInMaxDepth = checkIfSatisfiedMaxDepth(g, c, child, depths);
      if (foundInMaxDepth)
        return true;
    }

    return false;
  }

  public void processSentenceForType(SemanticGraph g,
      List<SemgrexPattern> typePatterns, List<String> textTokens,
      Collection<String> typePhrases, Collection<IntPair> typeIndices,
      Collection<IndexedWord> typeTriggerWords, boolean findSubTrees,
      Collection<ExtractedPhrase> extractedPhrases, boolean lowercase) {

    for (SemgrexPattern pattern : typePatterns) {
      Collection<IndexedWord> triggerWords = getSemGrexPatternNodes(g,
          textTokens, typePhrases, typeIndices, pattern,
          findSubTrees, extractedPhrases, lowercase, o -> true);
      for (IndexedWord w : triggerWords) {
        if (!typeTriggerWords.contains(w))
          typeTriggerWords.add(w);
      }
      // System.out.println("the string is " + StringUtils.join(focuss, ";"));
    }

  }

  /*
   * Given a SemanticGraph g, a SemgrexPattern pattern, and the sentence
   * tokens, runs the pattern matcher (SemgrexMatcher m) and iterates over
   * the matches to collect the matched words and the phrases below them.
   *
   * If DEBUG > 3, it also appends the matched (sentence text, graph) pair
   * to matchedGraphsForPattern.get(pattern) for later printing.
   *
   * See the demoGetSemGrexPatternNodes sketch after this method for an
   * illustrative call.
   */
  public Set<IndexedWord> getSemGrexPatternNodes(SemanticGraph g,
      List<String> tokens, Collection<String> outputNodes, Collection<IntPair> outputIndices,
      SemgrexPattern pattern, boolean findSubTrees,
      Collection<ExtractedPhrase> extractedPhrases, boolean lowercase, Function<CoreLabel, Boolean> acceptWord) {

    Set<IndexedWord> foundWordsParents = new HashSet<>();
    SemgrexMatcher m = pattern.matcher(g, lowercase);
    while (m.find()) {

      IndexedWord w = m.getNode("node");
      //System.out.println("found a match for " + pattern.pattern());

      IndexedWord parent = m.getNode("parent");

      boolean ifSatisfiedMaxDepth = checkIfSatisfiedMaxDepth(g, parent, w,
          new IntPair(maxDepth, 0));
      if (!ifSatisfiedMaxDepth)
        continue;


      if (DEBUG > 3) {
        List<Pair<String, SemanticGraph>> matchedGraphs = matchedGraphsForPattern.get(pattern);

        if (matchedGraphs == null)
          matchedGraphs = new ArrayList<>();
        matchedGraphs.add(new Pair<>(StringUtils.join(
                tokens, " "), g));
        //if (DEBUG >= 3)
        //  System.out.println("matched pattern is " + pattern);
        matchedGraphsForPattern.put(pattern, matchedGraphs);
      }

      foundWordsParents.add(parent);

      // String relationName = m.getRelnString("reln");
      // System.out.println("word is " + w.lemma() + " and " + w.tag());
      ArrayList<IndexedWord> seenNodes = new ArrayList<>();
      List<String> cutoffrelations = new ArrayList<>();
//      if (elementStr.equalsIgnoreCase("technique"))
//        cutoffrelations = cutoffRelationsForTech;
//      if (elementStr.equalsIgnoreCase("app"))
//        cutoffrelations = this.cuttoffRelationsForApp;
      //System.out.println("g is ");
      //g.prettyPrint();
      printSubGraph(g, w, cutoffrelations, tokens, outputNodes, outputIndices, seenNodes, new ArrayList<>(),
          findSubTrees, extractedPhrases, pattern, acceptWord);
    }
    return foundWordsParents;
  }
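
  // Illustrative usage sketch added for documentation; it is not part of the
  // original class. It assumes `graph` and `tokens` come from a parsed
  // sentence (e.g. a StanfordCoreNLP pipeline with a dependency parser), and
  // that the semgrex pattern binds the node names "node" and "parent" that
  // getSemGrexPatternNodes looks up. The relation name "dobj" is only an
  // example and assumes a pre-Universal-Dependencies scheme.
  public static List<String> demoGetSemGrexPatternNodes(SemanticGraph graph,
      List<String> tokens) {
    SemgrexPattern pattern = SemgrexPattern.compile("{}=parent >dobj {}=node");
    ExtractPhraseFromPattern extractor = new ExtractPhraseFromPattern(true, 5);
    List<String> phrases = new ArrayList<>();
    List<IntPair> phraseIndices = new ArrayList<>();
    List<ExtractedPhrase> extracted = new ArrayList<>();
    // Accept every word (the last argument); collect phrases of at most
    // maxPhraseLength = 5 tokens headed by each matched "node".
    extractor.getSemGrexPatternNodes(graph, tokens, phrases, phraseIndices,
        pattern, false, extracted, true, w -> true);
    return phrases;
  }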

  // Note: the output index pair (startIndex, endIndex) is zero-based and
  // inclusive of endIndex.
  public void printSubGraph(SemanticGraph g, IndexedWord w,
                            List<String> additionalCutOffRels,
                            List<String> textTokens,
                            Collection<String> listOfOutput, Collection<IntPair> listOfOutputIndices,
                            List<IndexedWord> seenNodes, List<IndexedWord> doNotAddThese,
                            boolean findSubTrees, Collection<ExtractedPhrase> extractedPhrases,
                            SemgrexPattern pattern, Function<CoreLabel, Boolean> acceptWord) {
    try {
      if (seenNodes.contains(w))
        return;
      seenNodes.add(w);

      if (doNotAddThese.contains(w))
        return;

      List<IndexedWord> andNodes = new ArrayList<>();

      // Expand coordination: collect conjuncts reachable via the collapsed
      // conj_and relation, emit each one as its own phrase, and exclude them
      // from the current phrase (via doNotAddThese below).
      descendantsWithReln(g, w, "conj_and", new ArrayList<>(),
          andNodes);

      //System.out.println("and nodes are " + andNodes);

      for (IndexedWord w1 : andNodes) {
        printSubGraph(g, w1, additionalCutOffRels, textTokens,
            listOfOutput, listOfOutputIndices, seenNodes,
            doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);

      }
      doNotAddThese.addAll(andNodes);

      List<String> allCutOffRels = new ArrayList<>();
      if (additionalCutOffRels != null)
        allCutOffRels.addAll(additionalCutOffRels);
      allCutOffRels.addAll(cutoffRelations);

      CollectionValuedMap<Integer, String> featPerToken = new CollectionValuedMap<>();
      Collection<String> feat = new ArrayList<>();
      GetPatternsFromDataMultiClass.getFeatures(g, w, true, feat, null);


      Set<IndexedWord> words = descendants(g, w, allCutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, featPerToken);


      // words.addAll(andNodes);

      // if (includeSiblings == true) {
      // for (IndexedWord ws : g.getSiblings(w)) {
      // if (additionalCutOffNodes == null
      // || !additionalCutOffNodes.contains(g.reln(g.getParent(w),
      // ws).getShortName()))
      // words.addAll(descendants(g, ws, additionalCutOffNodes, doNotAddThese));
      // }
      // }
      // if(afterand != null){
      // Set wordsAnd = descendants(g,afterand,
      // additionalCutOffNodes);
      // words.removeAll(wordsAnd);
      // printSubGraph(g,afterand, includeSiblings, additionalCutOffNodes);
      // }
      //System.out.println("words are " + words);
      if (words.size() > 0) {
        int min = Integer.MAX_VALUE, max = -1;
        for (IndexedWord word : words) {
          if (word.index() < min)
            min = word.index();
          if (word.index() > max)
            max = word.index();
        }

        IntPair indices;

        // Map ph = new TreeMap();
        // String phrase = "";
        // for (IndexedWord word : words) {
        // ph.put(word.index(), word.value());
        // }
        // phrase = StringUtils.join(ph.values(), " ");
        // Clip overly long spans on the right so the phrase is at most
        // maxPhraseLength tokens long.
        if ((max - min + 1) > maxPhraseLength) {
          max = min + maxPhraseLength - 1;
        }
        indices = new IntPair(min - 1, max - 1);
        String phrase = StringUtils.join(
          textTokens.subList(min - 1, max), " ");
        phrase = phrase.trim();
        feat.add("LENGTH-" + (max - min + 1));
        for(int i = min; i <= max; i++)
          feat.addAll(featPerToken.get(i));

        //System.out.println("phrase is " + phrase  + " index is " + indices + " and maxphraselength is " + maxPhraseLength + " and descendentset is " + words);
        ExtractedPhrase extractedPh = new ExtractedPhrase(min - 1, max - 1, pattern, phrase, Counters.asCounter(feat));


        if (!listOfOutput.contains(phrase) && !doNotAddThese.contains(phrase)) {

//          if (sentElem != null) {
//            Element node = new Element(elemString, curNS);
//            node.addContent(phrase);
//            sentElem.addContent(node);
//          }
          listOfOutput.add(phrase);

          if (!listOfOutputIndices.contains(indices)) {
            listOfOutputIndices.add(indices);
            extractedPhrases.add(extractedPh);
          }

          if (findSubTrees) {
            for (IndexedWord word : words)
              if (!seenNodes.contains(word))
                printSubGraph(g, word, additionalCutOffRels,
                    textTokens, listOfOutput,
                    listOfOutputIndices, seenNodes, doNotAddThese,
                    findSubTrees, extractedPhrases, pattern, acceptWord);
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  public static Set<IndexedWord> descendants(SemanticGraph g,
      IndexedWord vertex, List<String> allCutOffRels,
      List<IndexedWord> doNotAddThese, boolean ignoreCommonTags, Function<CoreLabel, Boolean> acceptWord,
      CollectionValuedMap<Integer, String> feat) throws Exception {
    // Do a depth first search
    Set<IndexedWord> descendantSet = new HashSet<>();

    if (doNotAddThese != null && doNotAddThese.contains(vertex))
      return descendantSet;

    if (!acceptWord.apply(vertex.backingLabel()))
      return descendantSet;

    descendantsHelper(g, vertex, descendantSet, allCutOffRels, doNotAddThese,
            new ArrayList<>(), ignoreCommonTags, acceptWord, feat);
//    String descStr = "";
//    for(IndexedWord descendant: descendantSet){
//      descStr += descendant.word()+" ";
//    }
//    System.out.println(descStr);
    return descendantSet;
  }

  static boolean checkIfSatisfiesRelConstrains(SemanticGraph g,
      IndexedWord curr, IndexedWord child, List<String> allCutOffRels, GrammaticalRelation rel) {
    String relName = rel.getShortName();
    String relSpecificName = rel.toString();
    String relFullName = rel.getLongName();
    if (allCutOffRels != null)
      for (String check : allCutOffRels) {
        if (relName.matches(check)
            || (relSpecificName != null && relSpecificName.matches(check))
            || (relFullName != null && relFullName.matches(check))) {
          return true;
        }
      }
    return false;
  }

  private static void descendantsHelper(SemanticGraph g, IndexedWord curr,
      Set<IndexedWord> descendantSet, List<String> allCutOffRels,
      List<IndexedWord> doNotAddThese, List<IndexedWord> seenNodes, boolean ignoreCommonTags,
      Function<CoreLabel, Boolean> acceptWord, CollectionValuedMap<Integer, String> feat)
      throws Exception {

    if (seenNodes.contains(curr))
      return;

    seenNodes.add(curr);
    if (descendantSet.contains(curr) || (doNotAddThese != null && doNotAddThese.contains(curr)) || !acceptWord.apply(curr.backingLabel())) {
      return;
    }

    if (!ignoreCommonTags || !ignoreTags.contains(curr.tag().trim())) {
      descendantSet.add(curr);
    }

    for (IndexedWord child : g.getChildren(curr)) {
      boolean dontuse = false;
      if (doNotAddThese != null && doNotAddThese.contains(child))
        dontuse = true;

      GrammaticalRelation rel = null;
      if (!dontuse) {
        rel = g.reln(curr, child);
        dontuse = checkIfSatisfiesRelConstrains(g, curr, child, allCutOffRels, rel);
      }
      if (!dontuse) {
        for (String cutOffTagRegex : cutoffTags) {
          if (child.tag().matches(cutOffTagRegex)) {
            if (DEBUG >= 5)
              System.out.println("ignored tag " + child
                  + " because it satisfied " + cutOffTagRegex);
            dontuse = true;
            break;
          }
        }

      }
      if (!dontuse) {
        if (!feat.containsKey(curr.index())) {
          feat.put(curr.index(), new ArrayList<>());
        }
        GetPatternsFromDataMultiClass.getFeatures(g, curr, false, feat.get(curr.index()), rel);
        //feat.add(curr.index(), "REL-" + rel.getShortName());
        descendantsHelper(g, child, descendantSet, allCutOffRels,
            doNotAddThese, seenNodes, ignoreCommonTags, acceptWord, feat);
      }
    }
  }

  // get descendants that have this relation
  private void descendantsWithReln(SemanticGraph g, IndexedWord w,
      String relation, List<IndexedWord> seenNodes,
      List<IndexedWord> descendantSet) {

    if (seenNodes.contains(w))
      return;
    seenNodes.add(w);
    if (descendantSet.contains(w))
      return;
    if (ignoreCommonTags && ignoreTags.contains(w.tag().trim()))
      return;
    for (IndexedWord child : g.getChildren(w)) {
      for (SemanticGraphEdge edge : g.getAllEdges(w, child)) {
        if (edge.getRelation().toString().equals(relation)) {
          descendantSet.add(child);
        }
      }
      descendantsWithReln(g, child, relation, seenNodes, descendantSet);
    }
  }

  public void printMatchedGraphsForPattern(String filename,
      int maxGraphsPerPattern) throws Exception {
    BufferedWriter w = new BufferedWriter(new FileWriter(filename));
    for (Entry<SemgrexPattern, List<Pair<String, SemanticGraph>>> en : matchedGraphsForPattern
        .entrySet()) {
      w.write("\n\nFor Pattern: " + en.getKey().pattern() + "\n");
      int num = 0;
      for (Pair<String, SemanticGraph> gEn : en.getValue()) {
        num++;
        if (num > maxGraphsPerPattern)
          break;
        w.write(gEn.first() + "\n" + gEn.second().toFormattedString() + "\n\n");
      }
    }
    w.close();
  }
}



