edu.stanford.nlp.patterns.surface.ApplyPatterns Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.patterns.surface;

import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.Callable;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.*;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Triple;

/**
 * Applying SurfacePattern to sentences.
 * @param 
 */
public class ApplyPatterns  implements Callable, CollectionValuedMap>, Set>> {
  String label;
  Map patterns;
  List sentids;
  boolean removeStopWordsFromSelectedPhrases;
  boolean removePhrasesWithStopWords;
  ConstantsAndVariables constVars;
  Map sents = null;


  public ApplyPatterns(Map sents, List sentids, Map patterns, String label, boolean removeStopWordsFromSelectedPhrases,
                       boolean removePhrasesWithStopWords, ConstantsAndVariables cv) {
    this.sents = sents;
    this.patterns = patterns;
    this.sentids = sentids;
    this.label = label;
    this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
    this.removePhrasesWithStopWords = removePhrasesWithStopWords;
    this.constVars = cv;
  }

  @Override
  public Triple, CollectionValuedMap>, Set> call()
    throws Exception {
    // CollectionValuedMap tokensMatchedPattern = new
    // CollectionValuedMap();
    try{
      Set alreadyLabeledPhrases = new HashSet<>();
      TwoDimensionalCounter allFreq = new TwoDimensionalCounter<>();
      CollectionValuedMap> matchedTokensByPat = new CollectionValuedMap<>();
      for (String sentid : sentids) {
        List sent = sents.get(sentid).getTokens();
        for (Entry pEn : patterns.entrySet()) {

          if (pEn.getKey() == null)
            throw new RuntimeException("why is the pattern " + pEn + " null?");

          TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);

//        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
//        m.setFindType(SequenceMatcher.FindType.FIND_ALL);

          //Higher branch values makes the faster but uses more memory
          m.setBranchLimit(5);

          while (m.find()) {

            int s = m.start("$term");
            int e = m.end("$term");

            assert e-s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey()  + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;

            String phrase = "";
            String phraseLemma = "";
            boolean useWordNotLabeled = false;
            boolean doNotUse = false;

            //find if the neighboring words are labeled - if so - club them together
            if(constVars.clubNeighboringLabeledWords) {
              for (int i = s - 1; i >= 0; i--) {
                if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                  s = i + 1;
                  break;
                }
              }
              for (int i = e; i < sent.size(); i++) {
                if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                  e = i;
                  break;
                }
              }
            }

            //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
            boolean[] addedindices = new boolean[e-s];
            Arrays.fill(addedindices, false);

            for (int i = s; i < e; i++) {
              CoreLabel l = sent.get(i);
              l.set(PatternsAnnotations.MatchedPattern.class, true);

              if(!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
                l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());

              SurfacePattern pSur = (SurfacePattern) pEn.getValue();
              assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
              assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
              l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);

              for (Entry ig : constVars.getIgnoreWordswithClassesDuringSelection()
                .get(label).entrySet()) {
                if (l.containsKey(ig.getKey())
                  && l.get(ig.getKey()).equals(ig.getValue())) {
                  doNotUse = true;
                }
              }
              boolean containsStop = containsStopWord(l,
                constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
              if (removePhrasesWithStopWords && containsStop) {
                doNotUse = true;
              } else {
                if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                  if (label == null
                    || l.get(constVars.getAnswerClass().get(label)) == null
                    || !l.get(constVars.getAnswerClass().get(label)).equals(
                    label.toString())) {
                    useWordNotLabeled = true;
                  }
                  phrase += " " + l.word();
                  phraseLemma += " " + l.lemma();
                  addedindices[i-s] = true;
                }
              }
            }

            for(int i =0; i < addedindices.length; i++){
              if(i > 0 && i < addedindices.length -1 && addedindices[i-1] == true && addedindices[i] == false && addedindices[i+1] == true){
                doNotUse = true;
                break;
              }
            }
            if (!doNotUse) {
              matchedTokensByPat.add(pEn.getValue(), new Triple<>(
                      sentid, s, e - 1));

              phrase = phrase.trim();
              if(!phrase.isEmpty()){
                phraseLemma = phraseLemma.trim();
                CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
                allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
                if (!useWordNotLabeled)
                  alreadyLabeledPhrases.add(candPhrase);
              }
            }
          }
        }
      }
      return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
    }catch(Exception e){
      e.printStackTrace();
      throw e;
    }
  }

  static boolean lemmaExists(CoreLabel l ){
    if(l.lemma() != null && l.lemma().length() > 0)
      return true;
    else
      return false;

  }
  boolean  containsStopWord(CoreLabel l, Set commonEngWords, java.util.regex.Pattern ignoreWordRegex) {
    // if(useWordResultCache.containsKey(l.word()))
    // return useWordResultCache.get(l.word());

    if ((commonEngWords != null && ((lemmaExists(l) && commonEngWords.contains(l.lemma())) || commonEngWords.contains(l.word()))) || (ignoreWordRegex != null && ((lemmaExists(l) && ignoreWordRegex.matcher(l.lemma()).matches()) || ignoreWordRegex.matcher(l.word()).matches()))){
      //|| (ignoreWords !=null && (ignoreWords.contains(l.lemma()) || ignoreWords.contains(l.word())))) {
      // useWordResultCache.putIfAbsent(l.word(), false);
      return true;
    }
    //
    // if (l.word().length() >= minLen4Fuzzy) {
    // try {
    // String matchedFuzzy = NoisyLabelSentences.containsFuzzy(commonEngWords,
    // l.word(), minLen4Fuzzy);
    // if (matchedFuzzy != null) {
    // synchronized (commonEngWords) {
    // commonEngWords.add(l.word());
    // System.out.println("word is " + l.word() + " and matched fuzzy with " +
    // matchedFuzzy);
    // }
    // useWordResultCache.putIfAbsent(l.word(), false);
    // return false;
    // }
    // } catch (Exception e) {
    // e.printStackTrace();
    // System.out.println("Exception " + " while fuzzy matching " + l.word());
    // }
    // }
    // useWordResultCache.putIfAbsent(l.word(), true);
    return false;
  }

}