
edu.stanford.nlp.patterns.InvertedIndexByTokens Maven / Gradle / Ivy

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
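
Before the source listing below, here is a minimal sketch of how such a pipeline is typically driven through CoreNLP's Properties-based API. The class name PipelineSketch, the chosen annotators, and the sample sentence are illustrative only and are not part of this artifact.

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class PipelineSketch {
  public static void main(String[] args) {
    // Annotators chosen for illustration; heavier ones (parse, coref) can be added or dropped.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("Stanford University is located in California.");
    pipeline.annotate(doc);

    // Print word, lemma, part-of-speech tag, and named-entity tag for each token.
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        System.out.printf("%s\t%s\t%s\t%s%n", token.word(), token.lemma(), token.tag(), token.ner());
      }
    }
  }
}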

There is a newer version: 4.5.7
package edu.stanford.nlp.patterns;

import java.io.*;
import java.util.*;
import java.util.function.Function;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.patterns.surface.Token;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.ArgumentParser;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Creates an inverted index of (classkey:value) => {sentid1,sentid2,.. }.
 *
 * @author Sonal Gupta ([email protected])
 *
 */
public class InvertedIndexByTokens<E extends Pattern> extends SentenceIndex<E> implements Serializable {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(InvertedIndexByTokens.class);

  private static final long serialVersionUID = 1L;

  Map<String, Set<String>> index;

  public InvertedIndexByTokens(Properties props, Set<String> stopWords, Function<CoreLabel, Map<String, String>> transformSentenceToString) {
    super(stopWords, transformSentenceToString);
    ArgumentParser.fillOptions(this, props);
    index = new HashMap<>();
  }

  public InvertedIndexByTokens(Properties props, Set<String> stopWords, Function<CoreLabel, Map<String, String>> transformSentenceToString, Map<String, Set<String>> index) {
    super(stopWords, transformSentenceToString);
    ArgumentParser.fillOptions(this, props);
    this.index = index;
  }




  /** Indexes the tokens of each sentence in {@code sents}, keyed by sentence id. */
  @Override
  public void add(Map<String, DataInstance> sents, boolean addProcessedText) {
    for (Map.Entry<String, DataInstance> sEn : sents.entrySet()) {
      add(sEn.getValue().getTokens(), sEn.getKey(), addProcessedText);
    }
  }

  /** Adds one (classkey:value) posting per transformed token attribute; optionally also indexes the processed text. */
  @Override
  protected void add(List<CoreLabel> sent, String sentId, boolean addProcessedText){
    numAllSentences ++;
    for (CoreLabel l : sent) {

      //String w = l.word();
//        w = w.replaceAll("/", "\\\\/");
//        add(w, sEn.getKey());
      Map<String, String> addThis = this.transformCoreLabeltoString.apply(l);
      for(Map.Entry<String, String> en: addThis.entrySet()){
        String val = combineKeyValue(en.getKey(),en.getValue());
        add(val, sentId);
      }
      if(addProcessedText){
        String val  = Token.getKeyForClass(PatternsAnnotations.ProcessedTextAnnotation.class) +":"+ l.get(PatternsAnnotations.ProcessedTextAnnotation.class);
        if(!stopWords.contains(val.toLowerCase()))
          add(val, sentId);
      }
    }
  }

  @Override
  public void finishUpdating() {
    //nothing to do right now!
  }

  @Override
  public void update(List<CoreLabel> tokens, String sentid) {
    add(tokens, sentid, false);
  }

  /** Records that the term {@code w} occurs in the sentence with id {@code sentid}. */
  void add(String w, String sentid){
    Set<String> sentids = index.get(w);

    if (sentids == null) {
      sentids = new HashSet<>();
    }

    sentids.add(sentid);

    index.put(w, sentids);
  }

  String combineKeyValue(String key, String value){
    return key+":"+value;
  }

  /**
   * Returns the sentence ids that contain every non-stopword (classkey:value) term
   * in {@code relevantWords}; returns an empty set if any term is missing from the index.
   */
  public Set<String> getFileSentIds(CollectionValuedMap<String, String> relevantWords) {
    Set<String> sentids = null;
    for (Map.Entry<String, Collection<String>> en : relevantWords.entrySet()) {
      for (String en2 : en.getValue()) {
        if (!stopWords.contains(en2.toLowerCase())) {
          String w = combineKeyValue(en.getKey(), en2);
          Set<String> st = index.get(w);
          if (st == null) {
            //log.info("\n\nWARNING: INDEX HAS NO SENTENCES FOR " + w);
            return Collections.emptySet();
            //throw new RuntimeException("How come the index does not have sentences for " + w);
          }
          if (sentids == null)
            sentids = st;
          else
            sentids = CollectionUtils.intersection(sentids, st);
        }
      }
    }
    return sentids;
  }

  //returns for each pattern, list of sentence ids
  public Map<E, Set<String>> getFileSentIdsFromPats(Collection<E> pats) {
    Map<E, Set<String>> sents = new HashMap<>();
    for(E pat: pats){
      Set<String> ids = getFileSentIds(pat.getRelevantWords());
      Redwood.log(ConstantsAndVariables.extremedebug, "For pattern with index " + pat + " extracted the following sentences from the index " + ids);
      sents.put(pat, ids);
    }
    return sents;
  }

  // The dir argument is not actually used.
  public static InvertedIndexByTokens createIndex(Map<String, DataInstance> sentences, Properties props, Set<String> stopWords, String dir, Function<CoreLabel, Map<String, String>> transformCoreLabeltoString) {
    InvertedIndexByTokens inv = new InvertedIndexByTokens(props, stopWords, transformCoreLabeltoString);

    if(sentences != null && sentences.size() > 0)
      inv.add(sentences, true);
    System.out.println("Created index with size " + inv.size() + ". Don't worry if it's zero and you are using batch process sents.");
    return inv;
  }

  @Override
  public Map<E, Set<String>> queryIndex(Collection<E> patterns) {
    Map<E, Set<String>> sentSentids = getFileSentIdsFromPats(patterns);
    return sentSentids;
  }

  @Override
  public void saveIndex(String dir){
    try {
      IOUtils.ensureDir(new File(dir));
      IOUtils.writeObjectToFile(index, dir + "/map.ser");
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  //called by SentenceIndex.loadIndex
  public static InvertedIndexByTokens loadIndex(Properties props, Set<String> stopwords, String dir, Function<CoreLabel, Map<String, String>> transformSentenceToString) {
    try {
      Map>  index = IOUtils.readObjectFromFile(dir + "/map.ser");
      System.out.println("Loading inverted index from " + dir);
      return new InvertedIndexByTokens(props, stopwords, transformSentenceToString, index);
    } catch (Exception e) {
      throw new RuntimeException("Cannot load the inverted index. " + e);
    }
  }


}
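
For readers who want the indexing scheme in isolation, the following self-contained sketch (not part of CoreNLP) mirrors the behavior above: combineKeyValue-style "classkey:value" terms map to sets of sentence ids, and a query intersects the posting sets of all its terms, returning an empty set as soon as any term is missing. The class name TinyInvertedIndexSketch and the example keys and sentence ids are illustrative.

import java.util.*;

public class TinyInvertedIndexSketch {

  // term ("classkey:value") -> set of sentence ids containing it
  private final Map<String, Set<String>> index = new HashMap<>();

  // Mirrors InvertedIndexByTokens.combineKeyValue(key, value).
  private static String term(String classKey, String value) {
    return classKey + ":" + value;
  }

  // Mirrors add(String w, String sentid): record one posting.
  public void add(String classKey, String value, String sentId) {
    index.computeIfAbsent(term(classKey, value), k -> new HashSet<>()).add(sentId);
  }

  // Mirrors getFileSentIds: intersect the postings of all terms; empty set if any term is unknown.
  public Set<String> query(Map<String, Collection<String>> relevantWords) {
    Set<String> result = null;
    for (Map.Entry<String, Collection<String>> en : relevantWords.entrySet()) {
      for (String value : en.getValue()) {
        Set<String> postings = index.get(term(en.getKey(), value));
        if (postings == null)
          return Collections.emptySet();
        if (result == null)
          result = new HashSet<>(postings);
        else
          result.retainAll(postings);
      }
    }
    return result == null ? Collections.emptySet() : result;
  }

  public static void main(String[] args) {
    TinyInvertedIndexSketch idx = new TinyInvertedIndexSketch();
    idx.add("ProcessedText", "obama", "sent1");
    idx.add("ProcessedText", "obama", "sent2");
    idx.add("ProcessedText", "president", "sent2");

    Map<String, Collection<String>> query = new HashMap<>();
    query.put("ProcessedText", Arrays.asList("obama", "president"));
    System.out.println(idx.query(query)); // prints [sent2]
  }
}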



