All downloads are free. The search and download features use the official Maven repository.

edu.stanford.nlp.patterns.surface.CreatePatterns Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.patterns.surface;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import edu.stanford.nlp.patterns.*;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Computes the patterns around the tokens of a collection of sentences, spreading the work over
 * {@code constVars.numThreads} worker threads, and hands the results to a
 * {@link PatternsForEachToken} store.
 *
 * <p>NOTE(review): the generic type arguments in this class were reconstructed from how the values
 * are used (sentence ids are iterated as {@code String}, sentences are read as
 * {@code DataInstance}, tasks return {@code Boolean}). The per-sentence pattern map is assumed to
 * be keyed by integer token index - TODO confirm against {@code PatternFactory}.
 *
 * @param <E> element type of the pattern sets produced for each token (presumably the project's
 *     pattern type; left unbounded so existing raw usages keep compiling)
 */
public class CreatePatterns<E> {

  /** Number of sentences accumulated before they are flushed to a non-memory pattern store. */
  private static final int SENTENCES_PER_COMMIT = 1000;

  ConstantsAndVariables constVars;

  /**
   * Stores {@code constVars}, fills options from {@code props} into {@code ConstantsAndVariables}
   * and into this instance, and runs {@code constVars.setUp(props)}.
   *
   * @param props properties from which options are filled
   * @param constVars shared settings used by this instance and its worker tasks
   * @throws IOException declared by the original signature (presumably raised by
   *     {@code constVars.setUp})
   */
  public CreatePatterns(Properties props, ConstantsAndVariables constVars)
      throws IOException {
    this.constVars = constVars;
    ArgumentParser.fillOptions(ConstantsAndVariables.class, props);
    constVars.setUp(props);
    setUp(props);
  }

  /**
   * Fills the option fields of this instance from {@code props}.
   *
   * @param props properties from which options are filled
   */
  void setUp(Properties props) {
    ArgumentParser.fillOptions(this, props);
  }

  /**
   * Creates all patterns and saves them in the appropriate {@code PatternsForEachToken*} store.
   *
   * <p>The sentence ids are split into {@code constVars.numThreads} contiguous slices of
   * {@code size / numThreads} ids each; the last slice additionally receives the remainder. One
   * {@link CreatePatternsThread} per slice is submitted to a fixed-size thread pool and this method
   * blocks until every task has finished. The pool is always shut down before returning, including
   * when creating or submitting a task fails.
   *
   * @param sents sentences keyed by sentence id
   * @param props properties passed on to {@code PatternsForEachToken.getPatternsInstance}
   * @param storePatsForEachTokenWay which kind of pattern store the worker tasks should obtain
   * @throws RuntimeException wrapping whatever a worker task threw, or the
   *     {@link InterruptedException} raised while waiting (the interrupt flag is restored first)
   */
  public void getAllPatterns(Map<String, DataInstance> sents, Properties props, ConstantsAndVariables.PatternForEachTokenWay storePatsForEachTokenWay) {
    Date startDate = new Date();
    List<String> keyset = new ArrayList<>(sents.keySet());

    int numThreads = constVars.numThreads;
    // Integer division: with a single thread this is simply the whole key set.
    int num = keyset.size() / numThreads;

    ExecutorService executor = Executors.newFixedThreadPool(numThreads);
    boolean completed = false;
    try {
      Redwood.log(ConstantsAndVariables.extremedebug, "Computing all patterns. keyset size is " + keyset.size() + ". Assigning " + num + " values to each thread");

      List<Future<Boolean>> futures = new ArrayList<>();
      for (int i = 0; i < numThreads; i++) {
        int from = i * num;
        // The last worker also takes whatever is left over after the integer division.
        int to = (i == numThreads - 1)
            ? keyset.size()
            : Math.min(keyset.size(), (i + 1) * num);

        List<String> ids = keyset.subList(from, to);
        Callable<Boolean> task = new CreatePatternsThread(sents, ids, props, storePatsForEachTokenWay);
        futures.add(executor.submit(task));
      }

      // Wait for every worker; the Boolean results carry no information beyond completion.
      for (Future<Boolean> future : futures) {
        try {
          future.get();
        } catch (InterruptedException e) {
          // Preserve the interrupt for callers further up the stack.
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }
      completed = true;
    } finally {
      // On any failure cancel outstanding work; otherwise release the idle pool threads.
      if (completed) {
        executor.shutdown();
      } else {
        executor.shutdownNow();
      }
    }

    Date endDate = new Date();

    String timeTaken = GetPatternsFromDataMultiClass.elapsedTime(startDate, endDate);
    Redwood.log(Redwood.DBG, "Done computing all patterns ["+timeTaken+"]");
  }

  /**
   * Worker task that computes the patterns for one slice of sentence ids and adds them to its own
   * {@link PatternsForEachToken} instance.
   */
  public class CreatePatternsThread
      implements
      Callable<Boolean> {

    Map<String, DataInstance> sents;
    List<String> sentIds;
    // Kept raw: the type parameters of PatternsForEachToken are not visible from this file.
    PatternsForEachToken patsForEach;

    /**
     * @param sents all sentences, keyed by sentence id
     * @param sentIds the ids this task is responsible for
     * @param props properties used to obtain the pattern store
     * @param storePatsForEachToken which kind of pattern store to obtain
     */
    public CreatePatternsThread(Map<String, DataInstance> sents, List<String> sentIds, Properties props, ConstantsAndVariables.PatternForEachTokenWay storePatsForEachToken) {
      this.sents = sents;
      this.sentIds = sentIds;
      this.patsForEach = PatternsForEachToken.getPatternsInstance(props, storePatsForEachToken);
    }

    /**
     * Computes the patterns for every assigned sentence. With the {@code MEMORY} storage way each
     * sentence is added to the store immediately; otherwise sentences are accumulated and added in
     * batches of {@value #SENTENCES_PER_COMMIT} to reduce the number of commits to the database.
     *
     * @return always {@code true}
     * @throws Exception whatever pattern creation or the store raises
     */
    @Override
    @SuppressWarnings({"unchecked", "rawtypes"}) // raw PatternsForEachToken and raw cast of the factory result
    public Boolean call() throws Exception {
      boolean inMemory = constVars.storePatsForEachToken.equals(ConstantsAndVariables.PatternForEachTokenWay.MEMORY);

      Map<String, Map<Integer, Set<E>>> pendingPatterns = new HashMap<>();
      int numSentencesInOneCommit = 0;

      for (String id : sentIds) {
        DataInstance sent = sents.get(id);
        Map<Integer, Set<E>> p = (Map) PatternFactory.getPatternsAroundTokens(constVars.patternType, sent, constVars.getStopWords());

        if (inMemory) {
          patsForEach.addPatterns(id, p);
          continue;
        }

        // Batch the additions to save on the number of commits to the database.
        pendingPatterns.put(id, p);
        numSentencesInOneCommit++;
        if (numSentencesInOneCommit == SENTENCES_PER_COMMIT) {
          patsForEach.addPatterns(pendingPatterns);
          pendingPatterns.clear();
          numSentencesInOneCommit = 0;
        }
      }

      // Flush the sentences remaining from the last, incomplete batch.
      if (!inMemory) {
        patsForEach.addPatterns(pendingPatterns);
      }

      return true;
    }

  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy