edu.stanford.nlp.patterns.surface.CreatePatterns Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.patterns.surface;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import edu.stanford.nlp.patterns.*;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;
/**
 * Computes, in parallel, the surface patterns around every token of every
 * sentence and stores them via the configured {@link PatternsForEachToken}
 * backend (in-memory, DB, or Lucene, per {@code storePatsForEachTokenWay}).
 *
 * NOTE(review): the generic type arguments in this file had been stripped by
 * HTML extraction (e.g. {@code List> list}); they are restored here to match
 * the upstream CoreNLP signatures — confirm against the original source.
 *
 * @param <E> the pattern type produced by {@link PatternFactory}
 */
public class CreatePatterns<E extends Pattern> {

  ConstantsAndVariables constVars;

  /**
   * Fills options for {@link ConstantsAndVariables} and for this instance from
   * {@code props}, then runs both setups.
   *
   * @throws IOException if {@code constVars.setUp} fails reading resources
   */
  public CreatePatterns(Properties props, ConstantsAndVariables constVars)
      throws IOException {
    this.constVars = constVars;
    ArgumentParser.fillOptions(ConstantsAndVariables.class, props);
    constVars.setUp(props);
    setUp(props);
  }

  /** Binds any matching properties onto this object's annotated fields. */
  void setUp(Properties props) {
    ArgumentParser.fillOptions(this, props);
  }

  /**
   * Creates all patterns and saves them in the correct PatternsForEachToken*
   * class, splitting the sentence ids evenly across {@code constVars.numThreads}
   * worker threads.
   *
   * @param sents sentence id to {@link DataInstance} map to extract patterns from
   * @param props properties used to instantiate the per-thread pattern store
   * @param storePatsForEachTokenWay backing store kind (MEMORY, DB, LUCENE)
   * @throws RuntimeException wrapping the first failure of any worker thread
   */
  public void getAllPatterns(Map<String, DataInstance> sents, Properties props,
      ConstantsAndVariables.PatternForEachTokenWay storePatsForEachTokenWay) {

    Date startDate = new Date();

    List<String> keyset = new ArrayList<>(sents.keySet());

    // Number of sentence ids assigned to each thread; the last thread also
    // picks up the remainder of the integer division (see "to" below).
    int num;
    if (constVars.numThreads == 1)
      num = keyset.size();
    else
      num = keyset.size() / (constVars.numThreads);

    ExecutorService executor = Executors.newFixedThreadPool(constVars.numThreads);
    Redwood.log(ConstantsAndVariables.extremedebug, "Computing all patterns. keyset size is " + keyset.size() + ". Assigning " + num + " values to each thread");

    List<Future<Boolean>> list = new ArrayList<>();
    for (int i = 0; i < constVars.numThreads; i++) {
      int from = i * num;
      int to;
      if (i == constVars.numThreads - 1)
        to = keyset.size(); // last thread takes everything left over
      else
        to = Math.min(keyset.size(), (i + 1) * num);
      List<String> ids = keyset.subList(from, to);
      Callable<Boolean> task = new CreatePatternsThread(sents, ids, props, storePatsForEachTokenWay);
      list.add(executor.submit(task));
    }

    // Block until every worker finishes; on the first failure, cancel the
    // rest and propagate the cause.
    for (Future<Boolean> future : list) {
      try {
        future.get();
      } catch (Exception e) {
        executor.shutdownNow();
        throw new RuntimeException(e);
      }
    }
    executor.shutdown();

    Date endDate = new Date();
    String timeTaken = GetPatternsFromDataMultiClass.elapsedTime(startDate, endDate);
    Redwood.log(Redwood.DBG, "Done computing all patterns [" + timeTaken + "]");
  }

  /**
   * Worker that extracts patterns for a slice of the sentence ids and writes
   * them to its own {@link PatternsForEachToken} store instance.
   */
  public class CreatePatternsThread implements Callable<Boolean> {

    Map<String, DataInstance> sents;
    List<String> sentIds;
    PatternsForEachToken<E> patsForEach;

    public CreatePatternsThread(Map<String, DataInstance> sents, List<String> sentIds,
        Properties props, ConstantsAndVariables.PatternForEachTokenWay storePatsForEachToken) {
      this.sents = sents;
      this.sentIds = sentIds;
      this.patsForEach = PatternsForEachToken.getPatternsInstance(props, storePatsForEachToken);
    }

    /**
     * Extracts patterns for each assigned sentence. For non-MEMORY stores the
     * results are buffered and flushed every 1000 sentences to limit the
     * number of commits to the backing database.
     */
    @Override
    public Boolean call() throws Exception {
      Map<String, Map<Integer, Set<E>>> tempPatternsForTokens = new HashMap<>();
      int numSentencesInOneCommit = 0;

      for (String id : sentIds) {
        DataInstance sent = sents.get(id);

        @SuppressWarnings("unchecked") // factory returns the pattern type selected by constVars.patternType
        Map<Integer, Set<E>> p = (Map<Integer, Set<E>>) (Map) PatternFactory.getPatternsAroundTokens(constVars.patternType, sent, constVars.getStopWords());

        if (!constVars.storePatsForEachToken.equals(ConstantsAndVariables.PatternForEachTokenWay.MEMORY)) {
          // Buffer and flush in batches to save the number of commits to the database.
          tempPatternsForTokens.put(id, p);
          numSentencesInOneCommit++;
          if (numSentencesInOneCommit % 1000 == 0) {
            patsForEach.addPatterns(tempPatternsForTokens);
            tempPatternsForTokens.clear();
            numSentencesInOneCommit = 0;
          }
        } else {
          patsForEach.addPatterns(id, p);
        }
      }

      // Flush the remaining buffered sentences.
      if (!constVars.storePatsForEachToken.equals(ConstantsAndVariables.PatternForEachTokenWay.MEMORY))
        patsForEach.addPatterns(tempPatternsForTokens);

      return true;
    }
  }
}