> wordsForOtherClass = null;
// String channelNameLogger = "patterns";
/**
*
* RlogF is from Riloff 1996, when R's denominator is (pos+neg+unlabeled)
*
* RlogFPosNeg is when the R's denominator is just (pos+negative) examples
*
* PosNegOdds is just the ratio of the number of positive words to the number
* of negative words
*
* PosNegUnlabOdds is just the ratio of the number of positive words to the
* number of non-positive words (unlabeled words + negative words)
*
* RatioAll is pos/(neg+pos+unlabeled)
*
* YanGarber02 is the modified version presented in
* "Unsupervised Learning of Generalized Names"
*
* LOGREG is learning a logistic regression classifier to combine weights to
* score a phrase (Same as PhEvalInPat, except score of an unlabeled phrase is
* computed using a logistic regression classifier)
*
* LOGREGlogP is learning a logistic regression classifier to combine weights
* to score a phrase (Same as PhEvalInPatLogP, except score of an unlabeled
* phrase is computed using a logistic regression classifier)
*
* SqrtAllRatio is the pattern scoring used in Gupta et al. JAMIA 2014 paper
*
* F1SeedPattern and BPB are based on the paper
* "Unsupervised Method for Automatic Construction of a Disease Dictionary..."
*
* Precision, Recall, and FMeasure (controlled by the fbeta flag) rank the patterns
* by their precision, recall and F_beta measure, respectively
*/
public enum PatternScoring {
  // NOTE: declaration order is kept exactly as before so ordinal() values do not shift.
  F1SeedPattern,
  /** Riloff 1996; R's denominator is (pos + neg + unlabeled). */
  RlogF,
  /** R's denominator is just the (pos + neg) examples. */
  RlogFPosNeg,
  RlogFUnlabNeg,
  RlogFNeg,
  PhEvalInPat,
  PhEvalInPatLogP,
  /** Ratio of the number of positive words to the number of negative words. */
  PosNegOdds,
  /** Modified version presented in "Unsupervised Learning of Generalized Names". */
  YanGarber02,
  /** Ratio of the number of positive words to (unlabeled + negative) words. */
  PosNegUnlabOdds,
  /** pos / (neg + pos + unlabeled). */
  RatioAll,
  /** Same as PhEvalInPat, except unlabeled phrases are scored by a logistic regression classifier. */
  LOGREG,
  /** Same as PhEvalInPatLogP, except unlabeled phrases are scored by a logistic regression classifier. */
  LOGREGlogP,
  /** Pattern scoring used in the Gupta et al. JAMIA 2014 paper. */
  SqrtAllRatio,
  LinICML03,
  kNN
}
// Word scoring schemes (package-private). Declaration order is kept so ordinal() values do not shift.
enum WordScoring {
  BPB,
  WEIGHTEDNORM
}
// NOTE(review): purpose inferred from the name only (bookkeeping for justification output) — confirm against its users.
private Map writtenPatInJustification = new HashMap<>();
// Patterns learned so far. Presumably keyed by label — confirm; element types are not visible in this section.
private Map> learnedPatterns = new HashMap<>();
// Same as learnedPatterns but with iteration information
private Map>> learnedPatternsEachIter = new HashMap<>();
// NOTE(review): presumably the seed words that were matched, per label — confirm against where it is filled.
Map> matchedSeedWords = new HashMap<>();
// Re-created in setUpConstructor, which stores one TwoDimensionalCounter per answer-class label.
public Map> wordsPatExtracted = new HashMap<>();
// Configuration; assigned by each non-delegating constructor before setUpConstructor reads it.
Properties props;
// The three collaborators below are instantiated in setUpConstructor (constVars first, then scorePhrases and createPats).
public ScorePhrases scorePhrases;
public ConstantsAndVariables constVars;
public CreatePatterns createPats;
// Number formatter using the pattern "#.##" (at most two fraction digits).
private final DecimalFormat df = new DecimalFormat("#.##");
// Starts out true; NOTE(review): presumably cleared once all patterns have been computed — confirm where it is reset.
private boolean notComputedAllPatternsYet = true;
/**
 * Convenience constructor for the case when there is only one label.
 * Delegates to the overload that takes an explicit answer class, supplying
 * {@code PatternsAnnotations.PatternLabel1.class} for it.
 *
 * @param props configuration, forwarded unchanged
 * @param sents the sentences, forwarded unchanged
 * @param seedSet seed entries for the single label
 * @param labelUsingSeedSets forwarded unchanged
 * @param answerLabel name of the single label
 */
public GetPatternsFromDataMultiClass(Properties props, Map sents, Set seedSet, boolean labelUsingSeedSets,
String answerLabel) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException,
NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
this(props, sents, seedSet, labelUsingSeedSets, PatternsAnnotations.PatternLabel1.class, answerLabel);
}
/**
 * Single-label constructor with an explicit answer class and no generalize or
 * ignore classes. Wraps the single label's arguments into maps keyed by
 * {@code answerLabel} and hands them to {@code setUpConstructor}.
 *
 * @param props configuration; stored in {@code this.props}
 * @param sents the sentences, forwarded unchanged
 * @param seedSet seed entries for the single label
 * @param labelUsingSeedSets forwarded unchanged
 * @param answerClass annotation class registered for {@code answerLabel}
 * @param answerLabel name of the single label
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map sents, Set seedSet, boolean labelUsingSeedSets,
Class answerClass, String answerLabel) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
this.props = props;
// label -> answer class
Map>> ansCl = new HashMap<>();
ansCl.put(answerLabel, answerClass);
// No generalize classes in this overload.
Map generalizeClasses = new HashMap<>();
// The label still gets an (empty) ignore-class entry.
Map> ignoreClasses = new HashMap<>();
ignoreClasses.put(answerLabel, new HashMap<>());
// label -> seed set
Map> seedSets = new HashMap<>();
seedSets.put(answerLabel, seedSet);
// NOTE(review): this body builds the same arguments as the 8-argument overload would when given
// two empty maps; it could delegate to that overload instead — confirm before changing.
setUpConstructor(sents, seedSets, labelUsingSeedSets, ansCl, generalizeClasses, ignoreClasses);
}
/**
 * Single-label constructor with generalize and ignore classes but no explicit
 * answer class. Delegates to the 8-argument overload, supplying
 * {@code PatternsAnnotations.PatternLabel1.class} as the answer class.
 *
 * @param props configuration, forwarded unchanged
 * @param sents the sentences, forwarded unchanged
 * @param seedSet seed entries for the single label
 * @param labelUsingSeedSets forwarded unchanged
 * @param answerLabel name of the single label
 * @param generalizeClasses forwarded unchanged
 * @param ignoreClasses ignore classes for {@code answerLabel}, forwarded unchanged
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map sents, Set seedSet, boolean labelUsingSeedSets,
String answerLabel, Map generalizeClasses, Map ignoreClasses) throws IOException, InstantiationException,
IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
ExecutionException, ClassNotFoundException {
this(props, sents, seedSet, labelUsingSeedSets, PatternsAnnotations.PatternLabel1.class, answerLabel, generalizeClasses, ignoreClasses);
}
/**
 * Single-label constructor with an explicit answer class plus generalize and
 * ignore classes. Wraps the per-label arguments into maps keyed by
 * {@code answerLabel} and hands them to {@code setUpConstructor}.
 *
 * @param props configuration; stored in {@code this.props}
 * @param sents the sentences, forwarded unchanged
 * @param seedSet seed entries for the single label
 * @param labelUsingSeedSets forwarded unchanged
 * @param answerClass annotation class registered for {@code answerLabel}
 * @param answerLabel name of the single label
 * @param generalizeClasses forwarded unchanged (not re-keyed by label)
 * @param ignoreClasses ignore classes; registered under {@code answerLabel}
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map sents, Set seedSet, boolean labelUsingSeedSets,
Class answerClass, String answerLabel, Map generalizeClasses, Map ignoreClasses) throws IOException,
InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException,
InterruptedException, ExecutionException, ClassNotFoundException {
this.props = props;
// label -> answer class
Map>> ansCl = new HashMap<>();
ansCl.put(answerLabel, answerClass);
// label -> ignore classes (the caller's map is stored as is, not copied)
Map> iC = new HashMap<>();
iC.put(answerLabel, ignoreClasses);
// label -> seed set
Map> seedSets = new HashMap<>();
seedSets.put(answerLabel, seedSet);
setUpConstructor(sents, seedSets, labelUsingSeedSets, ansCl, generalizeClasses, iC);
}
/**
 * Multi-label constructor that assigns answer classes automatically. The i-th
 * label encountered while iterating {@code seedSets.keySet()} (counting from 1)
 * is mapped to the class named
 * {@code edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel<i>}, which
 * is loaded reflectively. Each label also receives an empty ignore-class map;
 * no generalize classes are used.
 *
 * @param props configuration; stored in {@code this.props}
 * @param sents the sentences, forwarded unchanged
 * @param seedSets seed entries keyed by label
 * @param labelUsingSeedSets forwarded unchanged
 * @throws ClassNotFoundException if no {@code PatternLabel<i>} class exists for some label index
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map sents, Map> seedSets,
boolean labelUsingSeedSets) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
InvocationTargetException, NoSuchMethodException, SecurityException, ClassNotFoundException, InterruptedException, ExecutionException {
this.props = props;
Map>> ansCl = new HashMap<>();
Map gC = new HashMap<>();
Map> iC = new HashMap<>();
// 1-based suffix of the PatternLabel class assigned to the next label.
int i = 1;
// Which label gets which PatternLabel class depends on the key iteration order of seedSets.
for (String label : seedSets.keySet()) {
String ansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel" + i;
ansCl.put(label, (Class>) Class.forName(ansclstr));
iC.put(label, new HashMap<>());
i++;
}
setUpConstructor(sents, seedSets, labelUsingSeedSets, ansCl, gC, iC);
}
/**
 * Multi-label constructor with explicit answer classes and no generalize or
 * ignore classes. Delegates to the 7-argument overload, passing two fresh
 * empty maps for {@code generalizeClasses} and {@code ignoreClasses}.
 *
 * @param props configuration, forwarded unchanged
 * @param sents the sentences, forwarded unchanged
 * @param seedSets seed entries keyed by label
 * @param labelUsingSeedSets forwarded unchanged
 * @param answerClass answer classes keyed by label
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map sents, Map> seedSets,
boolean labelUsingSeedSets, Map>> answerClass) throws IOException, InstantiationException,
IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException,
ExecutionException, ClassNotFoundException {
this(props, sents, seedSets, labelUsingSeedSets, answerClass, new HashMap<>(), new HashMap<>());
}
/**
 * Fully specified multi-label constructor: stores {@code props}, makes sure
 * {@code ignoreClasses} has an entry per label, and calls
 * {@code setUpConstructor} with the remaining arguments unchanged.
 *
 * <p>{@code generalizeClasses} maps generalized strings to their corresponding
 * classes. The values associated with the ignore classes have to be boolean.
 * (NOTE(review): wording reconstructed from the original, partly garbled,
 * comment — confirm against how setUpConstructor consumes these maps.)
 *
 * <p>If {@code ignoreClasses} is empty, it is filled <em>in place</em> with an
 * empty map for every key of {@code seedSets}; the caller's map is therefore
 * modified. A non-empty {@code ignoreClasses} is passed through untouched, even
 * if it lacks entries for some labels.
 *
 * @param props configuration; stored in {@code this.props}
 * @param sents the sentences, forwarded unchanged
 * @param seedSets seed entries keyed by label
 * @param labelUsingSeedSets forwarded unchanged
 * @param answerClass answer classes keyed by label
 * @param generalizeClasses forwarded unchanged
 * @param ignoreClasses ignore classes keyed by label; must not be null, may be empty
 * @throws IOException
 * @throws SecurityException
 * @throws NoSuchMethodException
 * @throws InvocationTargetException
 * @throws IllegalArgumentException
 * @throws IllegalAccessException
 * @throws InstantiationException
 * @throws ExecutionException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
@SuppressWarnings("rawtypes")
public GetPatternsFromDataMultiClass(Properties props, Map sents, Map> seedSets,
boolean labelUsingSeedSets, Map>> answerClass, Map generalizeClasses,
Map> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
this.props = props;
// Only an entirely empty map is populated; this mutates the map the caller passed in.
if (ignoreClasses.isEmpty()) {
for (String label : seedSets.keySet())
ignoreClasses.put(label, new HashMap<>());
}
setUpConstructor(sents, seedSets, labelUsingSeedSets, answerClass, generalizeClasses, ignoreClasses);
}
@SuppressWarnings("rawtypes")
private void setUpConstructor(Map sents, Map> seedSets, boolean labelUsingSeedSets,
Map>> answerClass, Map generalizeClasses,
Map> ignoreClasses) throws IOException, InstantiationException, IllegalAccessException, IllegalArgumentException,
InvocationTargetException, NoSuchMethodException, SecurityException, InterruptedException, ExecutionException, ClassNotFoundException {
Data.sents = sents;
ArgumentParser.fillOptions(Data.class, props);
ArgumentParser.fillOptions(ConstantsAndVariables.class, props);
PatternFactory.setUp(props, PatternFactory.PatternType.valueOf(props.getProperty(Flags.patternType)), seedSets.keySet());
constVars = new ConstantsAndVariables(props, seedSets, answerClass, generalizeClasses, ignoreClasses);
if (constVars.writeMatchedTokensFiles && constVars.batchProcessSents) {
throw new RuntimeException(
"writeMatchedTokensFiles and batchProcessSents cannot be true at the same time (not implemented; also doesn't make sense to save a large sentences json file)");
}
if (constVars.debug < 1) {
Redwood.hideChannelsEverywhere(ConstantsAndVariables.minimaldebug);
}
if (constVars.debug < 2) {
Redwood.hideChannelsEverywhere(Redwood.DBG);
}
constVars.justify = true;
if (constVars.debug < 3) {
constVars.justify = false;
}
if (constVars.debug < 4) {
Redwood.hideChannelsEverywhere(ConstantsAndVariables.extremedebug);
}
Redwood.log(Redwood.DBG, "Running with debug output");
Redwood.log(ConstantsAndVariables.extremedebug, "Running with extreme debug output");
wordsPatExtracted = new HashMap<>();
for (String label : answerClass.keySet()) {
wordsPatExtracted.put(label, new TwoDimensionalCounter<>());
}
scorePhrases = new ScorePhrases(props, constVars);
createPats = new CreatePatterns(props, constVars);
assert !(constVars.doNotApplyPatterns && (PatternFactory.useStopWordsBeforeTerm || PatternFactory.numWordsCompoundMax > 1)) : " Cannot have both doNotApplyPatterns and (useStopWordsBeforeTerm true or numWordsCompound > 1)!";
if(constVars.invertedIndexDirectory == null){
File f = File.createTempFile("inv","index");
f.deleteOnExit();
f.mkdir();
constVars.invertedIndexDirectory = f.getAbsolutePath();
}
Set extremelySmallStopWordsList = CollectionUtils.asSet(".", ",", "in", "on", "of", "a", "the", "an");
//Function to use to how to add CoreLabels to index
Function> transformCoreLabelToString = l -> {
Map add = new HashMap<>();
for (Class gn: constVars.getGeneralizeClasses().values()) {
Object b = l.get(gn);
if (b != null && !b.toString().equals(constVars.backgroundSymbol)) {
add.put(Token.getKeyForClass(gn),b.toString());
}
}
return add;
};
boolean createIndex = false;
if (constVars.loadInvertedIndex)
constVars.invertedIndex = SentenceIndex.loadIndex(constVars.invertedIndexClass, props, extremelySmallStopWordsList, constVars.invertedIndexDirectory, transformCoreLabelToString);
else {
constVars.invertedIndex = SentenceIndex.createIndex(constVars.invertedIndexClass, null, props, extremelySmallStopWordsList, constVars.invertedIndexDirectory, transformCoreLabelToString);
createIndex = true;
}
int totalNumSents = 0;
boolean computeDataFreq = false;
if (Data.rawFreq == null) {
Data.rawFreq = new ClassicCounter<>();
computeDataFreq = true;
}
ConstantsAndVariables.DataSentsIterator iter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
while(iter.hasNext()){
Pair