edu.stanford.nlp.patterns.ConstantsAndVariables Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.patterns;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.NodePattern;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.PatternScoring;
import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.WordScoring;
import edu.stanford.nlp.patterns.dep.DepPatternFactory;
import edu.stanford.nlp.patterns.surface.SurfacePatternFactory;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.ArgumentParser.Option;
import edu.stanford.nlp.util.TypesafeMap.Key;
import edu.stanford.nlp.util.logging.Redwood;
import javax.json.Json;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObjectBuilder;
public class ConstantsAndVariables implements Serializable {
private static final long serialVersionUID = 1L;
/**
* Maximum number of iterations to run
*/
@Option(name = "numIterationsForPatterns")
public Integer numIterationsForPatterns = 10;
/**
* Maximum number of patterns learned in each iteration
*/
@Option(name = "numPatterns")
public int numPatterns = 10;
/**
* The output directory where the justifications of learning patterns and
* phrases would be saved. These are needed for visualization
*/
@Option(name = "outDir")
public String outDir = null;
/**
* Cached file of all patterns for all tokens
*/
@Option(name = "allPatternsDir")
public String allPatternsDir = null;
/**
* If all patterns should be computed. Otherwise patterns are read from
* allPatternsFile
*/
@Option(name = "computeAllPatterns")
public boolean computeAllPatterns = true;
// @Option(name = "removeRedundantPatterns")
// public boolean removeRedundantPatterns = true;
/**
* Pattern Scoring mechanism. See {@link PatternScoring} for options.
*/
@Option(name = "patternScoring")
public PatternScoring patternScoring = PatternScoring.PosNegUnlabOdds;
/**
* Threshold for learning a pattern
*/
@Option(name = "thresholdSelectPattern")
public double thresholdSelectPattern = 1.0;
// /**
// * Do not learn patterns that do not extract any unlabeled tokens (kind of
// * useless)
// */
// @Option(name = "discardPatternsWithNoUnlabSupport")
// public boolean discardPatternsWithNoUnlabSupport = true;
/**
* Currently, does not work correctly. TODO: make this work. Ideally this
* would label words only when they occur in the context of any learned
* pattern. This comment seems old. Test it!
*/
@Option(name = "restrictToMatched")
public boolean restrictToMatched = false;
/**
* Label words that are learned so that in further iterations we have more
* information
*/
@Option(name = "usePatternResultAsLabel")
public boolean usePatternResultAsLabel = true;
/**
* Debug flag for learning patterns. 0 means no output, 1 means necessary output, 2 means necessary output+some justification, 3 means extreme debug output
*/
@Option(name = "debug")
public int debug = 1;
/**
* Do not learn patterns in which the neighboring words have the same label.
* Deprecated!
*/
//@Option(name = "ignorePatWithLabeledNeigh")
//public boolean ignorePatWithLabeledNeigh = false;
/**
* Save this run as ...
*/
@Option(name = "identifier")
public String identifier = "getpatterns";
/**
* Use the actual dictionary matching phrase(s) instead of the token word or
* lemma in calculating the stats
*/
@Option(name = "useMatchingPhrase")
public boolean useMatchingPhrase = true;
/**
* Reduce pattern threshold (=0.8*current_value) to extract as many patterns
* as possible (still restricted by numPatterns
)
*/
@Option(name = "tuneThresholdKeepRunning")
public boolean tuneThresholdKeepRunning = false;
/**
* Maximum number of words to learn
*/
@Option(name = "maxExtractNumWords")
public int maxExtractNumWords = Integer.MAX_VALUE;
/**
* use the seed dictionaries and the new words learned for the other labels in
* the previous iterations as negative
*/
@Option(name = "useOtherLabelsWordsasNegative")
public boolean useOtherLabelsWordsasNegative = true;
/**
* If not null, write the output like
* "w1 w2 w3 w4 w5 ... " if w3 w4 have
* label1 and w4 has label 2
*/
@Option(name = "markedOutputTextFile")
String markedOutputTextFile = null;
/**
* If you want output of form "word\tlabels-separated-by-comma" in newlines
*/
@Option(name="columnOutputFile")
String columnOutputFile = null;
/**
* Lowercase the context words/lemmas
*/
@Option(name = "matchLowerCaseContext")
public static boolean matchLowerCaseContext = true;
/**
* Initials of all POS tags to use if
* usePOS4Pattern
is true, separated by comma.
*/
@Option(name = "targetAllowedTagsInitialsStr")
public String targetAllowedTagsInitialsStr = null;
public Map> allowedTagsInitials = null;
/**
* Allowed NERs for labels. Format is label1,NER1,NER11;label2,NER2,NER21,NER22;label3,...
* useTargetNERRestriction
flag should be true
*/
@Option(name = "targetAllowedNERs")
public String targetAllowedNERs = null;
public Map> allowedNERsforLabels = null;
/**
* Number of words to learn in each iteration
*/
@Option(name = "numWordsToAdd")
public int numWordsToAdd = 10;
@Option(name = "thresholdNumPatternsApplied")
public double thresholdNumPatternsApplied = 2;
@Option(name = "wordScoring")
public WordScoring wordScoring = WordScoring.WEIGHTEDNORM;
@Option(name = "thresholdWordExtract")
public double thresholdWordExtract = 0.2;
public boolean justify = false;
/**
* Sigma for L2 regularization in Logisitic regression, if a classifier is
* used to score phrases
*/
@Option(name = "LRSigma")
public double LRSigma = 1.0;
/**
* English words that are not labeled when labeling using seed dictionaries
*/
@Option(name = "englishWordsFiles")
public String englishWordsFiles = null;
private Set englishWords = new HashSet<>();
/**
* Words to be ignored when learning phrases if
* removePhrasesWithStopWords
or
* removeStopWordsFromSelectedPhrases
is true. Also, these words
* are considered negative when scoring a pattern (similar to
* othersemanticclasses).
*/
@Option(name = "commonWordsPatternFiles")
public String commonWordsPatternFiles = null;
private Set commonEngWords = null;
/**
* List of dictionary phrases that are negative for all labels to be learned.
* Format is file_1,file_2,... where file_i has each phrase in a different
* line
*
*/
@Option(name = "otherSemanticClassesFiles")
public String otherSemanticClassesFiles = null;
// set of words that are considered negative for all classes
private Set otherSemanticClassesWords = null;
/**
* Seed dictionary, set in the class that uses this class
*/
private Map> seedLabelDictionary = new HashMap<>();
/**
* Just the set of labels
*/
private Set labels = new HashSet<>();
private Map>> answerClass = null;
/**
* Can be used only when using the API - using the appropriate constructor.
* Tokens with specified classes set (has to be boolean return value, even
* though this variable says object) will be ignored.
*/
@SuppressWarnings("rawtypes")
private Map> ignoreWordswithClassesDuringSelection = null;
/**
* These classes will be generalized. It can only be used via the API using
* the appropriate constructor. All label classes are by default generalized.
*/
@SuppressWarnings("rawtypes")
private static Map generalizeClasses = new HashMap<>();
/**
* Minimum length of words that can be matched fuzzily
*/
@Option(name = "minLen4FuzzyForPattern")
public int minLen4FuzzyForPattern = 6;
/**
* Do not learn phrases that match this regex.
*/
@Option(name = "wordIgnoreRegex")
public String wordIgnoreRegex = "[^a-zA-Z]*";
/**
* Number of threads
*/
@Option(name = "numThreads")
public int numThreads = 1;
/**
* Words that are not learned. Patterns are not created around these words.
* And, if useStopWordsBeforeTerm in {@link edu.stanford.nlp.patterns.surface.CreatePatterns} is true.
*/
@Option(name = "stopWordsPatternFiles", gloss = "stop words")
public String stopWordsPatternFiles = null;
private static Set stopWords = null;
/**
* Environment for {@link TokenSequencePattern}
*/
public Map env = new HashMap<>();
public static Env globalEnv = TokenSequencePattern.getNewEnv();
/**
*
*/
@Option(name = "removeStopWordsFromSelectedPhrases")
public boolean removeStopWordsFromSelectedPhrases = false;
/**
*
*/
@Option(name = "removePhrasesWithStopWords")
public boolean removePhrasesWithStopWords = false;
private boolean alreadySetUp = false;
/**
* Cluster file, in which each line is word/phraseclusterid
*/
@Option(name = "wordClassClusterFile")
String wordClassClusterFile = null;
private Map wordClassClusters = new HashMap<>();
/**
* General cluster file, if you wanna use it somehow, in which each line is
* word/phraseclusterid
*/
@Option(name = "generalWordClassClusterFile")
String generalWordClassClusterFile = null;
private Map generalWordClassClusters = null;
// @Option(name = "includeExternalFeatures")
// public boolean includeExternalFeatures = false;
@Option(name = "externalFeatureWeightsFile")
public String externalFeatureWeightsDir = null;
@Option(name = "doNotApplyPatterns")
public boolean doNotApplyPatterns = false;
/**
* If score for a pattern is square rooted
*/
@Option(name = "sqrtPatScore")
public boolean sqrtPatScore = false;
/**
* Remove patterns that have number of unlabeled words is less than this.
*/
@Option(name = "minUnlabPhraseSupportForPat")
public int minUnlabPhraseSupportForPat = 0;
/**
* Remove patterns that have number of positive words less than this.
*/
@Option(name = "minPosPhraseSupportForPat")
public int minPosPhraseSupportForPat = 1;
/**
* For example, if positive seed dict contains "cancer" and "breast cancer" then "breast" is included as negative
*/
@Option(name="addIndvWordsFromPhrasesExceptLastAsNeg")
public boolean addIndvWordsFromPhrasesExceptLastAsNeg = false;
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromEnglishWords = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromEnglishWordsMatches = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromOtherSemanticClasses = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromOtherSemanticClassesMatches = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromThisClass = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromThisClassMatches = new ConcurrentHashMap<>();
private ConcurrentHashMap> wordShapesForLabels = new ConcurrentHashMap<>();
String channelNameLogger = "settingUp";
public Map> distSimWeights = new HashMap<>();
public Map> dictOddsWeights = new HashMap<>();
@Option(name="invertedIndexClass", gloss="another option is Lucene backed, which is not included in the CoreNLP release. Contact us to get a copy (distributed under Apache License).")
public Class invertedIndexClass = InvertedIndexByTokens.class;
/**
* Where the inverted index (either in memory or lucene) is stored
*/
@Option(name="invertedIndexDirectory")
public String invertedIndexDirectory;
@Option(name="clubNeighboringLabeledWords")
public boolean clubNeighboringLabeledWords = false;
@Option(name="patternType")
public PatternFactory.PatternType patternType = PatternFactory.PatternType.SURFACE;
@Option(name="subsampleUnkAsNegUsingSim", gloss="When learning a classifier, remove phrases from unknown phrases that are too close to the positive phrases")
public boolean subsampleUnkAsNegUsingSim = false;
// @Option(name="subSampleUnkAsNegUsingSimPercentage", gloss="When using subsampleUnkAsNegUsingSim, select bottom %")
// public double subSampleUnkAsNegUsingSimPercentage = 0.95;
@Option(name="expandPositivesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the positives")
public boolean expandPositivesWhenSampling = false;
@Option(name="expandNegativesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the negatives")
public boolean expandNegativesWhenSampling = false;
@Option(name="similarityThresholdHighPrecision", gloss="used for expanding positives")
public double similarityThresholdHighPrecision = 0.7;
@Option(name="positiveSimilarityThresholdLowPrecision", gloss="used for not choosing close unknowns as positives")
public double positiveSimilarityThresholdLowPrecision = 0.5;
// @Option(name="subSampleUnkAsPosUsingSimPercentage", gloss="When using expandPositivesWhenSampling, select top % after applying the threshold")
// public double subSampleUnkAsPosUsingSimPercentage = 0.05;
@Option(name="wordVectorFile", gloss = "if using word vectors for computing similarities")
public String wordVectorFile = null;
@Option(name="useWordVectorsToComputeSim", gloss="use vectors directly instead of word classes for computing similarity")
public boolean useWordVectorsToComputeSim;
@Option(name="logFileVectorSimilarity", gloss="To store vectors for selected/almost-selected positive and negative words")
String logFileVectorSimilarity = null;
@Option(name="goldEntitiesEvalFiles", gloss="label1,gold_list_of_entities_file;label2,...")
public String goldEntitiesEvalFiles = null;
@Option(name="evaluate")
public boolean evaluate = false;
Map> goldEntities = new HashMap<>();
@Option(name="featureCountThreshold")
public int featureCountThreshold = 1;
@Option(name="expandPhrasesNumTopSimilar", gloss="k in kNN")
public int expandPhrasesNumTopSimilar = 1;
/**
* Whether to do a fuzzy matching when matching seeds to text. You can tune minLen4FuzzyForPattern parameter.
*/
@Option(name="fuzzyMatch")
public boolean fuzzyMatch = false;
/**
* Ignore case when matching seed words. It's a map so something like {name->true,place->false}
*/
@Option(name="ignoreCaseSeedMatch")
public Map ignoreCaseSeedMatch = new HashMap<>();
@Option(name="sentsOutFile")
public String sentsOutFile = null;
@Option(name="savePatternsWordsDir")
public boolean savePatternsWordsDir = true;
@Option(name="learn")
public boolean learn = true;
/**
 * Returns the set of labels being learned.
 * Note: this is the live internal set, not a defensive copy.
 * (Generic type restored; it was stripped to a raw {@code Set} by the HTML scrape.)
 */
public Set<String> getLabels() {
  return labels;
}
// public void addLearnedWords(String trainLabel, Counter identifiedWords) {
// if(!learnedWords.containsKey(trainLabel))
// learnedWords.put(trainLabel, new ClassicCounter());
// this.learnedWords.get(trainLabel).addAll(identifiedWords);
// }
/**
 * Collects the current configuration as a map from option/field name to its
 * string value. Includes every entry of {@code props} (if set) plus every
 * declared field of this class whose type is primitive or listed in
 * {@code GetPatternsFromDataMultiClass.printOptionClass}.
 *
 * @return map of option name to stringified value ("null" for null values)
 */
public Map<String, String> getAllOptions() {
  Map<String, String> values = new HashMap<>();
  if (props != null)
    props.forEach((x, y) -> values.put(x.toString(), y == null ? "null" : y.toString()));
  try {
    // Use the runtime class directly; the original Class.forName(getClass().getName())
    // round-trip was redundant.
    Field[] aClassFields = this.getClass().getDeclaredFields();
    for (Field f : aClassFields) {
      // Bug fix: the original tested f.getType().getClass().isPrimitive(), which is
      // always false (the Class of a Class object is never primitive), so primitive
      // options were silently skipped. Test the field's own type instead.
      // NOTE(review): Arrays.binarySearch assumes printOptionClass is sorted — confirm.
      if (f.getType().isPrimitive()
          || Arrays.binarySearch(GetPatternsFromDataMultiClass.printOptionClass, f.getType()) >= 0) {
        String fName = f.getName();
        Object fvalue = f.get(this);
        values.put(fName, fvalue == null ? "null" : fvalue.toString());
      }
    }
  } catch (Exception e) {
    // Best-effort dump: reflection failures should not abort option reporting.
    e.printStackTrace();
  }
  return values;
}
public boolean hasSeedWordOrOtherSem(CandidatePhrase p) {
for(Map.Entry> seeds: this.seedLabelDictionary.entrySet()){
if(seeds.getValue().contains(p))
return true;
}
if(otherSemanticClassesWords.contains(p))
return true;
return false;
}
public TreeMap> getLearnedWordsEachIter(String label) {
return learnedWordsEachIter.get(label);
}
public Map>> getLearnedWordsEachIter() {
return learnedWordsEachIter;
}
public void setLearnedWordsEachIter(TreeMap> words, String label) {
this.learnedWordsEachIter.put(label, words);
}
//PatternFactory.PatternType.SURFACE;
// public PatternIndex getPatternIndex() {
// return patternIndex;
// }
//
// public void setPatternIndex(PatternIndex patternIndex) {
// this.patternIndex = patternIndex;
// }
static public class ScorePhraseMeasures implements Comparable {
String name;
static int num = 0;
int numObj;
static Map createdObjects = new ConcurrentHashMap<>();
public static ScorePhraseMeasures create(String n){
if(createdObjects.containsKey(n))
return createdObjects.get(n);
else
return new ScorePhraseMeasures(n);
}
private ScorePhraseMeasures(String n){
this.name= n;
numObj = num++;
createdObjects.put(n, this);
}
@Override
public String toString(){return name;}
@Override
public boolean equals(Object o){
if(! (o instanceof ScorePhraseMeasures)) return false;
return ((ScorePhraseMeasures)o).numObj == (this.numObj);
}
static final ScorePhraseMeasures DISTSIM = new ScorePhraseMeasures("DistSim");
static final ScorePhraseMeasures GOOGLENGRAM = new ScorePhraseMeasures("GoogleNGram");
static final ScorePhraseMeasures PATWTBYFREQ=new ScorePhraseMeasures("PatWtByFreq");
static final ScorePhraseMeasures EDITDISTSAME=new ScorePhraseMeasures("EditDistSame");
static final ScorePhraseMeasures EDITDISTOTHER =new ScorePhraseMeasures("EditDistOther");
static final ScorePhraseMeasures DOMAINNGRAM =new ScorePhraseMeasures("DomainNgram");
static final ScorePhraseMeasures SEMANTICODDS =new ScorePhraseMeasures("SemanticOdds");
static final ScorePhraseMeasures WORDSHAPE = new ScorePhraseMeasures("WordShape");
static final ScorePhraseMeasures WORDVECPOSSIMAVG = new ScorePhraseMeasures("WordVecPosSimAvg");
static final ScorePhraseMeasures WORDVECPOSSIMMAX = new ScorePhraseMeasures("WordVecPosSimMax");
static final ScorePhraseMeasures WORDVECNEGSIMAVG = new ScorePhraseMeasures("WordVecNegSimAvg");
static final ScorePhraseMeasures WORDVECNEGSIMMAX = new ScorePhraseMeasures("WordVecNegSimMax");
static final ScorePhraseMeasures ISFIRSTCAPITAL = new ScorePhraseMeasures("IsFirstLetterCapital");
static final ScorePhraseMeasures WORDSHAPESTR = new ScorePhraseMeasures("WordShapeStr");
static final ScorePhraseMeasures BOW = new ScorePhraseMeasures("Word");
@Override
public int compareTo(Object o) {
if(!(o instanceof ScorePhraseMeasures))
return -1;
else return o.toString().compareTo(this.toString());
}
}
/**
* Keeps only one label for each token, whichever has the longest
*/
@Option(name="removeOverLappingLabelsFromSeed")
public boolean removeOverLappingLabelsFromSeed = false;
/**
* Only works if you have single label. And the word classes are given.
*/
@Option(name = "usePhraseEvalWordClass")
public boolean usePhraseEvalWordClass = false;
/**
* Only works if you have single label. And the word vectors are given.
*/
@Option(name = "usePhraseEvalWordVector")
public boolean usePhraseEvalWordVector = false;
/**
* use google tf-idf for learning phrases. Need to also provide googleNgram_dbname,
* googleNgram_username and googleNgram_host
*/
@Option(name = "usePhraseEvalGoogleNgram")
public boolean usePhraseEvalGoogleNgram = false;
/**
* use domain tf-idf for learning phrases
*/
@Option(name = "usePhraseEvalDomainNgram")
public boolean usePhraseEvalDomainNgram = false;
/**
* use \sum_allpat pattern_wt_that_extracted_phrase/phrase_freq for learning
* phrases
*/
@Option(name = "usePhraseEvalPatWtByFreq")
public boolean usePhraseEvalPatWtByFreq = true;
/**
* odds of the phrase freq in the label dictionary vs other dictionaries
*/
@Option(name = "usePhraseEvalSemanticOdds")
public boolean usePhraseEvalSemanticOdds = false;
/**
* Edit distance between this phrase and the other phrases in the label
* dictionary
*/
@Option(name = "usePhraseEvalEditDistSame")
public boolean usePhraseEvalEditDistSame = false;
/**
* Edit distance between this phrase and other phrases in other dictionaries
*/
@Option(name = "usePhraseEvalEditDistOther")
public boolean usePhraseEvalEditDistOther = false;
@Option(name = "usePhraseEvalWordShape", gloss="% of phrases of that label that have the same word shape")
public boolean usePhraseEvalWordShape = false;
@Option(name="usePhraseEvalWordShapeStr", gloss="uses the word shape str as a feature")
public boolean usePhraseEvalWordShapeStr = false;
@Option(name="usePhraseEvalFirstCapital", gloss="words starts with a capital letter")
public boolean usePhraseEvalFirstCapital;
/**
* use bag of words
*/
@Option(name="usePhraseEvalBOW")
public boolean usePhraseEvalBOW = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalWordClass")
public boolean usePatternEvalWordClass = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalWordShape")
public boolean usePatternEvalWordShape = false;
@Option(name="usePatternEvalWordShapeStr", gloss="uses the word shape str as a feature")
public boolean usePatternEvalWordShapeStr = false;
@Option(name="usePatternEvalFirstCapital", gloss="words starts with a capital letter")
public boolean usePatternEvalFirstCapital;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalGoogleNgram")
public boolean usePatternEvalGoogleNgram = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings. Need to also provide googleNgram_dbname,
* googleNgram_username and googleNgram_host
*/
@Option(name = "usePatternEvalDomainNgram")
public boolean usePatternEvalDomainNgram = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPatLogP
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalSemanticOdds")
public boolean usePatternEvalSemanticOdds = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPatLogP
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalEditDistSame")
public boolean usePatternEvalEditDistSame = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPatLogP
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalEditDistOther")
public boolean usePatternEvalEditDistOther = false;
/**
* use bag of words
*/
@Option(name="usePatternEvalBOW")
public boolean usePatternEvalBOW = false;
/**
* These are used to learn weights for features if using logistic regression.
* Percentage of non-labeled tokens selected as negative.
*/
@Option(name = "perSelectRand")
public double perSelectRand = 0.01;
/**
* These are used to learn weights for features if using logistic regression.
* Percentage of negative tokens selected as negative.
*/
@Option(name = "perSelectNeg")
public double perSelectNeg = 1;
/**
* Especially useful for multi word phrase extraction. Do not extract a phrase
* if any word is labeled with any other class.
*/
@Option(name = "doNotExtractPhraseAnyWordLabeledOtherClass")
public boolean doNotExtractPhraseAnyWordLabeledOtherClass = true;
/**
* You can save the inverted index. Lucene index is saved by default to invertedIndexDirectory
if given.
*/
@Option(name="saveInvertedIndex")
public boolean saveInvertedIndex = false;
/**
* You can load the inverted index using this file.
* If false and using lucene index, the existing directory is deleted and new index is made.
*/
@Option(name="loadInvertedIndex")
public boolean loadInvertedIndex = false;
@Option(name = "storePatsForEachToken", gloss="used for storing patterns in PSQL/MEMORY/LUCENE")
public PatternForEachTokenWay storePatsForEachToken = PatternForEachTokenWay.MEMORY;
//
// @Option(name = "storePatsIndex", gloss="used for storing patterns index")
// public PatternIndexWay storePatsIndex = PatternIndexWay.MEMORY;
@Option(name="sampleSentencesForSufficientStats",gloss="% sentences to use for learning pattterns" )
double sampleSentencesForSufficientStats = 1.0;
// /**
// * Directory where to save the sentences ser files.
// */
// @Option(name="saveSentencesSerDir")
// public File saveSentencesSerDir = null;
//
// public boolean usingDirForSentsInIndex = false;
// @Option(name = "wekaOptions")
// public String wekaOptions = "";
public static String backgroundSymbol = "O";
int wordShaper = WordShapeClassifier.WORDSHAPECHRIS2;
private ConcurrentHashMap wordShapeCache = new ConcurrentHashMap<>();
public SentenceIndex invertedIndex;
public static String extremedebug = "extremePatDebug";
public static String minimaldebug = "minimaldebug";
Properties props;
public enum PatternForEachTokenWay {MEMORY, LUCENE, DB};
public enum PatternIndexWay {MEMORY, OPENHFT, LUCENE};
public List functionWords = Arrays.asList("a","an","the","of","at","on","in","he","she","him","her","they","them","and","no","not","nor","as","do");
/**
 * Constructor for the case where seed dictionaries are filled in later:
 * every label starts with an empty seed set. Generalize classes default to
 * an empty map when null, and the answer classes are always merged into them.
 * Calls setUp(props), which reads option values and any configured files.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Set labels, Map>> answerClass, Map generalizeClasses,
Map> ignoreClasses) throws IOException {
this.labels = labels;
for(String label: labels){
this.seedLabelDictionary.put(label, new HashSet<>());
}
this.answerClass = answerClass;
// NOTE(review): generalizeClasses is a STATIC field, so this assignment is
// shared across all instances — confirm this is intended.
this.generalizeClasses = generalizeClasses;
if(this.generalizeClasses == null)
this.generalizeClasses = new HashMap<>();
// Label answer classes are always generalized as well.
this.generalizeClasses.putAll(answerClass);
this.ignoreWordswithClassesDuringSelection = ignoreClasses;
setUp(props);
}
/**
 * Constructor taking pre-populated seed dictionaries (label -> seed phrases).
 * Seed sets are stored as unmodifiable views of the caller's sets, and the
 * label set is the dictionary's key set. Calls setUp(props) last.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Map> labelDictionary, Map>> answerClass, Map generalizeClasses,
Map> ignoreClasses) throws IOException {
//make the list unmodifiable!
for(Entry> en2: labelDictionary.entrySet()){
seedLabelDictionary.put(en2.getKey(), Collections.unmodifiableSet(en2.getValue()));
}
// Live view of the dictionary's keys, not a copy.
this.labels = labelDictionary.keySet();
this.answerClass = answerClass;
// NOTE(review): generalizeClasses is a STATIC field — shared across instances.
this.generalizeClasses = generalizeClasses;
if(this.generalizeClasses == null)
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(answerClass);
this.ignoreWordswithClassesDuringSelection = ignoreClasses;
setUp(props);
}
/**
 * Convenience constructor without generalize/ignore classes: seeds start
 * empty per label and generalize classes are just the answer classes.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Set labels, Map>> answerClass) throws IOException {
this.labels = labels;
for(String label: labels){
this.seedLabelDictionary.put(label, new HashSet<>());
}
this.answerClass = answerClass;
// Static field; re-initialized here and seeded from the answer classes.
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(answerClass);
setUp(props);
}
/**
 * Single-label convenience constructor: builds singleton label/answer-class
 * maps, starts with an empty seed set, and generalizes the answer class.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, String label, Class> answerClass) throws IOException {
this.labels = new HashSet<>();
this.labels.add(label);
this.seedLabelDictionary.put(label, new HashSet<>());
this.answerClass = new HashMap<>();
this.answerClass.put(label, answerClass);
// Static field; re-initialized here and seeded from the (single) answer class.
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(this.answerClass);
setUp(props);
}
/**
 * Constructor with explicit generalize classes but no ignore classes.
 * Seeds start empty per label; null generalizeClasses becomes an empty map;
 * answer classes are merged into the generalize classes. Calls setUp(props).
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Set labels, Map>> answerClass, Map generalizeClasses) throws IOException {
this.labels = labels;
for(String label: labels){
this.seedLabelDictionary.put(label, new HashSet<>());
}
this.answerClass = answerClass;
// NOTE(review): generalizeClasses is a STATIC field — shared across instances.
this.generalizeClasses = generalizeClasses;
if(this.generalizeClasses == null)
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(answerClass);
setUp(props);
}
@SuppressWarnings("rawtypes")
/**
 * One-time initialization from Properties: fills @Option fields (here and in
 * the pattern factories), builds a TokensRegex Env per label, and loads all
 * configured word lists and cluster files. Idempotent: a second call is a
 * no-op because alreadySetUp is latched at the end.
 *
 * @param props configuration properties used to fill options and locate files
 * @throws IOException if any configured file cannot be read
 */
public void setUp(Properties props) throws IOException {
if (alreadySetUp) {
return;
}
Redwood.log(Redwood.DBG, "Setting up ConstantsAndVariables");
// Populate @Option-annotated fields on this object and the pattern factories.
ArgumentParser.fillOptions(this, props);
ArgumentParser.fillOptions(PatternFactory.class, props);
ArgumentParser.fillOptions(SurfacePatternFactory.class, props);
ArgumentParser.fillOptions(DepPatternFactory.class, props);
if (wordIgnoreRegex != null && !wordIgnoreRegex.isEmpty()) {
Redwood.log(Redwood.DBG, "Ignore word regex is " + wordIgnoreRegex);
PatternFactory.ignoreWordRegex = Pattern.compile(wordIgnoreRegex);
}
// One TokensRegex environment per label, with answer and generalize classes bound.
for (String label : labels) {
env.put(label, TokenSequencePattern.getNewEnv());
// env.get(label).bind("answer", answerClass.get(label));
for (Entry>> en : this.answerClass
.entrySet()) {
env.get(label).bind(en.getKey(), en.getValue());
}
for (Entry en : generalizeClasses.entrySet())
env.get(label).bind(en.getKey(), en.getValue());
}
Redwood.log(Redwood.DBG, channelNameLogger, "Running with debug output");
// Load stop words (static field). Files are separated by ';' or ','.
stopWords = new HashSet<>();
if(stopWordsPatternFiles != null) {
Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, "Reading stop words from "
+ stopWordsPatternFiles);
for (String stopwfile : stopWordsPatternFiles.split("[;,]"))
{
for(String word: IOUtils.readLines(stopwfile)){
if(!word.trim().isEmpty())
stopWords.add(CandidatePhrase.createOrGet(word.trim()));
}
}
}
// English dictionary words (used to avoid labeling common words from seeds).
englishWords = new HashSet<>();
if(englishWordsFiles != null) {
System.out.println("Reading english words from " + englishWordsFiles);
for (String englishWordsFile : englishWordsFiles.split("[;,]"))
englishWords.addAll(IOUtils.linesFromFile(englishWordsFile));
}
if (commonWordsPatternFiles != null) {
commonEngWords = Collections.synchronizedSet(new HashSet<>());
for (String file : commonWordsPatternFiles.split("[;,]"))
commonEngWords.addAll(IOUtils.linesFromFile(file));
}
// Other-semantic-class phrases act as negatives for every label; each entry
// may be a file or a directory (expanded by listFileIncludingItself). Phrases
// longer than numWordsCompoundMax tokens are skipped.
if (otherSemanticClassesFiles != null) {
if (otherSemanticClassesWords == null)
otherSemanticClassesWords = Collections
.synchronizedSet(new HashSet<>());
for (String file : otherSemanticClassesFiles.split("[;,]")) {
for (File f : listFileIncludingItself(file)) {
for (String w : IOUtils.readLines(f)) {
String[] t = w.split("\\s+");
if (t.length <= PatternFactory.numWordsCompoundMax)
otherSemanticClassesWords.add(CandidatePhrase.createOrGet(w));
}
}
}
System.out.println("Size of othersemantic class variables is "
+ otherSemanticClassesWords.size());
} else {
otherSemanticClassesWords = Collections.synchronizedSet(new HashSet<>());
System.out.println("Size of othersemantic class variables is " + 0);
}
// Build a TokensRegex alternation "/w1|w2|.../" over the stop words; each word
// is regex-quoted, with backslashes escaped first.
String stopStr = "/";
int i = 0;
for (CandidatePhrase s : stopWords) {
if (i > 0)
stopStr += "|";
stopStr += Pattern.quote(s.getPhrase().replaceAll("\\\\", "\\\\\\\\"));
i++;
}
stopStr += "/";
// Bind the shared pattern variables and annotation keys into each label's env.
for (String label : labels) {
env.get(label).bind("$FILLER",
"/" + StringUtils.join(PatternFactory.fillerWords, "|") + "/");
env.get(label).bind("$STOPWORD", stopStr);
env.get(label).bind("$MOD", "[{tag:/JJ.*/}]");
if (matchLowerCaseContext){
env.get(label).setDefaultStringMatchFlags(NodePattern.CASE_INSENSITIVE);
env.get(label).setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE);
}
env.get(label).bind("OTHERSEM",
PatternsAnnotations.OtherSemanticLabel.class);
env.get(label).bind("grandparentparsetag", CoreAnnotations.GrandparentAnnotation.class);
}
// Word-class cluster files: each line is "word<TAB>clusterId".
if (wordClassClusterFile != null) {
wordClassClusters = new HashMap<>();
for (String line : IOUtils.readLines(wordClassClusterFile)) {
String[] t = line.split("\t");
wordClassClusters.put(t[0], Integer.parseInt(t[1]));
}
}
if (generalWordClassClusterFile != null) {
setGeneralWordClassClusters(new HashMap<>());
for (String line : IOUtils.readLines(generalWordClassClusterFile)) {
String[] t = line.split("\t");
getGeneralWordClassClusters().put(t[0], Integer.parseInt(t[1]));
}
}
// Allowed POS-tag initials per label: "label1,T1,T2;label2,T3;..."
if(targetAllowedTagsInitialsStr!= null){
allowedTagsInitials = new HashMap<>();
for(String labelstr : targetAllowedTagsInitialsStr.split(";")){
String[] t = labelstr.split(",");
Set st = new HashSet<>();
for(int j = 1; j < t.length; j++)
st.add(t[j]);
allowedTagsInitials.put(t[0], st);
}
}
// Allowed NER tags per label, same "label,NER1,NER2;..." format; only used
// when the useTargetNERRestriction flag is on.
if(PatternFactory.useTargetNERRestriction && targetAllowedNERs !=null){
allowedNERsforLabels = new HashMap<>();
for(String labelstr : targetAllowedNERs.split(";")){
String[] t = labelstr.split(",");
Set st = new HashSet<>();
for(int j = 1; j < t.length; j++)
st.add(t[j]);
allowedNERsforLabels.put(t[0], st);
}
}
// Start each label with an empty per-iteration learned-words record.
for(String label: labels){
learnedWordsEachIter.put(label, new TreeMap<>());
}
if(usePhraseEvalGoogleNgram || usePatternEvalDomainNgram) {
Data.usingGoogleNgram = true;
ArgumentParser.fillOptions(GoogleNGramsSQLBacked.class, props);
}
// Gold entities for evaluation, if requested.
if(goldEntitiesEvalFiles !=null && evaluate)
goldEntities = readGoldEntities(goldEntitiesEvalFiles);
// Latch so subsequent calls are no-ops.
alreadySetUp = true;
}
/**
 * Expands a path into the files it denotes: a plain file yields itself;
 * a directory yields all files under it, recursively.
 * (Generic return type restored; it was stripped to a raw {@code Iterable}
 * by the HTML scrape.)
 *
 * @param file path to a file or directory
 * @return an iterable over the file itself, or all files under the directory
 */
public static Iterable<File> listFileIncludingItself(String file) {
  File f = new File(file);
  if (!f.isDirectory())
    // Immutable singleton view is enough for iteration.
    return Collections.singletonList(f);
  else
    return IOUtils.iterFilesRecursive(f);
}
// The format of goldEntitiesEvalFiles is assumed same as
// seedwordsfiles: label,file;label2,file2;...
// Each file of gold entities consists of each entity in newline with
// incorrect entities marked with "#" at the end of the entity.
// Learned entities not present in the gold file are considered
// negative.
static Map> readGoldEntities(String goldEntitiesEvalFiles){
Map> goldWords = new HashMap<>();
if (goldEntitiesEvalFiles != null) {
for (String gfile : goldEntitiesEvalFiles.split(";")) {
String[] t = gfile.split(",");
String label = t[0];
String goldfile = t[1];
Map goldWords4Label = new HashMap<>();
for (String line : IOUtils.readLines(goldfile)) {
line = line.trim();
if (line.isEmpty())
continue;
if (line.endsWith("#"))
goldWords4Label.put(line.substring(0, line.length() - 1), false);
else
goldWords4Label.put(line, true);
}
goldWords.put(label, goldWords4Label);
}
}
return goldWords;
}
//streams sents, files-from-which-sents-were read
static public class DataSentsIterator implements Iterator, File>> {
boolean readInMemory = false;
Iterator sentfilesIter = null;
boolean batchProcessSents;
/**
 * @param batchProcessSents if true, sentences live in files and this iterator
 *        walks Data.sentsFiles; if false, everything is in memory and the
 *        iterator yields exactly one batch.
 */
public DataSentsIterator(boolean batchProcessSents){
this.batchProcessSents = batchProcessSents;
if(batchProcessSents){
// NOTE(review): assumes Data.sentsFiles is already populated — confirm callers.
sentfilesIter = Data.sentsFiles.iterator();
}
}
@Override
public boolean hasNext() {
  // In-memory mode yields a single batch: available until it has been read once.
  if (!batchProcessSents) {
    return !readInMemory;
  }
  // Batch mode: there is more data while there are unread sentence files.
  return sentfilesIter.hasNext();
}
@Override
public Pair