edu.stanford.nlp.patterns.ConstantsAndVariables Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.patterns;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.NodePattern;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.PatternScoring;
import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.WordScoring;
import edu.stanford.nlp.patterns.dep.DepPatternFactory;
import edu.stanford.nlp.patterns.surface.SurfacePatternFactory;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.ArgumentParser.Option;
import edu.stanford.nlp.util.TypesafeMap.Key;
import edu.stanford.nlp.util.logging.Redwood;
import javax.json.Json;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObjectBuilder;
public class ConstantsAndVariables implements Serializable {
private static final long serialVersionUID = 1L;
/**
* Maximum number of iterations to run
*/
@Option(name = "numIterationsForPatterns")
public Integer numIterationsForPatterns = 10;
/**
* Maximum number of patterns learned in each iteration
*/
@Option(name = "numPatterns")
public int numPatterns = 10;
/**
* The output directory where the justifications of learning patterns and
* phrases would be saved. These are needed for visualization
*/
@Option(name = "outDir")
public String outDir = null;
/**
* Cached file of all patterns for all tokens
*/
@Option(name = "allPatternsDir")
public String allPatternsDir = null;
/**
* If all patterns should be computed. Otherwise patterns are read from
* allPatternsFile
*/
@Option(name = "computeAllPatterns")
public boolean computeAllPatterns = true;
// @Option(name = "removeRedundantPatterns")
// public boolean removeRedundantPatterns = true;
/**
* Pattern Scoring mechanism. See {@link PatternScoring} for options.
*/
@Option(name = "patternScoring")
public PatternScoring patternScoring = PatternScoring.PosNegUnlabOdds;
/**
* Threshold for learning a pattern
*/
@Option(name = "thresholdSelectPattern")
public double thresholdSelectPattern = 1.0;
// /**
// * Do not learn patterns that do not extract any unlabeled tokens (kind of
// * useless)
// */
// @Option(name = "discardPatternsWithNoUnlabSupport")
// public boolean discardPatternsWithNoUnlabSupport = true;
/**
* Currently, does not work correctly. TODO: make this work. Ideally this
* would label words only when they occur in the context of any learned
* pattern. This comment seems old. Test it!
*/
@Option(name = "restrictToMatched")
public boolean restrictToMatched = false;
/**
* Label words that are learned so that in further iterations we have more
* information
*/
@Option(name = "usePatternResultAsLabel")
public boolean usePatternResultAsLabel = true;
/**
* Debug flag for learning patterns. 0 means no output, 1 means necessary output, 2 means necessary output+some justification, 3 means extreme debug output
*/
@Option(name = "debug")
public int debug = 1;
/**
* Do not learn patterns in which the neighboring words have the same label.
* Deprecated!
*/
//@Option(name = "ignorePatWithLabeledNeigh")
//public boolean ignorePatWithLabeledNeigh = false;
/**
* Save this run as ...
*/
@Option(name = "identifier")
public String identifier = "getpatterns";
/**
* Use the actual dictionary matching phrase(s) instead of the token word or
* lemma in calculating the stats
*/
@Option(name = "useMatchingPhrase")
public boolean useMatchingPhrase = true;
/**
* Reduce pattern threshold (=0.8*current_value) to extract as many patterns
* as possible (still restricted by numPatterns
)
*/
@Option(name = "tuneThresholdKeepRunning")
public boolean tuneThresholdKeepRunning = false;
/**
* Maximum number of words to learn
*/
@Option(name = "maxExtractNumWords")
public int maxExtractNumWords = Integer.MAX_VALUE;
/**
* use the seed dictionaries and the new words learned for the other labels in
* the previous iterations as negative
*/
@Option(name = "useOtherLabelsWordsasNegative")
public boolean useOtherLabelsWordsasNegative = true;
/**
* If not null, write the output like
* "w1 w2 w3 w4 w5 ... " if w3 w4 have
* label1 and w4 has label 2
*/
@Option(name = "markedOutputTextFile")
String markedOutputTextFile = null;
/**
* If you want output of form "word\tlabels-separated-by-comma" in newlines
*/
@Option(name="columnOutputFile")
String columnOutputFile = null;
/**
* Lowercase the context words/lemmas
*/
@Option(name = "matchLowerCaseContext")
public static boolean matchLowerCaseContext = true;
/**
* Initials of all POS tags to use if
* usePOS4Pattern
is true, separated by comma.
*/
@Option(name = "targetAllowedTagsInitialsStr")
public String targetAllowedTagsInitialsStr = null;
public Map> allowedTagsInitials = null;
/**
* Allowed NERs for labels. Format is label1,NER1,NER11;label2,NER2,NER21,NER22;label3,...
* useTargetNERRestriction
flag should be true
*/
@Option(name = "targetAllowedNERs")
public String targetAllowedNERs = null;
public Map> allowedNERsforLabels = null;
/**
* Number of words to learn in each iteration
*/
@Option(name = "numWordsToAdd")
public int numWordsToAdd = 10;
@Option(name = "thresholdNumPatternsApplied")
public double thresholdNumPatternsApplied = 2;
@Option(name = "wordScoring")
public WordScoring wordScoring = WordScoring.WEIGHTEDNORM;
@Option(name = "thresholdWordExtract")
public double thresholdWordExtract = 0.2;
public boolean justify = false;
/**
* Sigma for L2 regularization in Logisitic regression, if a classifier is
* used to score phrases
*/
@Option(name = "LRSigma")
public double LRSigma = 1.0;
/**
* English words that are not labeled when labeling using seed dictionaries
*/
@Option(name = "englishWordsFiles")
public String englishWordsFiles = null;
private Set englishWords = new HashSet<>();
/**
* Words to be ignored when learning phrases if
* removePhrasesWithStopWords
or
* removeStopWordsFromSelectedPhrases
is true. Also, these words
* are considered negative when scoring a pattern (similar to
* othersemanticclasses).
*/
@Option(name = "commonWordsPatternFiles")
public String commonWordsPatternFiles = null;
private Set commonEngWords = null;
/**
* List of dictionary phrases that are negative for all labels to be learned.
* Format is file_1,file_2,... where file_i has each phrase in a different
* line
*
*/
@Option(name = "otherSemanticClassesFiles")
public String otherSemanticClassesFiles = null;
// set of words that are considered negative for all classes
private Set otherSemanticClassesWords = null;
/**
* Seed dictionary, set in the class that uses this class
*/
private Map> seedLabelDictionary = new HashMap<>();
/**
* Just the set of labels
*/
private Set labels = new HashSet<>();
private Map>> answerClass = null;
/**
* Can be used only when using the API - using the appropriate constructor.
* Tokens with specified classes set (has to be boolean return value, even
* though this variable says object) will be ignored.
*/
@SuppressWarnings("rawtypes")
private Map> ignoreWordswithClassesDuringSelection = null;
/**
* These classes will be generalized. It can only be used via the API using
* the appropriate constructor. All label classes are by default generalized.
*/
@SuppressWarnings("rawtypes")
private static Map generalizeClasses = new HashMap<>();
/**
* Minimum length of words that can be matched fuzzily
*/
@Option(name = "minLen4FuzzyForPattern")
public int minLen4FuzzyForPattern = 6;
/**
* Do not learn phrases that match this regex.
*/
@Option(name = "wordIgnoreRegex")
public String wordIgnoreRegex = "[^a-zA-Z]*";
/**
* Number of threads
*/
@Option(name = "numThreads")
public int numThreads = 1;
/**
* Words that are not learned. Patterns are not created around these words.
* And, if useStopWordsBeforeTerm in {@link edu.stanford.nlp.patterns.surface.CreatePatterns} is true.
*/
@Option(name = "stopWordsPatternFiles", gloss = "stop words")
public String stopWordsPatternFiles = null;
private static Set stopWords = null;
/**
* Environment for {@link TokenSequencePattern}
*/
public Map env = new HashMap<>();
public static Env globalEnv = TokenSequencePattern.getNewEnv();
/**
*
*/
@Option(name = "removeStopWordsFromSelectedPhrases")
public boolean removeStopWordsFromSelectedPhrases = false;
/**
*
*/
@Option(name = "removePhrasesWithStopWords")
public boolean removePhrasesWithStopWords = false;
private boolean alreadySetUp = false;
/**
* Cluster file, in which each line is word/phraseclusterid
*/
@Option(name = "wordClassClusterFile")
String wordClassClusterFile = null;
private Map wordClassClusters = new HashMap<>();
/**
* General cluster file, if you wanna use it somehow, in which each line is
* word/phraseclusterid
*/
@Option(name = "generalWordClassClusterFile")
String generalWordClassClusterFile = null;
private Map generalWordClassClusters = null;
// @Option(name = "includeExternalFeatures")
// public boolean includeExternalFeatures = false;
@Option(name = "externalFeatureWeightsFile")
public String externalFeatureWeightsDir = null;
@Option(name = "doNotApplyPatterns")
public boolean doNotApplyPatterns = false;
/**
* If score for a pattern is square rooted
*/
@Option(name = "sqrtPatScore")
public boolean sqrtPatScore = false;
/**
* Remove patterns that have number of unlabeled words is less than this.
*/
@Option(name = "minUnlabPhraseSupportForPat")
public int minUnlabPhraseSupportForPat = 0;
/**
* Remove patterns that have number of positive words less than this.
*/
@Option(name = "minPosPhraseSupportForPat")
public int minPosPhraseSupportForPat = 1;
/**
* For example, if positive seed dict contains "cancer" and "breast cancer" then "breast" is included as negative
*/
@Option(name="addIndvWordsFromPhrasesExceptLastAsNeg")
public boolean addIndvWordsFromPhrasesExceptLastAsNeg = false;
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromEnglishWords = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromEnglishWordsMatches = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromOtherSemanticClasses = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromOtherSemanticClassesMatches = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromThisClass = new ConcurrentHashMap<>();
/**
* Cached files
*/
private ConcurrentHashMap editDistanceFromThisClassMatches = new ConcurrentHashMap<>();
private ConcurrentHashMap> wordShapesForLabels = new ConcurrentHashMap<>();
String channelNameLogger = "settingUp";
public Map> distSimWeights = new HashMap<>();
public Map> dictOddsWeights = new HashMap<>();
@Option(name="invertedIndexClass", gloss="another option is Lucene backed, which is not included in the CoreNLP release. Contact us to get a copy (distributed under Apache License).")
public Class invertedIndexClass = InvertedIndexByTokens.class;
/**
* Where the inverted index (either in memory or lucene) is stored
*/
@Option(name="invertedIndexDirectory")
public String invertedIndexDirectory;
@Option(name="clubNeighboringLabeledWords")
public boolean clubNeighboringLabeledWords = false;
@Option(name="patternType")
public PatternFactory.PatternType patternType = PatternFactory.PatternType.SURFACE;
@Option(name="subsampleUnkAsNegUsingSim", gloss="When learning a classifier, remove phrases from unknown phrases that are too close to the positive phrases")
public boolean subsampleUnkAsNegUsingSim = false;
// @Option(name="subSampleUnkAsNegUsingSimPercentage", gloss="When using subsampleUnkAsNegUsingSim, select bottom %")
// public double subSampleUnkAsNegUsingSimPercentage = 0.95;
@Option(name="expandPositivesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the positives")
public boolean expandPositivesWhenSampling = false;
@Option(name="expandNegativesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the negatives")
public boolean expandNegativesWhenSampling = false;
@Option(name="similarityThresholdHighPrecision", gloss="used for expanding positives")
public double similarityThresholdHighPrecision = 0.7;
@Option(name="positiveSimilarityThresholdLowPrecision", gloss="used for not choosing close unknowns as positives")
public double positiveSimilarityThresholdLowPrecision = 0.5;
// @Option(name="subSampleUnkAsPosUsingSimPercentage", gloss="When using expandPositivesWhenSampling, select top % after applying the threshold")
// public double subSampleUnkAsPosUsingSimPercentage = 0.05;
@Option(name="wordVectorFile", gloss = "if using word vectors for computing similarities")
public String wordVectorFile = null;
@Option(name="useWordVectorsToComputeSim", gloss="use vectors directly instead of word classes for computing similarity")
public boolean useWordVectorsToComputeSim;
@Option(name="logFileVectorSimilarity", gloss="To store vectors for selected/almost-selected positive and negative words")
String logFileVectorSimilarity = null;
@Option(name="goldEntitiesEvalFiles", gloss="label1,gold_list_of_entities_file;label2,...")
public String goldEntitiesEvalFiles = null;
@Option(name="evaluate")
public boolean evaluate = false;
Map> goldEntities = new HashMap<>();
@Option(name="featureCountThreshold")
public int featureCountThreshold = 1;
@Option(name="expandPhrasesNumTopSimilar", gloss="k in kNN")
public int expandPhrasesNumTopSimilar = 1;
/**
* Whether to do a fuzzy matching when matching seeds to text. You can tune minLen4FuzzyForPattern parameter.
*/
@Option(name="fuzzyMatch")
public boolean fuzzyMatch = false;
/**
* Ignore case when matching seed words. It's a map so something like {name->true,place->false}
*/
@Option(name="ignoreCaseSeedMatch")
public Map ignoreCaseSeedMatch = new HashMap<>();
@Option(name="sentsOutFile")
public String sentsOutFile = null;
@Option(name="savePatternsWordsDir")
public boolean savePatternsWordsDir = true;
@Option(name="learn")
public boolean learn = true;
/**
 * Returns the set of labels being learned.
 * Note: this is the live internal set, not a defensive copy.
 * (Generic type restored; it was stripped to a raw {@code Set} by the HTML scrape.)
 */
public Set<String> getLabels() {
  return labels;
}
// public void addLearnedWords(String trainLabel, Counter identifiedWords) {
// if(!learnedWords.containsKey(trainLabel))
// learnedWords.put(trainLabel, new ClassicCounter());
// this.learnedWords.get(trainLabel).addAll(identifiedWords);
// }
/**
 * Collects the current configuration as a map from option/field name to its
 * string value. Includes every entry of {@code props} (if set) plus every
 * declared field of this class whose type is primitive or listed in
 * {@code GetPatternsFromDataMultiClass.printOptionClass}.
 *
 * @return map of option name to stringified value ("null" for null values)
 */
public Map<String, String> getAllOptions() {
  Map<String, String> values = new HashMap<>();
  if (props != null)
    props.forEach((x, y) -> values.put(x.toString(), y == null ? "null" : y.toString()));
  try {
    // Use the runtime class directly; the original Class.forName(getClass().getName())
    // round-trip was redundant.
    Field[] aClassFields = this.getClass().getDeclaredFields();
    for (Field f : aClassFields) {
      // Bug fix: the original tested f.getType().getClass().isPrimitive(), which is
      // always false (the Class of a Class object is never primitive), so primitive
      // options were silently skipped. Test the field's own type instead.
      // NOTE(review): Arrays.binarySearch assumes printOptionClass is sorted — confirm.
      if (f.getType().isPrimitive()
          || Arrays.binarySearch(GetPatternsFromDataMultiClass.printOptionClass, f.getType()) >= 0) {
        String fName = f.getName();
        Object fvalue = f.get(this);
        values.put(fName, fvalue == null ? "null" : fvalue.toString());
      }
    }
  } catch (Exception e) {
    // Best-effort dump: reflection failures should not abort option reporting.
    e.printStackTrace();
  }
  return values;
}
public boolean hasSeedWordOrOtherSem(CandidatePhrase p) {
for(Map.Entry> seeds: this.seedLabelDictionary.entrySet()){
if(seeds.getValue().contains(p))
return true;
}
if(otherSemanticClassesWords.contains(p))
return true;
return false;
}
public TreeMap> getLearnedWordsEachIter(String label) {
return learnedWordsEachIter.get(label);
}
public Map>> getLearnedWordsEachIter() {
return learnedWordsEachIter;
}
public void setLearnedWordsEachIter(TreeMap> words, String label) {
this.learnedWordsEachIter.put(label, words);
}
//PatternFactory.PatternType.SURFACE;
// public PatternIndex getPatternIndex() {
// return patternIndex;
// }
//
// public void setPatternIndex(PatternIndex patternIndex) {
// this.patternIndex = patternIndex;
// }
static public class ScorePhraseMeasures implements Comparable {
String name;
static int num = 0;
int numObj;
static Map createdObjects = new ConcurrentHashMap<>();
public static ScorePhraseMeasures create(String n){
if(createdObjects.containsKey(n))
return createdObjects.get(n);
else
return new ScorePhraseMeasures(n);
}
private ScorePhraseMeasures(String n){
this.name= n;
numObj = num++;
createdObjects.put(n, this);
}
@Override
public String toString(){return name;}
@Override
public boolean equals(Object o){
if(! (o instanceof ScorePhraseMeasures)) return false;
return ((ScorePhraseMeasures)o).numObj == (this.numObj);
}
static final ScorePhraseMeasures DISTSIM = new ScorePhraseMeasures("DistSim");
static final ScorePhraseMeasures GOOGLENGRAM = new ScorePhraseMeasures("GoogleNGram");
static final ScorePhraseMeasures PATWTBYFREQ=new ScorePhraseMeasures("PatWtByFreq");
static final ScorePhraseMeasures EDITDISTSAME=new ScorePhraseMeasures("EditDistSame");
static final ScorePhraseMeasures EDITDISTOTHER =new ScorePhraseMeasures("EditDistOther");
static final ScorePhraseMeasures DOMAINNGRAM =new ScorePhraseMeasures("DomainNgram");
static final ScorePhraseMeasures SEMANTICODDS =new ScorePhraseMeasures("SemanticOdds");
static final ScorePhraseMeasures WORDSHAPE = new ScorePhraseMeasures("WordShape");
static final ScorePhraseMeasures WORDVECPOSSIMAVG = new ScorePhraseMeasures("WordVecPosSimAvg");
static final ScorePhraseMeasures WORDVECPOSSIMMAX = new ScorePhraseMeasures("WordVecPosSimMax");
static final ScorePhraseMeasures WORDVECNEGSIMAVG = new ScorePhraseMeasures("WordVecNegSimAvg");
static final ScorePhraseMeasures WORDVECNEGSIMMAX = new ScorePhraseMeasures("WordVecNegSimMax");
static final ScorePhraseMeasures ISFIRSTCAPITAL = new ScorePhraseMeasures("IsFirstLetterCapital");
static final ScorePhraseMeasures WORDSHAPESTR = new ScorePhraseMeasures("WordShapeStr");
static final ScorePhraseMeasures BOW = new ScorePhraseMeasures("Word");
@Override
public int compareTo(Object o) {
if(!(o instanceof ScorePhraseMeasures))
return -1;
else return o.toString().compareTo(this.toString());
}
}
/**
* Keeps only one label for each token, whichever has the longest
*/
@Option(name="removeOverLappingLabelsFromSeed")
public boolean removeOverLappingLabelsFromSeed = false;
/**
* Only works if you have single label. And the word classes are given.
*/
@Option(name = "usePhraseEvalWordClass")
public boolean usePhraseEvalWordClass = false;
/**
* Only works if you have single label. And the word vectors are given.
*/
@Option(name = "usePhraseEvalWordVector")
public boolean usePhraseEvalWordVector = false;
/**
* use google tf-idf for learning phrases. Need to also provide googleNgram_dbname,
* googleNgram_username and googleNgram_host
*/
@Option(name = "usePhraseEvalGoogleNgram")
public boolean usePhraseEvalGoogleNgram = false;
/**
* use domain tf-idf for learning phrases
*/
@Option(name = "usePhraseEvalDomainNgram")
public boolean usePhraseEvalDomainNgram = false;
/**
* use \sum_allpat pattern_wt_that_extracted_phrase/phrase_freq for learning
* phrases
*/
@Option(name = "usePhraseEvalPatWtByFreq")
public boolean usePhraseEvalPatWtByFreq = true;
/**
* odds of the phrase freq in the label dictionary vs other dictionaries
*/
@Option(name = "usePhraseEvalSemanticOdds")
public boolean usePhraseEvalSemanticOdds = false;
/**
* Edit distance between this phrase and the other phrases in the label
* dictionary
*/
@Option(name = "usePhraseEvalEditDistSame")
public boolean usePhraseEvalEditDistSame = false;
/**
* Edit distance between this phrase and other phrases in other dictionaries
*/
@Option(name = "usePhraseEvalEditDistOther")
public boolean usePhraseEvalEditDistOther = false;
@Option(name = "usePhraseEvalWordShape", gloss="% of phrases of that label that have the same word shape")
public boolean usePhraseEvalWordShape = false;
@Option(name="usePhraseEvalWordShapeStr", gloss="uses the word shape str as a feature")
public boolean usePhraseEvalWordShapeStr = false;
@Option(name="usePhraseEvalFirstCapital", gloss="words starts with a capital letter")
public boolean usePhraseEvalFirstCapital;
/**
* use bag of words
*/
@Option(name="usePhraseEvalBOW")
public boolean usePhraseEvalBOW = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalWordClass")
public boolean usePatternEvalWordClass = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalWordShape")
public boolean usePatternEvalWordShape = false;
@Option(name="usePatternEvalWordShapeStr", gloss="uses the word shape str as a feature")
public boolean usePatternEvalWordShapeStr = false;
@Option(name="usePatternEvalFirstCapital", gloss="words starts with a capital letter")
public boolean usePatternEvalFirstCapital;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalGoogleNgram")
public boolean usePatternEvalGoogleNgram = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPat
. See usePhrase* for meanings. Need to also provide googleNgram_dbname,
* googleNgram_username and googleNgram_host
*/
@Option(name = "usePatternEvalDomainNgram")
public boolean usePatternEvalDomainNgram = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPatLogP
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalSemanticOdds")
public boolean usePatternEvalSemanticOdds = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPatLogP
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalEditDistSame")
public boolean usePatternEvalEditDistSame = false;
/**
* Used only if {@link #patternScoring} is PhEvalInPat
or
* PhEvalInPatLogP
. See usePhrase* for meanings.
*/
@Option(name = "usePatternEvalEditDistOther")
public boolean usePatternEvalEditDistOther = false;
/**
* use bag of words
*/
@Option(name="usePatternEvalBOW")
public boolean usePatternEvalBOW = false;
/**
* These are used to learn weights for features if using logistic regression.
* Percentage of non-labeled tokens selected as negative.
*/
@Option(name = "perSelectRand")
public double perSelectRand = 0.01;
/**
* These are used to learn weights for features if using logistic regression.
* Percentage of negative tokens selected as negative.
*/
@Option(name = "perSelectNeg")
public double perSelectNeg = 1;
/**
* Especially useful for multi word phrase extraction. Do not extract a phrase
* if any word is labeled with any other class.
*/
@Option(name = "doNotExtractPhraseAnyWordLabeledOtherClass")
public boolean doNotExtractPhraseAnyWordLabeledOtherClass = true;
/**
* You can save the inverted index. Lucene index is saved by default to invertedIndexDirectory
if given.
*/
@Option(name="saveInvertedIndex")
public boolean saveInvertedIndex = false;
/**
* You can load the inverted index using this file.
* If false and using lucene index, the existing directory is deleted and new index is made.
*/
@Option(name="loadInvertedIndex")
public boolean loadInvertedIndex = false;
@Option(name = "storePatsForEachToken", gloss="used for storing patterns in PSQL/MEMORY/LUCENE")
public PatternForEachTokenWay storePatsForEachToken = PatternForEachTokenWay.MEMORY;
//
// @Option(name = "storePatsIndex", gloss="used for storing patterns index")
// public PatternIndexWay storePatsIndex = PatternIndexWay.MEMORY;
@Option(name="sampleSentencesForSufficientStats",gloss="% sentences to use for learning pattterns" )
double sampleSentencesForSufficientStats = 1.0;
// /**
// * Directory where to save the sentences ser files.
// */
// @Option(name="saveSentencesSerDir")
// public File saveSentencesSerDir = null;
//
// public boolean usingDirForSentsInIndex = false;
// @Option(name = "wekaOptions")
// public String wekaOptions = "";
public static String backgroundSymbol = "O";
int wordShaper = WordShapeClassifier.WORDSHAPECHRIS2;
private ConcurrentHashMap wordShapeCache = new ConcurrentHashMap<>();
public SentenceIndex invertedIndex;
public static String extremedebug = "extremePatDebug";
public static String minimaldebug = "minimaldebug";
Properties props;
public enum PatternForEachTokenWay {MEMORY, LUCENE, DB};
public enum PatternIndexWay {MEMORY, OPENHFT, LUCENE};
public List functionWords = Arrays.asList("a","an","the","of","at","on","in","he","she","him","her","they","them","and","no","not","nor","as","do");
/**
 * Constructor for the case where seed dictionaries are filled in later:
 * every label starts with an empty seed set. Generalize classes default to
 * an empty map when null, and the answer classes are always merged into them.
 * Calls setUp(props), which reads option values and any configured files.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Set labels, Map>> answerClass, Map generalizeClasses,
Map> ignoreClasses) throws IOException {
this.labels = labels;
for(String label: labels){
this.seedLabelDictionary.put(label, new HashSet<>());
}
this.answerClass = answerClass;
// NOTE(review): generalizeClasses is a STATIC field, so this assignment is
// shared across all instances — confirm this is intended.
this.generalizeClasses = generalizeClasses;
if(this.generalizeClasses == null)
this.generalizeClasses = new HashMap<>();
// Label answer classes are always generalized as well.
this.generalizeClasses.putAll(answerClass);
this.ignoreWordswithClassesDuringSelection = ignoreClasses;
setUp(props);
}
/**
 * Constructor taking pre-populated seed dictionaries (label -> seed phrases).
 * Seed sets are stored as unmodifiable views of the caller's sets, and the
 * label set is the dictionary's key set. Calls setUp(props) last.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Map> labelDictionary, Map>> answerClass, Map generalizeClasses,
Map> ignoreClasses) throws IOException {
//make the list unmodifiable!
for(Entry> en2: labelDictionary.entrySet()){
seedLabelDictionary.put(en2.getKey(), Collections.unmodifiableSet(en2.getValue()));
}
// Live view of the dictionary's keys, not a copy.
this.labels = labelDictionary.keySet();
this.answerClass = answerClass;
// NOTE(review): generalizeClasses is a STATIC field — shared across instances.
this.generalizeClasses = generalizeClasses;
if(this.generalizeClasses == null)
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(answerClass);
this.ignoreWordswithClassesDuringSelection = ignoreClasses;
setUp(props);
}
/**
 * Convenience constructor without generalize/ignore classes: seeds start
 * empty per label and generalize classes are just the answer classes.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Set labels, Map>> answerClass) throws IOException {
this.labels = labels;
for(String label: labels){
this.seedLabelDictionary.put(label, new HashSet<>());
}
this.answerClass = answerClass;
// Static field; re-initialized here and seeded from the answer classes.
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(answerClass);
setUp(props);
}
/**
 * Single-label convenience constructor: builds singleton label/answer-class
 * maps, starts with an empty seed set, and generalizes the answer class.
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, String label, Class> answerClass) throws IOException {
this.labels = new HashSet<>();
this.labels.add(label);
this.seedLabelDictionary.put(label, new HashSet<>());
this.answerClass = new HashMap<>();
this.answerClass.put(label, answerClass);
// Static field; re-initialized here and seeded from the (single) answer class.
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(this.answerClass);
setUp(props);
}
/**
 * Constructor with explicit generalize classes but no ignore classes.
 * Seeds start empty per label; null generalizeClasses becomes an empty map;
 * answer classes are merged into the generalize classes. Calls setUp(props).
 *
 * @throws IOException if setUp fails reading configured files
 */
public ConstantsAndVariables(Properties props, Set labels, Map>> answerClass, Map generalizeClasses) throws IOException {
this.labels = labels;
for(String label: labels){
this.seedLabelDictionary.put(label, new HashSet<>());
}
this.answerClass = answerClass;
// NOTE(review): generalizeClasses is a STATIC field — shared across instances.
this.generalizeClasses = generalizeClasses;
if(this.generalizeClasses == null)
this.generalizeClasses = new HashMap<>();
this.generalizeClasses.putAll(answerClass);
setUp(props);
}
@SuppressWarnings("rawtypes")
/**
 * One-time initialization from Properties: fills @Option fields (here and in
 * the pattern factories), builds a TokensRegex Env per label, and loads all
 * configured word lists and cluster files. Idempotent: a second call is a
 * no-op because alreadySetUp is latched at the end.
 *
 * @param props configuration properties used to fill options and locate files
 * @throws IOException if any configured file cannot be read
 */
public void setUp(Properties props) throws IOException {
if (alreadySetUp) {
return;
}
Redwood.log(Redwood.DBG, "Setting up ConstantsAndVariables");
// Populate @Option-annotated fields on this object and the pattern factories.
ArgumentParser.fillOptions(this, props);
ArgumentParser.fillOptions(PatternFactory.class, props);
ArgumentParser.fillOptions(SurfacePatternFactory.class, props);
ArgumentParser.fillOptions(DepPatternFactory.class, props);
if (wordIgnoreRegex != null && !wordIgnoreRegex.isEmpty()) {
Redwood.log(Redwood.DBG, "Ignore word regex is " + wordIgnoreRegex);
PatternFactory.ignoreWordRegex = Pattern.compile(wordIgnoreRegex);
}
// One TokensRegex environment per label, with answer and generalize classes bound.
for (String label : labels) {
env.put(label, TokenSequencePattern.getNewEnv());
// env.get(label).bind("answer", answerClass.get(label));
for (Entry>> en : this.answerClass
.entrySet()) {
env.get(label).bind(en.getKey(), en.getValue());
}
for (Entry en : generalizeClasses.entrySet())
env.get(label).bind(en.getKey(), en.getValue());
}
Redwood.log(Redwood.DBG, channelNameLogger, "Running with debug output");
// Load stop words (static field). Files are separated by ';' or ','.
stopWords = new HashSet<>();
if(stopWordsPatternFiles != null) {
Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, "Reading stop words from "
+ stopWordsPatternFiles);
for (String stopwfile : stopWordsPatternFiles.split("[;,]"))
{
for(String word: IOUtils.readLines(stopwfile)){
if(!word.trim().isEmpty())
stopWords.add(CandidatePhrase.createOrGet(word.trim()));
}
}
}
// English dictionary words (used to avoid labeling common words from seeds).
englishWords = new HashSet<>();
if(englishWordsFiles != null) {
System.out.println("Reading english words from " + englishWordsFiles);
for (String englishWordsFile : englishWordsFiles.split("[;,]"))
englishWords.addAll(IOUtils.linesFromFile(englishWordsFile));
}
if (commonWordsPatternFiles != null) {
commonEngWords = Collections.synchronizedSet(new HashSet<>());
for (String file : commonWordsPatternFiles.split("[;,]"))
commonEngWords.addAll(IOUtils.linesFromFile(file));
}
// Other-semantic-class phrases act as negatives for every label; each entry
// may be a file or a directory (expanded by listFileIncludingItself). Phrases
// longer than numWordsCompoundMax tokens are skipped.
if (otherSemanticClassesFiles != null) {
if (otherSemanticClassesWords == null)
otherSemanticClassesWords = Collections
.synchronizedSet(new HashSet<>());
for (String file : otherSemanticClassesFiles.split("[;,]")) {
for (File f : listFileIncludingItself(file)) {
for (String w : IOUtils.readLines(f)) {
String[] t = w.split("\\s+");
if (t.length <= PatternFactory.numWordsCompoundMax)
otherSemanticClassesWords.add(CandidatePhrase.createOrGet(w));
}
}
}
System.out.println("Size of othersemantic class variables is "
+ otherSemanticClassesWords.size());
} else {
otherSemanticClassesWords = Collections.synchronizedSet(new HashSet<>());
System.out.println("Size of othersemantic class variables is " + 0);
}
// Build a TokensRegex alternation "/w1|w2|.../" over the stop words; each word
// is regex-quoted, with backslashes escaped first.
String stopStr = "/";
int i = 0;
for (CandidatePhrase s : stopWords) {
if (i > 0)
stopStr += "|";
stopStr += Pattern.quote(s.getPhrase().replaceAll("\\\\", "\\\\\\\\"));
i++;
}
stopStr += "/";
// Bind the shared pattern variables and annotation keys into each label's env.
for (String label : labels) {
env.get(label).bind("$FILLER",
"/" + StringUtils.join(PatternFactory.fillerWords, "|") + "/");
env.get(label).bind("$STOPWORD", stopStr);
env.get(label).bind("$MOD", "[{tag:/JJ.*/}]");
if (matchLowerCaseContext){
env.get(label).setDefaultStringMatchFlags(NodePattern.CASE_INSENSITIVE);
env.get(label).setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE);
}
env.get(label).bind("OTHERSEM",
PatternsAnnotations.OtherSemanticLabel.class);
env.get(label).bind("grandparentparsetag", CoreAnnotations.GrandparentAnnotation.class);
}
// Word-class cluster files: each line is "word<TAB>clusterId".
if (wordClassClusterFile != null) {
wordClassClusters = new HashMap<>();
for (String line : IOUtils.readLines(wordClassClusterFile)) {
String[] t = line.split("\t");
wordClassClusters.put(t[0], Integer.parseInt(t[1]));
}
}
if (generalWordClassClusterFile != null) {
setGeneralWordClassClusters(new HashMap<>());
for (String line : IOUtils.readLines(generalWordClassClusterFile)) {
String[] t = line.split("\t");
getGeneralWordClassClusters().put(t[0], Integer.parseInt(t[1]));
}
}
// Allowed POS-tag initials per label: "label1,T1,T2;label2,T3;..."
if(targetAllowedTagsInitialsStr!= null){
allowedTagsInitials = new HashMap<>();
for(String labelstr : targetAllowedTagsInitialsStr.split(";")){
String[] t = labelstr.split(",");
Set st = new HashSet<>();
for(int j = 1; j < t.length; j++)
st.add(t[j]);
allowedTagsInitials.put(t[0], st);
}
}
// Allowed NER tags per label, same "label,NER1,NER2;..." format; only used
// when the useTargetNERRestriction flag is on.
if(PatternFactory.useTargetNERRestriction && targetAllowedNERs !=null){
allowedNERsforLabels = new HashMap<>();
for(String labelstr : targetAllowedNERs.split(";")){
String[] t = labelstr.split(",");
Set st = new HashSet<>();
for(int j = 1; j < t.length; j++)
st.add(t[j]);
allowedNERsforLabels.put(t[0], st);
}
}
// Start each label with an empty per-iteration learned-words record.
for(String label: labels){
learnedWordsEachIter.put(label, new TreeMap<>());
}
if(usePhraseEvalGoogleNgram || usePatternEvalDomainNgram) {
Data.usingGoogleNgram = true;
ArgumentParser.fillOptions(GoogleNGramsSQLBacked.class, props);
}
// Gold entities for evaluation, if requested.
if(goldEntitiesEvalFiles !=null && evaluate)
goldEntities = readGoldEntities(goldEntitiesEvalFiles);
// Latch so subsequent calls are no-ops.
alreadySetUp = true;
}
/**
 * Expands a path into the files it denotes: a plain file yields itself;
 * a directory yields all files under it, recursively.
 * (Generic return type restored; it was stripped to a raw {@code Iterable}
 * by the HTML scrape.)
 *
 * @param file path to a file or directory
 * @return an iterable over the file itself, or all files under the directory
 */
public static Iterable<File> listFileIncludingItself(String file) {
  File f = new File(file);
  if (!f.isDirectory())
    // Immutable singleton view is enough for iteration.
    return Collections.singletonList(f);
  else
    return IOUtils.iterFilesRecursive(f);
}
// The format of goldEntitiesEvalFiles is assumed same as
// seedwordsfiles: label,file;label2,file2;...
// Each file of gold entities consists of each entity in newline with
// incorrect entities marked with "#" at the end of the entity.
// Learned entities not present in the gold file are considered
// negative.
static Map> readGoldEntities(String goldEntitiesEvalFiles){
Map> goldWords = new HashMap<>();
if (goldEntitiesEvalFiles != null) {
for (String gfile : goldEntitiesEvalFiles.split(";")) {
String[] t = gfile.split(",");
String label = t[0];
String goldfile = t[1];
Map goldWords4Label = new HashMap<>();
for (String line : IOUtils.readLines(goldfile)) {
line = line.trim();
if (line.isEmpty())
continue;
if (line.endsWith("#"))
goldWords4Label.put(line.substring(0, line.length() - 1), false);
else
goldWords4Label.put(line, true);
}
goldWords.put(label, goldWords4Label);
}
}
return goldWords;
}
//streams sents, files-from-which-sents-were read
static public class DataSentsIterator implements Iterator, File>> {
boolean readInMemory = false;
Iterator sentfilesIter = null;
boolean batchProcessSents;
/**
 * @param batchProcessSents if true, sentences live in files and this iterator
 *        walks Data.sentsFiles; if false, everything is in memory and the
 *        iterator yields exactly one batch.
 */
public DataSentsIterator(boolean batchProcessSents){
this.batchProcessSents = batchProcessSents;
if(batchProcessSents){
// NOTE(review): assumes Data.sentsFiles is already populated — confirm callers.
sentfilesIter = Data.sentsFiles.iterator();
}
}
@Override
public boolean hasNext() {
  // In-memory mode yields a single batch: available until it has been read once.
  if (!batchProcessSents) {
    return !readInMemory;
  }
  // Batch mode: there is more data while there are unread sentence files.
  return sentfilesIter.hasNext();
}
@Override
public Pair