package edu.stanford.nlp.sequences;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.optimization.StochasticCalculateMethods;
import edu.stanford.nlp.process.WordShapeClassifier;
import java.util.function.Function;
import edu.stanford.nlp.util.ReflectionLoading;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.util.*;
/**
* Flags for sequence classifiers. Documentation for general flags and
* flags for NER can be found in the Javadoc of
* {@link edu.stanford.nlp.ie.NERFeatureFactory}. Documentation for the flags
* for Chinese word segmentation can be found in the Javadoc of
* {@link edu.stanford.nlp.wordseg.ChineseSegmenterFeatureFactory}.
*
* IMPORTANT NOTE IF CHANGING THIS FILE: MAKE SURE TO
* ONLY ADD NEW VARIABLES AT THE END OF THE LIST OF VARIABLES (and not
* to change existing variables)! Otherwise you usually break all
* currently serialized classifiers!!! Search for "ADD VARIABLES ABOVE
* HERE" below.
*
* Some general flags are described here.
*
* <table border="1">
* <caption>General flags for sequence classifiers</caption>
* <tr><th>Property Name</th><th>Type</th><th>Default Value</th><th>Description</th></tr>
* <tr><td>useQN</td><td>boolean</td><td>true</td>
* <td>Use Quasi-Newton (L-BFGS) optimization to find the minimum. NOTE: Need to set this to
* false if using other minimizers such as SGD.</td></tr>
* <tr><td>QNsize</td><td>int</td><td>25</td>
* <td>Number of previous iterations of Quasi-Newton to store (this increases
* memory use, but speeds convergence by letting the Quasi-Newton optimization
* more effectively approximate the second derivative).</td></tr>
* <tr><td>QNsize2</td><td>int</td><td>25</td>
* <td>Number of previous iterations of Quasi-Newton to store (used when pruning
* features, after the first iteration; the first iteration uses QNsize).</td></tr>
* <tr><td>useInPlaceSGD</td><td>boolean</td><td>false</td>
* <td>Use SGD (tweaking weights in place) to find the minimum (more efficient than
* the old SGD, and faster to converge than Quasi-Newton if there is a very large
* number of samples). Implemented for CRFClassifier. NOTE: Remember to set useQN to false.</td></tr>
* <tr><td>tuneSampleSize</td><td>int</td><td>-1</td>
* <td>If this number is greater than 0, specifies the number of samples to use
* for tuning (default is 1000).</td></tr>
* <tr><td>SGDPasses</td><td>int</td><td>-1</td>
* <td>If this number is greater than 0, specifies the number of SGD passes (over the
* entire training set) to do before giving up (default is 50). Can be smaller
* if the sample size is very large.</td></tr>
* <tr><td>useSGD</td><td>boolean</td><td>false</td>
* <td>Use SGD to find the minimum (can be slow). NOTE: Remember to set useQN to false.</td></tr>
* <tr><td>useSGDtoQN</td><td>boolean</td><td>false</td>
* <td>Use SGD (the SGD version selected by useInPlaceSGD or useSGD) for a certain
* number of passes (SGDPasses) and then switch to QN. Gives the quick initial
* convergence of SGD, with the desired convergence criterion of QN (there is
* some ramp-up time for QN). NOTE: Remember to set useQN to false.</td></tr>
* <tr><td>evaluateIters</td><td>int</td><td>0</td>
* <td>If this number is greater than 0, evaluates on the test set every so
* often while minimizing. Implemented for CRFClassifier.</td></tr>
* <tr><td>evalCmd</td><td>String</td><td></td>
* <td>If specified (and evaluateIters is set), runs the specified command-line
* command during evaluation (instead of the default CoNLL-like NER evaluation).</td></tr>
* <tr><td>evaluateTrain</td><td>boolean</td><td>false</td>
* <td>If specified (and evaluateIters is set), also evaluate on the training set
* (can be expensive).</td></tr>
* <tr><td>tokenizerOptions</td><td>String</td><td>(null)</td>
* <td>Extra options to supply to the tokenizer when creating it.</td></tr>
* <tr><td>tokenizerFactory</td><td>String</td><td>(null)</td>
* <td>A different tokenizer factory to use if the ReaderAndWriter in question uses tokenizers.</td></tr>
* </table>
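*
* A minimal usage sketch (illustrative only; it assumes the {@code Properties}-based
* constructor of this class, and property names mirror the field names declared below):
*
* <pre>{@code
* Properties props = new Properties();
* props.setProperty("useQN", "false");        // turn off L-BFGS ...
* props.setProperty("useInPlaceSGD", "true"); // ... and use in-place SGD instead
* props.setProperty("SGDPasses", "20");       // give up after 20 passes over the training set
* props.setProperty("evaluateIters", "500");  // evaluate on the test set every 500 iterations
* SeqClassifierFlags flags = new SeqClassifierFlags(props);
* }</pre>
*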
* @author Jenny Finkel
*/
public class SeqClassifierFlags implements Serializable {
private static final long serialVersionUID = -7076671761070232567L;
public static final String DEFAULT_BACKGROUND_SYMBOL = "O";
private String stringRep = "";
public boolean useNGrams = false;
public boolean conjoinShapeNGrams = false;
public boolean lowercaseNGrams = false;
public boolean dehyphenateNGrams = false;
public boolean usePrev = false;
public boolean useNext = false;
public boolean useTags = false;
public boolean useWordPairs = false;
public boolean useGazettes = false;
public boolean useSequences = true;
public boolean usePrevSequences = false;
public boolean useNextSequences = false;
public boolean useLongSequences = false;
public boolean useBoundarySequences = false;
public boolean useTaggySequences = false;
public boolean useExtraTaggySequences = false;
public boolean dontExtendTaggy = false;
public boolean useTaggySequencesShapeInteraction = false;
public boolean strictlyZeroethOrder = false;
public boolean strictlyFirstOrder = false;
public boolean strictlySecondOrder = false;
public boolean strictlyThirdOrder = false;
public String entitySubclassification = "IO";
public boolean retainEntitySubclassification = false;
public boolean useGazettePhrases = false;
public boolean makeConsistent = false;
public boolean useViterbi = true;
public int[] binnedLengths = null;
public boolean verboseMode = false;
public boolean useSum = false;
public double tolerance = 1e-4;
// Turned on if non-null. Becomes part of the filename that features are printed to
public String printFeatures = null;
public boolean useSymTags = false;
/**
* useSymWordPairs has a small negative effect.
*/
public boolean useSymWordPairs = false;
public String printClassifier = "WeightHistogram";
public int printClassifierParam = 100;
public boolean intern = false;
public boolean intern2 = false;
public boolean selfTest = false;
public boolean sloppyGazette = false;
public boolean cleanGazette = false;
public boolean noMidNGrams = false;
public int maxNGramLeng = -1;
public boolean useReverse = false;
public boolean greekifyNGrams = false;
public boolean useParenMatching = false;
public boolean useLemmas = false;
public boolean usePrevNextLemmas = false;
public boolean normalizeTerms = false;
public boolean normalizeTimex = false;
public boolean useNB = false;
public boolean useQN = true;
public boolean useFloat = false;
public int QNsize = 25;
public int QNsize2 = 25;
public int maxIterations = -1;
public int wordShape = WordShapeClassifier.NOWORDSHAPE;
public boolean useShapeStrings = false;
public boolean useTypeSeqs = false;
public boolean useTypeSeqs2 = false;
public boolean useTypeSeqs3 = false;
public boolean useDisjunctive = false;
public int disjunctionWidth = 4;
public boolean useDisjunctiveShapeInteraction = false;
public boolean useDisjShape = false;
public boolean useWord = true; // ON by default
public boolean useClassFeature = false;
public boolean useShapeConjunctions = false;
public boolean useWordTag = false;
public boolean useNPHead = false;
public boolean useNPGovernor = false;
public boolean useHeadGov = false;
public boolean useLastRealWord = false;
public boolean useNextRealWord = false;
public boolean useOccurrencePatterns = false;
public boolean useTypeySequences = false;
public boolean justify = false;
public boolean normalize = false;
public String priorType = "QUADRATIC";
public double sigma = 1.0;
public double epsilon = 0.01;
public int beamSize = 30;
public int maxLeft = 2;
public int maxRight = 0;
public boolean usePosition = false;
public boolean useBeginSent = false;
public boolean useGazFeatures = false;
public boolean useMoreGazFeatures = false;
public boolean useAbbr = false;
public boolean useMinimalAbbr = false;
public boolean useAbbr1 = false;
public boolean useMinimalAbbr1 = false;
public boolean useMoreAbbr = false;
public boolean deleteBlankLines = false;
public boolean useGENIA = false;
public boolean useTOK = false;
public boolean useABSTR = false;
public boolean useABSTRFreqDict = false;
public boolean useABSTRFreq = false;
public boolean useFREQ = false;
public boolean useABGENE = false;
public boolean useWEB = false;
public boolean useWEBFreqDict = false;
public boolean useIsURL = false;
public boolean useURLSequences = false;
public boolean useIsDateRange = false;
public boolean useEntityTypes = false;
public boolean useEntityTypeSequences = false;
public boolean useEntityRule = false;
public boolean useOrdinal = false;
public boolean useACR = false;
public boolean useANTE = false;
public boolean useMoreTags = false;
public boolean useChunks = false;
public boolean useChunkySequences = false;
public boolean usePrevVB = false;
public boolean useNextVB = false;
public boolean useVB = false;
public boolean subCWGaz = false;
// TODO OBSOLETE: delete when breaking serialization sometime.
public String documentReader = "ColumnDocumentReader";
// public String trainMap = "word=0,tag=1,answer=2";
// public String testMap = "word=0,tag=1,answer=2";
public String map = "word=0,tag=1,answer=2";
public boolean useWideDisjunctive = false;
public int wideDisjunctionWidth = 10;
// chinese word-segmenter features
public boolean useRadical = false;
public boolean useBigramInTwoClique = false;
public String morphFeatureFile = null;
public boolean useReverseAffix = false;
public int charHalfWindow = 3;
public boolean useWord1 = false;
public boolean useWord2 = false;
public boolean useWord3 = false;
public boolean useWord4 = false;
public boolean useRad1 = false;
public boolean useRad2 = false;
public boolean useWordn = false;
public boolean useCTBPre1 = false;
public boolean useCTBSuf1 = false;
public boolean useASBCPre1 = false;
public boolean useASBCSuf1 = false;
public boolean usePKPre1 = false;
public boolean usePKSuf1 = false;
public boolean useHKPre1 = false;
public boolean useHKSuf1 = false;
public boolean useCTBChar2 = false;
public boolean useASBCChar2 = false;
public boolean useHKChar2 = false;
public boolean usePKChar2 = false;
public boolean useRule2 = false;
public boolean useDict2 = false;
public boolean useOutDict2 = false;
public String outDict2 = "/u/htseng/scr/chunking/segmentation/out.lexicon";
public boolean useDictleng = false;
public boolean useDictCTB2 = false;
public boolean useDictASBC2 = false;
public boolean useDictPK2 = false;
public boolean useDictHK2 = false;
public boolean useBig5 = false;
public boolean useNegDict2 = false;
public boolean useNegDict3 = false;
public boolean useNegDict4 = false;
public boolean useNegCTBDict2 = false;
public boolean useNegCTBDict3 = false;
public boolean useNegCTBDict4 = false;
public boolean useNegASBCDict2 = false;
public boolean useNegASBCDict3 = false;
public boolean useNegASBCDict4 = false;
public boolean useNegHKDict2 = false;
public boolean useNegHKDict3 = false;
public boolean useNegHKDict4 = false;
public boolean useNegPKDict2 = false;
public boolean useNegPKDict3 = false;
public boolean useNegPKDict4 = false;
public boolean usePre = false;
public boolean useSuf = false;
public boolean useRule = false;
public boolean useHk = false;
public boolean useMsr = false;
public boolean useMSRChar2 = false;
public boolean usePk = false;
public boolean useAs = false;
public boolean useFilter = false; // TODO this flag is used for nothing;
// delete when breaking serialization
public boolean largeChSegFile = false; // TODO this flag is used for nothing;
// delete when breaking serialization
public boolean useRad2b = false;
/**
* Keep the whitespace between English words in testFile when printing out
* answers. Doesn't really change the content of the CoreLabels. (For Chinese
* segmentation.)
*/
public boolean keepEnglishWhitespaces = false;
/**
* Keep all the whitespace words in testFile when printing out answers.
* Doesn't really change the content of the CoreLabels. (For Chinese
* segmentation.)
*/
public boolean keepAllWhitespaces = false;
public boolean sighanPostProcessing = false;
/**
* use POS information (an "open" feature for Chinese segmentation)
*/
public boolean useChPos = false;
// CTBSegDocumentReader normalization table
// A value of null means that a default algorithmic normalization
// is done in which ASCII characters get mapped to their fullwidth
// equivalents in the Unihan range
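// (e.g., ASCII 'A' U+0041 is mapped to fullwidth 'Ａ' U+FF21)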
public String normalizationTable; // = null;
public String dictionary; // = null;
public String serializedDictionary; // = null;
public String dictionary2; // = null;
public String normTableEncoding = "GB18030";
/**
* For the Sighan bakeoff 2005, the path to the dictionary of bigrams that
* appeared in the corpus.
*/
public String sighanCorporaDict = "/u/nlp/data/chinese-segmenter/";
// end Sighan 2005 Chinese word-segmenter features/properties
public boolean useWordShapeGaz = false;
public String wordShapeGaz = null;
// TODO: This should be removed in favor of suppressing splitting when
// maxDocSize <= 0, when next breaking serialization
// this now controls nothing
public boolean splitDocuments = true;
public boolean printXML; // This is disused and can be removed when breaking serialization
public boolean useSeenFeaturesOnly = false;
public String lastNameList = "/u/nlp/data/dist.all.last";
public String maleNameList = "/u/nlp/data/dist.male.first";
public String femaleNameList = "/u/nlp/data/dist.female.first";
// don't want these serialized
public transient String trainFile = null;
/** NER adaptation (Gaussian prior) parameters. */
public transient String adaptFile = null;
public transient String devFile = null;
public transient String testFile = null;
public transient String textFile = null;
public transient String textFiles = null;
public transient boolean readStdin = false;
public transient String outputFile = null;
public transient String loadClassifier = null;
public transient String loadTextClassifier = null;
public transient String loadJarClassifier = null;
public transient String loadAuxClassifier = null;
public transient String serializeTo = null;
public transient String serializeToText = null;
public transient int interimOutputFreq = 0;
public transient String initialWeights = null;
public transient List<String> gazettes = new ArrayList<>();
public transient String selfTrainFile = null;
public String inputEncoding = "UTF-8"; // used for CTBSegDocumentReader as well
public boolean bioSubmitOutput = false;
public int numRuns = 1;
public String answerFile = null;
public String altAnswerFile = null;
public String dropGaz;
public String printGazFeatures = null;
public int numStartLayers = 1;
public boolean dump = false;
// whether to merge B- and I- tags in an input file and to tag with IO tags
// (lacking a prefix). E.g., "I-PERS" goes to "PERS"
public boolean mergeTags;
public boolean splitOnHead;
// threshold
public int featureCountThreshold = 0;
public double featureWeightThreshold = 0.0;
// feature factory
public String featureFactory = "edu.stanford.nlp.ie.NERFeatureFactory";
public Object[] featureFactoryArgs = new Object[0];
public String backgroundSymbol = DEFAULT_BACKGROUND_SYMBOL;
// use
public boolean useObservedSequencesOnly = false;
public int maxDocSize = 0;
public boolean printProbs = false;
public boolean printFirstOrderProbs = false;
public boolean saveFeatureIndexToDisk = false;
public boolean removeBackgroundSingletonFeatures = false;
public boolean doGibbs = false;
public int numSamples = 100;
public boolean useNERPrior = false; // todo [cdm 2014]: Disused, to be deleted, use priorModelFactory
public boolean useAcqPrior = false; // todo [cdm 2014]: Disused, to be deleted, use priorModelFactory
public boolean useUniformPrior = false; // todo [cdm 2014]: Disused, to be deleted, use priorModelFactory
public boolean useMUCFeatures = false;
public double annealingRate = 0.0;
public String annealingType = null;
public String loadProcessedData = null;
public boolean initViterbi = true;
public boolean useUnknown = false;
public boolean checkNameList = false;
public boolean useSemPrior = false; // todo [cdm 2014]: Disused, to be deleted, use priorModelFactory
public boolean useFirstWord = false;
public boolean useNumberFeature = false;
public int ocrFold = 0;
public transient boolean ocrTrain = false;
public String classifierType = "MaxEnt";
public String svmModelFile = null;
public String inferenceType = "Viterbi";
public boolean useLemmaAsWord = false;
public String type = "cmm";
public String readerAndWriter = "edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter";
public List<String> comboProps = new ArrayList<>();
public boolean usePrediction = false;
public boolean useAltGazFeatures = false;
public String gazFilesFile = null;
public boolean usePrediction2 = false;
public String baseTrainDir = ".";
public String baseTestDir = ".";
public String trainFiles = null;
public String trainFileList = null;
public String testFiles = null;
public String trainDirs = null; // cdm 2009: this is currently unsupported,
// but one user wanted something like this....
public String testDirs = null;
public boolean useOnlySeenWeights = false;
public String predProp = null;
public CoreLabel pad = new CoreLabel();
public boolean useObservedFeaturesOnly = false;
public String distSimLexicon = null;
public boolean useDistSim = false;
public int removeTopN = 0;
public int numTimesRemoveTopN = 1;
public double randomizedRatio = 1.0;
public double removeTopNPercent = 0.0;
public int purgeFeatures = -1;
public boolean booleanFeatures = false;
// This flag is only used for the sequences Type 2 CRF, not for ie.crf.CRFClassifier
public boolean iobWrapper = false;
public boolean iobTags = false;
/** Binary segmentation feature for character-based Chinese NER. */
public boolean useSegmentation = false;
public boolean memoryThrift = false;
public boolean timitDatum = false;
public String serializeDatasetsDir = null;
public String loadDatasetsDir = null;
public String pushDir = null;
public boolean purgeDatasets = false;
public boolean keepOBInMemory = true;
public boolean fakeDataset = false;
public boolean restrictTransitionsTimit = false;
public int numDatasetsPerFile = 1;
public boolean useTitle = false;
// these are for the old stuff
public boolean lowerNewgeneThreshold = false;
public boolean useEitherSideWord = false;
public boolean useEitherSideDisjunctive = false;
public boolean twoStage = false;
public String crfType = "MaxEnt";
public int featureThreshold = 1;
public String featThreshFile = null;
public double featureDiffThresh = 0.0;
public int numTimesPruneFeatures = 0;
public double newgeneThreshold = 0.0;
public boolean doAdaptation = false;
public boolean useInternal = true;
public boolean useExternal = true;
public double selfTrainConfidenceThreshold = 0.9;
public int selfTrainIterations = 1;
public int selfTrainWindowSize = 1; // Unigram
public boolean useHuber = false;
public boolean useQuartic = false;
public double adaptSigma = 1.0;
public int numFolds = 1;
public int startFold = 1;
public int endFold = 1;
public boolean cacheNGrams = false;
public String outputFormat;
public boolean useSMD = false;
public boolean useSGDtoQN = false;
public boolean useStochasticQN = false;
public boolean useScaledSGD = false;
public int scaledSGDMethod = 0;
public int SGDPasses = -1;
public int QNPasses = -1;
public boolean tuneSGD = false;
public StochasticCalculateMethods stochasticMethod = StochasticCalculateMethods.NoneSpecified;
public double initialGain = 0.1;
public int stochasticBatchSize = 15;
public boolean useSGD = false;
public double gainSGD = 0.1;
public boolean useHybrid = false;
public int hybridCutoffIteration = 0;
public boolean outputIterationsToFile = false;
public boolean testObjFunction = false;
public boolean testVariance = false;
public int SGD2QNhessSamples = 50;
public boolean testHessSamples = false;
public int CRForder = 1; // TODO remove this when breaking serialization; this is unused; really maxLeft/maxRight control order
public int CRFwindow = 2; // TODO remove this when breaking serialization; this is unused; really maxLeft/maxRight control clique size
public boolean estimateInitial = false;
public transient String biasedTrainFile = null;
public transient String confusionMatrix = null;
public String outputEncoding = null;
public boolean useKBest = false;
public String searchGraphPrefix = null;
public double searchGraphPrune = Double.POSITIVE_INFINITY;
public int kBest = 1;
// more chinese segmenter features for GALE 2007
public boolean useFeaturesC4gram;
public boolean useFeaturesC5gram;
public boolean useFeaturesC6gram;
public boolean useFeaturesCpC4gram;
public boolean useFeaturesCpC5gram;
public boolean useFeaturesCpC6gram;
public boolean useUnicodeType;
public boolean useUnicodeType4gram;
public boolean useUnicodeType5gram;
public boolean use4Clique;
public boolean useUnicodeBlock;
public boolean useShapeStrings1;
public boolean useShapeStrings3;
public boolean useShapeStrings4;
public boolean useShapeStrings5;
public boolean useGoodForNamesCpC;
public boolean useDictionaryConjunctions;
public boolean expandMidDot;
public int printFeaturesUpto = Integer.MAX_VALUE;
public boolean useDictionaryConjunctions3;
public boolean useWordUTypeConjunctions2;
public boolean useWordUTypeConjunctions3;
public boolean useWordShapeConjunctions2;
public boolean useWordShapeConjunctions3;
public boolean useMidDotShape;
public boolean augmentedDateChars;
public boolean suppressMidDotPostprocessing;
public boolean printNR; // a flag for WordAndTagDocumentReaderAndWriter
public String classBias = null;
public boolean printLabelValue; // Old printErrorStuff
public boolean useRobustQN = false;
public boolean combo = false;
public boolean useGenericFeatures = false;
public boolean verboseForTrueCasing = false;
public String trainHierarchical = null;
public String domain = null;
public boolean baseline = false;
public String transferSigmas = null;
public boolean doFE = false;
public boolean restrictLabels = true;
// whether to print a line saying each ObjectBank entry (usually a filename)
public boolean announceObjectBankEntries = false;
// This is for use with the OWLQNMinimizer L1 regularization. To use it, set useQN=false,
// and this to a positive number. A smaller number means more features are retained.
// Depending on the problem, a good value might be
// between 0.75 (POS tagger) down to 0.01 (Chinese word segmentation)
public double l1reg = 0.0;
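// For example, a hypothetical pair of property settings (property names mirror these
// field names) that switches from L-BFGS to OWLQN with L1 regularization:
//   useQN = false
//   l1reg = 0.5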
// truecaser flags:
public String mixedCaseMapFile = "";
public String auxTrueCaseModels = "";
// more flags inspired by Zhang and Johnson 2003
public boolean use2W = false;
public boolean useLC = false;
public boolean useYetMoreCpCShapes = false;
// added for the NFL domain
public boolean useIfInteger = false;
public String exportFeatures = null;
public boolean useInPlaceSGD = false;
public boolean useTopics = false;
// Number of iterations before evaluating weights (0 = don't evaluate)
public int evaluateIters = 0;
// Command to use for evaluation
public String evalCmd = "";
// Evaluate on training set or not
public boolean evaluateTrain = false;
public int tuneSampleSize = -1;
public boolean usePhraseFeatures = false;
public boolean usePhraseWords = false;
public boolean usePhraseWordTags = false;
public boolean usePhraseWordSpecialTags = false;
public boolean useCommonWordsFeature = false;
public boolean useProtoFeatures = false;
public boolean useWordnetFeatures = false;
public String tokenFactory = "edu.stanford.nlp.process.CoreLabelTokenFactory";
public Object[] tokenFactoryArgs = new Object[0];
public String tokensAnnotationClassName = "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation";
public transient String tokenizerOptions = null;
public transient String tokenizerFactory = null;
public boolean useCorefFeatures = false;
public String wikiFeatureDbFile = null;
// for combining 2 CRFs - one trained from noisy data and another trained from
// non-noisy
public boolean useNoisyNonNoisyFeature = false;
// year annotation of the document
public boolean useYear = false;
public boolean useSentenceNumber = false;
// To know the source of the label. Currently used to know which pattern was
// used to label the token.
public boolean useLabelSource = false;
/**
* Whether to keep the case of tokens (i.e., not lowercase them) before looking
* them up in the distsim lexicon. By default, tokens are lowercased.
*/
public boolean casedDistSim = false;
/**
* The format of the distsim file. Known values are:
* alexClark = TSV file: word TAB clusterNumber [optional other content];
* terryKoo = TSV file: clusterBitString TAB word TAB frequency.
*/
public String distSimFileFormat = "alexClark";
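// Hypothetical example lexicon lines for the two formats above (TAB-separated):
//   alexClark:  bank<TAB>C47
//   terryKoo:   01101011<TAB>bank<TAB>5301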
/**
* If this number is greater than 0, the distSim class is assumed to be a bit
* string and is truncated at this many characters. Normal distSim features
* will then use this amount of resolution. Extra, special distsim features
* may work at a coarser level of resolution. Since the lexicon only stores
* this length of bit string, there is then no way to have finer-grained
* clusters.
*/
public int distSimMaxBits = 8;
/**
* If this is set to true, all digit characters get mapped to '9' in a distsim
* lexicon and for lookup. This is a simple word shaping that can shrink
* distsim lexicons and improve their performance.
*/
public boolean numberEquivalenceDistSim = false;
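// For example, with this flag on, "2024" and "1987" are both looked up as "9999".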
/**
* What class to assign to words not found in the dist sim lexicon. You might
* want to make it a known class, if one is the "default" class.
*/
public String unknownWordDistSimClass = "null";
/**
* Use prefixes and suffixes from the previous and next word.
*/
public boolean useNeighborNGrams = false;
/**
* This function maps words in the training or test data to new
* words. They are used at the feature extractor level, i.e., in the
* FeatureFactory. For now, only the NERFeatureFactory uses this.
*/
public Function<String, String> wordFunction = null;
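// A hypothetical example, set programmatically rather than through a property:
//   flags.wordFunction = String::toLowerCase;  // map each word to its lowercase form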
public static final String DEFAULT_PLAIN_TEXT_READER = "edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter";
public String plainTextDocumentReaderAndWriter = DEFAULT_PLAIN_TEXT_READER;
/**
* Use a bag of all words as a feature. Perhaps this will find some
* words that indicate certain types of entities are present.
*/
public boolean useBagOfWords = false;
/**
* When scoring, count the background symbol stats too. Useful for
* things where the background symbol is particularly meaningful,
* such as truecase.
*/
public boolean evaluateBackground = false;
/**
* Number of experts to be used in Logarithmic Opinion Pool (product of experts)
* training. The default value is 1.
*/
public int numLopExpert = 1;
public transient String initialLopScales = null;
public transient String initialLopWeights = null;
public boolean includeFullCRFInLOP = false;
public boolean backpropLopTraining = false;
public boolean randomLopWeights = false;
public boolean randomLopFeatureSplit = false;
public boolean nonLinearCRF = false;
public boolean secondOrderNonLinear = false;
public int numHiddenUnits = -1;
public boolean useOutputLayer = true;
public boolean useHiddenLayer = true;
public boolean gradientDebug = false;
public boolean checkGradient = false;
public boolean useSigmoid = false;
public boolean skipOutputRegularization = false;
public boolean sparseOutputLayer = false;
public boolean tieOutputLayer = false;
public boolean blockInitialize = false;
public boolean softmaxOutputLayer = false;
/**
* Bisequence CRF parameters
*/
public String loadBisequenceClassifierEn = null;
public String loadBisequenceClassifierCh = null;
public String bisequenceClassifierPropEn = null;
public String bisequenceClassifierPropCh = null;
public String bisequenceTestFileEn = null;
public String bisequenceTestFileCh = null;
public String bisequenceTestOutputEn = null;
public String bisequenceTestOutputCh = null;
public String bisequenceTestAlignmentFile = null;
public String bisequenceAlignmentTestOutput = null;
public int bisequencePriorType = 1;
public String bisequenceAlignmentPriorPenaltyCh = null;
public String bisequenceAlignmentPriorPenaltyEn = null;
public double alignmentPruneThreshold = 0.0;
public double alignmentDecodeThreshold = 0.5;
public boolean factorInAlignmentProb = false;
public boolean useChromaticSampling = false;
public boolean useSequentialScanSampling = false;
public int maxAllowedChromaticSize = 8;
/**
* Whether or not to keep blank sentences when processing. Useful
* for systems such as the segmenter if you want to line up each
* line exactly, including blank lines.
*/
public boolean keepEmptySentences = false;
public boolean useBilingualNERPrior = false;
public int samplingSpeedUpThreshold = -1;
public String entityMatrixCh = null;
public String entityMatrixEn = null;
public int multiThreadGibbs = 0;
public boolean matchNERIncentive = false;
public boolean useEmbedding = false;
public boolean prependEmbedding = false;
public String embeddingWords = null;
public String embeddingVectors = null;
public boolean transitionEdgeOnly = false;
// L1-prior used in QNMinimizer's OWLQN
public double priorLambda = 0;
public boolean addCapitalFeatures = false;
public int arbitraryInputLayerSize = -1;
public boolean noEdgeFeature = false;
public boolean terminateOnEvalImprovement = false;
public int terminateOnEvalImprovementNumOfEpoch = 1;
public boolean useMemoryEvaluator = true;
public boolean suppressTestDebug = false;
public boolean useOWLQN = false;
public boolean printWeights = false;
public int totalDataSlice = 10;
public int numOfSlices = 0;
public boolean regularizeSoftmaxTieParam = false;
public double softmaxTieLambda = 0;
public int totalFeatureSlice = 10;
public int numOfFeatureSlices = 0;
public boolean addBiasToEmbedding = false;
public boolean hardcodeSoftmaxOutputWeights = false;
public boolean useNERPriorBIO = false; // todo [cdm 2014]: Disused, to be deleted, use priorModelFactory
public String entityMatrix = null;
public int multiThreadClassifier = 0;
public boolean useDualDecomp = false;
public boolean biAlignmentPriorIsPMI = true;
public boolean dampDDStepSizeWithAlignmentProb = false;
public boolean dualDecompAlignment = false;
public double dualDecompInitialStepSizeAlignment = 0.1;
public boolean dualDecompNotBIO = false;
public String berkeleyAlignerLoadPath = null;
public boolean useBerkeleyAlignerForViterbi = false;
public boolean useBerkeleyCompetitivePosterior = false;
public boolean useDenero = true;
public double alignDDAlpha = 1;
public boolean factorInBiEdgePotential = false;
public boolean noNeighborConstraints = false;
public boolean includeC2EViterbi = true;
public boolean initWithPosterior = true;
public int nerSkipFirstK = 0;
public int nerSlowerTimes = 1;
public boolean powerAlignProb = false;
public boolean powerAlignProbAsAddition = false;
public boolean initWithNERPosterior = false;
public boolean applyNERPenalty = true;
public boolean printFactorTable = false;
public boolean useAdaGradFOBOS = false;
public double initRate = 0.1;
public boolean groupByFeatureTemplate = false;
public boolean groupByOutputClass = false;
public double priorAlpha = 0;
public String splitWordRegex = null;
public boolean groupByInput = false;
public boolean groupByHiddenUnit = false;
public String unigramLM = null;
public String bigramLM = null;
public int wordSegBeamSize = 1000;
public String vocabFile = null;
public String normalizedFile = null;
public boolean averagePerceptron = true;
public String loadCRFSegmenterPath = null;
public String loadPCTSegmenterPath = null;
public String crfSegmenterProp = null;
public String pctSegmenterProp = null;
public String intermediateSegmenterOut = null;
public String intermediateSegmenterModel = null;
public int dualDecompMaxItr = 0;
public double dualDecompInitialStepSize = 0.1;
public boolean dualDecompDebug = false;
public boolean useCWSWordFeatures = false;
public boolean useCWSWordFeaturesAll = false;
public boolean useCWSWordFeaturesBigram = false;
public boolean pctSegmenterLenAdjust = false;
public boolean useTrainLexicon = false;
public boolean useCWSFeatures = true;
public boolean appendLC = false;
public boolean perceptronDebug = false;
public boolean pctSegmenterScaleByCRF = false;
public double pctSegmenterScale = 0.0;
public boolean separateASCIIandRange = true;
public double dropoutRate = 0.0;
public double dropoutScale = 1.0;
// keenon: changed from = 1, nowadays it makes sense to default to parallelism
public int multiThreadGrad = Runtime.getRuntime().availableProcessors();
public int maxQNItr = 0;
public boolean dropoutApprox = false;
public String unsupDropoutFile = null;
public double unsupDropoutScale = 1.0;
public int startEvaluateIters = 0;
public int multiThreadPerceptron = 1;
public boolean lazyUpdate = false;
public int featureCountThresh = 0;
public transient String serializeWeightsTo = null;
public boolean geDebug = false;
public boolean doFeatureDiscovery = false;
public transient String loadWeightsFrom = null;
public transient String loadClassIndexFrom = null;
public transient String serializeClassIndexTo = null;
public boolean learnCHBasedOnEN = true;
public boolean learnENBasedOnCH = false;
public String loadWeightsFromEN = null;
public String loadWeightsFromCH = null;
public String serializeToEN = null;
public String serializeToCH = null;
public String testFileEN = null;
public String testFileCH = null;
public String unsupFileEN = null;
public String unsupFileCH = null;
public String unsupAlignFile = null;
public String supFileEN = null;
public String supFileCH = null;
public transient String serializeFeatureIndexTo = null;
public String loadFeatureIndexFromEN = null;
public String loadFeatureIndexFromCH = null;
public double lambdaEN = 1.0;
public double lambdaCH = 1.0;
public boolean alternateTraining = false;
public boolean weightByEntropy = false;
public boolean useKL = false;
public boolean useHardGE = false;
public boolean useCRFforUnsup = false;
public boolean useGEforSup = false;
public boolean useKnownLCWords = true; // disused, can be deleted when breaking serialization
// allow for multiple feature factories.
public String[] featureFactories = null;
public List