package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.ling.CategoryWordTag;
import java.util.Set;
import java.io.PrintWriter;
import java.io.Serializable;
/**
* Non-language-specific options for training a grammar from a treebank.
* These options are not used at parsing time.
*
* @author Dan Klein
* @author Christopher Manning
*/
public class TrainOptions implements Serializable {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(TrainOptions.class);
public String trainTreeFile = null; // same for me -- Teg
/* THESE OPTIONS AFFECT ONLY TRAIN TIME */
public TrainOptions() {}
public int trainLengthLimit = 100000;
/** Add all test set trees to training data for PCFG.
* (Currently only supported in FactoredParser main.)
*/
public boolean cheatPCFG = false;
/** Whether to do "horizontal Markovization" (as in ACL 2003 paper).
* False means regular PCFG expansions.
*/
public boolean markovFactor = false;
public int markovOrder = 1;
public boolean hSelSplit = false; // good with true;
public int HSEL_CUT = 10;
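/* A minimal configuration sketch (hedged: fields are set directly here for
 * illustration, as the parser's Options machinery does internally; the
 * values are illustrative, not recommended defaults):
 *
 *   TrainOptions opts = new TrainOptions();
 *   opts.markovFactor = true; // horizontal Markovization instead of plain PCFG expansions
 *   opts.markovOrder = 2;     // condition each expansion on up to 2 previously generated sisters
 *   opts.hSelSplit = true;    // selective horizontal splitting, thresholded by HSEL_CUT
 */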
/** Whether or not to mark final states in binarized grammar.
* This must be off to get most value out of grammar compaction.
*/
public boolean markFinalStates = true;
/**
* A POS tag has to have been attributed to more than this number of word
* types before it is regarded as an open-class tag. Unknown words will
* only possibly be tagged as open-class tags (unless flexiTag is on).
* If flexiTag is on, unknown words will be able to be tagged any POS for
* which the unseenMap has nonzero count (that is, the tag was seen for
* a new word after unseen signature counting was started).
*/
public int openClassTypesThreshold = 50;
/**
* Start to aggregate signature-tag pairs only for words unseen in the first
* this fraction of the data (e.g., with the default of 0.5, unknown-word
* statistics are collected only from words first encountered in the second
* half of the training data).
*/
public double fractionBeforeUnseenCounting = 0.5;
/**
* Whether to do outside factorization in binarization of the grammar.
* Leave this on except when markovFactor is on (in which case this returns false).
*
* @return Whether to do outside factorization in binarization of the grammar
*/
public boolean outsideFactor() {
return !markovFactor;
}
/**
* This variable controls doing parent annotation of phrasal nodes. Good.
*/
public boolean PA = true;
/**
* This variable controls doing 2 levels of parent annotation. Bad.
*/
public boolean gPA = false;
public boolean postPA = false;
public boolean postGPA = false;
/**
* Only split the "common high KL divergence" parent categories. Good.
*/
public boolean selectiveSplit = false; //true;
public double selectiveSplitCutOff = 0.0;
public boolean selectivePostSplit = false;
public double selectivePostSplitCutOff = 0.0;
/** Whether, in post-splitting of categories, nodes are annotated with the
* (grand)parent's base category or with its complete subcategorized
* category.
*/
public boolean postSplitWithBaseCategory = false;
/**
* Selective Sister annotation.
*/
public boolean sisterAnnotate = false;
public Set<String> sisterSplitters;
/**
* Mark all unary nodes specially. Good for just PCFG. Bad for factored.
* markUnary affects phrasal nodes. A value of 0 means to do nothing;
* a value of 1 means to mark the parent (higher) node of a unary rewrite.
* A value of 2 means to mark the child (lower) node of a unary rewrie.
* Values of 1 and 2 only apply if the child (lower) node is phrasal.
* (A value of 1 is better than 2 in combos.) A value of 1 corresponds
* to the old boolean -unary flag.
*/
public int markUnary = 0;
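/* For example, to reproduce the behavior of the old boolean -unary flag
 * (a sketch; direct field access as elsewhere in this class):
 *
 *   TrainOptions opts = new TrainOptions();
 *   opts.markUnary = 1; // mark the parent (higher) node of each unary rewrite
 */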
/** Mark POS tags which are the sole member of their phrasal constituent.
* This is like markUnary=2, applied to POS tags.
*/
public boolean markUnaryTags = false;
/**
* Mark all pre-preterminals (also does splitBaseNP: don't need both)
*/
public boolean splitPrePreT = false;
/**
* Parent annotation on tags. Good (for PCFG?)
*/
public boolean tagPA = false;//true;
/**
* Do parent annotation on tags selectively. Neutral, but less splits.
*/
public boolean tagSelectiveSplit = false;
public double tagSelectiveSplitCutOff = 0.0;
public boolean tagSelectivePostSplit = false;
public double tagSelectivePostSplitCutOff = 0.0;
/**
* Right edge is right-recursive (X << X). Bad. (NP only is good.)
*/
public boolean rightRec = false;//true;
/**
* Left edge is left-recursive (X << X). Bad.
*/
public boolean leftRec = false;
/**
* Promote/delete punctuation like Collins. Bad (!)
*/
public boolean collinsPunc = false;
/**
* Set the splitter strings. These are a set of parent and/or grandparent
* annotated categories which should be split off.
*/
public Set<String> splitters;
public Set<String> postSplitters;
public Set<String> deleteSplitters;
/**
* Just for debugging: check that your tree transforms work correctly. This
* will print the transformations of the first printTreeTransformations trees.
*/
public int printTreeTransformations = 0;
public PrintWriter printAnnotatedPW;
public PrintWriter printBinarizedPW;
// todo [cdm nov 2012]: At present this does nothing. It should print the list of all states of a grammar it trains
// Maybe just make it an anytime option and print it at the same time that verbose printing of tags is done?
public boolean printStates = false;
/** How to compact grammars as FSMs.
* 0 = no compaction [uses makeSyntheticLabel1],
* 1 = no compaction but use label names that wrap from right to left in binarization [uses makeSyntheticLabel2],
* 2 = wrapping labels and materialize unary at top rewriting passive to active,
* 3 = ExactGrammarCompactor,
* 4 = LossyGrammarCompactor,
* 5 = CategoryMergingGrammarCompactor.
* (May 2007 CDM note: options 4 and 5 don't seem to be functioning sensibly. 0, 1, and 3
* seem to be the 'good' options. 2 is only useful as input to 3. There seems to be
* no reason not to use 0, despite the default.)
*/
public int compactGrammar = 3; // exact compaction on by default
public boolean leftToRight = false; // whether to binarize left to right or head out
public int compactGrammar() {
if (markovFactor) {
return compactGrammar;
}
return 0;
}
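/* The guard above means compaction only applies to Markovized grammars.
 * A sketch of the interaction, following the code directly:
 *
 *   TrainOptions opts = new TrainOptions();
 *   opts.compactGrammar = 3;           // request exact compaction
 *   opts.markovFactor = false;
 *   int mode = opts.compactGrammar();  // 0: no compaction without markovFactor
 *   opts.markovFactor = true;
 *   mode = opts.compactGrammar();      // 3: ExactGrammarCompactor
 */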
public boolean noTagSplit = false;
/**
* CHANGE ANYTHING BELOW HERE AT YOUR OWN RISK
*/
/**
* Enables linear rule smoothing during grammar extraction
* but before grammar compaction. The alpha term is the same
* as that described in Petrov et al. (2006), and has range [0,1].
*/
public boolean ruleSmoothing = false;
public double ruleSmoothingAlpha = 0.0;
/**
* TODO wsg2011: This is the old grammar smoothing parameter that no
* longer does anything in the parser. It should be removed.
*/
public boolean smoothing = false;
/* public boolean factorOut = false;
public boolean rightBonus = false;
public boolean brokenDep = false;*/
/** Discounts the count of BinaryRules (only, apparently) in the training data. */
public double ruleDiscount = 0.0;
//public boolean outsideFilter = false;
public boolean printAnnotatedRuleCounts = false;
public boolean printAnnotatedStateCounts = false;
/** Whether to use the basic or split tags in the dependency grammar */
public boolean basicCategoryTagsInDependencyGrammar = false;
/**
* A transformer to use on the training data before any other
* processing step. This is specified by using the -preTransformer
* flag when training the parser. A comma separated list of classes
* will be turned into a CompositeTransformer. This can be used to
* strip subcategories, to run a tsurgeon pattern, or any number of
* other useful operations.
*/
public TreeTransformer preTransformer = null;
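/* An illustrative training command (hedged: the transformer class names are
 * hypothetical placeholders; -preTransformer is the flag documented above
 * and -train is the standard LexicalizedParser training flag):
 *
 *   java edu.stanford.nlp.parser.lexparser.LexicalizedParser \
 *     -preTransformer my.pkg.StripSubcategories,my.pkg.TsurgeonCleanup \
 *     -train /path/to/treebank ...
 */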
/**
* A set of files to use as extra information in the lexicon. This
* can provide tagged words which are not part of trees.
*/
public String taggedFiles = null;
/**
* Use the method reported by Berkeley for splitting and recombining
* states. This is an experimental reimplementation of that work,
* still in development.
*/
public boolean predictSplits = false;
/**
* If we are predicting splits, we loop this many times
*/
public int splitCount = 1;
/**
* If we are predicting splits, we recombine states at this rate every loop
*/
public double splitRecombineRate = 0.0;
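/* Sketch of enabling the experimental split-merge training described above
 * (values illustrative; the exact recombination semantics are an assumption
 * based on the field documentation):
 *
 *   TrainOptions opts = new TrainOptions();
 *   opts.predictSplits = true;     // Berkeley-style state splitting
 *   opts.splitCount = 3;           // run three split loops
 *   opts.splitRecombineRate = 0.5; // recombine states at a 0.5 rate each loop
 */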
/**
* When binarizing trees, don't annotate the labels with anything
*/
public boolean simpleBinarizedLabels = false;
/**
* When binarizing trees, don't binarize trees with two children.
* Only applies when using inside markov binarization for now.
*/
public boolean noRebinarization = false;
/**
* If the training algorithm allows for parallelization, how many
* threads to use
*/
public int trainingThreads = 1;
/**
* When training the DV parsing method, how many of the top K trees
* to analyze from the underlying parser
*/
public static final int DEFAULT_K_BEST = 100;
public int dvKBest = DEFAULT_K_BEST;
/**
* When training a parsing method where the training has a (max)
* number of iterations, how many iterations to loop
*/
public static final int DEFAULT_TRAINING_ITERATIONS = 40;
public int trainingIterations = DEFAULT_TRAINING_ITERATIONS;
/**
* When training using batches of trees, such as in the DVParser,
* how many trees to use in one batch
*/
public static final int DEFAULT_BATCH_SIZE = 25;
public int batchSize = DEFAULT_BATCH_SIZE;
/**
* Regularization constant.
*/
public static final double DEFAULT_REGCOST = 0.0001;
public double regCost = DEFAULT_REGCOST;
/**
* When training the DV parsing method, how many iterations to loop
* for one batch of trees
*/
public static final int DEFAULT_QN_ITERATIONS_PER_BATCH = 1;
public int qnIterationsPerBatch = DEFAULT_QN_ITERATIONS_PER_BATCH;
/**
* When training the DV parsing method, how many estimates to keep
* for the qn approximation.
*/
public int qnEstimates = 15;
/**
* When training the DV parsing method, the tolerance to use if we
* want to stop qn early
*/
public double qnTolerance = 15;
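/* Sketch grouping the DV-training knobs above (illustrative values; these
 * fields are consumed by the DV parser's training code):
 *
 *   TrainOptions opts = new TrainOptions();
 *   opts.dvKBest = 50;             // analyze the top 50 trees from the base parser
 *   opts.trainingIterations = 20;  // cap on training iterations
 *   opts.batchSize = 25;           // trees per training batch
 *   opts.regCost = 1e-4;           // regularization constant
 *   opts.qnIterationsPerBatch = 1; // quasi-Newton iterations per batch
 */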
/**
* If larger than 0, the parser may choose to output debug information
* every X seconds, X iterations, or some other similar metric
*/
public int debugOutputFrequency = 0;
public long randomSeed = 0;
public static final double DEFAULT_LEARNING_RATE = 0.1;
/**
* How fast to learn (can mean different things for different algorithms)
*/
public double learningRate = DEFAULT_LEARNING_RATE;
public static final double DEFAULT_DELTA_MARGIN = 0.1;
/**
* How much to penalize incorrect trees during training, in proportion to
* how different they are from the gold tree
*/
public double deltaMargin = DEFAULT_DELTA_MARGIN;
/**
* Whether or not to build an unknown word vector specifically for numbers
*/
public boolean unknownNumberVector = true;
/**
* Whether or not to handle unknown dashed words by taking the last part
*/
public boolean unknownDashedWordVectors = true;
/**
* Whether or not to build an unknown word vector for words with caps in them
*/
public boolean unknownCapsVector = true;
/**
* Make the DV model as simple as possible
*/
public boolean dvSimplifiedModel = false;
/**
* Whether or not to build an unknown word vector to match Chinese years
*/
public boolean unknownChineseYearVector = true;
/**
* Whether or not to build an unknown word vector to match Chinese numbers
*/
public boolean unknownChineseNumberVector = true;
/**
* Whether or not to build an unknown word vector to match Chinese percentages
*/
public boolean unknownChinesePercentVector = true;
public static final double DEFAULT_SCALING_FOR_INIT = 0.5;
/**
* How much to scale certain parameters when initializing models.
* For example, the DVParser uses this to rescale its initial
* matrices.
*/
public double scalingForInit = DEFAULT_SCALING_FOR_INIT;
public int maxTrainTimeSeconds = 0;
public static final String DEFAULT_UNK_WORD = "*UNK*";
/**
* Some models will use external data sources which contain
* information about unknown words. This variable is a way to
* provide the name of the unknown word in the external data source.
*/
public String unkWord = DEFAULT_UNK_WORD;
/**
* Whether or not to lowercase word vectors
*/
public boolean lowercaseWordVectors = false;
public enum TransformMatrixType {
DIAGONAL, RANDOM, OFF_DIAGONAL, RANDOM_ZEROS
}
public TransformMatrixType transformMatrixType = TransformMatrixType.DIAGONAL;
/**
* Specifically for the DVModel, uses words on either side of a
* context when combining constituents. Gives perhaps a microscopic
* improvement in performance but causes a large slowdown.
*/
public boolean useContextWords = false;
/**
* Do we want a model that uses word vectors (such as the DVParser)
* to train those word vectors when training the model?
*
* Note: models prior to 2014-02-13 may have incorrect values in
* this field, as it was originally a compile time constant
*/
public boolean trainWordVectors = true;
public static final int DEFAULT_STALLED_ITERATION_LIMIT = 12;
/**
* How many iterations to allow training to stall before taking the
* best model, if training in an iterative manner
*/
public int stalledIterationLimit = DEFAULT_STALLED_ITERATION_LIMIT;
/** Horton-Strahler number/dimension (Maximilian Schlund) */
public boolean markStrahler;
public void display() {
log.info(toString());
}
@Override
public String toString() {
StringBuilder result = new StringBuilder();
result.append("Train parameters:\n");
result.append(" smooth=" + smoothing + "\n");
result.append(" PA=" + PA + "\n");
result.append(" GPA=" + gPA + "\n");
result.append(" selSplit=" + selectiveSplit + "\n");
result.append(" (" + selectiveSplitCutOff + ((deleteSplitters != null) ? ("; deleting " + deleteSplitters): "") + ")" + "\n");
result.append(" mUnary=" + markUnary + "\n");
result.append(" mUnaryTags=" + markUnaryTags + "\n");
result.append(" sPPT=" + splitPrePreT + "\n");
result.append(" tagPA=" + tagPA + "\n");
result.append(" tagSelSplit=" + tagSelectiveSplit + " (" + tagSelectiveSplitCutOff + ")" + "\n");
result.append(" rightRec=" + rightRec + "\n");
result.append(" leftRec=" + leftRec + "\n");
result.append(" collinsPunc=" + collinsPunc + "\n");
result.append(" markov=" + markovFactor + "\n");
result.append(" mOrd=" + markovOrder + "\n");
result.append(" hSelSplit=" + hSelSplit + " (" + HSEL_CUT + ")" + "\n");
result.append(" compactGrammar=" + compactGrammar() + "\n");
result.append(" postPA=" + postPA + "\n");
result.append(" postGPA=" + postGPA + "\n");
result.append(" selPSplit=" + selectivePostSplit + " (" + selectivePostSplitCutOff + ")" + "\n");
result.append(" tagSelPSplit=" + tagSelectivePostSplit + " (" + tagSelectivePostSplitCutOff + ")" + "\n");
result.append(" postSplitWithBase=" + postSplitWithBaseCategory + "\n");
result.append(" fractionBeforeUnseenCounting=" + fractionBeforeUnseenCounting + "\n");
result.append(" openClassTypesThreshold=" + openClassTypesThreshold + "\n");
result.append(" preTransformer=" + preTransformer + "\n");
result.append(" taggedFiles=" + taggedFiles + "\n");
result.append(" predictSplits=" + predictSplits + "\n");
result.append(" splitCount=" + splitCount + "\n");
result.append(" splitRecombineRate=" + splitRecombineRate + "\n");
result.append(" simpleBinarizedLabels=" + simpleBinarizedLabels + "\n");
result.append(" noRebinarization=" + noRebinarization + "\n");
result.append(" trainingThreads=" + trainingThreads + "\n");
result.append(" dvKBest=" + dvKBest + "\n");
result.append(" trainingIterations=" + trainingIterations + "\n");
result.append(" batchSize=" + batchSize + "\n");
result.append(" regCost=" + regCost + "\n");
result.append(" qnIterationsPerBatch=" + qnIterationsPerBatch + "\n");
result.append(" qnEstimates=" + qnEstimates + "\n");
result.append(" qnTolerance=" + qnTolerance + "\n");
result.append(" debugOutputFrequency=" + debugOutputFrequency + "\n");
result.append(" randomSeed=" + randomSeed + "\n");
result.append(" learningRate=" + learningRate + "\n");
result.append(" deltaMargin=" + deltaMargin + "\n");
result.append(" unknownNumberVector=" + unknownNumberVector + "\n");
result.append(" unknownDashedWordVectors=" + unknownDashedWordVectors + "\n");
result.append(" unknownCapsVector=" + unknownCapsVector + "\n");
result.append(" unknownChineseYearVector=" + unknownChineseYearVector + "\n");
result.append(" unknownChineseNumberVector=" + unknownChineseNumberVector + "\n");
result.append(" unknownChinesePercentVector=" + unknownChinesePercentVector + "\n");
result.append(" dvSimplifiedModel=" + dvSimplifiedModel + "\n");
result.append(" scalingForInit=" + scalingForInit + "\n");
result.append(" maxTrainTimeSeconds=" + maxTrainTimeSeconds + "\n");
result.append(" unkWord=" + unkWord + "\n");
result.append(" lowercaseWordVectors=" + lowercaseWordVectors + "\n");
result.append(" transformMatrixType=" + transformMatrixType + "\n");
result.append(" useContextWords=" + useContextWords + "\n");
result.append(" trainWordVectors=" + trainWordVectors + "\n");
result.append(" stalledIterationLimit=" + stalledIterationLimit + "\n");
result.append(" markStrahler=" + markStrahler + "\n");
return result.toString();
}
public static void printTrainTree(PrintWriter pw, String message, Tree t) {
PrintWriter myPW;
if (pw == null) {
myPW = new PrintWriter(System.out, true);
} else {
myPW = pw;
}
if (message != null && pw == null) {
// deliberately only print the message when writing to stdout, not to file output
myPW.println(message);
}
// Temporarily suppress word/tag annotations so the tree prints with plain category labels.
boolean previousState = CategoryWordTag.printWordTag;
CategoryWordTag.printWordTag = false;
t.pennPrint(myPW);
CategoryWordTag.printWordTag = previousState;
}
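/* Usage sketch for printTrainTree (Tree.valueOf parses a Penn-style bracketed
 * string; the tree and writer here are illustrative):
 *
 *   Tree t = Tree.valueOf("(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))");
 *   TrainOptions.printTrainTree(null, "before binarization:", t); // to stdout, with message
 *   TrainOptions.printTrainTree(somePrintWriter, null, t);        // to a writer, no message
 */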
private static final long serialVersionUID = 72571349843538L;
} // end class TrainOptions