
package cc.mallet.fst;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Random;
import java.util.logging.Logger;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizer;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.LabelVector;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Sequence;
import cc.mallet.util.MalletLogger;
/**
* Unlike ClassifierTrainer, TransducerTrainer is not "stateless" between calls to train: a TransducerTrainer is
* constructed paired with a specific Transducer and can only train that Transducer. The CRF itself stores, and has
* methods for, FeatureSelection and weight freezing. This trainer stores, and has methods for determining, the
* contents, dimensions, sparsity, and feature induction of the CRF's weights, as determined by the training data.
*
* Note: In the future this class may go away in favor of some default version of CRFTrainerByValueGradients.
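*
* <p>
* A minimal usage sketch (hypothetical data; a real application builds its own InstanceList):
*
* <pre>{@code
* InstanceList training = ...; // sequence-labeled training instances
* CRF crf = new CRF(training.getPipe(), null);
* crf.addStatesForLabelsConnectedAsIn(training);
* CRFTrainerByLabelLikelihood trainer = new CRFTrainerByLabelLikelihood(crf);
* trainer.setGaussianPriorVariance(1.0);
* boolean converged = trainer.train(training, 500);
* }</pre>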
*/
public class CRFTrainerByLabelLikelihood extends TransducerTrainer implements TransducerTrainer.ByOptimization {
/*
* KT: 14.08.2008 the number of rounds of successive convergence used by the trainOptimized* methods
*/
private int minConvRounds = 5;
public void setMinConvRounds(int rounds) {
this.minConvRounds = rounds;
}
public int getMinConvRounds() {
return this.minConvRounds;
}
private static Logger logger = MalletLogger.getLogger(CRFTrainerByLabelLikelihood.class.getName());
static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1.0;
static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;
CRF crf;
// OptimizableCRF ocrf;
CRFOptimizableByLabelLikelihood ocrf;
Optimizer opt;
int iterationCount = 0;
boolean converged;
boolean usingHyperbolicPrior = false;
double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;
boolean useSparseWeights = true;
boolean useNoWeights = false; // TODO remove this; it is just for debugging
private transient boolean useSomeUnsupportedTrick = true;
// Various values from CRF acting as indicators of when we need to ...
private int cachedValueWeightsStamp = -1; // ... re-calculate expectations and values to getValue() because weights'
// values changed
private int cachedGradientWeightsStamp = -1; // ... re-calculate to getValueGradient() because weights' values
// changed
private int cachedWeightsStructureStamp = -1; // ... re-allocate crf.weights, expectations & constraints because new
// states, transitions
// Use ocrf.trainingSet to see when we need to re-allocate crf.weights, expectations & constraints because we are
// using a different training list than last time
// xxx temporary hack. This is quite useful to have, though!! -cas
public boolean printGradient = false;
public CRFTrainerByLabelLikelihood(CRF crf) {
this.crf = crf;
}
@Override
public Transducer getTransducer() {
return crf;
}
public CRF getCRF() {
return crf;
}
@Override
public Optimizer getOptimizer() {
return opt;
}
public boolean isConverged() {
return converged;
}
@Override
public boolean isFinishedTraining() {
return converged;
}
@Override
public int getIteration() {
return iterationCount;
}
/**
* Use this method to specify whether or not factors are added to the CRF by this trainer. If you have already set
* up the factors in your CRF, you may not want the trainer to add additional factors.
*
* @param flag
* If true, this trainer adds no factors to the CRF.
*/
public void setAddNoFactors(boolean flag) {
this.useNoWeights = flag;
}
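/**
* Returns a CRFOptimizableByLabelLikelihood for this CRF and training set, creating and caching one if necessary.
* Whenever the CRF's weight structure has changed since the last call, the weights are first (re)allocated,
* sparsely or densely according to setUseSparseWeights.
*/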
public CRFOptimizableByLabelLikelihood getOptimizableCRF(InstanceList trainingSet) {
if (cachedWeightsStructureStamp != crf.weightsStructureChangeStamp) {
if (!useNoWeights) {
if (useSparseWeights) {
crf.setWeightsDimensionAsIn(trainingSet, useSomeUnsupportedTrick);
} else {
crf.setWeightsDimensionDensely();
}
}
// reallocateSufficientStatistics(); // Not necessary here because it is done in the constructor for
// OptimizableCRF
ocrf = null;
cachedWeightsStructureStamp = crf.weightsStructureChangeStamp;
}
if (ocrf == null || ocrf.trainingSet != trainingSet) {
// ocrf = new OptimizableCRF (crf, trainingSet);
ocrf = new CRFOptimizableByLabelLikelihood(crf, trainingSet);
ocrf.setGaussianPriorVariance(gaussianPriorVariance);
ocrf.setHyperbolicPriorSharpness(hyperbolicPriorSharpness);
ocrf.setHyperbolicPriorSlope(hyperbolicPriorSlope);
ocrf.setUseHyperbolicPrior(usingHyperbolicPrior);
opt = null;
}
return ocrf;
}
public Optimizer getOptimizer(InstanceList trainingSet) {
getOptimizableCRF(trainingSet); // this will set this.ocrf if necessary
if (opt == null || ocrf != opt.getOptimizable()) {
opt = new LimitedMemoryBFGS(ocrf); // Alternative: opt = new ConjugateGradient (0.001);
}
return opt;
}
// Java question:
// If I make a non-static inner class CRF.Trainer,
// can that class be subclassed in another .java file,
// and can that subclass still have access to all the CRF's
// instance variables?
// ANSWER: Yes and yes, but you have to use special syntax in the subclass ctor (see mallet-dev archive) -cas
public boolean trainIncremental(InstanceList training) {
return train(training, Integer.MAX_VALUE);
}
/**
* KT, 14.10.2008
*
* "optimized" training where termination criterion is later that in "normal" train functions. Optimizer must
* converge n successive rounds. This avoid early stopping due to flip on gradient.
*
* This method is an extention of the respective train method.
*
* Number of successive rounds can be set by setMinConvRounds(int rounds). Default is set to 5!
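*
* <p>
* A minimal sketch (trainer and training list as in the class-level example):
*
* <pre>{@code
* trainer.setMinConvRounds(10); // require more successive converged rounds than the default 5
* boolean converged = trainer.trainOptimized(training);
* }</pre>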
*/
public boolean trainOptimized(InstanceList trainingSet) {
return trainOptimized(trainingSet, Integer.MAX_VALUE);
}
public boolean trainOptimized(InstanceList trainingSet, int numIterations) {
if (numIterations <= 0) {
return false;
}
assert trainingSet.size() > 0;
getOptimizableCRF(trainingSet); // This will set this.ocrf if necessary
getOptimizer(trainingSet); // This will set this.opt if necessary
int succConv = 0;
boolean convergedByOptimizer = false;
boolean convergedByException = false;
logger.info("CRF about to train with " + numIterations + " iterations");
for (int i = 0; i < numIterations; i++) {
try {
convergedByOptimizer = opt.optimize(1);
iterationCount++;
logger.info("CRF finished one iteration of maximizer, i=" + i);
runEvaluators();
} catch (IllegalArgumentException e) {
e.printStackTrace();
logger.info("Catching exception; saying converged.");
convergedByException = true;
}
if (convergedByOptimizer) {
if (succConv >= this.minConvRounds) {
logger.info("CRF training has converged by optimizer after " + succConv
+ " successive convergence rounds, i=" + i);
converged = true;
break;
} else {
logger.info("CRF optimizer converged, but need more successive convergence rounds, succConv="
+ succConv);
succConv++;
}
} else {
// if in a round optimizer has not converged, reset variable
succConv = 0;
}
if (convergedByException) {
logger.info("CRF training has converged by exception, i=" + i);
converged = true;
break;
}
}
return converged;
}
public boolean trainOptimized(InstanceList training, int numIterationsPerProportion, double[] trainingProportions) {
int trainingIteration = 0;
assert trainingProportions.length > 0;
boolean converged = false;
for (int i = 0; i < trainingProportions.length; i++) {
assert trainingProportions[i] <= 1.0;
logger.info("Training on " + trainingProportions[i] + "% of the data this round.");
if (trainingProportions[i] == 1.0) {
converged = this.trainOptimized(training, numIterationsPerProportion);
} else {
converged = this.trainOptimized(
training.split(new Random(1),
new double[] { trainingProportions[i], 1 - trainingProportions[i] })[0],
numIterationsPerProportion);
}
trainingIteration += numIterationsPerProportion;
}
return converged;
}
@Override
public boolean train(InstanceList trainingSet, int numIterations) {
if (numIterations <= 0) {
return false;
}
assert trainingSet.size() > 0;
getOptimizableCRF(trainingSet); // This will set this.ocrf if necessary
getOptimizer(trainingSet); // This will set this.opt if necessary
boolean converged = false;
logger.info("CRF about to train with " + numIterations + " iterations");
for (int i = 0; i < numIterations; i++) {
try {
converged = opt.optimize(1);
iterationCount++;
logger.info("CRF finished one iteration of maximizer, i=" + i);
runEvaluators();
} catch (Exception e) {
// Any exception from the optimizer (e.g. an IllegalArgumentException from the line search) is
// treated as convergence.
e.printStackTrace();
logger.info("Catching exception; saying converged.");
converged = true;
}
if (converged) {
logger.info("CRF training has converged, i=" + i);
break;
}
}
return converged;
}
/**
* Train a CRF on various-sized subsets of the data. This method is typically used to accelerate training by quickly
* getting to reasonable parameters on only a subset of the data first, then training on progressively more data.
*
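* <p>
* A minimal sketch (trainer and training list as in the class-level example): up to 100 iterations per stage, on
* 20%, then 50%, then all of the data:
*
* <pre>{@code
* boolean converged = trainer.train(training, 100, new double[] { 0.2, 0.5, 1.0 });
* }</pre>
*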
* @param training
* The training Instances.
* @param numIterationsPerProportion
* Maximum number of Maximizer iterations per training proportion.
* @param trainingProportions
* If non-null, train on increasingly larger portions of the data, e.g. new double[] {0.2, 0.5, 1.0}.
* This can sometimes speed up convergence. Be sure to end with 1.0 if you want to train on all the data
* in the end.
* @return True if training has converged.
*/
public boolean train(InstanceList training, int numIterationsPerProportion, double[] trainingProportions) {
int trainingIteration = 0;
assert trainingProportions.length > 0;
boolean converged = false;
for (int i = 0; i < trainingProportions.length; i++) {
assert trainingProportions[i] <= 1.0;
logger.info("Training on " + trainingProportions[i] + "% of the data this round.");
if (trainingProportions[i] == 1.0) {
converged = this.train(training, numIterationsPerProportion);
} else {
converged = this.train(
training.split(new Random(1),
new double[] { trainingProportions[i], 1 - trainingProportions[i] })[0],
numIterationsPerProportion);
}
trainingIteration += numIterationsPerProportion;
}
return converged;
}
public boolean trainWithFeatureInduction(InstanceList trainingData, InstanceList validationData,
InstanceList testingData, TransducerEvaluator eval, int numIterations,
int numIterationsBetweenFeatureInductions, int numFeatureInductions, int numFeaturesPerFeatureInduction,
double trueLabelProbThreshold, boolean clusteredFeatureInduction, double[] trainingProportions) {
return trainWithFeatureInduction(trainingData, validationData, testingData, eval, numIterations,
numIterationsBetweenFeatureInductions, numFeatureInductions, numFeaturesPerFeatureInduction,
trueLabelProbThreshold, clusteredFeatureInduction, trainingProportions, "exp");
}
/**
* Train a CRF using feature induction to generate conjunctions of features. Feature induction is run periodically
* during training. The features are added to improve performance on the mislabeled instances, with the specific
* scoring criterion given by the {@link FeatureInducer} specified by gainName.
*
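* <p>
* A minimal sketch (names as in the class-level example), via the convenience overload that defaults gainName to
* "exp": up to 10 induction rounds, 50 features per round, 20 optimizer iterations between rounds, treating an
* instance as an error when the true label's probability falls below 0.5:
*
* <pre>{@code
* trainer.trainWithFeatureInduction(training, null, null, null, 500, 20, 10, 50, 0.5, false, null);
* }</pre>
*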
* @param trainingData
* The training Instances.
* @param validationData
* The validation Instances.
* @param testingData
* The testing Instances.
* @param eval
* For evaluation during training.
* @param numIterations
* Maximum number of Maximizer iterations.
* @param numIterationsBetweenFeatureInductions
* Number of maximizer iterations between each call to the Feature Inducer.
* @param numFeatureInductions
* Maximum number of rounds of feature induction.
* @param numFeaturesPerFeatureInduction
* Maximum number of features to induce at each round of induction.
* @param trueLabelProbThreshold
* If the model's probability of the true Label of an Instance is less than this value, it is added as an
* error instance to the {@link FeatureInducer}.
* @param clusteredFeatureInduction
* If true, a separate {@link FeatureInducer} is constructed for each label pair. This can avoid inducing
* a disproportionate number of features for a single label.
* @param trainingProportions
* If non-null, train on increasingly larger portions of the data (e.g. [0.2, 0.5, 1.0]). This can
* sometimes speed up convergence.
* @param gainName
* The type of {@link FeatureInducer} to use. One of "exp", "grad", or "info" for {@link ExpGain},
* {@link GradientGain}, or {@link InfoGain}.
* @return True if training has converged.
*/
public boolean trainWithFeatureInduction(InstanceList trainingData, InstanceList validationData,
InstanceList testingData, TransducerEvaluator eval, int numIterations,
int numIterationsBetweenFeatureInductions, int numFeatureInductions, int numFeaturesPerFeatureInduction,
double trueLabelProbThreshold, boolean clusteredFeatureInduction, double[] trainingProportions,
String gainName) {
int trainingIteration = 0;
int numLabels = crf.outputAlphabet.size();
crf.globalFeatureSelection = trainingData.getFeatureSelection();
if (crf.globalFeatureSelection == null) {
// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
crf.globalFeatureSelection = new FeatureSelection(trainingData.getDataAlphabet());
trainingData.setFeatureSelection(crf.globalFeatureSelection);
}
// TODO Careful! If validationData and testingData get removed as arguments to this method
// then the next two lines of work will have to be done somewhere.
if (validationData != null) {
validationData.setFeatureSelection(crf.globalFeatureSelection);
}
if (testingData != null) {
testingData.setFeatureSelection(crf.globalFeatureSelection);
}
for (int featureInductionIteration = 0; featureInductionIteration < numFeatureInductions; featureInductionIteration++) {
// Print out some feature information
logger.info("Feature induction iteration " + featureInductionIteration);
// Train the CRF
InstanceList theTrainingData = trainingData;
if (trainingProportions != null && featureInductionIteration < trainingProportions.length) {
logger.info("Training on " + (100 * trainingProportions[featureInductionIteration])
+ "% of the data this round.");
InstanceList[] sampledTrainingData = trainingData.split(new Random(1),
new double[] { trainingProportions[featureInductionIteration],
1 - trainingProportions[featureInductionIteration] });
theTrainingData = sampledTrainingData[0];
theTrainingData.setFeatureSelection(crf.globalFeatureSelection); // xxx necessary?
logger.info(" which is " + theTrainingData.size() + " instances");
}
boolean converged = false;
if (featureInductionIteration != 0) {
// Don't train until we have added some features
converged = this.train(theTrainingData, numIterationsBetweenFeatureInductions);
}
trainingIteration += numIterationsBetweenFeatureInductions;
logger.info("Starting feature induction with " + crf.inputAlphabet.size() + " features.");
// Create the list of error tokens, for both unclustered and clustered feature induction
InstanceList errorInstances = new InstanceList(trainingData.getDataAlphabet(),
trainingData.getTargetAlphabet());
// This errorInstances.featureSelection will get examined by FeatureInducer,
// so it can know how to add "new" singleton features
errorInstances.setFeatureSelection(crf.globalFeatureSelection);
ArrayList errorLabelVectors = new ArrayList();
InstanceList clusteredErrorInstances[][] = new InstanceList[numLabels][numLabels];
ArrayList clusteredErrorLabelVectors[][] = new ArrayList[numLabels][numLabels];
for (int i = 0; i < numLabels; i++) {
for (int j = 0; j < numLabels; j++) {
clusteredErrorInstances[i][j] = new InstanceList(trainingData.getDataAlphabet(),
trainingData.getTargetAlphabet());
clusteredErrorInstances[i][j].setFeatureSelection(crf.globalFeatureSelection);
clusteredErrorLabelVectors[i][j] = new ArrayList();
}
}
for (int i = 0; i < theTrainingData.size(); i++) {
logger.info("instance=" + i);
Instance instance = theTrainingData.get(i);
Sequence input = (Sequence) instance.getData();
Sequence trueOutput = (Sequence) instance.getTarget();
assert input.size() == trueOutput.size();
SumLattice lattice = crf.sumLatticeFactory.newSumLattice(crf, input, (Sequence) null,
(Transducer.Incrementor) null, (LabelAlphabet) theTrainingData.getTargetAlphabet());
int prevLabelIndex = 0; // This will put extra error instances in this cluster
for (int j = 0; j < trueOutput.size(); j++) {
Label label = ((LabelSequence) trueOutput).getLabelAtPosition(j);
assert label != null;
// System.out.println ("Instance="+i+" position="+j+"
// fv="+lattice.getLabelingAtPosition(j).toString(true));
LabelVector latticeLabeling = lattice.getLabelingAtPosition(j);
double trueLabelProb = latticeLabeling.value(label.getIndex());
int labelIndex = latticeLabeling.getBestIndex();
// System.out.println ("position="+j+" trueLabelProb="+trueLabelProb);
if (trueLabelProb < trueLabelProbThreshold) {
logger.info("Adding error: instance=" + i + " position=" + j + " prtrue=" + trueLabelProb
+ (label == latticeLabeling.getBestLabel() ? " " : " *") + " truelabel=" + label
+ " predlabel=" + latticeLabeling.getBestLabel() + " fv="
+ ((FeatureVector) input.get(j)).toString(true));
errorInstances.add(input.get(j), label, null, null);
errorLabelVectors.add(latticeLabeling);
clusteredErrorInstances[prevLabelIndex][labelIndex].add(input.get(j), label, null, null);
clusteredErrorLabelVectors[prevLabelIndex][labelIndex].add(latticeLabeling);
}
prevLabelIndex = labelIndex;
}
}
logger.info("Error instance list size = " + errorInstances.size());
if (clusteredFeatureInduction) {
FeatureInducer[][] klfi = new FeatureInducer[numLabels][numLabels];
for (int i = 0; i < numLabels; i++) {
for (int j = 0; j < numLabels; j++) {
// Note that we may see some "impossible" transitions here (like O->I in an OIB model)
// because we are using lattice gammas to get the predicted label, not Viterbi.
// I don't believe this does any harm, and may do some good.
logger.info("Doing feature induction for " + crf.outputAlphabet.lookupObject(i) + " -> "
+ crf.outputAlphabet.lookupObject(j) + " with " + clusteredErrorInstances[i][j].size()
+ " instances");
if (clusteredErrorInstances[i][j].size() < 20) {
logger.info(
"..skipping because only " + clusteredErrorInstances[i][j].size() + " instances.");
continue;
}
int s = clusteredErrorLabelVectors[i][j].size();
LabelVector[] lvs = new LabelVector[s];
for (int k = 0; k < s; k++) {
lvs[k] = (LabelVector) clusteredErrorLabelVectors[i][j].get(k);
}
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals("exp")) {
gainFactory = new ExpGain.Factory(lvs, gaussianPriorVariance);
} else if (gainName.equals("grad")) {
gainFactory = new GradientGain.Factory(lvs);
} else if (gainName.equals("info")) {
gainFactory = new InfoGain.Factory();
}
klfi[i][j] = new FeatureInducer(gainFactory, clusteredErrorInstances[i][j],
numFeaturesPerFeatureInduction, 2 * numFeaturesPerFeatureInduction,
2 * numFeaturesPerFeatureInduction);
crf.featureInducers.add(klfi[i][j]);
}
}
for (int i = 0; i < numLabels; i++) {
for (int j = 0; j < numLabels; j++) {
logger.info("Adding new induced features for " + crf.outputAlphabet.lookupObject(i) + " -> "
+ crf.outputAlphabet.lookupObject(j));
if (klfi[i][j] == null) {
logger.info("...skipping because no features induced.");
continue;
}
// Note that this adds features globally, but not on a per-transition basis
klfi[i][j].induceFeaturesFor(trainingData, false, false);
if (testingData != null) {
klfi[i][j].induceFeaturesFor(testingData, false, false);
}
}
}
klfi = null;
} else {
int s = errorLabelVectors.size();
LabelVector[] lvs = new LabelVector[s];
for (int i = 0; i < s; i++) {
lvs[i] = (LabelVector) errorLabelVectors.get(i);
}
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals("exp")) {
gainFactory = new ExpGain.Factory(lvs, gaussianPriorVariance);
} else if (gainName.equals("grad")) {
gainFactory = new GradientGain.Factory(lvs);
} else if (gainName.equals("info")) {
gainFactory = new InfoGain.Factory();
}
FeatureInducer klfi = new FeatureInducer(gainFactory, errorInstances, numFeaturesPerFeatureInduction,
2 * numFeaturesPerFeatureInduction, 2 * numFeaturesPerFeatureInduction);
crf.featureInducers.add(klfi);
// Note that this adds features globally, but not on a per-transition basis
klfi.induceFeaturesFor(trainingData, false, false);
if (testingData != null) {
klfi.induceFeaturesFor(testingData, false, false);
}
logger.info(
"CRF4 FeatureSelection now includes " + crf.globalFeatureSelection.cardinality() + " features");
klfi = null;
}
// This is done in CRF4.train() anyway
// this.setWeightsDimensionAsIn (trainingData);
//// this.growWeightsDimensionToInputAlphabet ();
}
return this.train(trainingData, numIterations - trainingIteration);
}
public void setUseHyperbolicPrior(boolean f) {
usingHyperbolicPrior = f;
}
public void setHyperbolicPriorSlope(double p) {
hyperbolicPriorSlope = p;
}
public void setHyperbolicPriorSharpness(double p) {
hyperbolicPriorSharpness = p;
}
public double getUseHyperbolicPriorSlope() {
return hyperbolicPriorSlope;
}
public double getUseHyperbolicPriorSharpness() {
return hyperbolicPriorSharpness;
}
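/**
* Sets the variance of the Gaussian prior used to regularize training. Smaller values regularize the weights more
* strongly; the default is 1.0.
*/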
public void setGaussianPriorVariance(double p) {
gaussianPriorVariance = p;
}
public double getGaussianPriorVariance() {
return gaussianPriorVariance;
}
// public int getDefaultFeatureIndex () { return defaultFeatureIndex;}
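/**
* Sets whether the CRF's weights are allocated sparsely, covering only the feature and transition combinations
* observed in the training data, rather than densely for all combinations. The default is true.
*/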
public void setUseSparseWeights(boolean b) {
useSparseWeights = b;
}
public boolean getUseSparseWeights() {
return useSparseWeights;
}
/**
* Sets whether to use the 'some unsupported trick.' The trick is, when training a CRF for which some training has
* already been done and sparse weights are used, to add a few weights for features that do not occur in the
* training data.
*
* This generally leads to better accuracy at only a small memory cost.
*
* @param b
* Whether to use the trick
*/
public void setUseSomeUnsupportedTrick(boolean b) {
useSomeUnsupportedTrick = b;
}
// Serialization for CRFTrainerByLabelLikelihood
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 1;
static final int NULL_INTEGER = -1;
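// NOTE: Serialization is deliberately unfinished: writeObject and readObject below handle only part of the
// state and then throw IllegalStateException. (readObject also does not read the three cached*Stamp ints that
// writeObject writes.)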
/* Need to check for null pointers. */
private void writeObject(ObjectOutputStream out) throws IOException {
int i, size;
out.writeInt(CURRENT_SERIAL_VERSION);
// out.writeInt(defaultFeatureIndex);
out.writeBoolean(usingHyperbolicPrior);
out.writeDouble(gaussianPriorVariance);
out.writeDouble(hyperbolicPriorSlope);
out.writeDouble(hyperbolicPriorSharpness);
out.writeInt(cachedGradientWeightsStamp);
out.writeInt(cachedValueWeightsStamp);
out.writeInt(cachedWeightsStructureStamp);
out.writeBoolean(printGradient);
out.writeBoolean(useSparseWeights);
throw new IllegalStateException("Implementation not yet complete.");
}
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
int size, i;
int version = in.readInt();
// defaultFeatureIndex = in.readInt();
usingHyperbolicPrior = in.readBoolean();
gaussianPriorVariance = in.readDouble();
hyperbolicPriorSlope = in.readDouble();
hyperbolicPriorSharpness = in.readDouble();
printGradient = in.readBoolean();
useSparseWeights = in.readBoolean();
throw new IllegalStateException("Implementation not yet complete.");
}
}