
cc.mallet.fst.CRFTrainerByLabelLikelihood


MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

package cc.mallet.fst;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Random;
import java.util.logging.Logger;

import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizer;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.LabelVector;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Sequence;
import cc.mallet.util.MalletLogger;

/**
 * Unlike ClassifierTrainer, TransducerTrainer is not "stateless" between calls to train. A TransducerTrainer is
 * constructed paired with a specific Transducer, and can only train that Transducer. CRF stores and has methods for
 * FeatureSelection and weight freezing. CRFTrainer stores and has methods for determining the
 * contents/dimensions/sparsity/FeatureInduction of the CRF's weights as determined by training data.
 *
 * Note: In the future this class may go away in favor of some default version of CRFTrainerByValueGradients.
 */
public class CRFTrainerByLabelLikelihood extends TransducerTrainer implements TransducerTrainer.ByOptimization {

    /*
     * KT: 14.08.2008 the number of rounds of successive convergence used by the trainOptimized* methods
     */
    private int minConvRounds = 5;

    public void setMinConvRounds(int rounds) {
        this.minConvRounds = rounds;
    }

    public int getMinConvRounds() {
        return this.minConvRounds;
    }

    private static Logger logger = MalletLogger.getLogger(CRFTrainerByLabelLikelihood.class.getName());

    static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1.0;
    static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
    static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;

    CRF crf;
    // OptimizableCRF ocrf;
    CRFOptimizableByLabelLikelihood ocrf;
    Optimizer opt;
    int iterationCount = 0;
    boolean converged;
    boolean usingHyperbolicPrior = false;
    double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
    double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
    double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;
    boolean useSparseWeights = true;
    boolean useNoWeights = false; // TODO remove this; it is just for debugging
    private transient boolean useSomeUnsupportedTrick = true;

    // Various values from CRF acting as indicators of when we need to ...
    private int cachedValueWeightsStamp = -1; // ... re-calculate expectations and values for getValue() because weights' values changed
    private int cachedGradientWeightsStamp = -1; // ... re-calculate getValueGradient() because weights' values changed
    private int cachedWeightsStructureStamp = -1; // ... re-allocate crf.weights, expectations & constraints because of new states or transitions
    // Use ocrf.trainingSet to see when we need to re-allocate crf.weights, expectations & constraints
    // because we are using a different training list than last time.

    // xxx temporary hack. This is quite useful to have, though!! -cas
    public boolean printGradient = false;

    public CRFTrainerByLabelLikelihood(CRF crf) {
        this.crf = crf;
    }

    @Override
    public Transducer getTransducer() {
        return crf;
    }

    public CRF getCRF() {
        return crf;
    }

    @Override
    public Optimizer getOptimizer() {
        return opt;
    }

    public boolean isConverged() {
        return converged;
    }

    @Override
    public boolean isFinishedTraining() {
        return converged;
    }

    @Override
    public int getIteration() {
        return iterationCount;
    }

    /**
     * Specifies whether factors are added to the CRF by this trainer. If you have already set up the factors in
     * your CRF, you may not want the trainer to add additional factors.
     *
     * @param flag
     *            If true, this trainer adds no factors to the CRF.
     */
    public void setAddNoFactors(boolean flag) {
        this.useNoWeights = flag;
    }
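    // Illustrative sketch (not part of the original source): if the CRF's weight structure
    // has already been set up by hand, keep this trainer from adding factors of its own.
    // "crf" and "trainingData" are assumed to exist:
    //
    //     CRFTrainerByLabelLikelihood trainer = new CRFTrainerByLabelLikelihood(crf);
    //     trainer.setAddNoFactors(true);
    //     trainer.train(trainingData, 100);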
    public CRFOptimizableByLabelLikelihood getOptimizableCRF(InstanceList trainingSet) {
        if (cachedWeightsStructureStamp != crf.weightsStructureChangeStamp) {
            if (!useNoWeights) {
                if (useSparseWeights) {
                    crf.setWeightsDimensionAsIn(trainingSet, useSomeUnsupportedTrick);
                } else {
                    crf.setWeightsDimensionDensely();
                }
            }
            // reallocateSufficientStatistics(); // Not necessary here because it is done in the constructor for OptimizableCRF
            ocrf = null;
            cachedWeightsStructureStamp = crf.weightsStructureChangeStamp;
        }
        if (ocrf == null || ocrf.trainingSet != trainingSet) {
            // ocrf = new OptimizableCRF (crf, trainingSet);
            ocrf = new CRFOptimizableByLabelLikelihood(crf, trainingSet);
            ocrf.setGaussianPriorVariance(gaussianPriorVariance);
            ocrf.setHyperbolicPriorSharpness(hyperbolicPriorSharpness);
            ocrf.setHyperbolicPriorSlope(hyperbolicPriorSlope);
            ocrf.setUseHyperbolicPrior(usingHyperbolicPrior);
            opt = null;
        }
        return ocrf;
    }

    public Optimizer getOptimizer(InstanceList trainingSet) {
        getOptimizableCRF(trainingSet); // this will set this.ocrf if necessary
        if (opt == null || ocrf != opt.getOptimizable()) {
            opt = new LimitedMemoryBFGS(ocrf);
            // Alternative: opt = new ConjugateGradient (0.001);
        }
        return opt;
    }

    // Java question:
    // If I make a non-static inner class CRF.Trainer,
    // can that class be subclassed in another .java file,
    // and can that subclass still have access to all the CRF's
    // instance variables?
    // ANSWER: Yes and yes, but you have to use special syntax in the subclass ctor (see mallet-dev archive) -cas

    public boolean trainIncremental(InstanceList training) {
        return train(training, Integer.MAX_VALUE);
    }
    /**
     * KT, 14.10.2008
     *
     * "Optimized" training whose termination criterion fires later than in the "normal" train methods: the
     * optimizer must report convergence in n successive rounds. This avoids early stopping caused by a flip in
     * the gradient.
     *
     * This method is an extension of the corresponding train method.
     *
     * The number of successive rounds can be set with setMinConvRounds(int rounds); the default is 5.
     */
    public boolean trainOptimized(InstanceList trainingSet) {
        return trainOptimized(trainingSet, Integer.MAX_VALUE);
    }

    public boolean trainOptimized(InstanceList trainingSet, int numIterations) {
        if (numIterations <= 0) {
            return false;
        }
        assert trainingSet.size() > 0;
        getOptimizableCRF(trainingSet); // This will set this.ocrf if necessary
        getOptimizer(trainingSet); // This will set this.opt if necessary
        int succConv = 0;
        boolean convergedByOptimizer = false;
        boolean convergedByException = false;
        logger.info("CRF about to train with " + numIterations + " iterations");
        for (int i = 0; i < numIterations; i++) {
            try {
                convergedByOptimizer = opt.optimize(1);
                iterationCount++;
                logger.info("CRF finished one iteration of maximizer, i=" + i);
                runEvaluators();
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
                logger.info("Catching exception; saying converged.");
                convergedByException = true;
            }
            if (convergedByOptimizer) {
                if (succConv >= this.minConvRounds) {
                    logger.info("CRF training has converged by optimizer after " + succConv
                            + " successive convergence rounds, i=" + i);
                    converged = true;
                    break;
                } else {
                    logger.info("CRF optimizer converged, but need more successive convergence rounds, succConv="
                            + succConv);
                    succConv++;
                }
            } else {
                // if the optimizer did not converge this round, reset the counter
                succConv = 0;
            }
            if (convergedByException) {
                logger.info("CRF training has converged by exception, i=" + i);
                converged = true;
                break;
            }
        }
        return converged;
    }

    public boolean trainOptimized(InstanceList training, int numIterationsPerProportion, double[] trainingProportions) {
        int trainingIteration = 0;
        assert trainingProportions.length > 0;
        boolean converged = false;
        for (int i = 0; i < trainingProportions.length; i++) {
            assert trainingProportions[i] <= 1.0;
            logger.info("Training on " + trainingProportions[i] + "% of the data this round.");
            if (trainingProportions[i] == 1.0) {
                converged = this.trainOptimized(training, numIterationsPerProportion);
            } else {
                converged = this.trainOptimized(
                        training.split(new Random(1),
                                new double[] { trainingProportions[i], 1 - trainingProportions[i] })[0],
                        numIterationsPerProportion);
            }
            trainingIteration += numIterationsPerProportion;
        }
        return converged;
    }

    @Override
    public boolean train(InstanceList trainingSet, int numIterations) {
        if (numIterations <= 0) {
            return false;
        }
        assert trainingSet.size() > 0;
        getOptimizableCRF(trainingSet); // This will set this.ocrf if necessary
        getOptimizer(trainingSet); // This will set this.opt if necessary
        boolean converged = false;
        logger.info("CRF about to train with " + numIterations + " iterations");
        for (int i = 0; i < numIterations; i++) {
            try {
                converged = opt.optimize(1);
                iterationCount++;
                logger.info("CRF finished one iteration of maximizer, i=" + i);
                runEvaluators();
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
                logger.info("Catching exception; saying converged.");
                converged = true;
            } catch (Exception e) {
                e.printStackTrace();
                logger.info("Catching exception; saying converged.");
                converged = true;
            }
            if (converged) {
                logger.info("CRF training has converged, i=" + i);
                break;
            }
        }
        return converged;
    }
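    // Illustrative sketch (not part of the original source; "trainer" and "trainingData"
    // as in the earlier sketch): trainOptimized() is train() with a stricter stopping
    // rule, requiring minConvRounds successive converged L-BFGS rounds:
    //
    //     trainer.setMinConvRounds(3);
    //     boolean done = trainer.trainOptimized(trainingData, 500);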
    /**
     * Train a CRF on various-sized subsets of the data. This method is typically used to accelerate training by
     * quickly getting to reasonable parameters on only a subset of the data first, then training on progressively
     * more data.
     *
     * @param training
     *            The training Instances.
     * @param numIterationsPerProportion
     *            Maximum number of Maximizer iterations per training proportion.
     * @param trainingProportions
     *            If non-null, train on increasingly larger portions of the data, e.g. new double[] {0.2, 0.5, 1.0}.
     *            This can sometimes speed up convergence. Be sure to end in 1.0 if you want to train on all the
     *            data in the end.
     * @return True if training has converged.
     */
    public boolean train(InstanceList training, int numIterationsPerProportion, double[] trainingProportions) {
        int trainingIteration = 0;
        assert trainingProportions.length > 0;
        boolean converged = false;
        for (int i = 0; i < trainingProportions.length; i++) {
            assert trainingProportions[i] <= 1.0;
            logger.info("Training on " + trainingProportions[i] + "% of the data this round.");
            if (trainingProportions[i] == 1.0) {
                converged = this.train(training, numIterationsPerProportion);
            } else {
                converged = this.train(
                        training.split(new Random(1),
                                new double[] { trainingProportions[i], 1 - trainingProportions[i] })[0],
                        numIterationsPerProportion);
            }
            trainingIteration += numIterationsPerProportion;
        }
        return converged;
    }

    public boolean trainWithFeatureInduction(InstanceList trainingData, InstanceList validationData,
            InstanceList testingData, TransducerEvaluator eval, int numIterations,
            int numIterationsBetweenFeatureInductions, int numFeatureInductions, int numFeaturesPerFeatureInduction,
            double trueLabelProbThreshold, boolean clusteredFeatureInduction, double[] trainingProportions) {
        return trainWithFeatureInduction(trainingData, validationData, testingData, eval, numIterations,
                numIterationsBetweenFeatureInductions, numFeatureInductions, numFeaturesPerFeatureInduction,
                trueLabelProbThreshold, clusteredFeatureInduction, trainingProportions, "exp");
    }
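    // Illustrative sketches (not part of the original source; "trainer" and "trainingData"
    // as in the earlier sketch):
    //
    //     // Progressively larger fractions of the data; end at 1.0 to finish on all of it.
    //     trainer.train(trainingData, 100, new double[] { 0.2, 0.5, 1.0 });
    //
    //     // Feature induction every 10 iterations, at most 20 induction rounds, up to 50
    //     // induced conjunctions per round, error threshold 0.9, ExpGain ("exp") scoring:
    //     trainer.trainWithFeatureInduction(trainingData, null, null, null, 500, 10, 20, 50,
    //             0.9, false, null, "exp");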
    /**
     * Train a CRF using feature induction to generate conjunctions of features. Feature induction is run
     * periodically during training. Features are added to improve performance on the mislabeled instances, with
     * the specific scoring criterion given by the {@link FeatureInducer} named by gainName.
     *
     * @param trainingData
     *            The training Instances.
     * @param validationData
     *            The validation Instances.
     * @param testingData
     *            The testing instances.
     * @param eval
     *            For evaluation during training.
     * @param numIterations
     *            Maximum number of Maximizer iterations.
     * @param numIterationsBetweenFeatureInductions
     *            Number of maximizer iterations between each call to the FeatureInducer.
     * @param numFeatureInductions
     *            Maximum number of rounds of feature induction.
     * @param numFeaturesPerFeatureInduction
     *            Maximum number of features to induce at each round of induction.
     * @param trueLabelProbThreshold
     *            If the model's probability of the true Label of an Instance is less than this value, it is added
     *            as an error instance to the {@link FeatureInducer}.
     * @param clusteredFeatureInduction
     *            If true, a separate {@link FeatureInducer} is constructed for each label pair. This can avoid
     *            inducing a disproportionate number of features for a single label.
     * @param trainingProportions
     *            If non-null, train on increasingly larger portions of the data (e.g. [0.2, 0.5, 1.0]). This can
     *            sometimes speed up convergence.
     * @param gainName
     *            The type of {@link FeatureInducer} to use. One of "exp", "grad", or "info" for {@link ExpGain},
     *            {@link GradientGain}, or {@link InfoGain}.
     * @return True if training has converged.
     */
    public boolean trainWithFeatureInduction(InstanceList trainingData, InstanceList validationData,
            InstanceList testingData, TransducerEvaluator eval, int numIterations,
            int numIterationsBetweenFeatureInductions, int numFeatureInductions, int numFeaturesPerFeatureInduction,
            double trueLabelProbThreshold, boolean clusteredFeatureInduction, double[] trainingProportions,
            String gainName) {
        int trainingIteration = 0;
        int numLabels = crf.outputAlphabet.size();

        crf.globalFeatureSelection = trainingData.getFeatureSelection();
        if (crf.globalFeatureSelection == null) {
            // Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
            crf.globalFeatureSelection = new FeatureSelection(trainingData.getDataAlphabet());
            trainingData.setFeatureSelection(crf.globalFeatureSelection);
        }
        // TODO Careful! If validationData and testingData get removed as arguments to this method
        // then the next two lines of work will have to be done somewhere.
        if (validationData != null) {
            validationData.setFeatureSelection(crf.globalFeatureSelection);
        }
        if (testingData != null) {
            testingData.setFeatureSelection(crf.globalFeatureSelection);
        }

        for (int featureInductionIteration = 0; featureInductionIteration < numFeatureInductions; featureInductionIteration++) {
            // Print out some feature information
            logger.info("Feature induction iteration " + featureInductionIteration);

            // Train the CRF
            InstanceList theTrainingData = trainingData;
            if (trainingProportions != null && featureInductionIteration < trainingProportions.length) {
                logger.info("Training on " + trainingProportions[featureInductionIteration]
                        + "% of the data this round.");
                InstanceList[] sampledTrainingData = trainingData.split(new Random(1),
                        new double[] { trainingProportions[featureInductionIteration],
                                1 - trainingProportions[featureInductionIteration] });
                theTrainingData = sampledTrainingData[0];
                theTrainingData.setFeatureSelection(crf.globalFeatureSelection); // xxx necessary?
                logger.info(" which is " + theTrainingData.size() + " instances");
            }
            boolean converged = false;
            if (featureInductionIteration != 0) {
                // Don't train until we have added some features
                converged = this.train(theTrainingData, numIterationsBetweenFeatureInductions);
            }
            trainingIteration += numIterationsBetweenFeatureInductions;

            logger.info("Starting feature induction with " + crf.inputAlphabet.size() + " features.");

            // Create the list of error tokens, for both unclustered and clustered feature induction
            InstanceList errorInstances = new InstanceList(trainingData.getDataAlphabet(),
                    trainingData.getTargetAlphabet());
            // This errorInstances.featureSelection will get examined by FeatureInducer,
            // so it can know how to add "new" singleton features
            errorInstances.setFeatureSelection(crf.globalFeatureSelection);
            ArrayList errorLabelVectors = new ArrayList();
            InstanceList clusteredErrorInstances[][] = new InstanceList[numLabels][numLabels];
            ArrayList clusteredErrorLabelVectors[][] = new ArrayList[numLabels][numLabels];

            for (int i = 0; i < numLabels; i++) {
                for (int j = 0; j < numLabels; j++) {
                    clusteredErrorInstances[i][j] = new InstanceList(trainingData.getDataAlphabet(),
                            trainingData.getTargetAlphabet());
                    clusteredErrorInstances[i][j].setFeatureSelection(crf.globalFeatureSelection);
                    clusteredErrorLabelVectors[i][j] = new ArrayList();
                }
            }

            for (int i = 0; i < theTrainingData.size(); i++) {
                logger.info("instance=" + i);
                Instance instance = theTrainingData.get(i);
                Sequence input = (Sequence) instance.getData();
                Sequence trueOutput = (Sequence) instance.getTarget();
                assert input.size() == trueOutput.size();
                SumLattice lattice = crf.sumLatticeFactory.newSumLattice(crf, input, (Sequence) null,
                        (Transducer.Incrementor) null, (LabelAlphabet) theTrainingData.getTargetAlphabet());
                int prevLabelIndex = 0; // This will put extra error instances in this cluster
                for (int j = 0; j < trueOutput.size(); j++) {
                    Label label = ((LabelSequence) trueOutput).getLabelAtPosition(j);
                    assert label != null;
                    // System.out.println ("Instance="+i+" position="+j+" fv="+lattice.getLabelingAtPosition(j).toString(true));
                    LabelVector latticeLabeling = lattice.getLabelingAtPosition(j);
                    double trueLabelProb = latticeLabeling.value(label.getIndex());
                    int labelIndex = latticeLabeling.getBestIndex();
                    // System.out.println ("position="+j+" trueLabelProb="+trueLabelProb);
                    if (trueLabelProb < trueLabelProbThreshold) {
                        logger.info("Adding error: instance=" + i + " position=" + j + " prtrue=" + trueLabelProb
                                + (label == latticeLabeling.getBestLabel() ? " " : " *") + " truelabel=" + label
                                + " predlabel=" + latticeLabeling.getBestLabel() + " fv="
                                + ((FeatureVector) input.get(j)).toString(true));
                        errorInstances.add(input.get(j), label, null, null);
                        errorLabelVectors.add(latticeLabeling);
                        clusteredErrorInstances[prevLabelIndex][labelIndex].add(input.get(j), label, null, null);
                        clusteredErrorLabelVectors[prevLabelIndex][labelIndex].add(latticeLabeling);
                    }
                    prevLabelIndex = labelIndex;
                }
            }
            logger.info("Error instance list size = " + errorInstances.size());
            if (clusteredFeatureInduction) {
                FeatureInducer[][] klfi = new FeatureInducer[numLabels][numLabels];
                for (int i = 0; i < numLabels; i++) {
                    for (int j = 0; j < numLabels; j++) {
                        // Note that we may see some "impossible" transitions here (like O->I in a OIB model)
                        // because we are using lattice gammas to get the predicted label, not Viterbi.
                        // I don't believe this does any harm, and may do some good.
                        logger.info("Doing feature induction for " + crf.outputAlphabet.lookupObject(i) + " -> "
                                + crf.outputAlphabet.lookupObject(j) + " with "
                                + clusteredErrorInstances[i][j].size() + " instances");
                        if (clusteredErrorInstances[i][j].size() < 20) {
                            logger.info("..skipping because only " + clusteredErrorInstances[i][j].size()
                                    + " instances.");
                            continue;
                        }
                        int s = clusteredErrorLabelVectors[i][j].size();
                        LabelVector[] lvs = new LabelVector[s];
                        for (int k = 0; k < s; k++) {
                            lvs[k] = (LabelVector) clusteredErrorLabelVectors[i][j].get(k);
                        }
                        RankedFeatureVector.Factory gainFactory = null;
                        if (gainName.equals("exp")) {
                            gainFactory = new ExpGain.Factory(lvs, gaussianPriorVariance);
                        } else if (gainName.equals("grad")) {
                            gainFactory = new GradientGain.Factory(lvs);
                        } else if (gainName.equals("info")) {
                            gainFactory = new InfoGain.Factory();
                        }
                        klfi[i][j] = new FeatureInducer(gainFactory, clusteredErrorInstances[i][j],
                                numFeaturesPerFeatureInduction, 2 * numFeaturesPerFeatureInduction,
                                2 * numFeaturesPerFeatureInduction);
                        crf.featureInducers.add(klfi[i][j]);
                    }
                }
                for (int i = 0; i < numLabels; i++) {
                    for (int j = 0; j < numLabels; j++) {
                        logger.info("Adding new induced features for " + crf.outputAlphabet.lookupObject(i) + " -> "
                                + crf.outputAlphabet.lookupObject(j));
                        if (klfi[i][j] == null) {
                            logger.info("...skipping because no features induced.");
                            continue;
                        }
                        // Note that this adds features globally, but not on a per-transition basis
                        klfi[i][j].induceFeaturesFor(trainingData, false, false);
                        if (testingData != null) {
                            klfi[i][j].induceFeaturesFor(testingData, false, false);
                        }
                    }
                }
                klfi = null;
            } else {
                int s = errorLabelVectors.size();
                LabelVector[] lvs = new LabelVector[s];
                for (int i = 0; i < s; i++) {
                    lvs[i] = (LabelVector) errorLabelVectors.get(i);
                }
                RankedFeatureVector.Factory gainFactory = null;
                if (gainName.equals("exp")) {
                    gainFactory = new ExpGain.Factory(lvs, gaussianPriorVariance);
                } else if (gainName.equals("grad")) {
                    gainFactory = new GradientGain.Factory(lvs);
                } else if (gainName.equals("info")) {
                    gainFactory = new InfoGain.Factory();
                }
                FeatureInducer klfi = new FeatureInducer(gainFactory, errorInstances,
                        numFeaturesPerFeatureInduction, 2 * numFeaturesPerFeatureInduction,
                        2 * numFeaturesPerFeatureInduction);
                crf.featureInducers.add(klfi);
                // Note that this adds features globally, but not on a per-transition basis
                klfi.induceFeaturesFor(trainingData, false, false);
                if (testingData != null) {
                    klfi.induceFeaturesFor(testingData, false, false);
                }
                logger.info("CRF4 FeatureSelection now includes " + crf.globalFeatureSelection.cardinality()
                        + " features");
                klfi = null;
            }
            // This is done in CRF4.train() anyway
            // this.setWeightsDimensionAsIn (trainingData);
            //// this.growWeightsDimensionToInputAlphabet ();
        }
        return this.train(trainingData, numIterations - trainingIteration);
    }

    public void setUseHyperbolicPrior(boolean f) {
        usingHyperbolicPrior = f;
    }

    public void setHyperbolicPriorSlope(double p) {
        hyperbolicPriorSlope = p;
    }

    public void setHyperbolicPriorSharpness(double p) {
        hyperbolicPriorSharpness = p;
    }

    public double getUseHyperbolicPriorSlope() {
        return hyperbolicPriorSlope;
    }

    public double getUseHyperbolicPriorSharpness() {
        return hyperbolicPriorSharpness;
    }

    public void setGaussianPriorVariance(double p) {
        gaussianPriorVariance = p;
    }

    public double getGaussianPriorVariance() {
        return gaussianPriorVariance;
    }

    // public int getDefaultFeatureIndex () { return defaultFeatureIndex;}

    public void setUseSparseWeights(boolean b) {
        useSparseWeights = b;
    }
    public boolean getUseSparseWeights() {
        return useSparseWeights;
    }

    /**
     * Sets whether to use the 'some unsupported trick.' The trick is, when training a CRF for which some training
     * has already been done and sparse weights are used, to add a few weights for features that do not occur in
     * the training data.
     *
     * This generally leads to better accuracy at only a small memory cost.
     *
     * @param b
     *            Whether to use the trick.
     */
    public void setUseSomeUnsupportedTrick(boolean b) {
        useSomeUnsupportedTrick = b;
    }

    // Serialization for CRFTrainerByLikelihood

    private static final long serialVersionUID = 1;
    private static final int CURRENT_SERIAL_VERSION = 1;
    static final int NULL_INTEGER = -1; /* Need to check for null pointers. */

    private void writeObject(ObjectOutputStream out) throws IOException {
        int i, size;
        out.writeInt(CURRENT_SERIAL_VERSION);
        // out.writeInt(defaultFeatureIndex);
        out.writeBoolean(usingHyperbolicPrior);
        out.writeDouble(gaussianPriorVariance);
        out.writeDouble(hyperbolicPriorSlope);
        out.writeDouble(hyperbolicPriorSharpness);
        out.writeInt(cachedGradientWeightsStamp);
        out.writeInt(cachedValueWeightsStamp);
        out.writeInt(cachedWeightsStructureStamp);
        out.writeBoolean(printGradient);
        out.writeBoolean(useSparseWeights);
        throw new IllegalStateException("Implementation not yet complete.");
    }

    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        int size, i;
        int version = in.readInt();
        // defaultFeatureIndex = in.readInt();
        usingHyperbolicPrior = in.readBoolean();
        gaussianPriorVariance = in.readDouble();
        hyperbolicPriorSlope = in.readDouble();
        hyperbolicPriorSharpness = in.readDouble();
        printGradient = in.readBoolean();
        useSparseWeights = in.readBoolean();
        throw new IllegalStateException("Implementation not yet complete.");
    }
}
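
A minimal end-to-end usage sketch (not part of the MALLET source above; the file paths, prior variance, and iteration budget are illustrative). It loads a serialized InstanceList of labeled sequence data, builds a linear-chain CRF whose states mirror the label alphabet, trains it with this trainer, and serializes the result:

    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.ObjectOutputStream;

    import cc.mallet.fst.CRF;
    import cc.mallet.fst.CRFTrainerByLabelLikelihood;
    import cc.mallet.types.InstanceList;

    public class TrainCrfByLabelLikelihood {
        public static void main(String[] args) throws Exception {
            // Assumes a saved InstanceList of FeatureVectorSequence/LabelSequence pairs,
            // e.g. produced by a SimpleTagger-style import pipe (path is illustrative).
            InstanceList trainingData = InstanceList.load(new File("train.instances"));

            CRF crf = new CRF(trainingData.getPipe(), null);
            crf.addFullyConnectedStatesForLabels();

            CRFTrainerByLabelLikelihood trainer = new CRFTrainerByLabelLikelihood(crf);
            trainer.setGaussianPriorVariance(10.0); // regularization strength is a tuning choice
            trainer.setUseSparseWeights(true);

            // Fixed iteration budget; alternatively use the proportion schedule:
            // trainer.train(trainingData, 100, new double[] { 0.2, 0.5, 1.0 });
            trainer.train(trainingData, 500);

            // Serialize the trained CRF (note the trainer's own writeObject is unfinished,
            // so serialize the model, not the trainer).
            try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream("crf.model"))) {
                oos.writeObject(crf);
            }
        }
    }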




