cc.mallet.classify.MCMaxEntTrainer
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.classify;
import java.util.logging.*;
import java.util.*;
import java.io.*;
import cc.mallet.classify.Classifier;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.Optimizer;
import cc.mallet.optimize.tests.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelVector;
import cc.mallet.types.Labeling;
import cc.mallet.types.MatrixOps;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Vector;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import cc.mallet.util.Maths;
// Does not currently handle instances that are labeled with distributions
// instead of a single label.
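// A minimal usage sketch (hypothetical variable names; assumes an InstanceList
// named "training" has already been built with a suitable Pipe):
//
//   MCMaxEntTrainer trainer = new MCMaxEntTrainer ();        // default Gaussian prior
//   trainer.setGaussianPriorVariance (1.0);                  // optional: loosen the prior
//   MCMaxEnt classifier = trainer.train (training);
//   Classification c = classifier.classify (someInstance);   // someInstance must share training's Pipe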
/**
* The trainer for a Maximum Entropy classifier.
@author Andrew McCallum [email protected]
*/
public class MCMaxEntTrainer extends ClassifierTrainer<MCMaxEnt> implements Boostable, Serializable //implements CommandOption.ListProviding
{
private static Logger logger = MalletLogger.getLogger(MCMaxEntTrainer.class.getName());
private static Logger progressLogger = MalletProgressMessageLogger.getLogger(MCMaxEntTrainer.class.getName()+"-pl");
int numGetValueCalls = 0;
int numGetValueGradientCalls = 0;
int numIterations = 10;
public static final String EXP_GAIN = "exp";
public static final String GRADIENT_GAIN = "grad";
public static final String INFORMATION_GAIN = "info";
// xxx Why does TestMaximizable fail when this variance is very small?
static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = .1; // note used to be 1
static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;
static final Class DEFAULT_MAXIMIZER_CLASS = LimitedMemoryBFGS.class;
// CPAL
boolean usingMultiConditionalTraining = true;
boolean usingHyperbolicPrior = false;
double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;
Class maximizerClass = DEFAULT_MAXIMIZER_CLASS;
double generativeWeighting = 1.0;
MaximizableTrainer mt;
MCMaxEnt initialClassifier;
// CPAL
static CommandOption.Boolean usingMultiConditionalTrainingOption =
new CommandOption.Boolean (MCMaxEntTrainer.class, "useMCTraining", "true|false", true, true,
"Use MultiConditional Training", null);
static CommandOption.Boolean usingHyperbolicPriorOption =
new CommandOption.Boolean (MCMaxEntTrainer.class, "useHyperbolicPrior", "true|false", false, false,
"Use hyperbolic (close to L1 penalty) prior over parameters", null);
static CommandOption.Double gaussianPriorVarianceOption =
new CommandOption.Double (MCMaxEntTrainer.class, "gaussianPriorVariance", "FLOAT", true, 10.0,
"Variance of the gaussian prior over parameters", null);
static CommandOption.Double hyperbolicPriorSlopeOption =
new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSlope", "FLOAT", true, 0.2,
"Slope of the (L1 penalty) hyperbolic prior over parameters", null);
static CommandOption.Double hyperbolicPriorSharpnessOption =
new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSharpness", "FLOAT", true, 10.0,
"Sharpness of the (L1 penalty) hyperbolic prior over parameters", null);
static final CommandOption.List commandOptions =
new CommandOption.List (
"MCMaximum Entropy Classifier",
new CommandOption[] {
usingHyperbolicPriorOption,
gaussianPriorVarianceOption,
hyperbolicPriorSlopeOption,
hyperbolicPriorSharpnessOption,
usingMultiConditionalTrainingOption, // CPAL
});
public static CommandOption.List getCommandOptionList ()
{
return commandOptions;
}
/*
public MCMaxEntTrainer(Maximizer.ByGradient maximizer)
{
this.maximizerByGradient = maximizer;
this.usingHyperbolicPrior = false;
}
*/
public MCMaxEntTrainer (CommandOption.List col)
{
this.usingHyperbolicPrior = usingHyperbolicPriorOption.value;
this.gaussianPriorVariance = gaussianPriorVarianceOption.value;
this.hyperbolicPriorSlope = hyperbolicPriorSlopeOption.value;
this.hyperbolicPriorSharpness = hyperbolicPriorSharpnessOption.value;
this.usingMultiConditionalTraining = usingMultiConditionalTrainingOption.value;
}
public MCMaxEntTrainer (MCMaxEnt initialClassifier) {
this.initialClassifier = initialClassifier;
}
public MCMaxEntTrainer ()
{
this (false);
}
public MCMaxEntTrainer (boolean useHyperbolicPrior)
{
this.usingHyperbolicPrior = useHyperbolicPrior;
}
/** Constructs a trainer with a parameter to avoid overtraining. 1.0 is
* usually a reasonable default value. */
public MCMaxEntTrainer (double gaussianPriorVariance)
{
this.usingHyperbolicPrior = false;
this.gaussianPriorVariance = gaussianPriorVariance;
}
// CPAL - added this to do MultiConditionalTraining
public MCMaxEntTrainer (double gaussianPriorVariance, boolean useMultiConditionalTraining )
{
this.usingHyperbolicPrior = false;
this.usingMultiConditionalTraining = useMultiConditionalTraining;
this.gaussianPriorVariance = gaussianPriorVariance;
}
public MCMaxEntTrainer (double hyperbolicPriorSlope,
double hyperbolicPriorSharpness)
{
this.usingHyperbolicPrior = true;
this.hyperbolicPriorSlope = hyperbolicPriorSlope;
this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;
}
public Optimizable.ByGradientValue getMaximizableTrainer (InstanceList ilist)
{
if (ilist == null)
return new MaximizableTrainer ();
return new MaximizableTrainer (ilist, null);
}
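// The Optimizable.ByGradientValue returned above exposes the penalized
// log-likelihood and its gradient, so it can be handed to any MALLET
// Optimizer, or to the gradient-checking utilities in
// cc.mallet.optimize.tests if desired.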
/**
* Specifies the maximum number of iterations to run during a single call
* to <code>train</code> or <code>trainWithFeatureInduction</code>. Not
* currently functional.
* @return This trainer
*/
// XXX Since we maximize before using numIterations, this doesn't work.
// Is that a bug? If so, should the default numIterations be higher?
public MCMaxEntTrainer setNumIterations (int i)
{
numIterations = i;
return this;
}
public MCMaxEntTrainer setUseHyperbolicPrior (boolean useHyperbolicPrior)
{
this.usingHyperbolicPrior = useHyperbolicPrior;
return this;
}
/**
* Sets a parameter to prevent overtraining. A smaller variance for the prior
* means that feature weights are expected to hover closer to 0, so extra
* evidence is required to set a higher weight.
* @return This trainer
*/
public MCMaxEntTrainer setGaussianPriorVariance (double gaussianPriorVariance)
{
this.usingHyperbolicPrior = false;
this.gaussianPriorVariance = gaussianPriorVariance;
return this;
}
public MCMaxEntTrainer setHyperbolicPriorSlope(double hyperbolicPriorSlope)
{
this.usingHyperbolicPrior = true;
this.hyperbolicPriorSlope = hyperbolicPriorSlope;
return this;
}
public MCMaxEntTrainer setHyperbolicPriorSharpness (double hyperbolicPriorSharpness)
{
this.usingHyperbolicPrior = true;
this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;
return this;
}
public MCMaxEnt getClassifier () {
return mt.getClassifier();
}
public MCMaxEnt train (InstanceList trainingSet)
{
logger.fine ("trainingSet.size() = "+trainingSet.size());
mt = new MaximizableTrainer (trainingSet, (MCMaxEnt)initialClassifier);
Optimizer maximizer = new LimitedMemoryBFGS(mt);
// CPAL - change the tolerance for large vocab experiments
((LimitedMemoryBFGS)maximizer).setTolerance(.00001); // std is .0001;
maximizer.optimize (); // XXX given the loop below, this seems wrong.
logger.info("MCMaxEnt ngetValueCalls:"+getValueCalls()+"\nMCMaxEnt ngetValueGradientCalls:"+getValueGradientCalls());
// boolean converged;
//
// for (int i = 0; i < numIterations; i++) {
// converged = maximizer.maximize (mt, 1);
// if (converged)
// break;
// else if (evaluator != null)
// if (!evaluator.evaluate (mt.getClassifier(), converged, i, mt.getValue(),
// trainingSet, validationSet, testSet))
// break;
// }
// TestMaximizable.testValueAndGradient (mt);
progressLogger.info("\n"); // progess messages are on one line; move on.
return mt.getClassifier ();
}
/**
* Like the other version of <code>trainWithFeatureInduction</code>, but
* allows some default options to be changed.
*
* @param maxent An initial partially-trained classifier (default <code>null</code>).
* This classifier may be modified during training.
* @param gainName The estimate of gain (log-likelihood increase) we want our chosen
* features to maximize.
* Should be one of <code>MaxEntTrainer.EXP_GAIN</code>,
* <code>MaxEntTrainer.GRADIENT_GAIN</code>, or
* <code>MaxEntTrainer.INFORMATION_GAIN</code> (default <code>EXP_GAIN</code>).
*
* @return The trained <code>MaxEnt</code> classifier
*/
/*
public Classifier trainWithFeatureInduction (InstanceList trainingData,
InstanceList validationData,
InstanceList testingData,
ClassifierEvaluating evaluator,
MCMaxEnt maxent,
int totalIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
String gainName) {
// XXX This ought to be a parameter, except that setting it to true can
// crash training ("Jump too small").
boolean saveParametersDuringFI = false;
Alphabet inputAlphabet = trainingData.getDataAlphabet();
Alphabet outputAlphabet = trainingData.getTargetAlphabet();
if (maxent == null)
maxent = new MCMaxEnt(trainingData.getPipe(),
new double[(1+inputAlphabet.size()) * outputAlphabet.size()]);
int trainingIteration = 0;
int numLabels = outputAlphabet.size();
// Initialize feature selection
FeatureSelection globalFS = trainingData.getFeatureSelection();
if (globalFS == null) {
// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
globalFS = new FeatureSelection (trainingData.getDataAlphabet());
trainingData.setFeatureSelection (globalFS);
}
if (validationData != null) validationData.setFeatureSelection (globalFS);
if (testingData != null) testingData.setFeatureSelection (globalFS);
maxent = new MCMaxEnt(maxent.getInstancePipe(), maxent.getParameters(), globalFS);
// Run feature induction
for (int featureInductionIteration = 0;
featureInductionIteration < numFeatureInductions;
featureInductionIteration++) {
// Print out some feature information
logger.info ("Feature induction iteration "+featureInductionIteration);
// Train the model a little bit. We don't care whether it converges; we
// execute all feature induction iterations no matter what.
if (featureInductionIteration != 0) {
// Don't train until we have added some features
setNumIterations(numIterationsBetweenFeatureInductions);
maxent = (MCMaxEnt)this.train (trainingData, validationData, testingData, evaluator,
maxent);
}
trainingIteration += numIterationsBetweenFeatureInductions;
logger.info ("Starting feature induction with "+(1+inputAlphabet.size())+
" features over "+numLabels+" labels.");
// Create the list of error tokens
InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
trainingData.getTargetAlphabet());
// This errorInstances.featureSelection will get examined by FeatureInducer,
// so it can know how to add "new" singleton features
errorInstances.setFeatureSelection (globalFS);
List errorLabelVectors = new ArrayList(); // these are length-1 vectors
for (int i = 0; i < trainingData.size(); i++) {
Instance instance = trainingData.get(i);
FeatureVector inputVector = (FeatureVector) instance.getData();
Label trueLabel = (Label) instance.getTarget();
// Having trained using just the current features, see how we classify
// the training data now.
Classification classification = maxent.classify(instance);
if (!classification.bestLabelIsCorrect()) {
errorInstances.add(inputVector, trueLabel, null, null);
errorLabelVectors.add(classification.getLabelVector());
}
}
logger.info ("Error instance list size = "+errorInstances.size());
int s = errorLabelVectors.size();
LabelVector[] lvs = new LabelVector[s];
for (int i = 0; i < s; i++) {
lvs[i] = (LabelVector)errorLabelVectors.get(i);
}
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals (EXP_GAIN))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals(GRADIENT_GAIN))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals(INFORMATION_GAIN))
gainFactory = new InfoGain.Factory ();
else
throw new IllegalArgumentException("Unsupported gain name: "+gainName);
FeatureInducer klfi =
new FeatureInducer (gainFactory,
errorInstances,
numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction);
// Note that this adds features globally, but not on a per-transition basis
klfi.induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi.induceFeaturesFor (testingData, false, false);
logger.info ("MCMaxEnt FeatureSelection now includes "+globalFS.cardinality()+" features");
klfi = null;
double[] newParameters = new double[(1+inputAlphabet.size()) * outputAlphabet.size()];
// XXX (Executing this block often causes an error during training; I don't know why.)
if (saveParametersDuringFI) {
// Keep current parameter values
// XXX This relies on the implementation detail that the most recent features
// added to an Alphabet get the highest indices.
// Count parameters per output label
int oldParamCount = maxent.parameters.length / outputAlphabet.size();
int newParamCount = 1+inputAlphabet.size();
// Copy params into the proper locations
for (int i = 0; i < outputAlphabet.size(); i++)
System.arraycopy (maxent.parameters, i*oldParamCount,
newParameters, i*newParamCount, oldParamCount);
// ...
Iterator<Instance> iter = trainingList.iterator();
//int ii = 0;
// Normalize the parameters to be per-class multinomials
double probs[][] = new double[scores.length][numFeatures];
double lprobs[][] = new double[scores.length][numFeatures];
for (int si = 0; si < scores.length; si++) {
double sum = 0, max = MatrixOps.max (parameters);
for (int fi = 0; fi < numFeatures; fi++) {
// TODO Strongly consider some smoothing here. What happens when all parameters are zero?
// Oh, this should be no problem, because exp(0) == 1.
probs[si][fi] = Math.exp(parameters[si*numFeatures+fi] - max);
sum += probs[si][fi];
}
assert (sum > 0);
for (int fi = 0; fi < numFeatures; fi++) {
probs[si][fi] /= sum;
lprobs[si][fi] = Math.log(probs[si][fi]);
}
}
while (iter.hasNext()) {
Instance instance = iter.next();
double instanceWeight = trainingList.getInstanceWeight(instance);
Labeling labeling = instance.getLabeling ();
//System.out.println("L Now "+inputAlphabet.size()+" regular features.");
this.theClassifier.getClassificationScores (instance, scores);
FeatureVector fv = (FeatureVector) instance.getData ();
int li = labeling.getBestIndex();
value = - (instanceWeight * Math.log (scores[li]));
if(Double.isNaN(value)) {
logger.fine ("MCMaxEntTrainer: Instance " + instance.getName() +
"has NaN value. log(scores)= " + Math.log(scores[li]) +
" scores = " + scores[li] +
" has instance weight = " + instanceWeight);
}
if (Double.isInfinite(value)) {
logger.warning ("Instance "+instance.getSource() + " has infinite value; skipping value and gradient");
cachedValue -= value;
cachedValueStale = false;
return -value;
// continue;
}
cachedValue += value;
// CPAL - this is a loop over classes and their scores
// - we compute the gradient by taking the dot product of the feature value
// and the probability of the class
for (int si = 0; si < scores.length; si++) {
if (scores[si] == 0) continue;
assert (!Double.isInfinite(scores[si]));
// CPAL - accumulating the current classifiers expectation of the feature
// vector counts for this class label
// Current classifier has expectation over class label, not over feature vector
MatrixOps.rowPlusEquals (cachedGradient, numFeatures,
si, fv, -instanceWeight * scores[si]);
cachedGradient[numFeatures*si + defaultFeatureIndex] += (-instanceWeight * scores[si]);
}
// CPAL - if we wish to do multiconditional training we need another term for this accumulated
// expectation
if (usingMultiConditionalTraining) {
// need something analogous to this
// this.theClassifier.getClassificationScores (instance, scores);
// this.theClassifier.getFeatureDistributions (instance,
// Note: li is the "label" for this instance
// Get the sum of the feature vector
// which is the number of counts for the document if we use that as input
double Ncounts = MatrixOps.sum(fv);
// CPAL - get the additional term for the value of our - log probability
// - this computation amounts to the dot product of the feature vector and the probability vector
cachedValue -= (instanceWeight * fv.dotProduct(lprobs[li]));
// CPAL - get the model expectation over features for the given class
for (int fi = 0; fi < numFeatures; fi++) {
//if(parameters[numFeatures*li + fi] != 0) {
// MatrixOps.rowPlusEquals(cachedGradient, numFeatures,li,fv,))
cachedGradient[numFeatures*li + fi] += (-instanceWeight * Ncounts * probs[li][fi]);
// }
}
}
}
//logger.info ("-Expectations:"); cachedGradient.print();
// Incorporate prior on parameters
if (usingHyperbolicPrior) {
for (int li = 0; li < numLabels; li++)
for (int fi = 0; fi < numFeatures; fi++)
cachedValue += (hyperbolicPriorSlope / hyperbolicPriorSharpness
* Math.log (Maths.cosh (hyperbolicPriorSharpness * parameters[li *numFeatures + fi])));
} else {
for (int li = 0; li < numLabels; li++)
for (int fi = 0; fi < numFeatures; fi++) {
double param = parameters[li*numFeatures + fi];
cachedValue += param * param / (2 * gaussianPriorVariance);
}
}
cachedValue *= -1.0; // MAXIMIZE, NOT MINIMIZE
cachedValueStale = false;
progressLogger.info ("Value (loglikelihood) = "+cachedValue);
}
return cachedValue;
}
// CPAL first get value, then gradient
public void getValueGradient (double [] buffer)
{
// Gradient is (constraint - expectation - parameters/gaussianPriorVariance)
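// Per (label li, feature fi) cell of the flattened matrix, this works out to
//   gradient[li*numFeatures + fi] =
//     empiricalCount(li,fi) - modelExpectation(li,fi) - parameters[li*numFeatures + fi] / gaussianPriorVariance
// where the empirical counts live in "constraints" and the model expectations
// were accumulated (negated) into cachedGradient by getValue().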
if (cachedGradientStale) {
numGetValueGradientCalls++;
if (cachedValueStale)
// This will fill in the cachedGradient with the "-expectation"
getValue ();
// cachedGradient contains the negative expectations
// expectations are model expectations and constraints are
// empirical expectations
MatrixOps.plusEquals (cachedGradient, constraints);
// CPAL - we need a second copy of the constraints
// - actually, we only want this for the feature values
// - I've moved this up into getValue
//if (usingMultiConditionalTraining){
// MatrixOps.plusEquals(cachedGradient, constraints);
//}
// Incorporate prior on parameters
if (usingHyperbolicPrior) {
throw new UnsupportedOperationException ("Hyperbolic prior not yet implemented.");
}
else {
MatrixOps.plusEquals (cachedGradient, parameters,
-1.0 / gaussianPriorVariance);
}
// A parameter may be set to -infinity by an external user.
// We set gradient to 0 because the parameter's value can
// never change anyway and it will mess up future calculations
// on the matrix, such as norm().
MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);
// Set to zero all the gradient dimensions that are not among the selected features
if (perLabelFeatureSelection == null) {
for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
MatrixOps.rowSetAll (cachedGradient, numFeatures,
labelIndex, 0.0, featureSelection, false);
} else {
for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
MatrixOps.rowSetAll (cachedGradient, numFeatures,
labelIndex, 0.0,
perLabelFeatureSelection[labelIndex], false);
}
cachedGradientStale = false;
}
assert (buffer != null && buffer.length == parameters.length);
System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
}
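// Given a = -log p and b = -log q, returns -log (p + q), i.e. -log (exp(-a) + exp(-b)),
// computed stably by factoring out the larger of the two probabilities
// (the log-sum-exp trick in negative-log space).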
public double sumNegLogProb (double a, double b)
{
if (a == Double.POSITIVE_INFINITY && b == Double.POSITIVE_INFINITY)
return Double.POSITIVE_INFINITY;
else if (a > b)
return b - Math.log (1 + Math.exp(b-a));
else
return a - Math.log (1 + Math.exp(a-b));
}
}
}