cc.mallet.optimize.StochasticMetaAscent

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

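Below is the full source of StochasticMetaAscent, MALLET's batch-wise optimizer based on Schraudolph's Stochastic Meta-Descent (SMD), adapted here for maximization. For each batch, every parameter i keeps its own gain (learning rate): the gain is scaled by max(0.5, 1 - mu * g[i] * v[i]), the parameter is stepped by -gain[i] * g[i] (g being the sign-flipped, descent-direction gradient), and the trace is updated as v[i] = LAMBDA * v[i] - gain[i] * (g[i] + LAMBDA * (Hv)[i]), where Hv is a finite-difference Hessian-vector product (or is approximated by the trace itself when the Hessian is disabled).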
package cc.mallet.optimize;

import java.util.Arrays;
import java.util.logging.Logger;
import java.text.DecimalFormat;

import cc.mallet.optimize.Optimizer;
import cc.mallet.types.MatrixOps;
import cc.mallet.util.MalletLogger;


/**
 * @author Greg Druck
 * @author Kedar Bellare
 */
public class StochasticMetaAscent implements Optimizer.ByBatches {

	private static Logger logger = 
		MalletLogger.getLogger(StochasticMetaAscent.class.getName());
	
  // Maximum number of passes over all batches.
  private final int MAX_ITER = 200;
  // Decay factor for the gradient trace (1.0 = no decay).
  private final double LAMBDA = 1.0;
  // Relative tolerance on the change in value used to declare convergence.
  private final double TOLERANCE = 0.01;
  // Small constant that stabilizes the convergence test near zero.
  private final double EPS = 1e-10;

  // Meta learning rate used to adapt the per-parameter gains.
  private double mu = 0.1;
  private int totalIterations = 0;
  // Initial per-parameter step size.
  private double eta_init = 0.03;
  // If true, adapt the trace with a finite-difference Hessian-vector product.
  private boolean useHessian = true;
  // Per-parameter learning rates (gains).
  private double[] gain;
  // Exponentially decayed trace of previous parameter updates.
  private double[] gradientTrace;
	Optimizable.ByBatchGradient maxable = null;
	
	public StochasticMetaAscent (Optimizable.ByBatchGradient maxable) {
		this.maxable = maxable;
	}

  public void setInitialStep(double step) {
    eta_init = step;
  }
  
  public void setMu(double m) {
    mu = m;
  }
  
  public void setUseHessian(boolean flag) {
    useHessian = flag;
  }
  
	public boolean optimize(int numBatches, int[] batchAssignments) {
		return optimize(MAX_ITER,numBatches,batchAssignments);
	}

	public boolean optimize(int numIterations, int numBatches, int[] batchAssignments) {
		
		int numParameters = maxable.getNumParameters();
		double[] parameters = new double[numParameters];
		double[] gradient = new double[numParameters];
    double[] hessianProduct = new double[numParameters];
		
		// Only initialize these if they are null
		// in case someone wants to optimize a 
		// few iterations at a time.  
		if (gain == null) {
      System.err.println ("StochasticMetaAscent: initialStep="+eta_init+"  metaStep="+mu);
      gain = new double[numParameters];
      Arrays.fill(gain, eta_init);
      gradientTrace = new double[numParameters];
		}

    maxable.getParameters(parameters);
		
		for (int iteration = 0; iteration < numIterations; iteration++) {
      double oldApproxValue = 0;
			double approxValue = 0;
			for (int batch = 0; batch < numBatches; batch++) {
				logger.info("Iteration " + (totalIterations + iteration) + ", batch " + batch + " of " + numBatches);
				
				// Get current parameters
				maxable.getParameters(parameters);

				// Update value and gradient for the current batch
        double initialValue = maxable.getBatchValue (batch, batchAssignments);
        oldApproxValue += initialValue;

        if (Double.isNaN (initialValue)) {
          throw new IllegalArgumentException ("NaN in value computation.  Probably you need to reduce initialStep or metaStep.");
        }

        maxable.getBatchValueGradient(gradient,batch,batchAssignments);

        // The code below was originally written for stochastic meta
        // descent.  Since we are maximizing, flip the sign of the
        // gradient so that the descent-style updates below perform
        // ascent on the original objective.
        MatrixOps.timesEquals(gradient, -1);

        if (useHessian) {
          computeHessianProduct(maxable, parameters, batch, batchAssignments, gradient, gradientTrace, hessianProduct);
        }

        reportOnVec ("x", parameters);
        reportOnVec ("step", gain);
        reportOnVec ("grad", gradient);
        reportOnVec ("trace", gradientTrace);
          
				// Update learning rates for individual parameters
				for (int index = 0; index < numParameters; index++) {
          // for the first iteration, this will just be the initial step
          // since gradientTrace will be all zeros
					gain[index] *= Math.max(0.5, 1 - mu * gradient[index] * gradientTrace[index]);
          
					// adjust parameters based on direction
					parameters[index] -= gain[index] * gradient[index];
					
          if (useHessian) {
            // Update the trace using the finite-difference
            // Hessian-vector product.
            gradientTrace[index] = LAMBDA * gradientTrace[index] - gain[index] * 
						  (gradient[index] + LAMBDA * hessianProduct[index]);
          }
          else {
            // Without the Hessian product, approximate it by the
            // trace itself.
            gradientTrace[index] = LAMBDA * gradientTrace[index] - gain[index] * 
            (gradient[index] + LAMBDA * gradientTrace[index]);
          }
				}
				
				// Set new parameters
				maxable.setParameters(parameters);

        double finalValue = maxable.getBatchValue (batch, batchAssignments);
        approxValue += finalValue;

        logger.info ("StochasticMetaAscent: initial value: "+initialValue+"  final value:"+finalValue);
      }

      logger.info("StochasticMetaDescent: Value at iteration (" + (totalIterations + iteration) + ")= " + approxValue);
      
      // converge criteria from GradientAscent and LimitedMemoryBFGS
      if (2.0*Math.abs(approxValue-oldApproxValue) <= 
        TOLERANCE*(Math.abs(approxValue)+Math.abs(oldApproxValue)+EPS)) {
        logger.info ("Stochastic Meta Ascent: Value difference "
            +Math.abs(approxValue-oldApproxValue)
            +" below " + "tolerance; saying converged.");
        totalIterations += iteration;
        return true;
      }

      oldApproxValue = approxValue;
		}
    
    totalIterations += numIterations;
		return false;
	}

  private void reportOnVec (String s, double[] v)
  {
    DecimalFormat f = new DecimalFormat ("0.####");
    System.out.println ("StochasticMetaAscent: "+s+":"+
            "  min "+ f.format(MatrixOps.min (v)) +
            "  max "+ f.format(MatrixOps.max (v)) +
            "  mean "+ f.format(MatrixOps.mean (v)) +
            "  2norm "+ f.format(MatrixOps.twoNorm (v)) +
            "  abs-norm "+ f.format(MatrixOps.absNorm (v))
    );
  }

  // Compute a finite-difference approximation of the Hessian-vector
  // product H*v (called with the gradient trace as v).
  private void computeHessianProduct(Optimizable.ByBatchGradient maxable, 
      double[] parameters, int batchIndex, int[] batchAssignments, 
      double[] currentGradient, double[] vector, double[] result) {
    
    int numParameters = maxable.getNumParameters();
    double eps = 1.0e-6;
    double[] epsGradient = new double[numParameters];
    double[] oldParameters = new double[numParameters];
    
    // adjust parameters by (eps * vector) and recompute gradient
    System.arraycopy(parameters,0,oldParameters,0,numParameters);

    MatrixOps.plusEquals(parameters, vector, eps);
    maxable.setParameters(parameters);
    maxable.getBatchValueGradient(epsGradient, batchIndex, batchAssignments);
    
    // restore old parameters
    maxable.setParameters(oldParameters);
    
    // Calculate the Hessian-vector product.  currentGradient has already
    // been negated (descent direction), while epsGradient is the raw
    // ascent gradient at the perturbed point, so negate it to match:
    // Hv ~= (g(x + eps*v) - g(x)) / eps.
    for (int index = 0; index < result.length; index++) {
      result[index] = (-epsGradient[index] - currentGradient[index]) / eps;
    }
  }

}
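For orientation, here is a minimal, hypothetical usage sketch: a one-parameter quadratic objective that implements Optimizable.ByBatchGradient and is maximized with StochasticMetaAscent. The class SmaExample and its toy objective are illustrative inventions, not part of MALLET; the interface methods are the ones the optimizer above actually calls, plus the single-parameter accessors MALLET's Optimizable declares.

import java.util.Random;

import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.StochasticMetaAscent;

// Toy objective: maximize f(theta) = -sum_i (theta - x_i)^2, whose
// maximizer is the mean of the data.  Instances are assigned to batches
// through batchAssignments, as the optimizer above expects.
public class SmaExample implements Optimizable.ByBatchGradient {

  private final double[] data;
  private double theta = 0.0;

  public SmaExample(double[] data) { this.data = data; }

  public int getNumParameters() { return 1; }
  public void getParameters(double[] buffer) { buffer[0] = theta; }
  public double getParameter(int index) { return theta; }
  public void setParameters(double[] params) { theta = params[0]; }
  public void setParameter(int index, double value) { theta = value; }

  // Sum of the objective over the instances assigned to this batch.
  public double getBatchValue(int batchIndex, int[] batchAssignments) {
    double value = 0;
    for (int i = 0; i < data.length; i++)
      if (batchAssignments[i] == batchIndex)
        value -= (theta - data[i]) * (theta - data[i]);
    return value;
  }

  // Ascent gradient of the batch objective with respect to theta.
  public void getBatchValueGradient(double[] buffer, int batchIndex, int[] batchAssignments) {
    buffer[0] = 0;
    for (int i = 0; i < data.length; i++)
      if (batchAssignments[i] == batchIndex)
        buffer[0] -= 2 * (theta - data[i]);
  }

  public static void main(String[] args) {
    Random rand = new Random(0);
    double[] data = new double[100];
    for (int i = 0; i < data.length; i++)
      data[i] = 3.0 + rand.nextGaussian();  // true optimum near 3.0

    int numBatches = 10;
    int[] batchAssignments = new int[data.length];
    for (int i = 0; i < data.length; i++)
      batchAssignments[i] = i % numBatches;  // round-robin assignment

    SmaExample example = new SmaExample(data);
    StochasticMetaAscent optimizer = new StochasticMetaAscent(example);
    optimizer.setInitialStep(0.01);
    optimizer.setMu(0.1);
    boolean converged = optimizer.optimize(50, numBatches, batchAssignments);
    System.out.println("converged=" + converged + "  theta=" + example.getParameter(0));
  }
}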



