package cmu.arktweetnlp.impl;

import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.optimization.DiffFunction;
import gnu.trove.set.hash.THashSet;

import java.util.*;


/**
 * Class implementing the Orthant-Wise Limited-memory Quasi-Newton
 * algorithm (OWL-QN). OWL-QN is a numerical optimization procedure for
 * finding the optimum of an objective of the form smooth function plus
 * L1-norm of the parameters. It has been used for training log-linear
 * models (such as logistic regression) with L1-regularization. The
 * algorithm is described in "Scalable training of L1-regularized
 * log-linear models" by Galen Andrew and Jianfeng Gao. This
 * implementation includes built-in capacity to train logistic regression
 * or least-squares models with L1 regularization. It is also possible to
 * use OWL-QN to optimize any arbitrary smooth convex loss plus L1
 * regularization by defining the function and its gradient using the
 * supplied "DifferentiableFunction" class, and passing an instance of
 * the function to the OWLQN object. For more information, please read
 * the included file README.txt. Also included in the distribution are
 * the ICML paper and slide presentation.
 *
 * Significant portions of this code are taken from
 * Galen Andrew's implementation.
 *
 * @author Michel Galley
 * 
 * (NOTE FROM BRENDAN 2012-08-14: What license did Stanford release this as?  GPL ??)
 * 
 * modified by Michael Heilman ([email protected]) 
 * -allow for bias/intercept parameters that shouldn't be penalized
 * -make outside API calls easier (11/9/2010)
 * -removed lots of extraneous stuff from the stanford API
 * 
 * modified by Nathan Schneider ([email protected])
 * - support for constrained optimization with the projected gradient method
 * (see {@link #setConstrained(boolean)})
 * 
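 * <p>Illustrative usage sketch ({@code MyLoss} is a hypothetical implementation
 * of Stanford's {@code DiffFunction} supplying the smooth part of the objective
 * and its gradient; {@code data} and {@code numFeatures} are likewise placeholders):
 * <pre>
 *   DiffFunction loss = new MyLoss(data);     // valueAt() and derivativeAt()
 *   OWLQN optimizer = new OWLQN();
 *   double[] init = new double[numFeatures];  // starting point, e.g. all zeros
 *   double l1weight = 1.0;                    // L1 regularization strength
 *   double tol = 1e-5;                        // convergence tolerance
 *   int m = 10;                               // L-BFGS memory parameter
 *   double[] weights = optimizer.minimize(loss, init, l1weight, tol, m);
 * </pre>
 * 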
 */
public class OWLQN {

	private int maxIters = Integer.MAX_VALUE;
	
	/** Whether to use the "projected gradient" method to require weights to be 
	 * nonnegative and sum to 1. The projection operation is implemented in 
	 * {@link #project(double[])}.
	 */
	private static boolean constrained = false;

	interface TerminationCriterion {
		double getValue(OptimizerState state, StringBuilder out);
	}
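
	/** Stops when the objective improvement, averaged over the last numItersToAvg
	 * iterations and taken relative to the current objective magnitude, falls
	 * below the convergence tolerance passed to minimize(). */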

	static class RelativeMeanImprovementCriterion implements TerminationCriterion {
		int numItersToAvg;
		Queue<Double> prevVals;

		RelativeMeanImprovementCriterion() {
			this(10);
		}

		RelativeMeanImprovementCriterion(int numItersToAvg) {
			this.numItersToAvg = numItersToAvg;
			this.prevVals = new LinkedList<Double>();
		}

		public double getValue(OptimizerState state, StringBuilder out) {

			double retVal = Double.POSITIVE_INFINITY;

			if (prevVals.size() >= numItersToAvg) {
				double prevVal = prevVals.peek();
				if (prevVals.size() == numItersToAvg) prevVals.poll();
				double averageImprovement = (prevVal - state.getValue()) / prevVals.size();
				double relAvgImpr = averageImprovement / Math.abs(state.getValue());
				String relAvgImprStr = String.format("%.4e",relAvgImpr);
				out.append("  (").append(relAvgImprStr).append(") ");
				retVal = relAvgImpr;
			} else {
				out.append("  (wait for "+numItersToAvg+" iters) ");
			}

			prevVals.offer(state.getValue());
			return retVal;
		}
	} // end static class RelativeMeanImprovementCriterion


	boolean quiet;
	boolean responsibleForTermCrit;
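
	/** Indices of parameters (e.g., bias/intercept weights) that should not
	 * receive the L1 penalty; consulted in OptimizerState.evalL1(). */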

	public static Set<Integer> biasParameters = new THashSet<Integer>();

	TerminationCriterion termCrit;

	public interface WeightsPrinter {
		public void printWeights();
	}
	
	WeightsPrinter printer;
	
	
	public OWLQN(boolean quiet) {
		this.quiet = quiet;
		this.termCrit = new RelativeMeanImprovementCriterion();
		this.responsibleForTermCrit = true;
	}

	public OWLQN() {
		this(false);
	}

	public OWLQN(TerminationCriterion termCrit, boolean quiet) {
		this.quiet = quiet;
		this.termCrit = termCrit;
		this.responsibleForTermCrit = false;
	}

	/* (non-Javadoc)
	 * @see edu.cmu.cs.ark.tdf.IOptimizer#setQuiet(boolean)
	 */
	public void setQuiet(boolean q) {
		quiet = q;
	}

	/*
	public void minimize(DiffFunction function, double[] initial) {
		minimize(function, initial, 1.0);
	}

	public void minimize(DiffFunction function, double[] initial, double l1weight) {
		minimize(function, initial, l1weight, 1e-5);
	}

	public void minimize(DiffFunction function, double[] initial, double l1weight, double tol) {
		minimize(function, initial, l1weight, tol, 10);
	}
	*/
	
	
	/* (non-Javadoc)
	 * @see edu.cmu.cs.ark.tdf.IOptimizer#minimize(edu.stanford.nlp.optimization.DiffFunction, double[], double, double, int)
	 */
	public double[] minimize(DiffFunction function, double[] initial, double l1weight, double tol, int m) {

		OptimizerState state = new OptimizerState(function, initial, m, l1weight, quiet);

		if (!quiet) {
			System.err.printf("Optimizing function of %d variables with OWL-QN parameters:\n", state.dim);
			System.err.printf("   l1 regularization weight: %f.\n", l1weight);
			System.err.printf("   L-BFGS memory parameter (m): %d\n", m);
			System.err.printf("   Convergence tolerance: %f\n\n", tol);
			System.err.printf("Iter    n:\tnew_value\tdf\t(conv_crit)\tline_search\n");
			System.err.printf("Iter    0:\t%.4e\t\t(***********)\t", state.value);
		}

		StringBuilder buf = new StringBuilder();
		termCrit.getValue(state, buf);

		for (int i = 0; i < maxIters; i++) {
			buf.setLength(0);

			state.updateDir();
			state.backTrackingLineSearch();

			double termCritVal = termCrit.getValue(state, buf);
			if (!quiet) {
				System.err.printf("Iter %4d:\t%.4e\t%s\t", state.iter, state.getValue(), buf.toString());
				if (printer != null) printer.printWeights();
			}

			//Break (rather than shift) so that state.newX still holds the latest
			//accepted point: on convergence, at the iteration limit, or when the
			//line search did not move the parameters (shift() would then put a
			//zero in roList and cause a divide by zero in mapDirByInverseHessian();
			//see the comments there).
			if (termCritVal < tol || i + 1 >= maxIters || Arrays.equals(state.x, state.newX)) {
				break;
			}

			state.shift();
		}

		return state.newX;
	}

	public void setMaxIters(int maxIters) {
		this.maxIters = maxIters;
	}

	public void setWeightsPrinter(WeightsPrinter printer) {
		this.printer = printer;
	}

	public static boolean isConstrained() {
		return constrained;
	}

	public static void setConstrained(boolean constrained) {
		OWLQN.constrained = constrained;
	}

	/* Projection of a weight vector onto the constrained subspace
	   K = {x : x_i ≥ 0 for all i, Σ_i x_i = 1}, used by the projected
	   gradient method (see setConstrained(boolean)):

	   Given: a real vector c to be projected into a linearly-constrained subspace
			I ← ∅	// index set
			x ← c
			do
				Compute x' = P_V_I(x): x'_i =
				        0 if i ∈ I,
				        x_i - (1/(dim(c)-dim(I))) * (Σ_j { x_j } - 1) otherwise
				if x' ≥ 0: return x'
				I ← I ∪ {i | x'_i < 0}
				x ← P_X_I(x'): i.e. x' but with any negative components replaced by 0
			repeat
		Output: projection of c into constrained subspace K, i.e. P_K(c)
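
	   Worked example (illustrative): projecting c = (0.5, 0.8, -0.1):
			pass 1: sum = 1.2, adjustment = 0.2/3 ≈ 0.0667,
					x' ≈ (0.4333, 0.7333, -0.1667); x'_3 < 0, so I = {3} and x ≈ (0.4333, 0.7333, 0)
			pass 2: sum ≈ 1.1667, adjustment ≈ 0.1667/2 ≈ 0.0833,
					x' = (0.35, 0.65, 0) ≥ 0, so P_K(c) = (0.35, 0.65, 0): nonnegative, sums to 1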
		
	*/
	public static double[] project(double[] c) {
		Set<Integer> indexSet = new THashSet<Integer>();
		double[] x = c;
		while (true) {
			double sum = 0.0;
			for (double xj : x) {
				sum += xj;
			}
			final double adjustmentTerm = (sum-1.0)/(c.length - indexSet.size());
			boolean nonnegative = true;
			for (int i=0; i < x.length; i++) {
				if (indexSet.contains(i)) {
					x[i] = 0.0;	// x'_i = 0 for i in I
				} else {
					x[i] -= adjustmentTerm;	// x' = P_V_I(x)
					if (x[i] < 0.0) {	// I <- I u {i | x'_i < 0}; x <- P_X_I(x')
						nonnegative = false;
						indexSet.add(i);
						x[i] = 0.0;
					}
				}
			}
			if (nonnegative) return x;
		}
	}

	static class OptimizerState {

		double[] x, grad, newX, newGrad, dir;
		double[] steepestDescDir;

		LinkedList<double[]> sList = new LinkedList<double[]>();
		LinkedList<double[]> yList = new LinkedList<double[]>();
		LinkedList<Double> roList = new LinkedList<Double>();

		double[] alphas;
		double value;
		int iter, m;
		int dim;
		DiffFunction func;
		double l1weight;
		boolean quiet;

		//mheilman: I added this for debugging
		private String arrayToString(double[] arr) {
			String res = "";
			for (int i=0; i < arr.length; i++) {
				if (i > 0) res += "\t";
				res += arr[i];
			}
			return res;
		}

		//mheilman: I added this for debugging the updateDir() method.
		private void printStateValues() {
			System.err.println("\nSLIST:");
			for (int i=0; i < sList.size(); i++) {
				System.err.println(arrayToString(sList.get(i)));
			}
			System.err.println("YLIST:");
			for (int i=0; i < yList.size(); i++) {
				System.err.println(arrayToString(yList.get(i)));
			}
			System.err.println("ROLIST:");
			for (int i=0; i < roList.size(); i++) {
				System.err.println(roList.get(i));
			}
			System.err.println();
		}

		void updateDir() {
			makeSteepestDescDir();
			mapDirByInverseHessian();
			fixDirSigns();
		}

		void mapDirByInverseHessian() {
			int count = sList.size();

			if (count != 0) {
				for (int i = count - 1; i >= 0; i--) {
					//mheilman: The program will try to divide by zero here unless there is a check
					//that the parameters change at each iteration.  See comments in the minimize() method.
					//A roList value is the inner product of the change in the gradient
					//and the change in parameters between the current and last iterations.
					//See the discussion of L-BFGS in Nocedal and Wright's Numerical Optimization book
					//(though I think that defines rho as the multiplicative inverse of what is here).
					alphas[i] = -ArrayMath.innerProduct(sList.get(i), dir) / roList.get(i);
					ArrayMath.addMultInPlace(dir, yList.get(i), alphas[i]);
				}

				double[] lastY = yList.get(count - 1);
				double yDotY = ArrayMath.innerProduct(lastY, lastY);
				double scalar = roList.get(count - 1) / yDotY;
				ArrayMath.multiplyInPlace(dir, scalar);

				for (int i = 0; i < count; i++) {
					double beta = ArrayMath.innerProduct(yList.get(i), dir) / roList.get(i);
					ArrayMath.addMultInPlace(dir, sList.get(i), -alphas[i] - beta);
				}
			}
		}

		void makeSteepestDescDir() {
			if (l1weight == 0) {
				ArrayMath.multiplyInto(dir, grad, -1);
			} else {
				//steepest descent direction for the L1-penalized objective,
				//using the pseudo-gradient at x
				for (int i=0; i < dim; i++) {
					if (x[i] < 0) {
						dir[i] = -grad[i] + l1weight;
					} else if (x[i] > 0) {
						dir[i] = -grad[i] - l1weight;
					} else {
						if (grad[i] < -l1weight) {
							dir[i] = -grad[i] - l1weight;
						} else if (grad[i] > l1weight) {
							dir[i] = -grad[i] + l1weight;
						} else {
							dir[i] = 0;
						}
					}
				}
			}

			steepestDescDir = dir.clone(); // deep copy needed
		}

		void fixDirSigns() {
			if (l1weight > 0) {
				//zero any components of the search direction that disagree in sign
				//with the steepest descent direction (the orthant-wise constraint)
				for (int i = 0; i < dim; i++) {
					if (dir[i] * steepestDescDir[i] <= 0) {
						dir[i] = 0;
					}
				}
			}
		}

		double dirDeriv() {
			if (l1weight == 0) {
				return ArrayMath.innerProduct(dir, grad);
			} else {
				double val = 0.0;
				for (int i = 0; i < dim; i++) {
					if (dir[i] != 0) {
						if (x[i] < 0) {
							val += dir[i] * (grad[i] - l1weight);
						} else if (x[i] > 0) {
							val += dir[i] * (grad[i] + l1weight);
						} else if (dir[i] < 0) {
							val += dir[i] * (grad[i] - l1weight);
						} else if (dir[i] > 0) {
							val += dir[i] * (grad[i] + l1weight);
						}
					}
				}
				return val;
			}
		}

		private boolean getNextPoint(double alpha) {
			ArrayMath.addMultInto(newX, x, dir, alpha);
			/*if (OWLQN.isConstrained()) newX = OWLQN.projectWeights(newX);*/ //TODO
			if (l1weight > 0) {
				//project back into the current orthant: a step may not cross zero
				for (int i=0; i < dim; i++) {
					if (x[i] * newX[i] < 0.0) {
						newX[i] = 0.0;
					}
				}
			}
			//report whether any parameter actually moved
			return !Arrays.equals(x, newX);
		}

		double evalL1() {
			double val = func.valueAt(newX);
			System.arraycopy(func.derivativeAt(newX), 0, newGrad, 0, newGrad.length);
			if (l1weight > 0) {
				for (int i=0; i < dim; i++) {
					//mheilman: bias/intercept parameters are not penalized
					if (!biasParameters.contains(i)) {
						val += Math.abs(newX[i]) * l1weight;
					}
				}
			}
			return val;
		}

		void backTrackingLineSearch() {
			double origDirDeriv = dirDeriv();

			//if a non-descent direction is chosen, the line search will break anyway,
			//so throw here; the most likely reason is a bug in the gradient computation
			if (origDirDeriv >= 0) {
				throw new RuntimeException("L-BFGS chose a non-descent direction: check your gradient!");
			}

			double alpha = 1.0;
			double backoff = 0.5;
			if (iter == 1) {
				double normDir = Math.sqrt(ArrayMath.innerProduct(dir, dir));
				alpha = (1 / normDir);
				backoff = 0.1;
			}

			double c1 = 1e-4;
			double oldValue = value;

			while (true) {
				getNextPoint(alpha);
				value = evalL1();

				//mheilman: I think loss of precision can happen here for pathological cases because
				//origDirDeriv can be many orders of magnitude less than value and oldValue.
				//Then, the right side will just end up being oldValue.
				if (value <= oldValue + c1 * origDirDeriv * alpha) {
					break;
				}

				//mheilman: I added this extra check to keep the program
				//from backing off for a long time until numerical underflow happens.
				//If the line search hasn't found something by 1e-30, it's probably not going to,
				//and I think not having this check may cause the program to crash
				//for some rare, pathological cases.
				if (alpha < 1e-30) {
					System.err.println("Warning: The line search backed off to alpha < 1e-30, " +
							"and stayed with the current parameter values. " +
							"This probably means convergence has been reached.");
					value = oldValue;
					break;
				}

				if (!quiet) System.err.print(".");
				alpha *= backoff;
			}

			if (!quiet) System.err.println();
		}

		void shift() {
			double[] nextS = null, nextY = null;

			int listSize = sList.size();

			if (listSize < m) {
				try {
					nextS = new double[dim];
					nextY = new double[dim];
				} catch (OutOfMemoryError e) {
					m = listSize;
					nextS = null;
				}
			}

			if (nextS == null) {
				//the history buffers are full: recycle the oldest entries
				nextS = sList.poll();
				nextY = yList.poll();
				roList.poll();
			}

			ArrayMath.addMultInto(nextS, newX, x, -1);
			ArrayMath.addMultInto(nextY, newGrad, grad, -1);

			double ro = ArrayMath.innerProduct(nextS, nextY);
			assert(ro != 0.0);

			sList.offer(nextS);
			yList.offer(nextY);
			roList.offer(ro);

			double[] tmpX = newX;
			newX = x;
			x = tmpX;

			// TODO: added: nschneid
			/*if (OWLQN.isConstrained()) {
				newX = OWLQN.projectWeights(newX);
				x = OWLQN.projectWeights(x);
			}*/

			double[] tmpGrad = newGrad;
			newGrad = grad;
			grad = tmpGrad;

			++iter;
		}

		double getValue() {
			return value;
		}

		OptimizerState(DiffFunction f, double[] init, int m, double l1weight, boolean quiet) {
			this.x = init;
			this.grad = new double[init.length];
			this.newX = init.clone();
			this.newGrad = new double[init.length];
			this.dir = new double[init.length];
			this.steepestDescDir = newGrad.clone();
			this.alphas = new double[m];
			this.iter = 1;
			this.m = m;
			this.dim = init.length;
			this.func = f;
			this.l1weight = l1weight;
			this.quiet = quiet;

			if (m <= 0) throw new RuntimeException("m must be an integer greater than zero.");

			value = evalL1();
			grad = newGrad.clone();
		}
	}
}


