package cmu.arktweetnlp.impl;
import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.optimization.DiffFunction;
import gnu.trove.set.hash.THashSet;
import java.util.*;
/**
* Class implementing the Orthant-Wise Limited-memory Quasi-Newton
 * algorithm (OWL-QN). OWL-QN is a numerical optimization procedure for
* finding the optimum of an objective of the form smooth function plus
* L1-norm of the parameters. It has been used for training log-linear
* models (such as logistic regression) with L1-regularization. The
* algorithm is described in "Scalable training of L1-regularized
* log-linear models" by Galen Andrew and Jianfeng Gao. This
* implementation includes built-in capacity to train logistic regression
* or least-squares models with L1 regularization. It is also possible to
* use OWL-QN to optimize any arbitrary smooth convex loss plus L1
 * regularization by defining the function and its gradient using the
 * supplied {@link DiffFunction} interface (the Java analogue of the
 * original "DifferentiableFunction" class), and passing an instance of
 * the function to the OWLQN object. For more information, please read
* the included file README.txt. Also included in the distribution are
* the ICML paper and slide presentation.
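 *
 * A minimal usage sketch (hypothetical toy objective; DiffFunction's
 * domainDimension/valueAt/derivativeAt come from the imported Stanford
 * interface):
 * <pre>
 * DiffFunction f = new DiffFunction() {
 *   public int domainDimension() { return 2; }
 *   // f(x) = (x0 - 1)^2 + (x1 - 2)^2
 *   public double valueAt(double[] x) {
 *     return (x[0] - 1) * (x[0] - 1) + (x[1] - 2) * (x[1] - 2);
 *   }
 *   public double[] derivativeAt(double[] x) {
 *     return new double[] { 2 * (x[0] - 1), 2 * (x[1] - 2) };
 *   }
 * };
 * // arguments: function, initial point, l1weight, tolerance, L-BFGS memory m
 * double[] argmin = new OWLQN().minimize(f, new double[2], 1.0, 1e-5, 10);
 * </pre>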
*
* Significant portions of this code are taken from
 * Galen Andrew's implementation
*
* @author Michel Galley
*
* (NOTE FROM BRENDAN 2012-08-14: What license did Stanford release this as? GPL ??)
*
* modified by Michael Heilman ([email protected])
* -allow for bias/intercept parameters that shouldn't be penalized
* -make outside API calls easier (11/9/2010)
* -removed lots of extraneous stuff from the stanford API
*
* modified by Nathan Schneider ([email protected])
* - support for constrained optimization with the projected gradient method
 * (see {@link #setConstrained(boolean)})
*
*/
public class OWLQN {
private int maxIters = Integer.MAX_VALUE;
/** Whether to use the "projected gradient" method to require weights to be
* nonnegative and sum to 1. The projection operation is implemented in
* {@link #project(double[])}.
*/
private static boolean constrained = false;
interface TerminationCriterion {
double getValue(OptimizerState state, StringBuilder out);
}
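/**
 * Stops optimization once the mean per-iteration improvement of the
 * objective, averaged over roughly the last numItersToAvg recorded values
 * and divided by the magnitude of the current value, drops below the
 * tolerance passed to minimize().
 */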
static class RelativeMeanImprovementCriterion implements TerminationCriterion {
int numItersToAvg;
Queue<Double> prevVals;
RelativeMeanImprovementCriterion() {
this(10);
}
RelativeMeanImprovementCriterion(int numItersToAvg) {
this.numItersToAvg = numItersToAvg;
this.prevVals = new LinkedList<Double>();
}
public double getValue(OptimizerState state, StringBuilder out) {
double retVal = Double.POSITIVE_INFINITY;
if (prevVals.size() >= numItersToAvg) {
double prevVal = prevVals.peek();
if (prevVals.size() == numItersToAvg) prevVals.poll();
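// numItersToAvg stored values span one fewer iteration than values;
// after polling the oldest, size() is exactly that iteration count,
// so the quotient below is improvement per iteration.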
double averageImprovement = (prevVal - state.getValue()) / prevVals.size();
double relAvgImpr = averageImprovement / Math.abs(state.getValue());
String relAvgImprStr = String.format("%.4e",relAvgImpr);
out.append(" (").append(relAvgImprStr).append(") ");
retVal = relAvgImpr;
} else {
out.append(" (wait for "+numItersToAvg+" iters) ");
}
prevVals.offer(state.getValue());
return retVal;
}
} // end static class RelativeMeanImprovementCriterion
boolean quiet;
boolean responsibleForTermCrit;
/** Indices of bias/intercept parameters that should not receive the L1 penalty. */
public static Set<Integer> biasParameters = new THashSet<Integer>();
TerminationCriterion termCrit;
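/**
 * Optional callback for printing/logging the current weights; assign an
 * implementation to the printer field to receive it during optimization.
 */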
public interface WeightsPrinter {
public void printWeights();
}
WeightsPrinter printer;
public OWLQN(boolean quiet) {
this.quiet = quiet;
this.termCrit = new RelativeMeanImprovementCriterion();
this.responsibleForTermCrit = true;
}
public OWLQN() {
this(false);
}
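/**
 * Constructs an optimizer with a caller-supplied termination criterion.
 * A usage sketch, averaging improvement over a shorter window than the
 * default of 10 iterations:
 * <pre>
 * OWLQN opt = new OWLQN(new RelativeMeanImprovementCriterion(5), false);
 * </pre>
 */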
public OWLQN(TerminationCriterion termCrit, boolean quiet) {
this.quiet = quiet;
this.termCrit = termCrit;
this.responsibleForTermCrit = false;
}
/* (non-Javadoc)
* @see edu.cmu.cs.ark.tdf.IOptimizer#setQuiet(boolean)
*/
public void setQuiet(boolean q) {
quiet = q;
}
/*
public void minimize(DiffFunction function, double[] initial) {
minimize(function, initial, 1.0);
}
public void minimize(DiffFunction function, double[] initial, double l1weight) {
minimize(function, initial, l1weight, 1e-5);
}
public void minimize(DiffFunction function, double[] initial, double l1weight, double tol) {
minimize(function, initial, l1weight, tol, 10);
}
*/
/* (non-Javadoc)
* @see edu.cmu.cs.ark.tdf.IOptimizer#minimize(edu.stanford.nlp.optimization.DiffFunction, double[], double, double, int)
*/
public double[] minimize(DiffFunction function, double[] initial, double l1weight, double tol, int m) {
OptimizerState state = new OptimizerState(function, initial, m, l1weight, quiet);
if (!quiet) {
System.err.printf("Optimizing function of %d variables with OWL-QN parameters:\n", state.dim);
System.err.printf(" l1 regularization weight: %f.\n", l1weight);
System.err.printf(" L-BFGS memory parameter (m): %d\n", m);
System.err.printf(" Convergence tolerance: %f\n\n", tol);
System.err.printf("Iter n:\tnew_value\tdf\t(conv_crit)\tline_search\n");
System.err.printf("Iter 0:\t%.4e\t\t(***********)\t", state.value);
}
StringBuilder buf = new StringBuilder();
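// Seed the termination criterion with the starting objective value so
// subsequent iterations have a baseline to measure improvement against.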
termCrit.getValue(state, buf);
	for (int i = 0; i < maxIters; i++) {
		buf.setLength(0);

		// Sketch of one OWL-QN iteration. The OptimizerState members used
		// below (updateDir(), backTrackingLineSearch(), shift(), newX) are
		// assumed from the companion ARK/Stanford OWL-QN sources, which are
		// not part of this file.
		state.updateDir();              // pick a search direction from the L-BFGS memory
		state.backTrackingLineSearch(); // step along it within the current orthant

		if (constrained) {
			project(state.newX);        // projected-gradient mode: stay on the simplex
		}

		double termCritVal = termCrit.getValue(state, buf);
		if (!quiet) {
			System.err.printf("Iter %d:\t%.4e\t%s\n", i + 1, state.getValue(), buf.toString());
			if (printer != null) printer.printWeights();
		}
		if (termCritVal < tol) {
			break;                      // converged
		}
		state.shift();                  // accept the step and update the L-BFGS history
	}
	return state.newX;
}

/* Projection used by the constrained (projected gradient) mode. Pseudocode:

Given: a real vector c to be projected into a linearly-constrained subspace
I ← ∅ // index set
x ← c
do
Compute x' = P_V_I(x): x'_i =
0 if i ∈ I,
x_i - (1/(dim(c)-dim(I))) * (Σ_j { x_j } - 1) otherwise
if x' ≥ 0: return x'
I ← I ∪ {i | x'_i < 0}
x ← P_X_I(x'): i.e. x' but with any negative components replaced by 0
repeat
Output: projection of c into constrained subspace K, i.e. P_K(c)
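
Worked example (hand-computed from the pseudocode above):
c = (0.5, 0.7, -0.1): the sum is 1.1, so subtract 0.1/3 from each component,
giving (0.4667, 0.6667, -0.1333); the last entry is negative, so I = {3}
and it is clamped to 0. Next pass: the sum is 1.1333, so subtract 0.0667
from the two free components, giving (0.4, 0.6, 0), which is nonnegative
and sums to 1, so it is returned.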
*/
public static double[] project(double[] c) {
	// Transcription of the pseudocode above. I is the set of indices
	// pinned to zero; c is modified in place and also returned.
	Set<Integer> zeroed = new THashSet<Integer>();
	double[] x = c;
	while (true) {
		// P_V_I: spread (sum - 1) equally over the components not in I
		double sum = 0.0;
		for (double xi : x) sum += xi;
		double shift = (sum - 1.0) / (x.length - zeroed.size());
		boolean nonnegative = true;
		for (int i = 0; i < x.length; i++) {
			if (zeroed.contains(i)) continue;
			x[i] -= shift;
			if (x[i] < 0.0) nonnegative = false;
		}
		if (nonnegative) return x; // feasible: nonnegative and sums to 1
		// P_X_I: zero out the negative components and add them to I
		for (int i = 0; i < x.length; i++) {
			if (x[i] < 0.0) {
				x[i] = 0.0;
				zeroed.add(i);
			}
		}
	}
}
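
/** Enables/disables the projected-gradient constraint. A minimal sketch of
 * the setter referenced from the class javadoc; assumed to simply set the
 * static {@code constrained} flag declared above. */
public static void setConstrained(boolean c) {
	constrained = c;
}
}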