All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.ml.maxent.quasinewton.QNMinimizer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.ml.maxent.quasinewton;

import opennlp.tools.ml.maxent.quasinewton.LineSearch.LineSearchResult;

/**
 * Implementation of L-BFGS which supports L1-, L2-regularization
 * and Elastic Net for solving convex optimization problems. 

* Usage example: *

 *  // Quadratic function f(x) = (x-1)^2 + 10
 *  // f obtains its minimum value 10 at x = 1
 *  Function f = new Function() {
 *
 *    {@literal @}Override
 *    public int getDimension() {
 *      return 1;
 *    }
 *
 *    {@literal @}Override
 *    public double valueAt(double[] x) {
 *      return Math.pow(x[0]-1, 2) + 10;
 *    }
 *
 *    {@literal @}Override
 *    public double[] gradientAt(double[] x) {
 *      return new double[] { 2*(x[0]-1) };
 *    }
 *
 *  };
 *
 *  QNMinimizer minimizer = new QNMinimizer();
 *  double[] x = minimizer.minimize(f);
 *  double min = f.valueAt(x);
 * 
*/ public class QNMinimizer { // Function change rate tolerance public static final double CONVERGE_TOLERANCE = 1e-4; // Relative gradient norm tolerance public static final double REL_GRAD_NORM_TOL = 1e-4; // Initial step size public static final double INITIAL_STEP_SIZE = 1.0; // Minimum step size public static final double MIN_STEP_SIZE = 1e-10; // Default L1-cost public static final double L1COST_DEFAULT = 0; // Default L2-cost public static final double L2COST_DEFAULT = 0; // Default number of iterations public static final int NUM_ITERATIONS_DEFAULT = 100; // Default number of Hessian updates to store public static final int M_DEFAULT = 15; // Default maximum number of function evaluations public static final int MAX_FCT_EVAL_DEFAULT = 30000; // L1-regularization cost private double l1Cost; // L2-regularization cost private double l2Cost; // Maximum number of iterations private int iterations; // Number of Hessian updates to store private int m; // Maximum number of function evaluations private int maxFctEval; // Verbose output private boolean verbose; // Objective function's dimension private int dimension; // Hessian updates private UpdateInfo updateInfo; // For evaluating quality of training parameters. // This is optional and can be omitted. private Evaluator evaluator; public QNMinimizer() { this(L1COST_DEFAULT, L2COST_DEFAULT); } public QNMinimizer(double l1Cost, double l2Cost) { this(l1Cost, l2Cost, NUM_ITERATIONS_DEFAULT); } public QNMinimizer(double l1Cost, double l2Cost, int iterations) { this(l1Cost, l2Cost, iterations, M_DEFAULT, MAX_FCT_EVAL_DEFAULT); } public QNMinimizer(double l1Cost, double l2Cost, int iterations, int m, int maxFctEval) { this(l1Cost, l2Cost, iterations, m, maxFctEval, true); } /** * Constructor * @param l1Cost L1-regularization cost * @param l2Cost L2-regularization cost * @param iterations maximum number of iterations * @param m number of Hessian updates to store * @param maxFctEval maximum number of function evaluations * @param verbose verbose output */ public QNMinimizer(double l1Cost, double l2Cost, int iterations, int m, int maxFctEval, boolean verbose) { // Check arguments if (l1Cost < 0 || l2Cost < 0) throw new IllegalArgumentException( "L1-cost and L2-cost must not be less than zero"); if (iterations <= 0) throw new IllegalArgumentException( "Number of iterations must be larger than zero"); if (m <= 0) throw new IllegalArgumentException( "Number of Hessian updates must be larger than zero"); if (maxFctEval <= 0) throw new IllegalArgumentException( "Maximum number of function evaluations must be larger than zero"); this.l1Cost = l1Cost; this.l2Cost = l2Cost; this.iterations = iterations; this.m = m; this.maxFctEval = maxFctEval; this.verbose = verbose; } public Evaluator getEvaluator() { return evaluator; } public void setEvaluator(Evaluator evaluator) { this.evaluator = evaluator; } /** * Find the parameters that minimize the objective function * @param function objective function * @return minimizing parameters */ public double[] minimize(Function function) { Function l2RegFunction = new L2RegFunction(function, l2Cost); this.dimension = l2RegFunction.getDimension(); this.updateInfo = new UpdateInfo(this.m, this.dimension); // Current point is at the origin double[] currPoint = new double[dimension]; double currValue = l2RegFunction.valueAt(currPoint); // Gradient at the current point double[] currGrad = new double[dimension]; System.arraycopy(l2RegFunction.gradientAt(currPoint), 0, currGrad, 0, dimension); // Pseudo-gradient - only use when L1-regularization is enabled double[] pseudoGrad = null; if (l1Cost > 0) { currValue += l1Cost * ArrayMath.l1norm(currPoint); pseudoGrad = new double[dimension]; computePseudoGrad(currPoint, currGrad, pseudoGrad); } LineSearchResult lsr; if (l1Cost > 0) { lsr = LineSearchResult.getInitialObjectForL1( currValue, currGrad, pseudoGrad, currPoint); } else { lsr = LineSearchResult.getInitialObject( currValue, currGrad, currPoint); } if (verbose) { display("\nSolving convex optimization problem."); display("\nObjective function has " + dimension + " variable(s)."); display("\n\nPerforming " + iterations + " iterations with " + "L1Cost=" + l1Cost + " and L2Cost=" + l2Cost + "\n"); } double[] direction = new double[dimension]; long startTime = System.currentTimeMillis(); // Initial step size for the 1st iteration double initialStepSize = l1Cost > 0 ? ArrayMath.invL2norm(lsr.getPseudoGradAtNext()) : ArrayMath.invL2norm(lsr.getGradAtNext()); for (int iter = 1; iter <= iterations; iter++) { // Find direction if (l1Cost > 0) { System.arraycopy(lsr.getPseudoGradAtNext(), 0, direction, 0, direction.length); } else { System.arraycopy(lsr.getGradAtNext(), 0, direction, 0, direction.length); } computeDirection(direction); // Line search if (l1Cost > 0) { // Constrain the search direction pseudoGrad = lsr.getPseudoGradAtNext(); for (int i = 0; i < dimension; i++) { if (direction[i] * pseudoGrad[i] >= 0) { direction[i] = 0; } } LineSearch.doConstrainedLineSearch(l2RegFunction, direction, lsr, l1Cost, initialStepSize); computePseudoGrad(lsr.getNextPoint(), lsr.getGradAtNext(), pseudoGrad); lsr.setPseudoGradAtNext(pseudoGrad); } else { LineSearch.doLineSearch(l2RegFunction, direction, lsr, initialStepSize); } // Save Hessian updates updateInfo.update(lsr); if (verbose) { if (iter < 10) display(" " + iter + ": "); else if (iter < 100) display(" " + iter + ": "); else display(iter + ": "); if (evaluator != null) { display("\t" + lsr.getValueAtNext() + "\t" + lsr.getFuncChangeRate() + "\t" + evaluator.evaluate(lsr.getNextPoint()) + "\n"); } else { display("\t " + lsr.getValueAtNext() + "\t" + lsr.getFuncChangeRate() + "\n"); } } if (isConverged(lsr)) break; initialStepSize = INITIAL_STEP_SIZE; } // Undo L2-shrinkage if Elastic Net is used (since // in that case, the shrinkage is done twice) if (l1Cost > 0 && l2Cost > 0) { double[] x = lsr.getNextPoint(); for (int i = 0; i < dimension; i++) { x[i] = Math.sqrt(1 + l2Cost) * x[i]; } } long endTime = System.currentTimeMillis(); long duration = endTime - startTime; display("Running time: " + (duration / 1000.) + "s\n"); // Release memory this.updateInfo = null; System.gc(); // Avoid returning the reference to LineSearchResult's member so that GC can // collect memory occupied by lsr after this function completes (is it necessary?) double[] parameters = new double[dimension]; System.arraycopy(lsr.getNextPoint(), 0, parameters, 0, dimension); return parameters; } /** * Pseudo-gradient for L1-regularization (see equation 4 in the paper * "Scalable Training of L1-Regularized Log-Linear Models", Andrew et al. 2007) * * @param x current point * @param g gradient at x * @param pg pseudo-gradient at x which is to be computed */ private void computePseudoGrad(double[] x, double[] g, double[] pg) { for (int i = 0; i < dimension; i++) { if (x[i] < 0) { pg[i] = g[i] - l1Cost; } else if (x[i] > 0) { pg[i] = g[i] + l1Cost; } else { if (g[i] < -l1Cost) { // right partial derivative pg[i] = g[i] + l1Cost; } else if (g[i] > l1Cost) { // left partial derivative pg[i] = g[i] - l1Cost; } else { pg[i] = 0; } } } } /** * L-BFGS two-loop recursion (see Nocedal & Wright 2006, Numerical Optimization, p. 178) */ private void computeDirection(double[] direction) { // Implemented two-loop Hessian update method. int k = updateInfo.kCounter; double[] rho = updateInfo.rho; double[] alpha = updateInfo.alpha; // just to avoid recreating alpha double[][] S = updateInfo.S; double[][] Y = updateInfo.Y; // First loop for (int i = k - 1; i >= 0; i--) { alpha[i] = rho[i] * ArrayMath.innerProduct(S[i], direction); for (int j = 0; j < dimension; j++) { direction[j] = direction[j] - alpha[i] * Y[i][j]; } } // Second loop for (int i = 0; i < k; i++) { double beta = rho[i] * ArrayMath.innerProduct(Y[i], direction); for (int j = 0; j < dimension; j++) { direction[j] = direction[j] + S[i][j] * (alpha[i] - beta); } } for (int i = 0; i < dimension; i++) { direction[i] = -direction[i]; } } private boolean isConverged(LineSearchResult lsr) { // Check function's change rate if (lsr.getFuncChangeRate() < CONVERGE_TOLERANCE) { if (verbose) display("Function change rate is smaller than the threshold " + CONVERGE_TOLERANCE + ".\nTraining will stop.\n\n"); return true; } // Check gradient's norm using the criteria: ||g(x)|| / max(1, ||x||) < threshold double xNorm = Math.max(1, ArrayMath.l2norm(lsr.getNextPoint())); double gradNorm = l1Cost > 0 ? ArrayMath.l2norm(lsr.getPseudoGradAtNext()) : ArrayMath.l2norm(lsr.getGradAtNext()); if (gradNorm / xNorm < REL_GRAD_NORM_TOL) { if (verbose) display("Relative L2-norm of the gradient is smaller than the threshold " + REL_GRAD_NORM_TOL + ".\nTraining will stop.\n\n"); return true; } // Check step size if (lsr.getStepSize() < MIN_STEP_SIZE) { if (verbose) display("Step size is smaller than the minimum step size " + MIN_STEP_SIZE + ".\nTraining will stop.\n\n"); return true; } // Check number of function evaluations if (lsr.getFctEvalCount() > this.maxFctEval) { if (verbose) display("Maximum number of function evaluations has exceeded the threshold " + this.maxFctEval + ".\nTraining will stop.\n\n"); return true; } return false; } /** * Shorthand for System.out.print */ private void display(String s) { System.out.print(s); } /** * Class to store vectors for Hessian approximation update. */ private class UpdateInfo { private double[][] S; private double[][] Y; private double[] rho; private double[] alpha; private int m; private int kCounter; // Constructor UpdateInfo(int numCorrection, int dimension) { this.m = numCorrection; this.kCounter = 0; S = new double[this.m][dimension]; Y = new double[this.m][dimension]; rho = new double[this.m]; alpha = new double[this.m]; } public void update(LineSearchResult lsr) { double[] currPoint = lsr.getCurrPoint(); double[] gradAtCurr = lsr.getGradAtCurr(); double[] nextPoint = lsr.getNextPoint(); double[] gradAtNext = lsr.getGradAtNext(); // Inner product of S_k and Y_k double SYk = 0.0; // Add new ones. if (kCounter < m) { for (int j = 0; j < dimension; j++) { S[kCounter][j] = nextPoint[j] - currPoint[j]; Y[kCounter][j] = gradAtNext[j] - gradAtCurr[j]; SYk += S[kCounter][j] * Y[kCounter][j]; } rho[kCounter] = 1.0 / SYk; } else { // Discard oldest vectors and add new ones. for (int i = 0; i < m - 1; i++) { S[i] = S[i + 1]; Y[i] = Y[i + 1]; rho[i] = rho[i + 1]; } for (int j = 0; j < dimension; j++) { S[m - 1][j] = nextPoint[j] - currPoint[j]; Y[m - 1][j] = gradAtNext[j] - gradAtCurr[j]; SYk += S[m - 1][j] * Y[m - 1][j]; } rho[m - 1] = 1.0 / SYk; } if (kCounter < m) kCounter++; } } /** * L2-regularized objective function */ public static class L2RegFunction implements Function { private Function f; private double l2Cost; public L2RegFunction(Function f, double l2Cost) { this.f = f; this.l2Cost = l2Cost; } @Override public int getDimension() { return f.getDimension(); } @Override public double valueAt(double[] x) { checkDimension(x); double value = f.valueAt(x); if (l2Cost > 0) { value += l2Cost * ArrayMath.innerProduct(x, x); } return value; } @Override public double[] gradientAt(double[] x) { checkDimension(x); double[] gradient = f.gradientAt(x); if (l2Cost > 0) { for (int i = 0; i < x.length; i++) { gradient[i] += 2 * l2Cost * x[i]; } } return gradient; } private void checkDimension(double[] x) { if (x.length != getDimension()) throw new IllegalArgumentException( "x's dimension is not the same as function's dimension"); } } /** * Evaluate quality of training parameters. For example, * it can be used to report model's training accuracy when * we train a Maximum Entropy classifier. */ public interface Evaluator { /** * Measure quality of the training parameters * @param parameters * @return evaluated result */ double evaluate(double[] parameters); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy