/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.ml.maxent.quasinewton;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import opennlp.tools.ml.ArrayMath;
import opennlp.tools.ml.maxent.quasinewton.LineSearch.LineSearchResult;

/**
 * Implementation of L-BFGS that supports L1- and L2-regularization
 * and Elastic Net for solving convex optimization problems.
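 * <p>
 * Elastic Net here means both penalties are applied at once: as the code in
 * {@code minimize} shows, the objective actually minimized is
 * {@code f(x) + l1Cost * |x|_1 + l2Cost * |x|_2^2}.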
 * <p>
 * Usage example:
 * <pre>
 *  // Quadratic function f(x) = (x-1)^2 + 10
 *  // f obtains its minimum value 10 at x = 1
 *  Function f = new Function() {
 *
 *    {@literal @}Override
 *    public int getDimension() {
 *      return 1;
 *    }
 *
 *    {@literal @}Override
 *    public double valueAt(double[] x) {
 *      return StrictMath.pow(x[0]-1, 2) + 10;
 *    }
 *
 *    {@literal @}Override
 *    public double[] gradientAt(double[] x) {
 *      return new double[] { 2*(x[0]-1) };
 *    }
 *
 *  };
 *
 *  QNMinimizer minimizer = new QNMinimizer();
 *  double[] x = minimizer.minimize(f);
 *  double min = f.valueAt(x);
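 *
 *  // A regularized variant via the overloaded constructor; the cost
 *  // values below are illustrative only, not recommendations
 *  QNMinimizer elasticNetMinimizer = new QNMinimizer(0.1, 0.1);
 *  double[] xReg = elasticNetMinimizer.minimize(f);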
 * </pre>
 */
public class QNMinimizer {

  private static final Logger logger = LoggerFactory.getLogger(QNMinimizer.class);

  // Function change rate tolerance
  public static final double CONVERGE_TOLERANCE = 1e-4;

  // Relative gradient norm tolerance
  public static final double REL_GRAD_NORM_TOL = 1e-4;

  // Initial step size
  public static final double INITIAL_STEP_SIZE = 1.0;

  // Minimum step size
  public static final double MIN_STEP_SIZE = 1e-10;

  // Default L1-cost
  public static final double L1COST_DEFAULT = 0;

  // Default L2-cost
  public static final double L2COST_DEFAULT = 0;

  // Default number of iterations
  public static final int NUM_ITERATIONS_DEFAULT = 100;

  // Default number of Hessian updates to store
  public static final int M_DEFAULT = 15;

  // Default maximum number of function evaluations
  public static final int MAX_FCT_EVAL_DEFAULT = 30000;

  // L1-regularization cost
  private final double l1Cost;

  // L2-regularization cost
  private final double l2Cost;

  // Maximum number of iterations
  private final int iterations;

  // Number of Hessian updates to store
  private final int m;

  // Maximum number of function evaluations
  private final int maxFctEval;

  // Objective function's dimension
  private int dimension;

  // Hessian updates
  private UpdateInfo updateInfo;

  // For evaluating quality of training parameters.
  // This is optional and can be omitted.
  private Evaluator evaluator;

  /**
   * Initializes a {@link QNMinimizer} with default parameters.
   */
  public QNMinimizer() {
    this(L1COST_DEFAULT, L2COST_DEFAULT);
  }

  /**
   * Initializes a {@link QNMinimizer}.
   *
   * @param l1Cost The L1-regularization cost.
   * @param l2Cost The L2-regularization cost.
   */
  public QNMinimizer(double l1Cost, double l2Cost) {
    this(l1Cost, l2Cost, NUM_ITERATIONS_DEFAULT);
  }

  /**
   * Initializes a {@link QNMinimizer}.
   *
   * @param l1Cost The L1-regularization cost.
   * @param l2Cost The L2-regularization cost.
   * @param iterations The maximum number of iterations.
   */
  public QNMinimizer(double l1Cost, double l2Cost, int iterations) {
    this(l1Cost, l2Cost, iterations, M_DEFAULT, MAX_FCT_EVAL_DEFAULT);
  }

  /**
   * Initializes a {@link QNMinimizer}.
   *
   * @param l1Cost The L1-regularization cost.
   * @param l2Cost The L2-regularization cost.
   * @param iterations The maximum number of iterations.
   * @param m The number of Hessian updates to store.
   * @param maxFctEval The maximum number of function evaluations.
   */
  public QNMinimizer(double l1Cost, double l2Cost, int iterations, int m, int maxFctEval) {
    // Check arguments
    if (l1Cost < 0 || l2Cost < 0)
      throw new IllegalArgumentException("L1-cost and L2-cost must not be less than zero");

    if (iterations <= 0)
      throw new IllegalArgumentException("Number of iterations must be larger than zero");

    if (m <= 0)
      throw new IllegalArgumentException("Number of Hessian updates must be larger than zero");

    if (maxFctEval <= 0)
      throw new IllegalArgumentException(
          "Maximum number of function evaluations must be larger than zero");

    this.l1Cost = l1Cost;
    this.l2Cost = l2Cost;
    this.iterations = iterations;
    this.m = m;
    this.maxFctEval = maxFctEval;
  }

  public Evaluator getEvaluator() {
    return evaluator;
  }

  public void setEvaluator(Evaluator evaluator) {
    this.evaluator = evaluator;
  }

  /**
   * Finds the parameters that minimize the objective function.
   *
   * @param function The objective {@link Function}.
   * @return The minimizing parameters.
   */
  public double[] minimize(Function function) {
    Function l2RegFunction = new L2RegFunction(function, l2Cost);
    this.dimension = l2RegFunction.getDimension();
    this.updateInfo = new UpdateInfo(this.m, this.dimension);

    // Current point is at the origin
    double[] currPoint = new double[dimension];
    double currValue = l2RegFunction.valueAt(currPoint);

    // Gradient at the current point
    double[] currGrad = new double[dimension];
    System.arraycopy(l2RegFunction.gradientAt(currPoint), 0, currGrad, 0, dimension);

    // Pseudo-gradient - only used when L1-regularization is enabled
    double[] pseudoGrad = null;
    if (l1Cost > 0) {
      currValue += l1Cost * ArrayMath.l1norm(currPoint);
      pseudoGrad = new double[dimension];
      computePseudoGrad(currPoint, currGrad, pseudoGrad);
    }

    LineSearchResult lsr;
    if (l1Cost > 0) {
      lsr = LineSearchResult.getInitialObjectForL1(currValue, currGrad, pseudoGrad, currPoint);
    } else {
      lsr = LineSearchResult.getInitialObject(currValue, currGrad, currPoint);
    }

    if (logger.isDebugEnabled()) {
      logger.debug("Solving convex optimization problem.");
      logger.debug("Objective function has {} variable(s).", dimension);
      logger.debug("Performing {} iterations with L1Cost={} and L2Cost={}",
          iterations, l1Cost, l2Cost);
    }

    double[] direction = new double[dimension];
    long startTime = System.currentTimeMillis();

    // Initial step size for the 1st iteration
    double initialStepSize = l1Cost > 0
        ? ArrayMath.invL2norm(lsr.getPseudoGradAtNext())
        : ArrayMath.invL2norm(lsr.getGradAtNext());

    for (int iter = 1; iter <= iterations; iter++) {
      // Find direction
      if (l1Cost > 0) {
        System.arraycopy(lsr.getPseudoGradAtNext(), 0, direction, 0, direction.length);
      } else {
        System.arraycopy(lsr.getGradAtNext(), 0, direction, 0, direction.length);
      }
      computeDirection(direction);

      // Line search
      if (l1Cost > 0) {
        // Constrain the search direction
        pseudoGrad = lsr.getPseudoGradAtNext();
        for (int i = 0; i < dimension; i++) {
          if (direction[i] * pseudoGrad[i] >= 0) {
            direction[i] = 0;
          }
        }
        LineSearch.doConstrainedLineSearch(l2RegFunction, direction, lsr, l1Cost, initialStepSize);
        computePseudoGrad(lsr.getNextPoint(), lsr.getGradAtNext(), pseudoGrad);
        lsr.setPseudoGradAtNext(pseudoGrad);
      } else {
        LineSearch.doLineSearch(l2RegFunction, direction, lsr, initialStepSize);
      }

      // Save Hessian updates
      updateInfo.update(lsr);

      if (logger.isDebugEnabled()) {
        if (evaluator != null) {
          logger.debug("{}: \t{}\t{}\t{}", iter, lsr.getValueAtNext(),
              lsr.getFuncChangeRate(), evaluator.evaluate(lsr.getNextPoint()));
        } else {
          logger.debug("{}: \t{}\t{}", iter, lsr.getValueAtNext(), lsr.getFuncChangeRate());
        }
      }

      if (isConverged(lsr))
        break;

      initialStepSize = INITIAL_STEP_SIZE;
    }

    // Undo L2-shrinkage if Elastic Net is used (since
    // in that case, the shrinkage is done twice)
    if (l1Cost > 0 && l2Cost > 0) {
      double[] x = lsr.getNextPoint();
      for (int i = 0; i < dimension; i++) {
        x[i] = StrictMath.sqrt(1 + l2Cost) * x[i];
      }
    }

    long endTime = System.currentTimeMillis();
    long duration = endTime - startTime;
    logger.info("Running time: {}s", duration / 1000.);

    // Release memory
    this.updateInfo = null;
    System.gc();

    // Avoid returning the reference to LineSearchResult's member so that the GC
    // can collect the memory occupied by lsr after this method completes
    // (is it necessary?)
    double[] parameters = new double[dimension];
    System.arraycopy(lsr.getNextPoint(), 0, parameters, 0, dimension);
    return parameters;
  }
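  // Note on the L1 case: the term l1Cost * |x_i| is not differentiable at
  // x_i = 0, so minimize() follows the orthant-wise approach of Andrew et al.
  // (2007), cited below. computePseudoGrad() selects the one-sided partial
  // derivative that decreases the objective, and minimize() zeroes every
  // component of the search direction that does not oppose the pseudo-gradient
  // before running the constrained line search.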
  /**
   * Computes the pseudo-gradient for L1-regularization (see equation 4 in the paper
   * "Scalable Training of L1-Regularized Log-Linear Models", Andrew et al. 2007).
   *
   * @param x the current point
   * @param g the gradient at x
   * @param pg the pseudo-gradient at x, which is to be computed
   */
  private void computePseudoGrad(double[] x, double[] g, double[] pg) {
    for (int i = 0; i < dimension; i++) {
      if (x[i] < 0) {
        pg[i] = g[i] - l1Cost;
      } else if (x[i] > 0) {
        pg[i] = g[i] + l1Cost;
      } else {
        if (g[i] < -l1Cost) {
          // Right partial derivative
          pg[i] = g[i] + l1Cost;
        } else if (g[i] > l1Cost) {
          // Left partial derivative
          pg[i] = g[i] - l1Cost;
        } else {
          pg[i] = 0;
        }
      }
    }
  }

  /**
   * L-BFGS two-loop recursion (see Nocedal &amp; Wright 2006, Numerical Optimization, p. 178).
   */
  private void computeDirection(double[] direction) {
    // Implements the two-loop Hessian update method. On entry, direction holds
    // the (pseudo-)gradient; the recursion multiplies it by the approximated
    // inverse Hessian, and the final negation yields the descent direction.
    int k = updateInfo.kCounter;
    double[] rho = updateInfo.rho;
    double[] alpha = updateInfo.alpha; // just to avoid recreating alpha
    double[][] S = updateInfo.S;
    double[][] Y = updateInfo.Y;

    // First loop
    for (int i = k - 1; i >= 0; i--) {
      alpha[i] = rho[i] * ArrayMath.innerProduct(S[i], direction);
      for (int j = 0; j < dimension; j++) {
        direction[j] = direction[j] - alpha[i] * Y[i][j];
      }
    }

    // Second loop
    for (int i = 0; i < k; i++) {
      double beta = rho[i] * ArrayMath.innerProduct(Y[i], direction);
      for (int j = 0; j < dimension; j++) {
        direction[j] = direction[j] + S[i][j] * (alpha[i] - beta);
      }
    }

    for (int i = 0; i < dimension; i++) {
      direction[i] = -direction[i];
    }
  }

  private boolean isConverged(LineSearchResult lsr) {
    // Check the function's change rate
    if (lsr.getFuncChangeRate() < CONVERGE_TOLERANCE) {
      if (logger.isDebugEnabled())
        logger.debug("Function change rate is smaller than the threshold {}. "
            + "Training will stop.", CONVERGE_TOLERANCE);
      return true;
    }

    // Check the gradient's norm using the criterion: ||g(x)|| / max(1, ||x||) < threshold
    double xNorm = StrictMath.max(1, ArrayMath.l2norm(lsr.getNextPoint()));
    double gradNorm = l1Cost > 0
        ? ArrayMath.l2norm(lsr.getPseudoGradAtNext())
        : ArrayMath.l2norm(lsr.getGradAtNext());
    if (gradNorm / xNorm < REL_GRAD_NORM_TOL) {
      if (logger.isDebugEnabled())
        logger.debug("Relative L2-norm of the gradient is smaller than the threshold {}. "
            + "Training will stop.", REL_GRAD_NORM_TOL);
      return true;
    }

    // Check the step size
    if (lsr.getStepSize() < MIN_STEP_SIZE) {
      if (logger.isDebugEnabled())
        logger.debug("Step size is smaller than the minimum step size {}. "
            + "Training will stop.", MIN_STEP_SIZE);
      return true;
    }

    // Check the number of function evaluations
    if (lsr.getFctEvalCount() > this.maxFctEval) {
      if (logger.isDebugEnabled())
        logger.debug("The number of function evaluations has exceeded the maximum {}. "
            + "Training will stop.", this.maxFctEval);
      return true;
    }

    return false;
  }
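  // Note: UpdateInfo below stores rho[k] = 1 / (s_k . y_k), which is only
  // well-defined when the curvature condition s_k . y_k > 0 holds. A line
  // search satisfying the Wolfe conditions guarantees this (Nocedal & Wright
  // 2006, ch. 6); whether LineSearch enforces it is not visible in this file.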
  /**
   * Class to store vectors for the Hessian approximation update.
   */
  private class UpdateInfo {

    private final double[][] S;
    private final double[][] Y;
    private final double[] rho;
    private final double[] alpha;
    private final int m;

    private int kCounter;

    UpdateInfo(int numCorrection, int dimension) {
      this.m = numCorrection;
      this.kCounter = 0;
      S = new double[this.m][dimension];
      Y = new double[this.m][dimension];
      rho = new double[this.m];
      alpha = new double[this.m];
    }

    public void update(LineSearchResult lsr) {
      double[] currPoint = lsr.getCurrPoint();
      double[] gradAtCurr = lsr.getGradAtCurr();
      double[] nextPoint = lsr.getNextPoint();
      double[] gradAtNext = lsr.getGradAtNext();

      // Inner product of S_k and Y_k
      double SYk = 0.0;

      if (kCounter < m) {
        // Add new vectors
        for (int j = 0; j < dimension; j++) {
          S[kCounter][j] = nextPoint[j] - currPoint[j];
          Y[kCounter][j] = gradAtNext[j] - gradAtCurr[j];
          SYk += S[kCounter][j] * Y[kCounter][j];
        }
        rho[kCounter] = 1.0 / SYk;
      } else {
        // Discard the oldest vectors and add new ones. The oldest buffers are
        // rotated to the last slot before being overwritten; a plain shift
        // (S[i] = S[i + 1]) would leave S[m - 2] and S[m - 1] referencing the
        // same array and corrupt the stored history.
        double[] oldestS = S[0];
        double[] oldestY = Y[0];
        for (int i = 0; i < m - 1; i++) {
          S[i] = S[i + 1];
          Y[i] = Y[i + 1];
          rho[i] = rho[i + 1];
        }
        S[m - 1] = oldestS;
        Y[m - 1] = oldestY;
        for (int j = 0; j < dimension; j++) {
          S[m - 1][j] = nextPoint[j] - currPoint[j];
          Y[m - 1][j] = gradAtNext[j] - gradAtCurr[j];
          SYk += S[m - 1][j] * Y[m - 1][j];
        }
        rho[m - 1] = 1.0 / SYk;
      }

      if (kCounter < m)
        kCounter++;
    }
  }

  /**
   * L2-regularized objective {@link Function}.
   */
  public static class L2RegFunction implements Function {

    private final Function f;
    private final double l2Cost;

    public L2RegFunction(Function f, double l2Cost) {
      this.f = f;
      this.l2Cost = l2Cost;
    }

    @Override
    public int getDimension() {
      return f.getDimension();
    }

    @Override
    public double valueAt(double[] x) {
      checkDimension(x);
      double value = f.valueAt(x);
      if (l2Cost > 0) {
        value += l2Cost * ArrayMath.innerProduct(x, x);
      }
      return value;
    }

    @Override
    public double[] gradientAt(double[] x) {
      checkDimension(x);
      double[] gradient = f.gradientAt(x);
      if (l2Cost > 0) {
        for (int i = 0; i < x.length; i++) {
          gradient[i] += 2 * l2Cost * x[i];
        }
      }
      return gradient;
    }

    private void checkDimension(double[] x) {
      if (x.length != getDimension())
        throw new IllegalArgumentException(
            "x's dimension is not the same as the function's dimension");
    }
  }

  /**
   * Evaluates the quality of training parameters. For example,
   * it can be used to report the model's training accuracy when
   * training a Maximum Entropy classifier.
   */
  public interface Evaluator {

    /**
     * Measures the quality of the training parameters.
     *
     * @param parameters The parameters used for training.
     * @return The evaluated result.
     */
    double evaluate(double[] parameters);
  }
}