/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.ml.maxent.quasinewton;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import opennlp.tools.ml.ArrayMath;
import opennlp.tools.ml.maxent.quasinewton.LineSearch.LineSearchResult;

/**
 * Implementation of L-BFGS that supports L1- and L2-regularization
 * and Elastic Net for solving convex optimization problems.
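 * <p>
 * Elastic Net here means both penalties are applied at once: as the code in
 * {@code minimize} shows, the objective actually minimized is
 * {@code f(x) + l1Cost * |x|_1 + l2Cost * |x|_2^2}.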
 * <p>
 * Usage example:
 * <pre>
 *  // Quadratic function f(x) = (x-1)^2 + 10
 *  // f obtains its minimum value 10 at x = 1
 *  Function f = new Function() {
 *
 *    {@literal @}Override
 *    public int getDimension() {
 *      return 1;
 *    }
 *
 *    {@literal @}Override
 *    public double valueAt(double[] x) {
 *      return StrictMath.pow(x[0]-1, 2) + 10;
 *    }
 *
 *    {@literal @}Override
 *    public double[] gradientAt(double[] x) {
 *      return new double[] { 2*(x[0]-1) };
 *    }
 *
 *  };
 *
 *  QNMinimizer minimizer = new QNMinimizer();
 *  double[] x = minimizer.minimize(f);
 *  double min = f.valueAt(x);
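 *
 *  // A regularized variant via the overloaded constructor; the cost
 *  // values below are illustrative only, not recommendations
 *  QNMinimizer elasticNetMinimizer = new QNMinimizer(0.1, 0.1);
 *  double[] xReg = elasticNetMinimizer.minimize(f);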
 * </pre>
 */
public class QNMinimizer {

  private static final Logger logger = LoggerFactory.getLogger(QNMinimizer.class);

  // Function change rate tolerance
  public static final double CONVERGE_TOLERANCE = 1e-4;

  // Relative gradient norm tolerance
  public static final double REL_GRAD_NORM_TOL = 1e-4;

  // Initial step size
  public static final double INITIAL_STEP_SIZE = 1.0;

  // Minimum step size
  public static final double MIN_STEP_SIZE = 1e-10;

  // Default L1-cost
  public static final double L1COST_DEFAULT = 0;

  // Default L2-cost
  public static final double L2COST_DEFAULT = 0;

  // Default number of iterations
  public static final int NUM_ITERATIONS_DEFAULT = 100;

  // Default number of Hessian updates to store
  public static final int M_DEFAULT = 15;

  // Default maximum number of function evaluations
  public static final int MAX_FCT_EVAL_DEFAULT = 30000;

  // L1-regularization cost
  private final double l1Cost;

  // L2-regularization cost
  private final double l2Cost;

  // Maximum number of iterations
  private final int iterations;

  // Number of Hessian updates to store
  private final int m;

  // Maximum number of function evaluations
  private final int maxFctEval;

  // Objective function's dimension
  private int dimension;

  // Hessian updates
  private UpdateInfo updateInfo;

  // For evaluating quality of training parameters.
  // This is optional and can be omitted.
  private Evaluator evaluator;

  /**
   * Initializes a {@link QNMinimizer} with default parameters.
   */
  public QNMinimizer() {
    this(L1COST_DEFAULT, L2COST_DEFAULT);
  }

  /**
   * Initializes a {@link QNMinimizer}.
   *
   * @param l1Cost The L1-regularization cost.
   * @param l2Cost The L2-regularization cost.
   */
  public QNMinimizer(double l1Cost, double l2Cost) {
    this(l1Cost, l2Cost, NUM_ITERATIONS_DEFAULT);
  }

  /**
   * Initializes a {@link QNMinimizer}.
   *
   * @param l1Cost The L1-regularization cost.
   * @param l2Cost The L2-regularization cost.
   * @param iterations The maximum number of iterations.
   */
  public QNMinimizer(double l1Cost, double l2Cost, int iterations) {
    this(l1Cost, l2Cost, iterations, M_DEFAULT, MAX_FCT_EVAL_DEFAULT);
  }

  /**
   * Initializes a {@link QNMinimizer}.
   *
   * @param l1Cost The L1-regularization cost.
   * @param l2Cost The L2-regularization cost.
   * @param iterations The maximum number of iterations.
   * @param m The number of Hessian updates to store.
   * @param maxFctEval The maximum number of function evaluations.
   */
  public QNMinimizer(double l1Cost, double l2Cost, int iterations, int m, int maxFctEval) {
    // Check arguments
    if (l1Cost < 0 || l2Cost < 0)
      throw new IllegalArgumentException("L1-cost and L2-cost must not be less than zero");

    if (iterations <= 0)
      throw new IllegalArgumentException("Number of iterations must be larger than zero");

    if (m <= 0)
      throw new IllegalArgumentException("Number of Hessian updates must be larger than zero");

    if (maxFctEval <= 0)
      throw new IllegalArgumentException(
          "Maximum number of function evaluations must be larger than zero");

    this.l1Cost = l1Cost;
    this.l2Cost = l2Cost;
    this.iterations = iterations;
    this.m = m;
    this.maxFctEval = maxFctEval;
  }

  public Evaluator getEvaluator() {
    return evaluator;
  }

  public void setEvaluator(Evaluator evaluator) {
    this.evaluator = evaluator;
  }

  /**
   * Finds the parameters that minimize the objective function.
   *
   * @param function The objective {@link Function}.
   * @return The minimizing parameters.
   */
  public double[] minimize(Function function) {
    Function l2RegFunction = new L2RegFunction(function, l2Cost);
    this.dimension = l2RegFunction.getDimension();
    this.updateInfo = new UpdateInfo(this.m, this.dimension);

    // Current point is at the origin
    double[] currPoint = new double[dimension];
    double currValue = l2RegFunction.valueAt(currPoint);

    // Gradient at the current point
    double[] currGrad = new double[dimension];
    System.arraycopy(l2RegFunction.gradientAt(currPoint), 0, currGrad, 0, dimension);

    // Pseudo-gradient - only used when L1-regularization is enabled
    double[] pseudoGrad = null;
    if (l1Cost > 0) {
      currValue += l1Cost * ArrayMath.l1norm(currPoint);
      pseudoGrad = new double[dimension];
      computePseudoGrad(currPoint, currGrad, pseudoGrad);
    }

    LineSearchResult lsr;
    if (l1Cost > 0) {
      lsr = LineSearchResult.getInitialObjectForL1(currValue, currGrad, pseudoGrad, currPoint);
    } else {
      lsr = LineSearchResult.getInitialObject(currValue, currGrad, currPoint);
    }

    if (logger.isDebugEnabled()) {
      logger.debug("Solving convex optimization problem.");
      logger.debug("Objective function has {} variable(s).", dimension);
      logger.debug("Performing {} iterations with L1Cost={} and L2Cost={}",
          iterations, l1Cost, l2Cost);
    }

    double[] direction = new double[dimension];
    long startTime = System.currentTimeMillis();

    // Initial step size for the 1st iteration
    double initialStepSize = l1Cost > 0
        ? ArrayMath.invL2norm(lsr.getPseudoGradAtNext())
        : ArrayMath.invL2norm(lsr.getGradAtNext());

    for (int iter = 1; iter <= iterations; iter++) {
      // Find direction
      if (l1Cost > 0) {
        System.arraycopy(lsr.getPseudoGradAtNext(), 0, direction, 0, direction.length);
      } else {
        System.arraycopy(lsr.getGradAtNext(), 0, direction, 0, direction.length);
      }
      computeDirection(direction);

      // Line search
      if (l1Cost > 0) {
        // Constrain the search direction
        pseudoGrad = lsr.getPseudoGradAtNext();
        for (int i = 0; i < dimension; i++) {
          if (direction[i] * pseudoGrad[i] >= 0) {
            direction[i] = 0;
          }
        }
        LineSearch.doConstrainedLineSearch(l2RegFunction, direction, lsr, l1Cost, initialStepSize);
        computePseudoGrad(lsr.getNextPoint(), lsr.getGradAtNext(), pseudoGrad);
        lsr.setPseudoGradAtNext(pseudoGrad);
      } else {
        LineSearch.doLineSearch(l2RegFunction, direction, lsr, initialStepSize);
      }

      // Save Hessian updates
      updateInfo.update(lsr);

      if (logger.isDebugEnabled()) {
        if (evaluator != null) {
          logger.debug("{}: \t{}\t{}\t{}", iter, lsr.getValueAtNext(),
              lsr.getFuncChangeRate(), evaluator.evaluate(lsr.getNextPoint()));
        } else {
          logger.debug("{}: \t{}\t{}", iter, lsr.getValueAtNext(), lsr.getFuncChangeRate());
        }
      }

      if (isConverged(lsr))
        break;

      initialStepSize = INITIAL_STEP_SIZE;
    }

    // Undo L2-shrinkage if Elastic Net is used (since
    // in that case, the shrinkage is done twice)
    if (l1Cost > 0 && l2Cost > 0) {
      double[] x = lsr.getNextPoint();
      for (int i = 0; i < dimension; i++) {
        x[i] = StrictMath.sqrt(1 + l2Cost) * x[i];
      }
    }

    long endTime = System.currentTimeMillis();
    long duration = endTime - startTime;
    logger.info("Running time: {}s", duration / 1000.);

    // Release memory
    this.updateInfo = null;
    System.gc();

    // Avoid returning the reference to LineSearchResult's member so that the GC
    // can collect the memory occupied by lsr after this method completes
    // (is it necessary?)
    double[] parameters = new double[dimension];
    System.arraycopy(lsr.getNextPoint(), 0, parameters, 0, dimension);
    return parameters;
  }
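  // Note on the L1 case: the term l1Cost * |x_i| is not differentiable at
  // x_i = 0, so minimize() follows the orthant-wise approach of Andrew et al.
  // (2007), cited below. computePseudoGrad() selects the one-sided partial
  // derivative that decreases the objective, and minimize() zeroes every
  // component of the search direction that does not oppose the pseudo-gradient
  // before running the constrained line search.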
  /**
   * Computes the pseudo-gradient for L1-regularization (see equation 4 in the paper
   * "Scalable Training of L1-Regularized Log-Linear Models", Andrew et al. 2007).
   *
   * @param x the current point
   * @param g the gradient at x
   * @param pg the pseudo-gradient at x, which is to be computed
   */
  private void computePseudoGrad(double[] x, double[] g, double[] pg) {
    for (int i = 0; i < dimension; i++) {
      if (x[i] < 0) {
        pg[i] = g[i] - l1Cost;
      } else if (x[i] > 0) {
        pg[i] = g[i] + l1Cost;
      } else {
        if (g[i] < -l1Cost) {
          // Right partial derivative
          pg[i] = g[i] + l1Cost;
        } else if (g[i] > l1Cost) {
          // Left partial derivative
          pg[i] = g[i] - l1Cost;
        } else {
          pg[i] = 0;
        }
      }
    }
  }

  /**
   * L-BFGS two-loop recursion (see Nocedal &amp; Wright 2006, Numerical Optimization, p. 178).
   */
  private void computeDirection(double[] direction) {
    // Implements the two-loop Hessian update method. On entry, direction holds
    // the (pseudo-)gradient; the recursion multiplies it by the approximated
    // inverse Hessian, and the final negation yields the descent direction.
    int k = updateInfo.kCounter;
    double[] rho = updateInfo.rho;
    double[] alpha = updateInfo.alpha; // just to avoid recreating alpha
    double[][] S = updateInfo.S;
    double[][] Y = updateInfo.Y;

    // First loop
    for (int i = k - 1; i >= 0; i--) {
      alpha[i] = rho[i] * ArrayMath.innerProduct(S[i], direction);
      for (int j = 0; j < dimension; j++) {
        direction[j] = direction[j] - alpha[i] * Y[i][j];
      }
    }

    // Second loop
    for (int i = 0; i < k; i++) {
      double beta = rho[i] * ArrayMath.innerProduct(Y[i], direction);
      for (int j = 0; j < dimension; j++) {
        direction[j] = direction[j] + S[i][j] * (alpha[i] - beta);
      }
    }

    for (int i = 0; i < dimension; i++) {
      direction[i] = -direction[i];
    }
  }

  private boolean isConverged(LineSearchResult lsr) {
    // Check the function's change rate
    if (lsr.getFuncChangeRate() < CONVERGE_TOLERANCE) {
      if (logger.isDebugEnabled())
        logger.debug("Function change rate is smaller than the threshold {}. "
            + "Training will stop.", CONVERGE_TOLERANCE);
      return true;
    }

    // Check the gradient's norm using the criterion: ||g(x)|| / max(1, ||x||) < threshold
    double xNorm = StrictMath.max(1, ArrayMath.l2norm(lsr.getNextPoint()));
    double gradNorm = l1Cost > 0
        ? ArrayMath.l2norm(lsr.getPseudoGradAtNext())
        : ArrayMath.l2norm(lsr.getGradAtNext());
    if (gradNorm / xNorm < REL_GRAD_NORM_TOL) {
      if (logger.isDebugEnabled())
        logger.debug("Relative L2-norm of the gradient is smaller than the threshold {}. "
            + "Training will stop.", REL_GRAD_NORM_TOL);
      return true;
    }

    // Check the step size
    if (lsr.getStepSize() < MIN_STEP_SIZE) {
      if (logger.isDebugEnabled())
        logger.debug("Step size is smaller than the minimum step size {}. "
            + "Training will stop.", MIN_STEP_SIZE);
      return true;
    }

    // Check the number of function evaluations
    if (lsr.getFctEvalCount() > this.maxFctEval) {
      if (logger.isDebugEnabled())
        logger.debug("The number of function evaluations has exceeded the maximum {}. "
            + "Training will stop.", this.maxFctEval);
      return true;
    }

    return false;
  }
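  // Note: UpdateInfo below stores rho[k] = 1 / (s_k . y_k), which is only
  // well-defined when the curvature condition s_k . y_k > 0 holds. A line
  // search satisfying the Wolfe conditions guarantees this (Nocedal & Wright
  // 2006, ch. 6); whether LineSearch enforces it is not visible in this file.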
  /**
   * Class to store vectors for the Hessian approximation update.
   */
  private class UpdateInfo {

    private final double[][] S;
    private final double[][] Y;
    private final double[] rho;
    private final double[] alpha;
    private final int m;

    private int kCounter;

    UpdateInfo(int numCorrection, int dimension) {
      this.m = numCorrection;
      this.kCounter = 0;
      S = new double[this.m][dimension];
      Y = new double[this.m][dimension];
      rho = new double[this.m];
      alpha = new double[this.m];
    }

    public void update(LineSearchResult lsr) {
      double[] currPoint = lsr.getCurrPoint();
      double[] gradAtCurr = lsr.getGradAtCurr();
      double[] nextPoint = lsr.getNextPoint();
      double[] gradAtNext = lsr.getGradAtNext();

      // Inner product of S_k and Y_k
      double SYk = 0.0;

      if (kCounter < m) {
        // Add new vectors
        for (int j = 0; j < dimension; j++) {
          S[kCounter][j] = nextPoint[j] - currPoint[j];
          Y[kCounter][j] = gradAtNext[j] - gradAtCurr[j];
          SYk += S[kCounter][j] * Y[kCounter][j];
        }
        rho[kCounter] = 1.0 / SYk;
      } else {
        // Discard the oldest vectors and add new ones. The oldest buffers are
        // rotated to the last slot before being overwritten; a plain shift
        // (S[i] = S[i + 1]) would leave S[m - 2] and S[m - 1] referencing the
        // same array and corrupt the stored history.
        double[] oldestS = S[0];
        double[] oldestY = Y[0];
        for (int i = 0; i < m - 1; i++) {
          S[i] = S[i + 1];
          Y[i] = Y[i + 1];
          rho[i] = rho[i + 1];
        }
        S[m - 1] = oldestS;
        Y[m - 1] = oldestY;
        for (int j = 0; j < dimension; j++) {
          S[m - 1][j] = nextPoint[j] - currPoint[j];
          Y[m - 1][j] = gradAtNext[j] - gradAtCurr[j];
          SYk += S[m - 1][j] * Y[m - 1][j];
        }
        rho[m - 1] = 1.0 / SYk;
      }

      if (kCounter < m)
        kCounter++;
    }
  }

  /**
   * L2-regularized objective {@link Function}.
   */
  public static class L2RegFunction implements Function {

    private final Function f;
    private final double l2Cost;

    public L2RegFunction(Function f, double l2Cost) {
      this.f = f;
      this.l2Cost = l2Cost;
    }

    @Override
    public int getDimension() {
      return f.getDimension();
    }

    @Override
    public double valueAt(double[] x) {
      checkDimension(x);
      double value = f.valueAt(x);
      if (l2Cost > 0) {
        value += l2Cost * ArrayMath.innerProduct(x, x);
      }
      return value;
    }

    @Override
    public double[] gradientAt(double[] x) {
      checkDimension(x);
      double[] gradient = f.gradientAt(x);
      if (l2Cost > 0) {
        for (int i = 0; i < x.length; i++) {
          gradient[i] += 2 * l2Cost * x[i];
        }
      }
      return gradient;
    }

    private void checkDimension(double[] x) {
      if (x.length != getDimension())
        throw new IllegalArgumentException(
            "x's dimension is not the same as the function's dimension");
    }
  }

  /**
   * Evaluates the quality of training parameters. For example,
   * it can be used to report the model's training accuracy when
   * training a Maximum Entropy classifier.
   */
  public interface Evaluator {

    /**
     * Measures the quality of the training parameters.
     *
     * @param parameters The parameters used for training.
     * @return The evaluated result.
     */
    double evaluate(double[] parameters);
  }
}