package hex.optimization;

import hex.optimization.OptimizationUtils.GradientInfo;
import hex.optimization.OptimizationUtils.GradientSolver;
import hex.optimization.OptimizationUtils.LineSearchSolver;
import hex.optimization.OptimizationUtils.MoreThuente;
import water.Iced;
import water.MemoryManager;
import water.util.ArrayUtils;
import water.util.MathUtils;

import java.util.Arrays;
import java.util.Random;

/**
 * Created by tomasnykodym on 9/15/14.
 *
 * Generic L-BFGS optimizer implementation.
 *
 * NOTE: The solver object keeps its state, so the same object cannot be reused to solve different problems.
 * (but can be used for warm-starting/continuation of the same problem)
 *
 * Usage:
 *
 * To apply L-BFGS to your optimization problem, provide a GradientSolver with the following two methods:
 *   1) GradientInfo getGradient(double []):
 *      evaluate the gradient info (objective value + gradient) at the given coefficients, typically an MRTask
 *   2) double [] getObjVals(double[] beta, double[] direction):
 *      evaluate the objective value at the line-search points (e.g. objVals[k] = obj(beta + step(k)*direction), step(k) = .75^k)
 *      typically a single MRTask
 *   @see hex.glm.GLM.GLMGradientSolver
 *
 * L-BFGS will then perform the following loop:
 *   while(not converged):
 *     coefs   := doLineSearch(coefs, dir)  // distributed, 1 pass over data
 *     ginfo   := getGradient(coefs)        // distributed, 1 pass over data
 *     history += (coefs, ginfo)            // local
 *     dir     := newDir(history, ginfo)    // local
 *
 * 1 L-BFGS iteration thus takes 2 passes over the (distributed) dataset.
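 *
 * A minimal usage sketch (the GradientSolver instance is a placeholder and n/seed are
 * problem-specific values chosen by the caller):
 * <pre>{@code
 *   GradientSolver gslvr = ...;                    // user-supplied, e.g. hex.glm.GLM.GLMGradientSolver
 *   double [] beta = L_BFGS.startCoefs(n, seed);   // random start, or supply your own coefficients
 *   L_BFGS.Result r = new L_BFGS()
 *       .setMaxIter(200)
 *       .setGradEps(1e-6)
 *       .solve(gslvr, beta);
 *   double [] optimum = r.coefs;                   // solution found
 *   double objVal = r.ginfo._objVal;               // objective value at the solution
 * }</pre>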
 *
*/
public final class L_BFGS extends Iced {
  int _maxIter = 500;
  double _gradEps = 1e-8;
  double _objEps = 1e-10;
  // number of (s,y) pairs kept in the L-BFGS history
  int _historySz = 20;

  History _hist;


  public L_BFGS() {}
  public L_BFGS setMaxIter(int m) {_maxIter = m; return this;}
  public L_BFGS setGradEps(double d) {_gradEps = d; return this;}
  public L_BFGS setObjEps(double d) {
    _objEps = d; return this;
  }
  public L_BFGS setHistorySz(int sz) {_historySz = sz; return this;}


  public int k() {return _hist._k;}
  public int maxIter(){ return _maxIter;}



  /**
   * Monitor progress and enable early termination.
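   *
   * A sketch of a monitor that caps the number of progress calls (the limit of 100 is an
   * arbitrary illustrative choice):
   * <pre>{@code
   *   ProgressMonitor pm = new ProgressMonitor() {
   *     private int _calls;
   *     @Override public boolean progress(double[] betaDiff, GradientInfo ginfo) {
   *       return ++_calls < 100; // return false to stop the solver early
   *     }
   *   };
   * }</pre>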
   */
  public interface ProgressMonitor {
    boolean progress(double [] betaDiff, GradientInfo ginfo);
  }

  // constants used in line search

  public static final class Result {
    public final int iter;
    public final double [] coefs;
    public final GradientInfo ginfo;
    public final boolean converged;
    public final double rel_improvement;

    public Result(boolean converged, int iter, double [] coefs, GradientInfo ginfo, double rel_improvement){
      this.iter = iter;
      this.coefs = coefs;
      this.ginfo = ginfo;
      this.converged = converged;
      this.rel_improvement = rel_improvement;
    }

    public String toString(){
      if(coefs.length < 10) {
        return "L-BFGS_res(converged? " + converged + ", iter = " + iter + ", obj = " + ginfo._objVal + ", rel_improvement = " + rel_improvement +  ", coefs = " + Arrays.toString(coefs) + ", grad = " + Arrays.toString(ginfo._gradient) + ")";
      } else {
        return "L-BFGS_res(converged? " + converged + ", iter = " + iter + ", obj = " + ginfo._objVal + ", rel_improvement = " + rel_improvement +  "grad_linf_norm = " + ArrayUtils.linfnorm(ginfo._gradient,false) + ")";
      }
    }
  }

  /**
   *  Keeps the L-BFGS history, i.e. the curvature information recorded over the last m steps.
   */
  public static final class History extends Iced {
    private final double [][] _s;
    private final double [][] _y;
    private final double [] _rho;
    private final double [] _alpha;

    final int _m, _n;

    public History(int m, int n) {
      _m = m;
      _alpha = new double[_m];
      _n = n;
      _s = new double[m][];
      _y = new double[m][];
      _rho = MemoryManager.malloc8d(m);
      Arrays.fill(_rho,Double.NaN);
      for (int i = 0; i < m; ++i) {
        _s[i] = MemoryManager.malloc8d(n);
        Arrays.fill(_s[i], Double.NaN); // to make sure we don't just run with zeros
        _y[i] = MemoryManager.malloc8d(n);
        Arrays.fill(_y[i], Double.NaN);
      }
    }
    int getId(int k) {return (_k + k) % _m;} // circular-buffer slot at offset k from the next write position

    int _k;

    // record the latest step (pk) and gradient change (gNew - gOld) in the circular buffer,
    // together with the curvature scalar rho = 1/(s'y)
    private final void update(double [] pk, double [] gNew, double [] gOld){
      int id = getId(0);
      double[] y = _y[id];
      double[] s = _s[id];
      for (int i = 0; i < gNew.length; ++i)
        y[i] = gNew[i] - gOld[i];
      System.arraycopy(pk,0,s,0,pk.length);
      _rho[id] = 1.0/ArrayUtils.innerProduct(s,y);
      ++_k;
    }

    // the actual core of the L-BFGS algorithm: the two-loop recursion that
    // computes the new search direction from the gradient at the current beta and the stored history
    protected final double [] getSearchDirection(final double [] gradient, double [] q) {
      System.arraycopy(gradient,0,q,0,q.length);
      if(_k != 0) {
        int k = Math.min(_k,_m);
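        // first loop of the two-loop recursion: alpha_i = rho_i * s_i'q, then q -= alpha_i * y_i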
        for (int i = 1; i <= k; ++i) {
          int id = getId(-i);
          _alpha[id] = _rho[id] * ArrayUtils.innerProduct(_s[id], q);
          MathUtils.wadd(q, _y[id], -_alpha[id]);
        }
        int lastId = getId(-1);
        final double[] y = _y[lastId];
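        // scale by the initial inverse Hessian H_k^0 = gamma_k * I with gamma_k = s'y/(y'y) = 1/(y'y * rho);
        // the minus sign folds the final negation (direction = -H * gradient) into this step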
        double Hk0 = -1.0 / (ArrayUtils.innerProduct(y, y) * _rho[lastId]);
        ArrayUtils.mult(q, Hk0);
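        // second loop: beta_i = rho_i * y_i'q, then q += s_i * (-alpha_i - beta_i)
        // (matches the standard q += s_i * (alpha_i - beta_i) because q carries the flipped sign)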
        for (int i = k; i > 0; --i) {
          int id = getId(-i);
          double beta = _rho[id] * ArrayUtils.innerProduct(_y[id], q);
          MathUtils.wadd(q, _s[id], -_alpha[id] - beta);
        }
      } else ArrayUtils.mult(q,-1); // no history yet: fall back to steepest descent (negated gradient)
      return q;
    }
  }

  /**
   * Solve the optimization problem defined by the user-supplied ginfo function using the L-BFGS algorithm.
   *
   * Will result in multiple (10s to 100s or even 1000s of) calls to the user-provided ginfo function.
   * Outside of that, it does only limited single-threaded computation (on the order of the number of coefficients).
   * The ginfo evaluation is likely to be the most expensive part and the key to good performance.
   *
   * @param gslvr - user ginfo function
   * @param beta - starting solution
   * @return Optimal solution (coefficients) + gradient info returned by the user ginfo
   * function evaluated at the found optimum.
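   *
   * Because the solver keeps its history, the same L_BFGS instance can continue a previous run
   * (a sketch; gslvr, beta and pm are assumed to exist as in the class-level example):
   * <pre>{@code
   *   L_BFGS lbfgs = new L_BFGS();
   *   Result r1 = lbfgs.solve(gslvr, beta, gslvr.getGradient(beta), pm);
   *   Result r2 = lbfgs.solve(gslvr, r1.coefs, r1.ginfo, pm); // warm-start from where r1 stopped
   * }</pre>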
   */
//  public final Result solve(GradientSolver gslvr, double [] beta, GradientInfo ginfo, ProgressMonitor pm) {return solve(gslvr,beta,beta.clone(),ginfo,pm);}
  public final Result solve(GradientSolver gslvr, double [] beta, GradientInfo ginfo, ProgressMonitor pm) {
    if(_hist == null)
      _hist = new History(_historySz, beta.length);
    int iter = 0;
    double rel_improvement = 1;
    final double [] pk = new double[beta.length];
    double minStep = 1e-16;
    LineSearchSolver lineSearch = new MoreThuente(gslvr,beta,ginfo);
    while(!ArrayUtils.hasNaNsOrInfs(beta) && (ArrayUtils.linfnorm(ginfo._gradient,false) > _gradEps  && rel_improvement > _objEps) && iter != _maxIter) {
      ++iter;
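      // new search direction from the current gradient and history (local), followed by one
      // distributed pass for the line search and one for the gradient at the new point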
      _hist.getSearchDirection(ginfo._gradient,pk);
      if(!lineSearch.evaluate(pk))
        break;
      lineSearch.setInitialStep(Math.max(minStep, lineSearch.step()));
      GradientInfo newGinfo = lineSearch.ginfo();
      _hist.update(pk, newGinfo._gradient, ginfo._gradient);
      rel_improvement = (ginfo._objVal - newGinfo._objVal)/Math.abs(ginfo._objVal);
      ginfo = newGinfo;
      if(!pm.progress(lineSearch.getX(), ginfo))break;
    }
    return new Result((ArrayUtils.linfnorm(ginfo._gradient,false) <= _gradEps  || rel_improvement <= _objEps),iter,lineSearch.getX(), lineSearch.ginfo(),rel_improvement);
  }

  /**
   * Solve the optimization problem defined by the user-supplied ginfo function using the L-BFGS algorithm.
   *
   * Will result in multiple (10s to 100s or even 1000s of) calls to the user-provided ginfo function.
   * Outside of that, it does only limited single-threaded computation (on the order of the number of coefficients).
   * The ginfo evaluation is likely to be the most expensive part and the key to good performance.
   *
   * @param gslvr  - user ginfo function
   * @param coefs - initial solution
   * @return Optimal solution (coefficients) + gradient info returned by the user ginfo
   * function evaluated at the found optimum.
   */
  public final Result solve(GradientSolver gslvr, double [] coefs){
    return solve(gslvr, coefs, gslvr.getGradient(coefs), new ProgressMonitor(){
      @Override
      public boolean progress(double[] beta, GradientInfo ginfo) {
        return true;
      }
    });
  }

  public static double [] startCoefs(int n, long seed){
    double [] res = MemoryManager.malloc8d(n);
    Random r = new Random(seed);
    for(int i = 0; i < res.length; ++i)
      res[i] = r.nextGaussian();
    return res;
  }
}



