/*******************************************************************************
 * Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 ******************************************************************************/

package smile.math;

import java.util.ArrayList;
import java.util.Arrays;
import smile.math.blas.UPLO;
import smile.math.matrix.Matrix;
import smile.sort.QuickSort;
import smile.util.Strings;
import static java.lang.Math.abs;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.sqrt;
import static smile.math.MathEx.axpy;
import static smile.math.MathEx.dot;
import static smile.math.MathEx.norm;

/**
 * The Broyden–Fletcher–Goldfarb–Shanno (BFGS) algorithm is an iterative
 * method for solving unconstrained nonlinear optimization problems.
 * <p>
 * The BFGS method belongs to quasi-Newton methods, a class of hill-climbing
 * optimization techniques that seek a stationary point of a (preferably
 * twice continuously differentiable) function. For such problems,
 * a necessary condition for optimality is that the gradient be zero.
 * Newton's method and the BFGS methods are not guaranteed to converge
 * unless the function has a quadratic Taylor expansion near an optimum.
 * However, BFGS has proven to have good performance even for non-smooth
 * optimizations.
 * <p>
 * In quasi-Newton methods, the Hessian matrix of second derivatives is
 * not computed. Instead, the Hessian matrix is approximated using
 * updates specified by gradient evaluations (or approximate gradient
 * evaluations). Quasi-Newton methods are generalizations of the secant
 * method to find the root of the first derivative for multidimensional
 * problems. In multi-dimensional problems, the secant equation does not
 * specify a unique solution, and quasi-Newton methods differ in how they
 * constrain the solution. The BFGS method is one of the most popular
 * members of this class.
 * <p>
 * Like the original BFGS, the limited-memory BFGS (L-BFGS) uses an
 * estimation to the inverse Hessian matrix to steer its search
 * through variable space, but where BFGS stores a dense n × n
 * approximation to the inverse Hessian (n being the number of
 * variables in the problem), L-BFGS stores only a few vectors
 * that represent the approximation implicitly. Due to its resulting
 * linear memory requirement, the L-BFGS method is particularly well
 * suited for optimization problems with a large number of variables
 * (e.g., > 1000). Instead of the inverse Hessian H_k, L-BFGS maintains
 * a history of the past m updates of the position x and gradient ∇f(x),
 * where generally the history size m can be small (often m < 10).
 * These updates are used to implicitly do operations requiring the
 * H_k-vector product.
 *
 * <h2>References</h2>
 * <ol>
 * <li>Roger Fletcher. Practical methods of optimization.</li>
 * <li>D. Liu and J. Nocedal. On the limited memory BFGS method for large scale optimization. Mathematical Programming B 45 (1989) 503-528.</li>
 * <li>Richard H. Byrd, Peihuang Lu, Jorge Nocedal and Ciyou Zhu. A limited memory algorithm for bound constrained optimization.</li>
 * </ol>
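 * <p>
 * A minimal usage sketch follows. It is illustrative only: the quadratic
 * objective and the tolerance values are arbitrary, and it assumes the value
 * method of {@code MultivariateFunction} is {@code f(double[])} (Smile's usual
 * naming), while the gradient method {@code g(double[], double[])} is the one
 * invoked by this class. Adjust the method names if your version differs.
 * <pre>{@code
 * DifferentiableMultivariateFunction func = new DifferentiableMultivariateFunction() {
 *     public double f(double[] x) {
 *         // f(x) = (x0 - 1)^2 + (x1 + 2)^2, minimized at (1, -2)
 *         return (x[0] - 1) * (x[0] - 1) + (x[1] + 2) * (x[1] + 2);
 *     }
 *
 *     public double g(double[] x, double[] gradient) {
 *         gradient[0] = 2 * (x[0] - 1);
 *         gradient[1] = 2 * (x[1] + 2);
 *         return f(x);
 *     }
 * };
 *
 * double[] x = {0.0, 0.0};                          // initial guess, overwritten with the solution
 * double fmin = BFGS.minimize(func, x, 1E-5, 500);  // dense BFGS
 * }</pre>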
 *
 * @author Haifeng Li
 */
public class BFGS {
    private static org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BFGS.class);

    /** A number close to zero, between machine epsilon and its square root. */
    private static final double EPSILON = Double.valueOf(System.getProperty("smile.bfgs.epsilon", "1E-10"));
    /** The convergence criterion on x values. */
    private static final double TOLX = 4 * EPSILON;
    /** The convergence criterion on function value. */
    private static final double TOLF = 4 * EPSILON;
    /** The scaled maximum step length allowed in line searches. */
    private static final double STPMX = 100.0;

    /**
     * This method solves the unconstrained minimization problem
     * <pre>
     *     min f(x),    x = (x1,x2,...,x_n),
     * </pre>
     * using the BFGS method.
     *
     * @param func the function to be minimized.
     * @param x on initial entry this must be set by the user to the values
     *          of the initial estimate of the solution vector. On exit, it
     *          contains the values of the variables at the best point found
     *          (usually a solution).
     * @param gtol the convergence tolerance on zeroing the gradient.
     * @param maxIter the maximum number of iterations.
     * @return the minimum value of the function.
     */
    public static double minimize(DifferentiableMultivariateFunction func, double[] x, double gtol, int maxIter) {
        if (gtol <= 0.0) {
            throw new IllegalArgumentException("Invalid gradient tolerance: " + gtol);
        }

        if (maxIter <= 0) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        double den, fac, fad, fae, sumdg, sumxi, temp, test;

        int n = x.length;
        double[] dg = new double[n];
        double[] g = new double[n];
        double[] hdg = new double[n];
        double[] xnew = new double[n];
        double[] xi = new double[n];
        double[][] hessin = new double[n][n];

        // Calculate starting function value and gradient and initialize the
        // inverse Hessian to the unit matrix.
        double f = func.g(x, g);
        logger.info(String.format("BFGS: initial function value: %.5f", f));

        for (int i = 0; i < n; i++) {
            hessin[i][i] = 1.0;
            // Initialize line direction.
            xi[i] = -g[i];
        }

        double stpmax = STPMX * max(norm(x), n);

        for (int iter = 1; iter <= maxIter; iter++) {
            // The new function evaluation occurs in line search.
            f = linesearch(func, x, f, g, xi, xnew, stpmax);

            if (iter % 100 == 0) {
                logger.info(String.format("BFGS: the function value after %3d iterations: %.5f", iter, f));
            }

            // update the line direction and current point.
            for (int i = 0; i < n; i++) {
                xi[i] = xnew[i] - x[i];
                x[i] = xnew[i];
            }

            // Test for convergence on x.
            test = 0.0;
            for (int i = 0; i < n; i++) {
                temp = abs(xi[i]) / max(abs(x[i]), 1.0);
                if (temp > test) {
                    test = temp;
                }
            }

            if (test < TOLX) {
                logger.info(String.format("BFGS converges on x after %d iterations: %.5f", iter, f));
                return f;
            }

            System.arraycopy(g, 0, dg, 0, n);
            func.g(x, g);

            // Test for convergence on zero gradient.
            den = max(f, 1.0);
            test = 0.0;
            for (int i = 0; i < n; i++) {
                temp = abs(g[i]) * max(abs(x[i]), 1.0) / den;
                if (temp > test) {
                    test = temp;
                }
            }

            if (test < gtol) {
                logger.info(String.format("BFGS converges on gradient after %d iterations: %.5f", iter, f));
                return f;
            }

            for (int i = 0; i < n; i++) {
                dg[i] = g[i] - dg[i];
            }

            for (int i = 0; i < n; i++) {
                hdg[i] = 0.0;
                for (int j = 0; j < n; j++) {
                    hdg[i] += hessin[i][j] * dg[j];
                }
            }

            fac = fae = sumdg = sumxi = 0.0;
            for (int i = 0; i < n; i++) {
                fac += dg[i] * xi[i];
                fae += dg[i] * hdg[i];
                sumdg += dg[i] * dg[i];
                sumxi += xi[i] * xi[i];
            }

            // Skip update if fac is not sufficiently positive.
            if (fac > sqrt(EPSILON * sumdg * sumxi)) {
                fac = 1.0 / fac;
                fad = 1.0 / fae;

                // The vector that makes BFGS different from DFP.
                for (int i = 0; i < n; i++) {
                    dg[i] = fac * xi[i] - fad * hdg[i];
                }

                // BFGS updating formula.
                for (int i = 0; i < n; i++) {
                    for (int j = i; j < n; j++) {
                        hessin[i][j] += fac * xi[i] * xi[j] - fad * hdg[i] * hdg[j] + fae * dg[i] * dg[j];
                        hessin[j][i] = hessin[i][j];
                    }
                }
            }

            // Calculate the next direction to go.
            Arrays.fill(xi, 0.0);
            for (int i = 0; i < n; i++) {
                for (int j = 0; j < n; j++) {
                    xi[i] -= hessin[i][j] * g[j];
                }
            }
        }

        logger.warn(String.format("BFGS reaches maximum %d iterations: %.5f", maxIter, f));
        return f;
    }
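
    // Note on the update used in minimize() above: with step s = x_new - x_old
    // and gradient change y = g_new - g_old, the loop over hessin applies the
    // inverse-Hessian BFGS update in the rank-two correction form, roughly
    //
    //   H <- H + (s s')/(s'y) - (H y)(H y)'/(y'H y) + (y'H y) u u',
    //   u  = s/(s'y) - (H y)/(y'H y),
    //
    // where fac holds s'y, fae holds y'H y, and dg is overwritten with u.
    // The update is skipped when s'y is not sufficiently positive, which keeps
    // the approximation positive definite.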

    /**
     * This method solves the unconstrained minimization problem
     * <pre>
     *     min f(x),    x = (x1,x2,...,x_n),
     * </pre>
     * using the limited-memory BFGS method. The method is especially
     * effective on problems involving a large number of variables. In
     * a typical iteration of this method an approximation Hk to the
     * inverse of the Hessian is obtained by applying m BFGS updates to
     * a diagonal matrix Hk0, using information from the previous M steps.
     * The user specifies the number m, which determines the amount of
     * storage required by the routine.
     *
     * @param func the function to be minimized.
     * @param m the number of corrections used in the L-BFGS update.
     *          Values of m less than 3 are not recommended;
     *          large values of m will result in excessive
     *          computing time. 3 <= m <= 7 is recommended.
     *          A common choice for m is m = 5.
     * @param x on initial entry this must be set by the user to the values
     *          of the initial estimate of the solution vector. On exit with
     *          iflag = 0, it contains the values of the variables
     *          at the best point found (usually a solution).
     * @param gtol the convergence tolerance on zeroing the gradient.
     * @param maxIter the maximum number of iterations.
     * @return the minimum value of the function.
     */
    public static double minimize(DifferentiableMultivariateFunction func, int m, double[] x, double gtol, int maxIter) {
        if (gtol <= 0.0) {
            throw new IllegalArgumentException("Invalid gradient tolerance: " + gtol);
        }

        if (maxIter <= 0) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        if (m <= 0) {
            throw new IllegalArgumentException("Invalid m: " + m);
        }

        int n = x.length;

        // The solution vector of line search.
        double[] xnew = new double[n];
        // The gradient vector of line search.
        double[] gnew = new double[n];
        // Line search direction.
        double[] xi = new double[n];
        // difference of x from previous step.
        double[][] s = new double[m][n];
        // difference of g from previous step.
        double[][] y = new double[m][n];
        // buffer for 1 / (y' * s)
        double[] rho = new double[m];
        double[] a = new double[m];

        // Diagonal of initial H0.
        double diag = 1.0;

        // Current gradient.
        double[] g = new double[n];
        // Current function value.
        double f = func.g(x, g);

        logger.info(String.format("L-BFGS: initial function value: %.5f", f));

        // Initial line search direction.
        for (int i = 0; i < n; i++) {
            xi[i] = -g[i];
        }

        // Upper limit for line search step.
        double stpmax = STPMX * max(norm(x), n);

        for (int iter = 1, k = 0; iter <= maxIter; iter++) {
            linesearch(func, x, f, g, xi, xnew, stpmax);
            f = func.g(xnew, gnew);

            for (int i = 0; i < n; i++) {
                s[k][i] = xnew[i] - x[i];
                y[k][i] = gnew[i] - g[i];
                x[i] = xnew[i];
                g[i] = gnew[i];
            }

            // Test for convergence on x.
            double test = 0.0;
            for (int i = 0; i < n; i++) {
                double temp = abs(s[k][i]) / max(abs(x[i]), 1.0);
                if (temp > test) {
                    test = temp;
                }
            }

            if (test < TOLX) {
                logger.info(String.format("L-BFGS converges on x after %d iterations: %.5f", iter, f));
                return f;
            }

            // Test for convergence on zero gradient.
            test = 0.0;
            double den = max(f, 1.0);
            for (int i = 0; i < n; i++) {
                double temp = abs(g[i]) * max(abs(x[i]), 1.0) / den;
                if (temp > test) {
                    test = temp;
                }
            }

            if (test < gtol) {
                logger.info(String.format("L-BFGS converges on gradient after %d iterations: %.5f", iter, f));
                return f;
            }

            if (iter % 100 == 0) {
                logger.info(String.format("L-BFGS: the function value after %3d iterations: %.5f", iter, f));
            }

            double ys = dot(y[k], s[k]);
            double yy = dot(y[k], y[k]);
            diag = ys / yy;

            rho[k] = 1.0 / ys;

            for (int i = 0; i < n; i++) {
                xi[i] = -g[i];
            }

            int cp = k;
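
            // What follows is the classical L-BFGS two-loop recursion applied to
            // q = -g: the first loop walks backwards through the stored (s, y)
            // pairs, computing alpha_i = rho_i * s_i'q and q <- q - alpha_i * y_i;
            // the result is then scaled by the initial inverse-Hessian
            // approximation diag = s'y / y'y; the second loop walks forwards,
            // applying q <- q + (alpha_i - beta_i) * s_i. The net effect is
            // xi = -H_k * g without ever forming H_k explicitly.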
            int bound = iter > m ? m : iter;
            for (int i = 0; i < bound; i++) {
                a[cp] = rho[cp] * dot(s[cp], xi);
                axpy(-a[cp], y[cp], xi);
                if (--cp == -1) {
                    cp = m - 1;
                }
            }

            for (int i = 0; i < n; i++) {
                xi[i] *= diag;
            }

            for (int i = 0; i < bound; i++) {
                if (++cp == m) {
                    cp = 0;
                }
                double b = rho[cp] * dot(y[cp], xi);
                axpy(a[cp] - b, s[cp], xi);
            }

            if (++k == m) {
                k = 0;
            }
        }

        logger.warn(String.format("L-BFGS reaches maximum %d iterations: %.5f", maxIter, f));
        return f;
    }
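
    // A minimal usage sketch for the limited-memory variant (illustrative only;
    // the history size, tolerance and iteration limit below are arbitrary, and
    // func is an objective as in the class-level example):
    //
    //   double[] x = new double[10000];                      // initial guess
    //   double fmin = BFGS.minimize(func, 5, x, 1E-5, 500);  // m = 5 correction pairs
    //
    // Compared with minimize(func, x, gtol, maxIter) above, this variant stores
    // roughly 2*m vectors of length n instead of a dense n-by-n inverse Hessian.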

    /**
     * Minimize a function along a search direction by finding a step which
     * satisfies a sufficient decrease condition and a curvature condition.
     * <p>
     * At each stage this function updates an interval of uncertainty with
     * endpoints stx and sty. The interval of uncertainty is initially chosen
     * so that it contains a minimizer of the modified function
     * <pre>
     *      f(x+stp*s) - f(x) - ftol*stp*(gradf(x)'s).
     * </pre>
     * If a step is obtained for which the modified function
     * has a nonpositive function value and nonnegative derivative,
     * then the interval of uncertainty is chosen so that it
     * contains a minimizer of f(x+stp*s).
     * <p>
     * The algorithm is designed to find a step which satisfies
     * the sufficient decrease condition
     * <pre>
     *       f(x+stp*s) <= f(x) + ftol*stp*(gradf(x)'s),
     * </pre>
     * and the curvature condition
     * <pre>
     *       abs(gradf(x+stp*s)'s) <= gtol*abs(gradf(x)'s).
     * </pre>
     * If ftol is less than gtol and if, for example,
     * the function is bounded below, then there is always a step which
     * satisfies both conditions. If no step can be found which satisfies both
     * conditions, then the algorithm usually stops when rounding
     * errors prevent further progress. In this case stp only
     * satisfies the sufficient decrease condition.
     *
     * @param xold on input this contains the base point for the line search.
     * @param fold on input this contains the value of the objective function
     *             at x.
     * @param g on input this contains the gradient of the objective function
     *          at x.
     * @param p the search direction.
     * @param x on output, it contains xold + γ*p, where γ > 0 is the
     *          step length.
     * @param stpmax specify upper bound for the step in the line search so that
     *               we do not try to evaluate the function in regions where it is
     *               undefined or subject to overflow.
     * @return the new function value.
     */
    private static double linesearch(MultivariateFunction func, double[] xold, double fold, double[] g, double[] p, double[] x, double stpmax) {
        if (stpmax <= 0) {
            throw new IllegalArgumentException("Invalid upper bound of linear search step: " + stpmax);
        }

        // Termination occurs when the relative width of the interval
        // of uncertainty is at most xtol.
        final double xtol = EPSILON;
        // Tolerance for the sufficient decrease condition.
        final double ftol = 1.0E-4;

        int n = xold.length;

        // Scale if attempted step is too big
        double pnorm = norm(p);
        if (pnorm > stpmax) {
            double r = stpmax / pnorm;
            for (int i = 0; i < n; i++) {
                p[i] *= r;
            }
        }

        // Check if s is a descent direction.
        double slope = 0.0;
        for (int i = 0; i < n; i++) {
            slope += g[i] * p[i];
        }

        if (slope >= 0) {
            //throw new IllegalArgumentException("Line Search: the search direction is not a descent direction, which may be caused by roundoff problem.");
        }

        // Calculate minimum step.
        double test = 0.0;
        for (int i = 0; i < n; i++) {
            double temp = abs(p[i]) / max(xold[i], 1.0);
            if (temp > test) {
                test = temp;
            }
        }

        double alammin = xtol / test;
        double alam = 1.0;

        double alam2 = 0.0, f2 = 0.0;
        double a, b, disc, rhs1, rhs2, tmpalam;
        while (true) {
            // Evaluate the function and gradient at stp
            // and compute the directional derivative.
            for (int i = 0; i < n; i++) {
                x[i] = xold[i] + alam * p[i];
            }

            double f = func.apply(x);

            // Convergence on Δx.
            if (alam < alammin) {
                System.arraycopy(xold, 0, x, 0, n);
                return f;
            } else if (f <= fold + ftol * alam * slope) {
                // Sufficient function decrease.
                return f;
            } else {
                // Backtrack
                if (alam == 1.0) {
                    // First time
                    tmpalam = -slope / (2.0 * (f - fold - slope));
                } else {
                    // Subsequent backtracks.
                    rhs1 = f - fold - alam * slope;
                    rhs2 = f2 - fold - alam2 * slope;
                    a = (rhs1 / (alam * alam) - rhs2 / (alam2 * alam2)) / (alam - alam2);
                    b = (-alam2 * rhs1 / (alam * alam) + alam * rhs2 / (alam2 * alam2)) / (alam - alam2);
                    if (a == 0.0) {
                        tmpalam = -slope / (2.0 * b);
                    } else {
                        disc = b * b - 3.0 * a * slope;
                        if (disc < 0.0) {
                            tmpalam = 0.5 * alam;
                        } else if (b <= 0.0) {
                            tmpalam = (-b + sqrt(disc)) / (3.0 * a);
                        } else {
                            tmpalam = -slope / (b + sqrt(disc));
                        }
                    }
                    if (tmpalam > 0.5 * alam) {
                        tmpalam = 0.5 * alam;
                    }
                }
            }

            alam2 = alam;
            f2 = f;
            alam = max(tmpalam, 0.1 * alam);
        }
    }
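
    // Note: the backtracking implementation above only enforces the sufficient
    // decrease (Armijo) condition, using quadratic interpolation on the first
    // backtrack and cubic interpolation afterwards; the curvature condition
    // described in the Javadoc is not checked explicitly.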

    /**
     * This method solves the bound constrained minimization problem
     * using the L-BFGS-B method. The L-BFGS-B algorithm extends L-BFGS
     * to handle simple box constraints on variables; that is, constraints
     * of the form li ≤ xi ≤ ui where li and ui are per-variable constant
     * lower and upper bounds, respectively (for each xi, either or both
     * bounds may be omitted). The method works by identifying fixed and
     * free variables at every step (using a simple gradient method),
     * and then using the L-BFGS method on the free variables only to
     * get higher accuracy, and then repeating the process.
     *
     * @param func the function to be minimized.
     * @param m the number of corrections used in the L-BFGS update.
     *          Values of m less than 3 are not recommended;
     *          large values of m will result in excessive
     *          computing time. 3 <= m <= 7 is recommended.
     *          A common choice for m is m = 5.
     * @param x on initial entry this must be set by the user to the values
     *          of the initial estimate of the solution vector. On exit with
     *          iflag = 0, it contains the values of the variables
     *          at the best point found (usually a solution).
     * @param l the lower bound.
     * @param u the upper bound.
     * @param gtol the convergence tolerance on zeroing the gradient.
     * @param maxIter the maximum number of iterations.
     * @return the minimum value of the function.
     */
    public static double minimize(DifferentiableMultivariateFunction func, int m, double[] x, double[] l, double[] u, double gtol, int maxIter) {
        if (gtol <= 0.0) {
            throw new IllegalArgumentException("Invalid gradient tolerance: " + gtol);
        }

        if (maxIter <= 0) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        if (m <= 0) {
            throw new IllegalArgumentException("Invalid m: " + m);
        }

        if (l.length != x.length) {
            throw new IllegalArgumentException("Invalid lower bound size: " + l.length);
        }

        if (u.length != x.length) {
            throw new IllegalArgumentException("Invalid upper bound size: " + u.length);
        }

        int n = x.length;

        double theta = 1.0;
        Matrix Y = null, S = null;
        Matrix W = new Matrix(n, 1);
        Matrix M = new Matrix(1, 1);
        ArrayList<double[]> yHistory = new ArrayList<>();
        ArrayList<double[]> sHistory = new ArrayList<>();

        double[] y = new double[n];
        double[] s = new double[n];
        double[] p = new double[n];
        double[] g = new double[n];
        double[] cauchy = new double[n];

        double f = func.g(x, g);

        double[] x_old = new double[n];
        double[] g_old = new double[n];

        // Upper limit for line search step.
        double stpmax = STPMX * max(norm(x), n);

        for (int iter = 1; iter <= maxIter; iter++) {
            double f_old = f;
            System.arraycopy(x, 0, x_old, 0, n);
            System.arraycopy(g, 0, g_old, 0, n);

            // STEP 2: compute the cauchy point
            System.arraycopy(x, 0, cauchy, 0, n);
            double[] c = cauchy(x, g, cauchy, l, u, theta, W, M);
            clampToBound(cauchy, l, u);

            // STEP 3: compute a search direction d_k by the primal method for the sub-problem
            double[] subspaceMin = subspaceMinimization(x, g, cauchy, c, l, u, theta, W, M);
            clampToBound(subspaceMin, l, u);

            // STEP 4: perform line search
            for (int i = 0; i < n; i++) {
                p[i] = subspaceMin[i] - x[i];
            }

            linesearch(func, x_old, f_old, g, p, x, stpmax);
            clampToBound(x, l, u);

            for (double xi : x) {
                if (Double.isNaN(xi) || Double.isInfinite(xi)) {
                    logger.warn("L-BFGS-B: bad x produced by line search, return previous good x");
                    System.arraycopy(x_old, 0, x, 0, n);
                    return f_old;
                }
            }

            // STEP 5: compute gradient, update current guess and function information
            f = func.g(x, g);
            if (Double.isNaN(f) || Double.isInfinite(f)) {
                logger.warn("L-BFGS-B: bad f(x) produced by line search, return previous good x");
                System.arraycopy(x_old, 0, x, 0, n);
                return f_old;
            }

            if (gnorm(x, g, l, u) < gtol) {
                logger.info(String.format("L-BFGS-B converges on gradient after %d iterations: %.5f", iter, f));
                return f;
            }

            if (iter % 100 == 0) {
                logger.info(String.format("L-BFGS-B: the function value after %3d iterations: %.5f", iter, f));
            }

            // prepare for next iteration
            for (int i = 0; i < n; i++) {
                y[i] = g[i] - g_old[i];
                s[i] = x[i] - x_old[i];
            }

            // STEP 6
            double sy = dot(s, y);
            double yy = dot(y, y);
            double test = abs(sy);
            if (test > EPSILON * yy) {
                if (yHistory.size() >= m) {
                    yHistory.remove(0);
                    sHistory.remove(0);
                }

                yHistory.add(y);
                sHistory.add(s);

                int h = yHistory.size();
                if (iter <= m) {
                    Y = new Matrix(n, h);
                    S = new Matrix(n, h);
                    W = new Matrix(n, 2*h);
                    M = new Matrix(2*h, 2*h);
                }

                // STEP 7
                theta = yy / sy;
                for (int j = 0; j < h; j++) {
                    double[] yj = yHistory.get(j);
                    double[] sj = sHistory.get(j);
                    for (int i = 0; i < n; i++) {
                        Y.set(i, j, yj[i]);
                        S.set(i, j, sj[i]);
                        W.set(i, j, yj[i]);
                        W.set(i, h+j, sj[i] * theta);
                    }
                }

                Matrix SY = S.tm(Y);
                Matrix SS = S.ata();
                for (int j = 0; j < h; j++) {
                    M.set(j, j, -SY.get(j, j));
                    for (int i = 0; i <= j; i++) {
                        M.set(h+i, j, 0.0);
                        M.set(j, h+i, 0.0);
                    }

                    for (int i = j+1; i < h; i++) {
                        M.set(h+i, j, SY.get(i, j));
                        M.set(j, h+i, SY.get(i, j));
                    }

                    for (int i = 0; i < h; i++) {
                        M.set(h+i, h+j, theta * SS.get(i, j));
                    }
                }

                M.uplo(UPLO.LOWER);
                M = M.inverse();
            }

            logger.debug("L-BFGS-B iteration {} moves from {} to {} where f(x) = {}", iter, Strings.toString(x_old), Strings.toString(x), f);
            if (abs(f_old - f) < TOLF) {
                logger.info(String.format("L-BFGS-B converges on f(x) after %d iterations: %.5f", iter, f));
                return f;
            }
        }

        logger.warn(String.format("L-BFGS-B reaches maximum %d iterations: %.5f", maxIter, f));
        return f;
    }
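
    // A minimal usage sketch for the bound constrained variant (illustrative
    // only; the bounds, tolerance and iteration limit below are arbitrary, and
    // func is an objective as in the class-level example):
    //
    //   double[] l = {0.0, 0.0};     // lower bounds
    //   double[] u = {10.0, 10.0};   // upper bounds
    //   double[] x = {5.0, 5.0};     // initial guess, should lie within [l, u]
    //   double fmin = BFGS.minimize(func, 5, x, l, u, 1E-5, 500);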

    /**
     * Algorithm CP: Computation of the Generalized Cauchy Point. See page 8.
     * @param x the parameter vector
     * @param g the gradient vector
     */
    private static double[] cauchy(double[] x, double[] g, double[] cauchy, double[] l, double[] u, double theta, Matrix W, Matrix M) {
        int n = x.length;
        double[] t = new double[n];
        double[] d = new double[n];

        for (int i = 0; i < n; i++) {
            t[i] = g[i] == 0 ? Double.MAX_VALUE : (g[i] < 0 ? (x[i] - u[i]) / g[i] : (x[i] - l[i]) / g[i]);
            if (t[i] != 0.0) d[i] = -g[i];
        }

        int[] index = QuickSort.sort(t);
        double[] p = W.tv(d);
        double[] c = new double[p.length];
        double fPrime = -dot(d, d);
        double fDoublePrime = max(-theta * fPrime - M.xAx(p), EPSILON);
        double f_dp_orig = fDoublePrime;
        double dt_min = -fPrime / fDoublePrime;
        double t_old = 0.0;

        int i = 0;
        for (; i < n; i++) {
            if (t[index[i]] > 0) break;
        }

        double dt = t[i];

        for (; dt_min >= dt && i < n; i++) {
            int b = index[i];
            double tb = t[i];
            dt = tb - t_old;

            cauchy[b] = d[b] > 0 ? u[b] : (d[b] < 0 ? l[b] : cauchy[b]);
            double zb = cauchy[b] - x[b];
            for (int j = 0; j < c.length; j++) {
                c[j] += p[j] * dt;
            }

            double gb = g[b];
            double[] wbt = W.row(b);
            fPrime += dt * fDoublePrime + gb * gb + theta * gb * zb - gb * dot(wbt, M.mv(c));
            fDoublePrime -= theta * gb * gb + 2.0 * gb * dot(wbt, M.mv(p)) + gb * gb * M.xAx(wbt);
            fDoublePrime = max(fDoublePrime, EPSILON * f_dp_orig);
            for (int j = 0; j < p.length; j++) {
                p[j] += wbt[j] * gb;
            }
            d[b] = 0;
            dt_min = -fPrime / fDoublePrime;
            t_old = tb;
        }

        dt_min = max(dt_min, 0.0);
        t_old += dt_min;

        for (int ii = i; ii < n; ii++) {
            int si = index[ii];
            cauchy[si] = x[si] + t_old * d[si];
        }

        for (int j = 0; j < c.length; j++) {
            c[j] += p[j] * dt_min;
        }

        return c;
    }

    /**
     * Minimization of the subspace of free variables. See Section 5 on page 9.
     * @param x the parameter vector
     * @param g the gradient vector
     * @param cauchy the cauchy points
     * @param c vector obtained from gcp() used to initialize the subspace
     *          minimization process
     */
    private static double[] subspaceMinimization(double[] x, double[] g, double[] cauchy, double[] c, double[] l, double[] u, double theta, Matrix W, Matrix M) {
        int n = x.length;
        double thetaInverse = 1.0 / theta;

        ArrayList<Integer> freeVarIdx = new ArrayList<>();
        for (int i = 0; i < n; i++) {
            if (cauchy[i] != u[i] && cauchy[i] != l[i]) {
                freeVarIdx.add(i);
            }
        }

        if (freeVarIdx.isEmpty()) {
            return cauchy.clone();
        }

        int freeVarCount = freeVarIdx.size();
        int[] freeVar = new int[freeVarCount];
        for (int i = 0; i < freeVarCount; i++) {
            freeVar[i] = freeVarIdx.get(i);
        }

        double[] wmc = W.mv(M.mv(c));
        double[] r = new double[freeVarCount];
        for (int i = 0; i < freeVarCount; i++) {
            int fi = freeVar[i];
            r[i] = g[fi] + (cauchy[fi] - x[fi]) * theta - wmc[fi];
        }

        Matrix WZ = W.row(freeVar);
        double[] v = M.mv(WZ.tv(r));
        Matrix N = WZ.ata().mul(-thetaInverse);
        N = M.mm(N);
        int n1 = N.nrows();
        for (int i = 0; i < n1; i++) {
            N.add(i, i, 1.0);
        }

        Matrix.LU lu = N.lu();
        v = lu.solve(v);

        double[] wzv = WZ.mv(v);
        double[] du = new double[freeVarCount];
        for (int i = 0; i < freeVarCount; i++) {
            du[i] = -thetaInverse * (r[i] + wzv[i] * thetaInverse);
        }

        double alphaStar = findAlpha(cauchy, du, l, u, freeVar);

        double[] dStar = new double[freeVarCount];
        for (int i = 0; i < freeVarCount; i++) {
            dStar[i] = du[i] * alphaStar;
        }

        double[] subspaceMin = cauchy.clone();
        for (int i = 0; i < freeVarCount; i++) {
            subspaceMin[freeVar[i]] += dStar[i];
        }

        return subspaceMin;
    }

    /**
     * Find the alpha* parameter, a positive scalar. See Equation 5.8 on page 11.
     * @param cauchy the cauchy point.
     * @param du intermediate results used to find alpha*.
     * @param freeVar the indices of free variables.
     */
    private static double findAlpha(double[] cauchy, double[] du, double[] l, double[] u, int[] freeVar) {
        double alphaStar = 1.0;
        int n = freeVar.length;

        for (int i = 0; i < n; i++) {
            int fi = freeVar[i];
            alphaStar = du[i] > 0 ? min(alphaStar, (u[fi] - cauchy[fi]) / du[i]) : min(alphaStar, (l[fi] - cauchy[fi]) / du[i]);
        }

        return alphaStar;
    }

    /**
     * Returns the infinity norm of gradient.
     * @param x the parameter vector
     * @param g the gradient vector
     */
    private static double gnorm(double[] x, double[] g, double[] l, double[] u) {
        double norm = 0.0;
        int n = x.length;

        for (int i = 0; i < n; i++) {
            double gi = g[i];
            if (gi < 0) gi = max(x[i] - u[i], gi);
            else gi = min(x[i] - l[i], gi);
            norm = max(norm, abs(gi));
        }

        return norm;
    }

    /**
     * Clamps the values of v to stay within the pre-defined bounds.
     * @param v the vector to be adjusted.
     */
    private static void clampToBound(double[] v, double[] l, double[] u) {
        int n = v.length;
        for (int i = 0; i < n; i++) {
            if (v[i] > u[i]) v[i] = u[i];
            else if (v[i] < l[i]) v[i] = l[i];
        }
    }
}



