/*******************************************************************************
* Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
******************************************************************************/
package smile.math;
import java.util.ArrayList;
import java.util.Arrays;
import smile.math.blas.UPLO;
import smile.math.matrix.Matrix;
import smile.sort.QuickSort;
import smile.util.Strings;
import static java.lang.Math.abs;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.sqrt;
import static smile.math.MathEx.axpy;
import static smile.math.MathEx.dot;
import static smile.math.MathEx.norm;
/**
* The Broyden–Fletcher–Goldfarb–Shanno (BFGS) algorithm is an iterative
* method for solving unconstrained nonlinear optimization problems.
*
* The BFGS method belongs to quasi-Newton methods, a class of hill-climbing
* optimization techniques that seek a stationary point of a (preferably
* twice continuously differentiable) function. For such problems,
* a necessary condition for optimality is that the gradient be zero.
* Newton's method and the BFGS methods are not guaranteed to converge
* unless the function has a quadratic Taylor expansion near an optimum.
* However, BFGS has proven to have good performance even for non-smooth
* optimizations.
*
* In quasi-Newton methods, the Hessian matrix of second derivatives is
* not computed. Instead, the Hessian matrix is approximated using
* updates specified by gradient evaluations (or approximate gradient
* evaluations). Quasi-Newton methods are generalizations of the secant
* method to find the root of the first derivative for multidimensional
* problems. In multi-dimensional problems, the secant equation does not
* specify a unique solution, and quasi-Newton methods differ in how they
* constrain the solution. The BFGS method is one of the most popular
* members of this class.
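*
* For reference, the standard BFGS update of the inverse Hessian
* approximation H (stated here for orientation; the code below uses an
* algebraically equivalent form) is
* <pre>
*     H_{k+1} = (I - ρ_k s_k y_k') H_k (I - ρ_k y_k s_k') + ρ_k s_k s_k',
* </pre>
* where s_k = x_{k+1} - x_k, y_k = ∇f(x_{k+1}) - ∇f(x_k), and
* ρ_k = 1 / (y_k' s_k).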
*
* Like the original BFGS, the limited-memory BFGS (L-BFGS) uses an
* estimation to the inverse Hessian matrix to steer its search
* through variable space, but where BFGS stores a dense n × n
* approximation to the inverse Hessian (n being the number of
* variables in the problem), L-BFGS stores only a few vectors
* that represent the approximation implicitly. Due to its resulting
* linear memory requirement, the L-BFGS method is particularly well
* suited for optimization problems with a large number of variables
* (e.g., > 1000). Instead of the inverse Hessian H_k, L-BFGS
* maintains a history of the past m updates of the position x and
* gradient ∇f(x), where generally the history size m can be small
* (often m < 10). These updates are used to implicitly do
* operations requiring the H_k-vector product.
*
* <h2>References</h2>
* <ol>
* <li>Roger Fletcher. Practical methods of optimization.</li>
* <li>D. Liu and J. Nocedal. On the limited memory BFGS method for large scale optimization. Mathematical Programming B 45 (1989) 503-528.</li>
* <li>Richard H. Byrd, Peihuang Lu, Jorge Nocedal and Ciyou Zhu. A limited memory algorithm for bound constrained optimization.</li>
* </ol>
*
* @author Haifeng Li
*/
public class BFGS {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BFGS.class);
/** A number close to zero, between machine epsilon and its square root. */
private static final double EPSILON = Double.parseDouble(System.getProperty("smile.bfgs.epsilon", "1E-10"));
/** The convergence criterion on x values. */
private static final double TOLX = 4 * EPSILON;
/** The convergence criterion on function value. */
private static final double TOLF = 4 * EPSILON;
/** The scaled maximum step length allowed in line searches. */
private static final double STPMX = 100.0;
/**
* This method solves the unconstrained minimization problem
*
* <pre>
*     min f(x), x = (x_1, x_2, ..., x_n),
* </pre>
* using the BFGS method.
*
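* <p>
* A minimal usage sketch. The Rosenbrock function below is purely
* illustrative (not part of this class); it assumes the
* DifferentiableMultivariateFunction contract used throughout this
* class: apply(x) returns f(x), and g(x, gradient) fills the gradient
* and returns f(x).
* <pre>{@code
* DifferentiableMultivariateFunction rosenbrock = new DifferentiableMultivariateFunction() {
*     public double apply(double[] x) {
*         return (1 - x[0]) * (1 - x[0]) + 100 * (x[1] - x[0] * x[0]) * (x[1] - x[0] * x[0]);
*     }
*
*     public double g(double[] x, double[] gradient) {
*         gradient[0] = -2 * (1 - x[0]) - 400 * x[0] * (x[1] - x[0] * x[0]);
*         gradient[1] = 200 * (x[1] - x[0] * x[0]);
*         return apply(x);
*     }
* };
*
* double[] x = {-1.2, 1.0};  // standard Rosenbrock starting point
* double fmin = BFGS.minimize(rosenbrock, x, 1E-5, 500);  // fmin ≈ 0 at x ≈ (1, 1)
* }</pre>
*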
* @param func the function to be minimized.
*
* @param x on initial entry this must be set by the user to the values
* of the initial estimate of the solution vector. On exit, it
* contains the values of the variables at the best point found
* (usually a solution).
*
* @param gtol the convergence tolerance on zeroing the gradient.
* @param maxIter the maximum number of iterations.
*
* @return the minimum value of the function.
*/
public static double minimize(DifferentiableMultivariateFunction func, double[] x, double gtol, int maxIter) {
if (gtol <= 0.0) {
throw new IllegalArgumentException("Invalid gradient tolerance: " + gtol);
}
if (maxIter <= 0) {
throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
}
double den, fac, fad, fae, sumdg, sumxi, temp, test;
int n = x.length;
double[] dg = new double[n];
double[] g = new double[n];
double[] hdg = new double[n];
double[] xnew = new double[n];
double[] xi = new double[n];
double[][] hessin = new double[n][n];
// Calculate starting function value and gradient and initialize the
// inverse Hessian to the unit matrix.
double f = func.g(x, g);
logger.info(String.format("BFGS: initial function value: %.5f", f));
for (int i = 0; i < n; i++) {
hessin[i][i] = 1.0;
// Initialize line direction.
xi[i] = -g[i];
}
double stpmax = STPMX * max(norm(x), n);
for (int iter = 1; iter <= maxIter; iter++) {
// The new function evaluation occurs in line search.
f = linesearch(func, x, f, g, xi, xnew, stpmax);
if (iter % 100 == 0) {
logger.info(String.format("BFGS: the function value after %3d iterations: %.5f", iter, f));
}
// update the line direction and current point.
for (int i = 0; i < n; i++) {
xi[i] = xnew[i] - x[i];
x[i] = xnew[i];
}
// Test for convergence on x.
test = 0.0;
for (int i = 0; i < n; i++) {
temp = abs(xi[i]) / max(abs(x[i]), 1.0);
if (temp > test) {
test = temp;
}
}
if (test < TOLX) {
logger.info(String.format("BFGS converges on x after %d iterations: %.5f", iter, f));
return f;
}
System.arraycopy(g, 0, dg, 0, n);
func.g(x, g);
// Test for convergence on zero gradient.
den = max(f, 1.0);
test = 0.0;
for (int i = 0; i < n; i++) {
temp = abs(g[i]) * max(abs(x[i]), 1.0) / den;
if (temp > test) {
test = temp;
}
}
if (test < gtol) {
logger.info(String.format("BFGS converges on gradient after %d iterations: %.5f", iter, f));
return f;
}
for (int i = 0; i < n; i++) {
dg[i] = g[i] - dg[i];
}
for (int i = 0; i < n; i++) {
hdg[i] = 0.0;
for (int j = 0; j < n; j++) {
hdg[i] += hessin[i][j] * dg[j];
}
}
fac = fae = sumdg = sumxi = 0.0;
for (int i = 0; i < n; i++) {
fac += dg[i] * xi[i];
fae += dg[i] * hdg[i];
sumdg += dg[i] * dg[i];
sumxi += xi[i] * xi[i];
}
// Skip update if fac is not sufficiently positive.
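// (fac = y's > 0 is the curvature condition that keeps the updated
// inverse Hessian approximation positive definite.)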
if (fac > sqrt(EPSILON * sumdg * sumxi)) {
fac = 1.0 / fac;
fad = 1.0 / fae;
// The vector that makes BFGS different from DFP.
for (int i = 0; i < n; i++) {
dg[i] = fac * xi[i] - fad * hdg[i];
}
// BFGS updating formula.
for (int i = 0; i < n; i++) {
for (int j = i; j < n; j++) {
hessin[i][j] += fac * xi[i] * xi[j] - fad * hdg[i] * hdg[j] + fae * dg[i] * dg[j];
hessin[j][i] = hessin[i][j];
}
}
}
// Calculate the next direction to go.
Arrays.fill(xi, 0.0);
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
xi[i] -= hessin[i][j] * g[j];
}
}
}
logger.warn(String.format("BFGS reaches maximum %d iterations: %.5f", maxIter, f));
return f;
}
/**
* This method solves the unconstrained minimization problem
*
* <pre>
*     min f(x), x = (x_1, x_2, ..., x_n),
* </pre>
* using the limited-memory BFGS method. The method is especially
* effective on problems involving a large number of variables. In
* a typical iteration of this method an approximation H_k to the
* inverse of the Hessian is obtained by applying m BFGS updates to
* a diagonal matrix H_k^0, using information from the previous m steps.
* The user specifies the number m, which determines the amount of
* storage required by the routine.
*
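* <p>
* A minimal usage sketch, reusing the hypothetical rosenbrock function
* from the plain BFGS example above with m = 5 corrections:
* <pre>{@code
* double[] x = {-1.2, 1.0};
* double fmin = BFGS.minimize(rosenbrock, 5, x, 1E-5, 500);
* }</pre>
*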
* @param func the function to be minimized.
*
* @param m the number of corrections used in the L-BFGS update.
* Values of m less than 3 are not recommended; large values
* of m will result in excessive computing time. 3 <= m <= 7
* is recommended. A common choice is m = 5.
*
* @param x on initial entry this must be set by the user to the values
* of the initial estimate of the solution vector. On exit, it
* contains the values of the variables at the best point found
* (usually a solution).
*
* @param gtol the convergence tolerance on zeroing the gradient.
* @param maxIter the maximum number of iterations.
*
* @return the minimum value of the function.
*/
public static double minimize(DifferentiableMultivariateFunction func, int m, double[] x, double gtol, int maxIter) {
if (gtol <= 0.0) {
throw new IllegalArgumentException("Invalid gradient tolerance: " + gtol);
}
if (maxIter <= 0) {
throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
}
if (m <= 0) {
throw new IllegalArgumentException("Invalid m: " + m);
}
int n = x.length;
// The solution vector of line search.
double[] xnew = new double[n];
// The gradient vector of line search.
double[] gnew = new double[n];
// Line search direction.
double[] xi = new double[n];
// difference of x from previous step.
double[][] s = new double[m][n];
// difference of g from previous step.
double[][] y = new double[m][n];
// buffer for 1 / (y' * s)
double[] rho = new double[m];
double[] a = new double[m];
// Diagonal of initial H0.
double diag = 1.0;
// Current gradient.
double[] g = new double[n];
// Current function value.
double f = func.g(x, g);
logger.info(String.format("L-BFGS: initial function value: %.5f", f));
// Initial line search direction.
for (int i = 0; i < n; i++) {
xi[i] = -g[i];
}
// Upper limit for line search step.
double stpmax = STPMX * max(norm(x), n);
for (int iter = 1, k = 0; iter <= maxIter; iter++) {
linesearch(func, x, f, g, xi, xnew, stpmax);
f = func.g(xnew, gnew);
for (int i = 0; i < n; i++) {
s[k][i] = xnew[i] - x[i];
y[k][i] = gnew[i] - g[i];
x[i] = xnew[i];
g[i] = gnew[i];
}
// Test for convergence on x.
double test = 0.0;
for (int i = 0; i < n; i++) {
double temp = abs(s[k][i]) / max(abs(x[i]), 1.0);
if (temp > test) {
test = temp;
}
}
if (test < TOLX) {
logger.info(String.format("L-BFGS converges on x after %d iterations: %.5f", iter, f));
return f;
}
// Test for convergence on zero gradient.
test = 0.0;
double den = max(f, 1.0);
for (int i = 0; i < n; i++) {
double temp = abs(g[i]) * max(abs(x[i]), 1.0) / den;
if (temp > test) {
test = temp;
}
}
if (test < gtol) {
logger.info(String.format("L-BFGS converges on gradient after %d iterations: %.5f", iter, f));
return f;
}
if (iter % 100 == 0) {
logger.info(String.format("L-BFGS: the function value after %3d iterations: %.5f", iter, f));
}
double ys = dot(y[k], s[k]);
double yy = dot(y[k], y[k]);
diag = ys / yy;
rho[k] = 1.0 / ys;
for (int i = 0; i < n; i++) {
xi[i] = -g[i];
}
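// Two-loop recursion: implicitly apply the inverse Hessian
// approximation to -g using the most recent (s, y) pairs.
// The backward loop computes a_i = rho_i * s_i'q and subtracts
// a_i * y_i from the direction; after scaling by the initial
// Hessian approximation H0 = diag * I, the forward loop adds
// (a_i - b_i) * s_i, where b_i = rho_i * y_i'r.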
int cp = k;
int bound = iter > m ? m : iter;
for (int i = 0; i < bound; i++) {
a[cp] = rho[cp] * dot(s[cp], xi);
axpy(-a[cp], y[cp], xi);
if (--cp == -1) {
cp = m - 1;
}
}
for (int i = 0; i < n; i++) {
xi[i] *= diag;
}
for (int i = 0; i < bound; i++) {
if (++cp == m) {
cp = 0;
}
double b = rho[cp] * dot(y[cp], xi);
axpy(a[cp]-b, s[cp], xi);
}
if (++k == m) {
k = 0;
}
}
logger.warn(String.format("L-BFGS reaches maximum %d iterations: %.5f", maxIter, f));
return f;
}
/**
* Minimize a function along a search direction by finding a step which satisfies
* a sufficient decrease condition and a curvature condition.
*
* At each stage this function updates an interval of uncertainty with
* endpoints stx and sty. The interval of uncertainty is initially
* chosen so that it contains a minimizer of the modified function
* <pre>
*     f(x + stp*s) - f(x) - ftol*stp*(gradf(x)'s).
* </pre>
* If a step is obtained for which the modified function
* has a nonpositive function value and nonnegative derivative,
* then the interval of uncertainty is chosen so that it
* contains a minimizer of f(x + stp*s).
*
* The algorithm is designed to find a step which satisfies
* the sufficient decrease condition
* <pre>
*     f(x + stp*s) <= f(x) + ftol*stp*(gradf(x)'s),
* </pre>
* and the curvature condition
* <pre>
*     abs(gradf(x + stp*s)'s) <= gtol*abs(gradf(x)'s).
* </pre>
* If ftol is less than gtol and if, for example, the function
* is bounded below, then there is always a step which satisfies
* both conditions. If no step can be found which satisfies both
* conditions, then the algorithm usually stops when rounding
* errors prevent further progress. In this case stp only
* satisfies the sufficient decrease condition.
*
* @param xold on input this contains the base point for the line search.
*
* @param fold on input this contains the value of the objective function
* at xold.
*
* @param g on input this contains the gradient of the objective function
* at xold.
*
* @param p the search direction.
*
* @param x on output, it contains xold + γ*p, where γ > 0
* is the step length.
*
* @param stpmax specify upper bound for the step in the line search so that
* we do not try to evaluate the function in regions where it is
* undefined or subject to overflow.
*
* @return the new function value.
*/
private static double linesearch(MultivariateFunction func, double[] xold, double fold, double[] g, double[] p, double[] x, double stpmax) {
if (stpmax <= 0) {
throw new IllegalArgumentException("Invalid upper bound of linear search step: " + stpmax);
}
// Termination occurs when the relative width of the interval
// of uncertainty is at most xtol.
final double xtol = EPSILON;
// Tolerance for the sufficient decrease condition.
final double ftol = 1.0E-4;
int n = xold.length;
// Scale if attempted step is too big
double pnorm = norm(p);
if (pnorm > stpmax) {
double r = stpmax / pnorm;
for (int i = 0; i < n; i++) {
p[i] *= r;
}
}
// Check if p is a descent direction.
double slope = 0.0;
for (int i = 0; i < n; i++) {
slope += g[i] * p[i];
}
if (slope >= 0) {
logger.warn("Line Search: the search direction is not a descent direction, which may be caused by roundoff errors.");
}
// Calculate minimum step.
double test = 0.0;
for (int i = 0; i < n; i++) {
double temp = abs(p[i]) / max(abs(xold[i]), 1.0);
if (temp > test) {
test = temp;
}
}
double alammin = xtol / test;
double alam = 1.0;
double alam2 = 0.0, f2 = 0.0;
double a, b, disc, rhs1, rhs2, tmpalam;
while (true) {
// Evaluate the function and gradient at stp
// and compute the directional derivative.
for (int i = 0; i < n; i++) {
x[i] = xold[i] + alam * p[i];
}
double f = func.apply(x);
// Convergence on Δ x.
if (alam < alammin) {
System.arraycopy(xold, 0, x, 0, n);
return f;
} else if (f <= fold + ftol * alam * slope) {
// Sufficient function decrease.
return f;
} else {
// Backtrack
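// Backtracking model: fit a quadratic in alam on the first
// backtrack and a cubic on later ones, then step to the model
// minimizer, clamped below to 0.1*alam and above to 0.5*alam.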
if (alam == 1.0) {
// First time
tmpalam = -slope / (2.0 * (f - fold - slope));
} else {
// Subsequent backtracks.
rhs1 = f - fold - alam * slope;
rhs2 = f2 - fold - alam2 * slope;
a = (rhs1 / (alam * alam) - rhs2 / (alam2 * alam2)) / (alam - alam2);
b = (-alam2 * rhs1 / (alam * alam) + alam * rhs2 / (alam2 * alam2)) / (alam - alam2);
if (a == 0.0) {
tmpalam = -slope / (2.0 * b);
} else {
disc = b * b - 3.0 * a * slope;
if (disc < 0.0) {
tmpalam = 0.5 * alam;
} else if (b <= 0.0) {
tmpalam = (-b + sqrt(disc)) / (3.0 * a);
} else {
tmpalam = -slope / (b + sqrt(disc));
}
}
if (tmpalam > 0.5 * alam) {
tmpalam = 0.5 * alam;
}
}
}
alam2 = alam;
f2 = f;
alam = max(tmpalam, 0.1 * alam);
}
}
/**
* This method solves the bound constrained minimization problem
* using the L-BFGS-B method. The L-BFGS-B algorithm extends L-BFGS
* to handle simple box constraints on variables; that is, constraints
* of the form l_i ≤ x_i ≤ u_i, where l_i and u_i are per-variable
* constant lower and upper bounds, respectively (for each x_i, either
* or both bounds may be omitted). The method works by identifying fixed and
* free variables at every step (using a simple gradient method),
* and then using the L-BFGS method on the free variables only to
* get higher accuracy, and then repeating the process.
*
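* <p>
* A minimal usage sketch, reusing the hypothetical rosenbrock function
* from the plain BFGS example and constraining both variables to [0, 2]:
* <pre>{@code
* double[] x = {0.5, 0.5};
* double[] l = {0.0, 0.0};
* double[] u = {2.0, 2.0};
* double fmin = BFGS.minimize(rosenbrock, 5, x, l, u, 1E-5, 500);
* }</pre>
*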
* @param func the function to be minimized.
*
* @param m the number of corrections used in the L-BFGS update.
* Values of m less than 3 are not recommended; large values
* of m will result in excessive computing time. 3 <= m <= 7
* is recommended. A common choice is m = 5.
*
* @param x on initial entry this must be set by the user to the values
* of the initial estimate of the solution vector. On exit, it
* contains the values of the variables at the best point found
* (usually a solution).
*
* @param l the lower bound.
* @param u the upper bound.
* @param gtol the convergence tolerance on zeroing the gradient.
* @param maxIter the maximum number of iterations.
*
* @return the minimum value of the function.
*/
public static double minimize(DifferentiableMultivariateFunction func, int m, double[] x, double[] l, double[] u, double gtol, int maxIter) {
if (gtol <= 0.0) {
throw new IllegalArgumentException("Invalid gradient tolerance: " + gtol);
}
if (maxIter <= 0) {
throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
}
if (m <= 0) {
throw new IllegalArgumentException("Invalid m: " + m);
}
if (l.length != x.length) {
throw new IllegalArgumentException("Invalid lower bound size: " + l.length);
}
if (u.length != x.length) {
throw new IllegalArgumentException("Invalid upper bound size: " + u.length);
}
int n = x.length;
double theta = 1.0;
Matrix Y = null, S = null;
Matrix W = new Matrix(n, 1);
Matrix M = new Matrix(1, 1);
ArrayList<double[]> yHistory = new ArrayList<>();
ArrayList<double[]> sHistory = new ArrayList<>();
double[] y = new double[n];
double[] s = new double[n];
double[] p = new double[n];
double[] g = new double[n];
double[] cauchy = new double[n];
double f = func.g(x, g);
double[] x_old = new double[n];
double[] g_old = new double[n];
// Upper limit for line search step.
double stpmax = STPMX * max(norm(x), n);
for (int iter = 1; iter <= maxIter; iter++) {
double f_old = f;
System.arraycopy(x, 0, x_old, 0, n);
System.arraycopy(g, 0, g_old, 0, n);
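// The STEP numbering below follows the L-BFGS-B algorithm of
// Byrd, Lu, Nocedal and Zhu (see the class references).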
// STEP 2: compute the generalized Cauchy point
System.arraycopy(x, 0, cauchy, 0, n);
double[] c = cauchy(x, g, cauchy, l, u, theta, W, M);
clampToBound(cauchy, l, u);
// STEP 3: compute a search direction d_k by the primal method for the sub-problem
double[] subspaceMin = subspaceMinimization(x, g, cauchy, c, l, u, theta, W, M);
clampToBound(subspaceMin, l, u);
// STEP 4: perform line search
for (int i = 0; i < n; i++) {
p[i] = subspaceMin[i] - x[i];
}
linesearch(func, x_old, f_old, g, p, x, stpmax);
clampToBound(x, l, u);
for (double xi : x) {
if (Double.isNaN(xi) || Double.isInfinite(xi)) {
logger.warn("L-BFGS-B: bad x produced by line search, return previous good x");
System.arraycopy(x_old, 0, x, 0, n);
return f_old;
}
}
// STEP 5: compute the gradient and function value at the updated point
f = func.g(x, g);
if (Double.isNaN(f) || Double.isInfinite(f)) {
logger.warn("L-BFGS-B: bad f(x) produced by line search, return previous good x");
System.arraycopy(x_old, 0, x, 0, n);
return f_old;
}
if (gnorm(x, g, l, u) < gtol) {
logger.info(String.format("L-BFGS-B converges on gradient after %d iterations: %.5f", iter, f));
return f;
}
if (iter % 100 == 0) {
logger.info(String.format("L-BFGS-B: the function value after %3d iterations: %.5f", iter, f));
}
// prepare for next iteration
for (int i = 0; i < n; i++) {
y[i] = g[i] - g_old[i];
s[i] = x[i] - x_old[i];
}
// STEP 6
double sy = dot(s, y);
double yy = dot(y, y);
double test = abs(sy);
if (test > EPSILON * yy) {
if (yHistory.size() >= m) {
yHistory.remove(0);
sHistory.remove(0);
}
// Clone because the y and s buffers are reused on the next iteration.
yHistory.add(y.clone());
sHistory.add(s.clone());
int h = yHistory.size();
if (iter <= m) {
Y = new Matrix(n, h);
S = new Matrix(n, h);
W = new Matrix(n, 2*h);
M = new Matrix(2*h, 2*h);
}
// STEP 7
theta = yy / sy;
for (int j = 0; j < h; j++) {
double[] yj = yHistory.get(j);
double[] sj = sHistory.get(j);
for (int i = 0; i < n; i++) {
Y.set(i, j, yj[i]);
S.set(i, j, sj[i]);
W.set(i, j, yj[i]);
W.set(i, h+j, sj[i] * theta);
}
}
Matrix SY = S.tm(Y);
Matrix SS = S.ata();
for (int j = 0; j < h; j++) {
M.set(j, j, -SY.get(j, j));
for (int i = 0; i <= j; i++) {
M.set(h+i, j, 0.0);
M.set(j, h+i, 0.0);
}
for (int i = j+1; i < h; i++) {
M.set(h+i, j, SY.get(i, j));
M.set(j, h+i, SY.get(i, j));
}
for (int i = 0; i < h; i++) {
M.set(h+i, h+j, theta * SS.get(i, j));
}
}
M.uplo(UPLO.LOWER);
M = M.inverse();
}
logger.debug("L-BFGS-B iteration {} moves from {} to {} where f(x) = {}", iter, Strings.toString(x_old), Strings.toString(x), f);
if (abs(f_old - f) < TOLF) {
logger.info(String.format("L-BFGS-B converges on f(x) after %d iterations: %.5f", iter, f));
return f;
}
}
logger.warn(String.format("L-BFGS-B reaches maximum %d iterations: %.5f", maxIter, f));
return f;
}
/**
* Algorithm CP: Computation of the Generalized Cauchy Point. See page 8
* of the Byrd, Lu, Nocedal and Zhu reference.
* @param x the parameter vector
* @param g the gradient vector
*/
private static double[] cauchy(double[] x, double[] g, double[] cauchy, double[] l, double[] u, double theta, Matrix W, Matrix M) {
int n = x.length;
double[] t = new double[n];
double[] d = new double[n];
for (int i = 0; i < n; i++) {
t[i] = g[i] == 0 ? Double.MAX_VALUE
: (g[i] < 0 ? (x[i] - u[i]) / g[i]
: (x[i] - l[i]) / g[i]);
if (t[i] != 0.0) d[i] = -g[i];
}
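// Sort the breakpoints in ascending order; index[i] is the original
// position (variable) of the i-th smallest breakpoint.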
int[] index = QuickSort.sort(t);
double[] p = W.tv(d);
double[] c = new double[p.length];
double fPrime = -dot(d, d);
double fDoublePrime = max(-theta * fPrime - M.xAx(p), EPSILON);
double f_dp_orig = fDoublePrime;
double dt_min = -fPrime / fDoublePrime;
double t_old = 0.0;
int i = 0;
for (; i < n; i++) {
if (t[index[i]] > 0) break;
}
double dt = t[i];
for (; dt_min >= dt && i < n; i++) {
int b = index[i];
double tb = t[i];
dt = tb - t_old;
cauchy[b] = d[b] > 0 ? u[b] :
(d[b] < 0 ? l[b] : cauchy[b]);
double zb = cauchy[b] - x[b];
for (int j = 0; j < c.length; j++) {
c[j] += p[j] * dt;
}
double gb = g[b];
double[] wbt = W.row(b);
fPrime += dt * fDoublePrime + gb * gb + theta * gb * zb - gb * dot(wbt, M.mv(c));
fDoublePrime -= theta * gb * gb + 2.0 * gb * dot(wbt, M.mv(p)) + gb * gb * M.xAx(wbt);
fDoublePrime = max(fDoublePrime, EPSILON * f_dp_orig);
for (int j = 0; j < p.length; j++) {
p[j] += wbt[j] * gb;
}
d[b] = 0;
dt_min = -fPrime / fDoublePrime;
t_old = tb;
}
dt_min = max(dt_min, 0.0);
t_old += dt_min;
for (int ii = i; ii < n; ii++) {
int si = index[ii];
cauchy[si] = x[si] + t_old * d[si];
}
for (int j = 0; j < c.length; j++) {
c[j] += p[j] * dt_min;
}
return c;
}
/**
* Minimization of the subspace of free variables. See Section 5 on page 9
* of the Byrd, Lu, Nocedal and Zhu reference.
* @param x the parameter vector
* @param g the gradient vector
* @param cauchy the Cauchy point
* @param c the vector obtained from cauchy() used to initialize the
* subspace minimization process
*/
private static double[] subspaceMinimization(double[] x, double[] g, double[] cauchy, double[] c, double[] l, double[] u, double theta, Matrix W, Matrix M) {
int n = x.length;
double thetaInverse = 1.0 / theta;
ArrayList<Integer> freeVarIdx = new ArrayList<>();
for (int i = 0; i < n; i++) {
if (cauchy[i] != u[i] && cauchy[i] != l[i]) {
freeVarIdx.add(i);
}
}
if (freeVarIdx.isEmpty()) {
return cauchy.clone();
}
int freeVarCount = freeVarIdx.size();
int[] freeVar = new int[freeVarCount];
for (int i = 0; i < freeVarCount; i++) {
freeVar[i] = freeVarIdx.get(i);
}
double[] wmc = W.mv(M.mv(c));
double[] r = new double[freeVarCount];
for (int i = 0; i < freeVarCount; i++) {
int fi = freeVar[i];
r[i] = g[fi] + (cauchy[fi] - x[fi]) * theta - wmc[fi];
}
Matrix WZ = W.row(freeVar);
double[] v = M.mv(WZ.tv(r));
Matrix N = WZ.ata().mul(-thetaInverse);
N = M.mm(N);
int n1 = N.nrows();
for (int i = 0; i < n1; i++) {
N.add(i, i, 1.0);
}
Matrix.LU lu = N.lu();
v = lu.solve(v);
double[] wzv = WZ.mv(v);
double[] du = new double[freeVarCount];
for (int i = 0; i < freeVarCount; i++) {
du[i] = -thetaInverse * (r[i] + wzv[i] * thetaInverse);
}
double alphaStar = findAlpha(cauchy, du, l, u, freeVar);
double[] dStar = new double[freeVarCount];
for (int i = 0; i < freeVarCount; i++) {
dStar[i] = du[i] * alphaStar;
}
double[] subspaceMin = cauchy.clone();
for (int i = 0; i < freeVarCount; i++) {
subspaceMin[freeVar[i]] += dStar[i];
}
return subspaceMin;
}
/**
* Find the alpha* parameter, a positive scalar. See Equation 5.8 on
* page 11 of the Byrd, Lu, Nocedal and Zhu reference.
* @param cauchy the Cauchy point
* @param du intermediate results used to find alpha*
* @param freeVar the indices of free variables
*/
private static double findAlpha(double[] cauchy, double[] du, double[] l, double[] u, int[] freeVar) {
double alphaStar = 1.0;
int n = freeVar.length;
for (int i = 0; i < n; i++) {
int fi = freeVar[i];
alphaStar = du[i] > 0 ? min(alphaStar, (u[fi] - cauchy[fi]) / du[i])
: min(alphaStar, (l[fi] - cauchy[fi]) / du[i]);
}
return alphaStar;
}
/**
* Returns the infinity norm of the projected gradient.
* @param x the parameter vector
* @param g the gradient vector
*/
private static double gnorm(double[] x, double[] g, double[] l, double[] u) {
double norm = 0.0;
int n = x.length;
for (int i = 0; i < n; i++) {
double gi = g[i];
if (gi < 0) gi = max(x[i] - u[i], gi);
else gi = min(x[i] - l[i], gi);
norm = max(norm, abs(gi));
}
return norm;
}
/**
* Clamps the values of v to stay within the pre-defined bounds.
* @param v the vector to be adjusted
*/
private static void clampToBound(double[] v, double[] l, double[] u) {
int n = v.length;
for (int i = 0; i < n; i++) {
if (v[i] > u[i]) v[i] = u[i];
else if (v[i] < l[i]) v[i] = l[i];
}
}
}