
smile.regression.RidgeRegression

/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 */

package smile.regression;

import java.util.Arrays;
import java.util.Properties;
import smile.data.DataFrame;
import smile.data.formula.Formula;
import smile.data.type.StructType;
import smile.math.MathEx;
import smile.math.blas.UPLO;
import smile.math.matrix.Matrix;

/**
 * Ridge Regression. Coefficient estimates for multiple linear regression
 * models rely on the independence of the model terms. When terms are
 * correlated and the columns of the design matrix X have an approximate
 * linear dependence, the matrix X'X becomes close to singular.
 * As a result, the least-squares estimate becomes highly sensitive to random
 * errors in the observed response Y, producing a large variance.
 * <p>
 * Ridge regression is one method to address these issues. In ridge regression,
 * the matrix X'X is perturbed to make its determinant appreciably
 * different from 0.
 * <p>
 * Ridge regression is a kind of Tikhonov regularization, which is the most
 * commonly used method of regularization of ill-posed problems. Ridge
 * regression shrinks the regression coefficients by imposing a penalty on
 * their size. By allowing a small amount of bias in the estimates, more
 * reasonable coefficients may often be obtained. Often, small amounts of bias
 * lead to dramatic reductions in the variance of the estimated model
 * coefficients.
 * <p>
 * Another interpretation of ridge regression is available through Bayesian
 * estimation. In this setting, the belief that the weights should be small is
 * coded into a prior distribution.
 * <p>
 * The penalty term is unfair if the predictor variables are not on the
 * same scale. Therefore, if we know that the variables are not measured
 * in the same units, we typically scale the columns of X (to have sample
 * variance 1), and then we perform ridge regression.
 * <p>
 * When including an intercept term in the regression, we usually leave
 * this coefficient unpenalized. Otherwise, we could add some constant
 * amount to the vector y, and this would not result in the same solution.
 * If we center the columns of X, then the intercept estimate ends up
 * just being the mean of y.
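 * <p>
 * Putting this together, with centered and scaled predictors and a single
 * shrinkage parameter lambda, the criterion being minimized is
 * <pre>{@code
 *     (y - X*beta)' * (y - X*beta) + lambda * beta'*beta
 * }</pre>
 * whose solution is beta = inverse(X'X + lambda*I) * X'y. Adding lambda to
 * the diagonal of X'X is exactly the perturbation that keeps the matrix
 * well-conditioned. (This is the special case W = I and beta0 = 0 of the
 * generalized criterion used by the weighted fit method below.)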

 * <p>
 * Ridge regression does not set coefficients exactly to zero unless
 * λ = ∞, in which case they're all zero. Hence, ridge regression
 * cannot perform variable selection, and even though it performs well in
 * terms of prediction accuracy, it does poorly in terms of offering a clear
 * interpretation.
 *
 * @author Haifeng Li
 */
public class RidgeRegression {
    /**
     * Fits a ridge regression model.
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data) {
        return fit(formula, data, new Properties());
    }
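
    // Note: with an empty Properties object, smile.ridge.lambda defaults to "1"
    // in the overload below. A minimal usage sketch (the response column name
    // "y" is hypothetical, not part of this file):
    //
    //   LinearModel model = RidgeRegression.fit(Formula.lhs("y"), data);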

    /**
     * Fits a ridge regression model. The hyperparameters in <code>params</code> include
     * <ul>
     * <li><code>smile.ridge.lambda</code> is the shrinkage/regularization parameter.
     *     Large lambda means more shrinkage. Choosing an appropriate value of
     *     lambda is important, and also difficult.</li>
     * <li><code>smile.ridge.standard.error</code> is a boolean. If true, compute the
     *     estimated standard errors of the parameter estimates.</li>
     * </ul>
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @param params the hyperparameters.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.ridge.lambda", "1"));
        return fit(formula, data, lambda);
    }

    /**
     * Fits a ridge regression model.
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @param lambda the shrinkage/regularization parameter. Large lambda means
     *               more shrinkage. Choosing an appropriate value of lambda is
     *               important, and also difficult.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data, double lambda) {
        int n = data.size();
        double[] weights = new double[n];
        Arrays.fill(weights, 1.0);
        return fit(formula, data, weights, new double[]{lambda}, new double[]{0.0});
    }

    /**
     * Fits a generalized ridge regression model that minimizes a
     * weighted least squares criterion augmented with a
     * generalized ridge penalty:
     * <pre>{@code
     *     (Y - X'*beta)' * W * (Y - X'*beta) + (beta - beta0)' * lambda * (beta - beta0)
     * }</pre>
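     * where W is the diagonal matrix of sample weights and lambda acts as a
     * diagonal penalty matrix. Setting the gradient of this criterion to zero
     * gives the estimate
     * <pre>{@code
     *     beta = inverse(X'*W*X + diag(lambda)) * (X'*W*Y + diag(lambda)*beta0)
     * }</pre>
     * which is the linear system solved below, on centered and scaled columns,
     * by a Cholesky factorization.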
     *
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @param weights sample weights.
     * @param lambda the shrinkage/regularization parameter. Large lambda
     *               means more shrinkage. Choosing an appropriate value of
     *               lambda is important, and also difficult. Its length may
     *               be 1 so that its value is applied to all variables.
     * @param beta0 generalized ridge penalty target. Its length may
     *              be 1 so that its value is applied to all variables.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data, double[] weights, double[] lambda, double[] beta0) {
        formula = formula.expand(data.schema());
        StructType schema = formula.bind(data.schema());

        Matrix X = formula.matrix(data, false);
        double[] y = formula.y(data).toDoubleArray();

        int n = X.nrow();
        int p = X.ncol();

        if (weights.length != n) {
            throw new IllegalArgumentException(String.format("Invalid weights vector size: %d != %d", weights.length, n));
        }

        for (int i = 0; i < n; i++) {
            if (weights[i] <= 0.0) {
                throw new IllegalArgumentException(String.format("Invalid weights[%d] = %f", i, weights[i]));
            }
        }

        if (lambda.length == 1) {
            double shrinkage = lambda[0];
            lambda = new double[p];
            Arrays.fill(lambda, shrinkage);
        } else if (lambda.length != p) {
            throw new IllegalArgumentException(String.format("Invalid lambda vector size: %d != %d", lambda.length, p));
        }

        for (int i = 0; i < p; i++) {
            if (lambda[i] < 0.0) {
                throw new IllegalArgumentException(String.format("Invalid lambda[%d] = %f", i, lambda[i]));
            }
        }

        if (beta0.length == 1) {
            double beta = beta0[0];
            beta0 = new double[p];
            Arrays.fill(beta0, beta);
        } else if (beta0.length != p) {
            throw new IllegalArgumentException(String.format("Invalid beta0 vector size: %d != %d", beta0.length, p));
        }

        // Standardize the columns of X. The penalty is applied on this scale.
        double[] center = X.colMeans();
        double[] scale = X.colSds();

        for (int j = 0; j < scale.length; j++) {
            if (MathEx.isZero(scale[j])) {
                throw new IllegalArgumentException(String.format("The column '%s' is constant", X.colName(j)));
            }
        }

        Matrix scaledX = X.scale(center, scale);

        // X'W, where W = diag(weights).
        Matrix XtW = new Matrix(p, n);
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < n; j++) {
                XtW.set(i, j, weights[j] * scaledX.get(j, i));
            }
        }

        // Right-hand side X'Wy + lambda * beta0.
        double[] scaledY = XtW.mv(y);
        for (int i = 0; i < p; i++) {
            scaledY[i] += lambda[i] * beta0[i];
        }

        // Solve (X'WX + diag(lambda)) w = X'Wy + lambda * beta0 by Cholesky factorization.
        Matrix XtX = XtW.mm(scaledX);
        XtX.uplo(UPLO.LOWER);
        XtX.addDiag(lambda);
        Matrix.Cholesky cholesky = XtX.cholesky(true);

        double[] w = cholesky.solve(scaledY);

        // Transform the coefficients back to the original scale of X and
        // recover the (unpenalized) intercept.
        for (int j = 0; j < p; j++) {
            w[j] /= scale[j];
        }
        double b = MathEx.mean(y) - MathEx.dot(w, center);

        return new LinearModel(formula, schema, X, y, w, b);
    }
}
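
A short usage sketch (not part of the library source). The CSV file name, the
response column "y", and the lambda values below are hypothetical; Read.csv may
need to be given a CSV format with a header row for named columns to be available.

import java.util.Arrays;
import java.util.Properties;
import smile.data.DataFrame;
import smile.data.formula.Formula;
import smile.io.Read;
import smile.regression.LinearModel;
import smile.regression.RidgeRegression;

public class RidgeExample {
    public static void main(String[] args) throws Exception {
        DataFrame data = Read.csv("data.csv");       // hypothetical data set
        Formula formula = Formula.lhs("y");          // "y" is the response column

        // Fixed shrinkage parameter lambda = 0.1.
        LinearModel m1 = RidgeRegression.fit(formula, data, 0.1);

        // The same fit configured through Properties (see the hyperparameters above).
        Properties params = new Properties();
        params.setProperty("smile.ridge.lambda", "0.1");
        LinearModel m2 = RidgeRegression.fit(formula, data, params);

        // Generalized ridge: unit sample weights, one shared lambda, shrinkage target 0.
        double[] weights = new double[data.size()];
        Arrays.fill(weights, 1.0);
        LinearModel m3 = RidgeRegression.fit(formula, data, weights,
                new double[]{0.1}, new double[]{0.0});

        System.out.println(m1);
    }
}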



