
smile.regression.RidgeRegression

/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 */

package smile.regression;

import java.util.Arrays;
import java.util.Properties;
import smile.data.DataFrame;
import smile.data.formula.Formula;
import smile.data.type.StructType;
import smile.math.MathEx;
import smile.math.blas.UPLO;
import smile.math.matrix.Matrix;

/**
 * Ridge Regression. Coefficient estimates for multiple linear regression
 * models rely on the independence of the model terms. When terms are
 * correlated and the columns of the design matrix X have an approximate
 * linear dependence, the matrix X'X becomes close to singular.
 * As a result, the least-squares estimate becomes highly sensitive to random
 * errors in the observed response Y, producing a large variance.
 * <p>
 * Ridge regression is one method to address these issues. In ridge regression,
 * the matrix X'X is perturbed to make its determinant appreciably
 * different from 0.
 * <p>
 * Ridge regression is a kind of Tikhonov regularization, which is the most
 * commonly used method of regularization of ill-posed problems. Ridge
 * regression shrinks the regression coefficients by imposing a penalty on
 * their size. By allowing a small amount of bias in the estimates, more
 * reasonable coefficients may often be obtained. Often, small amounts of bias
 * lead to dramatic reductions in the variance of the estimated model
 * coefficients.
 * <p>
 * Another interpretation of ridge regression is available through Bayesian
 * estimation. In this setting, the belief that the weights should be small is
 * coded into a prior distribution.
 * <p>
 * The penalty term is unfair if the predictor variables are not on the
 * same scale. Therefore, if we know that the variables are not measured
 * in the same units, we typically scale the columns of X (to have sample
 * variance 1), and then we perform ridge regression.
 * <p>
 * When including an intercept term in the regression, we usually leave
 * this coefficient unpenalized. Otherwise, we could add some constant
 * amount to the vector y, and this would not result in the same solution.
 * If we center the columns of X, then the intercept estimate ends up
 * just being the mean of y.
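 * <p>
 * Putting this together, with centered and scaled predictors and a single
 * shrinkage parameter lambda, the criterion being minimized is
 * <pre>{@code
 *     (y - X*beta)' * (y - X*beta) + lambda * beta'*beta
 * }</pre>
 * whose solution is beta = inverse(X'X + lambda*I) * X'y. Adding lambda to
 * the diagonal of X'X is exactly the perturbation that keeps the matrix
 * well-conditioned. (This is the special case W = I and beta0 = 0 of the
 * generalized criterion used by the weighted fit method below.)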

 * <p>
 * Ridge regression does not set coefficients exactly to zero unless
 * λ = ∞, in which case they're all zero. Hence, ridge regression
 * cannot perform variable selection, and even though it performs well in
 * terms of prediction accuracy, it does poorly in terms of offering a clear
 * interpretation.
 *
 * @author Haifeng Li
 */
public class RidgeRegression {
    /**
     * Fits a ridge regression model.
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data) {
        return fit(formula, data, new Properties());
    }
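
    // Note: with an empty Properties object, smile.ridge.lambda defaults to "1"
    // in the overload below. A minimal usage sketch (the response column name
    // "y" is hypothetical, not part of this file):
    //
    //   LinearModel model = RidgeRegression.fit(Formula.lhs("y"), data);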

    /**
     * Fits a ridge regression model. The hyperparameters in <code>params</code> include
     * <ul>
     * <li><code>smile.ridge.lambda</code> is the shrinkage/regularization parameter.
     *     Large lambda means more shrinkage. Choosing an appropriate value of
     *     lambda is important, and also difficult.</li>
     * <li><code>smile.ridge.standard.error</code> is a boolean. If true, compute the
     *     estimated standard errors of the parameter estimates.</li>
     * </ul>
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @param params the hyperparameters.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.ridge.lambda", "1"));
        return fit(formula, data, lambda);
    }

    /**
     * Fits a ridge regression model.
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @param lambda the shrinkage/regularization parameter. Large lambda means
     *               more shrinkage. Choosing an appropriate value of lambda is
     *               important, and also difficult.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data, double lambda) {
        int n = data.size();
        double[] weights = new double[n];
        Arrays.fill(weights, 1.0);
        return fit(formula, data, weights, new double[]{lambda}, new double[]{0.0});
    }

    /**
     * Fits a generalized ridge regression model that minimizes a
     * weighted least squares criterion augmented with a
     * generalized ridge penalty:
     * <pre>{@code
     *     (Y - X'*beta)' * W * (Y - X'*beta) + (beta - beta0)' * lambda * (beta - beta0)
     * }</pre>
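     * where W is the diagonal matrix of sample weights and lambda acts as a
     * diagonal penalty matrix. Setting the gradient of this criterion to zero
     * gives the estimate
     * <pre>{@code
     *     beta = inverse(X'*W*X + diag(lambda)) * (X'*W*Y + diag(lambda)*beta0)
     * }</pre>
     * which is the linear system solved below, on centered and scaled columns,
     * by a Cholesky factorization.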
     *
     * @param formula a symbolic description of the model to be fitted.
     * @param data the data frame of the explanatory and response variables.
     *             NO NEED to include a constant column of 1s for bias.
     * @param weights sample weights.
     * @param lambda the shrinkage/regularization parameter. Large lambda
     *               means more shrinkage. Choosing an appropriate value of
     *               lambda is important, and also difficult. Its length may
     *               be 1 so that its value is applied to all variables.
     * @param beta0 generalized ridge penalty target. Its length may
     *              be 1 so that its value is applied to all variables.
     * @return the model.
     */
    public static LinearModel fit(Formula formula, DataFrame data, double[] weights, double[] lambda, double[] beta0) {
        formula = formula.expand(data.schema());
        StructType schema = formula.bind(data.schema());

        Matrix X = formula.matrix(data, false);
        double[] y = formula.y(data).toDoubleArray();

        int n = X.nrow();
        int p = X.ncol();

        if (weights.length != n) {
            throw new IllegalArgumentException(String.format("Invalid weights vector size: %d != %d", weights.length, n));
        }

        for (int i = 0; i < n; i++) {
            if (weights[i] <= 0.0) {
                throw new IllegalArgumentException(String.format("Invalid weights[%d] = %f", i, weights[i]));
            }
        }

        if (lambda.length == 1) {
            double shrinkage = lambda[0];
            lambda = new double[p];
            Arrays.fill(lambda, shrinkage);
        } else if (lambda.length != p) {
            throw new IllegalArgumentException(String.format("Invalid lambda vector size: %d != %d", lambda.length, p));
        }

        for (int i = 0; i < p; i++) {
            if (lambda[i] < 0.0) {
                throw new IllegalArgumentException(String.format("Invalid lambda[%d] = %f", i, lambda[i]));
            }
        }

        if (beta0.length == 1) {
            double beta = beta0[0];
            beta0 = new double[p];
            Arrays.fill(beta0, beta);
        } else if (beta0.length != p) {
            throw new IllegalArgumentException(String.format("Invalid beta0 vector size: %d != %d", beta0.length, p));
        }

        // Standardize the columns of X. The penalty is applied on this scale.
        double[] center = X.colMeans();
        double[] scale = X.colSds();

        for (int j = 0; j < scale.length; j++) {
            if (MathEx.isZero(scale[j])) {
                throw new IllegalArgumentException(String.format("The column '%s' is constant", X.colName(j)));
            }
        }

        Matrix scaledX = X.scale(center, scale);

        // X'W, where W = diag(weights).
        Matrix XtW = new Matrix(p, n);
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < n; j++) {
                XtW.set(i, j, weights[j] * scaledX.get(j, i));
            }
        }

        // Right-hand side X'Wy + lambda * beta0.
        double[] scaledY = XtW.mv(y);
        for (int i = 0; i < p; i++) {
            scaledY[i] += lambda[i] * beta0[i];
        }

        // Solve (X'WX + diag(lambda)) w = X'Wy + lambda * beta0 by Cholesky factorization.
        Matrix XtX = XtW.mm(scaledX);
        XtX.uplo(UPLO.LOWER);
        XtX.addDiag(lambda);
        Matrix.Cholesky cholesky = XtX.cholesky(true);

        double[] w = cholesky.solve(scaledY);

        // Transform the coefficients back to the original scale of X and
        // recover the (unpenalized) intercept.
        for (int j = 0; j < p; j++) {
            w[j] /= scale[j];
        }
        double b = MathEx.mean(y) - MathEx.dot(w, center);

        return new LinearModel(formula, schema, X, y, w, b);
    }
}
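
A short usage sketch (not part of the library source). The CSV file name, the
response column "y", and the lambda values below are hypothetical; Read.csv may
need to be given a CSV format with a header row for named columns to be available.

import java.util.Arrays;
import java.util.Properties;
import smile.data.DataFrame;
import smile.data.formula.Formula;
import smile.io.Read;
import smile.regression.LinearModel;
import smile.regression.RidgeRegression;

public class RidgeExample {
    public static void main(String[] args) throws Exception {
        DataFrame data = Read.csv("data.csv");       // hypothetical data set
        Formula formula = Formula.lhs("y");          // "y" is the response column

        // Fixed shrinkage parameter lambda = 0.1.
        LinearModel m1 = RidgeRegression.fit(formula, data, 0.1);

        // The same fit configured through Properties (see the hyperparameters above).
        Properties params = new Properties();
        params.setProperty("smile.ridge.lambda", "0.1");
        LinearModel m2 = RidgeRegression.fit(formula, data, params);

        // Generalized ridge: unit sample weights, one shared lambda, shrinkage target 0.
        double[] weights = new double[data.size()];
        Arrays.fill(weights, 1.0);
        LinearModel m3 = RidgeRegression.fit(formula, data, weights,
                new double[]{0.1}, new double[]{0.0});

        System.out.println(m1);
    }
}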



