/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.classification;

import java.util.Arrays;
import java.util.Properties;
import java.util.stream.IntStream;
import smile.math.MathEx;
import smile.math.DifferentiableMultivariateFunction;
import smile.math.BFGS;
import smile.util.IntSet;
import smile.validation.ModelSelection;

/**
 * Logistic regression. Logistic regression (logit model) is a generalized
 * linear model used for binomial regression. Logistic regression applies
 * maximum likelihood estimation after transforming the dependent into
 * a logit variable. A logit is the natural log of the odds of the dependent
 * equaling a certain value or not (usually 1 in binary logistic models,
 * the highest value in multinomial models). In this way, logistic regression
 * estimates the odds of a certain event (value) occurring. 
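 * <p>
 * For instance, writing {@code p = P(y = 1 | x)} for a binary outcome (the
 * symbols {@code w} and {@code b} below are generic weights and bias, not
 * fields of this class), the model assumes
 * <pre>{@code
 *     logit(p) = log(p / (1 - p)) = w'x + b
 * }</pre>
 * which inverts to the sigmoid {@code p = 1 / (1 + exp(-(w'x + b)))}, the
 * quantity computed by the binomial model's score method.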
 * <p>
 * Goodness-of-fit tests such as the likelihood ratio test are available
 * as indicators of model appropriateness, as is the Wald statistic to test
 * the significance of individual independent variables.
 * <p>
 * Logistic regression has many analogies to ordinary least squares (OLS)
 * regression. Unlike OLS regression, however, logistic regression does not
 * assume linearity of relationship between the raw values of the independent
 * variables and the dependent, does not require normally distributed variables,
 * does not assume homoscedasticity, and in general has less stringent
 * requirements.
 * <p>
 * Compared with linear discriminant analysis, logistic regression has several
 * advantages:
 * <ul>
 * <li> It is more robust: the independent variables don't have to be normally
 *      distributed, or have equal variance in each group.</li>
 * <li> It does not assume a linear relationship between the independent
 *      variables and dependent variable.</li>
 * <li> It may handle nonlinear effects since one can add explicit interaction
 *      and power terms.</li>
 * </ul>
 * However, it requires much more data to achieve stable, meaningful results.
 * <p>
 * Logistic regression also has strong connections with neural network and
 * maximum entropy modeling. For example, binary logistic regression is
 * equivalent to a one-layer, single-output neural network with a logistic
 * activation function trained under log loss. Similarly, multinomial logistic
 * regression is equivalent to a one-layer, softmax-output neural network.
 * <p>
 * Logistic regression estimation also obeys the maximum entropy principle, and
 * thus logistic regression is sometimes called "maximum entropy modeling",
 * and the resulting classifier the "maximum entropy classifier".
 *
 * @see smile.glm.GLM
 * @see MLP
 * @see Maxent
 * @see LDA
 *
 * @author Haifeng Li
 */
public abstract class LogisticRegression extends AbstractClassifier<double[]> {
    private static final long serialVersionUID = 2L;

    /**
     * The dimension of input space.
     */
    int p;

    /**
     * The number of classes.
     */
    int k;

    /**
     * The log-likelihood of learned model.
     */
    double L;

    /**
     * Regularization factor.
     */
    double lambda;

    /**
     * The learning rate for stochastic gradient descent.
     */
    double eta = 0.1;

    /**
     * Constructor.
     * @param p the dimension of input data.
     * @param L the log-likelihood of learned model.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param labels the class label encoder.
     */
    public LogisticRegression(int p, double L, double lambda, IntSet labels) {
        super(labels);
        this.k = labels.size();
        this.p = p;
        this.L = L;
        this.lambda = lambda;
    }

    /** Binomial logistic regression. The dependent variable is nominal of two levels. */
    public static class Binomial extends LogisticRegression {
        /**
         * The linear weights.
         */
        private final double[] w;

        /**
         * Constructor.
         * @param w the weights.
         * @param L the log-likelihood of learned model.
         * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
         *               weights which often has superior generalization performance,
         *               especially when the dimensionality is high.
         * @param labels the class label encoder.
         */
        public Binomial(double[] w, double L, double lambda, IntSet labels) {
            super(w.length - 1, L, lambda, labels);
            this.w = w;
        }

        /**
         * Returns an array of size (p+1) containing the linear weights
         * of binary logistic regression, where p is the dimension of
         * feature vectors. The last element is the weight of bias.
         * @return the linear weights.
         */
        public double[] coefficients() {
            return w;
        }

        @Override
        public double score(double[] x) {
            return 1.0 / (1.0 + Math.exp(-dot(x, w)));
        }

        @Override
        public int predict(double[] x) {
            double f = 1.0 / (1.0 + Math.exp(-dot(x, w)));
            return classes.valueOf(f < 0.5 ? 0 : 1);
        }

        @Override
        public int predict(double[] x, double[] posteriori) {
            if (x.length != p) {
                throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x.length, p));
            }

            if (posteriori.length != k) {
                throw new IllegalArgumentException(String.format("Invalid posteriori vector size: %d, expected: %d", posteriori.length, k));
            }

            double f = 1.0 / (1.0 + Math.exp(-dot(x, w)));
            posteriori[0] = 1.0 - f;
            posteriori[1] = f;
            return classes.valueOf(f < 0.5 ? 0 : 1);
        }

        @Override
        public void update(double[] x, int y) {
            if (x.length != p) {
                throw new IllegalArgumentException("Invalid input vector size: " + x.length);
            }

            y = classes.indexOf(y);

            // calculate gradient for incoming data
            double wx = dot(x, w);
            double err = y - MathEx.sigmoid(wx);

            // update the weights
            w[p] += eta * err;
            for (int j = 0; j < p; j++) {
                w[j] += eta * err * x[j];
            }

            // add regularization part
            if (lambda > 0.0) {
                for (int j = 0; j < p; j++) {
                    w[j] -= eta * lambda * w[j];
                }
            }
        }
    }

    /** Multinomial logistic regression. The dependent variable is nominal with more than two levels. */
    public static class Multinomial extends LogisticRegression {
        /**
         * The linear weights.
         */
        private final double[][] w;

        /**
         * Constructor.
         * @param w the weights.
         * @param L the log-likelihood of learned model.
         * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
         *               weights which often has superior generalization performance,
         *               especially when the dimensionality is high.
         * @param labels the class label encoder.
         */
        public Multinomial(double[][] w, double L, double lambda, IntSet labels) {
            super(w[0].length - 1, L, lambda, labels);
            this.w = w;
        }

        /**
         * Returns a 2d-array of size (k-1) x (p+1), containing the linear weights
         * of multi-class logistic regression, where k is the number of classes
         * and p is the dimension of feature vectors. The last element of each
         * row is the weight of bias.
         * @return the linear weights.
         */
        public double[][] coefficients() {
            return w;
        }

        @Override
        public int predict(double[] x) {
            return predict(x, new double[k]);
        }

        @Override
        public int predict(double[] x, double[] posteriori) {
            if (x.length != p) {
                throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x.length, p));
            }

            if (posteriori.length != k) {
                throw new IllegalArgumentException(String.format("Invalid posteriori vector size: %d, expected: %d", posteriori.length, k));
            }

            posteriori[k-1] = 0.0;
            for (int i = 0; i < k-1; i++) {
                posteriori[i] = dot(x, w[i]);
            }

            MathEx.softmax(posteriori);
            return classes.valueOf(MathEx.whichMax(posteriori));
        }

        @Override
        public void update(double[] x, int y) {
            if (x.length != p) {
                throw new IllegalArgumentException("Invalid input vector size: " + x.length);
            }

            y = classes.indexOf(y);

            double[] prob = new double[k];
            for (int j = 0; j < k-1; j++) {
                prob[j] = dot(x, w[j]);
            }

            MathEx.softmax(prob);

            // update the weights
            for (int i = 0; i < k-1; i++) {
                double[] wi = w[i];
                double err = (y == i ? 1.0 : 0.0) - prob[i];
                wi[p] += eta * err;
                for (int j = 0; j < p; j++) {
                    wi[j] += eta * err * x[j];
                }

                // add regularization part
                if (lambda > 0.0) {
                    for (int j = 0; j < p; j++) {
                        wi[j] -= eta * lambda * wi[j];
                    }
                }
            }
        }
    }

    /**
     * Fits binomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @return the model.
     */
    public static Binomial binomial(double[][] x, int[] y) {
        return binomial(x, y, new Properties());
    }

    /**
     * Fits binomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @param params the hyper-parameters.
     * @return the model.
     */
    public static Binomial binomial(double[][] x, int[] y, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.logistic.lambda", "0.1"));
        double tol = Double.parseDouble(params.getProperty("smile.logistic.tolerance", "1E-5"));
        int maxIter = Integer.parseInt(params.getProperty("smile.logistic.iterations", "500"));
        return binomial(x, y, lambda, tol, maxIter);
    }

    /**
     * Fits binomial logistic regression.
     *
     * @param x training samples.
     * @param y training labels.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param tol the tolerance for stopping iterations.
     * @param maxIter the maximum number of iterations.
     * @return the model.
     */
    public static Binomial binomial(double[][] x, int[] y, double lambda, double tol, int maxIter) {
        if (x.length != y.length) {
            throw new IllegalArgumentException(String.format("The sizes of X and Y don't match: %d != %d", x.length, y.length));
        }

        if (lambda < 0.0) {
            throw new IllegalArgumentException("Invalid regularization factor: " + lambda);
        }

        if (tol <= 0.0) {
            throw new IllegalArgumentException("Invalid tolerance: " + tol);
        }

        if (maxIter <= 0) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        int p = x[0].length;
        ClassLabels codec = ClassLabels.fit(y);
        int k = codec.k;
        y = codec.y;

        if (k != 2) {
            throw new IllegalArgumentException("Fits binomial model on multi-class data.");
        }

        BinomialObjective objective = new BinomialObjective(x, y, lambda);
        double[] w = new double[p + 1];
        double L = -BFGS.minimize(objective, 5, w, tol, maxIter);

        Binomial model = new Binomial(w, L, lambda, codec.classes);
        model.setLearningRate(0.1 / x.length);
        return model;
    }

    /**
     * Fits multinomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @return the model.
     */
    public static Multinomial multinomial(double[][] x, int[] y) {
        return multinomial(x, y, new Properties());
    }

    /**
     * Fits multinomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @param params the hyper-parameters.
     * @return the model.
     */
    public static Multinomial multinomial(double[][] x, int[] y, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.logistic.lambda", "0.1"));
        double tol = Double.parseDouble(params.getProperty("smile.logistic.tolerance", "1E-5"));
        int maxIter = Integer.parseInt(params.getProperty("smile.logistic.iterations", "500"));
        return multinomial(x, y, lambda, tol, maxIter);
    }

    /**
     * Fits multinomial logistic regression.
     *
     * @param x training samples.
     * @param y training labels.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param tol the tolerance for stopping iterations.
     * @param maxIter the maximum number of iterations.
     * @return the model.
     */
    public static Multinomial multinomial(double[][] x, int[] y, double lambda, double tol, int maxIter) {
        if (x.length != y.length) {
            throw new IllegalArgumentException(String.format("The sizes of X and Y don't match: %d != %d", x.length, y.length));
        }

        if (lambda < 0.0) {
            throw new IllegalArgumentException("Invalid regularization factor: " + lambda);
        }

        if (tol <= 0.0) {
            throw new IllegalArgumentException("Invalid tolerance: " + tol);
        }

        if (maxIter <= 0) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        int p = x[0].length;
        ClassLabels codec = ClassLabels.fit(y);
        int k = codec.k;
        y = codec.y;

        if (k <= 2) {
            throw new IllegalArgumentException("Fits multinomial model on binary class data.");
        }

        MultinomialObjective objective = new MultinomialObjective(x, y, k, lambda);
        double[] w = new double[(k - 1) * (p + 1)];
        double L = -BFGS.minimize(objective, 5, w, tol, maxIter);

        double[][] W = new double[k-1][p+1];
        for (int i = 0, l = 0; i < k-1; i++) {
            for (int j = 0; j <= p; j++, l++) {
                W[i][j] = w[l];
            }
        }

        Multinomial model = new Multinomial(W, L, lambda, codec.classes);
        model.setLearningRate(0.1 / x.length);
        return model;
    }

    /**
     * Fits logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @return the model.
     */
    public static LogisticRegression fit(double[][] x, int[] y) {
        return fit(x, y, new Properties());
    }

    /**
     * Fits logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @param params the hyper-parameters.
     * @return the model.
     */
    public static LogisticRegression fit(double[][] x, int[] y, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.logistic.lambda", "0.1"));
        double tol = Double.parseDouble(params.getProperty("smile.logistic.tolerance", "1E-5"));
        int maxIter = Integer.parseInt(params.getProperty("smile.logistic.iterations", "500"));
        return fit(x, y, lambda, tol, maxIter);
    }

    /**
     * Fits logistic regression.
     *
     * @param x training samples.
     * @param y training labels.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param tol the tolerance to stop iterations.
     * @param maxIter the maximum number of iterations.
     * @return the model.
     */
    public static LogisticRegression fit(double[][] x, int[] y, double lambda, double tol, int maxIter) {
        ClassLabels codec = ClassLabels.fit(y);
        if (codec.k == 2)
            return binomial(x, y, lambda, tol, maxIter);
        else
            return multinomial(x, y, lambda, tol, maxIter);
    }

    /**
     * Binary-class logistic regression objective function.
     */
    static class BinomialObjective implements DifferentiableMultivariateFunction {
        /**
         * Training instances.
         */
        double[][] x;
        /**
         * Training labels.
         */
        int[] y;
        /**
         * The dimension of feature space.
         */
        int p;
        /**
         * Regularization factor.
         */
        double lambda;
        /**
         * The number of samples in a partition.
         */
        int partitionSize;
        /**
         * The number of partitions.
         */
        int partitions;
        /**
         * The workspace to store gradient for each data partition.
         */
        double[][] gradients;

        /**
         * Constructor.
         */
        BinomialObjective(double[][] x, int[] y, double lambda) {
            this.x = x;
            this.y = y;
            this.lambda = lambda;
            this.p = x[0].length;

            partitionSize = Integer.parseInt(System.getProperty("smile.data.partition.size", "1000"));
            partitions = x.length / partitionSize + (x.length % partitionSize == 0 ? 0 : 1);
            gradients = new double[partitions][p+1];
        }

        @Override
        public double f(double[] w) {
            // Since BFGS try to minimize the objective function
            // and we try to maximize the log-likelihood, we really
            // return the negative log-likelihood here.
            double f = IntStream.range(0, x.length).parallel().mapToDouble(i -> {
                double wx = dot(x[i], w);
                return MathEx.log1pe(wx) - y[i] * wx;
            }).sum();

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < p; i++) wnorm += w[i] * w[i];
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }

        @Override
        public double g(double[] w, double[] g) {
            double f = IntStream.range(0, partitions).parallel().mapToDouble(r -> {
                double[] gradient = gradients[r];
                Arrays.fill(gradient, 0.0);

                int begin = r * partitionSize;
                int end = (r + 1) * partitionSize;
                if (end > x.length) end = x.length;

                return IntStream.range(begin, end).sequential().mapToDouble(i -> {
                    double[] xi = x[i];
                    double wx = dot(xi, w);
                    double err = y[i] - MathEx.sigmoid(wx);
                    for (int j = 0; j < p; j++) {
                        gradient[j] -= err * xi[j];
                    }
                    gradient[p] -= err;

                    return MathEx.log1pe(wx) - y[i] * wx;
                }).sum();
            }).sum();

            Arrays.fill(g, 0.0);
            for (double[] gradient : gradients) {
                for (int i = 0; i < g.length; i++) {
                    g[i] += gradient[i];
                }
            }

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < p; i++) {
                    wnorm += w[i] * w[i];
                    g[i] += lambda * w[i];
                }
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }
    }

    /**
     * Multi-class logistic regression objective function.
     */
    static class MultinomialObjective implements DifferentiableMultivariateFunction {
        /**
         * Training instances.
         */
        double[][] x;
        /**
         * Training labels.
         */
        int[] y;
        /**
         * The number of classes.
         */
        int k;
        /**
         * The dimension of feature space.
         */
        int p;
        /**
         * Regularization factor.
         */
        double lambda;
        /**
         * The number of samples in a partition.
         */
        int partitionSize;
        /**
         * The number of partitions.
         */
        int partitions;
        /**
         * The workspace to store gradient for each data partition.
         */
        double[][] gradients;
        /**
         * The workspace to store posteriori probability for each data partition.
         */
        double[][] posterioris;

        /**
         * Constructor.
         */
        MultinomialObjective(double[][] x, int[] y, int k, double lambda) {
            this.x = x;
            this.y = y;
            this.k = k;
            this.lambda = lambda;
            this.p = x[0].length;

            partitionSize = Integer.parseInt(System.getProperty("smile.data.partition.size", "1000"));
            partitions = x.length / partitionSize + (x.length % partitionSize == 0 ? 0 : 1);
            gradients = new double[partitions][(k-1)*(p+1)];
            posterioris = new double[partitions][k];
        }

        @Override
        public double f(double[] w) {
            double f = IntStream.range(0, partitions).parallel().mapToDouble(r -> {
                double[] posteriori = posterioris[r];

                int begin = r * partitionSize;
                int end = (r+1) * partitionSize;
                if (end > x.length) end = x.length;

                return IntStream.range(begin, end).sequential().mapToDouble(i -> {
                    posteriori[k - 1] = 0.0;
                    for (int j = 0; j < k - 1; j++) {
                        posteriori[j] = dot(x[i], w, j, p);
                    }

                    MathEx.softmax(posteriori);
                    return -MathEx.log(posteriori[y[i]]);
                }).sum();
            }).sum();

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < k-1; i++) {
                    for (int j = 0, pos = i * (p+1); j < p; j++) {
                        double wi = w[pos + j];
                        wnorm += wi * wi;
                    }
                }
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }

        @Override
        public double g(double[] w, double[] g) {
            double f = IntStream.range(0, partitions).parallel().mapToDouble(r -> {
                double[] posteriori = posterioris[r];
                double[] gradient = gradients[r];
                Arrays.fill(gradient, 0.0);

                int begin = r * partitionSize;
                int end = (r+1) * partitionSize;
                if (end > x.length) end = x.length;

                return IntStream.range(begin, end).sequential().mapToDouble(i -> {
                    posteriori[k - 1] = 0.0;
                    for (int j = 0; j < k - 1; j++) {
                        posteriori[j] = dot(x[i], w, j, p);
                    }

                    MathEx.softmax(posteriori);

                    for (int j = 0; j < k - 1; j++) {
                        double err = (y[i] == j ? 1.0 : 0.0) - posteriori[j];

                        int pos = j * (p + 1);
                        for (int l = 0; l < p; l++) {
                            gradient[pos + l] -= err * x[i][l];
                        }
                        gradient[pos + p] -= err;
                    }

                    return -MathEx.log(posteriori[y[i]]);
                }).sum();
            }).sum();

            Arrays.fill(g, 0.0);
            for (double[] gradient : gradients) {
                for (int i = 0; i < g.length; i++) {
                    g[i] += gradient[i];
                }
            }

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < k-1; i++) {
                    for (int j = 0, pos = i * (p+1); j < p; j++) {
                        double wi = w[pos + j];
                        wnorm += wi * wi;
                        g[pos + j] += lambda * wi;
                    }
                }
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }
    }

    /**
     * Returns the dot product between weight vector and x (augmented with 1).
     */
    private static double dot(double[] x, double[] w) {
        double dot = w[x.length];

        for (int i = 0; i < x.length; i++) {
            dot += x[i] * w[i];
        }

        return dot;
    }

    /**
     * Returns the dot product between weight vector and x (augmented with 1).
     */
    private static double dot(double[] x, double[] w, int j, int p) {
        int pos = j * (p + 1);
        double dot = w[pos + p];

        for (int i = 0; i < p; i++) {
            dot += x[i] * w[pos+i];
        }

        return dot;
    }

    @Override
    public boolean soft() {
        return true;
    }

    @Override
    public boolean online() {
        return true;
    }

    /**
     * Sets the learning rate of stochastic gradient descent.
     * It is a good practice to adapt the learning rate for
     * different data sizes. For example, it is typical to
     * set the learning rate to eta/n, where eta is in [0.1, 0.3]
     * and n is the size of the training data.
     *
     * @param rate the learning rate.
     */
    public void setLearningRate(double rate) {
        if (rate <= 0.0) {
            throw new IllegalArgumentException("Invalid learning rate: " + rate);
        }
        this.eta = rate;
    }

    /**
     * Returns the learning rate of stochastic gradient descent.
     * @return the learning rate of stochastic gradient descent.
     */
    public double getLearningRate() {
        return eta;
    }

    /**
     * Returns the log-likelihood of model.
     * @return the log-likelihood of model.
     */
    public double loglikelihood() {
        return L;
    }

    /**
     * Returns the AIC score.
     * @return the AIC score.
     */
    public double AIC() {
        return ModelSelection.AIC(L, (k-1)*(p+1));
    }
}
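
/*
 * Usage sketch (illustrative only; the toy data below is made up):
 *
 *     double[][] x = {{0.0, 1.0}, {1.0, 0.5}, {2.0, 2.5}, {3.0, 3.0}};
 *     int[] y = {0, 0, 1, 1};
 *
 *     // Fit a binary model with lambda = 0.1, tolerance 1E-5, at most 500 iterations.
 *     LogisticRegression.Binomial model = LogisticRegression.binomial(x, y, 0.1, 1E-5, 500);
 *
 *     int label = model.predict(x[0]);       // predicted class label
 *     double prob = model.score(x[0]);       // posterior probability of class 1
 *     double logL = model.loglikelihood();   // log-likelihood of the fitted model
 *
 *     // Online updates: adapt the learning rate to the data size first,
 *     // as recommended in setLearningRate's documentation.
 *     model.setLearningRate(0.1 / x.length);
 *     model.update(new double[]{1.5, 1.0}, 1);
 */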