/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.classification;

import java.util.Arrays;
import java.util.Properties;
import java.util.stream.IntStream;
import smile.math.MathEx;
import smile.math.DifferentiableMultivariateFunction;
import smile.math.BFGS;
import smile.util.IntSet;
import smile.validation.ModelSelection;

/**
 * Logistic regression. Logistic regression (logit model) is a generalized
 * linear model used for binomial regression. Logistic regression applies
 * maximum likelihood estimation after transforming the dependent into
 * a logit variable. A logit is the natural log of the odds of the dependent
 * equaling a certain value or not (usually 1 in binary logistic models,
 * the highest value in multinomial models). In this way, logistic regression
 * estimates the odds of a certain event (value) occurring. 
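 * <p>
 * For instance, writing {@code p = P(y = 1 | x)} for a binary outcome (the
 * symbols {@code w} and {@code b} below are generic weights and bias, not
 * fields of this class), the model assumes
 * <pre>{@code
 *     logit(p) = log(p / (1 - p)) = w'x + b
 * }</pre>
 * which inverts to the sigmoid {@code p = 1 / (1 + exp(-(w'x + b)))}, the
 * quantity computed by the binomial model's score method.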
 * <p>
 * Goodness-of-fit tests such as the likelihood ratio test are available
 * as indicators of model appropriateness, as is the Wald statistic to test
 * the significance of individual independent variables.
 * <p>
 * Logistic regression has many analogies to ordinary least squares (OLS)
 * regression. Unlike OLS regression, however, logistic regression does not
 * assume linearity of relationship between the raw values of the independent
 * variables and the dependent, does not require normally distributed variables,
 * does not assume homoscedasticity, and in general has less stringent
 * requirements.
 * <p>
 * Compared with linear discriminant analysis, logistic regression has several
 * advantages:
 * <ul>
 * <li> It is more robust: the independent variables don't have to be normally
 *      distributed, or have equal variance in each group.</li>
 * <li> It does not assume a linear relationship between the independent
 *      variables and dependent variable.</li>
 * <li> It may handle nonlinear effects since one can add explicit interaction
 *      and power terms.</li>
 * </ul>
 * However, it requires much more data to achieve stable, meaningful results.
 * <p>
 * Logistic regression also has strong connections with neural network and
 * maximum entropy modeling. For example, binary logistic regression is
 * equivalent to a one-layer, single-output neural network with a logistic
 * activation function trained under log loss. Similarly, multinomial logistic
 * regression is equivalent to a one-layer, softmax-output neural network.
 * <p>
 * Logistic regression estimation also obeys the maximum entropy principle, and
 * thus logistic regression is sometimes called "maximum entropy modeling",
 * and the resulting classifier the "maximum entropy classifier".
 *
 * @see smile.glm.GLM
 * @see MLP
 * @see Maxent
 * @see LDA
 *
 * @author Haifeng Li
 */
public abstract class LogisticRegression extends AbstractClassifier<double[]> {
    private static final long serialVersionUID = 2L;

    /**
     * The dimension of input space.
     */
    int p;

    /**
     * The number of classes.
     */
    int k;

    /**
     * The log-likelihood of learned model.
     */
    double L;

    /**
     * Regularization factor.
     */
    double lambda;

    /**
     * The learning rate for stochastic gradient descent.
     */
    double eta = 0.1;

    /**
     * Constructor.
     * @param p the dimension of input data.
     * @param L the log-likelihood of learned model.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param labels the class label encoder.
     */
    public LogisticRegression(int p, double L, double lambda, IntSet labels) {
        super(labels);
        this.k = labels.size();
        this.p = p;
        this.L = L;
        this.lambda = lambda;
    }

    /** Binomial logistic regression. The dependent variable is nominal of two levels. */
    public static class Binomial extends LogisticRegression {
        /**
         * The linear weights.
         */
        private final double[] w;

        /**
         * Constructor.
         * @param w the weights.
         * @param L the log-likelihood of learned model.
         * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
         *               weights which often has superior generalization performance,
         *               especially when the dimensionality is high.
         * @param labels the class label encoder.
         */
        public Binomial(double[] w, double L, double lambda, IntSet labels) {
            super(w.length - 1, L, lambda, labels);
            this.w = w;
        }

        /**
         * Returns an array of size (p+1) containing the linear weights
         * of binary logistic regression, where p is the dimension of
         * feature vectors. The last element is the weight of bias.
         * @return the linear weights.
         */
        public double[] coefficients() {
            return w;
        }

        @Override
        public double score(double[] x) {
            return 1.0 / (1.0 + Math.exp(-dot(x, w)));
        }

        @Override
        public int predict(double[] x) {
            double f = 1.0 / (1.0 + Math.exp(-dot(x, w)));
            return classes.valueOf(f < 0.5 ? 0 : 1);
        }

        @Override
        public int predict(double[] x, double[] posteriori) {
            if (x.length != p) {
                throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x.length, p));
            }

            if (posteriori.length != k) {
                throw new IllegalArgumentException(String.format("Invalid posteriori vector size: %d, expected: %d", posteriori.length, k));
            }

            double f = 1.0 / (1.0 + Math.exp(-dot(x, w)));
            posteriori[0] = 1.0 - f;
            posteriori[1] = f;
            return classes.valueOf(f < 0.5 ? 0 : 1);
        }

        @Override
        public void update(double[] x, int y) {
            if (x.length != p) {
                throw new IllegalArgumentException("Invalid input vector size: " + x.length);
            }

            y = classes.indexOf(y);

            // calculate gradient for incoming data
            double wx = dot(x, w);
            double err = y - MathEx.sigmoid(wx);

            // update the weights
            w[p] += eta * err;
            for (int j = 0; j < p; j++) {
                w[j] += eta * err * x[j];
            }

            // add regularization part
            if (lambda > 0.0) {
                for (int j = 0; j < p; j++) {
                    w[j] -= eta * lambda * w[j];
                }
            }
        }
    }

    /** Multinomial logistic regression. The dependent variable is nominal with more than two levels. */
    public static class Multinomial extends LogisticRegression {
        /**
         * The linear weights.
         */
        private final double[][] w;

        /**
         * Constructor.
         * @param w the weights.
         * @param L the log-likelihood of learned model.
         * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
         *               weights which often has superior generalization performance,
         *               especially when the dimensionality is high.
         * @param labels the class label encoder.
         */
        public Multinomial(double[][] w, double L, double lambda, IntSet labels) {
            super(w[0].length - 1, L, lambda, labels);
            this.w = w;
        }

        /**
         * Returns a 2d-array of size (k-1) x (p+1), containing the linear weights
         * of multi-class logistic regression, where k is the number of classes
         * and p is the dimension of feature vectors. The last element of each
         * row is the weight of bias.
         * @return the linear weights.
         */
        public double[][] coefficients() {
            return w;
        }

        @Override
        public int predict(double[] x) {
            return predict(x, new double[k]);
        }

        @Override
        public int predict(double[] x, double[] posteriori) {
            if (x.length != p) {
                throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x.length, p));
            }

            if (posteriori.length != k) {
                throw new IllegalArgumentException(String.format("Invalid posteriori vector size: %d, expected: %d", posteriori.length, k));
            }

            posteriori[k-1] = 0.0;
            for (int i = 0; i < k-1; i++) {
                posteriori[i] = dot(x, w[i]);
            }

            MathEx.softmax(posteriori);
            return classes.valueOf(MathEx.whichMax(posteriori));
        }

        @Override
        public void update(double[] x, int y) {
            if (x.length != p) {
                throw new IllegalArgumentException("Invalid input vector size: " + x.length);
            }

            y = classes.indexOf(y);

            double[] prob = new double[k];
            for (int j = 0; j < k-1; j++) {
                prob[j] = dot(x, w[j]);
            }

            MathEx.softmax(prob);

            // update the weights
            for (int i = 0; i < k-1; i++) {
                double[] wi = w[i];
                double err = (y == i ? 1.0 : 0.0) - prob[i];
                wi[p] += eta * err;
                for (int j = 0; j < p; j++) {
                    wi[j] += eta * err * x[j];
                }

                // add regularization part
                if (lambda > 0.0) {
                    for (int j = 0; j < p; j++) {
                        wi[j] -= eta * lambda * wi[j];
                    }
                }
            }
        }
    }

    /**
     * Fits binomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @return the model.
     */
    public static Binomial binomial(double[][] x, int[] y) {
        return binomial(x, y, new Properties());
    }

    /**
     * Fits binomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @param params the hyper-parameters.
     * @return the model.
     */
    public static Binomial binomial(double[][] x, int[] y, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.logistic.lambda", "0.1"));
        double tol = Double.parseDouble(params.getProperty("smile.logistic.tolerance", "1E-5"));
        int maxIter = Integer.parseInt(params.getProperty("smile.logistic.iterations", "500"));
        return binomial(x, y, lambda, tol, maxIter);
    }

    /**
     * Fits binomial logistic regression.
     *
     * @param x training samples.
     * @param y training labels.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param tol the tolerance for stopping iterations.
     * @param maxIter the maximum number of iterations.
     * @return the model.
     */
    public static Binomial binomial(double[][] x, int[] y, double lambda, double tol, int maxIter) {
        if (x.length != y.length) {
            throw new IllegalArgumentException(String.format("The sizes of X and Y don't match: %d != %d", x.length, y.length));
        }

        if (lambda < 0.0) {
            throw new IllegalArgumentException("Invalid regularization factor: " + lambda);
        }

        if (tol <= 0.0) {
            throw new IllegalArgumentException("Invalid tolerance: " + tol);
        }

        if (maxIter <= 0) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        int p = x[0].length;
        ClassLabels codec = ClassLabels.fit(y);
        int k = codec.k;
        y = codec.y;

        if (k != 2) {
            throw new IllegalArgumentException("Fits binomial model on multi-class data.");
        }

        BinomialObjective objective = new BinomialObjective(x, y, lambda);
        double[] w = new double[p + 1];
        double L = -BFGS.minimize(objective, 5, w, tol, maxIter);

        Binomial model = new Binomial(w, L, lambda, codec.classes);
        model.setLearningRate(0.1 / x.length);
        return model;
    }

    /**
     * Fits multinomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @return the model.
     */
    public static Multinomial multinomial(double[][] x, int[] y) {
        return multinomial(x, y, new Properties());
    }

    /**
     * Fits multinomial logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @param params the hyper-parameters.
     * @return the model.
     */
    public static Multinomial multinomial(double[][] x, int[] y, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.logistic.lambda", "0.1"));
        double tol = Double.parseDouble(params.getProperty("smile.logistic.tolerance", "1E-5"));
        int maxIter = Integer.parseInt(params.getProperty("smile.logistic.iterations", "500"));
        return multinomial(x, y, lambda, tol, maxIter);
    }

    /**
     * Fits multinomial logistic regression.
     *
     * @param x training samples.
     * @param y training labels.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param tol the tolerance for stopping iterations.
     * @param maxIter the maximum number of iterations.
     * @return the model.
     */
    public static Multinomial multinomial(double[][] x, int[] y, double lambda, double tol, int maxIter) {
        if (x.length != y.length) {
            throw new IllegalArgumentException(String.format("The sizes of X and Y don't match: %d != %d", x.length, y.length));
        }

        if (lambda < 0.0) {
            throw new IllegalArgumentException("Invalid regularization factor: " + lambda);
        }

        if (tol <= 0.0) {
            throw new IllegalArgumentException("Invalid tolerance: " + tol);
        }

        if (maxIter <= 0) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        int p = x[0].length;
        ClassLabels codec = ClassLabels.fit(y);
        int k = codec.k;
        y = codec.y;

        if (k <= 2) {
            throw new IllegalArgumentException("Fits multinomial model on binary class data.");
        }

        MultinomialObjective objective = new MultinomialObjective(x, y, k, lambda);
        double[] w = new double[(k - 1) * (p + 1)];
        double L = -BFGS.minimize(objective, 5, w, tol, maxIter);

        double[][] W = new double[k-1][p+1];
        for (int i = 0, l = 0; i < k-1; i++) {
            for (int j = 0; j <= p; j++, l++) {
                W[i][j] = w[l];
            }
        }

        Multinomial model = new Multinomial(W, L, lambda, codec.classes);
        model.setLearningRate(0.1 / x.length);
        return model;
    }

    /**
     * Fits logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @return the model.
     */
    public static LogisticRegression fit(double[][] x, int[] y) {
        return fit(x, y, new Properties());
    }

    /**
     * Fits logistic regression.
     * @param x training samples.
     * @param y training labels.
     * @param params the hyper-parameters.
     * @return the model.
     */
    public static LogisticRegression fit(double[][] x, int[] y, Properties params) {
        double lambda = Double.parseDouble(params.getProperty("smile.logistic.lambda", "0.1"));
        double tol = Double.parseDouble(params.getProperty("smile.logistic.tolerance", "1E-5"));
        int maxIter = Integer.parseInt(params.getProperty("smile.logistic.iterations", "500"));
        return fit(x, y, lambda, tol, maxIter);
    }

    /**
     * Fits logistic regression.
     *
     * @param x training samples.
     * @param y training labels.
     * @param lambda {@code lambda > 0} gives a "regularized" estimate of linear
     *               weights which often has superior generalization performance,
     *               especially when the dimensionality is high.
     * @param tol the tolerance to stop iterations.
     * @param maxIter the maximum number of iterations.
     * @return the model.
     */
    public static LogisticRegression fit(double[][] x, int[] y, double lambda, double tol, int maxIter) {
        ClassLabels codec = ClassLabels.fit(y);
        if (codec.k == 2)
            return binomial(x, y, lambda, tol, maxIter);
        else
            return multinomial(x, y, lambda, tol, maxIter);
    }

    /**
     * Binary-class logistic regression objective function.
     */
    static class BinomialObjective implements DifferentiableMultivariateFunction {
        /**
         * Training instances.
         */
        double[][] x;
        /**
         * Training labels.
         */
        int[] y;
        /**
         * The dimension of feature space.
         */
        int p;
        /**
         * Regularization factor.
         */
        double lambda;
        /**
         * The number of samples in a partition.
         */
        int partitionSize;
        /**
         * The number of partitions.
         */
        int partitions;
        /**
         * The workspace to store gradient for each data partition.
         */
        double[][] gradients;

        /**
         * Constructor.
         */
        BinomialObjective(double[][] x, int[] y, double lambda) {
            this.x = x;
            this.y = y;
            this.lambda = lambda;
            this.p = x[0].length;

            partitionSize = Integer.parseInt(System.getProperty("smile.data.partition.size", "1000"));
            partitions = x.length / partitionSize + (x.length % partitionSize == 0 ? 0 : 1);
            gradients = new double[partitions][p+1];
        }

        @Override
        public double f(double[] w) {
            // Since BFGS try to minimize the objective function
            // and we try to maximize the log-likelihood, we really
            // return the negative log-likelihood here.
            double f = IntStream.range(0, x.length).parallel().mapToDouble(i -> {
                double wx = dot(x[i], w);
                return MathEx.log1pe(wx) - y[i] * wx;
            }).sum();

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < p; i++) wnorm += w[i] * w[i];
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }

        @Override
        public double g(double[] w, double[] g) {
            double f = IntStream.range(0, partitions).parallel().mapToDouble(r -> {
                double[] gradient = gradients[r];
                Arrays.fill(gradient, 0.0);

                int begin = r * partitionSize;
                int end = (r + 1) * partitionSize;
                if (end > x.length) end = x.length;

                return IntStream.range(begin, end).sequential().mapToDouble(i -> {
                    double[] xi = x[i];
                    double wx = dot(xi, w);
                    double err = y[i] - MathEx.sigmoid(wx);
                    for (int j = 0; j < p; j++) {
                        gradient[j] -= err * xi[j];
                    }
                    gradient[p] -= err;

                    return MathEx.log1pe(wx) - y[i] * wx;
                }).sum();
            }).sum();

            Arrays.fill(g, 0.0);
            for (double[] gradient : gradients) {
                for (int i = 0; i < g.length; i++) {
                    g[i] += gradient[i];
                }
            }

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < p; i++) {
                    wnorm += w[i] * w[i];
                    g[i] += lambda * w[i];
                }
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }
    }

    /**
     * Multi-class logistic regression objective function.
     */
    static class MultinomialObjective implements DifferentiableMultivariateFunction {
        /**
         * Training instances.
         */
        double[][] x;
        /**
         * Training labels.
         */
        int[] y;
        /**
         * The number of classes.
         */
        int k;
        /**
         * The dimension of feature space.
         */
        int p;
        /**
         * Regularization factor.
         */
        double lambda;
        /**
         * The number of samples in a partition.
         */
        int partitionSize;
        /**
         * The number of partitions.
         */
        int partitions;
        /**
         * The workspace to store gradient for each data partition.
         */
        double[][] gradients;
        /**
         * The workspace to store posteriori probability for each data partition.
         */
        double[][] posterioris;

        /**
         * Constructor.
         */
        MultinomialObjective(double[][] x, int[] y, int k, double lambda) {
            this.x = x;
            this.y = y;
            this.k = k;
            this.lambda = lambda;
            this.p = x[0].length;

            partitionSize = Integer.parseInt(System.getProperty("smile.data.partition.size", "1000"));
            partitions = x.length / partitionSize + (x.length % partitionSize == 0 ? 0 : 1);
            gradients = new double[partitions][(k-1)*(p+1)];
            posterioris = new double[partitions][k];
        }

        @Override
        public double f(double[] w) {
            double f = IntStream.range(0, partitions).parallel().mapToDouble(r -> {
                double[] posteriori = posterioris[r];

                int begin = r * partitionSize;
                int end = (r+1) * partitionSize;
                if (end > x.length) end = x.length;

                return IntStream.range(begin, end).sequential().mapToDouble(i -> {
                    posteriori[k - 1] = 0.0;
                    for (int j = 0; j < k - 1; j++) {
                        posteriori[j] = dot(x[i], w, j, p);
                    }

                    MathEx.softmax(posteriori);
                    return -MathEx.log(posteriori[y[i]]);
                }).sum();
            }).sum();

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < k-1; i++) {
                    for (int j = 0, pos = i * (p+1); j < p; j++) {
                        double wi = w[pos + j];
                        wnorm += wi * wi;
                    }
                }
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }

        @Override
        public double g(double[] w, double[] g) {
            double f = IntStream.range(0, partitions).parallel().mapToDouble(r -> {
                double[] posteriori = posterioris[r];
                double[] gradient = gradients[r];
                Arrays.fill(gradient, 0.0);

                int begin = r * partitionSize;
                int end = (r+1) * partitionSize;
                if (end > x.length) end = x.length;

                return IntStream.range(begin, end).sequential().mapToDouble(i -> {
                    posteriori[k - 1] = 0.0;
                    for (int j = 0; j < k - 1; j++) {
                        posteriori[j] = dot(x[i], w, j, p);
                    }

                    MathEx.softmax(posteriori);

                    for (int j = 0; j < k - 1; j++) {
                        double err = (y[i] == j ? 1.0 : 0.0) - posteriori[j];

                        int pos = j * (p + 1);
                        for (int l = 0; l < p; l++) {
                            gradient[pos + l] -= err * x[i][l];
                        }
                        gradient[pos + p] -= err;
                    }

                    return -MathEx.log(posteriori[y[i]]);
                }).sum();
            }).sum();

            Arrays.fill(g, 0.0);
            for (double[] gradient : gradients) {
                for (int i = 0; i < g.length; i++) {
                    g[i] += gradient[i];
                }
            }

            if (lambda > 0.0) {
                double wnorm = 0.0;
                for (int i = 0; i < k-1; i++) {
                    for (int j = 0, pos = i * (p+1); j < p; j++) {
                        double wi = w[pos + j];
                        wnorm += wi * wi;
                        g[pos + j] += lambda * wi;
                    }
                }
                f += 0.5 * lambda * wnorm;
            }

            return f;
        }
    }

    /**
     * Returns the dot product between weight vector and x (augmented with 1).
     */
    private static double dot(double[] x, double[] w) {
        double dot = w[x.length];

        for (int i = 0; i < x.length; i++) {
            dot += x[i] * w[i];
        }

        return dot;
    }

    /**
     * Returns the dot product between weight vector and x (augmented with 1).
     */
    private static double dot(double[] x, double[] w, int j, int p) {
        int pos = j * (p + 1);
        double dot = w[pos + p];

        for (int i = 0; i < p; i++) {
            dot += x[i] * w[pos+i];
        }

        return dot;
    }

    @Override
    public boolean soft() {
        return true;
    }

    @Override
    public boolean online() {
        return true;
    }

    /**
     * Sets the learning rate of stochastic gradient descent.
     * It is a good practice to adapt the learning rate for
     * different data sizes. For example, it is typical to
     * set the learning rate to eta/n, where eta is in [0.1, 0.3]
     * and n is the size of the training data.
     *
     * @param rate the learning rate.
     */
    public void setLearningRate(double rate) {
        if (rate <= 0.0) {
            throw new IllegalArgumentException("Invalid learning rate: " + rate);
        }
        this.eta = rate;
    }

    /**
     * Returns the learning rate of stochastic gradient descent.
     * @return the learning rate of stochastic gradient descent.
     */
    public double getLearningRate() {
        return eta;
    }

    /**
     * Returns the log-likelihood of model.
     * @return the log-likelihood of model.
     */
    public double loglikelihood() {
        return L;
    }

    /**
     * Returns the AIC score.
     * @return the AIC score.
     */
    public double AIC() {
        return ModelSelection.AIC(L, (k-1)*(p+1));
    }
}
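
/*
 * Usage sketch (illustrative only; the toy data below is made up):
 *
 *     double[][] x = {{0.0, 1.0}, {1.0, 0.5}, {2.0, 2.5}, {3.0, 3.0}};
 *     int[] y = {0, 0, 1, 1};
 *
 *     // Fit a binary model with lambda = 0.1, tolerance 1E-5, at most 500 iterations.
 *     LogisticRegression.Binomial model = LogisticRegression.binomial(x, y, 0.1, 1E-5, 500);
 *
 *     int label = model.predict(x[0]);       // predicted class label
 *     double prob = model.score(x[0]);       // posterior probability of class 1
 *     double logL = model.loglikelihood();   // log-likelihood of the fitted model
 *
 *     // Online updates: adapt the learning rate to the data size first,
 *     // as recommended in setLearningRate's documentation.
 *     model.setLearningRate(0.1 / x.length);
 *     model.update(new double[]{1.5, 1.0}, 1);
 */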