All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.stat.distribution.GaussianDistribution Maven / Gradle / Ivy

There is a newer version: 2024.11.2
Show newest version
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package smile.stat.distribution;

import smile.math.Math;
import smile.math.special.Erf;

/**
 * The normal distribution or Gaussian distribution is a continuous probability
 * distribution that describes data that clusters around a mean.
 * The graph of the associated probability density function is bell-shaped,
 * with a peak at the mean, and is known as the Gaussian function or bell curve.
 * The normal distribution can be used to describe any variable that tends
 * to cluster around the mean.
 * 

 * <p>
 * The family of normal distributions is closed under linear transformations.
 * That is, if X is normally distributed, then a linear transform aX + b
 * (for some real numbers a &ne; 0 and b) is also normally distributed.
 * If X<sub>1</sub>, X<sub>2</sub> are two independent normal random variables,
 * then their linear combination will also be normally distributed.
 * The converse is also true: if X<sub>1</sub> and X<sub>2</sub> are independent
 * and their sum X<sub>1</sub> + X<sub>2</sub> is distributed normally, then
 * both X<sub>1</sub> and X<sub>2</sub> must also be normal, which is known as
 * Cramer's theorem. Of all probability distributions over the reals with mean
 * &mu; and variance &sigma;<sup>2</sup>, the normal distribution
 * N(&mu;, &sigma;<sup>2</sup>) is the one with the maximum entropy.
 * <p>
 * The central limit theorem states that under certain, fairly common
 * conditions, the sum of a large number of random variables will have an
 * approximately normal distribution. For example, if X<sub>1</sub>, &hellip;,
 * X<sub>n</sub> is a sequence of iid random variables, each having mean &mu;
 * and variance &sigma;<sup>2</sup>, but otherwise the distributions of the
 * X<sub>i</sub>'s can be arbitrary, then the central limit theorem states that
 * <p>
 * &radic;n (1&frasl;n &Sigma; X<sub>i</sub> - &mu;) &rarr; N(0, &sigma;<sup>2</sup>).
 * <p>
 * The theorem will hold even if the summands X<sub>i</sub> are not iid,
 * although some constraints on the degree of dependence and the growth rate
 * of moments still have to be imposed.
 * <p>
 * Therefore, certain other distributions can be approximated by the normal
 * distribution, for example:
 * <ul>
 * <li>The binomial distribution B(n, p) is approximately normal
 *     N(np, np(1-p)) for large n and for p not too close to zero or one.
 * <li>The Poisson(&lambda;) distribution is approximately normal
 *     N(&lambda;, &lambda;) for large values of &lambda;.
 * <li>The chi-squared distribution &Chi;<sup>2</sup>(k) is approximately
 *     normal N(k, 2k) for large k.
 * <li>The Student's t-distribution t(&nu;) is approximately normal N(0, 1)
 *     when &nu; is large.
 * </ul>
*
* @author Haifeng Li
*/
public class GaussianDistribution extends AbstractDistribution implements ExponentialFamily {
    private static final long serialVersionUID = 1L;

    /** Entropy constant term: log(2&pi;e) / 2. */
    private static final double LOG2PIE_2 = Math.log(2 * Math.PI * Math.E) / 2;
    /** Log-pdf normalization constant term: log(2&pi;) / 2. */
    private static final double LOG2PI_2 = Math.log(2 * Math.PI) / 2;

    /** Shared standard normal N(0, 1) instance returned by {@link #getInstance()}. */
    private static final GaussianDistribution singleton = new GaussianDistribution(0.0, 1.0);

    /** The mean. */
    private double mu;
    /** The standard deviation. */
    private double sigma;
    /** Cached sigma<sup>2</sup>. */
    private double variance;
    /** Cached entropy: log(sigma) + log(2&pi;e) / 2. */
    private double entropy;
    /** Cached log-pdf normalization: log(sigma) + log(2&pi;) / 2. */
    private double pdfConstant;

    /**
     * Constructor.
     * @param mu mean.
     * @param sigma standard deviation.
     */
    public GaussianDistribution(double mu, double sigma) {
        this.mu = mu;
        this.sigma = sigma;
        variance = sigma * sigma;
        entropy = Math.log(sigma) + LOG2PIE_2;
        pdfConstant = Math.log(sigma) + LOG2PI_2;
    }

    /**
     * Constructor. Mean and standard deviation will be estimated from the data by MLE.
     * @param data the training data.
     */
    public GaussianDistribution(double[] data) {
        mu = Math.mean(data);
        sigma = Math.sd(data);
        variance = sigma * sigma;
        entropy = Math.log(sigma) + LOG2PIE_2;
        pdfConstant = Math.log(sigma) + LOG2PI_2;
    }

    /**
     * Returns the shared standard normal distribution N(0, 1).
     */
    public static GaussianDistribution getInstance() {
        return singleton;
    }

    @Override
    public int npara() {
        // Two free parameters: mu and sigma.
        return 2;
    }

    @Override
    public double mean() {
        return mu;
    }

    @Override
    public double var() {
        return variance;
    }

    @Override
    public double sd() {
        return sigma;
    }

    @Override
    public double entropy() {
        return entropy;
    }

    @Override
    public String toString() {
        return String.format("Gaussian Distribution(%.4f, %.4f)", mu, sigma);
    }

    /**
     * Cache of the spare deviate produced by the polar method, which generates
     * Gaussian deviates in pairs. NOTE(review): this mutable cache makes
     * {@link #rand()} non-thread-safe — confirm callers do not share instances
     * across threads.
     */
    private Double boxMuller;

    /**
     * Generates a Gaussian deviate using the Marsaglia polar method (a
     * Box-Muller variant that avoids trigonometric calls). Uniform deviates
     * are rejected until they fall strictly inside the unit circle.
     */
    @Override
    public double rand() {
        double out;
        if (boxMuller != null) {
            // Use the spare deviate cached by the previous call.
            out = boxMuller;
            boxMuller = null;
        } else {
            double x, y, r;
            do {
                x = Math.random(-1, 1);
                y = Math.random(-1, 1);
                r = x * x + y * y;
                // Reject r == 0 as well: log(0) would produce NaN below.
            } while (r >= 1.0 || r == 0.0);

            double z = Math.sqrt(-2.0 * Math.log(r) / r);
            // Autoboxing instead of the deprecated new Double(...) constructor.
            boxMuller = x * z;
            out = y * z;
        }
        return mu + sigma * out;
    }

    /**
     * Generates a Gaussian deviate by the inverse CDF method, using a
     * rational approximation in the central region and a polynomial in
     * log-log space in the tails (Beasley-Springer-Moro style coefficients).
     */
    public double randInverseCDF() {
        // Rational approximation coefficients for the central region |u - 0.5| < 0.42.
        final double a0 = 2.50662823884;
        final double a1 = -18.61500062529;
        final double a2 = 41.39119773534;
        final double a3 = -25.44106049637;
        final double b0 = -8.47351093090;
        final double b1 = 23.08336743743;
        final double b2 = -21.06224101826;
        final double b3 = 3.13082909833;
        // Polynomial coefficients for the tails.
        final double c0 = 0.3374754822726147;
        final double c1 = 0.9761690190917186;
        final double c2 = 0.1607979714918209;
        final double c3 = 0.0276438810333863;
        final double c4 = 0.0038405729373609;
        final double c5 = 0.0003951896511919;
        final double c6 = 0.0000321767881768;
        final double c7 = 0.0000002888167364;
        final double c8 = 0.0000003960315187;

        double y, r, x;

        // Draw u in (0, 1); u == 0 would make log(-log(u)) blow up in the tail branch.
        double u = Math.random();
        while (u == 0.0) {
            u = Math.random();
        }

        y = u - 0.5;

        if (Math.abs(y) < 0.42) {
            // Central region: rational approximation in y.
            r = y * y;
            x = y * (((a3 * r + a2) * r + a1) * r + a0) / ((((b3 * r + b2) * r + b1) * r + b0) * r + 1);
        } else {
            // Tail region: map to the smaller tail probability and evaluate the polynomial.
            r = u;
            if (y > 0) {
                r = 1 - u;
            }

            r = Math.log(-Math.log(r));
            x = c0 + r * (c1 + r * (c2 + r * (c3 + r * (c4 + r * (c5 + r * (c6 + r * (c7 + r * c8)))))));

            if (y < 0) {
                x = -(x);
            }
        }

        return mu + sigma * x;
    }

    @Override
    public double p(double x) {
        if (sigma == 0) {
            // Degenerate distribution: point mass at mu.
            if (x == mu) {
                return 1.0;
            } else {
                return 0.0;
            }
        }
        return Math.exp(logp(x));
    }

    @Override
    public double logp(double x) {
        if (sigma == 0) {
            // Degenerate distribution: log 1 at mu, log 0 elsewhere.
            if (x == mu) {
                return 0.0;
            } else {
                return Double.NEGATIVE_INFINITY;
            }
        }

        double d = x - mu;
        return -0.5 * d * d / variance - pdfConstant;
    }

    @Override
    public double cdf(double x) {
        if (sigma == 0) {
            // Degenerate distribution: step function at mu.
            if (x < mu) {
                return 0.0;
            } else {
                return 1.0;
            }
        }

        // Phi(x) = erfc(-(x - mu) / (sigma * sqrt(2))) / 2; 0.7071... = 1/sqrt(2).
        return 0.5 * Erf.erfc(-0.707106781186547524 * (x - mu) / sigma);
    }

    /**
     * The quantile, the probability to the left of quantile(p) is p. This is
     * actually the inverse of cdf.
     *
     * Original algorithm and Perl implementation can be found at:
     * http://www.math.uio.no/~jacklam/notes/invnorm/index.html
     */
    @Override
    public double quantile(double p) {
        if (p < 0.0 || p > 1.0) {
            throw new IllegalArgumentException("Invalid p: " + p);
        }

        if (sigma == 0.0) {
            // Degenerate distribution: all mass at mu.
            if (p < 1.0) {
                return mu - 1E-10;
            } else {
                return mu;
            }
        }

        // Phi^{-1}(p) = -sqrt(2) * inverfc(2p); 1.4142... = sqrt(2).
        return -1.41421356237309505 * sigma * Erf.inverfc(2.0 * p) + mu;
    }

    /**
     * The M-step of the EM algorithm for mixture models: re-estimates this
     * component's prior weight, mean and standard deviation from the
     * posterior probabilities of the samples.
     * @param x the data samples.
     * @param posteriori the posterior probability of each sample belonging to
     *        this component.
     * @return the re-estimated mixture component.
     */
    @Override
    public Mixture.Component M(double[] x, double[] posteriori) {
        double alpha = 0.0;
        double mean = 0.0;
        double sd = 0.0;

        // Weighted sums for the component weight and mean.
        for (int i = 0; i < x.length; i++) {
            alpha += posteriori[i];
            mean += x[i] * posteriori[i];
        }
        mean /= alpha;

        // Weighted sum of squared deviations for the standard deviation.
        for (int i = 0; i < x.length; i++) {
            double d = x[i] - mean;
            sd += d * d * posteriori[i];
        }
        sd = Math.sqrt(sd / alpha);

        Mixture.Component c = new Mixture.Component();
        c.priori = alpha;
        c.distribution = new GaussianDistribution(mean, sd);
        return c;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy