
smile.stat.distribution.Distribution Maven / Gradle / Ivy
/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see .
*/
package smile.stat.distribution;
import smile.math.MathEx;
import java.io.Serializable;
/**
* Probability distribution of univariate random variable. A probability
* distribution identifies either the probability of each value of a random
* variable (when the variable is discrete), or the probability of the value
* falling within a particular interval (when the variable is continuous).
* When the random variable takes values in the set of real numbers, the
* probability distribution is completely described by the cumulative
* distribution function, whose value at each real x is the probability
* that the random variable is smaller than or equal to x.
*
* Both rejection and inverse transform sampling methods are implemented to
* provide some general approaches to generate random samples based on
* probability density function or quantile function. Besides, a quantile
* function is also provided based on bisection searching.
*
* @see MultivariateDistribution
*
* @author Haifeng Li
*/
public interface Distribution extends Serializable {
/**
* Returns the number of parameters of the distribution.
* The "length" is in the sense of the minimum description
* length principle.
* @return The number of parameters.
*/
int length();
/**
* Returns the mean of distribution.
* @return The mean.
*/
double mean();
/**
* Returns the variance of distribution.
* @return The variance.
*/
double variance();
/**
* Returns the standard deviation of distribution.
* @return The standard deviation.
*/
default double sd() {
return Math.sqrt(variance());
}
/**
* Returns Shannon entropy of the distribution.
* @return Shannon entropy.
*/
double entropy();
/**
* Generates a random number following this distribution.
* @return a random number.
*/
double rand();
/**
* Generates a set of random numbers following this distribution.
* @param n the number of random numbers to generate.
* @return an array of random numbers.
*/
default double[] rand(int n) {
double[] data = new double[n];
for (int i = 0; i < n; i++) {
data[i] = rand();
}
return data;
}
/**
* The probability density function for continuous distribution
* or probability mass function for discrete distribution at x.
* @param x a real number.
* @return the density.
*/
double p(double x);
/**
* The density at x in log scale, which may prevents the underflow problem.
* @param x a real number.
* @return the log density.
*/
double logp(double x);
/**
* Cumulative distribution function. That is the probability to the left of x.
* @param x a real number.
* @return the probability.
*/
double cdf(double x);
/**
* The quantile, the probability to the left of quantile is p. It is
* actually the inverse of cdf.
* @param p the probability.
* @return the quantile.
*/
double quantile(double p);
/**
* The likelihood of the sample set following this distribution.
* @param x a set of samples.
* @return the likelihood.
*/
default double likelihood(double[] x) {
return Math.exp(logLikelihood(x));
}
/**
* The log likelihood of the sample set following this distribution.
* @param x a set of samples.
* @return the log likelihood.
*/
default double logLikelihood(double[] x) {
double L = 0.0;
for (double xi : x)
L += logp(xi);
return L;
}
/**
* Use the rejection technique to draw a sample from the given
* distribution. WARNING: this simulation technique can
* take a very long time. Rejection sampling is also commonly
* called the acceptance-rejection method or "accept-reject algorithm".
* It generates sampling values from an arbitrary probability distribution
* function f(x) by using an instrumental distribution g(x), under the
* only restriction that {@code f(x) < M g(x)} where
* {@code M > 1} is an appropriate bound on
* {@code f(x) / g(x)}.
*
* Rejection sampling is usually used in cases where the form of
* f(x)
makes sampling difficult. Instead of sampling
* directly from the distribution f(x)
, we use an envelope
* distribution M g(x)
where sampling is easier. These
* samples from M g(x)
are probabilistically
* accepted or rejected.
*
* This method relates to the general field of Monte Carlo techniques,
* including Markov chain Monte Carlo algorithms that also use a proxy
* distribution to achieve simulation from the target distribution
* f(x)
. It forms the basis for algorithms such as
* the Metropolis algorithm.
*
* @param pmax the scale of instrumental distribution (uniform).
* @param xmin the lower bound of random variable range.
* @param xmax the upper bound of random variable range.
* @return a random number.
*/
default double rejectionSampling(double pmax, double xmin, double xmax) {
double x;
double y;
do {
x = xmin + MathEx.random() * (xmax - xmin);
y = MathEx.random() * pmax;
} while (p(x) < y);
return x;
}
/**
* Use inverse transform sampling (also known as the inverse probability
* integral transform or inverse transformation method or Smirnov transform)
* to draw a sample from the given distribution. This is a method for
* generating sample numbers at random from any probability distribution
* given its cumulative distribution function (cdf). Subject to the
* restriction that the distribution is continuous, this method is
* generally applicable (and can be computationally efficient if the
* cdf can be analytically inverted), but may be too computationally
* expensive in practice for some probability distributions. The
* Box-Muller transform is an example of an algorithm which is
* less general but more computationally efficient. It is often the
* case that, even for simple distributions, the inverse transform
* sampling method can be improved on, given substantial research
* effort, e.g. the ziggurat algorithm and rejection sampling.
*
* @return a random number.
*/
default double inverseTransformSampling() {
double u = MathEx.random();
return quantile(u);
}
/**
* Inversion of CDF by bisection numeric root finding of "cdf(x) = p"
* for continuous distribution.
*
* @param p the probability.
* @param xmin the lower bound of search range.
* @param xmax the upper bound of search range.
* @param eps the epsilon close to zero.
* @return the quantile.
*/
default double quantile(double p, double xmin, double xmax, double eps) {
if (eps <= 0.0) {
throw new IllegalArgumentException("Invalid epsilon: " + eps);
}
while (Math.abs(xmax - xmin) > eps) {
double xmed = (xmax + xmin) / 2;
if (cdf(xmed) > p) {
xmax = xmed;
} else {
xmin = xmed;
}
}
return xmin;
}
/**
* Inversion of CDF by bisection numeric root finding of "cdf(x) = p"
* for continuous distribution. The default epsilon is 1E-6.
* @param p the probability.
* @param xmin the lower bound of search range.
* @param xmax the upper bound of search range.
* @return the quantile.
*/
default double quantile(double p, double xmin, double xmax) {
return quantile(p, xmin, xmax, 1.0E-6);
}
}