org.apache.commons.math3.distribution.ZipfDistribution Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math3.distribution;
import org.apache.commons.math3.exception.NotStrictlyPositiveException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import org.apache.commons.math3.util.FastMath;
/**
* Implementation of the Zipf distribution.
*
* Parameters:
* For a random variable {@code X} whose values are distributed according to this
* distribution, the probability mass function is given by
*
* P(X = k) = H(N,s) * 1 / k^s for {@code k = 1,2,...,N}.
*
* {@code H(N,s)} is the normalizing constant
* which corresponds to the generalized harmonic number of order N of s.
*
*
* - {@code N} is the number of elements
* - {@code s} is the exponent
*
* @see Zipf's law (Wikipedia)
* @see Generalized harmonic numbers
*/
public class ZipfDistribution extends AbstractIntegerDistribution {
/** Serializable version identifier. */
private static final long serialVersionUID = -140627372283420404L;
/** Number of elements. */
private final int numberOfElements;
/** Exponent parameter of the distribution. */
private final double exponent;
/** Cached numerical mean */
private double numericalMean = Double.NaN;
/** Whether or not the numerical mean has been calculated */
private boolean numericalMeanIsCalculated = false;
/** Cached numerical variance */
private double numericalVariance = Double.NaN;
/** Whether or not the numerical variance has been calculated */
private boolean numericalVarianceIsCalculated = false;
/** The sampler to be used for the sample() method */
private transient ZipfRejectionInversionSampler sampler;
/**
* Create a new Zipf distribution with the given number of elements and
* exponent.
*
* Note: this constructor will implicitly create an instance of
* {@link Well19937c} as random generator to be used for sampling only (see
* {@link #sample()} and {@link #sample(int)}). In case no sampling is
* needed for the created distribution, it is advised to pass {@code null}
* as random generator via the appropriate constructors to avoid the
* additional initialisation overhead.
*
* @param numberOfElements Number of elements.
* @param exponent Exponent.
* @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
* or {@code exponent <= 0}.
*/
public ZipfDistribution(final int numberOfElements, final double exponent) {
this(new Well19937c(), numberOfElements, exponent);
}
/**
* Creates a Zipf distribution.
*
* @param rng Random number generator.
* @param numberOfElements Number of elements.
* @param exponent Exponent.
* @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
* or {@code exponent <= 0}.
* @since 3.1
*/
public ZipfDistribution(RandomGenerator rng,
int numberOfElements,
double exponent)
throws NotStrictlyPositiveException {
super(rng);
if (numberOfElements <= 0) {
throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
numberOfElements);
}
if (exponent <= 0) {
throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
exponent);
}
this.numberOfElements = numberOfElements;
this.exponent = exponent;
}
/**
* Get the number of elements (e.g. corpus size) for the distribution.
*
* @return the number of elements
*/
public int getNumberOfElements() {
return numberOfElements;
}
/**
* Get the exponent characterizing the distribution.
*
* @return the exponent
*/
public double getExponent() {
return exponent;
}
/** {@inheritDoc} */
public double probability(final int x) {
if (x <= 0 || x > numberOfElements) {
return 0.0;
}
return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
}
/** {@inheritDoc} */
@Override
public double logProbability(int x) {
if (x <= 0 || x > numberOfElements) {
return Double.NEGATIVE_INFINITY;
}
return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
}
/** {@inheritDoc} */
public double cumulativeProbability(final int x) {
if (x <= 0) {
return 0.0;
} else if (x >= numberOfElements) {
return 1.0;
}
return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
}
/**
* {@inheritDoc}
*
* For number of elements {@code N} and exponent {@code s}, the mean is
* {@code Hs1 / Hs}, where
*
* - {@code Hs1 = generalizedHarmonic(N, s - 1)},
* - {@code Hs = generalizedHarmonic(N, s)}.
*
*/
public double getNumericalMean() {
if (!numericalMeanIsCalculated) {
numericalMean = calculateNumericalMean();
numericalMeanIsCalculated = true;
}
return numericalMean;
}
/**
* Used by {@link #getNumericalMean()}.
*
* @return the mean of this distribution
*/
protected double calculateNumericalMean() {
final int N = getNumberOfElements();
final double s = getExponent();
final double Hs1 = generalizedHarmonic(N, s - 1);
final double Hs = generalizedHarmonic(N, s);
return Hs1 / Hs;
}
/**
* {@inheritDoc}
*
* For number of elements {@code N} and exponent {@code s}, the mean is
* {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
*
* - {@code Hs2 = generalizedHarmonic(N, s - 2)},
* - {@code Hs1 = generalizedHarmonic(N, s - 1)},
* - {@code Hs = generalizedHarmonic(N, s)}.
*
*/
public double getNumericalVariance() {
if (!numericalVarianceIsCalculated) {
numericalVariance = calculateNumericalVariance();
numericalVarianceIsCalculated = true;
}
return numericalVariance;
}
/**
* Used by {@link #getNumericalVariance()}.
*
* @return the variance of this distribution
*/
protected double calculateNumericalVariance() {
final int N = getNumberOfElements();
final double s = getExponent();
final double Hs2 = generalizedHarmonic(N, s - 2);
final double Hs1 = generalizedHarmonic(N, s - 1);
final double Hs = generalizedHarmonic(N, s);
return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
}
/**
* Calculates the Nth generalized harmonic number. See
* Harmonic
* Series.
*
* @param n Term in the series to calculate (must be larger than 1)
* @param m Exponent (special case {@code m = 1} is the harmonic series).
* @return the nth generalized harmonic number.
*/
private double generalizedHarmonic(final int n, final double m) {
double value = 0;
for (int k = n; k > 0; --k) {
value += 1.0 / FastMath.pow(k, m);
}
return value;
}
/**
* {@inheritDoc}
*
* The lower bound of the support is always 1 no matter the parameters.
*
* @return lower bound of the support (always 1)
*/
public int getSupportLowerBound() {
return 1;
}
/**
* {@inheritDoc}
*
* The upper bound of the support is the number of elements.
*
* @return upper bound of the support
*/
public int getSupportUpperBound() {
return getNumberOfElements();
}
/**
* {@inheritDoc}
*
* The support of this distribution is connected.
*
* @return {@code true}
*/
public boolean isSupportConnected() {
return true;
}
/**
* {@inheritDoc}
*/
@Override
public int sample() {
if (sampler == null) {
sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent);
}
return sampler.sample(random);
}
/**
* Utility class implementing a rejection inversion sampling method for a discrete,
* bounded Zipf distribution that is based on the method described in
*
* Wolfgang Hörmann and Gerhard Derflinger
* "Rejection-inversion to generate variates from monotone discrete distributions."
* ACM Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996): 169-184.
*
* The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI).
* The original method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)}
* as the integral of the hat function. This function is undefined for
* q = 1, which is the reason for the limitation of the exponent.
* If instead the integral function
* {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is used,
* for which a meaningful limit exists for q = 1,
* the method works for all positive exponents.
*
* The following implementation uses v := 0 and generates integral numbers
* in the range [1, numberOfElements]. This is different to the original method
* where v is defined to be positive and numbers are taken from [0, i_max].
* This explains why the implementation looks slightly different.
*
* @since 3.6
*/
static final class ZipfRejectionInversionSampler {
/** Exponent parameter of the distribution. */
private final double exponent;
/** Number of elements. */
private final int numberOfElements;
/** Constant equal to {@code hIntegral(1.5) - 1}. */
private final double hIntegralX1;
/** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */
private final double hIntegralNumberOfElements;
/** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */
private final double s;
/** Simple constructor.
* @param numberOfElements number of elements
* @param exponent exponent parameter of the distribution
*/
ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) {
this.exponent = exponent;
this.numberOfElements = numberOfElements;
this.hIntegralX1 = hIntegral(1.5) - 1d;
this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5);
this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2));
}
/** Generate one integral number in the range [1, numberOfElements].
* @param random random generator to use
* @return generated integral number in the range [1, numberOfElements]
*/
int sample(final RandomGenerator random) {
while(true) {
final double u = hIntegralNumberOfElements + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements);
// u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements]
double x = hIntegralInverse(u);
int k = (int)(x + 0.5);
// Limit k to the range [1, numberOfElements]
// (k could be outside due to numerical inaccuracies)
if (k < 1) {
k = 1;
}
else if (k > numberOfElements) {
k = numberOfElements;
}
// Here, the distribution of k is given by:
//
// P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C
// P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2
//
// where C := 1 / (hIntegralNumberOfElements - hIntegralX1)
if (k - x <= s || u >= hIntegral(k + 0.5) - h(k)) {
// Case k = 1:
//
// The right inequality is always true, because replacing k by 1 gives
// u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from
// (hIntegralX1, hIntegralNumberOfElements].
//
// Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1
// and the probability that 1 is returned as random value is
// P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent
//
// Case k >= 2:
//
// The left inequality (k - x <= s) is just a short cut
// to avoid the more expensive evaluation of the right inequality
// (u >= hIntegral(k + 0.5) - h(k)) in many cases.
//
// If the left inequality is true, the right inequality is also true:
// Theorem 2 in the paper is valid for all positive exponents, because
// the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and
// (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0
// are both fulfilled.
// Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x))
// is a non-decreasing function. If k - x <= s holds,
// k - x <= s + f(k) - f(2) is obviously also true which is equivalent to
// -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
// -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
// and finally u >= hIntegral(k + 0.5) - h(k).
//
// Hence, the right inequality determines the acceptance rate:
// P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2))
// The probability that m is returned is given by
// P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C / m^exponent.
//
// In both cases the probabilities are proportional to the probability mass function
// of the Zipf distribution.
return k;
}
}
}
/**
* {@code H(x) :=}
*
* - {@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1}
* - {@code log(x)}, if {@code exponent == 1}
*
* H(x) is an integral function of h(x),
* the derivative of H(x) is h(x).
*
* @param x free parameter
* @return {@code H(x)}
*/
private double hIntegral(final double x) {
final double logX = FastMath.log(x);
return helper2((1d-exponent)*logX)*logX;
}
/**
* {@code h(x) := 1/x^exponent}
*
* @param x free parameter
* @return h(x)
*/
private double h(final double x) {
return FastMath.exp(-exponent * FastMath.log(x));
}
/**
* The inverse function of H(x).
*
* @param x free parameter
* @return y for which {@code H(y) = x}
*/
private double hIntegralInverse(final double x) {
double t = x*(1d-exponent);
if (t < -1d) {
// Limit value to the range [-1, +inf).
// t could be smaller than -1 in some rare cases due to numerical errors.
t = -1;
}
return FastMath.exp(helper1(t)*x);
}
/**
* Helper function that calculates {@code log(1+x)/x}.
*
* A Taylor series expansion is used, if x is close to 0.
*
* @param x a value larger than or equal to -1
* @return {@code log(1+x)/x}
*/
static double helper1(final double x) {
if (FastMath.abs(x)>1e-8) {
return FastMath.log1p(x)/x;
}
else {
return 1.-x*((1./2.)-x*((1./3.)-x*(1./4.)));
}
}
/**
* Helper function to calculate {@code (exp(x)-1)/x}.
*
* A Taylor series expansion is used, if x is close to 0.
*
* @param x free parameter
* @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0
*/
static double helper2(final double x) {
if (FastMath.abs(x)>1e-8) {
return FastMath.expm1(x)/x;
}
else {
return 1.+x*(1./2.)*(1.+x*(1./3.)*(1.+x*(1./4.)));
}
}
}
}