com.tdunning.math.stats.Comparison Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of t-digest Show documentation
Data structure which allows accurate estimation of quantiles and related rank statistics
The newest version!
/*
 * Licensed to Ted Dunning under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.tdunning.math.stats;

import java.util.Iterator;

/**
 * Static class with methods for comparing distributions.
 */
@SuppressWarnings("WeakerAccess")
public class Comparison {

    /**
     * Use a log-likelihood ratio test to compare two distributions.
     * This is done by estimating counts in quantile ranges from each
     * distribution and then comparing those counts using a multinomial
     * test. The result should be asymptotically chi^2 distributed if
     * the data comes from the same distribution, but this isn't so
     * much useful as a traditional test of a null hypothesis as it is
     * just a reasonably well-behaved score that is bigger when the
     * distributions are more different, subject to having enough data
     * to tell.
     *
     * @param dist1 First distribution (usually the reference)
     * @param dist2 Second distribution to compare (usually the test case)
     * @param qCuts The quantiles that define the bin boundaries. Values ≤0 or ≥1
     *              may result in zero counts. Note that the actual cuts are
     *              defined loosely as dist1.quantile(qCuts[i]).
     * @return A score that is big when dist1 and dist2 are discernibly different.
     * A small score does not mean similarity. Instead, it could just mean insufficient
     * data.
     */
    @SuppressWarnings("WeakerAccess")
    public static double compareChi2(TDigest dist1, TDigest dist2, double[] qCuts) {
        double[][] count = new double[2][];
        count[0] = new double[qCuts.length + 1];
        count[1] = new double[qCuts.length + 1];

        double oldQ = 0;
        double oldQ2 = 0;
        for (int i = 0; i <= qCuts.length; i++) {
            double newQ;
            double x;
            if (i == qCuts.length) {
                newQ = 1;
                x = Math.max(dist1.getMax(), dist2.getMax()) + 1;
            } else {
                newQ = qCuts[i];
                x = dist1.quantile(newQ);
            }
            count[0][i] = dist1.size() * (newQ - oldQ);

            double q2 = dist2.cdf(x);
            count[1][i] = dist2.size() * (q2 - oldQ2);
            oldQ = newQ;
            oldQ2 = q2;
        }

        return llr(count);
    }

    /**
     * Use a log-likelihood ratio test to compare two distributions.
     * With non-linear histograms that have compatible bin boundaries,
     * all that we have to do is compare two count vectors using a
     * chi^2 test (actually a log-likelihood ratio version called a G-test).
     *
     * @param dist1 First distribution (usually the reference)
     * @param dist2 Second distribution to compare (usually the test case)
     * @return A score that is big when dist1 and dist2 are discernibly different.
     * A small score does not mean similarity. Instead, it could just mean insufficient
     * data.
     */
    @SuppressWarnings("WeakerAccess")
    public static double compareChi2(Histogram dist1, Histogram dist2) {
        if (!dist1.getClass().equals(dist2.getClass())) {
            throw new IllegalArgumentException(String.format("Must have same class arguments, got %s and %s",
                    dist1.getClass(), dist2.getClass()));
        }

        long[] k1 = dist1.getCounts();
        long[] k2 = dist2.getCounts();

        int n1 = k1.length;
        if (n1 != k2.length ||
                dist1.lowerBound(0) != dist2.lowerBound(0) ||
                dist1.lowerBound(n1 - 1) != dist2.lowerBound(n1 - 1)) {
            throw new IllegalArgumentException("Incompatible histograms in terms of size or bounds");
        }

        double[][] count = new double[2][n1];
        for (int i = 0; i < n1; i++) {
            count[0][i] = k1[i];
            count[1][i] = k2[i];
        }
        return llr(count);
    }

    @SuppressWarnings("WeakerAccess")
    public static double llr(double[][] count) {
        if (count.length == 0) {
            throw new IllegalArgumentException("Must have some data in llr");
        }
        int columns = count[0].length;
        int rows = count.length;
        double[] rowSums = new double[rows];
        double[] colSums = new double[columns];

        double totalCount = 0;
        double h = 0; // accumulator for entropy

        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < columns; j++) {
                double k = count[i][j];
                rowSums[i] += k;
                colSums[j] += k;
                if (k < 0) {
                    throw new IllegalArgumentException(String.format("Illegal negative count (%.5f) at %d,%d", k, i, j));
                }
                if (k > 0) {
                    h += k * Math.log(k);
                    totalCount += k;
                }
            }
        }

        double normalizer = totalCount * Math.log(totalCount);
        h -= normalizer;  // same as dividing every count by total inside the log

        double hr = 0; // accumulator for row-wise entropy
        for (int i = 0; i < rows; i++) {
            if (rowSums[i] > 0) {
                hr += rowSums[i] * Math.log(rowSums[i]);
            }
        }
        hr -= normalizer;

        double hc = 0; // accumulator for column-wise entropy
        for (int j = 0; j < columns; j++) {
            if (colSums[j] > 0) {
                hc += colSums[j] * Math.log(colSums[j]);
            }
        }
        hc -= normalizer;
        // return value is 2N * mutualInformation(count)
        return 2 * (h - hr - hc);
    }

    /**
     * Returns the observed value of the Kolmogorov-Smirnov statistic normalized by sample counts so
     * that the score should be roughly distributed as sqrt(-log(u)/2). This is equal to the normal
     * KS statistic multiplied by sqrt(m*n/(m+n)) where m and n are the number of samples in d1 and
     * d2 respectively.
     * @param d1  A digest of the first set of samples.
     * @param d2  A digest of the second set of samples.
     * @return A statistic which is bigger when d1 and d2 seem to represent different distributions.
     */
    public static double ks(TDigest d1, TDigest d2) {
        Iterator ix1 = d1.centroids().iterator();
        Iterator ix2 = d2.centroids().iterator();

        double diff = 0;

        double x1 = d1.getMin();
        double x2 = d2.getMin();

        while (x1 <= d1.getMax() && x2 <= d2.getMax()) {
            if (x1 < x2) {
                diff = maxDiff(d1, d2, diff, x1);
                x1 = nextValue(d1, ix1, x1);
            } else if (x1 > x2) {
                diff = maxDiff(d1, d2, diff, x2);
                x2 = nextValue(d2, ix2, x2);
            } else if (x1 == x2) {
                diff = maxDiff(d1, d2, diff, x1);

                double q1 = d1.cdf(x1);
                double q2 = d2.cdf(x2);
                if (q1 < q2) {
                    x1 = nextValue(d1, ix1, x1);
                } else if (q1 > q2) {
                    x2 = nextValue(d2, ix2, x2);
                } else {
                    x1 = nextValue(d1, ix1, x1);
                    x2 = nextValue(d2, ix2, x2);
                }
            }
        }
        while (x1 <= d1.getMax()) {
            diff = maxDiff(d1, d2, diff, x1);
            x1 = nextValue(d1, ix1, x1);
        }

        while (x2 <= d2.getMax()) {
            diff = maxDiff(d2, d2, diff, x2);
            x2 = nextValue(d2, ix2, x2);
        }

        long n1 = d1.size();
        long n2 = d2.size();
        return diff * Math.sqrt((double) n1 * n2 / (n1 + n2));
    }

    private static double maxDiff(TDigest d1, TDigest d2, double diff, double x1) {
        diff = Math.max(diff, Math.abs(d1.cdf(x1) - d2.cdf(x1)));
        return diff;
    }

    private static double nextValue(TDigest d, Iterator ix, double x) {
        if (ix.hasNext()) {
            return ix.next().mean();
        } else if (x < d.getMax()) {
            return d.getMax();
        } else {
            return d.getMax() + 1;
        }
    }
}