// com.dimajix.shaded.guava.math.PairedStats Maven / Gradle / Ivy
/*
* Copyright (C) 2012 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.dimajix.shaded.guava.math;
import static com.dimajix.shaded.guava.base.Preconditions.checkArgument;
import static com.dimajix.shaded.guava.base.Preconditions.checkNotNull;
import static com.dimajix.shaded.guava.base.Preconditions.checkState;
import static java.lang.Double.NaN;
import static java.lang.Double.doubleToLongBits;
import static java.lang.Double.isNaN;
import com.dimajix.shaded.guava.annotations.Beta;
import com.dimajix.shaded.guava.annotations.GwtIncompatible;
import com.dimajix.shaded.guava.base.MoreObjects;
import com.dimajix.shaded.guava.base.Objects;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import javax.annotation.CheckForNull;
/**
 * Immutable snapshot of some basic statistics over a collection of paired {@code double} values
 * (for example, points on a plane). Obtain instances from {@link
 * PairedStatsAccumulator#snapshot}.
 *
 * @author Pete Gillin
 * @since 20.0
 */
@Beta
@GwtIncompatible
@ElementTypesAreNonnullByDefault
public final class PairedStats implements Serializable {

  private static final long serialVersionUID = 0;

  /** Length, in bytes, of the array produced by {@link #toByteArray}. */
  private static final int BYTES = 2 * Stats.BYTES + Double.SIZE / Byte.SIZE;

  private final Stats xStats;
  private final Stats yStats;
  private final double sumOfProductsOfDeltas;

  /**
   * Internal constructor; client code should go through {@link PairedStatsAccumulator#snapshot}.
   *
   * <p>None of the following constraints is checked here. It is the caller's responsibility to
   * honour them so that the resulting instance obeys its contract:
   *
   * <ul>
   *   <li>{@code xStats} and {@code yStats} must report the same {@code count}.
   *   <li>When that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0.
   *   <li>When that {@code count} is greater than 1, {@code sumOfProductsOfDeltas} must be finite.
   * </ul>
   */
  PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) {
    this.xStats = xStats;
    this.yStats = yStats;
    this.sumOfProductsOfDeltas = sumOfProductsOfDeltas;
  }

  /** Returns how many pairs the dataset contains. */
  public long count() {
    return xStats.count();
  }

  /** Returns the statistics of the {@code x} values on their own. */
  public Stats xStats() {
    return xStats;
  }

  /** Returns the statistics of the {@code y} values on their own. */
  public Stats yStats() {
    return yStats;
  }

  /**
   * Returns the population covariance of the values. The count must be non-zero.
   *
   * <p>The result is guaranteed to be zero for a dataset holding a single pair of finite values.
   * Because of numerical errors, it is not guaranteed to be zero for a dataset that repeats the
   * same pair of values several times.
   *
   * <h3>Non-finite values</h3>
   *
   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
   *
   * @throws IllegalStateException if the dataset is empty
   */
  public double populationCovariance() {
    long pairCount = count();
    checkState(pairCount != 0);
    return sumOfProductsOfDeltas / pairCount;
  }

  /**
   * Returns the sample covariance of the values. The count must be greater than one.
   *
   * <p>Because of numerical errors, the result is not guaranteed to be zero for a dataset that
   * repeats the same pair of values several times.
   *
   * <h3>Non-finite values</h3>
   *
   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
   *
   * @throws IllegalStateException if the dataset is empty or contains a single pair of values
   */
  public double sampleCovariance() {
    long pairCount = count();
    checkState(pairCount > 1);
    return sumOfProductsOfDeltas / (pairCount - 1);
  }

  /**
   * Returns the Pearson's (product-moment) correlation coefficient of the values. The count must
   * be greater than one, and the {@code x} and {@code y} values must both have non-zero population
   * variance (i.e. {@code xStats().populationVariance() > 0.0 && yStats().populationVariance() >
   * 0.0}). Because of numerical errors the result is not guaranteed to be exactly +/-1 even for
   * perfectly (anti-)correlated data, but it is guaranteed to lie in the inclusive range [-1, +1].
   *
   * <h3>Non-finite values</h3>
   *
   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
   *
   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
   *     if either the {@code x} or the {@code y} dataset has zero population variance
   */
  public double pearsonsCorrelationCoefficient() {
    checkState(count() > 1);
    if (isNaN(sumOfProductsOfDeltas)) {
      return NaN;
    }
    double xSpread = xStats.sumOfSquaresOfDeltas();
    double ySpread = yStats.sumOfSquaresOfDeltas();
    checkState(xSpread > 0.0);
    checkState(ySpread > 0.0);
    // Multiplying two positive numbers can still underflow to zero; ensurePositive then stands in
    // Double.MIN_VALUE so the division below never sees a zero denominator.
    double denominator = Math.sqrt(ensurePositive(xSpread * ySpread));
    return ensureInUnitRange(sumOfProductsOfDeltas / denominator);
  }

  /**
   * Returns a linear transformation giving the best fit to the data according to Ordinary Least
   * Squares linear regression of {@code y} as a function of {@code x}. The count must be greater
   * than one, and at least one of the {@code x} and {@code y} data must have non-zero population
   * variance (i.e. {@code xStats().populationVariance() > 0.0 || yStats().populationVariance() >
   * 0.0}). The result is guaranteed to be horizontal if there is variance in the {@code x} data
   * but not the {@code y} data, and vertical if there is variance in the {@code y} data but not
   * the {@code x} data.
   *
   * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. That
   * error is the square root of the mean of the squares of the differences between the actual
   * {@code y} values of the data and the values the fit predicts for the corresponding {@code x}
   * values (i.e. the square root of the mean of the squares of the vertical distances between the
   * data points and the best fit line). For this fit, the error is a fraction {@code sqrt(1 -
   * R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's
   * correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}).
   *
   * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a
   * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit
   * does not normally minimize that error: to do that, swap the roles of {@code x} and {@code y}.
   *
   * <h3>Non-finite values</h3>
   *
   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link
   * LinearTransformation#forNaN()}.
   *
   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
   *     if both the {@code x} and {@code y} datasets have zero population variance
   */
  public LinearTransformation leastSquaresFit() {
    checkState(count() > 1);
    if (isNaN(sumOfProductsOfDeltas)) {
      return LinearTransformation.forNaN();
    }
    double xSpread = xStats.sumOfSquaresOfDeltas();
    // Both flags are written as "> 0.0" (never "<= 0.0") so a NaN spread counts as "no variance".
    boolean xVaries = xSpread > 0.0;
    boolean yVaries = yStats.sumOfSquaresOfDeltas() > 0.0;
    if (!xVaries) {
      // No spread in x: only a vertical line is possible, and only if y has some spread.
      checkState(yVaries);
      return LinearTransformation.vertical(xStats.mean());
    }
    if (!yVaries) {
      return LinearTransformation.horizontal(yStats.mean());
    }
    double slope = sumOfProductsOfDeltas / xSpread;
    return LinearTransformation.mapping(xStats.mean(), yStats.mean()).withSlope(slope);
  }

  /**
   * {@inheritDoc}
   *
   * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating
   * point values. Two instances are guaranteed to be considered equal if one is copied from the
   * other using {@code second = new PairedStatsAccumulator().addAll(first).snapshot()}, if both
   * were obtained by calling {@code snapshot()} on the same {@link PairedStatsAccumulator} without
   * adding any values in between the two calls, or if one is obtained from the other after
   * round-tripping through java serialization. However, floating point rounding errors mean that
   * it may be false for some instances where the statistics are mathematically equal, including
   * instances constructed from the same values in a different order... or (in the general case)
   * even in the same order. (It is guaranteed to return true for instances constructed from the
   * same values in the same order if {@code strictfp} is in effect, or if the system architecture
   * guarantees {@code strictfp}-like semantics.)
   */
  @Override
  public boolean equals(@CheckForNull Object obj) {
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    PairedStats that = (PairedStats) obj;
    if (!xStats.equals(that.xStats) || !yStats.equals(that.yStats)) {
      return false;
    }
    // Compare bit patterns rather than using ==, so the test is exact on the stored double.
    return doubleToLongBits(sumOfProductsOfDeltas) == doubleToLongBits(that.sumOfProductsOfDeltas);
  }

  /**
   * {@inheritDoc}
   *
   * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics,
   * including the floating point values. See the note on {@link #equals} for details.
   */
  @Override
  public int hashCode() {
    return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas);
  }

  /**
   * Returns a string listing {@code xStats} and {@code yStats}, followed by the population
   * covariance when the dataset is non-empty.
   */
  @Override
  public String toString() {
    if (count() <= 0) {
      // populationCovariance() would throw on an empty dataset, so it is left out here.
      return MoreObjects.toStringHelper(this)
          .add("xStats", xStats)
          .add("yStats", yStats)
          .toString();
    }
    return MoreObjects.toStringHelper(this)
        .add("xStats", xStats)
        .add("yStats", yStats)
        .add("populationCovariance", populationCovariance())
        .toString();
  }

  /** Returns the raw sum of products of deltas held by this instance. */
  double sumOfProductsOfDeltas() {
    return sumOfProductsOfDeltas;
  }

  /** Returns {@code value} if it is greater than zero, otherwise {@link Double#MIN_VALUE}. */
  private static double ensurePositive(double value) {
    return (value > 0.0) ? value : Double.MIN_VALUE;
  }

  /** Clamps {@code value} to the inclusive range [-1, +1]; a NaN argument is returned as is. */
  private static double ensureInUnitRange(double value) {
    return (value >= 1.0) ? 1.0 : ((value <= -1.0) ? -1.0 : value);
  }

  // Serialization helpers

  /**
   * Gets a byte array representation of this instance.
   *
   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
   * versions.
   */
  public byte[] toByteArray() {
    ByteBuffer out = ByteBuffer.allocate(BYTES);
    out.order(ByteOrder.LITTLE_ENDIAN);
    // Layout: x statistics, then y statistics, then the sum of products of deltas.
    xStats.writeTo(out);
    yStats.writeTo(out);
    out.putDouble(sumOfProductsOfDeltas);
    return out.array();
  }

  /**
   * Creates a {@link PairedStats} instance from the given byte representation which was obtained
   * by {@link #toByteArray}.
   *
   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
   * versions.
   */
  public static PairedStats fromByteArray(byte[] byteArray) {
    checkNotNull(byteArray);
    checkArgument(
        byteArray.length == BYTES,
        "Expected PairedStats.BYTES = %s, got %s",
        BYTES,
        byteArray.length);
    ByteBuffer in = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN);
    // Fields are read back in the same order toByteArray() wrote them.
    Stats decodedX = Stats.readFrom(in);
    Stats decodedY = Stats.readFrom(in);
    return new PairedStats(decodedX, decodedY, in.getDouble());
  }
}