All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.dimajix.shaded.guava.math.Stats Maven / Gradle / Ivy

/*
 * Copyright (C) 2012 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.dimajix.shaded.guava.math;

import static com.dimajix.shaded.guava.base.Preconditions.checkArgument;
import static com.dimajix.shaded.guava.base.Preconditions.checkNotNull;
import static com.dimajix.shaded.guava.base.Preconditions.checkState;
import static com.dimajix.shaded.guava.math.DoubleUtils.ensureNonNegative;
import static com.dimajix.shaded.guava.math.StatsAccumulator.calculateNewMeanNonFinite;
import static com.dimajix.shaded.guava.primitives.Doubles.isFinite;
import static java.lang.Double.NaN;
import static java.lang.Double.doubleToLongBits;
import static java.lang.Double.isNaN;

import com.dimajix.shaded.guava.annotations.GwtIncompatible;
import com.dimajix.shaded.guava.annotations.J2ktIncompatible;
import com.dimajix.shaded.guava.base.MoreObjects;
import com.dimajix.shaded.guava.base.Objects;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Iterator;
import java.util.stream.Collector;
import java.util.stream.DoubleStream;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
import javax.annotation.CheckForNull;

/**
 * A bundle of statistical summary values -- sum, count, mean/average, min and max, and several
 * forms of variance -- that were computed from a single set of zero or more floating-point values.
 *
 * 

There are two ways to obtain a {@code Stats} instance: * *

    *
  • If all the values you want to summarize are already known, use the appropriate {@code * Stats.of} factory method below. Primitive arrays, iterables and iterators of any kind of * {@code Number}, and primitive varargs are supported. *
  • Or, to avoid storing up all the data first, create a {@link StatsAccumulator} instance, * feed values to it as you get them, then call {@link StatsAccumulator#snapshot}. *
* *

Static convenience methods called {@code meanOf} are also provided for users who wish to * calculate only the mean. * *

Java 8 users: If you are not using any of the variance statistics, you may wish to use * built-in JDK libraries instead of this class. * * @author Pete Gillin * @author Kevin Bourrillion * @since 20.0 */ @J2ktIncompatible @GwtIncompatible @ElementTypesAreNonnullByDefault public final class Stats implements Serializable { private final long count; private final double mean; private final double sumOfSquaresOfDeltas; private final double min; private final double max; /** * Internal constructor. Users should use {@link #of} or {@link StatsAccumulator#snapshot}. * *

To ensure that the created instance obeys its contract, the parameters should satisfy the * following constraints. This is the callers responsibility and is not enforced here. * *

    *
  • If {@code count} is 0, {@code mean} may have any finite value (its only usage will be to * get multiplied by 0 to calculate the sum), and the other parameters may have any values * (they will not be used). *
  • If {@code count} is 1, {@code sumOfSquaresOfDeltas} must be exactly 0.0 or {@link * Double#NaN}. *
*/ Stats(long count, double mean, double sumOfSquaresOfDeltas, double min, double max) { this.count = count; this.mean = mean; this.sumOfSquaresOfDeltas = sumOfSquaresOfDeltas; this.min = min; this.max = max; } /** * Returns statistics over a dataset containing the given values. * * @param values a series of values, which will be converted to {@code double} values (this may * cause loss of precision) */ public static Stats of(Iterable values) { StatsAccumulator accumulator = new StatsAccumulator(); accumulator.addAll(values); return accumulator.snapshot(); } /** * Returns statistics over a dataset containing the given values. The iterator will be completely * consumed by this method. * * @param values a series of values, which will be converted to {@code double} values (this may * cause loss of precision) */ public static Stats of(Iterator values) { StatsAccumulator accumulator = new StatsAccumulator(); accumulator.addAll(values); return accumulator.snapshot(); } /** * Returns statistics over a dataset containing the given values. * * @param values a series of values */ public static Stats of(double... values) { StatsAccumulator accumulator = new StatsAccumulator(); accumulator.addAll(values); return accumulator.snapshot(); } /** * Returns statistics over a dataset containing the given values. * * @param values a series of values */ public static Stats of(int... values) { StatsAccumulator accumulator = new StatsAccumulator(); accumulator.addAll(values); return accumulator.snapshot(); } /** * Returns statistics over a dataset containing the given values. * * @param values a series of values, which will be converted to {@code double} values (this may * cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15)) */ public static Stats of(long... values) { StatsAccumulator accumulator = new StatsAccumulator(); accumulator.addAll(values); return accumulator.snapshot(); } /** * Returns statistics over a dataset containing the given values. The stream will be completely * consumed by this method. * *

If you have a {@code Stream} rather than a {@code DoubleStream}, you should collect * the values using {@link #toStats()} instead. * * @param values a series of values * @since 28.2 */ public static Stats of(DoubleStream values) { return values .collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll) .snapshot(); } /** * Returns statistics over a dataset containing the given values. The stream will be completely * consumed by this method. * *

If you have a {@code Stream} rather than an {@code IntStream}, you should collect * the values using {@link #toStats()} instead. * * @param values a series of values * @since 28.2 */ public static Stats of(IntStream values) { return values .collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll) .snapshot(); } /** * Returns statistics over a dataset containing the given values. The stream will be completely * consumed by this method. * *

If you have a {@code Stream} rather than a {@code LongStream}, you should collect the * values using {@link #toStats()} instead. * * @param values a series of values, which will be converted to {@code double} values (this may * cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15)) * @since 28.2 */ public static Stats of(LongStream values) { return values .collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll) .snapshot(); } /** * Returns a {@link Collector} which accumulates statistics from a {@link java.util.stream.Stream} * of any type of boxed {@link Number} into a {@link Stats}. Use by calling {@code * boxedNumericStream.collect(toStats())}. The numbers will be converted to {@code double} values * (which may cause loss of precision). * *

If you have any of the primitive streams {@code DoubleStream}, {@code IntStream}, or {@code * LongStream}, you should use the factory method {@link #of} instead. * * @since 28.2 */ public static Collector toStats() { return Collector.of( StatsAccumulator::new, (a, x) -> a.add(x.doubleValue()), (l, r) -> { l.addAll(r); return l; }, StatsAccumulator::snapshot, Collector.Characteristics.UNORDERED); } /** Returns the number of values. */ public long count() { return count; } /** * Returns the arithmetic mean of the * values. The count must be non-zero. * *

If these values are a sample drawn from a population, this is also an unbiased estimator of * the arithmetic mean of the population. * *

Non-finite values

* *

If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}. * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}. * *

If you only want to calculate the mean, use {@link #meanOf} instead of creating a {@link * Stats} instance. * * @throws IllegalStateException if the dataset is empty */ public double mean() { checkState(count != 0); return mean; } /** * Returns the sum of the values. * *

Non-finite values

* *

If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}. * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}. */ public double sum() { return mean * count; } /** * Returns the population * variance of the values. The count must be non-zero. * *

This is guaranteed to return zero if the dataset contains only exactly one finite value. It * is not guaranteed to return zero when the dataset consists of the same value multiple times, * due to numerical errors. However, it is guaranteed never to return a negative result. * *

Non-finite values

* *

If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. * * @throws IllegalStateException if the dataset is empty */ public double populationVariance() { checkState(count > 0); if (isNaN(sumOfSquaresOfDeltas)) { return NaN; } if (count == 1) { return 0.0; } return ensureNonNegative(sumOfSquaresOfDeltas) / count(); } /** * Returns the * population standard deviation of the values. The count must be non-zero. * *

This is guaranteed to return zero if the dataset contains only exactly one finite value. It * is not guaranteed to return zero when the dataset consists of the same value multiple times, * due to numerical errors. However, it is guaranteed never to return a negative result. * *

Non-finite values

* *

If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. * * @throws IllegalStateException if the dataset is empty */ public double populationStandardDeviation() { return Math.sqrt(populationVariance()); } /** * Returns the unbiased sample * variance of the values. If this dataset is a sample drawn from a population, this is an * unbiased estimator of the population variance of the population. The count must be greater than * one. * *

This is not guaranteed to return zero when the dataset consists of the same value multiple * times, due to numerical errors. However, it is guaranteed never to return a negative result. * *

Non-finite values

* *

If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. * * @throws IllegalStateException if the dataset is empty or contains a single value */ public double sampleVariance() { checkState(count > 1); if (isNaN(sumOfSquaresOfDeltas)) { return NaN; } return ensureNonNegative(sumOfSquaresOfDeltas) / (count - 1); } /** * Returns the * corrected sample standard deviation of the values. If this dataset is a sample drawn from a * population, this is an estimator of the population standard deviation of the population which * is less biased than {@link #populationStandardDeviation()} (the unbiased estimator depends on * the distribution). The count must be greater than one. * *

This is not guaranteed to return zero when the dataset consists of the same value multiple * times, due to numerical errors. However, it is guaranteed never to return a negative result. * *

Non-finite values

* *

If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. * * @throws IllegalStateException if the dataset is empty or contains a single value */ public double sampleStandardDeviation() { return Math.sqrt(sampleVariance()); } /** * Returns the lowest value in the dataset. The count must be non-zero. * *

Non-finite values

* *

If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it * contains {@link Double#NEGATIVE_INFINITY} and not {@link Double#NaN} then the result is {@link * Double#NEGATIVE_INFINITY}. If it contains {@link Double#POSITIVE_INFINITY} and finite values * only then the result is the lowest finite value. If it contains {@link * Double#POSITIVE_INFINITY} only then the result is {@link Double#POSITIVE_INFINITY}. * * @throws IllegalStateException if the dataset is empty */ public double min() { checkState(count != 0); return min; } /** * Returns the highest value in the dataset. The count must be non-zero. * *

Non-finite values

* *

If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it * contains {@link Double#POSITIVE_INFINITY} and not {@link Double#NaN} then the result is {@link * Double#POSITIVE_INFINITY}. If it contains {@link Double#NEGATIVE_INFINITY} and finite values * only then the result is the highest finite value. If it contains {@link * Double#NEGATIVE_INFINITY} only then the result is {@link Double#NEGATIVE_INFINITY}. * * @throws IllegalStateException if the dataset is empty */ public double max() { checkState(count != 0); return max; } /** * {@inheritDoc} * *

Note: This tests exact equality of the calculated statistics, including the floating * point values. Two instances are guaranteed to be considered equal if one is copied from the * other using {@code second = new StatsAccumulator().addAll(first).snapshot()}, if both were * obtained by calling {@code snapshot()} on the same {@link StatsAccumulator} without adding any * values in between the two calls, or if one is obtained from the other after round-tripping * through java serialization. However, floating point rounding errors mean that it may be false * for some instances where the statistics are mathematically equal, including instances * constructed from the same values in a different order... or (in the general case) even in the * same order. (It is guaranteed to return true for instances constructed from the same values in * the same order if {@code strictfp} is in effect, or if the system architecture guarantees * {@code strictfp}-like semantics.) */ @Override public boolean equals(@CheckForNull Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } Stats other = (Stats) obj; return count == other.count && doubleToLongBits(mean) == doubleToLongBits(other.mean) && doubleToLongBits(sumOfSquaresOfDeltas) == doubleToLongBits(other.sumOfSquaresOfDeltas) && doubleToLongBits(min) == doubleToLongBits(other.min) && doubleToLongBits(max) == doubleToLongBits(other.max); } /** * {@inheritDoc} * *

Note: This hash code is consistent with exact equality of the calculated statistics, * including the floating point values. See the note on {@link #equals} for details. */ @Override public int hashCode() { return Objects.hashCode(count, mean, sumOfSquaresOfDeltas, min, max); } @Override public String toString() { if (count() > 0) { return MoreObjects.toStringHelper(this) .add("count", count) .add("mean", mean) .add("populationStandardDeviation", populationStandardDeviation()) .add("min", min) .add("max", max) .toString(); } else { return MoreObjects.toStringHelper(this).add("count", count).toString(); } } double sumOfSquaresOfDeltas() { return sumOfSquaresOfDeltas; } /** * Returns the arithmetic mean of the * values. The count must be non-zero. * *

The definition of the mean is the same as {@link Stats#mean}. * * @param values a series of values, which will be converted to {@code double} values (this may * cause loss of precision) * @throws IllegalArgumentException if the dataset is empty */ public static double meanOf(Iterable values) { return meanOf(values.iterator()); } /** * Returns the arithmetic mean of the * values. The count must be non-zero. * *

The definition of the mean is the same as {@link Stats#mean}. * * @param values a series of values, which will be converted to {@code double} values (this may * cause loss of precision) * @throws IllegalArgumentException if the dataset is empty */ public static double meanOf(Iterator values) { checkArgument(values.hasNext()); long count = 1; double mean = values.next().doubleValue(); while (values.hasNext()) { double value = values.next().doubleValue(); count++; if (isFinite(value) && isFinite(mean)) { // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) mean += (value - mean) / count; } else { mean = calculateNewMeanNonFinite(mean, value); } } return mean; } /** * Returns the arithmetic mean of the * values. The count must be non-zero. * *

The definition of the mean is the same as {@link Stats#mean}. * * @param values a series of values * @throws IllegalArgumentException if the dataset is empty */ public static double meanOf(double... values) { checkArgument(values.length > 0); double mean = values[0]; for (int index = 1; index < values.length; index++) { double value = values[index]; if (isFinite(value) && isFinite(mean)) { // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) mean += (value - mean) / (index + 1); } else { mean = calculateNewMeanNonFinite(mean, value); } } return mean; } /** * Returns the arithmetic mean of the * values. The count must be non-zero. * *

The definition of the mean is the same as {@link Stats#mean}. * * @param values a series of values * @throws IllegalArgumentException if the dataset is empty */ public static double meanOf(int... values) { checkArgument(values.length > 0); double mean = values[0]; for (int index = 1; index < values.length; index++) { double value = values[index]; if (isFinite(value) && isFinite(mean)) { // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) mean += (value - mean) / (index + 1); } else { mean = calculateNewMeanNonFinite(mean, value); } } return mean; } /** * Returns the arithmetic mean of the * values. The count must be non-zero. * *

The definition of the mean is the same as {@link Stats#mean}. * * @param values a series of values, which will be converted to {@code double} values (this may * cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15)) * @throws IllegalArgumentException if the dataset is empty */ public static double meanOf(long... values) { checkArgument(values.length > 0); double mean = values[0]; for (int index = 1; index < values.length; index++) { double value = values[index]; if (isFinite(value) && isFinite(mean)) { // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) mean += (value - mean) / (index + 1); } else { mean = calculateNewMeanNonFinite(mean, value); } } return mean; } // Serialization helpers /** The size of byte array representation in bytes. */ static final int BYTES = (Long.SIZE + Double.SIZE * 4) / Byte.SIZE; /** * Gets a byte array representation of this instance. * *

Note: No guarantees are made regarding stability of the representation between * versions. */ public byte[] toByteArray() { ByteBuffer buff = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN); writeTo(buff); return buff.array(); } /** * Writes to the given {@link ByteBuffer} a byte representation of this instance. * *

Note: No guarantees are made regarding stability of the representation between * versions. * * @param buffer A {@link ByteBuffer} with at least BYTES {@link ByteBuffer#remaining}, ordered as * {@link ByteOrder#LITTLE_ENDIAN}, to which a BYTES-long byte representation of this instance * is written. In the process increases the position of {@link ByteBuffer} by BYTES. */ void writeTo(ByteBuffer buffer) { checkNotNull(buffer); checkArgument( buffer.remaining() >= BYTES, "Expected at least Stats.BYTES = %s remaining , got %s", BYTES, buffer.remaining()); buffer .putLong(count) .putDouble(mean) .putDouble(sumOfSquaresOfDeltas) .putDouble(min) .putDouble(max); } /** * Creates a Stats instance from the given byte representation which was obtained by {@link * #toByteArray}. * *

Note: No guarantees are made regarding stability of the representation between * versions. */ public static Stats fromByteArray(byte[] byteArray) { checkNotNull(byteArray); checkArgument( byteArray.length == BYTES, "Expected Stats.BYTES = %s remaining , got %s", BYTES, byteArray.length); return readFrom(ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN)); } /** * Creates a Stats instance from the byte representation read from the given {@link ByteBuffer}. * *

Note: No guarantees are made regarding stability of the representation between * versions. * * @param buffer A {@link ByteBuffer} with at least BYTES {@link ByteBuffer#remaining}, ordered as * {@link ByteOrder#LITTLE_ENDIAN}, from which a BYTES-long byte representation of this * instance is read. In the process increases the position of {@link ByteBuffer} by BYTES. */ static Stats readFrom(ByteBuffer buffer) { checkNotNull(buffer); checkArgument( buffer.remaining() >= BYTES, "Expected at least Stats.BYTES = %s remaining , got %s", BYTES, buffer.remaining()); return new Stats( buffer.getLong(), buffer.getDouble(), buffer.getDouble(), buffer.getDouble(), buffer.getDouble()); } private static final long serialVersionUID = 0; }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy