/*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* This project is based on a modification of https://github.com/tdunning/t-digest which is licensed under the Apache 2.0 License.
*/
package org.elasticsearch.tdigest;
import org.apache.lucene.util.Accountable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.tdigest.arrays.TDigestArrays;
import java.util.Collection;
import java.util.Locale;
/**
* Adaptive histogram based on something like streaming k-means crossed with Q-digest.
* The special characteristics of this algorithm are:
* - smaller summaries than Q-digest
* - works on doubles as well as integers.
* - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy for middle quantiles
* - fast
* - simple
* - test coverage roughly at 90%
* - easy to adapt for use with map-reduce
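 *
 * A minimal usage sketch (illustrative only; {@code arrays} stands for whichever {@link TDigestArrays}
 * implementation the caller has available, and compression 100 is just a typical choice):
 * <pre>{@code
 * TDigest digest = TDigest.createMergingDigest(arrays, 100);
 * for (double sample : samples) {
 *     digest.add(sample);
 * }
 * double median = digest.quantile(0.5);
 * double p99 = digest.quantile(0.99);
 * digest.close(); // TDigest is Releasable
 * }</pre>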
*/
public abstract class TDigest implements Releasable, Accountable {
protected ScaleFunction scale = ScaleFunction.K_2;
double min = Double.POSITIVE_INFINITY;
double max = Double.NEGATIVE_INFINITY;
/**
 * Creates a {@link MergingDigest}. This is the fastest implementation for large sample populations, with constant memory
 * allocation while delivering relative accuracy close to 1%.
*
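 * A sketch of the two ends of the compression range (the {@code arrays} argument is an assumed,
 * caller-provided {@link TDigestArrays} instance):
 * <pre>{@code
 * MergingDigest typical = TDigest.createMergingDigest(arrays, 100);   // common default
 * MergingDigest precise = TDigest.createMergingDigest(arrays, 1000);  // more centroids: better accuracy, more memory
 * }</pre>
 *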
* @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large.
* The number of centroids retained will be a smallish (usually less than 10) multiple of this number.
* @return the MergingDigest
*/
public static MergingDigest createMergingDigest(TDigestArrays arrays, double compression) {
return MergingDigest.create(arrays, compression);
}
/**
* Creates an {@link AVLTreeDigest}. This is the most accurate implementation, delivering relative accuracy close to 0.01% for large
* sample populations. Still, its construction takes 2x-10x longer than {@link MergingDigest}, while its memory footprint increases
* (slowly) with the sample population size.
*
* @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large.
* The number of centroids retained will be a smallish (usually less than 10) multiple of this number.
* @return the AvlTreeDigest
*/
public static AVLTreeDigest createAvlTreeDigest(TDigestArrays arrays, double compression) {
return AVLTreeDigest.create(arrays, compression);
}
/**
* Creates a {@link SortingDigest}. SortingDigest is the most accurate and an extremely fast implementation but stores all samples
* internally so it uses much more memory than the rest, for sample populations of 1000 or higher.
*
* @return the SortingDigest
*/
public static SortingDigest createSortingDigest(TDigestArrays arrays) {
return SortingDigest.create(arrays);
}
/**
* Creates a {@link HybridDigest}. HybridDigest uses a SortingDigest for small sample populations, then switches to a MergingDigest,
* thus combining the best of both implementations: fastest overall, small footprint and perfect accuracy for small populations,
* constant memory footprint and acceptable accuracy for larger ones.
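 *
 * Sketch (assuming, as above, a caller-provided {@link TDigestArrays} instance named {@code arrays}):
 * <pre>{@code
 * HybridDigest digest = TDigest.createHybridDigest(arrays, 100);
 * // exact, SortingDigest-like behavior while the sample count stays small,
 * // bounded-memory MergingDigest behavior once it grows large
 * }</pre>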
*
* @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large.
* The number of centroids retained will be a smallish (usually less than 10) multiple of this number.
* @return the HybridDigest
*/
public static HybridDigest createHybridDigest(TDigestArrays arrays, double compression) {
return HybridDigest.create(arrays, compression);
}
/**
* Adds a sample to a histogram.
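 * Calling {@code add(x, 3)}, for example, is effectively the same as adding the value {@code x} three times.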
*
* @param x The value to add.
* @param w The weight of this point.
*/
public abstract void add(double x, long w);
/**
* Add a single sample to this TDigest.
*
* @param x The data value to add
*/
public final void add(double x) {
add(x, 1);
}
static void checkValue(double x) {
if (Double.isNaN(x) || Double.isInfinite(x)) {
throw new IllegalArgumentException("Invalid value: " + x);
}
}
/**
* Re-examines a t-digest to determine whether some centroids are redundant. If your data are
* perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
*
* The cost is roughly the same as adding as many data points as there are centroids. This
* is typically < 10 * compression, but could be as high as 100 * compression.
*
* This is a destructive operation that is not thread-safe.
*/
public abstract void compress();
/**
* Returns the number of points that have been added to this TDigest.
*
* @return The sum of the weights on all centroids.
*/
public abstract long size();
/**
* Returns the fraction of all points added which are ≤ x. Points
* that are exactly equal get half credit (i.e. we use the mid-point
* rule)
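 *
 * For example, after {@code add(1); add(2); add(3);} an exact digest (e.g. a {@link SortingDigest})
 * reports {@code cdf(2) == 0.5} (one point below plus half credit for the equal point, out of three)
 * and {@code cdf(2.5) == 2.0 / 3.0}; approximating implementations may deviate slightly.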
*
* @param x The cutoff for the cdf.
* @return The fraction of all data which is less or equal to x.
*/
public abstract double cdf(double x);
/**
* Returns an estimate of a cutoff such that a specified fraction of the data
* added to this TDigest would be less than or equal to the cutoff.
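 *
 * {@code quantile} is (approximately) the inverse of {@link #cdf(double)}: for a value {@code x} within the
 * observed range, {@code quantile(cdf(x))} is close to {@code x}. For instance, {@code quantile(0.5)}
 * estimates the median of the added samples.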
*
* @param q The desired fraction
* @return The smallest value x such that cdf(x) ≥ q
*/
public abstract double quantile(double q);
/**
* A {@link Collection} that lets you go through the centroids in ascending order by mean. Centroids
* returned will not be re-used, but may or may not share storage with this TDigest.
*
* @return The centroids in the form of a Collection.
*/
public abstract Collection<Centroid> centroids();
/**
* Returns the current compression factor.
*
* @return The compression factor originally used to set up the TDigest.
*/
public abstract double compression();
/**
* Returns the number of bytes required to encode this TDigest using #asBytes().
*
* @return The number of bytes required.
*/
public abstract int byteSize();
public void setScaleFunction(ScaleFunction scaleFunction) {
if (scaleFunction.toString().endsWith("NO_NORM")) {
throw new IllegalArgumentException(String.format(Locale.ROOT, "Can't use %s as scale with %s", scaleFunction, this.getClass()));
}
this.scale = scaleFunction;
}
/**
* Add all of the centroids of another TDigest to this one.
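 *
 * This is the natural merge step for parallel or map-reduce style use: build one digest per worker, then
 * fold the partial digests into one. A sketch (assuming {@code partials} holds the per-worker digests and
 * {@code arrays} is a caller-provided {@link TDigestArrays} instance):
 * <pre>{@code
 * TDigest merged = TDigest.createMergingDigest(arrays, 100);
 * for (TDigest partial : partials) {
 *     merged.add(partial);
 * }
 * double globalP95 = merged.quantile(0.95);
 * }</pre>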
*
* @param other The other TDigest
*/
public abstract void add(TDigest other);
public abstract int centroidCount();
/**
* Prepare internal structure for loading the requested number of samples.
* @param size number of samples to be loaded
*/
public void reserve(long size) {}
public double getMin() {
return min;
}
public double getMax() {
return max;
}
}