/*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* This project is based on a modification of https://github.com/tdunning/t-digest which is licensed under the Apache 2.0 License.
*/
package org.elasticsearch.tdigest;
import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.tdigest.arrays.TDigestArrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Random;
import static org.elasticsearch.tdigest.IntAVLTree.NIL;
public class AVLTreeDigest extends AbstractTDigest {
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(AVLTreeDigest.class);
private final TDigestArrays arrays;
private boolean closed = false;
final Random gen = new Random();
private final double compression;
private AVLGroupTree summary;
private long count = 0; // total weight of all samples added to this digest
// Indicates if a sample has been added after the last compression.
private boolean needsCompression;
static AVLTreeDigest create(TDigestArrays arrays, double compression) {
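// Account for this object's shallow size against the memory breaker before allocating,
// and roll the accounting back if construction fails.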
arrays.adjustBreaker(SHALLOW_SIZE);
try {
return new AVLTreeDigest(arrays, compression);
} catch (Exception e) {
arrays.adjustBreaker(-SHALLOW_SIZE);
throw e;
}
}
/**
* A histogram structure that will record a sketch of a distribution.
*
* @param compression How should accuracy be traded for size? A value of N here will give quantile errors
* almost always less than 3/N with considerably smaller errors expected for extreme
* quantiles. Conversely, you should expect to track about 5 N centroids for this
* accuracy.
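*
* <p>A minimal usage sketch (hypothetical: assumes the caller supplies a {@code TDigestArrays}
* instance named {@code arrays} and can reach the package-private factory):
* <pre>{@code
* try (AVLTreeDigest digest = AVLTreeDigest.create(arrays, 100)) {
*     for (double sample : new double[] { 1, 2, 3, 4 }) {
*         digest.add(sample, 1);
*     }
*     double median = digest.quantile(0.5); // roughly 2.5
* }
* }</pre>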
*/
private AVLTreeDigest(TDigestArrays arrays, double compression) {
this.arrays = arrays;
this.compression = compression;
summary = AVLGroupTree.create(arrays);
}
@Override
public long ramBytesUsed() {
return SHALLOW_SIZE + summary.ramBytesUsed();
}
/**
* Sets the seed for the RNG.
* In cases where a predictable tree is required, this method can be used to make the
* randomness in this AVLTree deterministic.
*
* @param seed The random seed to use for RNG purposes
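*
* <p>For example (a sketch; the seed value is arbitrary), seeding two digests identically
* before adding the same data in the same order should yield identical centroids:
* <pre>{@code
* digest.setRandomSeed(42L);
* }</pre>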
*/
public void setRandomSeed(long seed) {
gen.setSeed(seed);
}
@Override
public int centroidCount() {
return summary.size();
}
@Override
public void add(double x, long w) {
checkValue(x);
needsCompression = true;
if (x < min) {
min = x;
}
if (x > max) {
max = x;
}
int start = summary.floor(x);
if (start == NIL) {
start = summary.first();
}
if (start == NIL) { // empty summary
assert summary.isEmpty();
summary.add(x, w);
count = w;
} else {
double minDistance = Double.MAX_VALUE;
int lastNeighbor = NIL;
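// scan forward from the floor entry; minDistance tracks the closest centroid seen so far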
for (int neighbor = start; neighbor != NIL; neighbor = summary.next(neighbor)) {
double z = Math.abs(summary.mean(neighbor) - x);
if (z < minDistance) {
start = neighbor;
minDistance = z;
} else if (z > minDistance) {
// as soon as z increases, we have passed the nearest neighbor and can quit
lastNeighbor = neighbor;
break;
}
}
int closest = NIL;
double n = 0;
long sum = summary.headSum(start);
for (int neighbor = start; neighbor != lastNeighbor; neighbor = summary.next(neighbor)) {
assert minDistance == Math.abs(summary.mean(neighbor) - x);
double q = count == 1 ? 0.5 : (sum + (summary.count(neighbor) - 1) / 2.0) / (count - 1);
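// k is the t-digest weight bound for a centroid at quantile q; it is largest near the
// median and shrinks toward the tails, which is what keeps extreme quantiles accurate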
double k = 4 * count * q * (1 - q) / compression;
// this slightly clever selection method improves accuracy with lots of repeated points
// what it does is sample uniformly from all clusters that have room
if (summary.count(neighbor) + w <= k) {
n++;
if (gen.nextDouble() < 1 / n) {
closest = neighbor;
}
}
sum += summary.count(neighbor);
}
if (closest == NIL) {
summary.add(x, w);
} else {
// if the nearest point was not unique, then we may not be modifying the first copy
// which means that ordering can change
double centroid = summary.mean(closest);
long count = summary.count(closest);
centroid = weightedAverage(centroid, count, x, w);
count += w;
summary.update(closest, centroid, count);
}
count += w;
if (summary.size() > 20 * compression) {
// may happen in case of sequential points
compress();
}
}
}
@Override
public void compress() {
if (needsCompression == false) {
return;
}
needsCompression = false;
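// rebuild into a fresh tree; the old summary is closed (releasing its memory)
// when this try-with-resources block exits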
try (AVLGroupTree centroids = summary) {
this.summary = AVLGroupTree.create(arrays);
final int[] nodes = new int[centroids.size()];
nodes[0] = centroids.first();
for (int i = 1; i < nodes.length; ++i) {
nodes[i] = centroids.next(nodes[i - 1]);
assert nodes[i] != IntAVLTree.NIL;
}
assert centroids.next(nodes[nodes.length - 1]) == IntAVLTree.NIL;
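// Fisher-Yates shuffle: re-inserting the centroids in random order avoids the bias
// that a fixed insertion order would introduce into the rebuilt digest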
for (int i = centroids.size() - 1; i > 0; --i) {
final int other = gen.nextInt(i + 1);
final int tmp = nodes[other];
nodes[other] = nodes[i];
nodes[i] = tmp;
}
for (int node : nodes) {
add(centroids.mean(node), centroids.count(node));
}
}
}
/**
* Returns the number of samples represented in this histogram. If you want to know how many
* centroids are being used, try centroids().size().
*
* @return the number of samples that have been added.
*/
@Override
public long size() {
return count;
}
/**
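* A hedged sketch of the relationship with {@link #quantile(double)} (names are illustrative):
* <pre>{@code
* double x = digest.quantile(0.9);
* double p = digest.cdf(x); // close to 0.9, up to sketch error
* }</pre>
*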
* @param x the value at which the CDF should be evaluated
* @return the approximate fraction of all samples that were less than or equal to x.
*/
@Override
public double cdf(double x) {
AVLGroupTree values = summary;
if (values.isEmpty()) {
return Double.NaN;
}
if (values.size() == 1) {
if (x < values.mean(values.first())) return 0;
if (x > values.mean(values.first())) return 1;
return 0.5;
} else {
if (x < min) {
return 0;
}
if (Double.compare(x, min) == 0) {
// we have one or more centroids == x, treat them as one
// dw will accumulate the weight of all of the centroids at x
double dw = 0;
for (Centroid value : values) {
if (Double.compare(value.mean(), x) != 0) {
break;
}
dw += value.count();
}
return dw / 2.0 / size();
}
if (x > max) {
return 1;
}
if (Double.compare(x, max) == 0) {
int ix = values.last();
double dw = 0;
while (ix != NIL && Double.compare(values.mean(ix), x) == 0) {
dw += values.count(ix);
ix = values.prev(ix);
}
long n = size();
return (n - dw / 2.0) / n;
}
// we scan the cursor a across the centroids
Iterator<Centroid> it = values.iterator();
Centroid a = it.next();
// b is the look-ahead to the next centroid
Centroid b = it.next();
// initially, we set left width equal to right width
double left = (b.mean() - a.mean()) / 2;
double right = left;
// scan to next to last element
double r = 0;
while (it.hasNext()) {
if (x < a.mean() + right) {
double value = (r + a.count() * interpolate(x, a.mean() - left, a.mean() + right)) / count;
return Math.max(value, 0.0);
}
r += a.count();
a = b;
left = right;
b = it.next();
right = (b.mean() - a.mean()) / 2;
}
// for the last element, assume right width is same as left
if (x < a.mean() + right) {
return (r + a.count() * interpolate(x, a.mean() - right, a.mean() + right)) / count;
}
return 1;
}
}
/**
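* A hedged usage sketch (illustrative values):
* <pre>{@code
* double p99 = digest.quantile(0.99); // approximate 99th percentile of the added samples
* }</pre>
*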
* @param q The quantile desired. Can be in the range [0,1].
* @return The minimum value x such that we think that the proportion of samples that are ≤ x is q.
*/
@Override
public double quantile(double q) {
if (q < 0 || q > 1) {
throw new IllegalArgumentException("q should be in [0,1], got " + q);
}
AVLGroupTree values = summary;
if (values.isEmpty()) {
// no centroids means no data, no way to get a quantile
return Double.NaN;
} else if (values.size() == 1) {
// with one data point, all quantiles lead to Rome
return values.iterator().next().mean();
}
// if values were stored in a sorted array, index would be the offset we are interested in
final double index = q * count;
// deal with min and max as special-case singletons
if (index <= 0) {
return min;
}
if (index >= count) {
return max;
}
int currentNode = values.first();
long currentWeight = values.count(currentNode);
// Total mass to the left of the center of the current node.
double weightSoFar = currentWeight / 2.0;
if (index <= weightSoFar && weightSoFar > 1) {
// Interpolate between min and first mean, if there's no singleton on the left boundary.
return weightedAverage(min, weightSoFar - index, values.mean(currentNode), index);
}
for (int i = 0; i < values.size() - 1; i++) {
int nextNode = values.next(currentNode);
long nextWeight = values.count(nextNode);
// this is the mass between current center and next center
double dw = (currentWeight + nextWeight) / 2.0;
if (index < weightSoFar + dw) {
// index is bracketed between centroids i and i+1
assert dw >= 1;
double w1 = index - weightSoFar;
double w2 = weightSoFar + dw - index;
return weightedAverage(values.mean(currentNode), w2, values.mean(nextNode), w1);
}
weightSoFar += dw;
currentNode = nextNode;
currentWeight = nextWeight;
}
// Index is close or after the last centroid.
assert currentWeight >= 1;
assert index - weightSoFar < count - currentWeight / 2.0;
assert count - weightSoFar >= 0.5;
// Interpolate between the last mean and the max.
double w1 = index - weightSoFar;
double w2 = currentWeight / 2.0 - w1;
return weightedAverage(values.mean(currentNode), w2, max, w1);
}
@Override
public Collection<Centroid> centroids() {
return Collections.unmodifiableCollection(summary);
}
@Override
public double compression() {
return compression;
}
/**
* Returns an upper bound on the number of bytes that will be required to represent this histogram.
*/
@Override
public int byteSize() {
compress();
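// 64 bytes of fixed overhead plus an assumed ~13 bytes per centroid in the most
// compact encoding (an estimate, not the exact serialized layout)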
return 64 + summary.size() * 13;
}
@Override
public void close() {
if (closed == false) {
closed = true;
arrays.adjustBreaker(-SHALLOW_SIZE);
Releasables.close(summary);
}
}
}