/*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* This project is based on a modification of https://github.com/tdunning/t-digest which is licensed under the Apache 2.0 License.
*/
package org.elasticsearch.tdigest;
/**
 * Encodes the various scale functions for t-digests. These functions trade accuracy near the tails against accuracy
 * near the median in different ways. For instance, K_0 has uniform cluster sizes and results in constant accuracy (in
 * terms of q), while K_3 has cluster sizes proportional to min(q, 1-q), which results in much smaller error near the
 * tails and modestly increased error near the median.
 *
 * The base forms (K_0, K_1, K_2 and K_3) all result in t-digests limited to a number of clusters equal to the
 * compression factor. The K_2_NO_NORM and K_3_NO_NORM versions result in the cluster count increasing roughly with
 * log(n). A small usage sketch appears after the class body at the end of this file.
*/
public enum ScaleFunction {
/**
* Generates uniform cluster sizes. Used for comparison only.
*/
K_0 {
@Override
public double k(double q, double compression, double n) {
return compression * q / 2;
}
@Override
public double k(double q, double normalizer) {
return normalizer * q;
}
@Override
public double q(double k, double compression, double n) {
return 2 * k / compression;
}
@Override
public double q(double k, double normalizer) {
return k / normalizer;
}
@Override
public double max(double q, double compression, double n) {
return 2 / compression;
}
@Override
public double max(double q, double normalizer) {
return 1 / normalizer;
}
@Override
public double normalizer(double compression, double n) {
return compression / 2;
}
},
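    // Worked example (compression = 100 chosen for illustration): k(0.25, 100, n)
    // = 100 * 0.25 / 2 = 12.5 and the inverse q(12.5, 100, n) = 2 * 12.5 / 100
    // = 0.25 recovers the quantile exactly. max() is 2 / 100 = 0.02 for every q,
    // i.e. each cluster is capped at 2% of the samples, which is what uniform
    // cluster sizes means here.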
/**
* Generates cluster sizes proportional to sqrt(q*(1-q)). This gives constant relative accuracy if accuracy is
* proportional to squared cluster size. It is expected that K_2 and K_3 will give better practical results.
*/
K_1 {
@Override
public double k(final double q, final double compression, double n) {
Function f = new Function() {
@Override
double apply(double q) {
return compression * Math.asin(2 * q - 1) / (2 * Math.PI);
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double k(final double q, final double normalizer) {
Function f = new Function() {
@Override
double apply(double q) {
return normalizer * Math.asin(2 * q - 1);
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double q(double k, final double compression, double n) {
Function f = new Function() {
@Override
double apply(double k) {
return (Math.sin(k * (2 * Math.PI / compression)) + 1) / 2;
}
};
return ScaleFunction.limitCall(f, k, -compression / 4, compression / 4);
}
@Override
public double q(double k, final double normalizer) {
Function f = new Function() {
@Override
double apply(double x) {
return (Math.sin(x) + 1) / 2;
}
};
double x = k / normalizer;
return ScaleFunction.limitCall(f, x, -Math.PI / 2, Math.PI / 2);
}
@Override
public double max(double q, double compression, double n) {
if (q <= 0) {
return 0;
} else if (q >= 1) {
return 0;
} else {
return 2 * Math.sin(Math.PI / compression) * Math.sqrt(q * (1 - q));
}
}
@Override
public double max(double q, double normalizer) {
if (q <= 0) {
return 0;
} else if (q >= 1) {
return 0;
} else {
return 2 * Math.sin(0.5 / normalizer) * Math.sqrt(q * (1 - q));
}
}
@Override
public double normalizer(double compression, double n) {
return compression / (2 * Math.PI);
}
},
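    // Worked example (compression = 100 chosen for illustration): normalizer =
    // 100 / (2 * pi) ~= 15.92, so k(0.25) = 15.92 * asin(2 * 0.25 - 1) =
    // 15.92 * (-pi / 6) ~= -8.33. The size bound max(q) = 2 * sin(pi / 100) *
    // sqrt(q * (1 - q)) shrinks toward the tails: about 0.0314 at q = 0.5 but
    // only about 0.0063 at q = 0.01, keeping tail clusters roughly 5x smaller.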
/**
* Generates cluster sizes proportional to sqrt(q*(1-q)) but avoids computation of asin in the critical path by
* using an approximate version.
*/
K_1_FAST {
@Override
public double k(double q, final double compression, double n) {
Function f = new Function() {
@Override
double apply(double q) {
return compression * fastAsin(2 * q - 1) / (2 * Math.PI);
}
};
return ScaleFunction.limitCall(f, q, 0, 1);
}
@Override
public double k(double q, final double normalizer) {
Function f = new Function() {
@Override
double apply(double q) {
return normalizer * fastAsin(2 * q - 1);
}
};
return ScaleFunction.limitCall(f, q, 0, 1);
}
@Override
public double q(double k, double compression, double n) {
return (Math.sin(k * (2 * Math.PI / compression)) + 1) / 2;
}
@Override
public double q(double k, double normalizer) {
return (Math.sin(k / normalizer) + 1) / 2;
}
@Override
public double max(double q, double compression, double n) {
if (q <= 0) {
return 0;
} else if (q >= 1) {
return 0;
} else {
return 2 * Math.sin(Math.PI / compression) * Math.sqrt(q * (1 - q));
}
}
@Override
public double max(double q, double normalizer) {
if (q <= 0) {
return 0;
} else if (q >= 1) {
return 0;
} else {
return 2 * Math.sin(0.5 / normalizer) * Math.sqrt(q * (1 - q));
}
}
@Override
public double normalizer(double compression, double n) {
return compression / (2 * Math.PI);
}
},
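    // Note: only the k() direction uses fastAsin (accurate to about 1e-6 per its
    // javadoc near the bottom of this class); q() and max() still use the exact
    // Math.sin, so the k/q round trip is approximate rather than exact here.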
/**
* Generates cluster sizes proportional to q*(1-q). This makes tail error bounds tighter than for K_1. The use of a
* normalizing function results in a strictly bounded number of clusters no matter how many samples.
*/
K_2 {
@Override
public double k(double q, final double compression, final double n) {
if (n <= 1) {
if (q <= 0) {
return -10;
} else if (q >= 1) {
return 10;
} else {
return 0;
}
}
Function f = new Function() {
@Override
double apply(double q) {
return compression * Math.log(q / (1 - q)) / Z(compression, n);
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double k(double q, final double normalizer) {
Function f = new Function() {
@Override
double apply(double q) {
return Math.log(q / (1 - q)) * normalizer;
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double q(double k, double compression, double n) {
double w = Math.exp(k * Z(compression, n) / compression);
return w / (1 + w);
}
@Override
public double q(double k, double normalizer) {
double w = Math.exp(k / normalizer);
return w / (1 + w);
}
@Override
public double max(double q, double compression, double n) {
return Z(compression, n) * q * (1 - q) / compression;
}
@Override
public double max(double q, double normalizer) {
return q * (1 - q) / normalizer;
}
@Override
public double normalizer(double compression, double n) {
return compression / Z(compression, n);
}
private double Z(double compression, double n) {
return 4 * Math.log(n / compression) + 24;
}
},
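    // Worked example (compression = 100 and n = 1e6 chosen for illustration):
    // Z = 4 * log(1e6 / 100) + 24 ~= 60.84, so normalizer ~= 100 / 60.84 ~= 1.64.
    // Then k(0.25) = 1.64 * log(0.25 / 0.75) ~= -1.81, and the inverse recovers
    // the quantile because w = exp(k / normalizer) = q / (1 - q) = 1/3 and
    // w / (1 + w) = 0.25. The bound max(q) = q * (1 - q) / normalizer vanishes
    // quadratically at both tails.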
/**
* Generates cluster sizes proportional to min(q, 1-q). This makes tail error bounds tighter than for K_1 or K_2.
* The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples.
*/
K_3 {
@Override
public double k(double q, final double compression, final double n) {
Function f = new Function() {
@Override
double apply(double q) {
if (q <= 0.5) {
return compression * Math.log(2 * q) / Z(compression, n);
} else {
return -k(1 - q, compression, n);
}
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double k(double q, final double normalizer) {
Function f = new Function() {
@Override
double apply(double q) {
if (q <= 0.5) {
return Math.log(2 * q) * normalizer;
} else {
return -k(1 - q, normalizer);
}
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double q(double k, double compression, double n) {
if (k <= 0) {
return Math.exp(k * Z(compression, n) / compression) / 2;
} else {
return 1 - q(-k, compression, n);
}
}
@Override
public double q(double k, double normalizer) {
if (k <= 0) {
return Math.exp(k / normalizer) / 2;
} else {
return 1 - q(-k, normalizer);
}
}
@Override
public double max(double q, double compression, double n) {
return Z(compression, n) * Math.min(q, 1 - q) / compression;
}
@Override
public double max(double q, double normalizer) {
return Math.min(q, 1 - q) / normalizer;
}
@Override
public double normalizer(double compression, double n) {
return compression / Z(compression, n);
}
private double Z(double compression, double n) {
return 4 * Math.log(n / compression) + 21;
}
},
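    // Worked example (compression = 100 and n = 1e6 chosen for illustration):
    // Z = 4 * log(1e6 / 100) + 21 ~= 57.84. For q = 0.25 (below the median),
    // k = 100 * log(2 * 0.25) / Z ~= -1.20; by the symmetry built into k(),
    // k(0.75) ~= +1.20. max(q) is linear in min(q, 1 - q): roughly 0.0058 at
    // q = 0.01 versus 0.289 at q = 0.5, so tail clusters stay much smaller.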
/**
 * Generates cluster sizes proportional to q*(1-q). This makes the tail error bounds tighter. This version does not
 * use a normalizer function, so the number of clusters increases roughly in proportion to log(n). That is good for
 * accuracy, but bad for size and for the statically allocated MergingDigest; it can, however, be useful for
 * tree-based implementations.
*/
K_2_NO_NORM {
@Override
public double k(double q, final double compression, double n) {
Function f = new Function() {
@Override
double apply(double q) {
return compression * Math.log(q / (1 - q));
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double k(double q, final double normalizer) {
Function f = new Function() {
@Override
double apply(double q) {
return normalizer * Math.log(q / (1 - q));
}
};
return ScaleFunction.limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double q(double k, double compression, double n) {
double w = Math.exp(k / compression);
return w / (1 + w);
}
@Override
public double q(double k, double normalizer) {
double w = Math.exp(k / normalizer);
return w / (1 + w);
}
@Override
public double max(double q, double compression, double n) {
return q * (1 - q) / compression;
}
@Override
public double max(double q, double normalizer) {
return q * (1 - q) / normalizer;
}
@Override
public double normalizer(double compression, double n) {
return compression;
}
},
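    // Note: without the Z(compression, n) term, the k-range needed to reach
    // quantiles as extreme as q = 1/n grows like log(n); for example, with
    // compression = 100, k(1e-6) ~= 100 * log(1e-6 / (1 - 1e-6)) ~= -1382.
    // That is the sense in which the cluster count grows roughly with log(n).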
/**
 * Generates cluster sizes proportional to min(q, 1-q). This makes the tail error bounds tighter. This version does
 * not use a normalizer function, so the number of clusters increases roughly in proportion to log(n). That is good
 * for accuracy, but bad for size and for the statically allocated MergingDigest; it can, however, be useful for
 * tree-based implementations.
*/
K_3_NO_NORM {
@Override
public double k(double q, final double compression, final double n) {
Function f = new Function() {
@Override
double apply(double q) {
if (q <= 0.5) {
return compression * Math.log(2 * q);
} else {
return -k(1 - q, compression, n);
}
}
};
return limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double k(double q, final double normalizer) {
// poor man's lambda, sigh
Function f = new Function() {
@Override
double apply(double q) {
if (q <= 0.5) {
return normalizer * Math.log(2 * q);
} else {
return -k(1 - q, normalizer);
}
}
};
return limitCall(f, q, 1e-15, 1 - 1e-15);
}
@Override
public double q(double k, double compression, double n) {
if (k <= 0) {
return Math.exp(k / compression) / 2;
} else {
return 1 - q(-k, compression, n);
}
}
@Override
public double q(double k, double normalizer) {
if (k <= 0) {
return Math.exp(k / normalizer) / 2;
} else {
return 1 - q(-k, normalizer);
}
}
@Override
public double max(double q, double compression, double n) {
return Math.min(q, 1 - q) / compression;
}
@Override
public double max(double q, double normalizer) {
return Math.min(q, 1 - q) / normalizer;
}
@Override
public double normalizer(double compression, double n) {
return compression;
}
}; // max weight is min(q,1-q), should improve tail accuracy even more
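    // A useful invariant when reading the overloads declared below: for every
    // scale, k(q, compression, n) == k(q, normalizer(compression, n)) (apart from
    // K_2's n <= 1 special case), and likewise for q() and max(). For K_2, for
    // instance, compression * log(q / (1 - q)) / Z(compression, n) equals
    // log(q / (1 - q)) * (compression / Z(compression, n)).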
/**
* Converts a quantile to the k-scale. The total number of points is also provided so that a normalizing function
* can be computed if necessary.
*
* @param q The quantile
* @param compression Also known as delta in literature on the t-digest
* @param n The total number of samples
* @return The corresponding value of k
*/
public abstract double k(double q, double compression, double n);
/**
 * Converts a quantile to the k-scale. The normalizer value depends on compression and (possibly) the number of
 * points in the digest; see {@link #normalizer(double, double)}.
*
* @param q The quantile
* @param normalizer The normalizer value which depends on compression and (possibly) number of points in the
* digest.
* @return The corresponding value of k
*/
public abstract double k(double q, double normalizer);
/**
* Computes q as a function of k. This is often faster than finding k as a function of q for some scales.
*
* @param k The index value to convert into q scale.
* @param compression The compression factor (often written as δ)
* @param n The number of samples already in the digest.
* @return The value of q that corresponds to k
*/
public abstract double q(double k, double compression, double n);
/**
* Computes q as a function of k. This is often faster than finding k as a function of q for some scales.
*
* @param k The index value to convert into q scale.
* @param normalizer The normalizer value which depends on compression and (possibly) number of points in the
* digest.
* @return The value of q that corresponds to k
*/
public abstract double q(double k, double normalizer);
/**
 * Computes the maximum relative size a cluster can have at quantile q. Note that exactly where within the range
 * spanned by a cluster that q should be taken isn't clear, so this function usually has to be evaluated at
 * multiple points and the smallest value used.
 *
 * Note that this is the relative size of a cluster. To get the maximum number of samples in the cluster, multiply
 * this value by the total number of samples in the digest.
 *
 * @param q The quantile
 * @param compression The compression factor, typically delta in the literature
 * @param n The number of samples seen so far in the digest
 * @return The maximum relative size (as a fraction of all samples) that a cluster at q can have
*/
public abstract double max(double q, double compression, double n);
/**
 * Computes the maximum relative size a cluster can have at quantile q. Note that exactly where within the range
 * spanned by a cluster that q should be taken isn't clear, so this function usually has to be evaluated at
 * multiple points and the smallest value used.
 *
 * Note that this is the relative size of a cluster. To get the maximum number of samples in the cluster, multiply
 * this value by the total number of samples in the digest.
 *
 * @param q The quantile
 * @param normalizer The normalizer value which depends on compression and (possibly) number of points in the
 * digest.
 * @return The maximum relative size (as a fraction of all samples) that a cluster at q can have
*/
public abstract double max(double q, double normalizer);
/**
* Computes the normalizer given compression and number of points.
* @param compression The compression parameter for the digest
* @param n The number of samples seen so far
* @return The normalizing factor for the scale function
*/
public abstract double normalizer(double compression, double n);
/**
 * Approximates asin to within about 1e-6. This approximation works by breaking the range from 0 to 1 into 5 regions.
 * For all but the region nearest 1, rational polynomial models give a very good approximation of asin, and by
 * interpolating as we move from region to region we can guarantee continuity (and we happen to get monotonicity as
 * well). For values near 1, we just use Math.asin as our region "approximation".
*
* @param x sin(theta)
* @return theta
*/
static double fastAsin(double x) {
if (x < 0) {
return -fastAsin(-x);
} else if (x > 1) {
return Double.NaN;
} else {
// Cutoffs for models. Note that the ranges overlap. In the
// overlap we do linear interpolation to guarantee the overall
// result is "nice"
double c0High = 0.1;
double c1High = 0.55;
double c2Low = 0.5;
double c2High = 0.8;
double c3Low = 0.75;
double c3High = 0.9;
double c4Low = 0.87;
if (x > c3High) {
return Math.asin(x);
} else {
// the models
double[] m0 = { 0.2955302411, 1.2221903614, 0.1488583743, 0.2422015816, -0.3688700895, 0.0733398445 };
double[] m1 = { -0.0430991920, 0.9594035750, -0.0362312299, 0.1204623351, 0.0457029620, -0.0026025285 };
double[] m2 = { -0.034873933724, 1.054796752703, -0.194127063385, 0.283963735636, 0.023800124916, -0.000872727381 };
double[] m3 = { -0.37588391875, 2.61991859025, -2.48835406886, 1.48605387425, 0.00857627492, -0.00015802871 };
// the parameters for all of the models
double[] vars = { 1, x, x * x, x * x * x, 1 / (1 - x), 1 / (1 - x) / (1 - x) };
// raw grist for interpolation coefficients
double x0 = bound((c0High - x) / c0High);
double x1 = bound((c1High - x) / (c1High - c2Low));
double x2 = bound((c2High - x) / (c2High - c3Low));
double x3 = bound((c3High - x) / (c3High - c4Low));
// interpolation coefficients
// noinspection UnnecessaryLocalVariable
double mix0 = x0;
double mix1 = (1 - x0) * x1;
double mix2 = (1 - x1) * x2;
double mix3 = (1 - x2) * x3;
double mix4 = 1 - x3;
// now mix all the results together, avoiding extra evaluations
double r = 0;
if (mix0 > 0) {
r += mix0 * eval(m0, vars);
}
if (mix1 > 0) {
r += mix1 * eval(m1, vars);
}
if (mix2 > 0) {
r += mix2 * eval(m2, vars);
}
if (mix3 > 0) {
r += mix3 * eval(m3, vars);
}
if (mix4 > 0) {
// model 4 is just the real deal
r += mix4 * Math.asin(x);
}
return r;
}
}
}
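    // Example (illustrative): fastAsin(0.5) should match Math.asin(0.5) = pi / 6
    // ~= 0.5235988 to within about 1e-6, per the accuracy claim above. For
    // x > 0.9 (c3High) the method delegates to Math.asin directly, and for x > 1
    // it returns NaN, just as Math.asin does.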
abstract static class Function {
abstract double apply(double x);
}
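    /**
     * Evaluates f at x after clamping x into [low, high]. The scale functions use
     * this to keep quantiles away from their singularities; for instance,
     * log(q / (1 - q)) diverges at q = 0 and q = 1, so K_2 clamps q to
     * [1e-15, 1 - 1e-15] before applying it.
     */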
static double limitCall(Function f, double x, double low, double high) {
if (x < low) {
return f.apply(low);
} else if (x > high) {
return f.apply(high);
} else {
return f.apply(x);
}
}
private static double eval(double[] model, double[] vars) {
double r = 0;
for (int i = 0; i < model.length; i++) {
r += model[i] * vars[i];
}
return r;
}
private static double bound(double v) {
if (v <= 0) {
return 0;
} else if (v >= 1) {
return 1;
} else {
return v;
}
}
}
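// A minimal usage sketch (the ScaleFunctionDemo name and the compression = 100,
// n = 1e6, q = 0.25 values are illustrative, not part of the library): for each
// scale it round-trips a quantile through k() and q() and prints the relative
// cluster size bound from max().
class ScaleFunctionDemo {
    public static void main(String[] args) {
        double compression = 100;
        double n = 1_000_000;
        double q = 0.25;
        for (ScaleFunction scale : ScaleFunction.values()) {
            double normalizer = scale.normalizer(compression, n);
            double k = scale.k(q, normalizer);
            double back = scale.q(k, normalizer); // ~0.25; approximate for K_1_FAST
            double bound = scale.max(q, normalizer);
            System.out.printf("%-12s k=%9.4f q(k)=%.6f max=%.6f%n", scale, k, back, bound);
        }
    }
}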