
com.tdunning.math.stats.AgentDigest

package com.tdunning.math.stats;

import com.google.common.base.Preconditions;

import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.MetricName;

import net.jafama.FastMath;
import net.openhft.chronicle.bytes.Bytes;
import net.openhft.chronicle.core.io.IORuntimeException;
import net.openhft.chronicle.core.util.ReadResolvable;
import net.openhft.chronicle.hash.serialization.SizedReader;
import net.openhft.chronicle.hash.serialization.SizedWriter;
import net.openhft.chronicle.wire.WireIn;
import net.openhft.chronicle.wire.WireOut;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import wavefront.report.Histogram;
import wavefront.report.HistogramType;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
 * NOTE: This is a pruned and modified version of {@link MergingDigest}. It does not support queries (cdf/quantiles) or
 * the traditional encodings.
 *
 * <p>Maintains a t-digest by collecting new points in a buffer that is then sorted occasionally and merged into a
 * sorted array that contains previously computed centroids.
 *
 * <p>This can be very fast because the cost of sorting and merging is amortized over several insertions. If we keep N
 * centroids total and the input buffer is k long, then the amortized cost is something like
 *
 * <p>N/k + log k
 *
 * <p>These costs even out when N/k = log k. Balancing costs is often a good place to start in optimizing an algorithm.
 * For different values of compression factor, the following table shows estimated asymptotic values of N and suggested
 * values of k:
 *
 * <table>
 *   <tr><th>Compression</th><th>N</th><th>k</th></tr>
 *   <tr><td>50</td><td>78</td><td>25</td></tr>
 *   <tr><td>100</td><td>157</td><td>42</td></tr>
 *   <tr><td>200</td><td>314</td><td>73</td></tr>
 * </table>
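 *
 * <p>For example, at compression 100 the table above gives N = 157 and k = 42: N/k is about 3.7 and ln(42) is also
 * about 3.7, so the two cost terms are roughly balanced.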

 * <p>The virtues of this kind of t-digest implementation include:
 * <ul>
 *   <li>No allocation is required after initialization</li>
 *   <li>The data structure automatically compresses existing centroids when possible</li>
 *   <li>No Java object overhead is incurred for centroids since data is kept in primitive arrays</li>
 * </ul>
 *
 * <p>The current implementation takes the liberty of using ping-pong buffers for implementing the merge, resulting in
 * a substantial memory penalty, but the complexity of an in-place merge was not considered worthwhile since even with
 * the overhead, the memory cost is less than 40 bytes per centroid, which is much less than half of what the
 * AVLTreeDigest uses. Speed tests are still not complete, so it is uncertain whether the merge strategy is faster than
 * the tree strategy.
 */
public class AgentDigest extends AbstractTDigest {
  private final short compression;

  // points to the centroid that is currently being merged
  // if weight[lastUsedCell] == 0, then this is the number of centroids
  // else the number is lastUsedCell+1
  private int lastUsedCell;

  // sum_i weight[i]  See also unmergedWeight
  private double totalWeight = 0;

  // number of points that have been added to each merged centroid
  private double[] weight;
  // mean of points added to each merged centroid
  private double[] mean;

  // history of all data added to centroids (for testing purposes)
  private List<List<Double>> data = null;

  // buffers for merging
  private double[] mergeWeight;
  private double[] mergeMean;
  private List<List<Double>> mergeData = null;

  // sum_i tempWeight[i]
  private double unmergedWeight = 0;

  // this is the index of the next temporary centroid
  // this is a more Java-like convention than lastUsedCell uses
  private int tempUsed = 0;
  private final double[] tempWeight;
  private final double[] tempMean;
  private List<List<Double>> tempData = null;

  // array used for sorting the temp centroids. This is a field
  // to avoid allocations during operation
  private final int[] order;

  private long dispatchTimeMillis;

  // should only need ceiling(compression * PI / 2). Double the allocation for now for safety
  private static int defaultSizeForCompression(short compression) {
    return (int) (Math.PI * compression + 0.5);
  }

  // magic formula created by regressing against known sizes for sample compression values
  private static int bufferSizeForCompression(short compression) {
    return (int) (7.5 + 0.37 * compression - 2e-4 * compression * compression);
  }
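
  // For example, compression = 100 gives defaultSizeForCompression(100) = (int) (Math.PI * 100 + 0.5) = 314
  // centroid slots and bufferSizeForCompression(100) = (int) (7.5 + 37.0 - 2.0) = 42 buffer slots, i.e. roughly
  // double the ~157 centroids estimated for that compression in the class javadoc.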

  public AgentDigest(short compression, long dispatchTimeMillis) {
    Preconditions.checkArgument(compression >= 20D);
    Preconditions.checkArgument(compression <= 1000D);
    int numCentroids = defaultSizeForCompression(compression);
    int numBuffered = bufferSizeForCompression(compression);

    this.compression = compression;
    weight = new double[numCentroids];
    mean = new double[numCentroids];
    mergeWeight = new double[numCentroids];
    mergeMean = new double[numCentroids];
    tempWeight = new double[numBuffered];
    tempMean = new double[numBuffered];
    order = new int[numBuffered];
    lastUsedCell = 0;
    this.dispatchTimeMillis = dispatchTimeMillis;
  }

  /**
   * Turns on internal data recording.
   */
  @Override
  public TDigest recordAllData() {
    super.recordAllData();
    data = new ArrayList<>();
    mergeData = new ArrayList<>();
    return this;
  }

  @Override
  void add(double x, int w, Centroid base) {
    add(x, w, base.data());
  }

  @Override
  public void add(double x, int w) {
    add(x, w, (List<Double>) null);
  }

  @Override
  public void add(List<? extends TDigest> others) {
    for (TDigest other : others) {
      setMinMax(Math.min(min, other.getMin()), Math.max(max, other.getMax()));
      for (Centroid centroid : other.centroids()) {
        add(centroid.mean(), centroid.count(), recordAllData ? centroid.data() : null);
      }
    }
  }

  public void add(double x, int w, List<Double> history) {
    if (Double.isNaN(x)) {
      throw new IllegalArgumentException("Cannot add NaN to t-digest");
    }
    if (tempUsed >= tempWeight.length) {
      mergeNewValues();
    }
    int where = tempUsed++;
    tempWeight[where] = w;
    tempMean[where] = x;
    unmergedWeight += w;

    if (data != null) {
      if (tempData == null) {
        tempData = new ArrayList<>();
      }
      while (tempData.size() <= where) {
        tempData.add(new ArrayList<>());
      }
      if (history == null) {
        history = Collections.singletonList(x);
      }
      tempData.get(where).addAll(history);
    }
  }

  private void mergeNewValues() {
    if (unmergedWeight > 0) {
      Sort.sort(order, tempMean, tempUsed);

      double wSoFar = 0;
      double k1 = 0;
      int i = 0;
      int j = 0;
      int n = 0;
      if (totalWeight > 0) {
        if (weight[lastUsedCell] > 0) {
          n = lastUsedCell + 1;
        } else {
          n = lastUsedCell;
        }
      }
      lastUsedCell = 0;
      totalWeight += unmergedWeight;
      unmergedWeight = 0;

      // merge tempWeight,tempMean and weight,mean into mergeWeight,mergeMean
      while (i < tempUsed && j < n) {
        int ix = order[i];
        if (tempMean[ix] <= mean[j]) {
          wSoFar += tempWeight[ix];
          k1 = mergeCentroid(wSoFar, k1, tempWeight[ix], tempMean[ix], tempData != null ? tempData.get(ix) : null);
          i++;
        } else {
          wSoFar += weight[j];
          k1 = mergeCentroid(wSoFar, k1, weight[j], mean[j], data != null ? data.get(j) : null);
          j++;
        }
      }

      while (i < tempUsed) {
        int ix = order[i];
        wSoFar += tempWeight[ix];
        k1 = mergeCentroid(wSoFar, k1, tempWeight[ix], tempMean[ix], tempData != null ? tempData.get(ix) : null);
        i++;
      }

      while (j < n) {
        wSoFar += weight[j];
        k1 = mergeCentroid(wSoFar, k1, weight[j], mean[j], data != null ? data.get(j) : null);
        j++;
      }
      tempUsed = 0;

      // swap pointers for working space and merge space
      double[] z = weight;
      weight = mergeWeight;
      mergeWeight = z;
      Arrays.fill(mergeWeight, 0);

      z = mean;
      mean = mergeMean;
      mergeMean = z;

      if (data != null) {
        data = mergeData;
        mergeData = new ArrayList<>();
        tempData = new ArrayList<>();
      }
    }
  }

  private double mergeCentroid(double wSoFar, double k1, double w, double m, List<Double> newData) {
    double k2 = integratedLocation(wSoFar / totalWeight);
    if (k2 - k1 <= 1 || mergeWeight[lastUsedCell] == 0) {
      // merge into existing centroid
      mergeWeight[lastUsedCell] += w;
      mergeMean[lastUsedCell] = mergeMean[lastUsedCell] + (m - mergeMean[lastUsedCell]) * w / mergeWeight[lastUsedCell];
    } else {
      // create new centroid
      lastUsedCell++;
      mergeMean[lastUsedCell] = m;
      mergeWeight[lastUsedCell] = w;
      k1 = integratedLocation((wSoFar - w) / totalWeight);
    }
    if (mergeData != null) {
      while (mergeData.size() <= lastUsedCell) {
        mergeData.add(new ArrayList<>());
      }
      mergeData.get(lastUsedCell).addAll(newData);
    }
    return k1;
  }

  /**
   * Exposed for testing.
   */
  int checkWeights() {
    return checkWeights(weight, totalWeight, lastUsedCell);
  }

  private int checkWeights(double[] w, double total, int last) {
    int badCount = 0;

    int n = last;
    if (w[n] > 0) {
      n++;
    }

    double k1 = 0;
    double q = 0;
    for (int i = 0; i < n; i++) {
      double dq = w[i] / total;
      double k2 = integratedLocation(q + dq);
      if (k2 - k1 > 1 && w[i] != 1) {
        System.out.printf("Oversize centroid at %d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f\n",
            i, k1, k2, k2 - k1, w[i], q);
        badCount++;
      }
      if (k2 - k1 > 1.5 && w[i] != 1) {
        throw new IllegalStateException(
            String.format("Egregiously oversized centroid at %d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f\n",
                i, k1, k2, k2 - k1, w[i], q));
      }
      q += dq;
      k1 = k2;
    }

    return badCount;
  }

  /**
   * Converts a quantile into a centroid scale value.
   * The centroid scale is nominally the number k of the centroid that a quantile point q should belong to. Due to
   * round-offs, however, we can't align things perfectly without splitting points and centroids. We don't want to do
   * that, so we have to allow for offsets. In the end, the criterion is that any quantile range that spans a centroid
   * scale range of more than one should be split across more than one centroid if possible. This won't be possible
   * if the quantile range refers to a single point or an already existing centroid.
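   *
   * <p>For instance, with compression C this maps q=0 to 0, q=0.5 to C/2, and q=1 to C (since asin(-1) = -pi/2,
   * asin(0) = 0, and asin(1) = pi/2).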

   * <p>This mapping is steep near q=0 or q=1 so each centroid there will correspond to less q range. Near q=0.5, the
   * mapping is flatter so that centroids there will represent a larger chunk of quantiles.
   *
   * @param q The quantile scale value to be mapped.
   * @return The centroid scale value corresponding to q.
   */
  private double integratedLocation(double q) {
    return compression * (FastMath.asin(2 * q - 1) + Math.PI / 2) / Math.PI;
  }

  @Override
  public void compress() {
    mergeNewValues();
  }

  @Override
  public long size() {
    return (long) (totalWeight + unmergedWeight);
  }

  @Override
  public double cdf(double x) {
    // Not supported
    return Double.NaN;
  }

  @Override
  public double quantile(double q) {
    // Not supported
    return Double.NaN;
  }

  /**
   * Not clear to me that this is a good idea; maybe just add the temp points and existing centroids rather than
   * merging first?
   */
  @Override
  public Collection<Centroid> centroids() {
    // we don't actually keep centroid structures around so we have to fake it
    List<Centroid> r = new ArrayList<>();
    int count = centroidCount();
    for (int i = 0; i < count; i++) {
      r.add(new Centroid(mean[i], (int) weight[i], data != null ? data.get(i) : null));
    }
    return r;
  }

  @Override
  public double compression() {
    return compression;
  }

  @Override
  public int byteSize() {
    return 0;
  }

  @Override
  public int smallByteSize() {
    return 0;
  }

  /**
   * Number of centroids of this AgentDigest (does compress if necessary).
   */
  public int centroidCount() {
    mergeNewValues();
    return lastUsedCell + (weight[lastUsedCell] == 0 ? 0 : 1);
  }

  /**
   * Creates a reporting Histogram from this AgentDigest (marked with the supplied duration).
   */
  public Histogram toHistogram(int duration) {
    int numCentroids = centroidCount(); // NOTE: now merged as a side-effect
    List<Double> means = new ArrayList<>(centroidCount());
    List<Integer> count = new ArrayList<>(centroidCount());
    for (int i = 0; i < numCentroids; ++i) {
      means.add(mean[i]);
      count.add((int) Math.round(weight[i]));
    }

    return Histogram.newBuilder()
        .setDuration(duration)
        .setBins(means)
        .setCounts(count)
        .setType(HistogramType.TDIGEST)
        .build();
  }

  /**
   * Comprises the dispatch time (8 bytes) + compression (2 bytes).
   */
  private static final int FIXED_SIZE = 8 + 2;

  /**
   * Weight, mean float pair.
   */
  private static final int PER_CENTROID_SIZE = 8;

  private int encodedSize() {
    return FIXED_SIZE + centroidCount() * PER_CENTROID_SIZE;
  }
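
  // For example, a digest that currently holds 100 centroids encodes to
  // FIXED_SIZE + 100 * PER_CENTROID_SIZE = 10 + 800 = 810 bytes.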

  /**
   * Stateless AgentDigest codec for chronicle maps.
   */
  public static class AgentDigestMarshaller implements SizedReader<AgentDigest>, SizedWriter<AgentDigest>,
      ReadResolvable<AgentDigestMarshaller> {
    private static final AgentDigestMarshaller INSTANCE = new AgentDigestMarshaller();
    private static final com.yammer.metrics.core.Histogram accumulatorValueSizes =
        Metrics.newHistogram(new MetricName("histogram", "", "accumulatorValueSize"));

    private AgentDigestMarshaller() {
    }

    public static AgentDigestMarshaller get() {
      return INSTANCE;
    }

    @Nonnull
    @Override
    public AgentDigest read(Bytes in, long size, @Nullable AgentDigest using) {
      Preconditions.checkArgument(size >= FIXED_SIZE);
      short compression = in.readShort();
      if (using == null || using.compression != compression) {
        using = new AgentDigest(compression, in.readLong());
      } else {
        using.dispatchTimeMillis = in.readLong();
      }
      using.totalWeight = 0d;
      using.lastUsedCell = (int) ((size - FIXED_SIZE) / PER_CENTROID_SIZE);
      using.tempUsed = 0;
      using.unmergedWeight = 0D;

      // need explicit nulling of weight past lastUsedCell
      Arrays.fill(using.weight, using.lastUsedCell, using.weight.length, 0D);

      for (int i = 0; i < using.lastUsedCell; ++i) {
        float weight = in.readFloat();
        using.weight[i] = weight;
        using.mean[i] = in.readFloat();
        using.totalWeight += weight;
      }
      return using;
    }

    @Override
    public long size(@Nonnull AgentDigest toWrite) {
      long size = toWrite.encodedSize();
      accumulatorValueSizes.update(size);
      return size;
    }

    @Override
    public void write(Bytes out, long size, @Nonnull AgentDigest toWrite) {
      // Merge in all buffered values
      int numCentroids = toWrite.centroidCount();

      // Just for sanity, comment out for production use
      Preconditions.checkArgument(size == toWrite.encodedSize());

      // Write compression
      out.writeShort(toWrite.compression);
      // Time
      out.writeLong(toWrite.dispatchTimeMillis);
      // Centroids
      for (int i = 0; i < numCentroids; ++i) {
        out.writeFloat((float) toWrite.weight[i]);
        out.writeFloat((float) toWrite.mean[i]);
      }
    }

    @Nonnull
    @Override
    public AgentDigestMarshaller readResolve() {
      return INSTANCE;
    }

    @Override
    public void readMarshallable(@Nonnull WireIn wire) throws IORuntimeException {
      // ignore
    }

    @Override
    public void writeMarshallable(@Nonnull WireOut wire) {
      // ignore
    }
  }

  @Override
  public void asBytes(ByteBuffer buf) {
    // Ignore
  }

  @Override
  public void asSmallBytes(ByteBuffer buf) {
    // Ignore
  }

  /**
   * Time at which this digest should be dispatched to wavefront.
   */
  public long getDispatchTimeMillis() {
    return dispatchTimeMillis;
  }
}
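
A minimal usage sketch (not part of the source above; the sample values, the compression of 100, and the 60-second duration/dispatch delay are illustrative assumptions):

import com.tdunning.math.stats.AgentDigest;
import wavefront.report.Histogram;

public class AgentDigestExample {
  public static void main(String[] args) {
    // compression must lie in [20, 1000]; the second argument is the dispatch time in millis
    AgentDigest digest = new AgentDigest((short) 100, System.currentTimeMillis() + 60_000L);

    // samples are buffered and merged into centroids lazily
    for (double x : new double[] {10.5, 12.0, 11.25, 97.0, 13.5}) {
      digest.add(x, 1);
    }

    // build the reporting Histogram (merges any buffered values as a side effect)
    Histogram reported = digest.toHistogram(60_000);
    System.out.println(reported);
  }
}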




