All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.quantiles.DoublesSketch Maven / Gradle / Ivy

/*
 * Copyright 2016, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.quantiles;

import static com.yahoo.sketches.Util.ceilingPowerOf2;
import static com.yahoo.sketches.quantiles.Util.checkIsCompactMemory;

import java.util.Random;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.Family;
import com.yahoo.sketches.SketchesArgumentException;


/**
 * This is a stochastic streaming sketch that enables near-real time analysis of the
 * approximate distribution of real values from a very large stream in a single pass.
 * The analysis is obtained using a getQuantiles(*) function or its inverse functions the
 * Probability Mass Function from getPMF(*) and the Cumulative Distribution Function from getCDF(*).
 *
 * 

Consider a large stream of one million values such as packet sizes coming into a network node. * The absolute rank of any specific size value is simply its index in the hypothetical sorted * array of values. * The normalized rank (or fractional rank) is the absolute rank divided by the stream size, * in this case one million. * The value corresponding to the normalized rank of 0.5 represents the 50th percentile or median * value of the distribution, or getQuantile(0.5). Similarly, the 95th percentile is obtained from * getQuantile(0.95). Using the getQuantiles(0.0, 1.0) will return the min and max values seen by * the sketch.

* *

From the min and max values, for example, 1 and 1000 bytes, * you can obtain the PMF from getPMF(100, 500, 900) that will result in an array of * 4 fractional values such as {.4, .3, .2, .1}, which means that *

    *
  • 40% of the values were < 100,
  • *
  • 30% of the values were ≥ 100 and < 500,
  • *
  • 20% of the values were ≥ 500 and < 900, and
  • *
  • 10% of the values were ≥ 900.
  • *
* A frequency histogram can be obtained by simply multiplying these fractions by getN(), * which is the total count of values received. * The getCDF(*) works similarly, but produces the cumulative distribution instead. * *

The accuracy of this sketch is a function of the configured value k, which also affects * the overall size of the sketch. Accuracy of this quantile sketch is always with respect to * the normalized rank. A k of 128 produces a normalized, rank error of about 1.7%. * For example, the median value returned from getQuantile(0.5) will be between the actual values * from the hypothetically sorted array of input values at normalized ranks of 0.483 and 0.517, with * a confidence of about 99%.

* *
Table Guide for DoublesSketch Size in Bytes and Approximate Error:
          K => |      16      32      64     128     256     512   1,024
    ~ Error => | 12.145%  6.359%  3.317%  1.725%  0.894%  0.463%  0.239%
             N | Size in Bytes ->
------------------------------------------------------------------------
             0 |       8       8       8       8       8       8       8
             1 |      72      72      72      72      72      72      72
             3 |      72      72      72      72      72      72      72
             7 |     104     104     104     104     104     104     104
            15 |     168     168     168     168     168     168     168
            31 |     296     296     296     296     296     296     296
            63 |     424     552     552     552     552     552     552
           127 |     552     808   1,064   1,064   1,064   1,064   1,064
           255 |     680   1,064   1,576   2,088   2,088   2,088   2,088
           511 |     808   1,320   2,088   3,112   4,136   4,136   4,136
         1,023 |     936   1,576   2,600   4,136   6,184   8,232   8,232
         2,047 |   1,064   1,832   3,112   5,160   8,232  12,328  16,424
         4,095 |   1,192   2,088   3,624   6,184  10,280  16,424  24,616
         8,191 |   1,320   2,344   4,136   7,208  12,328  20,520  32,808
        16,383 |   1,448   2,600   4,648   8,232  14,376  24,616  41,000
        32,767 |   1,576   2,856   5,160   9,256  16,424  28,712  49,192
        65,535 |   1,704   3,112   5,672  10,280  18,472  32,808  57,384
       131,071 |   1,832   3,368   6,184  11,304  20,520  36,904  65,576
       262,143 |   1,960   3,624   6,696  12,328  22,568  41,000  73,768
       524,287 |   2,088   3,880   7,208  13,352  24,616  45,096  81,960
     1,048,575 |   2,216   4,136   7,720  14,376  26,664  49,192  90,152
     2,097,151 |   2,344   4,392   8,232  15,400  28,712  53,288  98,344
     4,194,303 |   2,472   4,648   8,744  16,424  30,760  57,384 106,536
     8,388,607 |   2,600   4,904   9,256  17,448  32,808  61,480 114,728
    16,777,215 |   2,728   5,160   9,768  18,472  34,856  65,576 122,920
    33,554,431 |   2,856   5,416  10,280  19,496  36,904  69,672 131,112
    67,108,863 |   2,984   5,672  10,792  20,520  38,952  73,768 139,304
   134,217,727 |   3,112   5,928  11,304  21,544  41,000  77,864 147,496
   268,435,455 |   3,240   6,184  11,816  22,568  43,048  81,960 155,688
   536,870,911 |   3,368   6,440  12,328  23,592  45,096  86,056 163,880
 1,073,741,823 |   3,496   6,696  12,840  24,616  47,144  90,152 172,072
 2,147,483,647 |   3,624   6,952  13,352  25,640  49,192  94,248 180,264
 4,294,967,295 |   3,752   7,208  13,864  26,664  51,240  98,344 188,456

 * 
*

There is more documentation available on * DataSketches.GitHub.io.

* *

This is an implementation of the Low Discrepancy Mergeable Quantiles Sketch, using double * values, described in section 3.2 of the journal version of the paper "Mergeable Summaries" * by Agarwal, Cormode, Huang, Phillips, Wei, and Yi. *

* *

This algorithm is independent of the distribution of values, which can be anywhere in the * range of the IEEE-754 64-bit doubles. * *

This algorithm intentionally inserts randomness into the sampling process for values that * ultimately get retained in the sketch. The results produced by this algorithm are not * deterministic. For example, if the same stream is inserted into two different instances of this * sketch, the answers obtained from the two sketches may not be be identical.

* *

Similarly, there may be directional inconsistencies. For example, the resulting array of * values obtained from getQuantiles(fractions[]) input into the reverse directional query * getPMF(splitPoints[]) may not result in the original fractional values.

* * @author Kevin Lang * @author Lee Rhodes * @author Jon Malkin */ public abstract class DoublesSketch { static final int DOUBLES_SER_VER = 3; static final int MAX_PRELONGS = Family.QUANTILES.getMaxPreLongs(); static final int MIN_K = 2; /** * Parameter that controls space usage of sketch and accuracy of estimates. */ final int k_; /** * Setting the seed makes the results of the sketch deterministic if the input values are * received in exactly the same order. This is only useful when performing test comparisons, * otherwise is not recommended. */ public static final Random rand = new Random(); DoublesSketch(final int k) { Util.checkK(k); k_ = k; } /** * Returns a new builder * @return a new builder */ public static final DoublesSketchBuilder builder() { return new DoublesSketchBuilder(); } /** * Heapify takes the sketch image in Memory and instantiates an on-heap Sketch. * The resulting sketch will not retain any link to the source Memory. * @param srcMem a Memory image of a Sketch. * See Memory * @return a heap-based Sketch based on the given Memory */ public static DoublesSketch heapify(final Memory srcMem) { if (checkIsCompactMemory(srcMem)) { return CompactDoublesSketch.heapify(srcMem); } else { return UpdateDoublesSketch.heapify(srcMem); } } /** * Wrap this sketch around the given compact Memory image of a DoublesSketch. * * @param srcMem the given Memory image of a DoublesSketch that may have data, * @return a sketch that wraps the given srcMem */ public static DoublesSketch wrap(final Memory srcMem) { if (checkIsCompactMemory(srcMem)) { return DirectCompactDoublesSketch.wrapInstance(srcMem); } else { return DirectUpdateDoublesSketch.wrapInstance(srcMem); } } /** * This returns an approximation to the value of the data item * that would be preceded by the given fraction of a hypothetical sorted * version of the input stream so far. * *

We note that this method has a fairly large overhead (microseconds instead of nanoseconds) * so it should not be called multiple times to get different quantiles from the same * sketch. Instead use getQuantiles(), which pays the overhead only once. * *

If the sketch is empty: *

  • getQuantile(0.0) returns Double.POSITIVE_INFINITY
  • *
  • getQuantile(1.0) returns Double.NEGATIVE_INFINITY
  • *
  • getQuantile(0.0 <rank> 1.0) returns Double.NaN
  • *
* * @param fraction the specified fractional position in the hypothetical sorted stream. * These are also called normalized ranks or fractional ranks. * If fraction = 0.0, the true minimum value of the stream is returned. * If fraction = 1.0, the true maximum value of the stream is returned. * * @return the approximation to the value at the above fraction */ public double getQuantile(final double fraction) { if ((fraction < 0.0) || (fraction > 1.0)) { throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0"); } if (fraction == 0.0) { return this.getMinValue(); } else if (fraction == 1.0) { return this.getMaxValue(); } else { final DoublesAuxiliary aux = this.constructAuxiliary(); return aux.getQuantile(fraction); } } /** * This is a more efficient multiple-query version of getQuantile(). * *

This returns an array that could have been generated by using getQuantile() with many different * fractional ranks, but would be very inefficient. * This method incurs the internal set-up overhead once and obtains multiple quantile values in * a single query. It is strongly recommend that this method be used instead of multiple calls * to getQuantile(). * *

If the sketch is empty: *

  • getQuantiles(0.0, ...) returns Double.POSITIVE_INFINITY
  • *
  • getQuantiles(..., 1.0) returns Double.NEGATIVE_INFINITY
  • *
  • getQuantiles(..., 0.0 <rank> 1.0, ...) returns Double.NaN
  • *
* * @param fractions given array of fractional positions in the hypothetical sorted stream. * These are also called normalized ranks or fractional ranks. * These fractions must be monotonic, in increasing order and in the interval * [0.0, 1.0] inclusive. * * @return array of approximations to the given fractions in the same order as given fractions * array. */ public double[] getQuantiles(final double[] fractions) { Util.validateFractions(fractions); DoublesAuxiliary aux = null; final double[] answers = new double[fractions.length]; for (int i = 0; i < fractions.length; i++) { final double fraction = fractions[i]; if (fraction == 0.0) { answers[i] = this.getMinValue(); } else if (fraction == 1.0) { answers[i] = this.getMaxValue(); } else { if (aux == null) { aux = this.constructAuxiliary(); } answers[i] = aux.getQuantile(fraction); } } return answers; } /** * This is also a more efficient multiple-query version of getQuantile() and allows the caller to * specify the number of evenly spaced fractional ranks. * *

If the sketch is empty: *

  • getQuantiles(0.0, ...) returns Double.POSITIVE_INFINITY
  • *
  • getQuantiles(..., 1.0) returns Double.NEGATIVE_INFINITY
  • *
  • getQuantiles(..., 0.0 <rank> 1.0, ...) returns Double.NaN
  • *
* * @param evenlySpaced an integer that specifies the number of evenly spaced fractional ranks. * This must be a positive integer greater than 0. A value of 1 will return the min value. * A value of 2 will return the min and the max value. A value of 3 will return the min, * the median and the max value, etc. * * @return array of approximations to the given fractions in the same order as given fractions * array. */ public double[] getQuantiles(final int evenlySpaced) { return getQuantiles(getEvenlySpaced(evenlySpaced)); } /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * given a set of splitPoints (values). * *

The resulting approximations have a probabilistic guarantee that be obtained from the * getNormalizedRankError() function. * *

If the sketch is empty this returns Double.NaN for all values.

* * @param splitPoints an array of m unique, monotonically increasing doubles * that divide the real number line into m+1 consecutive disjoint intervals. * * @return an array of m+1 doubles each of which is an approximation * to the fraction of the input stream values that fell into one of those intervals. * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right * splitPoint. */ public double[] getPMF(final double[] splitPoints) { return DoublesPmfCdfImpl.getPMFOrCDF(this, splitPoints, false); } /** * Returns an approximation to the Cumulative Distribution Function (CDF), which is the * cumulative analog of the PMF, of the input stream given a set of splitPoint (values). * *

More specifically, the value at array position j of the CDF is the * sum of the values in positions 0 through j of the PMF. * *

If the sketch is empty this returns Double.NaN for all values.

* * @param splitPoints an array of m unique, monotonically increasing doubles * that divide the real number line into m+1 consecutive disjoint intervals. * * @return an approximation to the CDF of the input stream given the splitPoints. */ public double[] getCDF(final double[] splitPoints) { return DoublesPmfCdfImpl.getPMFOrCDF(this, splitPoints, true); } /** * Returns the configured value of K * @return the configured value of K */ public int getK() { return k_; } /** * Returns the min value of the stream. * If the sketch is empty this returns Double.POSITIVE_INFINITY. * * @return the min value of the stream */ public abstract double getMinValue(); /** * Returns the max value of the stream. * If the sketch is empty this returns Double.NEGATIVE_INFINITY. * * @return the max value of the stream */ public abstract double getMaxValue(); /** * Returns the length of the input stream so far. * @return the length of the input stream so far */ public abstract long getN(); /** * Get the rank error normalized as a fraction between zero and one. * The error of this sketch is specified as a fraction of the normalized rank of the hypothetical * sorted stream of items presented to the sketch. * *

Suppose the sketch is presented with N values. The raw rank (0 to N-1) of an item * would be its index position in the sorted version of the input stream. If we divide the * raw rank by N, it becomes the normalized rank, which is between 0 and 1.0. * *

For example, choosing a K of 256 yields a normalized rank error of less than 1%. * The upper bound on the median value obtained by getQuantile(0.5) would be the value in the * hypothetical ordered stream of values at the normalized rank of 0.51. * The lower bound would be the value in the hypothetical ordered stream of values at the * normalized rank of 0.49. * *

The error of this sketch cannot be translated into an error (relative or absolute) of the * returned quantile values. * * @return the rank error normalized as a fraction between zero and one. */ public double getNormalizedRankError() { return getNormalizedRankError(getK()); } /** * Static method version of {@link #getNormalizedRankError()} * @param k the configuration parameter of a DoublesSketch * @return the rank error normalized as a fraction between zero and one. */ public static double getNormalizedRankError(final int k) { return Util.EpsilonFromK.getAdjustedEpsilon(k); } /** * Returns true if this sketch is empty * @return true if this sketch is empty */ public boolean isEmpty() { return getN() == 0; } /** * Returns true if this sketch is direct * @return true if this sketch is direct */ public abstract boolean isDirect(); /** * Returns true if this sketch is in estimation mode. * @return true if this sketch is in estimation mode. */ public boolean isEstimationMode() { return getN() >= 2L * k_; } /** * Serialize this sketch to a byte array. An UpdateDoublesSketch will be serialized in * an unordered, non-compact form; a CompactDoublesSketch will be serialized in ordered, * compact form. A DirectUpdateDoublesSketch can only wrap a non-compact array, and a * DirectCompactDoublesSketch can only wrap a compact array. * * @return byte array of this sketch */ public byte[] toByteArray() { if (isCompact()) { return toByteArray(true); } else { return toByteArray(false); } } /** * Serialize this sketch in a byte array form. * @param compact if true the sketch will be serialized in compact form. * DirectCompactDoublesSketch can wrap() only a compact byte array; * DirectUpdateDoublesSketch can wrap() only a non-compact byte array. * @return this sketch in a byte array form. */ public byte[] toByteArray(final boolean compact) { return DoublesByteArrayImpl.toByteArray(this, compact, compact); } /** * Returns summary information about this sketch. */ @Override public String toString() { return toString(true, false); } /** * Returns summary information about this sketch. Used for debugging. * @param sketchSummary if true includes sketch summary * @param dataDetail if true includes data detail * @return summary information about the sketch. */ public String toString(final boolean sketchSummary, final boolean dataDetail) { return DoublesUtil.toString(sketchSummary, dataDetail, this); } /** * From an source sketch, create a new sketch that must have a smaller value of K. * The original sketch is not modified. * * @param srcSketch the sourcing sketch * @param smallerK the new sketch's value of K that must be smaller than this value of K. * It is required that this.getK() = smallerK * 2^(nonnegative integer). * @param dstMem the destination Memory. It must not overlap the Memory of this sketch. * If null, a heap sketch will be returned, otherwise it will be off-heap. * * @return the new sketch. */ public DoublesSketch downSample(final DoublesSketch srcSketch, final int smallerK, final Memory dstMem) { return downSampleInternal(srcSketch, smallerK, dstMem); } /** * Computes the number of retained items (samples) in the sketch * @return the number of retained items (samples) in the sketch */ public int getRetainedItems() { return Util.computeRetainedItems(getK(), getN()); } /** * Returns the number of bytes this sketch would require to store in compact form, which is not * updatable. * @return the number of bytes this sketch would require to store in compact form. */ public int getCompactStorageBytes() { return getCompactStorageBytes(getK(), getN()); } /** * Returns the number of bytes a DoublesSketch would require to store in compact form * given the values of k and n. The compact form is not updatable. * @param k the size configuration parameter for the sketch * @param n the number of items input into the sketch * @return the number of bytes required to store this sketch in compact form. */ public static int getCompactStorageBytes(final int k, final long n) { if (n == 0) { return 8; } final int metaPreLongs = DoublesSketch.MAX_PRELONGS + 2; //plus min, max return ((metaPreLongs + Util.computeRetainedItems(k, n)) << 3); } /** * Returns the number of bytes this sketch would require to store in native form: compact for * a CompactDoublesSketch, non-compact for an UpdateDoublesSketch. * @return the number of bytes this sketch would require to store in compact form. */ public int getStorageBytes() { if (isCompact()) { return getCompactStorageBytes(); } return getUpdatableStorageBytes(); } /** * Returns the number of bytes this sketch would require to store in updatable form. * This uses roughly 2X the storage of the compact form. * @return the number of bytes this sketch would require to store in updatable form. */ public int getUpdatableStorageBytes() { return getUpdatableStorageBytes(getK(), getN()); } /** * Returns the number of bytes a sketch would require to store in updatable form. * This uses roughly 2X the storage of the compact form * given the values of k and n. * @param k the size configuration parameter for the sketch * @param n the number of items input into the sketch * @return the number of bytes this sketch would require to store in updatable form. */ public static int getUpdatableStorageBytes(final int k, final long n) { if (n == 0) { return 8; } final int metaPre = DoublesSketch.MAX_PRELONGS + 2; //plus min, max final int totLevels = Util.computeNumLevelsNeeded(k, n); if (n <= k) { final int ceil = Math.max(ceilingPowerOf2((int)n), DoublesSketch.MIN_K * 2); return (metaPre + ceil) << 3; } return (metaPre + (2 + totLevels) * k) << 3; } /** * Puts the current sketch into the given Memory in compact form if there is sufficient space, * otherwise, it throws an error. * * @param dstMem the given memory. */ public void putMemory(final Memory dstMem) { putMemory(dstMem, true); } /** * Close variant of toByteArray() to allow DoublesUnion to serialize a sorted, non-compact * sketch that gets wrapped as a gadget. * @param compact If true the sketch will be serialized in compact form. * @return This sketch in byte array form. */ private byte[] toSortedByteArray(final boolean compact) { return DoublesByteArrayImpl.toByteArray(this, true, compact); } /** * Puts the current sketch into the given Memory if there is sufficient space, otherwise, * throws an error. This loads the memory in compact form based on the given compact flag, but * always sorts the base buffer whether compact or not. * @param dstMem the given memory. * @param compact if true, this compacts and sorts the base buffer, which optimizes merge * performance at the cost of slightly increased serialization time. */ public void putMemory(final Memory dstMem, final boolean compact) { final byte[] byteArr = toSortedByteArray(compact); final int arrLen = byteArr.length; final long memCap = dstMem.getCapacity(); if (memCap < arrLen) { throw new SketchesArgumentException( "Destination Memory not large enough: " + memCap + " < " + arrLen); } dstMem.putByteArray(0, byteArr, 0, arrLen); } //Restricted /* * DoublesMergeImpl.downSamplingMergeInto requires the target sketch to implement update(), so * we ensure that the target is an UpdateSketch. The public API, on the other hand, just * specifies a DoublesSketch. This lets us be more specific about the type without changing the * public API. */ UpdateDoublesSketch downSampleInternal(final DoublesSketch srcSketch, final int smallerK, final Memory dstMem) { final UpdateDoublesSketch newSketch = (dstMem == null) ? HeapUpdateDoublesSketch.newInstance(smallerK) : DirectUpdateDoublesSketch.newInstance(smallerK, dstMem); DoublesMergeImpl.downSamplingMergeInto(srcSketch, newSketch); //TODO #2 return newSketch; } static double[] getEvenlySpaced(final int evenlySpaced) { final int n = evenlySpaced; if (n <= 0) { throw new SketchesArgumentException("EvenlySpaced must be > zero."); } final double[] fractions = new double[n]; double frac = 0.0; fractions[0] = frac; for (int i = 1; i < n; i++) { frac = (double)i / (n - 1); fractions[i] = frac; } if (n > 1) { fractions[n - 1] = 1.0; } return fractions; } /** * Returns the Auxiliary data structure, which is only used for getQuantile() and getQuantiles() * queries. * @return the Auxiliary data structure */ DoublesAuxiliary constructAuxiliary() { return new DoublesAuxiliary( this ); } //Restricted abstract /** * Returns true if this sketch is compact * @return true if this sketch is compact */ abstract boolean isCompact(); /** * Returns the base buffer count * @return the base buffer count */ abstract int getBaseBufferCount(); /** * Returns the bit pattern for valid log levels * @return the bit pattern for valid log levels */ abstract long getBitPattern(); /** * Returns the item capacity for the combined base buffer * @return the item capacity for the combined base buffer */ abstract int getCombinedBufferItemCapacity(); /** * Returns the combined buffer, in non-compact form. * @return the combined buffer, in non-compact form. */ abstract double[] getCombinedBuffer(); /** * Gets the Memory if it exists, otherwise returns null. * @return the Memory if it exists, otherwise returns null. */ abstract Memory getMemory(); }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy