All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.quantiles.ItemsSketch Maven / Gradle / Ivy

There is a newer version: 0.13.4
Show newest version
/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.quantiles;

import static com.yahoo.sketches.quantiles.PreambleUtil.COMPACT_FLAG_MASK;
import static com.yahoo.sketches.quantiles.PreambleUtil.EMPTY_FLAG_MASK;
import static com.yahoo.sketches.quantiles.PreambleUtil.ORDERED_FLAG_MASK;
import static com.yahoo.sketches.quantiles.PreambleUtil.extractFamilyID;
import static com.yahoo.sketches.quantiles.PreambleUtil.extractFlags;
import static com.yahoo.sketches.quantiles.PreambleUtil.extractK;
import static com.yahoo.sketches.quantiles.PreambleUtil.extractN;
import static com.yahoo.sketches.quantiles.PreambleUtil.extractPreLongs;
import static com.yahoo.sketches.quantiles.PreambleUtil.extractSerVer;
import static com.yahoo.sketches.quantiles.PreambleUtil.insertFamilyID;
import static com.yahoo.sketches.quantiles.PreambleUtil.insertFlags;
import static com.yahoo.sketches.quantiles.PreambleUtil.insertK;
import static com.yahoo.sketches.quantiles.PreambleUtil.insertN;
import static com.yahoo.sketches.quantiles.PreambleUtil.insertPreLongs;
import static com.yahoo.sketches.quantiles.PreambleUtil.insertSerVer;
import static com.yahoo.sketches.quantiles.Util.computeBaseBufferItems;
import static com.yahoo.sketches.quantiles.Util.computeBitPattern;

import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Random;

import com.yahoo.memory.Memory;
import com.yahoo.memory.MemoryRegion;
import com.yahoo.memory.NativeMemory;
import com.yahoo.sketches.ArrayOfItemsSerDe;
import com.yahoo.sketches.Family;
import com.yahoo.sketches.SketchesArgumentException;

/**
 * This is a stochastic streaming sketch that enables near-real time analysis of the
 * approximate distribution of comparable items from a very large stream in a single pass.
 * The analysis is obtained using a getQuantiles(*) function or its inverse functions the
 * Probability Mass Function from getPMF(*) and the Cumulative Distribution Function from getCDF(*).
 *
 * 

The documentation for {@link DoublesSketch} applies here except that the size of an ItemsSketch * is very dependent on the Items input into the sketch, so there is no comparable size table as * for the DoublesSketch. * *

There is more documentation available on * DataSketches.GitHub.io.

* * @param type of item * * @author Kevin Lang * @author Alexander Saydakov */ public final class ItemsSketch { /** * Parameter that controls space usage of sketch and accuracy of estimates. */ protected final int k_; private final Comparator comparator_; /** * Total number of data items in the stream so far. (Uniqueness plays no role in these sketches). */ protected long n_; /** * The smallest value ever seen in the stream. */ protected T minValue_; /** * The largest value ever seen in the stream. */ protected T maxValue_; /** * In the initial on-heap version, equals combinedBuffer_.length. * May differ in later versions that grow space more aggressively. * Also, in the off-heap version, combinedBuffer_ won't even be a java array, * so it won't know its own length. */ protected int combinedBufferItemCapacity_; /** * Number of samples currently in base buffer. * *

Count = N % (2*K) */ protected int baseBufferCount_; /** * Active levels expressed as a bit pattern. * *

Pattern = N / (2 * K) */ protected long bitPattern_; /** * This single array contains the base buffer plus all levels some of which may not be used. * A level is of size K and is either full and sorted, or not used. A "not used" buffer may have * garbage. Whether a level buffer used or not is indicated by the bitPattern_. * The base buffer has length 2*K but might not be full and isn't necessarily sorted. * The base buffer precedes the level buffers. * *

The levels arrays require quite a bit of explanation, which we defer until later. */ protected Object[] combinedBuffer_; /** * Setting the seed makes the results of the sketch deterministic if the input values are * received in exactly the same order. This is only useful when performing test comparisons, * otherwise is not recommended. */ public static final Random rand = new Random(); /** * Default value for about 1.7% normalized rank accuracy */ static final int DEFAULT_K = 128; private ItemsSketch(final int k, final Comparator comparator) { Util.checkK(k); k_ = k; comparator_ = comparator; } /** * Obtains a new instance of an ItemsSketch using the DEFAULT_K. * @param type of item * @param comparator to compare items * @return a GenericQuantileSketch */ public static ItemsSketch getInstance(final Comparator comparator) { return getInstance(DEFAULT_K, comparator); } /** * Obtains a new instance of an ItemsSketch. * @param type of item * @param k Parameter that controls space usage of sketch and accuracy of estimates. * Must be greater than 2 and less than 65536 and a power of 2. * @param comparator to compare items * @return a GenericQuantileSketch */ public static ItemsSketch getInstance(final int k, final Comparator comparator) { final ItemsSketch qs = new ItemsSketch(k, comparator); final int bufAlloc = Math.min(Util.MIN_BASE_BUF_SIZE, 2 * k); //the min is important qs.n_ = 0; qs.combinedBufferItemCapacity_ = bufAlloc; qs.combinedBuffer_ = new Object[bufAlloc]; qs.baseBufferCount_ = 0; qs.bitPattern_ = 0; qs.minValue_ = null; qs.maxValue_ = null; return qs; } /** * Heapifies the given srcMem, which must be a Memory image of a ItemsSketch * @param type of item * @param srcMem a Memory image of a sketch. * See Memory * @param comparator to compare items * @param serDe an instance of ArrayOfItemsSerDe * @return a ItemsSketch on the Java heap. */ public static ItemsSketch getInstance(final Memory srcMem, final Comparator comparator, final ArrayOfItemsSerDe serDe) { final long memCapBytes = srcMem.getCapacity(); if (memCapBytes < 8) { throw new SketchesArgumentException("Memory too small: " + memCapBytes); } long cumOffset = srcMem.getCumulativeOffset(0L); Object memArr = srcMem.array(); final int preambleLongs = extractPreLongs(memArr, cumOffset); final int serVer = extractSerVer(memArr, cumOffset); final int familyID = extractFamilyID(memArr, cumOffset); final int flags = extractFlags(memArr, cumOffset); final int k = extractK(memArr, cumOffset); ItemsUtil.checkItemsSerVer(serVer); if ((serVer == 3) && ((flags & COMPACT_FLAG_MASK) == 0)) { throw new SketchesArgumentException("Non-compact Memory images are not supported."); } final boolean empty = Util.checkPreLongsFlagsCap(preambleLongs, flags, memCapBytes); Util.checkFamilyID(familyID); final ItemsSketch qs = getInstance(k, comparator); //checks k if (empty) { return qs; } //Not empty, must have valid preamble + min, max final long n = extractN(memArr, cumOffset); //can't check memory capacity here, not enough information final int extra = 2; //for min, max int numMemItems = Util.computeRetainedItems(k, n) + extra; //set class members qs.n_ = n; qs.combinedBufferItemCapacity_ = Util.computeExpandedCombinedBufferItemCapacity(k, n); qs.baseBufferCount_ = computeBaseBufferItems(k, n); qs.bitPattern_ = computeBitPattern(k, n); qs.combinedBuffer_ = new Object[qs.combinedBufferItemCapacity_]; final int srcMemItemsOffsetBytes = preambleLongs * Long.BYTES; final MemoryRegion mReg = new MemoryRegion(srcMem, srcMemItemsOffsetBytes, srcMem.getCapacity() - srcMemItemsOffsetBytes); final T[] itemsArray = serDe.deserializeFromMemory(mReg, numMemItems); qs.itemsArrayToCombinedBuffer(itemsArray); return qs; } /** * Returns a copy of the given sketch * @param sketch the given sketch * @return a copy of the given sketch */ static ItemsSketch copy(final ItemsSketch sketch) { final ItemsSketch qsCopy = ItemsSketch.getInstance(sketch.k_, sketch.comparator_); qsCopy.n_ = sketch.n_; qsCopy.minValue_ = sketch.getMinValue(); qsCopy.maxValue_ = sketch.getMaxValue(); qsCopy.combinedBufferItemCapacity_ = sketch.getCombinedBufferAllocatedCount(); qsCopy.baseBufferCount_ = sketch.getBaseBufferCount(); qsCopy.bitPattern_ = sketch.getBitPattern(); Object[] combBuf = sketch.getCombinedBuffer(); qsCopy.combinedBuffer_ = Arrays.copyOf(combBuf, combBuf.length); return qsCopy; } /** * Updates this sketch with the given double data item * @param dataItem an item from a stream of items. NaNs are ignored. */ public void update(final T dataItem) { // this method only uses the base buffer part of the combined buffer if (dataItem == null) { return; } if (maxValue_ == null || comparator_.compare(dataItem, maxValue_) > 0) { maxValue_ = dataItem; } if (minValue_ == null || comparator_.compare(dataItem, minValue_) < 0) { minValue_ = dataItem; } if (baseBufferCount_ + 1 > combinedBufferItemCapacity_) { ItemsUtil.growBaseBuffer(this); } combinedBuffer_[baseBufferCount_++] = dataItem; n_++; if (baseBufferCount_ == 2 * k_) { ItemsUtil.processFullBaseBuffer(this); } } /** * This returns an approximation to the value of the data item * that would be preceded by the given fraction of a hypothetical sorted * version of the input stream so far. * *

We note that this method has a fairly large overhead (microseconds instead of nanoseconds) * so it should not be called multiple times to get different quantiles from the same * sketch. Instead use getQuantiles(). which pays the overhead only once. * * @param fraction the specified fractional position in the hypothetical sorted stream. * These are also called normalized ranks or fractional ranks. * If fraction = 0.0, the true minimum value of the stream is returned. * If fraction = 1.0, the true maximum value of the stream is returned. * * @return the approximation to the value at the above fraction */ public T getQuantile(final double fraction) { if ((fraction < 0.0) || (fraction > 1.0)) { throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0"); } if (fraction == 0.0) { return minValue_; } else if (fraction == 1.0) { return maxValue_; } else { final ItemsAuxiliary aux = constructAuxiliary(); return aux.getQuantile(fraction); } } /** * This is a more efficient multiple-query version of getQuantile(). * *

This returns an array that could have been generated by using getQuantile() with many different * fractional ranks, but would be very inefficient. * This method incurs the internal set-up overhead once and obtains multiple quantile values in * a single query. It is strongly recommend that this method be used instead of multiple calls * to getQuantile(). * * @param fractions given array of fractional positions in the hypothetical sorted stream. * These are also called normalized ranks or fractional ranks. * These fractions must be monotonic, in increasing order and in the interval * [0.0, 1.0] inclusive. * * @return array of approximations to the given fractions in the same order as given fractions * array. Returns null if sketch is empty */ public T[] getQuantiles(final double[] fractions) { if (isEmpty()) { return null; } Util.validateFractions(fractions); ItemsAuxiliary aux = null; @SuppressWarnings("unchecked") final T[] answers = (T[]) Array.newInstance(minValue_.getClass(), fractions.length); for (int i = 0; i < fractions.length; i++) { final double fraction = fractions[i]; if (fraction == 0.0) { answers[i] = minValue_; } else if (fraction == 1.0) { answers[i] = maxValue_; } else { if (aux == null) { aux = this.constructAuxiliary(); } answers[i] = aux.getQuantile(fraction); } } return answers; } /** * This is also a more efficient multiple-query version of getQuantile() and allows the caller to * specify the number of evenly spaced fractional ranks. * * * @param evenlySpaced an integer that specifies the number of evenly spaced fractional ranks. * This must be a positive integer greater than 0. A value of 1 will return the min value. * A value of 2 will return the min and the max value. A value of 3 will return the min, * the median and the max value, etc. * * @return array of approximations to the given fractions in the same order as given fractions * array. */ public T[] getQuantiles(int evenlySpaced) { return getQuantiles(getEvenlySpaced(evenlySpaced)); } /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * given a set of splitPoints (values). * *

The resulting approximations have a probabilistic guarantee that be obtained from the * getNormalizedRankError() function. * * @param splitPoints an array of m unique, monotonically increasing values * that divide the domain into m+1 consecutive disjoint intervals. * * @return an array of m+1 doubles each of which is an approximation * to the fraction of the input stream values that fell into one of those intervals. * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right * splitPoint. */ public double[] getPMF(final T[] splitPoints) { return getPMFOrCDF(splitPoints, false); } /** * Returns an approximation to the Cumulative Distribution Function (CDF), which is the * cumulative analog of the PMF, of the input stream given a set of splitPoints (values). * *

More specifically, the value at array position j of the CDF is the * sum of the values in positions 0 through j of the PMF. * * @param splitPoints an array of m unique, monotonically increasing values * that divide the domain into m+1 consecutive disjoint intervals. * * @return an approximation to the CDF of the input stream given the splitPoints. */ public double[] getCDF(final T[] splitPoints) { return getPMFOrCDF(splitPoints, true); } private double[] getPMFOrCDF(final T[] splitPoints, boolean isCDF) { long[] counters = ItemsUtil.internalBuildHistogram(splitPoints, this); int numCounters = counters.length; double[] result = new double[numCounters]; double n = n_; long subtotal = 0; if (isCDF) { for (int j = 0; j < numCounters; j++) { long count = counters[j]; subtotal += count; result[j] = subtotal / n; //normalize by n } } else { // PMF for (int j = 0; j < numCounters; j++) { long count = counters[j]; subtotal += count; result[j] = count / n; //normalize by n } } assert subtotal == n; //internal consistency check return result; } /** * Returns the configured value of K * @return the configured value of K */ public int getK() { return k_; } /** * Returns the min value of the stream * @return the min value of the stream */ public T getMinValue() { return minValue_; } /** * Returns the max value of the stream * @return the max value of the stream */ public T getMaxValue() { return maxValue_; } /** * Returns the length of the input stream so far. * @return the length of the input stream so far */ public long getN() { return n_; } /** * Get the rank error normalized as a fraction between zero and one. * The error of this sketch is specified as a fraction of the normalized rank of the hypothetical * sorted stream of items presented to the sketch. * *

Suppose the sketch is presented with N values. The raw rank (0 to N-1) of an item * would be its index position in the sorted version of the input stream. If we divide the * raw rank by N, it becomes the normalized rank, which is between 0 and 1.0. * *

For example, choosing a K of 227 yields a normalized rank error of about 1%. * The upper bound on the median value obtained by getQuantile(0.5) would be the value in the * hypothetical ordered stream of values at the normalized rank of 0.51. * The lower bound would be the value in the hypothetical ordered stream of values at the * normalized rank of 0.49. * *

The error of this sketch cannot be translated into an error (relative or absolute) of the * returned quantile values. * * @return the rank error normalized as a fraction between zero and one. */ public double getNormalizedRankError() { return getNormalizedRankError(getK()); } /** * Static method version of {@link #getNormalizedRankError()} * @param k the configuration parameter of a ItemsSketch * @return the rank error normalized as a fraction between zero and one. */ public static double getNormalizedRankError(int k) { return Util.EpsilonFromK.getAdjustedEpsilon(k); } /** * Returns true if this sketch is empty * @return true if this sketch is empty */ public boolean isEmpty() { return getN() == 0; } /** * Resets this sketch to a virgin state, but retains the original value of k. */ public void reset() { n_ = 0; combinedBufferItemCapacity_ = Math.min(Util.MIN_BASE_BUF_SIZE, 2 * k_); //the min is important combinedBuffer_ = new Object[combinedBufferItemCapacity_]; baseBufferCount_ = 0; bitPattern_ = 0; minValue_ = null; maxValue_ = null; } /** * Serialize this sketch to a byte array form. * @param serDe an instance of ArrayOfItemsSerDe * @return byte array of this sketch */ public byte[] toByteArray(final ArrayOfItemsSerDe serDe) { return toByteArray(false, serDe); } /** * Serialize this sketch to a byte array form. * @param ordered if true the base buffer will be ordered (default == false). * @param serDe an instance of ArrayOfItemsSerDe * @return this sketch in a byte array form. */ public byte[] toByteArray(final boolean ordered, final ArrayOfItemsSerDe serDe) { final boolean empty = isEmpty(); int flags = (empty ? EMPTY_FLAG_MASK : 0) | (ordered ? ORDERED_FLAG_MASK : 0) | COMPACT_FLAG_MASK; if (empty) { byte[] outByteArr = new byte[Long.BYTES]; Memory memOut = new NativeMemory(outByteArr); long cumOffset = memOut.getCumulativeOffset(0L); int preLongs = 1; insertPre0(outByteArr, cumOffset, preLongs, flags, k_); return outByteArr; } //not empty T[] dataArr = combinedBufferToItemsArray(ordered); //includes min and max int preLongs = 2; byte[] itemsByteArr = serDe.serializeToByteArray(dataArr); int numOutBytes = (preLongs << 3) + itemsByteArr.length; byte[] outByteArr = new byte[numOutBytes]; Memory memOut = new NativeMemory(outByteArr); long cumOffset = memOut.getCumulativeOffset(0L); //insert preamble insertPre0(outByteArr, cumOffset, preLongs, flags, k_); insertN(outByteArr, cumOffset, n_); //insert data memOut.putByteArray(preLongs << 3, itemsByteArr, 0, itemsByteArr.length); return outByteArr; } /** * Returns summary information about this sketch. */ @Override public String toString() { return toString(true, false); } /** * Returns summary information about this sketch. Used for debugging. * @param sketchSummary if true includes sketch summary * @param dataDetail if true includes data detail * @return summary information about the sketch. */ public String toString(final boolean sketchSummary, final boolean dataDetail) { return ItemsUtil.toString(sketchSummary, dataDetail, this); } /** * From an existing sketch, this creates a new sketch that can have a smaller value of K. * The original sketch is not modified. * * @param newK the new value of K that must be smaller than current value of K. * It is required that this.getK() = newK * 2^(nonnegative integer). * @return the new sketch. */ public ItemsSketch downSample(final int newK) { final ItemsSketch newSketch = ItemsSketch.getInstance(newK, comparator_); ItemsUtil.downSamplingMergeInto(this, newSketch); return newSketch; } /** * Computes the number of retained entries (samples) in the sketch * @return the number of retained entries (samples) in the sketch */ public int getRetainedItems() { return Util.computeRetainedItems(getK(), getN()); } /** * Puts the current sketch into the given Memory if there is sufficient space. * Otherwise, throws an error. * * @param dstMem the given memory. * @param serDe an instance of ArrayOfItemsSerDe */ public void putMemory(final Memory dstMem, final ArrayOfItemsSerDe serDe) { final byte[] byteArr = toByteArray(serDe); final long memCap = dstMem.getCapacity(); if (memCap < byteArr.length) { throw new SketchesArgumentException( "Destination Memory not large enough: " + memCap + " < " + byteArr.length); } dstMem.putByteArray(0, byteArr, 0, byteArr.length); } // Restricted /** * Returns the base buffer count * @return the base buffer count */ protected int getBaseBufferCount() { return baseBufferCount_; } /** * Returns the allocated count for the combined base buffer * @return the allocated count for the combined base buffer */ protected int getCombinedBufferAllocatedCount() { return combinedBufferItemCapacity_; } /** * Returns the bit pattern for valid log levels * @return the bit pattern for valid log levels */ protected long getBitPattern() { return bitPattern_; } /** * Returns the combined buffer reference * @return the combined buffer reference */ protected Object[] getCombinedBuffer() { return combinedBuffer_; } /** * Loads the Combined Buffer, min and max from the given items array. * The Combined Buffer is always in non-compact form and must be pre-allocated. * @param itemsArray the given items array */ private void itemsArrayToCombinedBuffer(T[] itemsArray) { final int extra = 2; // space for min and max values //Load min, max minValue_ = itemsArray[0]; maxValue_ = itemsArray[1]; //Load base buffer System.arraycopy(itemsArray, extra, combinedBuffer_, 0, baseBufferCount_); //Load levels long bits = bitPattern_; if (bits > 0) { int index = extra + baseBufferCount_; for (int level = 0; bits != 0L; level++, bits >>>= 1) { if ((bits & 1L) > 0L) { System.arraycopy(itemsArray, index, combinedBuffer_, (2 + level) * k_, k_); index += k_; } } } } /** * Returns an array of items in compact form, including min and max extracted from the * Combined Buffer. * @param ordered true if the desired form of the resulting array has the base buffer sorted. * @return an array of items, including min and max extracted from the Combined Buffer. */ @SuppressWarnings("unchecked") private T[] combinedBufferToItemsArray(boolean ordered) { T[] outArr = null; final int extra = 2; // extra space for min and max values final int outArrCap = getRetainedItems(); outArr = (T[]) Array.newInstance(minValue_.getClass(), outArrCap + extra); //Load min, max outArr[0] = minValue_; outArr[1] = maxValue_; //Load base buffer System.arraycopy(combinedBuffer_, 0, outArr, extra, baseBufferCount_); //Load levels long bits = bitPattern_; if (bits > 0) { int index = extra + baseBufferCount_; for (int level = 0; bits != 0L; level++, bits >>>= 1) { if ((bits & 1L) > 0L) { System.arraycopy(combinedBuffer_, (2 + level) * k_, outArr, index, k_); index += k_; } } } if (ordered) { Arrays.sort(outArr, extra, baseBufferCount_ + extra, comparator_); } return outArr; } private static final void insertPre0(byte[] outArr, long cumOffset, int preLongs, int flags, int k) { insertPreLongs(outArr, cumOffset, preLongs); insertSerVer(outArr, cumOffset, ItemsUtil.ITEMS_SER_VER); insertFamilyID(outArr, cumOffset, Family.QUANTILES.getID()); insertFlags(outArr, cumOffset, flags); insertK(outArr, cumOffset, k); } /** * Returns the Auxiliary data structure which is only used for getQuantile() and getQuantiles() * queries. * @return the Auxiliary data structure */ private ItemsAuxiliary constructAuxiliary() { return new ItemsAuxiliary(this); } private static double[] getEvenlySpaced(final int n) { if (n <= 0) { throw new SketchesArgumentException("n must be > zero."); } final double[] fractions = new double[n]; fractions[0] = 0.0; for (int i = 1; i < n; i++) { fractions[i] = (double) i / (n - 1); } if (n > 1) { fractions[n - 1] = 1.0; } return fractions; } Comparator getComparator() { return comparator_; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy