All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.quantiles.ItemsUtil Maven / Gradle / Ivy

There is a newer version: 0.13.4
Show newest version
/*
 * Copyright 2016, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.quantiles;

import static com.yahoo.sketches.Util.checkIfPowerOf2;
import static java.lang.System.arraycopy;

import java.util.Arrays;
import java.util.Comparator;

import com.yahoo.sketches.SketchesArgumentException;

/**
 * Utility class for generic quantiles sketch.
 *
 * 

This class contains a highly specialized sort called blockyTandemMergeSort(). * It also contains methods that are used while building histograms and other common * functions.

* * @author Kevin Lang * @author Alex Saydadov */ final class ItemsUtil { private ItemsUtil() {} static final int ITEMS_SER_VER = 3; static final int PRIOR_ITEMS_SER_VER = 2; /** * Check the validity of the given serialization version * @param serVer the given serialization version */ static void checkItemsSerVer(int serVer) { if ((serVer == ITEMS_SER_VER) || (serVer == PRIOR_ITEMS_SER_VER)) { return; } throw new SketchesArgumentException( "Possible corruption: Invalid Serialization Version: " + serVer); } /** * Checks the sequential validity of the given array of values. * They must be unique, monotonically increasing and not null. * @param values given array of values */ static final void validateValues(final T[] values, final Comparator comparator) { final int lenM1 = values.length - 1; for (int j = 0; j < lenM1; j++) { if (values[j] != null && values[j + 1] != null && comparator.compare(values[j], values[j + 1]) < 0) { continue; } throw new SketchesArgumentException( "Values must be unique, monotonically increasing and not null."); } } /** * Shared algorithm for both PMF and CDF functions. The splitPoints must be unique, monotonically * increasing values. * @param splitPoints an array of m unique, monotonically increasing values * that divide the ordered domain into m+1 consecutive disjoint intervals. * @param sketch the given quantiles sketch * @return the unnormalized, accumulated counts of m + 1 intervals. */ @SuppressWarnings("unchecked") static long[] internalBuildHistogram(final T[] splitPoints, ItemsSketch sketch) { final Object[] levelsArr = sketch.getCombinedBuffer(); final Object[] baseBuffer = levelsArr; final int bbCount = sketch.getBaseBufferCount(); validateValues(splitPoints, sketch.getComparator()); final int numSplitPoints = splitPoints.length; final int numCounters = numSplitPoints + 1; final long[] counters = new long[numCounters]; long weight = 1; if (numSplitPoints < 50) { // empirically determined crossover // sort not worth it when few split points bilinearTimeIncrementHistogramCounters( (T[]) baseBuffer, 0, bbCount, weight, splitPoints, counters, sketch.getComparator()); } else { Arrays.sort(baseBuffer, 0, bbCount); // sort is worth it when many split points linearTimeIncrementHistogramCounters( (T[]) baseBuffer, 0, bbCount, weight, splitPoints, counters, sketch.getComparator() ); } long myBitPattern = sketch.getBitPattern(); final int k = sketch.getK(); assert myBitPattern == sketch.getN() / (2L * k); // internal consistency check for (int lvl = 0; myBitPattern != 0L; lvl++, myBitPattern >>>= 1) { weight += weight; // *= 2 if ((myBitPattern & 1L) > 0L) { //valid level exists // the levels are already sorted so we can use the fast version linearTimeIncrementHistogramCounters( (T[]) levelsArr, (2 + lvl) * k, k, weight, splitPoints, counters, sketch.getComparator()); } } return counters; } /** * Called when the base buffer has just acquired 2*k elements. * @param sketch the given quantiles sketch */ @SuppressWarnings("unchecked") static void processFullBaseBuffer(final ItemsSketch sketch) { final int bbCount = sketch.getBaseBufferCount(); final long n = sketch.getN(); assert bbCount == 2 * sketch.getK(); // internal consistency check // make sure there will be enough levels for the propagation maybeGrowLevels(n, sketch); // important: n_ was incremented by update before we got here // this aliasing is a bit dangerous; notice that we did it after the possible resizing final Object[] baseBuffer = sketch.getCombinedBuffer(); Arrays.sort(baseBuffer, 0, bbCount); inPlacePropagateCarry( 0, null, 0, // this null is okay (T[]) baseBuffer, 0, true, sketch); sketch.baseBufferCount_ = 0; Arrays.fill(baseBuffer, 0, 2 * sketch.getK(), null); // to release the discarded objects assert n / (2 * sketch.getK()) == sketch.getBitPattern(); // internal consistency check } @SuppressWarnings("unchecked") static void inPlacePropagateCarry( final int startingLevel, final T[] sizeKBuf, final int sizeKStart, final T[] size2KBuf, final int size2KStart, final boolean doUpdateVersion, final ItemsSketch sketch) { // else doMergeIntoVersion final Object[] levelsArr = sketch.getCombinedBuffer(); final long bitPattern = sketch.getBitPattern(); final int k = sketch.getK(); final int endingLevel = Util.positionOfLowestZeroBitStartingAt(bitPattern, startingLevel); if (doUpdateVersion) { // update version of computation // its is okay for sizeKbuf to be null in this case zipSize2KBuffer( size2KBuf, size2KStart, levelsArr, (2 + endingLevel) * k, k); } else { // mergeInto version of computation System.arraycopy( sizeKBuf, sizeKStart, levelsArr, (2 + endingLevel) * k, k); } for (int lvl = startingLevel; lvl < endingLevel; lvl++) { assert (bitPattern & (1L << lvl)) > 0; // internal consistency check mergeTwoSizeKBuffers( (T[]) levelsArr, (2 + lvl) * k, (T[]) levelsArr, (2 + endingLevel) * k, size2KBuf, size2KStart, k, sketch.getComparator()); zipSize2KBuffer( size2KBuf, size2KStart, levelsArr, (2 + endingLevel) * k, k); // to release the discarded objects Arrays.fill(levelsArr, (2 + lvl) * k, (2 + lvl + 1) * k, null); } // end of loop over lower levels // update bit pattern with binary-arithmetic ripple carry sketch.bitPattern_ = bitPattern + (1L << startingLevel); } static void maybeGrowLevels(final long newN, final ItemsSketch sketch) { // important: newN might not equal n_ final int k = sketch.getK(); final int numLevelsNeeded = Util.computeNumLevelsNeeded(k, newN); if (numLevelsNeeded == 0) { // don't need any levels yet, and might have small base buffer; this can happen during a merge return; } // from here on we need a full-size base buffer and at least one level assert newN >= 2L * k; assert numLevelsNeeded > 0; final int spaceNeeded = (2 + numLevelsNeeded) * k; if (spaceNeeded <= sketch.getCombinedBufferAllocatedCount()) { return; } // copies base buffer plus old levels sketch.combinedBuffer_ = Arrays.copyOf(sketch.getCombinedBuffer(), spaceNeeded); sketch.combinedBufferItemCapacity_ = spaceNeeded; } static void growBaseBuffer(final ItemsSketch sketch) { final Object[] baseBuffer = sketch.getCombinedBuffer(); final int oldSize = sketch.getCombinedBufferAllocatedCount(); final int k = sketch.getK(); assert oldSize < 2 * k; final int newSize = Math.max(Math.min(2 * k, 2 * oldSize), 1); sketch.combinedBufferItemCapacity_ = newSize; sketch.combinedBuffer_ = Arrays.copyOf(baseBuffer, newSize); } /** * Merges the source sketch into the target sketch that can have a smaller value of K. * However, it is required that the ratio of the two K values be a power of 2. * I.e., source.getK() = target.getK() * 2^(nonnegative integer). * The source is not modified. * * @param src The source sketch * @param tgt The target sketch */ @SuppressWarnings("unchecked") static void downSamplingMergeInto(final ItemsSketch src, final ItemsSketch tgt) { final int targetK = tgt.getK(); final int sourceK = src.getK(); if ((sourceK % targetK) != 0) { throw new SketchesArgumentException( "source.getK() must equal target.getK() * 2^(nonnegative integer)."); } final int downFactor = sourceK / targetK; checkIfPowerOf2(downFactor, "source.getK()/target.getK() ratio"); int lgDownFactor = Integer.numberOfTrailingZeros(downFactor); final Object[] sourceLevels = src.getCombinedBuffer(); // aliasing is a bit dangerous final Object[] sourceBaseBuffer = src.getCombinedBuffer(); // aliasing is a bit dangerous final long nFinal = tgt.getN() + src.getN(); for (int i = 0; i < src.getBaseBufferCount(); i++) { tgt.update((T) sourceBaseBuffer[i]); } maybeGrowLevels(nFinal, tgt); final Object[] scratchBuf = new Object[2 * targetK]; final Object[] downBuf = new Object[targetK]; long srcBitPattern = src.getBitPattern(); for (int srcLvl = 0; srcBitPattern != 0L; srcLvl++, srcBitPattern >>>= 1) { if ((srcBitPattern & 1L) > 0L) { justZipWithStride( sourceLevels, (2 + srcLvl) * sourceK, downBuf, 0, targetK, downFactor); inPlacePropagateCarry( srcLvl + lgDownFactor, (T[]) downBuf, 0, (T[]) scratchBuf, 0, false, tgt); // won't update target.n_ until the very end } } tgt.n_ = nFinal; assert tgt.getN() / (2 * targetK) == tgt.getBitPattern(); // internal consistency check final T srcMax = src.getMaxValue(); final T srcMin = src.getMinValue(); final T tgtMax = tgt.getMaxValue(); final T tgtMin = tgt.getMinValue(); if (src.getComparator().compare(srcMax, tgtMax) > 0) { tgt.maxValue_ = srcMax; } if (src.getComparator().compare(srcMin, tgtMin) < 0) { tgt.minValue_ = srcMin; } } private static void zipSize2KBuffer( final Object[] bufA, int startA, // input final Object[] bufC, int startC, // output final int k) { int randomOffset = ItemsSketch.rand.nextBoolean() ? 1 : 0; int limC = startC + k; for (int a = startA + randomOffset, c = startC; c < limC; a += 2, c++) { bufC[c] = bufA[a]; } } private static void justZipWithStride( final T[] bufSrc, final int startSrc, // input final T[] bufC, final int startC, // output final int kC, // number of items that should be in the output final int stride) { final int randomOffset = ItemsSketch.rand.nextInt(stride); final int limC = startC + kC; for (int a = startSrc + randomOffset, c = startC; c < limC; a += stride, c++ ) { bufC[c] = bufSrc[a]; } } private static void mergeTwoSizeKBuffers( final T[] keySrc1, final int startSrc1, final T[] keySrc2, final int arrStart2, final T[] keyDst, final int arrStart3, final int k, final Comparator comparator) { final int arrStop1 = startSrc1 + k; final int arrStop2 = arrStart2 + k; int i1 = startSrc1; int i2 = arrStart2; int i3 = arrStart3; while (i1 < arrStop1 && i2 < arrStop2) { if (comparator.compare(keySrc2[i2], keySrc1[i1]) < 0) { keyDst[i3++] = keySrc2[i2++]; } else { keyDst[i3++] = keySrc1[i1++]; } } if (i1 < arrStop1) { System.arraycopy(keySrc1, i1, keyDst, i3, arrStop1 - i1); } else { assert i2 < arrStop2; System.arraycopy(keySrc1, i2, keyDst, i3, arrStop2 - i2); } } /** * Because of the nested loop, cost is O(numSamples * numSplitPoints), which is bilinear. * This method does NOT require the samples to be sorted. * @param samples array of samples * @param offset into samples array * @param numSamples number of samples in samples array * @param weight of the samples * @param splitPoints must be unique and sorted. Number of splitPoints + 1 == counters.length. * @param counters array of counters */ static void bilinearTimeIncrementHistogramCounters(final T[] samples, final int offset, final int numSamples, final long weight, final T[] splitPoints, final long[] counters, final Comparator comparator) { assert (splitPoints.length + 1 == counters.length); for (int i = 0; i < numSamples; i++) { final T sample = samples[i + offset]; int j = 0; for (j = 0; j < splitPoints.length; j++) { final T splitpoint = splitPoints[j]; if (comparator.compare(sample, splitpoint) < 0) { break; } } assert j < counters.length; counters[j] += weight; } } /** * This one does a linear time simultaneous walk of the samples and splitPoints. Because this * internal procedure is called multiple times, we require the caller to ensure these 3 properties: *
    *
  1. samples array must be sorted.
  2. *
  3. splitPoints must be unique and sorted
  4. *
  5. number of SplitPoints + 1 == counters.length
  6. *
* @param samples sorted array of samples * @param offset into samples array * @param numSamples number of samples in samples array * @param weight of the samples * @param splitPoints must be unique and sorted. Number of splitPoints + 1 = counters.length. * @param counters array of counters */ static void linearTimeIncrementHistogramCounters(final T[] samples, final int offset, final int numSamples, final long weight, final T[] splitPoints, final long[] counters, final Comparator comparator) { int i = 0; int j = 0; while (i < numSamples && j < splitPoints.length) { if (comparator.compare(samples[i + offset], splitPoints[j]) < 0) { counters[j] += weight; // this sample goes into this bucket i++; // move on to next sample and see whether it also goes into this bucket } else { j++; // no more samples for this bucket. move on the next bucket. } } // now either i == numSamples(we are out of samples), or // j == numSplitPoints(out of buckets, but there are more samples remaining) // we only need to do something in the latter case. if (j == splitPoints.length) { counters[j] += (weight * (numSamples - i)); } } /** * blockyTandemMergeSort() is an implementation of top-down merge sort specialized * for the case where the input contains successive equal-length blocks * that have already been sorted, so that only the top part of the * merge tree remains to be executed. Also, two arrays are sorted in tandem, * as discussed above. * @param keyArr array of keys * @param valArr array of values * @param arrLen length of keyArr and valArr * @param blkSize size of internal sorted blocks */ static void blockyTandemMergeSort(final T[] keyArr, final long[] valArr, final int arrLen, final int blkSize, final Comparator comparator) { assert blkSize >= 1; if (arrLen <= blkSize) { return; } int numblks = arrLen / blkSize; if (numblks * blkSize < arrLen) { numblks += 1; } assert (numblks * blkSize >= arrLen); // duplicate the input is preparation for the "ping-pong" copy reduction strategy. final T[] keyTmp = Arrays.copyOf(keyArr, arrLen); final long[] valTmp = Arrays.copyOf(valArr, arrLen); blockyTandemMergeSortRecursion(keyTmp, valTmp, keyArr, valArr, 0, numblks, blkSize, arrLen, comparator); } /** * blockyTandemMergeSortRecursion() is called by blockyTandemMergeSort(). * In addition to performing the algorithm's top down recursion, * it manages the buffer swapping that eliminates most copying. * It also maps the input's pre-sorted blocks into the subarrays * that are processed by tandemMerge(). * @param keySrc key source * @param valSrc value source * @param keyDst key destination * @param valDst value destination * @param grpStart group start, refers to pre-sorted blocks such as block 0, block 1, etc. * @param grpLen group length, refers to pre-sorted blocks such as block 0, block 1, etc. * @param blkSize block size * @param arrLim array limit * @param comparator to compare keys */ private static void blockyTandemMergeSortRecursion(final T[] keySrc, final long[] valSrc, final T[] keyDst, final long[] valDst, final int grpStart, final int grpLen, /* indices of blocks */ final int blkSize, final int arrLim, final Comparator comparator) { // Important note: grpStart and grpLen do NOT refer to positions in the underlying array. // Instead, they refer to the pre-sorted blocks, such as block 0, block 1, etc. assert (grpLen > 0); if (grpLen == 1) { return; } int grpLen1 = grpLen / 2; int grpLen2 = grpLen - grpLen1; assert (grpLen1 >= 1); assert (grpLen2 >= grpLen1); final int grpStart1 = grpStart; final int grpStart2 = grpStart + grpLen1; //swap roles of src and dst blockyTandemMergeSortRecursion(keyDst, valDst, keySrc, valSrc, grpStart1, grpLen1, blkSize, arrLim, comparator); //swap roles of src and dst blockyTandemMergeSortRecursion(keyDst, valDst, keySrc, valSrc, grpStart2, grpLen2, blkSize, arrLim, comparator); // here we convert indices of blocks into positions in the underlying array. final int arrStart1 = grpStart1 * blkSize; final int arrStart2 = grpStart2 * blkSize; final int arrLen1 = grpLen1 * blkSize; int arrLen2 = grpLen2 * blkSize; // special case for the final block which might be shorter than blkSize. if (arrStart2 + arrLen2 > arrLim) { arrLen2 = arrLim - arrStart2; } tandemMerge(keySrc, valSrc, arrStart1, arrLen1, arrStart2, arrLen2, keyDst, valDst, arrStart1, comparator); // which will be arrStart3 } /** * Performs two merges in tandem. One of them provides the sort keys * while the other one passively undergoes the same data motion. * @param keySrc key source * @param valSrc value source * @param arrStart1 Array 1 start offset * @param arrLen1 Array 1 length * @param arrStart2 Array 2 start offset * @param arrLen2 Array 2 length * @param keyDst key destination * @param valDst value destination * @param arrStart3 Array 3 start offset * @param comparator to compare keys */ private static void tandemMerge(final T[] keySrc, final long[] valSrc, final int arrStart1, final int arrLen1, final int arrStart2, final int arrLen2, final T[] keyDst, final long[] valDst, final int arrStart3, final Comparator comparator) { final int arrStop1 = arrStart1 + arrLen1; final int arrStop2 = arrStart2 + arrLen2; int i1 = arrStart1; int i2 = arrStart2; int i3 = arrStart3; while (i1 < arrStop1 && i2 < arrStop2) { if (comparator.compare(keySrc[i2], keySrc[i1]) < 0) { keyDst[i3] = keySrc[i2]; valDst[i3] = valSrc[i2]; i3++; i2++; } else { keyDst[i3] = keySrc[i1]; valDst[i3] = valSrc[i1]; i3++; i1++; } } if (i1 < arrStop1) { arraycopy(keySrc, i1, keyDst, i3, arrStop1 - i1); arraycopy(valSrc, i1, valDst, i3, arrStop1 - i1); } else { assert i2 < arrStop2; arraycopy(keySrc, i2, keyDst, i3, arrStop2 - i2); arraycopy(valSrc, i2, valDst, i3, arrStop2 - i2); } } static String toString(final boolean sketchSummary, final boolean dataDetail, final ItemsSketch sketch) { final StringBuilder sb = new StringBuilder(); final String thisSimpleName = sketch.getClass().getSimpleName(); final int bbCount = sketch.getBaseBufferCount(); final int combAllocCount = sketch.getCombinedBufferAllocatedCount(); final int k = sketch.getK(); final long bitPattern = sketch.getBitPattern(); if (dataDetail) { sb.append(Util.LS).append("### ").append(thisSimpleName).append(" DATA DETAIL: ").append(Util.LS); final Object[] items = sketch.getCombinedBuffer(); //output the base buffer sb.append(" BaseBuffer :"); if (bbCount > 0) { for (int i = 0; i < bbCount; i++) { sb.append(' ').append(items[i]); } } sb.append(Util.LS); //output all the levels final int numItems = combAllocCount; if (numItems > 2 * k) { sb.append(" Valid | Level"); for (int j = 2 * k; j < numItems; j++) { //output level data starting at 2K if (j % k == 0) { //start output of new level final int levelNum = j > 2 * k ? (j - 2 * k) / k : 0; final String validLvl = ((1L << levelNum) & bitPattern) > 0 ? " T " : " F "; final String lvl = String.format("%5d", levelNum); sb.append(Util.LS).append(" ").append(validLvl).append(" ").append(lvl).append(":"); } sb.append(' ').append(items[j]); } sb.append(Util.LS); } sb.append("### END DATA DETAIL").append(Util.LS); } if (sketchSummary) { final long n = sketch.getN(); final String nStr = String.format("%,d", n); final int numLevels = Util.computeNumLevelsNeeded(k, n); final String bufCntStr = String.format("%,d", combAllocCount); final int preBytes = sketch.isEmpty() ? Long.BYTES : 2 * Long.BYTES; final double eps = Util.EpsilonFromK.getAdjustedEpsilon(k); final String epsPct = String.format("%.3f%%", eps * 100.0); final int numSamples = sketch.getRetainedItems(); final String numSampStr = String.format("%,d", numSamples); sb.append(Util.LS).append("### ").append(thisSimpleName).append(" SUMMARY: ").append(Util.LS); sb.append(" K : ").append(k).append(Util.LS); sb.append(" N : ").append(nStr).append(Util.LS); sb.append(" BaseBufferCount : ").append(bbCount).append(Util.LS); sb.append(" CombinedBufferAllocatedCount : ").append(bufCntStr).append(Util.LS); sb.append(" Total Levels : ").append(numLevels).append(Util.LS); sb.append(" Valid Levels : ").append(Util.computeValidLevels(bitPattern)) .append(Util.LS); sb.append(" Level Bit Pattern : ").append(Long.toBinaryString(bitPattern)) .append(Util.LS); sb.append(" Valid Samples : ").append(numSampStr).append(Util.LS); sb.append(" Preamble Bytes : ").append(preBytes).append(Util.LS); sb.append(" Normalized Rank Error : ").append(epsPct).append(Util.LS); sb.append(" Min Value : ").append(sketch.getMinValue()).append(Util.LS); sb.append(" Max Value : ").append(sketch.getMaxValue()).append(Util.LS); sb.append("### END SKETCH SUMMARY").append(Util.LS); } return sb.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy