/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.quantiles;

import static com.yahoo.sketches.quantiles.DoublesSketchAccessor.BB_LVL_IDX;
import static java.lang.System.arraycopy;

import java.util.Arrays;

/**
 * Auxiliary data structure for answering quantile queries
 *
 * @author Kevin Lang
 * @author Lee Rhodes
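 *
 * <p>Illustrative usage sketch (the variable {@code qs} below is assumed to be an
 * existing, populated DoublesSketch):
 * <pre>
 *   final DoublesAuxiliary aux = new DoublesAuxiliary(qs);
 *   final double median = aux.getQuantile(0.5); // approximate value at normalized rank 0.5
 * </pre>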
 */
final class DoublesAuxiliary {
  long auxN_;              // total stream length n from the sketch
  double[] auxSamplesArr_; // the numSamples retained items, sorted
  long[] auxCumWtsArr_;    // cumulative weights; length = numSamples + 1

  /**
   * Constructs the Auxiliary structure from the DoublesSketch
   * @param qs a DoublesSketch
   */
  DoublesAuxiliary(final DoublesSketch qs) {
    final int k = qs.getK();
    final long n = qs.getN();
    final long bitPattern = qs.getBitPattern();
    final int numSamples = qs.getRetainedItems();
    final DoublesSketchAccessor sketchAccessor = DoublesSketchAccessor.wrap(qs);

    final double[] itemsArr = new double[numSamples];
    final long[] cumWtsArr = new long[numSamples + 1]; // the extra slot eventually holds the grand total of the weights, n

    // Populate from DoublesSketch:
    //  copy over the "levels" and then the base buffer, all with appropriate weights
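    //  (each item copied from level lvl carries weight 2^(lvl+1); base-buffer items carry weight 1)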
    populateFromDoublesSketch(k, n, bitPattern, sketchAccessor, itemsArr, cumWtsArr);

    // Sort the first "numSamples" slots of the two arrays in tandem,
    //  taking advantage of the already sorted blocks of length k
    blockyTandemMergeSort(itemsArr, cumWtsArr, numSamples, k);

    // convert the item weights into totals of the weights preceding each item
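    //  (illustrative: weights {2, 2, 1, 1} plus the extra 0 slot become running totals {0, 2, 4, 5, 6})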
    long subtot = 0;
    for (int i = 0; i < numSamples + 1; i++) {
      final long newSubtot = subtot + cumWtsArr[i];
      cumWtsArr[i] = subtot;
      subtot = newSubtot;
    }

    assert subtot == n;

    auxN_ = n;
    auxSamplesArr_ = itemsArr;
    auxCumWtsArr_ = cumWtsArr;
  }

  /**
   * Get the estimated value given phi
   * @param phi the fractional position where: 0 ≤ φ ≤ 1.0.
   * @return the estimated value given phi
   */
  double getQuantile(final double phi) {
    assert 0.0 <= phi;
    assert phi <= 1.0;
    final long n = this.auxN_;
    if (n <= 0) { return Double.NaN; }
    final long pos = posOfPhi(phi, n);
    return approximatelyAnswerPositionalQuery(pos);
  }

  /**
   * Returns the zero-based index (position) of a value in the hypothetical sorted stream of
   * values of size n. Also used by ItemsAuxiliary.
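   * For example, with n = 11, phi = 0.5 yields floor(5.5) = 5, and phi = 1.0 yields 11,
   * which is clamped to n - 1 = 10.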
   * @param phi the fractional position where: 0 ≤ φ ≤ 1.0.
   * @param n the size of the stream
   * @return the index, a value between 0 and n-1.
   */
  static long posOfPhi(final double phi, final long n) { //don't tinker with this definition
    final long pos = (long) Math.floor(phi * n);
    return (pos == n) ? n - 1 : pos;
  }

  /**
   * Assuming that there are n items in the true stream, this asks what
   * item would appear in position 0 <= pos < n of a hypothetical sorted
   * version of that stream.
   *
   * Note that since the true stream is unavailable,
   * we don't actually answer the question for that stream, but rather for
   * a different stream of the same length, that could hypothetically
   * be reconstructed from the weighted samples in our sketch.
   * @param pos position
   * @return approximate answer
   */
  private double approximatelyAnswerPositionalQuery(final long pos) {
    assert 0 <= pos;
    assert pos < this.auxN_;
    final int index = chunkContainingPos(this.auxCumWtsArr_, pos);
    return this.auxSamplesArr_[index];
  }

  /**
   * Populate the arrays and registers from a DoublesSketch
   * @param k K value of sketch
   * @param n The current size of the stream
   * @param bitPattern the bit pattern for valid log levels
   * @param sketchAccessor A DoublesSketchAccessor around the sketch
   * @param itemsArr the consolidated array of all items from the sketch, populated here
   * @param cumWtsArr the cumulative weights for each item from the sketch, populated here
   */
  private static void populateFromDoublesSketch(
      final int k, final long n, final long bitPattern,
      final DoublesSketchAccessor sketchAccessor,
      final double[] itemsArr, final long[] cumWtsArr) {
    long weight = 1;
    int nxt = 0;
    long bits = bitPattern;
    assert bits == n / (2L * k); // internal consistency check
    for (int lvl = 0; bits != 0L; lvl++, bits >>>= 1) {
      weight *= 2;
      if ((bits & 1L) > 0L) {
        sketchAccessor.setLevel(lvl);
        for (int i = 0; i < sketchAccessor.numItems(); i++) {
          itemsArr[nxt] = sketchAccessor.get(i);
          cumWtsArr[nxt] = weight;
          nxt++;
        }
      }
    }

    weight = 1; // NOT a mistake! We just copied the highest level; now we need to copy the base buffer
    final int startOfBaseBufferBlock = nxt;

    // Copy BaseBuffer over, along with weight = 1
    sketchAccessor.setLevel(BB_LVL_IDX);
    for (int i = 0; i < sketchAccessor.numItems(); i++) {
      itemsArr[nxt] = sketchAccessor.get(i);
      cumWtsArr[nxt] = weight;
      nxt++;
    }
    assert nxt == itemsArr.length;

    // Must sort the items that came from the base buffer.
    // Don't need to sort the corresponding weights because they are all the same.
    final int numSamples = nxt;
    Arrays.sort(itemsArr, startOfBaseBufferBlock, numSamples);
    cumWtsArr[numSamples] = 0;
  }

  /**
   * This is written in terms of a plain array to facilitate testing.
   * Also used by ItemsAuxiliary.
   * @param arr the array of cumulative positions (weights) to search
   * @param pos the position
   * @return the index of the chunk containing the position
   */
  static int chunkContainingPos(final long[] arr, final long pos) {
    final int nominalLength = arr.length - 1; /* remember, arr contains an "extra" position */
    assert nominalLength > 0;
    final long n = arr[nominalLength];
    assert 0 <= pos;
    assert pos < n;
    final int l = 0;
    final int r = nominalLength;
    // the following three asserts should probably be retained since they ensure
    // that the necessary invariants hold at the beginning of the search
    assert l < r;
    assert arr[l] <= pos;
    assert pos < arr[r];
    return searchForChunkContainingPos(arr, pos, l, r);
  }

  // Let m_i denote the minimum position of the length = n "full" sorted sequence
  //   that is represented in slot i of the length = len "chunked" sorted sequence.
  //
  // Note that m_i is the same thing as auxCumWtsArr_[i].
  //
  // Then the answer to a positional query 0 <= q < n is l, where 0 <= l < len,
  //   A)  m_l <= q
  //   B)  q < m_r
  //   C)  l + 1 = r
  //
  // A) and B) provide the invariants for our binary search.
  // Observe that they are satisfied by the initial conditions: l = 0 and r = len.
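  // Worked example (illustrative): with cumulative weights arr = {0, 2, 4, 5, 6} (so n = 6),
  // chunkContainingPos() maps positions 0 and 1 to slot 0, positions 2 and 3 to slot 1,
  // position 4 to slot 2, and position 5 to slot 3, i.e., each sample is returned for
  // exactly as many positions as its original weight.
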
  private static int searchForChunkContainingPos(
      final long[] arr, final long pos, final int l, final int r) {
    // the following three asserts can probably go away eventually, since it is fairly clear
    // that if these invariants hold at the beginning of the search, they will be maintained
    assert l < r;
    assert arr[l] <= pos;
    assert pos < arr[r];
    if (l + 1 == r) {
      return l;
    }
    else {
      final int m = l + (r - l) / 2;
      if (arr[m] <= pos) {
        return searchForChunkContainingPos(arr, pos, m, r);
      }
      else {
        return searchForChunkContainingPos(arr, pos, l, m);
      }
    }
  }

  /**
   * blockyTandemMergeSort() is an implementation of top-down merge sort specialized
   * for the case where the input contains successive equal-length blocks
   * that have already been sorted, so that only the top part of the
   * merge tree remains to be executed. Also, two arrays are sorted in tandem,
   * as discussed below.
   * @param keyArr array of keys
   * @param valArr array of values
   * @param arrLen length of keyArr and valArr
   * @param blkSize size of internal sorted blocks
   */
  //used by DoublesAuxiliary and UtilTest
  static void blockyTandemMergeSort(final double[] keyArr, final long[] valArr, final int arrLen,
      final int blkSize) {
    assert blkSize >= 1;
    if (arrLen <= blkSize) { return; }
    int numblks = arrLen / blkSize;
    if (numblks * blkSize < arrLen) { numblks += 1; }
    assert (numblks * blkSize >= arrLen);

    // duplicate the input in preparation for the "ping-pong" copy reduction strategy.
    final double[] keyTmp = Arrays.copyOf(keyArr, arrLen);
    final long[] valTmp = Arrays.copyOf(valArr, arrLen);

    blockyTandemMergeSortRecursion(keyTmp, valTmp,
        keyArr, valArr,
        0, numblks,
        blkSize, arrLen);
  }

  /**
   * blockyTandemMergeSortRecursion() is called by blockyTandemMergeSort().
   * In addition to performing the algorithm's top down recursion,
   * it manages the buffer swapping that eliminates most copying.
   * It also maps the input's pre-sorted blocks into the subarrays
   * that are processed by tandemMerge().
   * @param keySrc key source
   * @param valSrc value source
   * @param keyDst key destination
   * @param valDst value destination
   * @param grpStart group start, refers to pre-sorted blocks such as block 0, block 1, etc.
   * @param grpLen group length, refers to pre-sorted blocks such as block 0, block 1, etc.
   * @param blkSize block size
   * @param arrLim array limit
   */
  private static void blockyTandemMergeSortRecursion(final double[] keySrc, final long[] valSrc,
      final double[] keyDst, final long[] valDst,
      final int grpStart, final int grpLen, /* indices of blocks */
      final int blkSize, final int arrLim) {
    // Important note: grpStart and grpLen do NOT refer to positions in the underlying array.
    // Instead, they refer to the pre-sorted blocks, such as block 0, block 1, etc.

    assert (grpLen > 0);
    if (grpLen == 1) { return; }
    final int grpLen1 = grpLen / 2;
    final int grpLen2 = grpLen - grpLen1;
    assert (grpLen1 >= 1);
    assert (grpLen2 >= grpLen1);

    final int grpStart1 = grpStart;
    final int grpStart2 = grpStart + grpLen1;

    //swap roles of src and dst
    blockyTandemMergeSortRecursion(keyDst, valDst,
        keySrc, valSrc,
        grpStart1, grpLen1,
        blkSize, arrLim);

    //swap roles of src and dst
    blockyTandemMergeSortRecursion(keyDst, valDst,
        keySrc, valSrc,
        grpStart2, grpLen2,
        blkSize, arrLim);

    // here we convert indices of blocks into positions in the underlying array.
    final int arrStart1 = grpStart1 * blkSize;
    final int arrStart2 = grpStart2 * blkSize;
    final int arrLen1 = grpLen1 * blkSize;
    int arrLen2 = grpLen2 * blkSize;

    // special case for the final block, which might be shorter than blkSize.
    if (arrStart2 + arrLen2 > arrLim) { arrLen2 = arrLim - arrStart2; }

    tandemMerge(keySrc, valSrc,
        arrStart1, arrLen1,
        arrStart2, arrLen2,
        keyDst, valDst,
        arrStart1); // which will be arrStart3
  }

  /**
   * Performs two merges in tandem. One of them provides the sort keys
   * while the other one passively undergoes the same data motion.
   * @param keySrc key source
   * @param valSrc value source
   * @param arrStart1 Array 1 start offset
   * @param arrLen1 Array 1 length
   * @param arrStart2 Array 2 start offset
   * @param arrLen2 Array 2 length
   * @param keyDst key destination
   * @param valDst value destination
   * @param arrStart3 Array 3 start offset
   */
  private static void tandemMerge(final double[] keySrc, final long[] valSrc,
      final int arrStart1, final int arrLen1,
      final int arrStart2, final int arrLen2,
      final double[] keyDst, final long[] valDst,
      final int arrStart3) {
    final int arrStop1 = arrStart1 + arrLen1;
    final int arrStop2 = arrStart2 + arrLen2;

    int i1 = arrStart1;
    int i2 = arrStart2;
    int i3 = arrStart3;
    while (i1 < arrStop1 && i2 < arrStop2) {
      if (keySrc[i2] < keySrc[i1]) {
        keyDst[i3] = keySrc[i2];
        valDst[i3] = valSrc[i2];
        i3++;
        i2++;
      } else {
        keyDst[i3] = keySrc[i1];
        valDst[i3] = valSrc[i1];
        i3++;
        i1++;
      }
    }

    if (i1 < arrStop1) {
      arraycopy(keySrc, i1, keyDst, i3, arrStop1 - i1);
      arraycopy(valSrc, i1, valDst, i3, arrStop1 - i1);
    } else {
      assert i2 < arrStop2;
      arraycopy(keySrc, i2, keyDst, i3, arrStop2 - i2);
      arraycopy(valSrc, i2, valDst, i3, arrStop2 - i2);
    }
  }

} // end of class DoublesAuxiliary




