All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.quantiles.DoublesAuxiliary Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.quantiles;

import static java.lang.System.arraycopy;
import static org.apache.datasketches.quantiles.DoublesSketchAccessor.BB_LVL_IDX;
import static org.apache.datasketches.quantiles.Util.checkFractionalRankBounds;

import java.util.Arrays;

import org.apache.datasketches.QuantilesHelper;

/**
 * Auxiliary data structure for answering quantile queries
 *
 * @author Kevin Lang
 * @author Lee Rhodes
 */
final class DoublesAuxiliary {
  long auxN_;
  double[] auxSamplesArr_; //array of size samples
  long[] auxCumWtsArr_;

  /**
   * Constructs the Auxiliary structure from the DoublesSketch
   * @param qs a DoublesSketch
   */
  DoublesAuxiliary(final DoublesSketch qs ) {
    final int k = qs.getK();
    final long n = qs.getN();
    final long bitPattern = qs.getBitPattern();
    final int numSamples = qs.getRetainedItems();
    final DoublesSketchAccessor sketchAccessor = DoublesSketchAccessor.wrap(qs);

    final double[] itemsArr = new double[numSamples];
    final long[] cumWtsArr = new long[numSamples + 1]; // the extra slot is very important

    // Populate from DoublesSketch:
    //  copy over the "levels" and then the base buffer, all with appropriate weights
    populateFromDoublesSketch(k, n, bitPattern, sketchAccessor, itemsArr, cumWtsArr);

    // Sort the first "numSamples" slots of the two arrays in tandem,
    //  taking advantage of the already sorted blocks of length k
    blockyTandemMergeSort(itemsArr, cumWtsArr, numSamples, k);

    final long total = QuantilesHelper.convertToPrecedingCummulative(cumWtsArr);
    assert total == n;

    auxN_ = n;
    auxSamplesArr_ = itemsArr;
    auxCumWtsArr_ = cumWtsArr;
  }

  /**
   * Get the estimated quantile given a fractional rank.
   * @param fRank the fractional rank where: 0 ≤ fRank ≤ 1.0.
   * @return the estimated quantile
   */
  double getQuantile(final double fRank) {
    checkFractionalRankBounds(fRank);
    final long pos = QuantilesHelper.posOfPhi(fRank, auxN_);
    return approximatelyAnswerPositionalQuery(pos);
  }

  /**
   * Assuming that there are n items in the true stream, this asks what
   * item would appear in position 0 ≤ pos < n of a hypothetical sorted
   * version of that stream.
   *
   * 

Note that since that since the true stream is unavailable, * we don't actually answer the question for that stream, but rather for * a different stream of the same length, that could hypothetically * be reconstructed from the weighted samples in our sketch. * @param pos position * @return approximate answer */ private double approximatelyAnswerPositionalQuery(final long pos) { assert 0 <= pos; assert pos < auxN_; final int index = QuantilesHelper.chunkContainingPos(auxCumWtsArr_, pos); return auxSamplesArr_[index]; } /** * Populate the arrays and registers from a DoublesSketch * @param k K value of sketch * @param n The current size of the stream * @param bitPattern the bit pattern for valid log levels * @param sketchAccessor A DoublesSketchAccessor around the sketch * @param itemsArr the consolidated array of all items from the sketch populated here * @param cumWtsArr the cumulative weights for each item from the sketch populated here */ private final static void populateFromDoublesSketch( final int k, final long n, final long bitPattern, final DoublesSketchAccessor sketchAccessor, final double[] itemsArr, final long[] cumWtsArr) { long weight = 1; int nxt = 0; long bits = bitPattern; assert bits == (n / (2L * k)); // internal consistency check for (int lvl = 0; bits != 0L; lvl++, bits >>>= 1) { weight *= 2; if ((bits & 1L) > 0L) { sketchAccessor.setLevel(lvl); for (int i = 0; i < sketchAccessor.numItems(); i++) { itemsArr[nxt] = sketchAccessor.get(i); cumWtsArr[nxt] = weight; nxt++; } } } weight = 1; //NOT a mistake! We just copied the highest level; now we need to copy the base buffer final int startOfBaseBufferBlock = nxt; // Copy BaseBuffer over, along with weight = 1 sketchAccessor.setLevel(BB_LVL_IDX); for (int i = 0; i < sketchAccessor.numItems(); i++) { itemsArr[nxt] = sketchAccessor.get(i); cumWtsArr[nxt] = weight; nxt++; } assert nxt == itemsArr.length; // Must sort the items that came from the base buffer. // Don't need to sort the corresponding weights because they are all the same. final int numSamples = nxt; Arrays.sort(itemsArr, startOfBaseBufferBlock, numSamples); cumWtsArr[numSamples] = 0; } /** * blockyTandemMergeSort() is an implementation of top-down merge sort specialized * for the case where the input contains successive equal-length blocks * that have already been sorted, so that only the top part of the * merge tree remains to be executed. Also, two arrays are sorted in tandem, * as discussed below. * @param keyArr array of keys * @param valArr array of values * @param arrLen length of keyArr and valArr * @param blkSize size of internal sorted blocks */ //used by DoublesAuxiliary and UtilTest static void blockyTandemMergeSort(final double[] keyArr, final long[] valArr, final int arrLen, final int blkSize) { assert blkSize >= 1; if (arrLen <= blkSize) { return; } int numblks = arrLen / blkSize; if ((numblks * blkSize) < arrLen) { numblks += 1; } assert ((numblks * blkSize) >= arrLen); // duplicate the input is preparation for the "ping-pong" copy reduction strategy. final double[] keyTmp = Arrays.copyOf(keyArr, arrLen); final long[] valTmp = Arrays.copyOf(valArr, arrLen); blockyTandemMergeSortRecursion(keyTmp, valTmp, keyArr, valArr, 0, numblks, blkSize, arrLen); } /** * blockyTandemMergeSortRecursion() is called by blockyTandemMergeSort(). * In addition to performing the algorithm's top down recursion, * it manages the buffer swapping that eliminates most copying. * It also maps the input's pre-sorted blocks into the subarrays * that are processed by tandemMerge(). * @param keySrc key source * @param valSrc value source * @param keyDst key destination * @param valDst value destination * @param grpStart group start, refers to pre-sorted blocks such as block 0, block 1, etc. * @param grpLen group length, refers to pre-sorted blocks such as block 0, block 1, etc. * @param blkSize block size * @param arrLim array limit */ private static void blockyTandemMergeSortRecursion(final double[] keySrc, final long[] valSrc, final double[] keyDst, final long[] valDst, final int grpStart, final int grpLen, /* indices of blocks */ final int blkSize, final int arrLim) { // Important note: grpStart and grpLen do NOT refer to positions in the underlying array. // Instead, they refer to the pre-sorted blocks, such as block 0, block 1, etc. assert (grpLen > 0); if (grpLen == 1) { return; } final int grpLen1 = grpLen / 2; final int grpLen2 = grpLen - grpLen1; assert (grpLen1 >= 1); assert (grpLen2 >= grpLen1); final int grpStart1 = grpStart; final int grpStart2 = grpStart + grpLen1; //swap roles of src and dst blockyTandemMergeSortRecursion(keyDst, valDst, keySrc, valSrc, grpStart1, grpLen1, blkSize, arrLim); //swap roles of src and dst blockyTandemMergeSortRecursion(keyDst, valDst, keySrc, valSrc, grpStart2, grpLen2, blkSize, arrLim); // here we convert indices of blocks into positions in the underlying array. final int arrStart1 = grpStart1 * blkSize; final int arrStart2 = grpStart2 * blkSize; final int arrLen1 = grpLen1 * blkSize; int arrLen2 = grpLen2 * blkSize; // special case for the final block which might be shorter than blkSize. if ((arrStart2 + arrLen2) > arrLim) { arrLen2 = arrLim - arrStart2; } tandemMerge(keySrc, valSrc, arrStart1, arrLen1, arrStart2, arrLen2, keyDst, valDst, arrStart1); // which will be arrStart3 } /** * Performs two merges in tandem. One of them provides the sort keys * while the other one passively undergoes the same data motion. * @param keySrc key source * @param valSrc value source * @param arrStart1 Array 1 start offset * @param arrLen1 Array 1 length * @param arrStart2 Array 2 start offset * @param arrLen2 Array 2 length * @param keyDst key destination * @param valDst value destination * @param arrStart3 Array 3 start offset */ private static void tandemMerge(final double[] keySrc, final long[] valSrc, final int arrStart1, final int arrLen1, final int arrStart2, final int arrLen2, final double[] keyDst, final long[] valDst, final int arrStart3) { final int arrStop1 = arrStart1 + arrLen1; final int arrStop2 = arrStart2 + arrLen2; int i1 = arrStart1; int i2 = arrStart2; int i3 = arrStart3; while ((i1 < arrStop1) && (i2 < arrStop2)) { if (keySrc[i2] < keySrc[i1]) { keyDst[i3] = keySrc[i2]; valDst[i3] = valSrc[i2]; i2++; } else { keyDst[i3] = keySrc[i1]; valDst[i3] = valSrc[i1]; i1++; } i3++; } if (i1 < arrStop1) { arraycopy(keySrc, i1, keyDst, i3, arrStop1 - i1); arraycopy(valSrc, i1, valDst, i3, arrStop1 - i1); } else { assert i2 < arrStop2; arraycopy(keySrc, i2, keyDst, i3, arrStop2 - i2); arraycopy(valSrc, i2, valDst, i3, arrStop2 - i2); } } } // end of class Auxiliary





© 2015 - 2025 Weber Informatics LLC | Privacy Policy