All downloads are free. The search and download functionality uses the official Maven repository.

com.yahoo.sketches.quantiles.DoublesPmfCdfImpl Maven / Gradle / Ivy

/*
 * Copyright 2016, Yahoo! Inc. Licensed under the terms of the
 * Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.quantiles;

/**
 * The PMF and CDF algorithms for quantiles.
 *
 * @author Lee Rhodes
 * @author Kevin Lang
 */
class DoublesPmfCdfImpl {

  static double[] getPMFOrCDF(final DoublesSketch sketch, final double[] splitPoints,
      final boolean isCDF) {
    final long[] counters = internalBuildHistogram(sketch, splitPoints);
    final int numCounters = counters.length;
    final double[] result = new double[numCounters];
    final double n = sketch.getN();
    long subtotal = 0;
    if (isCDF) {
      for (int j = 0; j < numCounters; j++) {
        final long count = counters[j];
        subtotal += count;
        result[j] = subtotal / n; //normalize by n
      }
    } else { // PMF
      for (int j = 0; j < numCounters; j++) {
        final long count = counters[j];
        subtotal += count;
        result[j] = count / n; //normalize by n
      }
    }
    assert subtotal == n; //internal consistency check
    return result;
  }

  /**
   * Shared algorithm for both PMF and CDF functions. The splitPoints must be unique, monotonically
   * increasing values.
   * @param sketch the given quantiles DoublesSketch
   * @param splitPoints an array of m unique, monotonically increasing doubles
   * that divide the real number line into m+1 consecutive disjoint intervals.
   * @return the unnormalized, accumulated counts of m + 1 intervals.
   */
  private static long[] internalBuildHistogram(final DoublesSketch sketch, final double[] splitPoints) {
    final DoublesSketchAccessor sketchAccessor = DoublesSketchAccessor.wrap(sketch);
    Util.validateValues(splitPoints);

    final int numSplitPoints = splitPoints.length;
    final int numCounters = numSplitPoints + 1;
    final long[] counters = new long[numCounters];

    long weight = 1;
    sketchAccessor.setLevel(DoublesSketchAccessor.BB_LVL_IDX);
    if (numSplitPoints < 50) { // empirically determined crossover
      // sort not worth it when few split points
      DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(
              sketchAccessor, weight, splitPoints, counters);
    } else {
      sketchAccessor.sort();
      // sort is worth it when many split points
      DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(
              sketchAccessor, weight, splitPoints, counters);
    }

    long myBitPattern = sketch.getBitPattern();
    final int k = sketch.getK();
    assert myBitPattern == sketch.getN() / (2L * k); // internal consistency check
    for (int lvl = 0; myBitPattern != 0L; lvl++, myBitPattern >>>= 1) {
      weight += weight; // *= 2
      if ((myBitPattern & 1L) > 0L) { //valid level exists
        // the levels are already sorted so we can use the fast version
        sketchAccessor.setLevel(lvl);
        DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(
                sketchAccessor, weight, splitPoints, counters);
      }
    }
    return counters;

  }

  /**
   * Because of the nested loop, cost is O(numSamples * numSplitPoints), which is bilinear.
   * This method does NOT require the samples to be sorted.
   * @param samples DoublesBufferAccessor holding an array of samples
   * @param weight of the samples
   * @param splitPoints must be unique and sorted. Number of splitPoints + 1 == counters.length.
   * @param counters array of counters
   */
  static void bilinearTimeIncrementHistogramCounters(final DoublesBufferAccessor samples,
                                                     final long weight,
                                                     final double[] splitPoints,
                                                     final long[] counters) {
    assert (splitPoints.length + 1 == counters.length);
    for (int i = 0; i < samples.numItems(); i++) {
      final double sample = samples.get(i);
      int j;
      for (j = 0; j < splitPoints.length; j++) {
        final double splitpoint = splitPoints[j];
        if (sample < splitpoint) {
          break;
        }
      }
      assert j < counters.length;
      counters[j] += weight;
    }
  }


  /**
   * This one does a linear time simultaneous walk of the samples and splitPoints. Because this
   * internal procedure is called multiple times, we require the caller to ensure these 3 properties:
   * 
    *
  1. samples array must be sorted.
  2. *
  3. splitPoints must be unique and sorted
  4. *
  5. number of SplitPoints + 1 == counters.length
  6. *
* @param samples DoublesBufferAccessor holding an array of samples * @param weight of the samples * @param splitPoints must be unique and sorted. Number of splitPoints + 1 = counters.length. * @param counters array of counters */ static void linearTimeIncrementHistogramCounters(final DoublesBufferAccessor samples, final long weight, final double[] splitPoints, final long[] counters) { int i = 0; int j = 0; while (i < samples.numItems() && j < splitPoints.length) { if (samples.get(i) < splitPoints[j]) { counters[j] += weight; // this sample goes into this bucket i++; // move on to next sample and see whether it also goes into this bucket } else { j++; // no more samples for this bucket. move on the next bucket. } } // now either i == numSamples(we are out of samples), or // j == numSplitPoints(out of buckets, but there are more samples remaining) // we only need to do something in the latter case. if (j == splitPoints.length) { counters[j] += (weight * (samples.numItems() - i)); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy