All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.sampling.ReservoirEntropy Maven / Gradle / Ivy

The newest version!
package com.yahoo.sketches.sampling;

import java.util.ArrayList;
import java.util.List;

import com.yahoo.sketches.Util;

//CHECKSTYLE.OFF: JavadocMethod
//CHECKSTYLE.OFF: WhitespaceAround
public class ReservoirEntropy {

  public static void main(String[] args) {
    //largeSketchEntropy();
    sketchEntropy();
    unionEntropy();
    mismatchedKEntropy();
  }

  /**
   * Computes entropy of distribution over a single sketch
   */
  static void sketchEntropy() {
    final int numIter = 100000;
    final int k = 64;
    final int valueRange = k * k;
    SamplingConfig sc = new SamplingConfig(numIter, 1, k, valueRange);
    int[] histogram = runExperiment(sc);

    System.out.println("sketch entropy result:");
    System.out.println(printStats(histogram, sc));
  }

  /**
   * Computes entropy of distribution over a single large sketch
   */
  static void largeSketchEntropy() {
    final int numIter = 20000;
    final int k = 1 << 20;
    final int valueRange = k << 3;
    SamplingConfig sc = new SamplingConfig(numIter, 1, k, valueRange);
    int[] histogram = runExperiment(sc);

    System.out.println("large sketch entropy result:");
    System.out.println(printStats(histogram, sc));
  }

  /**
   * Computes entropy of distribution over the union of multiple sketches.
   * If we have a valid random sample per-sketch from sketchEntropy(), this will run faster by
   * using sketches with exactly k input values as the union input.
   */
  static void unionEntropy() {
    final int numIter = 100000;
    final int numSketches = 13;
    final int k = 100;
    SamplingConfig sc = new SamplingConfig(numIter, numSketches, k, k);
    int[] histogram = runExperiment(sc);

    System.out.println("union entropy result:");
    System.out.println(printStats(histogram, sc));
  }

  static void mismatchedKEntropy() {
    final int numIter = 100000;
    final int numSketches = 2;
    final int[] k = {128, 1024};
    final int[] valueRange = {8192, 1024};
    SamplingConfig sc = new SamplingConfig(numIter, numSketches, k, valueRange);
    int[] histogram = runExperiment(sc);

    System.out.println("mismatched k entropy result:");
    System.out.println(printStats(histogram, sc));
  }

  static int[] runExperiment(SamplingConfig sc) {
    int[] hist = new int[sc.getCumulativeRange()];

    for (int i = 0; i < sc.getNumIters(); ++i) {
      //if (i > 0 && i % 100 == 0) { System.err.println("Iter " + i); }
      List> sketchList = generateSketches(sc);

      Integer[] out = unionSketchList(sketchList, i, sc);

      for (int key : out) {
        ++hist[key];
      }
    }

    return hist;
  }

  // Creates a list of sketches with non-overlapping value ranges.
  static List> generateSketches(final SamplingConfig sc) {
    List> sketchList = new ArrayList<>(sc.getNumSketches());

    int idx = 0;
    for (int i = 0; i < sc.getNumSketches(); ++i) {
      int k = sc.hasMultipleK() ? sc.getKArray()[i] : sc.getK();
      int rangeMax = sc.getRangeSize(sc.hasMultipleK() ? i : 0);

      ReservoirItemsSketch ris = ReservoirItemsSketch.getInstance(k);
      for (int j = 0; j < rangeMax; ++j) {
        ris.update(idx++);
      }

      sketchList.add(ris);
    }

    return sketchList;
  }

  static String printStats(final int[] histogram, final SamplingConfig sc) {
    int min = Integer.MAX_VALUE;
    int max = Integer.MIN_VALUE;
    long outputCount = 0;

    for (int i = 0; i < histogram.length; ++i) {
      int val = histogram[i];
      //System.out.printf("[%d]: %d\n", i, histogram[i]);
      if (val < min) {
        min = val;
      }
      if (val > max) {
        max = val;
      }
      outputCount += val;
    }

    return "H      = " + computeEntropy(outputCount, histogram) + Util.LS
            + "Theo H = " + Math.log(countPossibleValues(sc)) / Math.log(2.0) + Util.LS
            + "min    = " + min + Util.LS
            + "max    = " + max + Util.LS;
  }

  static double computeEntropy(final long denom, final int[] data) {
    double H = 0.0;
    final double scaleFactor = 1.0 / denom;
    final double INV_LN_2 = 1.0 / Math.log(2.0);

    for (int count : data) {
      double p = count * scaleFactor;
      H -= p * Math.log(p) * INV_LN_2;
    }

    return H;
  }

  /**
   * If multiple values of k, uses the max to instantiate the union
   *
   * @param sketches List of sketches to use
   * @param stIdx    Starting index in the list, to allow round-robin ordering (still
   *                 deterministic, but not 100% fixed)
   * @param sc       SamplingConfig object to use
   * @param       Type of item in the sketches
   * @return Array of samples selected by the union
   */
  static  T[] unionSketchList(final List> sketches,
                                 final int stIdx,
                                 final SamplingConfig sc) {
    ReservoirItemsUnion riu = ReservoirItemsUnion.getInstance(sc.getMaxK());

    for (int i = 0; i < sketches.size(); ++i) {
      int sketchIdx = (stIdx + i) % sc.getNumSketches();
      riu.update(sketches.get(sketchIdx));
    }

    return riu.getResult().getSamples();
  }

  // Sum of all ranges
  static int countPossibleValues(final SamplingConfig sc) {
    if (sc.getRangeSizeArray().length > 1) {
      int total = 0;
      for (int val : sc.getRangeSizeArray()) {
        total += val;
      }
      return total;
    }

    return sc.getRangeSize() * sc.getNumSketches();
  }

  static Integer[] simpleUnion(final int k) {
    ReservoirItemsSketch rls1 = ReservoirItemsSketch.getInstance(k);
    ReservoirItemsSketch rls2 = ReservoirItemsSketch.getInstance(k);

    for (int i = 0; i < 10 * k; ++i) {
      rls1.update(i);
      rls2.update(k * k + i);
    }

    ReservoirItemsUnion rlu = ReservoirItemsUnion.getInstance(k);
    rlu.update(rls1);
    rlu.update(rls2);

    return rlu.getResult().getSamples();
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy