All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.hll.HllSketch Maven / Gradle / Ivy

There is a newer version: 0.13.4
Show newest version
/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.hll;

import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
import static com.yahoo.sketches.hash.MurmurHash3.hash;
import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Top-level class for the HLL family of sketches.
 * Use the HllSketchBuilder to construct this class.
 *
 * @author Kevin Lang
 */
public class HllSketch {
  private static final double HLL_REL_ERROR_NUMER = 1.04;

  /**
   * Returns an HllSketchBuilder
   * @return an HllSketchBuilder
   */
  public static HllSketchBuilder builder() {
    return new HllSketchBuilder();
  }

  private Fields.UpdateCallback updateCallback;
  private final Preamble preamble;

  private Fields fields;

  /**
   * Construct this class with the given Fields
   * @param fields the given Fields
   */
  public HllSketch(Fields fields) {
    this.fields = fields;
    this.updateCallback = new Fields.UpdateCallback() {
      @Override
      public void bucketUpdated(int bucket, byte oldVal, byte newVal) {
        //intentionally empty
      }
    };
    this.preamble = fields.getPreamble();
  }

  /**
   * Present this sketch with a long.
   *
   * @param datum The given long datum.
   */
  public void update(long datum) {
    long[] data = { datum };
    updateWithHash(hash(data, DEFAULT_UPDATE_SEED));
  }

  /**
   * Present this sketch with the given double (or float) datum.
   * The double will be converted to a long using Double.doubleToLongBits(datum),
   * which normalizes all NaN values to a single NaN representation.
   * Plus and minus zero will be normalized to plus zero.
   * The special floating-point values NaN and +/- Infinity are treated as distinct.
   *
   * @param datum The given double datum.
   */
  public void update(double datum) {
    double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0
    long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN forms
    updateWithHash(hash(data, DEFAULT_UPDATE_SEED));
  }

  /**
   * Present this sketch with the given String.
   * The string is converted to a byte array using UTF8 encoding.
   * If the string is null or empty no update attempt is made and the method returns.
   *
   * @param datum The given String.
   */
  public void update(String datum) {
    if (datum == null || datum.isEmpty()) {
      return;
    }
    byte[] data = datum.getBytes(UTF_8);
    updateWithHash(hash(data, DEFAULT_UPDATE_SEED));
  }

  /**
   * Present this sketch with the given byte array.
   * If the byte array is null or empty no update attempt is made and the method returns.
   *
   * @param data The given byte array.
   */
  public void update(byte[] data) {
    if ((data == null) || (data.length == 0)) {
      return;
    }
    updateWithHash(hash(data, DEFAULT_UPDATE_SEED));
  }

  /**
   * Present this sketch with the given integer array.
   * If the integer array is null or empty no update attempt is made and the method returns.
   *
   * @param data The given int array.
   */
  public void update(int[] data) {
    if ((data == null) || (data.length == 0)) {
      return;
    }
    updateWithHash(hash(data, DEFAULT_UPDATE_SEED));
  }

  /**
   * Present this sketch with the given long array.
   * If the long array is null or empty no update attempt is made and the method returns.
   *
   * @param data The given long array.
   */
  public void update(long[] data) {
    if ((data == null) || (data.length == 0)) {
      return;
    }
    updateWithHash(hash(data, DEFAULT_UPDATE_SEED));
  }

  /**
   * Gets the unique count estimate.
   * @return the sketch's best estimate of the cardinality of the input stream.
   */
  public double getEstimate() {
    double rawEst = getRawEstimate();
    int logK = preamble.getLogConfigK();

    double[] x_arr = Interpolation.interpolation_x_arrs[logK - Interpolation.INTERPOLATION_MIN_LOG_K];
    double[] y_arr = Interpolation.interpolation_y_arrs[logK - Interpolation.INTERPOLATION_MIN_LOG_K];

    if (rawEst < x_arr[0]) {
      return 0;
    }
    if (rawEst > x_arr[x_arr.length - 1]) {
      return rawEst;
    }

    double adjEst = Interpolation.cubicInterpolateUsingTable(x_arr, y_arr, rawEst);
    int configK = preamble.getConfigK();

    if (adjEst > 3.0 * configK) {
      return adjEst;
    }

    double linEst = getLinearEstimate();
    double avgEst = (adjEst + linEst) / 2.0;

    // The following constant 0.64 comes from empirical measurements (see below) of the crossover
    //   point between the average error of the linear estimator and the adjusted hll estimator
    if (avgEst > 0.64 * configK) {
      return adjEst;
    }
    return linEst;
  }

  /**
   * Gets the upper bound with respect to the Estimate
   * @param numStdDevs the number of standard deviations from the Estimate
   * @return the upper bound
   */
  public double getUpperBound(double numStdDevs) {
    return getEstimate() / (1.0 - eps(numStdDevs));
  }

  /**
   * Gets the lower bound with respect to the Estimate
   * @param numStdDevs the number of standard deviations from the Estimate
   * @return the lower bound
   */
  public double getLowerBound(double numStdDevs) {
    double lowerBound = getEstimate() / (1.0 + eps(numStdDevs));
    double numNonZeros = preamble.getConfigK();
    numNonZeros -= numBucketsAtZero();
    if (lowerBound < numNonZeros) {
      return numNonZeros;
    }
    return lowerBound;
  }

  private double getRawEstimate() {
    int numBuckets = preamble.getConfigK();
    double correctionFactor = 0.7213 / (1.0 + 1.079 / numBuckets);
    correctionFactor *= numBuckets * numBuckets;
    correctionFactor /= inversePowerOf2Sum();
    return correctionFactor;
  }

  private double getLinearEstimate() {
    int configK = preamble.getConfigK();
    long longV = numBucketsAtZero();
    if (longV == 0) {
      return configK * Math.log(configK / 0.5);
    }
    return (configK * (HarmonicNumbers.harmonicNumber(configK) - HarmonicNumbers.harmonicNumber(longV)));
  }

  /**
   * Union this sketch with that one
   * @param that the other sketch
   * @return this sketch
   */
  public HllSketch union(HllSketch that) {
    fields = that.fields.unionInto(fields, updateCallback);
    return this;
  }

  private void updateWithHash(long[] hash) {
    byte newValue = (byte) (Long.numberOfLeadingZeros(hash[1]) + 1);
    int slotno = (int) hash[0] & (preamble.getConfigK() - 1);
    fields = fields.updateBucket(slotno, newValue, updateCallback);
  }

  private double eps(double numStdDevs) {
    return numStdDevs * HLL_REL_ERROR_NUMER / Math.sqrt(preamble.getConfigK());
  }

  /**
   * Serializes this sketch to a byte array
   * @return this sketch as a byte array
   */
  public byte[] toByteArray() {
    int numBytes = (preamble.getPreambleLongs() << 3) + fields.numBytesToSerialize();
    byte[] retVal = new byte[numBytes];

    fields.intoByteArray(retVal, preamble.intoByteArray(retVal, 0));

    return retVal;
  }

  /**
   * Serializes this sketch without the Preamble as a byte array
   * @return this sketch without the Preamble as a byte array
   */
  public byte[] toByteArrayNoPreamble() {
    byte[] retVal = new byte[fields.numBytesToSerialize()];
    fields.intoByteArray(retVal, 0);
    return retVal;
  }

  /**
   * Returns this sketch in compact form
   * @return this sketch in compact form
   */
  public HllSketch asCompact() {
    return new HllSketch(fields.toCompact());
  }

  /**
   * Returns the number of configured buckets (k)
   * @return the number of configured buckets (k)
   */
  public int numBuckets() {
    return preamble.getConfigK();
  }

  /**
   * Gets the Preamble of this sketch
   * @return the Preamble of this sketch
   */
  public Preamble getPreamble() {
    return preamble;
  }

  /**
   * Set the update callback. It is final so that it can not be overridden.
   *
   * @param updateCallback the update callback for the HllSketch to use when talking with its Fields
   */
  protected final void setUpdateCallback(Fields.UpdateCallback updateCallback) {
    this.updateCallback = updateCallback;
  }

  //Helper methods that are potential extension points for children

  /**
   * The sum of the inverse powers of 2
   *
   * @return the sum of the inverse powers of 2
   */
  protected double inversePowerOf2Sum() {
    return HllUtils.computeInvPow2Sum(numBuckets(), fields.getBucketIterator());
  }

  protected int numBucketsAtZero() {
    int retVal = 0;
    int count = 0;

    BucketIterator bucketIter = fields.getBucketIterator();
    while (bucketIter.next()) {
      if (bucketIter.getValue() == 0) {
        ++retVal;
      }
      ++count;
    }

    // All skipped buckets are 0.
    retVal += fields.getPreamble().getConfigK() - count;

    return retVal;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy