org.apache.solr.util.hll.HLLUtil Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.util.hll;

/** Static functions for computing constants and parameters used in the HLL algorithm. */
final class HLLUtil {
  /**
   * Precomputed pwMaxMask values indexed by registerSizeInBits.
   * Calculated with this formula:
   *
   * {@code
   * int maxRegisterValue = (1 << registerSizeInBits) - 1;
   * // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
   * return ~((1L << (maxRegisterValue - 1)) - 1);
   * }
   *
   * @see #pwMaxMask(int)
   */
  @SuppressWarnings("BadShiftAmount")
  private static final long[] PW_MASK = {
    ~((1L << (((1 << 0) - 1) - 1)) - 1),
    ~((1L << (((1 << 1) - 1) - 1)) - 1),
    ~((1L << (((1 << 2) - 1) - 1)) - 1),
    ~((1L << (((1 << 3) - 1) - 1)) - 1),
    ~((1L << (((1 << 4) - 1) - 1)) - 1),
    ~((1L << (((1 << 5) - 1) - 1)) - 1),
    ~((1L << (((1 << 6) - 1) - 1)) - 1),
    ~((1L << (((1 << 7) - 1) - 1)) - 1),
    ~((1L << (((1 << 8) - 1) - 1)) - 1)
  };

  /**
   * Precomputed twoToL values indexed by a linear combination of regWidth
   * and log2m.
   *
   * The array is one-dimensional and can be accessed by using index 
   * (REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m for regWidth and 
   * log2m between the specified HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM
   * constants.
   *
   * 
See: Blog
   * post with section on 2^L
   *
   * @see #largeEstimator(int, int, double)
   * @see #largeEstimatorCutoff(int, int)
   */
  private static final double[] TWO_TO_L =
      new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];

  /** Spacing constant used to compute offsets into {@link #TWO_TO_L}. */
  private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;

  static {
    for (int regWidth = HLL.MINIMUM_REGWIDTH_PARAM;
        regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM;
        regWidth++) {
      for (int log2m = HLL.MINIMUM_LOG2M_PARAM; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
        int maxRegisterValue = (1 << regWidth) - 1;

        // Since 1 is added to p(w) in the insertion algorithm, only
        // (maxRegisterValue - 1) bits are inspected hence the hash
        // space is one power of two smaller.
        final int pwBits = (maxRegisterValue - 1);
        final int totalBits = (pwBits + log2m);
        final double twoToL = Math.pow(2, totalBits);
        TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
      }
    }
  }

  // ************************************************************************
  /**
   * Computes the bit-width of HLL registers necessary to estimate a set of the specified
   * cardinality.
   *
   * @param expectedUniqueElements an upper bound on the number of unique elements that are
   *     expected. This must be greater than zero.
   * @return a register size in bits (i.e. log2(log2(n)))
   */
  public static int registerBitSize(final long expectedUniqueElements) {
    return Math.max(
        HLL.MINIMUM_REGWIDTH_PARAM,
        (int) Math.ceil(NumberUtil.log2(NumberUtil.log2((double) expectedUniqueElements))));
  }

  // ========================================================================
  /**
   * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
   *
   * @param m this must be a power of two, cannot be less than 16 (2⁴), and cannot be
   *     greater than 65536 (2¹⁶).
   * @return gamma times registerCount squared where gamma is based on the value of
   *     registerCount.
   * @throws IllegalArgumentException if registerCount is less than 16.
   */
  public static double alphaMSquared(final int m) {
    switch (m) {
      case 1 /*2^0*/:
      case 2 /*2^1*/:
      case 4 /*2^2*/:
      case 8 /*2^3*/:
        throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");

      case 16 /*2^4*/:
        return 0.673 * m * m;

      case 32 /*2^5*/:
        return 0.697 * m * m;

      case 64 /*2^6*/:
        return 0.709 * m * m;

      default /*>2^6*/:
        return (0.7213 / (1.0 + 1.079 / m)) * m * m;
    }
  }

  // ========================================================================
  /**
   * Computes a mask that prevents overflow of HyperLogLog registers.
   *
   * @param registerSizeInBits the size of the HLL registers, in bits.
   * @return mask a long mask to prevent overflow of the registers
   * @see #registerBitSize(long)
   */
  public static long pwMaxMask(final int registerSizeInBits) {
    return PW_MASK[registerSizeInBits];
  }

  // ========================================================================
  /**
   * The cutoff for using the "small range correction" formula, in the HyperLogLog algorithm.
   *
   * @param m the number of registers in the HLL. m in the paper.
   * @return the cutoff for the small range correction.
   * @see #smallEstimator(int, int)
   */
  public static double smallEstimatorCutoff(final int m) {
    return ((double) m * 5) / 2;
  }

  /**
   * The "small range correction" formula from the HyperLogLog algorithm. Only appropriate if both
   * the estimator is smaller than
   *
   * 
(5/2) * m

   *
   * and there are still registers that have the zero value.
   *
   * @param m the number of registers in the HLL. m in the paper.
   * @param numberOfZeroes the number of registers with value zero. V in the paper.
   * @return a corrected cardinality estimate.
   */
  public static double smallEstimator(final int m, final int numberOfZeroes) {
    return m * Math.log((double) m / numberOfZeroes);
  }

  /**
   * The cutoff for using the "large range correction" formula, from the HyperLogLog algorithm,
   * adapted for 64 bit hashes.
   *
   * See: Blog
   * post with section on 64 bit hashes and 'large range correction' cutoff
   *
   * @param log2m log-base-2 of the number of registers in the HLL. b in the paper.
   * @param registerSizeInBits the size of the HLL registers, in bits.
   * @return the cutoff for the large range correction.
   * @see #largeEstimator(int, int, double)
   */
  public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
    return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
  }

  /**
   * The "large range correction" formula from the HyperLogLog algorithm, adapted for 64 bit hashes.
   * Only appropriate for estimators whose value exceeds the return of {@link
   * #largeEstimatorCutoff(int, int)}.
   *
   * See: Blog
   * post with section on 64 bit hashes and 'large range correction'
   *
   * @param log2m log-base-2 of the number of registers in the HLL. b in the paper.
   * @param registerSizeInBits the size of the HLL registers, in bits.
   * @param estimator the original estimator ("E" in the paper).
   * @return a corrected cardinality estimate.
   */
  public static double largeEstimator(
      final int log2m, final int registerSizeInBits, final double estimator) {
    final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
    return -1 * twoToL * Math.log(1.0 - (estimator / twoToL));
  }
}