
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.common.ndv.hll;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hive.common.util.Murmur3;

/**
 * 
 * This is an implementation of the following variants of the hyperloglog
 * (HLL) algorithm:
 * Original  - Original HLL algorithm from Flajolet et al. from
 *             http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
 * HLLNoBias - Google's implementation of bias correction based on a lookup table
 *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
 * HLL++     - Google's implementation of the HLL++ algorithm that uses SPARSE registers
 *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
 * 
 * The following constructor parameters determine which algorithm is
 * used:
 * numRegisterIndexBits - number of LSB hashcode bits to be used as register index.
 *                        Default is 14. min = 4 and max = 16
 * numHashBits - number of bits for hashcode. Default is 64. min = 32 and max = 128
 * encoding - Type of encoding to use (SPARSE or DENSE). The algorithm automatically
 *            switches to DENSE beyond a threshold. Default: SPARSE
 * enableBitPacking - Whether to enable bit packing. Bit packing improves compression
 *                    at the cost of more CPU cycles. Default: true
 * noBias - Use Google's bias table lookup for short range bias correction.
 *          Enabling this greatly improves the estimation accuracy for short
 *          range values. Default: true
 *
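 * A minimal usage sketch (illustrative only; the values added below are
 * arbitrary, and the builder settings shown are the documented defaults):
 *
 * <pre>
 * HyperLogLog hll = HyperLogLog.builder()
 *     .setNumRegisterIndexBits(14)
 *     .setEncoding(EncodingType.SPARSE)
 *     .enableBitPacking(true)
 *     .enableNoBias(true)
 *     .build();
 * hll.addLong(42L);
 * hll.addString("hello");
 * long ndv = hll.estimateNumDistinctValues();
 * </pre>
 *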
 */
public class HyperLogLog implements NumDistinctValueEstimator {
  private final static int DEFAULT_HASH_BITS = 64;
  private final static long HASH64_ZERO = Murmur3.hash64(new byte[] {0});
  private final static long HASH64_ONE = Murmur3.hash64(new byte[] {1});
  private final static ByteBuffer SHORT_BUFFER = ByteBuffer.allocate(Short.BYTES);
  private final static ByteBuffer INT_BUFFER = ByteBuffer.allocate(Integer.BYTES);
  private final static ByteBuffer LONG_BUFFER = ByteBuffer.allocate(Long.BYTES);

  public enum EncodingType {
    SPARSE, DENSE
  }

  // number of bits to address registers
  private final int p;

  // number of registers - 2^p
  private final int m;

  // refer to the paper
  private float alphaMM;

  // enable/disable bias correction using table lookup
  private final boolean noBias;

  // enable/disable bitpacking
  private final boolean bitPacking;

  // Not making it configurable for perf reasons (avoid checks)
  private final int chosenHashBits = DEFAULT_HASH_BITS;

  private HLLDenseRegister denseRegister;
  private HLLSparseRegister sparseRegister;

  // counts are cached to avoid repeated complex computation. If a register
  // value is updated, the count will be computed again.
  private long cachedCount;
  private boolean invalidateCount;

  private EncodingType encoding;

  // threshold to switch from SPARSE to DENSE encoding
  private int encodingSwitchThreshold;

  private HyperLogLog(HyperLogLogBuilder hllBuilder) {
    if (hllBuilder.numRegisterIndexBits < HLLConstants.MIN_P_VALUE
        || hllBuilder.numRegisterIndexBits > HLLConstants.MAX_P_VALUE) {
      throw new IllegalArgumentException("p value should be between " + HLLConstants.MIN_P_VALUE
          + " to " + HLLConstants.MAX_P_VALUE);
    }
    this.p = hllBuilder.numRegisterIndexBits;
    this.m = 1 << p;
    this.noBias = hllBuilder.noBias;
    this.bitPacking = hllBuilder.bitPacking;

    // the threshold should be less than 12K bytes for p = 14.
    // The reason to divide by 5 is that in sparse mode, after serialization,
    // the entries in the sparse map are compressed and delta encoded as
    // varints. The worst case size of a varint is 5 bytes. Hence,
    // 12K/5 ~= 2400 entries in the sparse map.
    if (bitPacking) {
      this.encodingSwitchThreshold = ((m * 6) / 8) / 5;
    } else {
      // if bitpacking is disabled, every register value takes 8 bits and
      // hence we can be more flexible with the threshold. For p=14,
      // 16K/5 = 3200 entries in the sparse map can be allowed.
      this.encodingSwitchThreshold = m / 3;
    }

    // initializeAlpha(DEFAULT_HASH_BITS);
    // the alphaMM value for the 128-bit hash seems to perform better for the
    // default 64 hash bits
    this.alphaMM = 0.7213f / (1 + 1.079f / m);
    // For efficiency alpha is multiplied by m^2
    this.alphaMM = this.alphaMM * m * m;

    this.cachedCount = -1;
    this.invalidateCount = false;
    this.encoding = hllBuilder.encoding;
    if (encoding.equals(EncodingType.SPARSE)) {
      this.sparseRegister = new HLLSparseRegister(p, HLLConstants.P_PRIME_VALUE,
          HLLConstants.Q_PRIME_VALUE);
      this.denseRegister = null;
    } else {
      this.sparseRegister = null;
      this.denseRegister = new HLLDenseRegister(p, bitPacking);
    }
  }
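
  // Worked example for the switch threshold above (illustrative, default
  // p = 14, so m = 1 << 14 = 16384; integer division throughout):
  //   bit packing on:  ((16384 * 6) / 8) / 5 = 12288 / 5 = 2457 entries
  //   bit packing off: 16384 / 3 = 5461 entries
  // Once the sparse map grows past this many entries, add() and merge()
  // promote the sketch to DENSE encoding.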

  public static HyperLogLogBuilder builder() {
    return new HyperLogLogBuilder();
  }

  public static class HyperLogLogBuilder {
    private int numRegisterIndexBits = 14;
    private EncodingType encoding = EncodingType.SPARSE;
    private boolean bitPacking = true;
    private boolean noBias = true;

    public HyperLogLogBuilder() {
    }

    public HyperLogLogBuilder setNumRegisterIndexBits(int b) {
      this.numRegisterIndexBits = b;
      return this;
    }

    public HyperLogLogBuilder setEncoding(EncodingType enc) {
      this.encoding = enc;
      return this;
    }

    public HyperLogLogBuilder enableBitPacking(boolean b) {
      this.bitPacking = b;
      return this;
    }

    public HyperLogLogBuilder enableNoBias(boolean nb) {
      this.noBias = nb;
      return this;
    }

    public HyperLogLog build() {
      return new HyperLogLog(this);
    }
  }

  // see the paper for alpha initialization.
  private void initializeAlpha(final int hashBits) {
    if (hashBits <= 16) {
      alphaMM = 0.673f;
    } else if (hashBits <= 32) {
      alphaMM = 0.697f;
    } else if (hashBits <= 64) {
      alphaMM = 0.709f;
    } else {
      alphaMM = 0.7213f / (float) (1 + 1.079f / m);
    }
    // For efficiency alpha is multiplied by m^2
    alphaMM = alphaMM * m * m;
  }

  public void addBoolean(boolean val) {
    add(val ? HASH64_ONE : HASH64_ZERO);
  }

  public void addByte(byte val) {
    add(Murmur3.hash64(new byte[] {val}));
  }

  public void addBytes(byte[] val) {
    add(Murmur3.hash64(val));
  }

  public void addShort(short val) {
    SHORT_BUFFER.putShort(0, val);
    add(Murmur3.hash64(SHORT_BUFFER.array()));
  }

  public void addInt(int val) {
    INT_BUFFER.putInt(0, val);
    add(Murmur3.hash64(INT_BUFFER.array()));
  }

  public void addLong(long val) {
    LONG_BUFFER.putLong(0, val);
    add(Murmur3.hash64(LONG_BUFFER.array()));
  }

  public void addFloat(float val) {
    INT_BUFFER.putFloat(0, val);
    add(Murmur3.hash64(INT_BUFFER.array()));
  }

  public void addDouble(double val) {
    LONG_BUFFER.putDouble(0, val);
    add(Murmur3.hash64(LONG_BUFFER.array()));
  }

  public void addChar(char val) {
    SHORT_BUFFER.putChar(0, val);
    add(Murmur3.hash64(SHORT_BUFFER.array()));
  }

  /**
   * Java's default charset will be used for strings.
   * @param val
   *          - input string
   */
  public void addString(String val) {
    add(Murmur3.hash64(val.getBytes()));
  }

  public void addString(String val, Charset charset) {
    add(Murmur3.hash64(val.getBytes(charset)));
  }

  public void add(long hashcode) {
    if (encoding.equals(EncodingType.SPARSE)) {
      if (sparseRegister.add(hashcode)) {
        invalidateCount = true;
      }

      // if the size of the sparse map exceeds the threshold, convert the
      // sparse map to a dense register and switch to DENSE encoding
      if (sparseRegister.getSize() > encodingSwitchThreshold) {
        encoding = EncodingType.DENSE;
        denseRegister = sparseToDenseRegister(sparseRegister);
        sparseRegister = null;
        invalidateCount = true;
      }
    } else {
      if (denseRegister.add(hashcode)) {
        invalidateCount = true;
      }
    }
  }

  public long estimateNumDistinctValues() {
    // FMSketch treats the ndv of all nulls as 1, but HLL treats it as 0.
    // To avoid a divide-by-zero problem downstream, we follow FMSketch.
    return count() > 0 ? count() : 1;
  }
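
  // Shape of the estimate computed by count() below (from the HLL paper):
  //   raw E = alphaMM / sum_{j=0..m-1} 2^(-register[j])
  // with a linear-counting fallback (m * ln(m / numZeros)) for small
  // cardinalities, and an expected relative standard error of roughly
  // 1.04 / sqrt(m), i.e. about 0.8% for the default p = 14 (m = 16384).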

  public long count() {
    // compute the count only if the register values were updated, else
    // return the cached count
    if (invalidateCount || cachedCount < 0) {
      if (encoding.equals(EncodingType.SPARSE)) {
        // if encoding is still SPARSE, use linear counting with increased
        // accuracy (as we use pPrime bits for the register index)
        int mPrime = 1 << sparseRegister.getPPrime();
        cachedCount = linearCount(mPrime, mPrime - sparseRegister.getSize());
      } else {
        // for DENSE encoding, use bias table lookup for the HLLNoBias
        // algorithm, else fall back to the HLLOriginal algorithm
        double sum = denseRegister.getSumInversePow2();
        long numZeros = denseRegister.getNumZeroes();

        // cardinality estimate from the normalized bias-corrected harmonic
        // mean of the registers
        cachedCount = (long) (alphaMM * (1.0 / sum));
        long pow = (long) Math.pow(2, chosenHashBits);

        // when bias correction is enabled
        if (noBias) {
          cachedCount = cachedCount <= 5 * m ? (cachedCount - estimateBias(cachedCount))
              : cachedCount;
          long h = cachedCount;
          if (numZeros != 0) {
            h = linearCount(m, numZeros);
          }

          if (h < getThreshold()) {
            cachedCount = h;
          }
        } else {
          // The HLL algorithm shows a stronger bias for values in the
          // (2.5 * m) range. To compensate for this short range bias,
          // linear counting is used for values below this range. The
          // original paper also reports a similar bias for long range
          // values, due to hash collisions, in the range >1/30*(2^32).
          // For the default case we do not have to worry about this long
          // range bias, as the paper used 32-bit hashing and we use 64-bit
          // hashing by default. 2^64 values are too many to observe long
          // range bias (hash collisions).
          if (cachedCount <= 2.5 * m) {
            // for the short range use linear counting
            if (numZeros != 0) {
              cachedCount = linearCount(m, numZeros);
            }
          } else if (chosenHashBits < 64 && cachedCount > (0.033333 * pow)) {
            // long range bias for 32-bit hashcodes
            if (cachedCount > (1.0 / 30.0) * pow) {
              cachedCount = (long) (-pow * Math.log(1.0 - (double) cachedCount / (double) pow));
            }
          }
        }
      }
      invalidateCount = false;
    }
    return cachedCount;
  }

  private long getThreshold() {
    return (long) (HLLConstants.thresholdData[p - 4] + 0.5);
  }

  /**
   * Estimate the bias using the lookup table.
   * @param count
   *          - cardinality before bias correction
   * @return the estimated bias, which the caller subtracts from the raw
   *         estimate
   */
  private long estimateBias(long count) {
    double[] rawEstForP = HLLConstants.rawEstimateData[p - 4];

    // compute the distances and store them in a sorted map
    TreeMap<Double, Integer> estIndexMap = new TreeMap<>();
    double distance = 0;
    for (int i = 0; i < rawEstForP.length; i++) {
      distance = Math.pow(count - rawEstForP[i], 2);
      estIndexMap.put(distance, i);
    }

    // take the top-k closest neighbors and compute the average bias
    long result = 0;
    double[] biasForP = HLLConstants.biasData[p - 4];
    double biasSum = 0;
    int kNeighbors = HLLConstants.K_NEAREST_NEIGHBOR;
    for (Map.Entry<Double, Integer> entry : estIndexMap.entrySet()) {
      biasSum += biasForP[entry.getValue()];
      kNeighbors--;
      if (kNeighbors <= 0) {
        break;
      }
    }

    // 0.5 added for rounding off
    result = (long) ((biasSum / HLLConstants.K_NEAREST_NEIGHBOR) + 0.5);
    return result;
  }

  public void setCount(long count) {
    this.cachedCount = count;
    this.invalidateCount = true;
  }

  private long linearCount(int mVal, long numZeros) {
    return (long) (Math.round(mVal * Math.log(mVal / ((double) numZeros))));
  }

  // refer to the paper
  public double getStandardError() {
    return 1.04 / Math.sqrt(m);
  }

  public HLLDenseRegister getHLLDenseRegister() {
    return denseRegister;
  }

  public HLLSparseRegister getHLLSparseRegister() {
    return sparseRegister;
  }
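
  // Layout assumed by setHLLSparseRegister() below: each serialized sparse
  // entry packs a register into a single int, with the register value in
  // the low 6 bits (mask 0x3f) and the register key in the bits above
  // HLLConstants.Q_PRIME_VALUE.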

  /**
   * Reconstruct the sparse map from a serialized integer list.
   * @param reg
   *          - uncompressed and delta decoded integer list
   */
  public void setHLLSparseRegister(int[] reg) {
    for (int i : reg) {
      int key = i >>> HLLConstants.Q_PRIME_VALUE;
      byte value = (byte) (i & 0x3f);
      sparseRegister.set(key, value);
    }
  }

  /**
   * Reconstruct the dense registers from a byte array.
   * @param reg
   *          - unpacked byte array
   */
  public void setHLLDenseRegister(byte[] reg) {
    int i = 0;
    for (byte b : reg) {
      denseRegister.set(i, b);
      i++;
    }
  }

  /**
   * Merge the specified hyperloglog into the current one. The encoding
   * switches automatically after the merge if the encoding switch threshold
   * is exceeded.
   * @param hll
   *          - hyperloglog to be merged
   * @throws IllegalArgumentException
   */
  public void merge(HyperLogLog hll) {
    if (p != hll.p || chosenHashBits != hll.chosenHashBits) {
      throw new IllegalArgumentException(
          "HyperLogLog cannot be merged as either p or hashbits are different. Current: "
              + toString() + " Provided: " + hll.toString());
    }

    EncodingType otherEncoding = hll.getEncoding();

    if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.SPARSE)) {
      sparseRegister.merge(hll.getHLLSparseRegister());
      // if after the merge the sparse switching threshold is exceeded,
      // change to dense encoding
      if (sparseRegister.getSize() > encodingSwitchThreshold) {
        encoding = EncodingType.DENSE;
        denseRegister = sparseToDenseRegister(sparseRegister);
        sparseRegister = null;
      }
    } else if (encoding.equals(EncodingType.DENSE) && otherEncoding.equals(EncodingType.DENSE)) {
      denseRegister.merge(hll.getHLLDenseRegister());
    } else if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.DENSE)) {
      denseRegister = sparseToDenseRegister(sparseRegister);
      denseRegister.merge(hll.getHLLDenseRegister());
      sparseRegister = null;
      encoding = EncodingType.DENSE;
    } else if (encoding.equals(EncodingType.DENSE) && otherEncoding.equals(EncodingType.SPARSE)) {
      HLLDenseRegister otherDenseRegister = sparseToDenseRegister(hll.getHLLSparseRegister());
      denseRegister.merge(otherDenseRegister);
    }

    invalidateCount = true;
  }
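
  // Merge usage sketch (illustrative; the variable names are hypothetical).
  // Two sketches built with the same p can be combined losslessly:
  //   HyperLogLog a = HyperLogLog.builder().build();
  //   HyperLogLog b = HyperLogLog.builder().build();
  //   a.addLong(1L);
  //   b.addLong(2L);
  //   a.merge(b); // a now estimates the union of both input streams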

  /**
   * Converts a sparse HLL register to a dense one.
   * @param sparseRegister
   *          - sparse register to be converted
   * @return converted dense register
   */
  private HLLDenseRegister sparseToDenseRegister(HLLSparseRegister sparseRegister) {
    if (sparseRegister == null) {
      return null;
    }
    int p = sparseRegister.getP();
    int pMask = (1 << p) - 1;
    HLLDenseRegister result = new HLLDenseRegister(p, bitPacking);
    for (Map.Entry<Integer, Byte> entry : sparseRegister.getSparseMap().entrySet()) {
      int key = entry.getKey();
      int idx = key & pMask;
      result.set(idx, entry.getValue());
    }
    return result;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("Encoding: ");
    sb.append(encoding);
    sb.append(", p: ");
    sb.append(p);
    sb.append(", estimatedCardinality: ");
    sb.append(estimateNumDistinctValues());
    return sb.toString();
  }

  public String toStringExtended() {
    if (encoding.equals(EncodingType.DENSE)) {
      return toString() + ", " + denseRegister.toExtendedString();
    } else if (encoding.equals(EncodingType.SPARSE)) {
      return toString() + ", " + sparseRegister.toExtendedString();
    }
    return toString();
  }

  public int getNumRegisterIndexBits() {
    return p;
  }

  public EncodingType getEncoding() {
    return encoding;
  }

  public void setEncoding(EncodingType encoding) {
    this.encoding = encoding;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof HyperLogLog)) {
      return false;
    }

    HyperLogLog other = (HyperLogLog) obj;
    long count = estimateNumDistinctValues();
    long otherCount = other.estimateNumDistinctValues();
    boolean result = p == other.p && chosenHashBits == other.chosenHashBits
        && encoding.equals(other.encoding) && count == otherCount;
    if (encoding.equals(EncodingType.DENSE)) {
      result = result && denseRegister.equals(other.getHLLDenseRegister());
    }
    if (encoding.equals(EncodingType.SPARSE)) {
      result = result && sparseRegister.equals(other.getHLLSparseRegister());
    }
    return result;
  }

  @Override
  public int hashCode() {
    int hashcode = 0;
    hashcode += 31 * p;
    hashcode += 31 * chosenHashBits;
    hashcode += encoding.hashCode();
    hashcode += 31 * estimateNumDistinctValues();
    if (encoding.equals(EncodingType.DENSE)) {
      hashcode += 31 * denseRegister.hashCode();
    }
    if (encoding.equals(EncodingType.SPARSE)) {
      hashcode += 31 * sparseRegister.hashCode();
    }
    return hashcode;
  }

  @Override
  public void reset() {
  }

  @Override
  public byte[] serialize() {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    // write bytes to bos ...
    try {
      HyperLogLogUtils.serializeHLL(bos, this);
      byte[] result = bos.toByteArray();
      bos.close();
      return result;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public NumDistinctValueEstimator deserialize(byte[] buf) {
    InputStream is = new ByteArrayInputStream(buf);
    try {
      HyperLogLog result = HyperLogLogUtils.deserializeHLL(is);
      is.close();
      return result;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void addToEstimator(long v) {
    addLong(v);
  }

  @Override
  public void addToEstimator(String s) {
    addString(s);
  }

  @Override
  public void addToEstimator(double d) {
    addDouble(d);
  }

  @Override
  public void addToEstimator(HiveDecimal decimal) {
    addDouble(decimal.doubleValue());
  }

  @Override
  public void mergeEstimators(NumDistinctValueEstimator o) {
    merge((HyperLogLog) o);
  }

  @Override
  public int lengthFor(JavaDataModel model) {
    // 5 is the head, 1 << p is the number of register bytes. (The listing
    // was truncated here; this body is a plausible reconstruction.)
    return (int) model.lengthForByteArrayOfSize(5 + (1 << p));
  }
}
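
// Serialization round-trip sketch (illustrative; "hll" stands for a
// hypothetical populated instance). serialize()/deserialize() above
// delegate to the HyperLogLogUtils codec in this package:
//   byte[] bytes = hll.serialize();
//   HyperLogLog copy = (HyperLogLog) hll.deserialize(bytes);
//   // copy.count() now matches hll.count()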




