All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.infinispan.server.resp.hll.internal.CompactSet Maven / Gradle / Ivy

There is a newer version: 15.1.4.Final
Show newest version
package org.infinispan.server.resp.hll.internal;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.LongStream;

import org.infinispan.commons.marshall.ProtoStreamTypeIds;
import org.infinispan.protostream.annotations.ProtoFactory;
import org.infinispan.protostream.annotations.ProtoField;
import org.infinispan.protostream.annotations.ProtoTypeId;
import org.infinispan.protostream.descriptors.Type;

import net.jcip.annotations.GuardedBy;
import net.jcip.annotations.ThreadSafe;

/**
 * The {@link org.infinispan.server.resp.hll.HyperLogLog} magic.
 * 

* The compact representation can track up to 264 elements with an accurate cardinality estimation and occupy * only 12Kb of memory. At instantiation, is reserved a long array with length {@link #STORE_SIZE}. We use * P={@link #HLL_BUCKET_COUNT} bits to identify the bucket and Q={@link #HLL_MAX_CONSECUTIVE_ZEROES} bits to count the * consecutive zeroes. The Q number is representable with 6 bits. Therefore, a register has a width of 6 bits * ({@link #REGISTER_WIDTH}). *

* *

* In C, we could allocate the memory space and manipulate it as we want, but here in Java, we have to do some bit * twiddling magic to make the registers fit. In other words, each position in the array uses long with 64 bits, whereas * one register uses only 6 bits. Multiple registers are within a single array position or spread across two positions. * Some manipulation is necessary to retrieve the correct register value from the array. *

* *

* The basic idea is that with a 64-bit hash, the first P bits identify the bucket, and the remaining Q bits we use to * count the trailing zeroes, and in case all bits are 0, the count is Q + 1. The next step is to update the bucket value * if it is smaller than the current count. From the bucket number and some bit-shifting, we identify the register. *

* *

Credits

* This implementation takes inspiration from the Redis implementation [1], which is based [2]. The bit-shifting * manipulation has some basis in [3], which implements the original algorithm without optimizations. * *

* Our cardinality estimation uses the optimizations proposed in [2]. And this is the reason we keep an additional int * array. We use this approach as it is the same as Redis uses. This algorithm does not require a bias correction as the * Google implementation or the [3]. That is, it does not need empirical data to correct estimations. *

* * @see [1] Redis HyperLogLog implementation. * @see [2] New cardinality estimation algorithms for HyperLogLog sketches * @see [3] Aggregate Knowledge HyperLogLog implementation. * @since 15.0 * @author José Bolina */ @ThreadSafe @ProtoTypeId(ProtoStreamTypeIds.RESP_HYPER_LOG_LOG_COMPACT) public class CompactSet implements HLLRepresentation { // The number of bits to identify the bucket. // This is equivalent of Math.log2(HLL_BUCKET_TOTAL). static final int HLL_BUCKET_COUNT = 14; // The number of bits to count the number of consecutive zeroes. // Utilizing a long, we have 64 bits total. The first HLL_BUCKET_COUNT bits are for the bucket identification. // The remaining bits count the number of consecutive zeroes. static final int HLL_MAX_CONSECUTIVE_ZEROES = Long.SIZE - HLL_BUCKET_COUNT; static final int HLL_BUCKET_TOTAL = 1 << HLL_BUCKET_COUNT; static final int HLL_BUCKET_MASK = HLL_BUCKET_TOTAL - 1; // Comes from Eq. 13 of [2]. This is used during the cardinality estimation and is equivalent to: 0.5 / Math.log(2). static final double ALPHA_INF = 0.721347520444481703680; // The size of the actual register we use to store the count of zeroes. // The highest number we can have is `HLL_MAX_CONSECUTIVE_ZEROES + 1`, which is representable in 6 bits. static final byte REGISTER_WIDTH = (byte) (Long.SIZE - Long.numberOfLeadingZeros(HLL_MAX_CONSECUTIVE_ZEROES)); // The maximum value a register can hold with the given size. static final byte MAX_REGISTER_ENTRY = (byte) (1L << REGISTER_WIDTH); // Mask to identify the correct register in the store position. static final byte SINGLE_REGISTER_MASK = (byte) (MAX_REGISTER_ENTRY - 1); // The total store size. This takes into account the splitted registers and the multiple registers per array position. // In total, this will occupy roughly 12Kb. static final int STORE_SIZE = ((REGISTER_WIDTH * HLL_BUCKET_TOTAL) + SINGLE_REGISTER_MASK) >>> REGISTER_WIDTH; @GuardedBy("this") private final long[] store; @GuardedBy("this") private final int[] multiplicity; private volatile byte minimum; public CompactSet() { this.store = new long[STORE_SIZE]; // The multiplicity is utilized during the cardinality calculation. We can maintain it pre-calculated during // insertions instead of calculating it in the cardinality. // We call it `multiplicity` to adhere to what the work [2] calls it. In truth, this is a histogram. this.multiplicity = new int[HLL_MAX_CONSECUTIVE_ZEROES + 2]; // Since it is just created, all registers has a size o 0. this.multiplicity[0] = HLL_BUCKET_TOTAL; this.minimum = 0b0; } @ProtoFactory CompactSet(Collection store, Collection multiplicity, byte minimum) { this.store = store.stream().mapToLong(Long::longValue).toArray(); this.multiplicity = multiplicity.stream().mapToInt(Integer::intValue).toArray(); this.minimum = minimum; } void readSource(Set hashes) { for (long hash : hashes) { setRegister(hash); } } @Override public boolean set(byte[] data) { return setRegister(Util.hash(data)); } /** * The cardinality estimation is based on [2] (see class doc). *

* This is an implementation of Algorithm 6 [2]. It uses the {@link #multiplicity} vector and the methods of * {@link #tau(double)} and {@link #sigma(double)}. Note that this does not need to iterate over the real * registers to make an estimation, *

* * @return The estimate cardinality of the set. */ @Override public long cardinality() { double z; synchronized (this) { z = HLL_BUCKET_TOTAL * tau(1 - (double) multiplicity[HLL_MAX_CONSECUTIVE_ZEROES + 1] / HLL_BUCKET_TOTAL); for (int k = HLL_MAX_CONSECUTIVE_ZEROES + 1; k >= 1; k--) { z = 0.5 * (z + multiplicity[k]); } z = z + HLL_BUCKET_TOTAL * sigma((double) multiplicity[0] / HLL_BUCKET_TOTAL); } double v = ALPHA_INF * HLL_BUCKET_TOTAL * HLL_BUCKET_TOTAL / z; return Math.round(v); } /** * Implemented from Algorithm 6 [2]. * * @param x: A double floating point precision value in [0:1] interval. */ private static double sigma(double x) { if (x == 1) return Double.POSITIVE_INFINITY; double zPrime; double y = 1; double z = x; do { x *= x; zPrime = z; z += x * y; y += y; } while(z != zPrime); return z; } /** * Implemented from Algorithm 6 [2]. * * @param x: A double floating point precision value in [0:1] interval. */ private static double tau(double x) { if (x == 0 || x == 1) return 0; double y = 1; double z = 1 - x; double zPrime = z; do { x = Math.sqrt(x); zPrime = z; y = 0.5 * y; z = z - Math.pow(1- x, 2) * y; } while (z != zPrime); return z / 3; } private boolean setRegister(long hash) { // The sequence is the `HLL_MAX_CONSECUTIVE_ZEROES` most-significant bits. We remove the `HLL_BUCKET_COUNT` // least-significant bits that identify the bucket. This needs to use an unsigned operation. long seq = hash >>> HLL_BUCKET_COUNT; // We count the number 0 from right until reaching the first bit set. If we don't have any bit set on `seq`, // we say the next active bit is HLL_MAX_CONSECUTIVE_ZEROES + 1. That happens when seq == 0, we moved the `HLL_BUCKET_COUNT` LSB. // Safe cast as the number has at most `HLL_MAX_CONSECUTIVE_ZEROES + 1` bits, which is representable in a byte. byte k = (byte) (seq == 0 ? HLL_MAX_CONSECUTIVE_ZEROES + 1 : (1 + Long.numberOfTrailingZeros(seq))); // If the calculated k is smaller than the known minimum, means that it will not update anything. // We can abort early safely. if (k > minimum) { // The bucket is the `HLL_BUCKET_COUNT` least-significant bits. int bucket = (int) (hash & HLL_BUCKET_MASK); return setRegister(bucket, k); } return false; } /** * Beware, there be dragons. *

* This converts the {@param bucket} mapping into the correct register in the {@link #store}. If the new {@param value} * is greater than the stored value, we can proceed updating the value. *

*

* Additionally, this implements the update to the {@link #multiplicity} array as defined on Algorithm 7 [2]. We update * the histogram to reflect the count values and the {@link #minimum}, if necessary. *

* * @param bucket: The bucket identified by the hash P least-significant bits. * @param value: The count of consecutive zeroes. * @return if the register was updated or not. */ private boolean setRegister(int bucket, byte value) { assert bucket < (1L << 29) : "Bucket is out of bound."; int index = bucket * REGISTER_WIDTH; // We have an array withs longs to store each register. Since REGISTER_WIDTH < 8, each can have a register // split in two bytes. We identify the start position in the array. int first = index >>> REGISTER_WIDTH; // We advance to the next position and divide by the register width to get a possible split. int second = (index + REGISTER_WIDTH - 1) >>> REGISTER_WIDTH; // How many bits to skip before reaching the actual register. // For example, we use 6 bits out of 64 bits, in which position is our value? From bit 0, 1? int offset = index & SINGLE_REGISTER_MASK; // The register is using two positions if the first and second indexes are different. boolean spilled = first != second; byte stored; synchronized (this) { if (spilled) { // The register value spans across two positions. // First halve, we use the remainder to skip unnecessary bits. // Second halve, we now skip (MAX_SIZE - remainder) bits to retrieve the remaining bits. // Then the | to append the two halves and a mask. stored = (byte) (((store[first] >>> offset) | (store[second] << (MAX_REGISTER_ENTRY - offset))) & SINGLE_REGISTER_MASK); } else { // The value is within a single register. We simply use the remainder and a mask. stored = (byte) ((store[first] >>> offset) & SINGLE_REGISTER_MASK); } // We only update in case the sequence of zeroes is greater than seen previously. if (value > stored) { // We update the multiplicity to skip the calculation during the cardinality. // This comes from Algorithm 7 in [2]. multiplicity[stored] -= 1; multiplicity[value] += 1; if (stored == minimum) { while (multiplicity[minimum] == 0) { minimum += 1; } } // Now we need to write the new value back. And, we need to store on both positions, if necessary. if (spilled) { // The value occupies two positions. // We need to clear the previous occupied bits and set with the new value. store[first] &= (1L << offset) - 1; store[first] |= (long) value << offset; // Now, reset the remaining bits and set the value. store[second] &= ~(SINGLE_REGISTER_MASK >>> (MAX_REGISTER_ENTRY - offset)); store[second] |= (value >>> (MAX_REGISTER_ENTRY - offset)); } else { // The register occupy only a single position. We just reset and write the value back. store[first] &= ~((long) SINGLE_REGISTER_MASK << offset); store[first] |= (long) value << offset; } return true; } } return false; } @ProtoField(number = 1, collectionImplementation = ArrayList.class) List store() { return LongStream.of(store) .boxed() .collect(Collectors.toList()); } @ProtoField(number = 2, collectionImplementation = ArrayList.class, type = Type.UINT32) Collection multiplicity() { return IntStream.of(multiplicity) .boxed() .collect(Collectors.toList()); } @ProtoField(number = 3, javaType = byte.class, defaultValue = "0", type = Type.UINT32) byte minimum() { return minimum; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; CompactSet that = (CompactSet) o; synchronized (this) { synchronized (that) { return Arrays.equals(this.store, that.store); } } } @Override public int hashCode() { synchronized (this) { return Arrays.hashCode(store); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy