package com.twitter.algebird
import java.nio.ByteBuffer
/**
 * A deliberately minimal, read-only bit set backed by a byte array.
 *
 * Bit `x` lives in byte `x / 8`. Inside a byte the bits are numbered starting
 * from the most significant one, so for the bit string b0 b1 b2 b3 ... a set
 * b0 means 0 is in the set, a set b1 means 1 is in the set, and so on.
 */
case class BitSetLite(in: Array[Byte]) {

  /**
   * Tests whether bit `x` is set.
   *
   * No bounds check is done here: an index past the end of `in` surfaces as
   * the array's own out-of-bounds exception.
   */
  def contains(x: Int): Boolean = {
    val byteIndex = x / 8
    val bitInByte = x % 8
    // Bit 0 of each byte is its most significant bit.
    val mask = 1 << (7 - bitInByte)
    (in(byteIndex) & mask) != 0
  }
}
/**
 * Implementation of the HyperLogLog approximate counting as a Monoid
 *
 * Reference: "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm",
 * Philippe Flajolet and Éric Fusy and Olivier Gandouet and Frédéric Meunier
 */
object HyperLogLog {

  /* Size of the hash in bits */
  val hashSize = 128

  /**
   * Hashes `input` with MurmurHash128 (fixed seed) and packs the two resulting
   * longs, `l0` then `l1`, into a 16-byte array.
   */
  def hash(input: Array[Byte]): Array[Byte] = {
    val seed = 12345678
    val (l0, l1) = MurmurHash128(seed)(input)
    val buf = new Array[Byte](16)
    // The wrapping buffer writes straight through to `buf`.
    ByteBuffer
      .wrap(buf)
      .putLong(l0)
      .putLong(l1)
    buf
  }

  /** Implicit view of an Int as its 4-byte encoding, so it can be hashed directly. */
  implicit def int2Bytes(i: Int): Array[Byte] = {
    val buf = new Array[Byte](4)
    ByteBuffer
      .wrap(buf)
      .putInt(i)
    buf
  }

  /** Implicit view of a Long as its 8-byte encoding, so it can be hashed directly. */
  implicit def long2Bytes(i: Long): Array[Byte] = {
    val buf = new Array[Byte](8)
    ByteBuffer
      .wrap(buf)
      .putLong(i)
    buf
  }

  /** 2 raised to the power `i`, as a Double. */
  def twopow(i: Int): Double = scala.math.pow(2.0, i)

  /**
   * The value 'j' is the integer formed from the first `bits` bits of the hash:
   * a set bit at position `pos` contributes `1 << pos`.
   *
   * TODO: We could read in a byte at a time.
   */
  def j(bsl: BitSetLite, bits: Int): Int = {
    @scala.annotation.tailrec
    def loop(pos: Int, accum: Int): Int = {
      if (pos >= bits) {
        accum
      } else if (bsl.contains(pos)) {
        loop(pos + 1, accum + (1 << pos))
      } else {
        loop(pos + 1, accum)
      }
    }
    loop(0, 0)
  }

  /**
   * The value 'w' is the part of the hash starting at position `bits`. The
   * function rho counts the number of leading zeroes in 'w'. We can calculate
   * rho(w) at once with the method rhoW: the result is 1 plus the number of
   * clear bits found before the first set bit at or after position `bits`.
   *
   * The scan is not bounded: if no bit at or after `bits` is set, the lookup
   * eventually runs past the end of the underlying array.
   */
  def rhoW(bsl: BitSetLite, bits: Int): Byte = {
    @scala.annotation.tailrec
    def loop(pos: Int, zeros: Int): Int =
      if (bsl.contains(pos)) zeros
      else loop(pos + 1, zeros + 1)
    loop(bits, 1).toByte
  }

  /**
   * We are computing j and \rho(w) from the paper,
   * sorry for the name, but it allows someone to compare to the paper.
   * There is an extremely low probability that rhow (position of the leftmost
   * one bit) is > 127, so we use a Byte to store it.
   *
   * Given a hash, the value 'j' is built from its first `bits` bits and the
   * value 'w' is the remainder of the hash. The function rho counts the number
   * of leading zeroes in 'w'. We can calculate rho(w) at once with the method rhoW.
   */
  def jRhoW(in: Array[Byte], bits: Int): (Int, Byte) = {
    val onBits = BitSetLite(in)
    (j(onBits, bits), rhoW(onBits, bits))
  }

  /**
   * Serializes an HLL.
   *
   * Sparse layout: tag byte 3, one byte for `bits`, then for every entry the
   * index `j` in `jLen` bytes (least significant byte first) followed by one
   * rhow byte. Dense layout: tag byte 2, one byte for `bits`, then the
   * register vector.
   */
  def toBytes(h: HLL): Array[Byte] = {
    h match {
      case SparseHLL(bits, maxRhow) =>
        // Number of bytes needed to hold an index of `bits` bits.
        val jLen = (bits + 7) / 8
        assert(jLen >= 1)
        assert(jLen <= 3)
        val buf = new Array[Byte](1 + 1 + (jLen + 1) * maxRhow.size)
        val byteBuf = ByteBuffer
          .wrap(buf)
          .put(3: Byte)
          .put(bits.toByte)
        maxRhow.foldLeft(byteBuf) { (bb, jrhow) =>
          val (j, rhow) = jrhow
          bb.put((j & 0xff).toByte)
          if (jLen >= 2) bb.put(((j >> 8) & 0xff).toByte)
          if (jLen >= 3) bb.put(((j >> 16) & 0xff).toByte)
          bb.put(rhow.get)
        }
        buf

      case DenseHLL(bits, v) => ((2: Byte) +: bits.toByte +: v).toArray
    }
  }

  // Make sure to be reversible so fromBytes(toBytes(x)) == x
  /**
   * Deserializes an HLL from the layout written by `toBytes`.
   * Throws an Exception when the leading tag byte is neither 2 nor 3.
   */
  def fromBytes(bytes: Array[Byte]): HLL = {
    val bb = ByteBuffer.wrap(bytes)
    bb.get.toInt match {
      case 2 => DenseHLL(bb.get, bytes.toIndexedSeq.tail.tail)
      case 3 => sparseFromByteBuffer(bb)
      case n => throw new Exception("Unrecognized HLL type: " + n)
    }
  }

  /**
   * Deserializes an HLL from the remaining content of `bb`, advancing its
   * position. Throws an Exception when the tag byte is neither 2 nor 3.
   */
  def fromByteBuffer(bb: ByteBuffer): HLL = {
    bb.get.toInt match {
      case 2 =>
        val bits = bb.get
        val buf = new Array[Byte](bb.remaining)
        // Copy the register vector out of the buffer; without this the
        // resulting DenseHLL would contain only zeros.
        bb.get(buf)
        DenseHLL(bits, buf)
      case 3 => sparseFromByteBuffer(bb)
      case n => throw new Exception("Unrecognized HLL type: " + n)
    }
  }

  /**
   * Reads the sparse payload (everything after the tag byte): one byte of
   * `bits`, then (j, rhow) records with `j` stored least significant byte first.
   */
  private def sparseFromByteBuffer(bb: ByteBuffer): SparseHLL = {
    val bits = bb.get
    val jLen = (bits + 7) / 8
    assert(bb.remaining % (jLen + 1) == 0, "Invalid byte array")
    val maxRhow = (1 to bb.remaining / (jLen + 1)).map { _ =>
      val j = jLen match {
        case 1 => (bb.get.toInt & 0xff)
        case 2 => (bb.get.toInt & 0xff) + ((bb.get.toInt & 0xff) << 8)
        case 3 => (bb.get.toInt & 0xff) + ((bb.get.toInt & 0xff) << 8) + ((bb.get.toInt & 0xff) << 16)
      }
      val rhow = bb.get
      j -> Max(rhow)
    }.toMap
    SparseHLL(bits, maxRhow)
  }

  /** The bias-correction constant alpha for a sketch with 2^bits registers. */
  def alpha(bits: Int) = bits match {
    case 4 => 0.673
    case 5 => 0.697
    case 6 => 0.709
    case _ => 0.7213 / (1.0 + 1.079 / (1 << bits).toDouble)
  }

  /** Relative standard error of the estimate: 1.04 / sqrt(2^bits). */
  def error(bits: Int): Double = 1.04 / scala.math.sqrt(twopow(bits))
}
/**
 * Common interface of the sparse and dense HyperLogLog representations.
 */
// NOTE(review): the supertype after `extends` is missing from this copy;
// java.io.Serializable is assumed — confirm against the upstream source.
sealed abstract class HLL extends java.io.Serializable {
  import HyperLogLog._

  def bits: Int
  def size: Int
  def zeroCnt: Int
  def z: Double
  def +(other: HLL): HLL
  def toDenseHLL: DenseHLL

  /** Cardinality estimate together with a 3-standard-deviation interval. */
  def approximateSize: Approximate[Long] = asApprox(initialEstimate)

  /** Point estimate of the cardinality as a Double. */
  def estimatedSize: Double = approximateSize.estimate.toDouble

  private def initialEstimate: Double = {
    val e = factor * z
    // There are large and small value corrections from the paper
    // We stopped using the large value corrections since when using Long's
    // there was pathalogically bad results.
    // (The issue link that followed "See" is missing from this copy.)
    if (e <= smallE) {
      smallEstimate(e)
    } else {
      e
    }
  }

  private def asApprox(v: Double): Approximate[Long] = {
    val stdev = error(bits)
    val lowerBound = math.floor(math.max(v * (1.0 - 3 * stdev), 0.0)).toLong
    val upperBound = math.ceil(v * (1.0 + 3 * stdev)).toLong
    // Lower bound on the probability of landing within 3 standard deviations
    // (the reference link is missing from this copy).
    val prob3StdDev = 0.9972
    Approximate(lowerBound, v.toLong, upperBound, prob3StdDev)
  }

  private def factor = alpha(bits) * size.toDouble * size.toDouble

  // Estimates at or below this threshold get the small-range correction.
  private def smallE = 5 * size / 2.0

  private def smallEstimate(e: Double): Double = {
    if (zeroCnt == 0) {
      // No empty registers: the correction formula does not apply.
      e
    } else {
      size * scala.math.log(size.toDouble / zeroCnt)
    }
  }

  /**
   * Set each item in the given buffer to the max of this and the buffer
   */
  def updateInto(buffer: Array[Byte]): Unit
}
/**
 * Sparse representation: only registers that have been touched are stored,
 * keyed by register index `j`, each holding the largest rhow seen so far.
 */
case class SparseHLL(bits: Int, maxRhow: Map[Int, Max[Byte]]) extends HLL {

  assert(bits > 3, "Use at least 4 bits (2^(bits) = bytes consumed)")

  val size = 1 << bits

  // Every register without an entry is implicitly zero.
  lazy val zeroCnt = size - maxRhow.size

  // Each implicit zero register contributes 2^0 = 1 to the sum.
  lazy val z = 1.0 / (zeroCnt.toDouble + maxRhow.values.map { mj => HyperLogLog.twopow(-mj.get) }.sum)

  /**
   * Union with another HLL of the same size. The result stays sparse while at
   * most 1/16 of the registers are populated and becomes dense otherwise.
   */
  def +(other: HLL): HLL = {
    other match {

      case sparse @ SparseHLL(_, oMaxRhow) =>
        assert(sparse.size == size, "Incompatible HLL size: " + sparse.size + " != " + size)
        // NOTE(review): the left-hand part of this call is missing from this
        // copy; Monoid.plus (key-wise max through the Map/Max monoids) is
        // assumed — confirm against the upstream source.
        val allMaxRhow = Monoid.plus(maxRhow, oMaxRhow)
        if (allMaxRhow.size * 16 <= size) {
          SparseHLL(bits, allMaxRhow)
        } else {
          DenseHLL(bits, sparseMapToSequence(allMaxRhow))
        }

      case DenseHLL(bits, oldV) =>
        assert(oldV.size == size, "Incompatible HLL size: " + oldV.size + " != " + size)
        val newV = maxRhow.foldLeft(oldV) {
          case (v, (j, rhow)) =>
            if (rhow.get > v(j)) {
              v.updated(j, rhow.get)
            } else {
              v
            }
        }
        DenseHLL(bits, newV)
    }
  }

  /** Expands a sparse register map into a full vector of `size` registers. */
  def sparseMapToSequence(values: Map[Int, Max[Byte]]): IndexedSeq[Byte] = {
    val array = Array.fill[Byte](size)(0: Byte)
    values.foreach { case (j, rhow) => array.update(j, rhow.get) }
    array.toIndexedSeq
  }

  lazy val toDenseHLL = DenseHLL(bits, sparseMapToSequence(maxRhow))

  /** Raises each touched register of `buffer` to at least this sketch's value. */
  def updateInto(buffer: Array[Byte]): Unit = {
    assert(buffer.length == size, "Length mismatch")
    maxRhow.foreach {
      case (idx, maxb) =>
        buffer.update(idx, buffer(idx) max (maxb.get))
    }
  }
}
/**
 * These are the individual instances which the Monoid knows how to add
 */
case class DenseHLL(bits: Int, v: IndexedSeq[Byte]) extends HLL {

  assert(v.size == (1 << bits), "Invalid size for dense vector: " + size + " != (1 << " + bits + ")")

  def size = v.size

  lazy val zeroCnt = v.count { _ == 0 }

  // Named from the parameter in the paper, probably never useful to anyone
  // except HyperLogLogMonoid
  lazy val z = 1.0 / (v.map { mj => HyperLogLog.twopow(-mj) }.sum)

  /** Union with another HLL: the register-wise maximum. */
  def +(other: HLL): HLL = {
    other match {

      // The sparse side already knows how to merge into a dense vector.
      case SparseHLL(_, _) => (other + this)

      case DenseHLL(_, ov) =>
        assert(ov.size == v.size, "Incompatible HLL size: " + ov.size + " != " + v.size)
        DenseHLL(bits,
          v.zip(ov)
            .map { case (rhow, oRhow) => rhow max oRhow })
    }
  }

  val toDenseHLL = this

  /** Raises every register of `buffer` to at least this sketch's value. */
  def updateInto(buffer: Array[Byte]): Unit = {
    assert(buffer.length == size, "Length mismatch")
    var idx = 0
    v.foreach { maxb =>
      buffer.update(idx, (buffer(idx)) max maxb)
      idx += 1
    }
  }
}
/**
 * Presents a byte array as an IndexedSeq[Byte] without copying it: both
 * `length` and `apply` delegate directly to the wrapped array.
 */
class IndexedSeqArrayByte(buf: Array[Byte]) extends scala.collection.IndexedSeq[Byte] {

  /** Number of bytes in the wrapped array. */
  def length = buf.length

  /** The byte stored at position `idx` of the wrapped array. */
  def apply(idx: Int): Byte = buf(idx)

  // Render as "Array(...)" in toString.
  override def stringPrefix = "Array"
}
/**
 * Error is about 1.04/sqrt(2^{bits}), so you want something like 12 bits for 1% error
 * which means each HLLInstance is about 2^{12} = 4kb per instance.
 */
class HyperLogLogMonoid(val bits: Int) extends Monoid[HLL] {
  import HyperLogLog._

  assert(bits > 3, "Use at least 4 bits (2^(bits) = bytes consumed)")

  val size = 1 << bits

  /** Builds a single-element sketch from anything viewable as a byte array. */
  def apply[T <% Array[Byte]](t: T) = create(t)

  // The empty sketch: sparse with no populated registers.
  val zero: HLL = SparseHLL(bits, Map.empty[Int, Max[Byte]])

  def plus(left: HLL, right: HLL) = left + right

  /**
   * Folds `existing` and everything left in `iter` into one mutable register
   * buffer, producing a single dense sketch.
   */
  private[this] final def denseUpdate(existing: HLL, iter: Iterator[HLL]): HLL = {
    val buffer = new Array[Byte](size)
    // The value accumulated so far must go in first, otherwise it would be
    // silently dropped from the sum.
    existing.updateInto(buffer)
    iter.foreach { _.updateInto(buffer) }
    DenseHLL(bits, new IndexedSeqArrayByte(buffer))
  }

  /**
   * Sums all items. Sparse sketches are added pairwise; as soon as either the
   * running value or the next item is dense, everything remaining is merged
   * through one shared buffer.
   */
  override def sumOption(items: TraversableOnce[HLL]): Option[HLL] =
    if (items.isEmpty) {
      None
    } else {
      val iter = items.toIterator.buffered
      var curValue = iter.next()
      while (iter.hasNext) {
        curValue = (curValue, iter.head) match {
          case (DenseHLL(_, _), _) => denseUpdate(curValue, iter)
          case (_, DenseHLL(_, _)) => denseUpdate(curValue, iter)
          case _ => curValue + iter.next()
        }
      }
      Some(curValue)
    }

  /** Builds a sketch containing the single element `example`. */
  def create(example: Array[Byte]): HLL = {
    val hashed = hash(example)
    val (j, rhow) = jRhoW(hashed, bits)
    SparseHLL(bits, Map(j -> Max(rhow)))
  }

  /**
   * Builds one sketch from many elements, keeping the largest rhow per
   * register. The result is sparse while at most 1/16 of the registers are
   * populated and dense otherwise.
   */
  def batchCreate[T <% Array[Byte]](instances: Iterable[T]): HLL = {
    // Built as a strict map: a lazy mapValues view would be re-evaluated on
    // every access once stored inside the sketch.
    val allMaxRhow = instances
      .map { x => jRhoW(hash(x), bits) }
      .groupBy { case (j, rhow) => j }
      .map { case (j, pairs) => j -> Max(pairs.map { _._2 }.max) }
    if (allMaxRhow.size * 16 <= size) {
      SparseHLL(bits, allMaxRhow)
    } else {
      SparseHLL(bits, allMaxRhow).toDenseHLL
    }
  }

  /** Point estimate of the cardinality of `hll`. */
  final def estimateSize(hll: HLL): Double = {
    sizeOf(hll).estimate.toDouble
  }

  /** Cardinality of `hll` with its confidence interval. */
  final def sizeOf(hll: HLL): Approximate[Long] = {
    hll.approximateSize
  }

  /** Point estimate of the size of the intersection of all sketches in `his`. */
  final def estimateIntersectionSize(his: Seq[HLL]): Double = {
    intersectionSize(his).estimate.toDouble
  }

  /** Size of the intersection of all sketches in `his`, by inclusion-exclusion. */
  final def intersectionSize(his: Seq[HLL]): Approximate[Long] = {
    his.headOption.map { head =>
      val tail = his.tail
      /*
       * |A n B| = |A| + |B| - |A u B|
       * in the below, we set A = head, and B = tail.
       * then note that A u (B0 n B1 n ...) = (B0 u A) n (B1 u A) n ...
       * the latter we can compute with tail.map { _ + A } using the HLLInstance +
       * since + on HLLInstance creates the instance for the union.
       */
      sizeOf(head) + intersectionSize(tail) -
        intersectionSize(tail.map { _ + head })
    }
      .map { _.withMin(0L) } // We always know the intersection is >= 0
      .getOrElse(Approximate.exact(0L)) // Empty lists have no intersection
  }
}
object HyperLogLogAggregator {

  /** Creates an aggregator whose sketches use 2^bits registers. */
  def apply(bits: Int): HyperLogLogAggregator = {
    val monoid = new HyperLogLogMonoid(bits)
    new HyperLogLogAggregator(monoid)
  }

  /**
   * Same aggregation as `apply(bits)`, but presenting the estimated
   * cardinality instead of the sketch itself.
   */
  // NOTE(review): the body of this method is missing from this copy; the
  // andThenPresent-based form is assumed — confirm against the upstream source.
  def sizeAggregator(bits: Int): Aggregator[Array[Byte], HLL, Double] = {
    apply(bits).andThenPresent(_.estimatedSize)
  }
}
/**
 * Aggregates byte-array elements into a HyperLogLog sketch: each element is
 * turned into a single-element sketch, sketches are combined with `monoid`,
 * and the combined sketch is presented unchanged.
 */
case class HyperLogLogAggregator(val hllMonoid: HyperLogLogMonoid) extends MonoidAggregator[Array[Byte], HLL, HLL] {

  // The monoid used to combine intermediate sketches.
  val monoid = hllMonoid

  /** Wraps one element into its own sketch. */
  def prepare(value: Array[Byte]) = {
    monoid.create(value)
  }

  /** The final result is the accumulated sketch itself. */
  def present(hll: HLL) = {
    hll
  }
}
