com.twitter.algebird.HyperLogLog.scala Maven / Gradle / Ivy
/*
Copyright 2012 Twitter, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.twitter.algebird
import java.nio.ByteBuffer
/** A super lightweight (hopefully) version of BitSet */
case class BitSetLite(in: Array[Byte]) {
def contains(x: Int): Boolean = {
/**
* Pretend 'in' is little endian so that the bitstring b0b1b2b3 is such that if b0 == 1, then
* 0 is in the bitset, if b1 == 1, then 1 is in the bitset.
*/
val arrayIdx = x / 8
val remainder = x % 8
((in(arrayIdx) >> (7 - remainder)) & 1) == 1
}
}
/**
* Implementation of the HyperLogLog approximate counting as a Monoid
* @link http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
*
* HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm
* Philippe Flajolet and Éric Fusy and Olivier Gandouet and Frédéric Meunier
*/
object HyperLogLog {
/* Size of the hash in bits */
val hashSize = 128
def hash(input: Array[Byte]): Array[Byte] = {
val seed = 12345678
val (l0, l1) = MurmurHash128(seed)(input)
val buf = new Array[Byte](16)
ByteBuffer
.wrap(buf)
.putLong(l0)
.putLong(l1)
buf
}
implicit def int2Bytes(i: Int) = {
val buf = new Array[Byte](4)
ByteBuffer
.wrap(buf)
.putInt(i)
buf
}
implicit def long2Bytes(i: Long) = {
val buf = new Array[Byte](8)
ByteBuffer
.wrap(buf)
.putLong(i)
buf
}
def twopow(i: Int): Double = scala.math.pow(2.0, i)
/**
* the value 'j' is equal to
* TODO: We could read in a byte at a time.
*/
def j(bsl: BitSetLite, bits: Int): Int = {
@annotation.tailrec
def loop(pos: Int, accum: Int): Int = {
if (pos >= bits) {
accum
} else if (bsl.contains(pos)) {
loop(pos + 1, accum + (1 << pos))
} else {
loop(pos + 1, accum)
}
}
loop(0, 0)
}
/**
* The value 'w' is equal to . The function rho counts the number of leading
* zeroes in 'w'. We can calculate rho(w) at once with the method rhoW.
*/
def rhoW(bsl: BitSetLite, bits: Int): Byte = {
@annotation.tailrec
def loop(pos: Int, zeros: Int): Int =
if (bsl.contains(pos)) zeros
else loop(pos + 1, zeros + 1)
loop(bits, 1).toByte
}
/**
* We are computing j and \rho(w) from the paper,
* sorry for the name, but it allows someone to compare to the paper extremely low probability
* rhow (position of the leftmost one bit) is > 127, so we use a Byte to store it
* Given a hash the value 'j' is equal to and
* the value 'w' is equal to . The function rho counts the number of leading
* zeroes in 'w'. We can calculate rho(w) at once with the method rhoW.
*/
def jRhoW(in: Array[Byte], bits: Int): (Int, Byte) = {
val onBits = BitSetLite(in)
(j(onBits, bits), rhoW(onBits, bits))
}
def toBytes(h: HLL): Array[Byte] = {
h match {
case SparseHLL(bits, maxRhow) =>
val jLen = (bits + 7) / 8
assert(jLen >= 1)
assert(jLen <= 3)
val buf = new Array[Byte](1 + 1 + (jLen + 1) * maxRhow.size)
val byteBuf = ByteBuffer
.wrap(buf)
.put(3: Byte)
.put(bits.toByte)
maxRhow.foldLeft(byteBuf) { (bb, jrhow) =>
val (j, rhow) = jrhow
bb.put((j & 0xff).toByte)
if (jLen >= 2) bb.put(((j >> 8) & 0xff).toByte)
if (jLen >= 3) bb.put(((j >> 16) & 0xff).toByte)
bb.put(rhow.get)
}
buf
case DenseHLL(bits, v) => ((2: Byte) +: bits.toByte +: v).toArray
}
}
// Make sure to be reversible so fromBytes(toBytes(x)) == x
def fromBytes(bytes: Array[Byte]): HLL = {
val bb = ByteBuffer.wrap(bytes)
bb.get.toInt match {
case 2 => DenseHLL(bb.get, bytes.toIndexedSeq.tail.tail)
case 3 => sparseFromByteBuffer(bb)
case n => throw new Exception("Unrecognized HLL type: " + n)
}
}
def fromByteBuffer(bb: ByteBuffer): HLL = {
bb.get.toInt match {
case 2 =>
val bits = bb.get
val buf = new Array[Byte](bb.remaining)
bb.get(buf)
DenseHLL(bits, buf)
case 3 => sparseFromByteBuffer(bb)
case n => throw new Exception("Unrecognized HLL type: " + n)
}
}
private def sparseFromByteBuffer(bb: ByteBuffer): SparseHLL = {
val bits = bb.get
val jLen = (bits + 7) / 8
assert(bb.remaining % (jLen + 1) == 0, "Invalid byte array")
val maxRhow = (1 to bb.remaining / (jLen + 1)).map { _ =>
val j = jLen match {
case 1 => (bb.get.toInt & 0xff)
case 2 => (bb.get.toInt & 0xff) + ((bb.get.toInt & 0xff) << 8)
case 3 => (bb.get.toInt & 0xff) + ((bb.get.toInt & 0xff) << 8) + ((bb.get.toInt & 0xff) << 16)
}
val rhow = bb.get
j -> Max(rhow)
}.toMap
SparseHLL(bits, maxRhow)
}
def alpha(bits: Int) = bits match {
case 4 => 0.673
case 5 => 0.697
case 6 => 0.709
case _ => 0.7213 / (1.0 + 1.079 / (1 << bits).toDouble)
}
def error(bits: Int): Double = 1.04 / scala.math.sqrt(twopow(bits))
}
sealed abstract class HLL extends java.io.Serializable {
import HyperLogLog._
def bits: Int
def size: Int
def zeroCnt: Int
def z: Double
def +(other: HLL): HLL
def toDenseHLL: DenseHLL
def approximateSize = asApprox(initialEstimate)
def estimatedSize = approximateSize.estimate.toDouble
private def initialEstimate = {
val e = factor * z
// There are large and small value corrections from the paper
// We stopped using the small value corrections since when using Long's
// there was pathalogically bad results. See https://github.com/twitter/algebird/issues/284
if (e <= smallE) {
smallEstimate(e)
} else {
e
}
}
private def asApprox(v: Double): Approximate[Long] = {
val stdev = error(bits)
val lowerBound = math.floor(math.max(v * (1.0 - 3 * stdev), 0.0)).toLong
val upperBound = math.ceil(v * (1.0 + 3 * stdev)).toLong
// Lower bound. see: http://en.wikipedia.org/wiki/68-95-99.7_rule
val prob3StdDev = 0.9972
Approximate(lowerBound, v.toLong, upperBound, prob3StdDev)
}
private def factor = alpha(bits) * size.toDouble * size.toDouble
private def smallE = 5 * size / 2.0
private def smallEstimate(e: Double): Double = {
if (zeroCnt == 0) {
e
} else {
size * scala.math.log(size.toDouble / zeroCnt)
}
}
/**
* Set each item in the given buffer to the max of this and the buffer
*/
def updateInto(buffer: Array[Byte]): Unit
}
case class SparseHLL(bits: Int, maxRhow: Map[Int, Max[Byte]]) extends HLL {
assert(bits > 3, "Use at least 4 bits (2^(bits) = bytes consumed)")
val size = 1 << bits
lazy val zeroCnt = size - maxRhow.size
lazy val z = 1.0 / (zeroCnt.toDouble + maxRhow.values.map { mj => HyperLogLog.twopow(-mj.get) }.sum)
def +(other: HLL) = {
other match {
case sparse @ SparseHLL(_, oMaxRhow) =>
assert(sparse.size == size, "Incompatible HLL size: " + sparse.size + " != " + size)
val allMaxRhow = Monoid.plus(maxRhow, oMaxRhow)
if (allMaxRhow.size * 16 <= size) {
SparseHLL(bits, allMaxRhow)
} else {
DenseHLL(bits, sparseMapToSequence(allMaxRhow))
}
case DenseHLL(bits, oldV) =>
assert(oldV.size == size, "Incompatible HLL size: " + oldV.size + " != " + size)
val newV = maxRhow.foldLeft(oldV) {
case (v, (j, rhow)) =>
if (rhow.get > v(j)) {
v.updated(j, rhow.get)
} else {
v
}
}
DenseHLL(bits, newV)
}
}
def sparseMapToSequence(values: Map[Int, Max[Byte]]): IndexedSeq[Byte] = {
val array = Array.fill[Byte](size)(0: Byte)
values.foreach { case (j, rhow) => array.update(j, rhow.get) }
array.toIndexedSeq
}
lazy val toDenseHLL = DenseHLL(bits, sparseMapToSequence(maxRhow))
def updateInto(buffer: Array[Byte]): Unit = {
assert(buffer.length == size, "Length mismatch")
maxRhow.foreach {
case (idx, maxb) =>
buffer.update(idx, buffer(idx) max (maxb.get))
}
}
}
/**
* These are the individual instances which the Monoid knows how to add
*/
case class DenseHLL(bits: Int, v: IndexedSeq[Byte]) extends HLL {
assert(v.size == (1 << bits), "Invalid size for dense vector: " + size + " != (1 << " + bits + ")")
def size = v.size
lazy val zeroCnt = v.count { _ == 0 }
// Named from the parameter in the paper, probably never useful to anyone
// except HyperLogLogMonoid
lazy val z = 1.0 / (v.map { mj => HyperLogLog.twopow(-mj) }.sum)
def +(other: HLL): HLL = {
other match {
case SparseHLL(_, _) => (other + this)
case DenseHLL(_, ov) =>
assert(ov.size == v.size, "Incompatible HLL size: " + ov.size + " != " + v.size)
DenseHLL(bits,
v
.view
.zip(ov)
.map { case (rhow, oRhow) => rhow max oRhow }
.toIndexedSeq)
}
}
val toDenseHLL = this
def updateInto(buffer: Array[Byte]): Unit = {
assert(buffer.length == size, "Length mismatch")
var idx = 0
v.foreach { maxb =>
buffer.update(idx, (buffer(idx)) max maxb)
idx += 1
}
}
}
class IndexedSeqArrayByte(buf: Array[Byte]) extends scala.collection.IndexedSeq[Byte] {
def length = buf.length
def apply(idx: Int): Byte = buf.apply(idx)
override def stringPrefix = "Array"
}
/*
* Error is about 1.04/sqrt(2^{bits}), so you want something like 12 bits for 1% error
* which means each HLLInstance is about 2^{12} = 4kb per instance.
*/
class HyperLogLogMonoid(val bits: Int) extends Monoid[HLL] {
import HyperLogLog._
assert(bits > 3, "Use at least 4 bits (2^(bits) = bytes consumed)")
val size = 1 << bits
def apply[T <% Array[Byte]](t: T) = create(t)
val zero: HLL = SparseHLL(bits, Monoid.zero[Map[Int, Max[Byte]]])
def plus(left: HLL, right: HLL) = left + right
private[this] final def denseUpdate(existing: HLL, iter: Iterator[HLL]): HLL = {
val buffer = new Array[Byte](size)
existing.updateInto(buffer)
iter.foreach { _.updateInto(buffer) }
DenseHLL(bits, new IndexedSeqArrayByte(buffer))
}
override def sumOption(items: TraversableOnce[HLL]): Option[HLL] =
if (items.isEmpty) {
None
} else {
val iter = items.toIterator.buffered
var curValue = iter.next
while (iter.hasNext) {
curValue = (curValue, iter.head) match {
case (DenseHLL(_, _), _) => denseUpdate(curValue, iter)
case (_, DenseHLL(_, _)) => denseUpdate(curValue, iter)
case _ => curValue + iter.next
}
}
Some(curValue)
}
def create(example: Array[Byte]): HLL = {
val hashed = hash(example)
val (j, rhow) = jRhoW(hashed, bits)
SparseHLL(bits, Map(j -> Max(rhow)))
}
def batchCreate[T <% Array[Byte]](instances: Iterable[T]): HLL = {
val allMaxRhow = instances
.map { x => jRhoW(hash(x), bits) }
.groupBy { case (j, rhow) => j }
.mapValues { _.maxBy { case (j, rhow) => rhow } }
.mapValues { case (j, rhow) => Max(rhow) }
if (allMaxRhow.size * 16 <= size) {
SparseHLL(bits, allMaxRhow)
} else {
SparseHLL(bits, allMaxRhow).toDenseHLL
}
}
final def estimateSize(hll: HLL): Double = {
hll.estimatedSize
}
final def sizeOf(hll: HLL): Approximate[Long] = {
hll.approximateSize
}
final def estimateIntersectionSize(his: Seq[HLL]): Double = {
intersectionSize(his).estimate.toDouble
}
final def intersectionSize(his: Seq[HLL]): Approximate[Long] = {
his.headOption.map { head =>
val tail = his.tail
/*
* |A n B| = |A| + |B| - |A u B|
* in the below, we set A = head, and B = tail.
* then note that A u (B0 n B1 n ...) = (B0 u A) n (B1 u A) n ...
* the latter we can compute with tail.map { _ + A } using the HLLInstance +
* since + on HLLInstance creates the instance for the union.
*/
sizeOf(head) + intersectionSize(tail) -
intersectionSize(tail.map { _ + head })
}
.map { _.withMin(0L) } // We always know the intersection is >= 0
.getOrElse(Approximate.exact(0L)) // Empty lists have no intersection
}
}
object HyperLogLogAggregator {
def apply(bits: Int): HyperLogLogAggregator = {
val monoid = new HyperLogLogMonoid(bits)
new HyperLogLogAggregator(monoid)
}
def sizeAggregator(bits: Int): Aggregator[Array[Byte], HLL, Double] = {
apply(bits).andThenPresent(_.estimatedSize)
}
}
case class HyperLogLogAggregator(val hllMonoid: HyperLogLogMonoid) extends MonoidAggregator[Array[Byte], HLL, HLL] {
val monoid = hllMonoid
def prepare(value: Array[Byte]) = monoid.create(value)
def present(hll: HLL) = hll
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy