/*
Copyright 2012 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package com.twitter.algebird

import java.nio.ByteBuffer

/** A super lightweight (hopefully) version of BitSet */
case class BitSetLite(in: Array[Byte]) {
  def contains(x: Int): Boolean = {
    /**
     * Pretend 'in' is little endian so that the bitstring b0b1b2b3 is such that if b0 == 1, then
     *  0 is in the bitset, if b1 == 1, then 1 is in the bitset.
     */
    val arrayIdx = x / 8
    val remainder = x % 8
    ((in(arrayIdx) >> (7 - remainder)) & 1) == 1
  }
}
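
/*
 * Worked example (illustrative, values chosen by hand): bit 0 is the most
 * significant bit of byte 0, bit 7 is its least significant bit, bit 8 is the
 * most significant bit of byte 1, and so on:
 *
 *   val bs = BitSetLite(Array(0x80.toByte, 0x01.toByte))
 *   bs.contains(0)  // true:  the MSB of byte 0 is set
 *   bs.contains(7)  // false: the LSB of byte 0 is clear
 *   bs.contains(15) // true:  the LSB of byte 1 is set
 */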

/**
 * Implementation of the HyperLogLog approximate counting as a Monoid
 * @link http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
 *
 * HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm
 * Philippe Flajolet and Éric Fusy and Olivier Gandouet and Frédéric Meunier
 */
object HyperLogLog {

  /* Size of the hash in bits */
  val hashSize = 128

  def hash(input: Array[Byte]): Array[Byte] = {
    val seed = 12345678
    val (l0, l1) = MurmurHash128(seed)(input)
    val buf = new Array[Byte](16)
    ByteBuffer
      .wrap(buf)
      .putLong(l0)
      .putLong(l1)
    buf
  }
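
  /*
   * Sketch of the contract: hash always yields a 16-byte (128-bit) digest,
   * regardless of input length, e.g.
   *
   *   HyperLogLog.hash("hello".getBytes("UTF-8")).length == 16
   *   HyperLogLog.hash(Array[Byte](1, 2, 3)).length == 16
   */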

  implicit def int2Bytes(i: Int): Array[Byte] = {
    val buf = new Array[Byte](4)
    ByteBuffer
      .wrap(buf)
      .putInt(i)
    buf
  }

  implicit def long2Bytes(i: Long): Array[Byte] = {
    val buf = new Array[Byte](8)
    ByteBuffer
      .wrap(buf)
      .putLong(i)
    buf
  }

  def twopow(i: Int): Double = scala.math.pow(2.0, i)

  /**
   * The value 'j' is equal to <w_0, w_1 ... w_(bits-1)>, i.e. the first 'bits'
   *  bits of the hash, interpreted as the register index.
   *  TODO: We could read in a byte at a time.
   */
  def j(bsl: BitSetLite, bits: Int): Int = {
    @annotation.tailrec
    def loop(pos: Int, accum: Int): Int = {
      if (pos >= bits) {
        accum
      } else if (bsl.contains(pos)) {
        loop(pos + 1, accum + (1 << pos))
      } else {
        loop(pos + 1, accum)
      }
    }
    loop(0, 0)
  }

  /**
   * The value 'w' is equal to <w_bits ... w_n>. The function rho gives the position of the
   *  leftmost one bit in 'w' (the number of leading zeroes plus one). We can calculate
   *  rho(w) at once with the method rhoW.
   */
  def rhoW(bsl: BitSetLite, bits: Int): Byte = {
    @annotation.tailrec
    def loop(pos: Int, zeros: Int): Int =
      if (bsl.contains(pos)) zeros
      else loop(pos + 1, zeros + 1)
    loop(bits, 1).toByte
  }

  /**
   * We are computing j and \rho(w) from the paper; sorry for the names, but they allow
   *  someone to compare directly to the paper. There is an extremely low probability that
   *  rhow (the position of the leftmost one bit) is > 127, so we use a Byte to store it.
   *  Given a hash <w_0, w_1, w_2 ... w_n>, the value 'j' is equal to <w_0, w_1 ... w_(bits-1)> and
   *  the value 'w' is equal to <w_bits ... w_n>. The function rho gives the position of the
   *  leftmost one bit in 'w'. We can calculate rho(w) at once with the method rhoW.
   */
  def jRhoW(in: Array[Byte], bits: Int): (Int, Byte) = {
    val onBits = BitSetLite(in)
    (j(onBits, bits), rhoW(onBits, bits))
  }
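
  /*
   * Worked example (illustrative, two-byte hash, bits = 8):
   *
   *   // byte 0 = 0x01: positions 0..6 are clear, position 7 is set
   *   // byte 1 = 0x20: positions 8 and 9 are clear, position 10 is set
   *   jRhoW(Array(0x01.toByte, 0x20.toByte), 8) == (128, 3)
   *
   * j = 1 << 7 = 128 because bit position p contributes 2^p to j, and
   * rhoW = 3 because the first set bit at or after position 'bits' is the
   * third position inspected (8 and 9 are zero, 10 is one).
   */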

  def toBytes(h: HLL): Array[Byte] = {
    h match {
      case SparseHLL(bits, maxRhow) =>
        val jLen = (bits + 7) / 8
        assert(jLen >= 1)
        assert(jLen <= 3)
        val buf = new Array[Byte](1 + 1 + (jLen + 1) * maxRhow.size)
        val byteBuf = ByteBuffer
          .wrap(buf)
          .put(3: Byte)
          .put(bits.toByte)
        maxRhow.foldLeft(byteBuf) { (bb, jrhow) =>
          val (j, rhow) = jrhow
          bb.put((j & 0xff).toByte)
          if (jLen >= 2) bb.put(((j >> 8) & 0xff).toByte)
          if (jLen >= 3) bb.put(((j >> 16) & 0xff).toByte)
          bb.put(rhow.get)
        }
        buf

      case DenseHLL(bits, v) => ((2: Byte) +: bits.toByte +: v).toArray
    }
  }

  // Make sure to be reversible so fromBytes(toBytes(x)) == x
  def fromBytes(bytes: Array[Byte]): HLL = {
    val bb = ByteBuffer.wrap(bytes)
    bb.get.toInt match {
      case 2 => DenseHLL(bb.get, bytes.toIndexedSeq.tail.tail)
      case 3 => sparseFromByteBuffer(bb)
      case n => throw new Exception("Unrecognized HLL type: " + n)
    }
  }
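
  /*
   * Round-trip sketch: both representations survive serialization.
   *
   *   val sparse = SparseHLL(12, Map(5 -> Max(3: Byte)))
   *   fromBytes(toBytes(sparse)) == sparse                          // true
   *   fromBytes(toBytes(sparse.toDenseHLL)) == sparse.toDenseHLL    // true
   */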

  def fromByteBuffer(bb: ByteBuffer): HLL = {
    bb.get.toInt match {
      case 2 =>
        val bits = bb.get
        val buf = new Array[Byte](bb.remaining)
        bb.get(buf)
        DenseHLL(bits, buf)
      case 3 => sparseFromByteBuffer(bb)
      case n => throw new Exception("Unrecognized HLL type: " + n)
    }
  }

  private def sparseFromByteBuffer(bb: ByteBuffer): SparseHLL = {
    val bits = bb.get
    val jLen = (bits + 7) / 8
    assert(bb.remaining % (jLen + 1) == 0, "Invalid byte array")
    val maxRhow = (1 to bb.remaining / (jLen + 1)).map { _ =>
      val j = jLen match {
        case 1 => (bb.get.toInt & 0xff)
        case 2 => (bb.get.toInt & 0xff) + ((bb.get.toInt & 0xff) << 8)
        case 3 => (bb.get.toInt & 0xff) + ((bb.get.toInt & 0xff) << 8) + ((bb.get.toInt & 0xff) << 16)
      }
      val rhow = bb.get
      j -> Max(rhow)
    }.toMap
    SparseHLL(bits, maxRhow)
  }

  def alpha(bits: Int): Double = bits match {
    case 4 => 0.673
    case 5 => 0.697
    case 6 => 0.709
    case _ => 0.7213 / (1.0 + 1.079 / (1 << bits).toDouble)
  }

  def error(bits: Int): Double = 1.04 / scala.math.sqrt(twopow(bits))
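
  /*
   * For example, error(12) = 1.04 / sqrt(2^12) = 1.04 / 64 ≈ 0.016, i.e. about a
   * 1.6% standard error for 2^12 = 4096 one-byte registers.
   */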
}

sealed abstract class HLL extends java.io.Serializable {
  import HyperLogLog._

  def bits: Int
  def size: Int
  def zeroCnt: Int
  def z: Double
  def +(other: HLL): HLL
  def toDenseHLL: DenseHLL

  def approximateSize = asApprox(initialEstimate)

  def estimatedSize = approximateSize.estimate.toDouble

  private def initialEstimate = {

    val e = factor * z
    // There are large- and small-range corrections in the paper.
    // We stopped using the large-range correction since, when using Longs,
    // it gave pathologically bad results. See https://github.com/twitter/algebird/issues/284
    if (e <= smallE) {
      smallEstimate(e)
    } else {
      e
    }
  }
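
  /*
   * The raw estimate above is the one from the paper:
   *
   *   E = alpha(bits) * m^2 / (sum over registers j of 2^(-M[j]))    with m = size = 2^bits
   *
   * (factor = alpha(bits) * m^2 and z = 1 / sum_j 2^(-M[j]); registers at zero
   * contribute 2^0 = 1 each). The small-range correction is E* = m * ln(m / V),
   * where V = zeroCnt is the number of registers still at zero.
   */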

  private def asApprox(v: Double): Approximate[Long] = {
    val stdev = error(bits)
    val lowerBound = math.floor(math.max(v * (1.0 - 3 * stdev), 0.0)).toLong
    val upperBound = math.ceil(v * (1.0 + 3 * stdev)).toLong
    // A lower bound on the probability of falling within 3 standard deviations;
    // see: http://en.wikipedia.org/wiki/68-95-99.7_rule
    val prob3StdDev = 0.9972
    Approximate(lowerBound, v.toLong, upperBound, prob3StdDev)
  }

  private def factor = alpha(bits) * size.toDouble * size.toDouble

  private def smallE = 5 * size / 2.0

  private def smallEstimate(e: Double): Double = {
    if (zeroCnt == 0) {
      e
    } else {
      size * scala.math.log(size.toDouble / zeroCnt)
    }
  }
  /**
   * Set each item in the given buffer to the max of this and the buffer
   */
  def updateInto(buffer: Array[Byte]): Unit
}

case class SparseHLL(bits: Int, maxRhow: Map[Int, Max[Byte]]) extends HLL {

  assert(bits > 3, "Use at least 4 bits (2^(bits) = bytes consumed)")

  val size = 1 << bits

  lazy val zeroCnt = size - maxRhow.size

  lazy val z = 1.0 / (zeroCnt.toDouble + maxRhow.values.map { mj => HyperLogLog.twopow(-mj.get) }.sum)

  def +(other: HLL) = {

    other match {

      case sparse @ SparseHLL(_, oMaxRhow) =>
        assert(sparse.size == size, "Incompatible HLL size: " + sparse.size + " != " + size)
        val allMaxRhow = Monoid.plus(maxRhow, oMaxRhow)
        if (allMaxRhow.size * 16 <= size) {
          SparseHLL(bits, allMaxRhow)
        } else {
          DenseHLL(bits, sparseMapToSequence(allMaxRhow))
        }

      case DenseHLL(bits, oldV) =>
        assert(oldV.size == size, "Incompatible HLL size: " + oldV.size + " != " + size)
        val newV = maxRhow.foldLeft(oldV) {
          case (v, (j, rhow)) =>
            if (rhow.get > v(j)) {
              v.updated(j, rhow.get)
            } else {
              v
            }
        }
        DenseHLL(bits, newV)

    }
  }

  def sparseMapToSequence(values: Map[Int, Max[Byte]]): IndexedSeq[Byte] = {
    val array = Array.fill[Byte](size)(0: Byte)
    values.foreach { case (j, rhow) => array.update(j, rhow.get) }
    array.toIndexedSeq
  }

  lazy val toDenseHLL = DenseHLL(bits, sparseMapToSequence(maxRhow))
  def updateInto(buffer: Array[Byte]): Unit = {
    assert(buffer.length == size, "Length mismatch")
    maxRhow.foreach {
      case (idx, maxb) =>
        buffer.update(idx, buffer(idx) max (maxb.get))
    }
  }
}
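
/*
 * Merge sketch: + takes the register-wise max and stays sparse while the map is
 * small relative to 2^bits.
 *
 *   val a = SparseHLL(12, Map(1 -> Max(2: Byte)))
 *   val b = SparseHLL(12, Map(1 -> Max(5: Byte), 7 -> Max(1: Byte)))
 *   a + b == SparseHLL(12, Map(1 -> Max(5: Byte), 7 -> Max(1: Byte)))  // true
 */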

/**
 * These are the individual instances which the Monoid knows how to add
 */
case class DenseHLL(bits: Int, v: IndexedSeq[Byte]) extends HLL {

  assert(v.size == (1 << bits), "Invalid size for dense vector: " + size + " != (1 << " + bits + ")")

  def size = v.size

  lazy val zeroCnt = v.count { _ == 0 }

  // Named from the parameter in the paper, probably never useful to anyone
  // except HyperLogLogMonoid
  lazy val z = 1.0 / (v.map { mj => HyperLogLog.twopow(-mj) }.sum)

  def +(other: HLL): HLL = {

    other match {

      case SparseHLL(_, _) => (other + this)

      case DenseHLL(_, ov) =>
        assert(ov.size == v.size, "Incompatible HLL size: " + ov.size + " != " + v.size)
        DenseHLL(bits,
          v
            .view
            .zip(ov)
            .map { case (rhow, oRhow) => rhow max oRhow }
            .toIndexedSeq)

    }
  }

  val toDenseHLL = this
  def updateInto(buffer: Array[Byte]): Unit = {
    assert(buffer.length == size, "Length mismatch")
    var idx = 0
    v.foreach { maxb =>
      buffer.update(idx, (buffer(idx)) max maxb)
      idx += 1
    }
  }
}

class IndexedSeqArrayByte(buf: Array[Byte]) extends scala.collection.IndexedSeq[Byte] {
  def length = buf.length
  def apply(idx: Int): Byte = buf.apply(idx)
  override def stringPrefix = "Array"
}

/*
 * Error is about 1.04/sqrt(2^{bits}), so 12 bits gives roughly 1.6% error and each
 * dense HLL takes about 2^{12} = 4096 bytes (4 KB) per instance; use more bits for
 * tighter error at the cost of memory.
 */
class HyperLogLogMonoid(val bits: Int) extends Monoid[HLL] {
  import HyperLogLog._

  assert(bits > 3, "Use at least 4 bits (2^(bits) = bytes consumed)")

  val size = 1 << bits

  def apply[T <% Array[Byte]](t: T) = create(t)

  val zero: HLL = SparseHLL(bits, Monoid.zero[Map[Int, Max[Byte]]])

  def plus(left: HLL, right: HLL) = left + right

  private[this] final def denseUpdate(existing: HLL, iter: Iterator[HLL]): HLL = {
    val buffer = new Array[Byte](size)
    existing.updateInto(buffer)
    iter.foreach { _.updateInto(buffer) }

    DenseHLL(bits, new IndexedSeqArrayByte(buffer))
  }
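
  // Summing strategy used below: sparse values are combined with + (cheap Map merges);
  // as soon as a dense HLL shows up, denseUpdate folds all remaining values into a
  // single mutable byte buffer via updateInto.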

  override def sumOption(items: TraversableOnce[HLL]): Option[HLL] =
    if (items.isEmpty) {
      None
    } else {
      val iter = items.toIterator.buffered
      var curValue = iter.next
      while (iter.hasNext) {
        curValue = (curValue, iter.head) match {
          case (DenseHLL(_, _), _) => denseUpdate(curValue, iter)
          case (_, DenseHLL(_, _)) => denseUpdate(curValue, iter)
          case _ => curValue + iter.next
        }
      }
      Some(curValue)
    }

  def create(example: Array[Byte]): HLL = {
    val hashed = hash(example)
    val (j, rhow) = jRhoW(hashed, bits)
    SparseHLL(bits, Map(j -> Max(rhow)))
  }

  def batchCreate[T <% Array[Byte]](instances: Iterable[T]): HLL = {
    val allMaxRhow = instances
      .map { x => jRhoW(hash(x), bits) }
      .groupBy { case (j, rhow) => j }
      .mapValues { _.maxBy { case (j, rhow) => rhow } }
      .mapValues { case (j, rhow) => Max(rhow) }
    if (allMaxRhow.size * 16 <= size) {
      SparseHLL(bits, allMaxRhow)
    } else {
      SparseHLL(bits, allMaxRhow).toDenseHLL
    }
  }

  final def estimateSize(hll: HLL): Double = {
    hll.estimatedSize
  }

  final def sizeOf(hll: HLL): Approximate[Long] = {
    hll.approximateSize
  }

  final def estimateIntersectionSize(his: Seq[HLL]): Double = {
    intersectionSize(his).estimate.toDouble
  }

  final def intersectionSize(his: Seq[HLL]): Approximate[Long] = {
    his.headOption.map { head =>
      val tail = his.tail
      /*
       * |A n B| = |A| + |B| - |A u B|
       * In the below we set A = head and B = (B0 n B1 n ...) = tail.
       * Note that A u (B0 n B1 n ...) = (B0 u A) n (B1 u A) n ...,
       * and the latter we can compute with tail.map { _ + head }, since
       * + on HLL produces the HLL of the union.
       */
      sizeOf(head) + intersectionSize(tail) -
        intersectionSize(tail.map { _ + head })
    }
      .map { _.withMin(0L) } // We always know the intersection is >= 0
      .getOrElse(Approximate.exact(0L)) // Empty lists have no intersection
  }
}
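
/*
 * Usage sketch (the `userIds: Seq[Long]` collection is hypothetical):
 *
 *   import com.twitter.algebird.HyperLogLog.long2Bytes
 *
 *   val hll = new HyperLogLogMonoid(bits = 12)       // ~1.6% standard error
 *   val combined: HLL = hll.sum(userIds.map(id => hll.create(id)))
 *   val estimate: Double = combined.estimatedSize
 */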

object HyperLogLogAggregator {
  def apply(bits: Int): HyperLogLogAggregator = {
    val monoid = new HyperLogLogMonoid(bits)
    new HyperLogLogAggregator(monoid)
  }

  def sizeAggregator(bits: Int): Aggregator[Array[Byte], HLL, Double] = {
    apply(bits).andThenPresent(_.estimatedSize)
  }
}

case class HyperLogLogAggregator(val hllMonoid: HyperLogLogMonoid) extends MonoidAggregator[Array[Byte], HLL, HLL] {
  val monoid = hllMonoid

  def prepare(value: Array[Byte]) = monoid.create(value)
  def present(hll: HLL) = hll
}
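
/*
 * Aggregator usage sketch:
 *
 *   val agg = HyperLogLogAggregator.sizeAggregator(bits = 12)
 *   val rawKeys: Seq[Array[Byte]] = Seq("a", "b", "a").map(_.getBytes("UTF-8"))
 *   val approxDistinct: Double = agg(rawKeys) // should be close to 2.0
 */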



