com.twitter.algebird.MinHasher.scala Maven / Gradle / Ivy

Go to download
package com.twitter.algebird

import java.nio._

/**
  * Instances of MinHasher can create, combine, and compare fixed-sized signatures of
  * arbitrarily sized sets.
  *
  * A signature is represented by a byte array of approx maxBytes size.
  * You can initialize a signature with a single element, usually a Long or String.
  * You can combine any two set's signatures to produce the signature of their union.
  * You can compare any two set's signatures to estimate their jaccard similarity.
  * You can use a set's signature to estimate the number of distinct values in the set.
  * You can also use a combination of the above to estimate the size of the intersection of
  * two sets from their signatures.
  * The more bytes in the signature, the more accurate all of the above will be.
  * 
  * You can also use these signatures to quickly find similar sets without doing
  * n^2 comparisons. Each signature is assigned to several buckets; sets whose signatures
  * end up in the same bucket are likely to be similar. The targetThreshold controls
  * the desired level of similarity - the higher the threshold, the more efficiently
  * you can find all the similar sets.
  *
  * This abstract superclass is generic with regards to the size of the hash used.
  * Depending on the number of unique values in the domain of the sets, you may want
  * a MinHasher16, a MinHasher32, or a new custom subclass.
  *
  * This implementation is modeled after Chapter 3 of Ullman and Rajaraman's Mining of Massive Datasets:
  * http://infolab.stanford.edu/~ullman/mmds/ch3a.pdf
**/
abstract class MinHasher[H](targetThreshold : Double, maxBytes : Int)(implicit n : Numeric[H]) extends Monoid[Array[Byte]] {
  /** the number of bytes used for each hash in the signature */
  def hashSize : Int

  /** For explanation of the "bands" and "rows" see Ullman and Rajaraman */
  val numBands = pickBands(targetThreshold, maxBytes / hashSize)
  val numRows = maxBytes / numBands / hashSize
  val numHashes = numRows * numBands
  val numBytes = numHashes * hashSize

  /** This seed could be anything */
  val seed = 123456789

  /** We always use a 128 bit hash function, so the number of hash functions is different
    * (and usually smaller) than the number of hashes in the signature.
  **/
  val hashFunctions = {
    val r = new scala.util.Random(seed)
    val numHashFunctions = math.ceil(numBytes / 16.0).toInt
    (1 to numHashFunctions).map{i => MurmurHash128(r.nextLong)}    
  }

  /** Signature for empty set, needed to be a proper Monoid */
  val zero = buildArray{maxHash}

  /** Set union */
  def plus(left : Array[Byte], right : Array[Byte]) = {
    buildArray(left, right){(l,r) => n.min(l, r)}
  }
  
  /** Esimate jaccard similarity (size of union / size of intersection) */
  def similarity(left : Array[Byte], right : Array[Byte]) = {
    val matching = buildArray(left,right){(l,r) => if(l == r) n.one else n.zero}
    matching.map{_.toDouble}.sum / numHashes
  } 

  /** Bucket keys to use for quickly finding other similar items via locality sensitive hashing */
  def buckets(sig : Array[Byte]) = {
    sig.grouped(numRows*hashSize).toList.map{band =>
      val (long1, long2) = hashFunctions.head(band)
      long1
    }
  }

  /** Create a signature for a single Long value */
  def init(value : Long) : Array[Byte] = init{_(value)}

  /** Create a signature for a single String value */
  def init(value : String) : Array[Byte]= init{_(value)}

  /** Create a signature for an arbitrary value */
  def init(fn : MurmurHash128 => (Long,Long)) : Array[Byte] = {
    val bytes = new Array[Byte](numBytes)
    val buffer = ByteBuffer.allocate(hashFunctions.size * 16)
    val longBuffer = buffer.asLongBuffer
    hashFunctions.foreach{h =>
      val (long1, long2) = fn(h)
      longBuffer.put(long1)
      longBuffer.put(long2)
    }
    buffer.rewind
    buffer.get(bytes)
    bytes
  }

  /** useful for understanding the effects of numBands and numRows */
  val estimatedThreshold = math.pow(1.0/numBands, 1.0/numRows)

  /** useful for understanding the effects of numBands and numRows */
  def probabilityOfInclusion(sim : Double) = 1.0 - math.pow(1.0 - math.pow(sim, numRows), numBands)

  /** numerically solve the inverse of estimatedThreshold, given numBands*numRows */
  def pickBands(threshold : Double, hashes : Int) = {
    val target = hashes * -1 * math.log(threshold)
    var bands = 1
    while(bands * math.log(bands) < target)
      bands += 1
    bands
  }

  /** Maximum value the hash can take on (not 2*hashSize because of signed types) */
  def maxHash : H

  /** Initialize a byte array by generating hash values */
  def buildArray(fn: => H) : Array[Byte]

  /** Decode two signatures into hash values, combine them somehow, and produce a new array */
  def buildArray(left : Array[Byte], right : Array[Byte])(fn: (H,H) => H) : Array[Byte]
}

class MinHasher32(t : Double, n : Int) extends MinHasher[Int](t,n) {
  def hashSize = 4  
  def maxHash = Int.MaxValue
  def buildArray(fn: => Int) : Array[Byte] = {
    val byteBuffer = ByteBuffer.allocate(numBytes)    
    val writeBuffer = byteBuffer.asIntBuffer
    1.to(numHashes).foreach{i => writeBuffer.put(fn)}
    byteBuffer.array
  }

  def buildArray(left : Array[Byte], right : Array[Byte])(fn: (Int,Int) => Int) : Array[Byte] = {
    val leftBuffer = ByteBuffer.wrap(left).asIntBuffer
    val rightBuffer = ByteBuffer.wrap(right).asIntBuffer
    buildArray{fn(leftBuffer.get, rightBuffer.get)}
  }

  /** seems to work, but experimental and not generic yet */
  def approxCount(sig : Array[Byte]) = {
    val buffer = ByteBuffer.wrap(sig).asIntBuffer
    val mean = 1.to(numHashes).map{i => buffer.get.toLong}.sum / numHashes
    (2L << 31) / (mean.toLong + (2L << 30))
  }
}

class MinHasher16(t : Double, n : Int) extends MinHasher[Char](t,n) {
  def hashSize = 2  
  def maxHash = Char.MaxValue
  def buildArray(fn: => Char) : Array[Byte] = {
    val byteBuffer = ByteBuffer.allocate(numBytes)    
    val writeBuffer = byteBuffer.asCharBuffer
    1.to(numHashes).foreach{i => writeBuffer.put(fn)}
    byteBuffer.array
  }

  def buildArray(left : Array[Byte], right : Array[Byte])(fn: (Char,Char) => Char) : Array[Byte] = {
    val leftBuffer = ByteBuffer.wrap(left).asCharBuffer
    val rightBuffer = ByteBuffer.wrap(right).asCharBuffer
    buildArray{fn(leftBuffer.get, rightBuffer.get)}
  }
}