com.soundcloud.lsh.QueryLsh.scala

package com.soundcloud.lsh

import com.soundcloud.lsh.SparkImplicits._
import org.apache.spark.ReExports.BoundedPriorityQueue
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

import scala.collection.mutable
import scala.reflect.ClassTag
import scala.util.Random

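// Key used to join query and catalog signatures: `bucketHash` is the hash code of an item's bit
// signature; `subBucketId` identifies the sub-bucket when an oversized bucket has been split
// (the default -1 marks an unsplit bucket).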
case class SubBucket(bucketHash: Int, subBucketId: Int = -1)

/**
 * Standard LSH implementation. The queryMatrix is hashed multiple times and exact hash matches are
 * looked up in the catalogMatrix. These candidates are then used to compute the exact cosine
 * similarity (see the usage sketch after the class).
 *
 * @param minCosineSimilarity minimum similarity two items need to have,
 *                            otherwise they are discarded from the result set
 * @param dimensions          number of random vectors (hyperplanes) used to generate bit
 *                            vectors of length `dimensions`
 * @param numNeighbours       maximum number of catalog items to be matched for a query when there is
 *                            a matching LSH hash
 * @param maxMatches          maximum number of candidates to return for each query
 * @param rounds              number of hash rounds. More rounds improve the approximation but
 *                            lengthen the computation.
 * @param randomReplications  number of times the catalog set should be replicated in order
 *                            to increase LSH recall.
 * @param bucketSplittingPercentile the percentile of the query bucket size that is used as reference for the maximal
 *                                  bucket size. Any bucket bigger than that is split into smaller sub-buckets.
 *
 */
class QueryLsh(minCosineSimilarity: Double,
               dimensions: Int,
               numNeighbours: Int,
               maxMatches: Int,
               rounds: Int,
               randomReplications: Int = 0,
               bucketSplittingPercentile: Double = 0.95)(sparkSession: SparkSession) extends QueryJoiner with Serializable {

  override def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val numFeatures = queryMatrix.numCols().toInt

    val neighbours = 0 until rounds map {
      i =>
        val randomMatrix = localRandomMatrix(dimensions, numFeatures)
        val querySignatures: RDD[(Int, Signature)] = matrixToBitSet(queryMatrix, randomMatrix).
          keyBy(_.bitSet.hashCode())

        querySignatures.setName(s"querySignatures run $i").cache()

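        // Count the queries per hash bucket and decide how many sub-buckets each oversized
        // bucket should be split into (see computeBucketSplits below).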
        val bucketSplits: Map[Int, Int] = computeBucketSplits(querySignatures)
        val bucketSplitsBc = sparkSession.sparkContext.broadcast(bucketSplits)


        val splitQuerySignatures: RDD[(SubBucket, Signature)] = splitIntoSubBuckets(querySignatures, bucketSplitsBc)

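        // Catalog signatures are emitted once per sub-bucket of a split bucket, so every randomly
        // assigned query sub-bucket still sees the full set of catalog candidates.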
        val duplicatedCatalogSignatures: RDD[(SubBucket, Signature)] =
          matrixToBitSet(catalogMatrix, randomMatrix).
            andThen(replicateAndRehash(randomReplications, _)).
            andThen(duplicateInSubBuckets(_, bucketSplitsBc))

        joinWithRightBounded(numNeighbours, splitQuerySignatures, duplicatedCatalogSignatures).
          values.
          flatMap {
            case (query, catalog) =>
              val cosine = Cosine(query.vector, catalog.vector)
              if(cosine < minCosineSimilarity)
                None
              else
                Some(MatrixEntry(query.index, catalog.index, cosine))
          }
    }

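    // Union the matches from all rounds and keep at most `maxMatches` distinct,
    // highest-similarity matches per query row.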
    val mergedNeighbours = distinctTopK(neighbours.reduce(_ ++ _), maxMatches)

    new CoordinateMatrix(mergedNeighbours)
  }

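  /*
   * Merge the per-round matches: for every query row (MatrixEntry.i) keep at most `k` distinct
   * entries with the highest cosine similarity, using one bounded priority queue per query
   * within each partition and a reduceByKey merge across partitions.
   */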
  private def distinctTopK(rdd: RDD[MatrixEntry], k: Int): RDD[MatrixEntry] = {
    rdd.mapPartitions { partition =>
      // It is assumed that the partitions do not contain duplicates (this is mostly true, but might be false if a query has been split into multiple sub-buckets,
      // and a catalog item is replicated in two sub-buckets, and the two sub-buckets end up in the same partition).
      // As a result, we don't deduplicate before shuffling and might end up with fewer than k unique items per partition in some cases.
      val m = mutable.Map.empty[Long, BoundedPriorityQueue[MatrixEntry]]

      for (entry <- partition) {
        val key = entry.i
        val queue = m.getOrElseUpdate(key, new BoundedPriorityQueue[MatrixEntry](k)(Ordering.by(_.value)))
        queue += entry
      }

      m.toIterator
    }.reduceByKey { case (matches1, matches2) =>
      // This is a pretty naive implementation of distinct+topk. It's likely possible to get better performance by writing a custom
      // priority queue that has no duplicates.
      val distinctMatches: Set[MatrixEntry] = matches1.toSet ++ matches2
      new BoundedPriorityQueue[MatrixEntry](k)(Ordering.by(_.value)) ++= distinctMatches
    }.flatMap(_._2)
  }

  private def duplicateInSubBuckets(signatures: RDD[Signature], bucketSplitsBc: Broadcast[Map[Int, Int]]): RDD[(SubBucket, Signature)] = {
    val newCatalogSignatures = signatures.
      flatMap { signature =>
        val hash = signature.bitSet.hashCode()
        bucketSplitsBc.value.get(hash) match {
          case None => Seq((SubBucket(hash), signature))
          case Some(numSplits) => 0 until numSplits map { i => (SubBucket(hash, i), signature) }
        }
      }
    newCatalogSignatures
  }

  private def splitIntoSubBuckets(querySignatures: RDD[(Int, Signature)], bucketSplitsBc: Broadcast[Map[Int, Int]]): RDD[(SubBucket, Signature)] = {
    querySignatures.mapPartitions { partition =>
      val random = new Random()
      partition.map { case (hash, signature) =>
        bucketSplitsBc.value.get(hash) match {
          case None => (SubBucket(hash), signature)
          case Some(numSplits) => (SubBucket(hash, random.nextInt(numSplits)), signature)
        }
      }
    }
  }

  /*
   * Compute how many times each bucket needs to be split.
   *
   * First, compute the `bucketSplittingPercentile` percentile of the bucket sizes, then determine the bucket splits so that
   * no bucket is bigger than that percentile.
   */
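  // Hypothetical example: if the `bucketSplittingPercentile` bucket size is 100 queries and a
  // bucket holds 350 queries, it is split into 350 / 100 + 1 = 4 sub-buckets (integer division).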
  private def computeBucketSplits(querySignatures: RDD[(Int, Signature)]): Map[Int, Int] = {
    import sparkSession.implicits._

    val bucketSizes: RDD[(Int, Int)] = querySignatures.mapValues(_ => 1).reduceByKey(_ + _).cache

    val Array(maxBucketSize) = bucketSizes.toDF("hash", "queryCount")
      .stat.approxQuantile("queryCount", Array(bucketSplittingPercentile), 0.001)

    val bucketSplits = bucketSizes.filter(_._2 > maxBucketSize).mapValues(_ / maxBucketSize.toInt + 1).collect().toMap
    bucketSizes.unpersist()
    bucketSplits
  }

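  /*
   * Inner join of query and catalog signatures on the (sub-)bucket key, considering at most
   * `bound` catalog signatures per key (the remainder is dropped via `take(bound)`).
   */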
  private def joinWithRightBounded[K: ClassTag, V: ClassTag, W: ClassTag]
                                  (bound: Int, rdd1: RDD[(K, V)], rdd2: RDD[(K, W)]): RDD[(K, (V, W))] = {
    rdd1.cogroup(rdd2).flatMapValues( pair =>
      for (v <- pair._1.iterator; w <- pair._2.iterator.take(bound)) yield (v, w)
    )
  }


  /*
   * Replicate data into neighboring hashes to increase recall.
   *
   * Each row of the RDD is replicated `replications` times by generating a new row and randomly
   * flipping a bit of the hash to put it into a different neighboring hash bucket.
   */
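  // For instance, with replications = 2 a signature with bits 1011 is emitted unchanged plus two
  // copies with one randomly chosen bit flipped each, e.g. 1111 and 1001, which land in
  // (likely different) neighbouring buckets.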
  private def replicateAndRehash(replications: Int, sigs: RDD[Signature]): RDD[Signature] = {
    if(replications==0) sigs
    else
      sigs.mapPartitions{ partition =>
        val random = new Random()
        partition.flatMap{ sig =>
          val indices = (0 until sig.bitSet.numBits).toSeq
          val selectedIndices = random.shuffle[Int, IndexedSeq](indices).take(replications)
          val perturbedBitSets = selectedIndices.map{ix => val bitSet = BitSet(sig.bitSet); bitSet.flip(ix); bitSet}
          sig +: perturbedBitSets.map(bitSet => sig.copy(bitSet=bitSet))
        }
      }
  }

}
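
// Minimal usage sketch, not part of the library: the object name, Spark settings and toy vectors
// below are hypothetical; the sketch only relies on the constructor and `join` signature defined
// above and on standard Spark MLlib types.
object QueryLshExample {
  import org.apache.spark.mllib.linalg.Vectors
  import org.apache.spark.mllib.linalg.distributed.IndexedRow

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("query-lsh-example").master("local[*]").getOrCreate()

    // Two tiny 4-dimensional matrices; in practice these would hold many rows.
    val queries = new IndexedRowMatrix(spark.sparkContext.parallelize(Seq(
      IndexedRow(0L, Vectors.dense(1.0, 0.0, 1.0, 0.0)),
      IndexedRow(1L, Vectors.dense(0.0, 1.0, 0.0, 1.0))
    )))
    val catalog = new IndexedRowMatrix(spark.sparkContext.parallelize(Seq(
      IndexedRow(0L, Vectors.dense(1.0, 0.1, 0.9, 0.0)),
      IndexedRow(1L, Vectors.dense(0.1, 1.0, 0.0, 0.9))
    )))

    val joiner = new QueryLsh(
      minCosineSimilarity = 0.5,
      dimensions = 20,
      numNeighbours = 200,
      maxMatches = 10,
      rounds = 5
    )(spark)

    // Each MatrixEntry holds (query row index, catalog row index, cosine similarity).
    joiner.join(queries, catalog).entries.collect().foreach(println)

    spark.stop()
  }
}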



