All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.soundcloud.lsh.package.scala Maven / Gradle / Ivy

The newest version!
package com.soundcloud

import java.util.Random

import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, IndexedRow, IndexedRowMatrix}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector}
import org.apache.spark.rdd.RDD

package object lsh {

  /**
   * An id with it's hash encoding.
   *
   */
  final case class SparseSignature(index: Long, bitSet: BitSet) extends Ordered[SparseSignature] {
    override def compare(that: SparseSignature): Int = bitSetComparator(this.bitSet, that.bitSet)
  }

  /**
   * An id with it's hash encoding and original vector.
   *
   */
  final case class Signature(index: Long, vector: Vector, bitSet: BitSet) extends Ordered[Signature] {
    override def compare(that: Signature): Int = bitSetComparator(this.bitSet, that.bitSet)
  }

  /**
   * Compares two bit sets according to the first different bit
   *
   */
  def bitSetComparator(a: BitSet, b: BitSet): Int = {
    val xor = a ^ b
    val firstDifference = xor.nextSetBit(0)
    if (firstDifference >= 0) {
      if (a.get(firstDifference)) // if the difference is set to 1 on a
        1
      else
        -1
    } else {
      0
    }
  }

  /**
   * Returns a string representation of a BitSet
   */
  def bitSetToString(bs: BitSet): String = bs.iterator.mkString(",")


  /**
   * Returns a local k by d matrix with random gaussian entries mean=0.0 and
   * std=1.0
   *
   * This is a k by d matrix as it is multiplied by the input matrix
   *
   */
  def localRandomMatrix(d: Int, numFeatures: Int): Matrix = {
    val randomGenerator = new Random()
    val values = Array.fill(numFeatures * d)(randomGenerator.nextGaussian())
    Matrices.dense(numFeatures, d, values)
  }

  /**
   * Converts a given input matrix to a bit set representation using random hyperplanes
   *
   */
  def matrixToBitSet(inputMatrix: IndexedRowMatrix, localRandomMatrix: Matrix): RDD[Signature] = {
    val bitSets = inputMatrix.multiply(localRandomMatrix).rows.map {
      indexedRow =>
        (indexedRow.index, vectorToBitSet(indexedRow.vector))
    }
    val originalVectors = inputMatrix.rows.map { row => (row.index, row.vector) }
    bitSets.join(originalVectors).map {
      case (id, (bitSet, vector)) =>
        Signature(id, vector, bitSet)
    }
  }

  /**
   * Converts a given input matrix to a bit set representation using random hyperplanes
   *
   */
  def matrixToBitSetSparse(inputMatrix: IndexedRowMatrix, localRandomMatrix: Matrix): RDD[SparseSignature] = {
    inputMatrix.multiply(localRandomMatrix).rows.map {
      indexedRow: IndexedRow =>
        val bitSet = vectorToBitSet(indexedRow.vector)
        SparseSignature(indexedRow.index, bitSet)
    }
  }

  /**
   * Converts a vector to a bit set by replacing all values of x with sign(x)
   *
   */
  def vectorToBitSet(vector: Vector): BitSet = {
    val bitSet = new BitSet(vector.size)
    vector.toArray.zipWithIndex.map {
      case ((value: Double, index: Int)) =>
        if (math.signum(value) > 0)
          bitSet.set(index)
    }
    bitSet
  }

  /**
   * Approximates the cosine distance of two bit sets using their hamming
   * distance
   *
   */
  def hammingToCosine(hammingDistance: Int, d: Double): Double = {
    val pr = 1.0 - (hammingDistance / d)
    math.cos((1.0 - pr) * math.Pi)
  }

  /**
   * Returns the hamming distance between two bit vectors
   *
   */
  def hamming(vec1: BitSet, vec2: BitSet): Int = {
    (vec1 ^ vec2).cardinality()
  }

  /**
   * Compares two bit sets for their equality
   *
   */
  def bitSetIsEqual(vec1: BitSet, vec2: BitSet): Boolean = {
    hamming(vec1, vec2) == 0
  }

  /**
   * Take distinct matrix entry values based on the indices only.
   * The actual values are discarded.
   *
   */
  def distinct(matrix: RDD[MatrixEntry]): RDD[MatrixEntry] = {
    matrix.keyBy(m => (m.i, m.j)).reduceByKey((x, y) => x).values
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy