All downloads are FREE. The search and download functionality uses the official Maven repository.

fregata.hash.SimHash.scala Maven / Gradle / Ivy

The newest version!
package fregata.hash

import fregata._
import scala.collection.mutable.ArrayBuffer

/**
  * SimHash-style signature for feature vectors.
  *
  * Every feature index is hashed to `hashbits / 64` 64-bit words. For each non-zero
  * entry of the input vector, every bit of that hash votes `+value` (bit set) or
  * `-value` (bit clear) into a per-bit accumulator. The signature is the sparse 0/1
  * vector that has a 1 at every position whose accumulated vote is `>= 0`.
  *
  * Created by hjliu on 16/11/29.
  */
class SimHash extends Serializable {

  /** Offset used to derive the seeds of the third and later hash words. */
  var featureSize: Int = 0

  /** Dimension of the signature produced by `hash`. */
  var hashbits: Int = 0

  /**
    * Maps a feature index to its hash words (64 bits per word).
    *
    * Defaults to an empty result instead of `null`, so that `hash` does not throw a
    * NullPointerException on an instance created with the no-arg constructor.
    */
  var hashfunc: (Int) => Array[Long] = _ => Array.empty[Long]

  /**
    * @param featureSize offset used when seeding the third and later hash words
    * @param hashBits    requested signature width; 64, 128, ..., 512 select the matching
    *                    hasher, any other value falls back to the 256-bit hasher while
    *                    `hashbits` keeps the requested value
    */
  def this(featureSize: Int, hashBits: Int) = {
    this()
    this.featureSize = featureSize
    this.hashbits = hashBits

    hashfunc = hashBits match {
      case 64  => hash64
      case 128 => hash128
      case 192 => hash192
      case 256 => hash256
      case 320 => hash320
      case 384 => hash384
      case 448 => hash448
      case 512 => hash512
      case _   => hash256
    }
  }

  /**
    * Scrambles one 64-bit seed.
    *
    * NOTE(review): the constant and shift amounts look like the mix step of fast-hash,
    * but this code uses signed `>>` shifts. They are kept exactly as they were so the
    * produced hash values do not change.
    */
  private def mix(seed: Long): Long = {
    var h = seed
    h ^= (h >> 23)
    h *= 0x2127599bf4325c37L
    h ^= h >> 47
    h
  }

  /**
    * Produces `words` hash words for feature index `in`.
    *
    * Word `k` is seeded with `in + (k / 2) * featureSize` when `k` is even and with
    * `-in - (k / 2) * featureSize` when `k` is odd.
    */
  private def hashWords(in: Int, words: Int): Array[Long] = {
    val out = new Array[Long](words)
    var k = 0
    while (k < words) {
      // Deliberately an Int multiplication (it may wrap for very large featureSize):
      // this matches the previous `2 * featureSize` / `3 * featureSize` expressions,
      // so hash values stay bit-identical.
      val offset = (k / 2) * featureSize
      val seed = if ((k & 1) == 0) in.toLong + offset else -in.toLong - offset
      out(k) = mix(seed)
      k += 1
    }
    out
  }

  /** 64-bit hash of a feature index (1 word). */
  def hash64(in: Int): Array[Long] = hashWords(in, 1)

  /** 128-bit hash of a feature index (2 words). */
  def hash128(in: Int): Array[Long] = hashWords(in, 2)

  /** 192-bit hash of a feature index (3 words). */
  def hash192(in: Int): Array[Long] = hashWords(in, 3)

  /** 256-bit hash of a feature index (4 words). */
  def hash256(in: Int): Array[Long] = hashWords(in, 4)

  /** 320-bit hash of a feature index (5 words). */
  def hash320(in: Int): Array[Long] = hashWords(in, 5)

  /** 384-bit hash of a feature index (6 words). */
  def hash384(in: Int): Array[Long] = hashWords(in, 6)

  /** 448-bit hash of a feature index (7 words). */
  def hash448(in: Int): Array[Long] = hashWords(in, 7)

  /** 512-bit hash of a feature index (8 words). */
  def hash512(in: Int): Array[Long] = hashWords(in, 8)

  /**
    * Computes the signature of `vs`.
    *
    * @param vs input vector; entries equal to zero are skipped
    * @return a sparse vector of dimension `hashbits` holding 1 at every bit position
    *         whose accumulated vote is `>= 0`. Positions that receive no vote at all
    *         (for example when `vs` has no non-zero entry) therefore hold 1 as well.
    */
  def hash(vs: Vector): Vector = {
    val votes = new Array[Num](hashbits)

    vs.foreachPair { (i, v) =>
      if (v != 0d) {
        val words = hashfunc(i)
        // Only touch positions that exist in `votes`. When `hashbits` is not one of the
        // supported widths the fallback hasher may return more bits than `hashbits`;
        // without this bound the accumulator was indexed out of range.
        val usable = math.min(words.length * 64, votes.length)
        var bit = 0
        while (bit < usable) {
          if ((words(bit >> 6) & (1L << (bit & 63))) != 0) {
            votes(bit) += v
          } else {
            votes(bit) -= v
          }
          bit += 1
        }
      }
    }

    val indices = ArrayBuffer[Int]()
    val values  = ArrayBuffer[Num]()
    var j = 0
    while (j < votes.length) {
      if (votes(j) >= 0) {
        indices.append(j)
        values.append(1d)
      }
      j += 1
    }
    new SparseVector(indices.toArray, values.toArray, hashbits).asInstanceOf[Vector]
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy