// shark.util.MurmurHash3_x86_128.scala
/*
* Copyright (C) 2012 The Regents of The University California.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package shark.util
import java.lang.Integer.{ rotateLeft => rotl }
/**
* The MurmurHash3_x86_128(...) is a fast, non-cryptographic, 128-bit hash
* function that has excellent avalanche and 2-way bit independence properties.
*
*
* The C++ version, revision 147, of MurmurHash3, written by Austin Appleby,
* and which is in the Public Domain, was the inspiration for this
* implementation in Scala. The C++ version can be found in the
* SMHasher & MurmurHash project.
*
* The Scala implementation follows the C++ version closely, with two additional features
* tailored for scenarios where object allocation is expensive, e.g. where the hash function
* is called several million times.
* Use the method hash(data, seed, length) if you would like to reuse the same input buffer.
* Likewise, use the method hash(data, seed, length, results) if you would like to reuse
* the output buffer which is always of a fixed length 4.
*
*
* @author Ram Sriharsha (harshars at yahoo-inc dot com)
*/
/**
 * Mutable running state of a 128-bit MurmurHash3 computation, held as four 32-bit lanes.
 *
 * The lanes start out as the constructor arguments. `blockMix` folds one 16-byte block
 * (four 32-bit words) into the lanes, and `finalMix` folds in the zero-padded tail words
 * together with the input length; after `finalMix` the lanes `h1`..`h4` hold the digest.
 *
 * @param h1 initial value of the first lane
 * @param h2 initial value of the second lane
 * @param h3 initial value of the third lane
 * @param h4 initial value of the fourth lane
 */
sealed class HashState(var h1: Int, var h2: Int, var h3: Int, var h4: Int) {

  // Multiplicative constants used by the selfMixK* helpers below.
  val C1 = 0x239b961b
  val C2 = 0xab0e9789
  val C3 = 0x38b34ae5
  val C4 = 0xa1e38b93

  /**
   * Folds one full block of four 32-bit words into the state.
   *
   * The lanes are updated strictly in order h1, h2, h3, h4; the h4 update reads the
   * already-updated h1, so the statement order must not be changed.
   */
  @inline final def blockMix(k1: Int, k2: Int, k3: Int, k4: Int): Unit = {
    h1 ^= selfMixK1(k1)
    h1 = rotl(h1, 19) + h2
    h1 = h1 * 5 + 0x561ccd1b

    h2 ^= selfMixK2(k2)
    h2 = rotl(h2, 17) + h3
    h2 = h2 * 5 + 0x0bcaa747

    h3 ^= selfMixK3(k3)
    h3 = rotl(h3, 15) + h4
    h3 = h3 * 5 + 0x96cd1c35

    h4 ^= selfMixK4(k4)
    h4 = rotl(h4, 13) + h1
    h4 = h4 * 5 + 0x32ac3b17
  }

  /**
   * Folds the tail words and the total input length into the state and finalizes it.
   *
   * @param k1 first tail word (0 when the tail has no bytes for it)
   * @param k2 second tail word (0 when absent)
   * @param k3 third tail word (0 when absent)
   * @param k4 fourth tail word (0 when absent)
   * @param len value XOR-ed into every lane before the final avalanche; the hash driver
   *            in this file passes the number of hashed bytes
   */
  @inline final def finalMix(k1: Int, k2: Int, k3: Int, k4: Int, len: Int): Unit = {
    // Self-mixing a zero word yields zero (0 * c == 0, rotl(0, r) == 0), so absent tail
    // words leave their lane untouched without needing an explicit zero check.
    h1 ^= selfMixK1(k1)
    h2 ^= selfMixK2(k2)
    h3 ^= selfMixK3(k3)
    h4 ^= selfMixK4(k4)

    h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len

    crossAdd()
    h1 = fmix(h1)
    h2 = fmix(h2)
    h3 = fmix(h3)
    h4 = fmix(h4)
    crossAdd()
  }

  /** Adds every lane into h1, then adds the new h1 back into h2, h3 and h4. */
  @inline private def crossAdd(): Unit = {
    h1 += h2; h1 += h3; h1 += h4
    h2 += h1; h3 += h1; h4 += h1
  }

  /**
   * 32-bit finalization mix.
   *
   * The shifts are logical (`>>>`): the algorithm is defined on unsigned 32-bit words.
   * An arithmetic shift would smear the sign bit into the high half, making `h` and `~h`
   * produce the same result (e.g. both 0 and -1 would map to 0).
   *
   * @param hash value to avalanche
   * @return the mixed value
   */
  @inline final def fmix(hash: Int): Int = {
    var h = hash
    h ^= h >>> 16
    h *= 0x85ebca6b
    h ^= h >>> 13
    h *= 0xc2b2ae35
    h ^= h >>> 16
    h
  }

  /** Pre-mixes a word destined for lane h1: multiply by C1, rotate left 15, multiply by C2. */
  @inline final def selfMixK1(k: Int): Int = rotl(k * C1, 15) * C2

  /** Pre-mixes a word destined for lane h2: multiply by C2, rotate left 16, multiply by C3. */
  @inline final def selfMixK2(k: Int): Int = rotl(k * C2, 16) * C3

  /** Pre-mixes a word destined for lane h3: multiply by C3, rotate left 17, multiply by C4. */
  @inline final def selfMixK3(k: Int): Int = rotl(k * C3, 17) * C4

  /** Pre-mixes a word destined for lane h4: multiply by C4, rotate left 18, multiply by C1. */
  @inline final def selfMixK4(k: Int): Int = rotl(k * C4, 18) * C1
}
/**
 * Entry points for the 128-bit MurmurHash3-style hash over byte arrays.
 *
 * The digest is returned as four Ints. Input bytes are packed into 32-bit words by
 * `getInt` with the first byte in the most significant position.
 */
object MurmurHash3_x86_128 {

  /**
   * Hashes the whole of `data`.
   *
   * @param data is the bytes to be hashed.
   * @param seed is the seed for the murmurhash algorithm.
   * @return a new array of size 4 holding the four ints that comprise the 128 bit hash.
   */
  @inline final def hash(data: Array[Byte], seed: Int)
  : Array[Int] = {
    hash(data, seed, data.length)
  }

  /**
   * An optimization for reusing memory under large number of hash calls: hashes the
   * first `length` bytes of `data` and writes the digest into a caller-supplied buffer.
   *
   * @param data is the bytes to be hashed.
   * @param seed is the seed for the murmurhash algorithm.
   * @param length is the number of leading bytes of `data` to hash; it must not exceed
   *               `data.length`, otherwise indexing `data` fails.
   * @param results is the output buffer to store the four ints that are returned,
   *                should have size at least 4. Slots 0..3 are overwritten.
   */
  @inline final def hash(data: Array[Byte], seed: Int, length: Int,
      results: Array[Int]): Unit = {
    val blocks = length >> 4
    val state = new HashState(seed, seed, seed, seed)

    var i = 0
    while (i < blocks) {
      // A block is 16 bytes (four 32-bit words), so block i starts at byte 16 * i.
      // Using a 4-byte stride here would make blocks overlap and leave the bytes
      // just before the tail out of the hash entirely.
      val base = i << 4
      state.blockMix(
        getInt(data, base, 4),
        getInt(data, base + 4, 4),
        getInt(data, base + 8, 4),
        getInt(data, base + 12, 4))
      i += 1
    }

    // At most 15 bytes remain after the last full block; spread them over up to four
    // words, leaving words that receive no bytes at zero.
    val tail = blocks << 4
    val rem = length - tail
    val k1 = tailWord(data, tail, rem)
    val k2 = tailWord(data, tail + 4, rem - 4)
    val k3 = tailWord(data, tail + 8, rem - 8)
    val k4 = tailWord(data, tail + 12, rem - 12)

    state.finalMix(k1, k2, k3, k4, length)
    results(0) = state.h1
    results(1) = state.h2
    results(2) = state.h3
    results(3) = state.h4
  }

  /**
   * An optimization for reusing memory under large number of hash calls: hashes only
   * the first `length` bytes of a (possibly larger, reused) input buffer.
   *
   * @param data is the bytes to be hashed.
   * @param seed is the seed for the murmurhash algorithm.
   * @param length is the length of the buffer to use for hashing.
   * @return is an array of size 4 that holds the four ints that comprise the 128 bit hash.
   */
  @inline final def hash(data: Array[Byte], seed: Int, length: Int)
  : Array[Int] = {
    val results = new Array[Int](4)
    hash(data, seed, length, results)
    results
  }

  /**
   * Utility function to convert up to four bytes into an int, filling in zeros
   * for the low-order bytes when fewer than four bytes are requested.
   *
   * @param data is the byte array to be converted to an int.
   * @param index is the starting index in the byte array.
   * @param rem is the number of bytes to examine: 0 yields 0 without touching `data`,
   *            1 to 3 pack that many bytes from the most significant position down,
   *            and any other value reads a full four bytes.
   * @return the packed word.
   */
  @inline final def getInt(data: Array[Byte], index: Int, rem: Int): Int = {
    rem match {
      case 0 => 0
      case 1 => data(index) << 24
      case 2 => (data(index) << 24) |
        ((data(index + 1) & 0xFF) << 16)
      case 3 => (data(index) << 24) |
        ((data(index + 1) & 0xFF) << 16) |
        ((data(index + 2) & 0xFF) << 8)
      case _ => (data(index) << 24) |
        ((data(index + 1) & 0xFF) << 16) |
        ((data(index + 2) & 0xFF) << 8) |
        (data(index + 3) & 0xFF)
    }
  }

  /**
   * Reads one tail word: 0 when no bytes are available for it, otherwise the
   * zero-padded packing of the `avail` (capped at four) bytes starting at `index`.
   */
  @inline private def tailWord(data: Array[Byte], index: Int, avail: Int): Int =
    if (avail <= 0) 0 else getInt(data, index, avail)
}