org.apache.datasketches.hash.MurmurHash3 Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.hash;
import java.io.Serializable;
/**
*
* The MurmurHash3 is a fast, non-cryptographic, 128-bit hash function that has
* excellent avalanche and 2-way bit independence properties.
*
*
*
* Austin Appleby's C++
*
* MurmurHash3_x64_128(...), final revision 150,
* which is in the Public Domain, was the inspiration for this implementation in Java.
*
*
*
* This java implementation pays close attention to the C++ algorithms in order to
* maintain bit-wise compatibility, but the design is quite different. This implementation has also
* been extended to include processing of arrays of longs, char or ints, which was not part of the
* original C++ implementation. This implementation produces the same exact output hash bits as
* the above C++ method given the same input.
*
* In addition, with this implementation, the hash of byte[], char[], int[], or long[] will
* produce the same hash result if, and only if, all the arrays have the same exact length in
* bytes, and if the contents of the values in the arrays have the same byte endianness and
* overall order. There is a unit test for this class that demonstrates this.
*
*
* The structure of this implementation also reflects a separation of code that is dependent on the
* input structure (in this case byte[], int[] or long[]) from code that is independent of the input
* structure. This also makes the code more readable and suitable for future extensions.
*
*
* @author Lee Rhodes
*/
public final class MurmurHash3 implements Serializable {
private static final long serialVersionUID = 0L;
private MurmurHash3() {}
//--Hash of long[]----------------------------------------------------
/**
* Returns a long array of size 2, which is a 128-bit hash of the input.
*
* @param key The input long[] array. Must be non-null and non-empty.
* @param seed A long valued seed.
* @return the hash.
*/
public static long[] hash(final long[] key, final long seed) {
final HashState hashState = new HashState(seed, seed);
final int longs = key.length; //in longs
// Number of full 128-bit blocks of 2 longs (the body).
// Possible exclusion of a remainder of 1 long.
final int nblocks = longs >>> 1; //longs / 2
// Process the 128-bit blocks (the body) into the hash
for (int i = 0; i < nblocks; i++ ) {
final long k1 = key[i << 1]; //0, 2, 4, ...
final long k2 = key[(i << 1) + 1]; //1, 3, 5, ...
hashState.blockMix128(k1, k2);
}
// Get the tail index, remainder length
final int tail = nblocks << 1; // 2 longs / block
final int rem = longs - tail; // remainder longs: 0,1
// Get the tail
final long k1 = (rem == 0) ? 0 : key[tail]; //k2 -> 0
// Mix the tail into the hash and return
return hashState.finalMix128(k1, 0, longs << 3); //convert to bytes
}
//--Hash of int[]----------------------------------------------------
/**
* Returns a long array of size 2, which is a 128-bit hash of the input.
*
* @param key The input int[] array. Must be non-null and non-empty.
* @param seed A long valued seed.
* @return the hash.
*/
public static long[] hash(final int[] key, final long seed) {
final HashState hashState = new HashState(seed, seed);
final int ints = key.length; //in ints
// Number of full 128-bit blocks of 4 ints.
// Possible exclusion of a remainder of up to 3 ints.
final int nblocks = ints >>> 2; //ints / 4
// Process the 128-bit blocks (the body) into the hash
for (int i = 0; i < nblocks; i++ ) { //4 ints per block
final long k1 = getLong(key, i << 2, 2); //0, 4, 8, ...
final long k2 = getLong(key, (i << 2) + 2, 2); //2, 6, 10, ...
hashState.blockMix128(k1, k2);
}
// Get the tail index, remainder length
final int tail = nblocks << 2; // 4 ints per block
final int rem = ints - tail; // remainder ints: 0,1,2,3
// Get the tail
final long k1;
final long k2;
if (rem > 2) { //k1 -> whole; k2 -> partial
k1 = getLong(key, tail, 2);
k2 = getLong(key, tail + 2, rem - 2);
}
else { //k1 -> whole, partial or 0; k2 == 0
k1 = (rem == 0) ? 0 : getLong(key, tail, rem);
k2 = 0;
}
// Mix the tail into the hash and return
return hashState.finalMix128(k1, k2, ints << 2); //convert to bytes
}
//--Hash of char[]----------------------------------------------------
/**
* Returns a long array of size 2, which is a 128-bit hash of the input.
*
* @param key The input char[] array. Must be non-null and non-empty.
* @param seed A long valued seed.
* @return the hash.
*/
public static long[] hash(final char[] key, final long seed) {
final HashState hashState = new HashState(seed, seed);
final int chars = key.length; //in chars
// Number of full 128-bit blocks of 8 chars.
// Possible exclusion of a remainder of up to 7 chars.
final int nblocks = chars >>> 3; //chars / 8
// Process the 128-bit blocks (the body) into the hash
for (int i = 0; i < nblocks; i++ ) { //8 chars per block
final long k1 = getLong(key, i << 3, 4); //0, 8, 16, ...
final long k2 = getLong(key, (i << 3) + 4, 4); //4, 12, 20, ...
hashState.blockMix128(k1, k2);
}
// Get the tail index, remainder length
final int tail = nblocks << 3; // 8 chars per block
final int rem = chars - tail; // remainder chars: 0,1,2,3,4,5,6,7
// Get the tail
final long k1;
final long k2;
if (rem > 4) { //k1 -> whole; k2 -> partial
k1 = getLong(key, tail, 4);
k2 = getLong(key, tail + 4, rem - 4);
}
else { //k1 -> whole, partial or 0; k2 == 0
k1 = (rem == 0) ? 0 : getLong(key, tail, rem);
k2 = 0;
}
// Mix the tail into the hash and return
return hashState.finalMix128(k1, k2, chars << 1); //convert to bytes
}
//--Hash of byte[]----------------------------------------------------
/**
* Returns a long array of size 2, which is a 128-bit hash of the input.
*
* @param key The input byte[] array. Must be non-null and non-empty.
* @param seed A long valued seed.
* @return the hash.
*/
public static long[] hash(final byte[] key, final long seed) {
final HashState hashState = new HashState(seed, seed);
final int bytes = key.length; //in bytes
// Number of full 128-bit blocks of 16 bytes.
// Possible exclusion of a remainder of up to 15 bytes.
final int nblocks = bytes >>> 4; //bytes / 16
// Process the 128-bit blocks (the body) into the hash
for (int i = 0; i < nblocks; i++ ) { //16 bytes per block
final long k1 = getLong(key, i << 4, 8); //0, 16, 32, ...
final long k2 = getLong(key, (i << 4) + 8, 8); //8, 24, 40, ...
hashState.blockMix128(k1, k2);
}
// Get the tail index, remainder length
final int tail = nblocks << 4; //16 bytes per block
final int rem = bytes - tail; // remainder bytes: 0,1,...,15
// Get the tail
final long k1;
final long k2;
if (rem > 8) { //k1 -> whole; k2 -> partial
k1 = getLong(key, tail, 8);
k2 = getLong(key, tail + 8, rem - 8);
}
else { //k1 -> whole, partial or 0; k2 == 0
k1 = (rem == 0) ? 0 : getLong(key, tail, rem);
k2 = 0;
}
// Mix the tail into the hash and return
return hashState.finalMix128(k1, k2, bytes);
}
//--HashState class---------------------------------------------------
/**
* Common processing of the 128-bit hash state independent of input type.
*/
private static final class HashState {
private static final long C1 = 0x87c37b91114253d5L;
private static final long C2 = 0x4cf5ad432745937fL;
private long h1;
private long h2;
HashState(final long h1, final long h2) {
this.h1 = h1;
this.h2 = h2;
}
/**
* Block mix (128-bit block) of input key to internal hash state.
*
* @param k1 intermediate mix value
* @param k2 intermediate mix value
*/
void blockMix128(final long k1, final long k2) {
h1 ^= mixK1(k1);
h1 = Long.rotateLeft(h1, 27);
h1 += h2;
h1 = (h1 * 5) + 0x52dce729;
h2 ^= mixK2(k2);
h2 = Long.rotateLeft(h2, 31);
h2 += h1;
h2 = (h2 * 5) + 0x38495ab5;
}
long[] finalMix128(final long k1, final long k2, final long inputLengthBytes) {
h1 ^= mixK1(k1);
h2 ^= mixK2(k2);
h1 ^= inputLengthBytes;
h2 ^= inputLengthBytes;
h1 += h2;
h2 += h1;
h1 = finalMix64(h1);
h2 = finalMix64(h2);
h1 += h2;
h2 += h1;
return new long[] { h1, h2 };
}
/**
* Final self mix of h*.
*
* @param h input to final mix
* @return mix
*/
private static long finalMix64(long h) {
h ^= h >>> 33;
h *= 0xff51afd7ed558ccdL;
h ^= h >>> 33;
h *= 0xc4ceb9fe1a85ec53L;
h ^= h >>> 33;
return h;
}
/**
* Self mix of k1
*
* @param k1 input argument
* @return mix
*/
private static long mixK1(long k1) {
k1 *= C1;
k1 = Long.rotateLeft(k1, 31);
k1 *= C2;
return k1;
}
/**
* Self mix of k2
*
* @param k2 input argument
* @return mix
*/
private static long mixK2(long k2) {
k2 *= C2;
k2 = Long.rotateLeft(k2, 33);
k2 *= C1;
return k2;
}
}
//--Helper methods----------------------------------------------------
/**
* Gets a long from the given byte array starting at the given byte array index and continuing for
* remainder (rem) bytes. The bytes are extracted in little-endian order. There is no limit
* checking.
*
* @param bArr The given input byte array.
* @param index Zero-based index from the start of the byte array.
* @param rem Remainder bytes. An integer in the range [1,8].
* @return long
*/
private static long getLong(final byte[] bArr, final int index, final int rem) {
long out = 0L;
for (int i = rem; i-- > 0;) { //i= 7,6,5,4,3,2,1,0
final byte b = bArr[index + i];
out ^= (b & 0xFFL) << (i * 8); //equivalent to |=
}
return out;
}
/**
* Gets a long from the given char array starting at the given char array index and continuing for
* remainder (rem) chars. The chars are extracted in little-endian order. There is no limit
* checking.
*
* @param charArr The given input char array.
* @param index Zero-based index from the start of the char array.
* @param rem Remainder chars. An integer in the range [1,4].
* @return long
*/
private static long getLong(final char[] charArr, final int index, final int rem) {
long out = 0L;
for (int i = rem; i-- > 0;) { //i= 3,2,1,0
final char c = charArr[index + i];
out ^= (c & 0xFFFFL) << (i * 16); //equivalent to |=
}
return out;
}
/**
* Gets a long from the given int array starting at the given int array index and continuing for
* remainder (rem) integers. The integers are extracted in little-endian order. There is no limit
* checking.
*
* @param intArr The given input int array.
* @param index Zero-based index from the start of the int array.
* @param rem Remainder integers. An integer in the range [1,2].
* @return long
*/
private static long getLong(final int[] intArr, final int index, final int rem) {
long out = 0L;
for (int i = rem; i-- > 0;) { //i= 1,0
final int v = intArr[index + i];
out ^= (v & 0xFFFFFFFFL) << (i * 32); //equivalent to |=
}
return out;
}
}