hivemall.utils.hashing.MurmurHash3 Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.utils.hashing;
import hivemall.utils.math.MathUtils;
/**
 * MurmurHash3 (x86, 32-bit variant) over the UTF-8 encoding of character data, plus
 * convenience methods that fold the hash into a non-negative feature index.
 *
 * <p>All methods are static and keep no state.
 */
public final class MurmurHash3 {

    /** Default size of the hashed feature space: 2^24. */
    public static final int DEFAULT_NUM_FEATURES = 16777216;

    /** Seed used by every overload that does not take an explicit seed. */
    private static final int DEFAULT_SEED = 0x9747b28c;

    /** First multiplier of the per-block mix. */
    private static final int C1 = 0xcc9e2d51;

    /** Second multiplier of the per-block mix. */
    private static final int C2 = 0x1b873593;

    /**
     * Hashes {@code data} with the default seed and folds the result into the default
     * feature space.
     *
     * @param data the string to hash; must not be {@code null}
     * @return a value in the range {@code [0, 2^24)}, i.e. 0 inclusive to
     *         {@link #DEFAULT_NUM_FEATURES} exclusive
     */
    public static int murmurhash3(final String data) {
        final int h = murmurhash3_x86_32(data, 0, data.length(), DEFAULT_SEED);
        // DEFAULT_NUM_FEATURES is a power of two, so masking with (size - 1) is the floor
        // modulo: the result is never negative and needs no sign fix-up.
        return h & (DEFAULT_NUM_FEATURES - 1);
    }

    /**
     * Hashes {@code data} with the default seed and folds the result into
     * {@code numFeatures} buckets.
     *
     * @param data the string to hash; must not be {@code null}
     * @param numFeatures number of buckets; must be positive
     * @return a value in the range {@code [0, numFeatures)}
     * @throws IllegalArgumentException if {@code numFeatures} is zero or negative
     */
    public static int murmurhash3(final String data, final int numFeatures) {
        if (numFeatures <= 0) {
            throw new IllegalArgumentException("numFeatures must be positive: " + numFeatures);
        }
        int r = murmurhash3_x86_32(data, 0, data.length(), DEFAULT_SEED) % numFeatures;
        if (r < 0) {
            // Java's % keeps the sign of the dividend; shift negatives into [0, numFeatures).
            r += numFeatures;
        }
        return r;
    }

    /**
     * Returns the raw 32-bit hash of {@code data} using the default seed.
     *
     * @param data the string to hash; must not be {@code null}
     * @return the signed 32-bit hash value
     */
    public static int murmurhash3_x86_32(final String data) {
        return murmurhash3_x86_32(data, DEFAULT_SEED);
    }

    /**
     * Returns the raw 32-bit hash of {@code data} using the given seed.
     *
     * @param data the string to hash; must not be {@code null}
     * @param seed the initial hash state
     * @return the signed 32-bit hash value
     */
    public static int murmurhash3_x86_32(final String data, final int seed) {
        return murmurhash3_x86_32(data, 0, data.length(), seed);
    }

    /**
     * Returns the MurmurHash3_x86_32 hash of the chars in {@code [offset, offset + len)}.
     *
     * <p>Each char is encoded to UTF-8 on the fly and the resulting bytes are packed,
     * lowest byte first, into 32-bit blocks that are mixed into the hash as soon as they
     * are complete; no intermediate byte array is created.
     *
     * <p>Surrogate handling: a char in the range U+D800..U+DFFF that is not the last char
     * of the range is always merged with the char that follows it into one 4-byte
     * sequence, without checking that the two form a valid high/low pair. A surrogate in
     * the last position is encoded as a 3-byte sequence.
     *
     * @param data the characters to hash; must not be {@code null}
     * @param offset index of the first char to hash
     * @param len number of chars to hash; if {@code len <= 0} no char is read
     * @param seed the initial hash state
     * @return the signed 32-bit hash value
     */
    public static int murmurhash3_x86_32(final CharSequence data, final int offset, final int len,
            final int seed) {
        final int end = offset + len;
        int h1 = seed;
        int pos = offset;
        int k1 = 0; // block under construction
        int shift = 0; // number of bits of k1 already filled
        int nBytes = 0; // length in UTF-8 bytes

        while (pos < end) {
            final int code = data.charAt(pos++);
            final int k2; // UTF-8 bytes of the current code point, first byte in the low bits
            final int bits; // number of significant bits in k2
            if (code < 0x80) {
                k2 = code;
                bits = 8;
            } else if (code < 0x800) {
                k2 = (0xC0 | (code >> 6)) | ((0x80 | (code & 0x3F)) << 8);
                bits = 16;
            } else if (code < 0xD800 || code > 0xDFFF || pos >= end) {
                // pos >= end: a surrogate with nothing after it is encoded as 3 bytes.
                k2 = (0xE0 | (code >> 12)) | ((0x80 | ((code >> 6) & 0x3F)) << 8)
                        | ((0x80 | (code & 0x3F)) << 16);
                bits = 24;
            } else {
                // Surrogate followed by another char: combine both into one code point.
                final int low = data.charAt(pos++);
                final int utf32 = ((code - 0xD7C0) << 10) + (low & 0x3FF);
                k2 = (0xff & (0xF0 | (utf32 >> 18)))
                        | ((0x80 | ((utf32 >> 12) & 0x3F)) << 8)
                        | ((0x80 | ((utf32 >> 6) & 0x3F)) << 16)
                        | ((0x80 | (utf32 & 0x3F)) << 24);
                bits = 32;
            }

            k1 |= k2 << shift;
            shift += bits;

            if (shift >= 32) {
                // A complete 32-bit block is available: mix it into the hash.
                h1 ^= mixK1(k1);
                h1 = Integer.rotateLeft(h1, 13);
                h1 = h1 * 5 + 0xe6546b64;

                shift -= 32;
                // Carry over the bits of k2 that did not fit. Java masks shift distances
                // to 5 bits, so a shift by 32 would be a no-op; handle shift == 0 apart.
                k1 = (shift != 0) ? (k2 >>> (bits - shift)) : 0;
                nBytes += 4;
            }
        }

        // Tail: mix the remaining 1 to 3 bytes, if any.
        if (shift > 0) {
            nBytes += shift >> 3;
            h1 ^= mixK1(k1);
        }

        // Finalization.
        h1 ^= nBytes;
        return fmix32(h1);
    }

    /** Scrambles one 32-bit block before it is xor-ed into the hash state. */
    private static int mixK1(final int block) {
        int k1 = block * C1;
        k1 = Integer.rotateLeft(k1, 15);
        return k1 * C2;
    }

    /** Final avalanche step applied to the hash state. */
    private static int fmix32(final int hash) {
        int h = hash;
        h ^= h >>> 16;
        h *= 0x85ebca6b;
        h ^= h >>> 13;
        h *= 0xc2b2ae35;
        h ^= h >>> 16;
        return h;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy