All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.digest.MurmurHash2 Maven / Gradle / Ivy

There is a newer version: 4.1.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.digest;

import org.apache.commons.codec.binary.StringUtils;

/**
 * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
 *
 * 

MurmurHash is a non-cryptographic hash function suitable for general * hash-based lookup. The name comes from two basic operations, multiply (MU) * and rotate (R), used in its inner loop. Unlike cryptographic hash functions, * it is not specifically designed to be difficult to reverse by an adversary, * making it unsuitable for cryptographic purposes.

* *

This contains a Java port of the 32-bit hash function {@code MurmurHash2} * and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's * original {@code c++} code in SMHasher.

* *

This is a re-implementation of the original C code plus some additional * features.

* *

This is public domain code with no copyrights. From home page of * SMHasher:

* *
* "All MurmurHash versions are public domain software, and the author * disclaims all copyright to their code." *
* * @see MurmurHash * @see * Original MurmurHash2 c++ code * @since 1.13 */ public final class MurmurHash2 { // Constants for 32-bit variant private static final int M32 = 0x5bd1e995; private static final int R32 = 24; // Constants for 64-bit variant private static final long M64 = 0xc6a4a7935bd1e995L; private static final int R64 = 47; /** No instance methods. */ private MurmurHash2() { } /** * Generates a 32-bit hash from byte array with the given length and seed. * * @param data The input byte array * @param length The length of the array * @param seed The initial seed value * @return The 32-bit hash */ public static int hash32(final byte[] data, final int length, final int seed) { // Initialize the hash to a random value int h = seed ^ length; // Mix 4 bytes at a time into the hash final int nblocks = length >> 2; // body for (int i = 0; i < nblocks; i++) { final int index = (i << 2); int k = getLittleEndianInt(data, index); k *= M32; k ^= k >>> R32; k *= M32; h *= M32; h ^= k; } // Handle the last few bytes of the input array final int index = (nblocks << 2); switch (length - index) { case 3: h ^= (data[index + 2] & 0xff) << 16; case 2: h ^= (data[index + 1] & 0xff) << 8; case 1: h ^= (data[index] & 0xff); h *= M32; } // Do a few final mixes of the hash to ensure the last few // bytes are well-incorporated. h ^= h >>> 13; h *= M32; h ^= h >>> 15; return h; } /** * Generates a 32-bit hash from byte array with the given length and a default seed value. * This is a helper method that will produce the same result as: * *
     * int seed = 0x9747b28c;
     * int hash = MurmurHash2.hash32(data, length, seed);
     * 
* * @param data The input byte array * @param length The length of the array * @return The 32-bit hash * @see #hash32(byte[], int, int) */ public static int hash32(final byte[] data, final int length) { return hash32(data, length, 0x9747b28c); } /** * Generates a 32-bit hash from a string with a default seed. *

* Before 1.14 the string was converted using default encoding. * Since 1.14 the string is converted to bytes using UTF-8 encoding. *

* This is a helper method that will produce the same result as: * *
     * int seed = 0x9747b28c;
     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
     * 
* * @param text The input string * @return The 32-bit hash * @see #hash32(byte[], int, int) */ public static int hash32(final String text) { final byte[] bytes = StringUtils.getBytesUtf8(text); return hash32(bytes, bytes.length); } /** * Generates a 32-bit hash from a substring with a default seed value. * The string is converted to bytes using the default encoding. * This is a helper method that will produce the same result as: * *
     * int seed = 0x9747b28c;
     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
     * 
* * @param text The input string * @param from The starting index * @param length The length of the substring * @return The 32-bit hash * @see #hash32(byte[], int, int) */ public static int hash32(final String text, final int from, final int length) { return hash32(text.substring(from, from + length)); } /** * Generates a 64-bit hash from byte array of the given length and seed. * * @param data The input byte array * @param length The length of the array * @param seed The initial seed value * @return The 64-bit hash of the given array */ public static long hash64(final byte[] data, final int length, final int seed) { long h = (seed & 0xffffffffL) ^ (length * M64); final int nblocks = length >> 3; // body for (int i = 0; i < nblocks; i++) { final int index = (i << 3); long k = getLittleEndianLong(data, index); k *= M64; k ^= k >>> R64; k *= M64; h ^= k; h *= M64; } final int index = (nblocks << 3); switch (length - index) { case 7: h ^= ((long) data[index + 6] & 0xff) << 48; case 6: h ^= ((long) data[index + 5] & 0xff) << 40; case 5: h ^= ((long) data[index + 4] & 0xff) << 32; case 4: h ^= ((long) data[index + 3] & 0xff) << 24; case 3: h ^= ((long) data[index + 2] & 0xff) << 16; case 2: h ^= ((long) data[index + 1] & 0xff) << 8; case 1: h ^= ((long) data[index] & 0xff); h *= M64; } h ^= h >>> R64; h *= M64; h ^= h >>> R64; return h; } /** * Generates a 64-bit hash from byte array with given length and a default seed value. * This is a helper method that will produce the same result as: * *
     * int seed = 0xe17a1465;
     * int hash = MurmurHash2.hash64(data, length, seed);
     * 
* * @param data The input byte array * @param length The length of the array * @return The 64-bit hash * @see #hash64(byte[], int, int) */ public static long hash64(final byte[] data, final int length) { return hash64(data, length, 0xe17a1465); } /** * Generates a 64-bit hash from a string with a default seed. *

* Before 1.14 the string was converted using default encoding. * Since 1.14 the string is converted to bytes using UTF-8 encoding. *

* This is a helper method that will produce the same result as: * *
     * int seed = 0xe17a1465;
     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
     * 
* * @param text The input string * @return The 64-bit hash * @see #hash64(byte[], int, int) */ public static long hash64(final String text) { final byte[] bytes = StringUtils.getBytesUtf8(text); return hash64(bytes, bytes.length); } /** * Generates a 64-bit hash from a substring with a default seed value. * The string is converted to bytes using the default encoding. * This is a helper method that will produce the same result as: * *
     * int seed = 0xe17a1465;
     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
     * 
* * @param text The The input string * @param from The starting index * @param length The length of the substring * @return The 64-bit hash * @see #hash64(byte[], int, int) */ public static long hash64(final String text, final int from, final int length) { return hash64(text.substring(from, from + length)); } /** * Gets the little-endian int from 4 bytes starting at the specified index. * * @param data The data * @param index The index * @return The little-endian int */ private static int getLittleEndianInt(final byte[] data, final int index) { return ((data[index ] & 0xff) ) | ((data[index + 1] & 0xff) << 8) | ((data[index + 2] & 0xff) << 16) | ((data[index + 3] & 0xff) << 24); } /** * Gets the little-endian long from 8 bytes starting at the specified index. * * @param data The data * @param index The index * @return The little-endian long */ private static long getLittleEndianLong(final byte[] data, final int index) { return (((long) data[index ] & 0xff) ) | (((long) data[index + 1] & 0xff) << 8) | (((long) data[index + 2] & 0xff) << 16) | (((long) data[index + 3] & 0xff) << 24) | (((long) data[index + 4] & 0xff) << 32) | (((long) data[index + 5] & 0xff) << 40) | (((long) data[index + 6] & 0xff) << 48) | (((long) data[index + 7] & 0xff) << 56); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy