All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.digest.MurmurHash2 Maven / Gradle / Ivy

Go to download

The Apache Commons Codec package contains simple encoders and decoders for various formats such as Base64 and Hexadecimal. In addition to these widely used encoders and decoders, the codec package also maintains a collection of phonetic encoding utilities.

There is a newer version: 62
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.digest;

import java.nio.charset.StandardCharsets;

import org.apache.commons.codec.binary.StringUtils;

/**
 * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
 *
 * <p>MurmurHash is a non-cryptographic hash function suitable for general
 * hash-based lookup. The name comes from two basic operations, multiply (MU)
 * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
 * it is not specifically designed to be difficult to reverse by an adversary,
 * making it unsuitable for cryptographic purposes.</p>
 *
 * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
 * and the 64-bit hash function {@code MurmurHash64A} from Austin Appleby's
 * original {@code c++} code in SMHasher.</p>
 *
 * <p>This is a re-implementation of the original C code plus some additional
 * features.</p>
 *
 * <p>This is public domain code with no copyrights. From home page of
 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
 *
 * <blockquote>
 * "All MurmurHash versions are public domain software, and the author
 * disclaims all copyright to their code."
 * </blockquote>
 *
 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
 *   Original MurmurHash2 c++ code</a>
 * @since 1.13
 */
public final class MurmurHash2 {

    // Constants for 32-bit variant
    private static final int M32 = 0x5bd1e995;
    private static final int R32 = 24;

    // Constants for 64-bit variant
    private static final long M64 = 0xc6a4a7935bd1e995L;
    private static final int R64 = 47;

    /** No instance methods. */
    private MurmurHash2() {
    }

    /**
     * Generates a 32-bit hash from byte array with the given length and seed.
     *
     * <p>Only the first {@code length} bytes of {@code data} are read. The
     * {@code length} argument is not validated by this method.</p>
     *
     * @param data The input byte array
     * @param length The number of bytes to hash, counted from the start of the array
     * @param seed The initial seed value
     * @return The 32-bit hash
     */
    public static int hash32(final byte[] data, final int length, final int seed) {
        // Initialize the hash from the seed and the input length
        int h = seed ^ length;

        // Mix 4 bytes at a time into the hash
        final int nblocks = length >> 2;

        // body
        for (int i = 0; i < nblocks; i++) {
            final int index = (i << 2);
            int k = getLittleEndianInt(data, index);
            k *= M32;
            k ^= k >>> R32;
            k *= M32;
            h *= M32;
            h ^= k;
        }

        // Handle the last few bytes of the input array
        final int index = (nblocks << 2);
        switch (length - index) {
        case 3:
            h ^= (data[index + 2] & 0xff) << 16;
            // fall through
        case 2:
            h ^= (data[index + 1] & 0xff) << 8;
            // fall through
        case 1:
            h ^= (data[index] & 0xff);
            h *= M32;
            break;
        default:
            // no trailing bytes to mix in
            break;
        }

        // Do a few final mixes of the hash to ensure the last few
        // bytes are well-incorporated.
        h ^= h >>> 13;
        h *= M32;
        h ^= h >>> 15;

        return h;
    }

    /**
     * Generates a 32-bit hash from byte array with the given length and a default seed value.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0x9747b28c;
     * int hash = MurmurHash2.hash32(data, length, seed);
     * </pre>
     *
     * @param data The input byte array
     * @param length The number of bytes to hash, counted from the start of the array
     * @return The 32-bit hash
     * @see #hash32(byte[], int, int)
     */
    public static int hash32(final byte[] data, final int length) {
        return hash32(data, length, 0x9747b28c);
    }

    /**
     * Generates a 32-bit hash from a string with a default seed.
     *
     * <p>Before 1.14 the string was converted using default encoding.
     * Since 1.14 the string is converted to bytes using UTF-8 encoding.</p>
     *
     * <p>This is a helper method that will produce the same result as:</p>
     *
     * <pre>
     * int seed = 0x9747b28c;
     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @return The 32-bit hash
     * @see #hash32(byte[], int, int)
     */
    public static int hash32(final String text) {
        final byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
        return hash32(bytes, bytes.length);
    }

    /**
     * Generates a 32-bit hash from a substring with a default seed value.
     * The substring is converted to bytes using UTF-8 encoding.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0x9747b28c;
     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @param from The starting index
     * @param length The length of the substring, in {@code char} units
     * @return The 32-bit hash
     * @see #hash32(byte[], int, int)
     */
    public static int hash32(final String text, final int from, final int length) {
        return hash32(text.substring(from, from + length));
    }

    /**
     * Generates a 64-bit hash from byte array of the given length and seed.
     *
     * <p>Only the first {@code length} bytes of {@code data} are read. The
     * {@code length} argument is not validated by this method. The seed is
     * used as an unsigned 32-bit value.</p>
     *
     * @param data The input byte array
     * @param length The number of bytes to hash, counted from the start of the array
     * @param seed The initial seed value
     * @return The 64-bit hash of the given array
     */
    public static long hash64(final byte[] data, final int length, final int seed) {
        // Mask the seed so a negative int does not sign-extend into the upper 32 bits
        long h = (seed & 0xffffffffL) ^ (length * M64);

        // Mix 8 bytes at a time into the hash
        final int nblocks = length >> 3;

        // body
        for (int i = 0; i < nblocks; i++) {
            final int index = (i << 3);
            long k = getLittleEndianLong(data, index);

            k *= M64;
            k ^= k >>> R64;
            k *= M64;

            h ^= k;
            h *= M64;
        }

        // Handle the last few bytes of the input array
        final int index = (nblocks << 3);
        switch (length - index) {
        case 7:
            h ^= ((long) data[index + 6] & 0xff) << 48;
            // fall through
        case 6:
            h ^= ((long) data[index + 5] & 0xff) << 40;
            // fall through
        case 5:
            h ^= ((long) data[index + 4] & 0xff) << 32;
            // fall through
        case 4:
            h ^= ((long) data[index + 3] & 0xff) << 24;
            // fall through
        case 3:
            h ^= ((long) data[index + 2] & 0xff) << 16;
            // fall through
        case 2:
            h ^= ((long) data[index + 1] & 0xff) << 8;
            // fall through
        case 1:
            h ^= ((long) data[index] & 0xff);
            h *= M64;
            break;
        default:
            // no trailing bytes to mix in
            break;
        }

        // Final mix
        h ^= h >>> R64;
        h *= M64;
        h ^= h >>> R64;

        return h;
    }

    /**
     * Generates a 64-bit hash from byte array with given length and a default seed value.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0xe17a1465;
     * long hash = MurmurHash2.hash64(data, length, seed);
     * </pre>
     *
     * @param data The input byte array
     * @param length The number of bytes to hash, counted from the start of the array
     * @return The 64-bit hash
     * @see #hash64(byte[], int, int)
     */
    public static long hash64(final byte[] data, final int length) {
        return hash64(data, length, 0xe17a1465);
    }

    /**
     * Generates a 64-bit hash from a string with a default seed.
     *
     * <p>Before 1.14 the string was converted using default encoding.
     * Since 1.14 the string is converted to bytes using UTF-8 encoding.</p>
     *
     * <p>This is a helper method that will produce the same result as:</p>
     *
     * <pre>
     * int seed = 0xe17a1465;
     * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
     * long hash = MurmurHash2.hash64(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @return The 64-bit hash
     * @see #hash64(byte[], int, int)
     */
    public static long hash64(final String text) {
        final byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
        return hash64(bytes, bytes.length);
    }

    /**
     * Generates a 64-bit hash from a substring with a default seed value.
     * The substring is converted to bytes using UTF-8 encoding.
     * This is a helper method that will produce the same result as:
     *
     * <pre>
     * int seed = 0xe17a1465;
     * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
     * long hash = MurmurHash2.hash64(bytes, bytes.length, seed);
     * </pre>
     *
     * @param text The input string
     * @param from The starting index
     * @param length The length of the substring, in {@code char} units
     * @return The 64-bit hash
     * @see #hash64(byte[], int, int)
     */
    public static long hash64(final String text, final int from, final int length) {
        return hash64(text.substring(from, from + length));
    }

    /**
     * Gets the little-endian int from 4 bytes starting at the specified index.
     *
     * @param data The data
     * @param index The index
     * @return The little-endian int
     */
    private static int getLittleEndianInt(final byte[] data, final int index) {
        return ((data[index    ] & 0xff)      ) |
               ((data[index + 1] & 0xff) <<  8) |
               ((data[index + 2] & 0xff) << 16) |
               ((data[index + 3] & 0xff) << 24);
    }

    /**
     * Gets the little-endian long from 8 bytes starting at the specified index.
     *
     * @param data The data
     * @param index The index
     * @return The little-endian long
     */
    private static long getLittleEndianLong(final byte[] data, final int index) {
        return (((long) data[index    ] & 0xff)      ) |
               (((long) data[index + 1] & 0xff) <<  8) |
               (((long) data[index + 2] & 0xff) << 16) |
               (((long) data[index + 3] & 0xff) << 24) |
               (((long) data[index + 4] & 0xff) << 32) |
               (((long) data[index + 5] & 0xff) << 40) |
               (((long) data[index + 6] & 0xff) << 48) |
               (((long) data[index + 7] & 0xff) << 56);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy