All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.gitee.cloudangel.impala.hash.Murmur2Hash Maven / Gradle / Ivy

The newest version!
/**
 * huize Service Inc
 * All Rights Reserved @2018
 */
package com.gitee.cloudangel.impala.hash;

/**
 * 

(1)murmur hash 2.0版本java实现,来源于{@code http://murmurhash.googlepages.com/}

*

(2)我们使用的CDH集群Impala内置UDF函数,只有MurmurHash2版本,没有MurmurHash3

*

(3)因此当前阶段我们也需要使用MurmurHash2,保持跟impala内置UDF函数相同的实现逻辑

*

(4)impala使用hash算法的语句是{@code select murmur_hash("hello-world") as xxx }

* * @author tianyuliang * @version $Id: Murmur2Hash.java, v0.1 2022/4/2 */ public class Murmur2Hash { /** * 在Murmurhash2中用来生成64位散列值的方法中,seed值被指定为 0x1234ABCD(相当于十进制3782874213)(通过seed值来降低冲突碰撞发生的概率) * 方法调用者只需要关注需要计算散列值的数据内容即可 * https://marcuseddie.github.io/2019/MurMurhash-Learning.html *

* impala官方定义版本是0x0(非常意外吧?!),而MurmurHash算法本身推荐使用质数来做随机数(质数越大则算法碰撞度越低) *

* 1) marcuseddie.github.io这个作者推荐的质数种子是 0xe17a1465 (相当于十进制305441741) *

* 2)基于质数表,找到一个在40亿内的质数 0xEE6B27EB(相当于十进制3999999979) *

*/ private static final int DEFAULT_SEED = 0x0; /** * 'm' and 'r' are mixing constants generated offline. * They're not really 'magic', they just happen to work well. */ private static final int MURMUR_32_PRIME = 0x5bd1e995; private static final int MURMUR_32_M = 24; private static final int MURMUR_32_SEED = 0x9747b28c; /** * 'm' and 'r' are mixing constants generated offline. * They're not really 'magic', they just happen to work well. */ private static final long MURMUR_64_PRIME = 0xc6a4a7935bd1e995L; private static final int MURMUR_64_M = 47; private static final int MURMUR_64_SEED = DEFAULT_SEED; // all methods static; private constructor. private Murmur2Hash() { } /** * Generates 32 bit hash from byte array of the given length and * seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 32 bit hash of the given array */ public static int hash32(final byte[] data, int length, int seed) { // 'm' and 'r' are mixing constants generated offline. // They're not really 'magic', they just happen to work well. final int m = MURMUR_32_PRIME; final int r = MURMUR_32_M; // Initialize the hash to a random value int h = seed ^ length; int length4 = length / 4; for (int i = 0; i < length4; i++) { final int i4 = i * 4; int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8) + ((data[i4 + 2] & 0xff) << 16) + ((data[i4 + 3] & 0xff) << 24); k *= m; k ^= k >>> r; k *= m; h *= m; h ^= k; } // Handle the last few bytes of the input array switch (length % 4) { case 3: h ^= (data[(length & ~3) + 2] & 0xff) << 16; case 2: h ^= (data[(length & ~3) + 1] & 0xff) << 8; case 1: h ^= (data[length & ~3] & 0xff); h *= m; } h ^= h >>> 13; h *= m; h ^= h >>> 15; return h; } /** * Generates 32 bit hash from byte array with default seed value. * * @param data byte array to hash * @param length length of the array to hash * @return 32 bit hash of the given array */ public static int hash32(final byte[] data, int length) { return hash32(data, length, MURMUR_32_SEED); } /** * Generates 32 bit hash from a string. * * @param text string to hash * @return 32 bit hash of the given string */ public static int hash32(final String text) { if (text == null || text.length() == 0) { return 0; } final byte[] bytes = text.getBytes(); return hash32(bytes, bytes.length); } /** * Generates 32 bit hash from a substring. * * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 32 bit hash of the given string */ public static int hash32(final String text, int from, int length) { String tmp = text.substring(from, from + length); // support debug if hash result not match return hash32(tmp); } /** * Generates 64 bit hash from byte array of the given length and seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 64 bit hash of the given array */ public static long hash64(final byte[] data, int length, int seed) { final long m = MURMUR_64_PRIME; final int r = MURMUR_64_M; long h = (seed & 0xffffffffl) ^ (length * m); int length8 = length / 8; for (int i = 0; i < length8; i++) { final int i8 = i * 8; long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8) + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24) + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40) + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56); k *= m; k ^= k >>> r; k *= m; h ^= k; h *= m; } switch (length % 8) { case 7: h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48; case 6: h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40; case 5: h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32; case 4: h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24; case 3: h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16; case 2: h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8; case 1: h ^= (long) (data[length & ~7] & 0xff); h *= m; } h ^= h >>> r; h *= m; h ^= h >>> r; return h; } /** * Generates 64 bit hash from byte array with default seed value. * * @param data byte array to hash * @param length length of the array to hash * @return 64 bit hash of the given string */ public static long hash64(final byte[] data, int length) { return hash64(data, length, MURMUR_64_SEED); } /** * Generates 64 bit hash from a string. * * @param text string to hash * @return 64 bit hash of the given string */ public static long hash64(final String text) { if (text == null || text.length() == 0) { return 0L; } final byte[] bytes = text.getBytes(); return hash64(bytes, bytes.length); } /** * Generates 64 bit hash from a substring. * * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 64 bit hash of the given array */ public static long hash64(final String text, int from, int length) { String tmp = text.substring(from, from + length); // support debug if hash result not match return hash64(tmp); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy