com.yahoo.collections.BobHash Maven / Gradle / Ivy
Show all versions of vespajlib Show documentation
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.collections;
import com.yahoo.text.Utf8;
/**
* A Java port of Michael Susag's BobHash in FastLib. This version is
* specifically done to be bit compatible with the one in FastLib, as it
* is used in decoding packets from FastServer.
*
* Hash function based on
* http://burtleburtle.net/bob/hash/index.html
* by Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
* code any way you wish, private, educational, or commercial. It's free.
*
* @author Michael Susag
* @author Steinar Knutsen
*/
public class BobHash {

    /** The golden ratio; an arbitrary value used to seed the first two lanes of the state. */
    private static final int GOLDEN_RATIO = 0x9e3779b9;

    /** Number of key bytes folded into the state before each full mixing round. */
    private static final int BLOCK_SIZE = 12;

    /** Number of key bytes that make up one 32-bit lane of the state. */
    private static final int WORD_SIZE = 4;

    /**
     * Reversibly mixes the three 32-bit lanes of the hash state, in place.
     *
     * <p>Properties claimed for this mixing step by its original author:
     * for every input delta with one or two bits set, and for the deltas of
     * all three high bits or all three low bits, regardless of whether the
     * initial lanes are almost all zero or uniformly distributed:</p>
     * <ul>
     * <li>run forward or backward, at least 32 bits of the state have at
     *     least 1/4 probability of changing;</li>
     * <li>run forward, every bit of the third lane changes between 1/3 and
     *     2/3 of the time (22/100 and 78/100 for some 2-bit deltas).</li>
     * </ul>
     *
     * <p>The step consists of nine rounds. Each round subtracts the two
     * other lanes from one lane and then xors in a shifted copy of the lane
     * updated in the previous round. Right shifts are unsigned
     * ({@code >>>}), since the lanes are treated as unsigned 32-bit words.</p>
     *
     * @param lanes the state as exactly three ints {@code {a, b, c}};
     *              overwritten with the mixed values
     */
    private static void mix(int[] lanes) {
        int a = lanes[0];
        int b = lanes[1];
        int c = lanes[2];

        a = (a - b - c) ^ (c >>> 13);
        b = (b - c - a) ^ (a << 8);
        c = (c - a - b) ^ (b >>> 13);

        a = (a - b - c) ^ (c >>> 12);
        b = (b - c - a) ^ (a << 16);
        c = (c - a - b) ^ (b >>> 5);

        a = (a - b - c) ^ (c >>> 3);
        b = (b - c - a) ^ (a << 10);
        c = (c - a - b) ^ (b >>> 15);

        lanes[0] = a;
        lanes[1] = b;
        lanes[2] = c;
    }

    /**
     * Assembles four consecutive key bytes into one int, least significant
     * byte first, treating every byte as unsigned.
     *
     * @param k  the key
     * @param at index of the least significant byte
     * @return the 32-bit word formed by {@code k[at] .. k[at + 3]}
     */
    private static int littleEndianWord(byte[] k, int at) {
        // The four masked bytes occupy disjoint bit ranges, so or-ing them
        // yields the same word as adding them.
        return (k[at] & 0xFF)
                | ((k[at + 1] & 0xFF) << 8)
                | ((k[at + 2] & 0xFF) << 16)
                | ((k[at + 3] & 0xFF) << 24);
    }

    /**
     * Hashes a string by delegating to {@link #hash(byte[], int)} with the
     * UTF-8 bytes of the string as the key and 0 as the initial value.
     *
     * <p>Note: this copies the string content; change the implementation
     * before using it on large strings. -- bratseth</p>
     *
     * @param key the string to hash
     * @return a 32 bit hash value
     */
    public static int hash(String key) {
        byte[] utf8 = Utf8.toBytes(key);
        return hash(utf8, 0);
    }

    /**
     * Hashes a variable-length key into a 32-bit value.
     *
     * <p>The key is consumed in blocks of 12 bytes, each block read as three
     * little-endian 32-bit words which are added to the three lanes of the
     * state before the state is mixed. The key length is then added to the
     * third lane, the remaining 0-11 bytes are folded in, and the state is
     * mixed one last time. The third lane is the result.</p>
     *
     * <p>Notes from the original author: every bit of the key affects every
     * bit of the return value. The best hash table sizes are powers of 2;
     * if fewer than 32 bits are needed, mask the result instead of taking
     * it modulo a prime. To hash several keys into one value, feed the
     * result of one call in as {@code initval} of the next:</p>
     * <pre>
     * for (i = 0, h = 0; i &lt; n; ++i) h = hash(k[i], h);
     * </pre>
     *
     * <p>By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
     * code any way you wish, private, educational, or commercial. It's free.</p>
     *
     * <p>See http://burtleburtle.net/bob/hash/evahash.html.
     * Use for hash table lookup, or anything where one collision in 2^^32 is
     * acceptable. Do NOT use for cryptographic purposes.</p>
     *
     * @param k the key
     * @param initval the previous hash, or an arbitrary value
     * @return A 32 bit hash value
     */
    public static int hash(byte[] k, int initval) {
        final int keyLength = k.length;
        // Lanes a and b start at the golden ratio, lane c at the previous hash value.
        final int[] lanes = { GOLDEN_RATIO, GOLDEN_RATIO, initval };

        // Handle most of the key: one mixing round per complete 12-byte block.
        int position = 0;
        for (; keyLength - position >= BLOCK_SIZE; position += BLOCK_SIZE) {
            lanes[0] += littleEndianWord(k, position);
            lanes[1] += littleEndianWord(k, position + WORD_SIZE);
            lanes[2] += littleEndianWord(k, position + 2 * WORD_SIZE);
            mix(lanes);
        }

        // Handle the last 0-11 bytes. The total key length always goes into lane c.
        lanes[2] += keyLength;
        final int remaining = keyLength - position;
        for (int i = 0; i < remaining; i++) {
            final int octet = k[position + i] & 0xFF;
            if (i < WORD_SIZE) {
                lanes[0] += octet << (8 * i);
            } else if (i < 2 * WORD_SIZE) {
                lanes[1] += octet << (8 * (i - WORD_SIZE));
            } else {
                // The first byte of c is reserved for the length, so the
                // tail bytes of this lane start one byte position higher.
                lanes[2] += octet << (8 * (i - 2 * WORD_SIZE + 1));
            }
        }

        mix(lanes);
        return lanes[2];
    }

}