org.apache.datasketches.hash.MurmurHash3Adaptor Maven / Gradle / Ivy
Show all versions of datasketches-java Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.hash;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.Util.ceilingPowerOf2;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import java.nio.ByteBuffer;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.SketchesStateException;
/**
* A general purpose wrapper for the MurmurHash3.
*
* - Inputs can be long, long[], int[], char[], byte[], double or String.
* - Returns null if arrays or String is null or empty.
* - Provides methods for returning the 128-bit result as either an array of 2 longs or as a byte
* array of 16 bytes.
* - Provides modulo, asDouble and asInt functions.
*
*
* @author Lee Rhodes
*/
public final class MurmurHash3Adaptor {
private static final long BIT62 = 1L << 62;
private static final long MAX_LONG = Long.MAX_VALUE;
private static final long INT_MASK = 0x7FFFFFFFL;
private static final long PRIME = 9219741426499971445L; //from P. L'Ecuyer and R. Simard
private MurmurHash3Adaptor() {}
/**
* Hash a long and long seed.
*
* @param datum the input long value
* @param seed A long valued seed.
* @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs.
*/
public static byte[] hashToBytes(final long datum, final long seed) {
final long[] data = { datum };
return toByteArray(hash(data, seed));
}
/**
* Hash a long[] and long seed.
*
* @param data the input long array
* @param seed A long valued seed.
* @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs.
*/
public static byte[] hashToBytes(final long[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return toByteArray(hash(data, seed));
}
/**
* Hash an int[] and long seed.
*
* @param data the input int array
* @param seed A long valued seed.
* @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs.
*/
public static byte[] hashToBytes(final int[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return toByteArray(hash(data, seed));
}
/**
* Hash a char[] and long seed.
*
* @param data the input char array
* @param seed A long valued seed.
* @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs.
*/
public static byte[] hashToBytes(final char[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return toByteArray(hash(data, seed));
}
/**
* Hash a byte[] and long seed.
*
* @param data the input byte array
* @param seed A long valued seed.
* @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs.
*/
public static byte[] hashToBytes(final byte[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return toByteArray(hash(data, seed));
}
/**
* Hash a double and long seed.
*
* @param datum the input double
* @param seed A long valued seed.
* @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs.
*/
public static byte[] hashToBytes(final double datum, final long seed) {
final double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0
final long[] data = { Double.doubleToLongBits(d) }; //canonicalize all NaN forms
return toByteArray(hash(data, seed));
}
/**
* Hash a String and long seed.
*
* @param datum the input String
* @param seed A long valued seed.
* @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs.
*/
public static byte[] hashToBytes(final String datum, final long seed) {
if ((datum == null) || datum.isEmpty()) {
return null;
}
final byte[] data = datum.getBytes(UTF_8);
return toByteArray(hash(data, seed));
}
/**
* Hash a long and long seed.
*
* @param datum the input long
* @param seed A long valued seed.
* @return The 128-bit hash as a long[2].
*/
public static long[] hashToLongs(final long datum, final long seed) {
final long[] data = { datum };
return hash(data, seed);
}
/**
* Hash a long[] and long seed.
*
* @param data the input long array.
* @param seed A long valued seed.
* @return The 128-bit hash as a long[2].
*/
public static long[] hashToLongs(final long[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return hash(data, seed);
}
/**
* Hash a int[] and long seed.
*
* @param data the input int array.
* @param seed A long valued seed.
* @return The 128-bit hash as a long[2].
*/
public static long[] hashToLongs(final int[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return hash(data, seed);
}
/**
* Hash a char[] and long seed.
*
* @param data the input char array.
* @param seed A long valued seed.
* @return The 128-bit hash as a long[2].
*/
public static long[] hashToLongs(final char[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return hash(data, seed);
}
/**
* Hash a byte[] and long seed.
*
* @param data the input byte array.
* @param seed A long valued seed.
* @return The 128-bit hash as a long[2].
*/
public static long[] hashToLongs(final byte[] data, final long seed) {
if ((data == null) || (data.length == 0)) {
return null;
}
return hash(data, seed);
}
/**
* Hash a double and long seed.
*
* @param datum the input double.
* @param seed A long valued seed.
* @return The 128-bit hash as a long[2].
*/
public static long[] hashToLongs(final double datum, final long seed) {
final double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0
final long[] data = { Double.doubleToLongBits(d) };//canonicalize all NaN forms
return hash(data, seed);
}
/**
* Hash a String and long seed.
*
* @param datum the input String.
* @param seed A long valued seed.
* @return The 128-bit hash as a long[2].
*/
public static long[] hashToLongs(final String datum, final long seed) {
if ((datum == null) || datum.isEmpty()) {
return null;
}
final byte[] data = datum.getBytes(UTF_8);
return hash(data, seed);
}
//As Integer functions
/**
* Returns a deterministic uniform random integer between zero (inclusive) and
* n (exclusive) given the input data.
* @param data the input long array.
* @param n The upper exclusive bound of the integers produced. Must be > 1.
* @return deterministic uniform random integer
*/
public static int asInt(final long[] data, final int n) {
if ((data == null) || (data.length == 0)) {
throw new SketchesArgumentException("Input is null or empty.");
}
return asInteger(data, n); //data is long[]
}
/**
* Returns a deterministic uniform random integer between zero (inclusive) and
* n (exclusive) given the input data.
* @param data the input int array.
* @param n The upper exclusive bound of the integers produced. Must be > 1.
* @return deterministic uniform random integer
*/
public static int asInt(final int[] data, final int n) {
if ((data == null) || (data.length == 0)) {
throw new SketchesArgumentException("Input is null or empty.");
}
return asInteger(toLongArray(data), n); //data is int[]
}
/**
* Returns a deterministic uniform random integer between zero (inclusive) and
* n (exclusive) given the input data.
* @param data the input byte array.
* @param n The upper exclusive bound of the integers produced. Must be > 1.
* @return deterministic uniform random integer.
*/
public static int asInt(final byte[] data, final int n) {
if ((data == null) || (data.length == 0)) {
throw new SketchesArgumentException("Input is null or empty.");
}
return asInteger(toLongArray(data), n); //data is byte[]
}
/**
* Returns a deterministic uniform random integer between zero (inclusive) and
* n (exclusive) given the input datum.
* @param datum the input long
* @param n The upper exclusive bound of the integers produced. Must be > 1.
* @return deterministic uniform random integer
*/
public static int asInt(final long datum, final int n) {
final long[] data = { datum };
return asInteger(data, n); //data is long[]
}
/**
* Returns a deterministic uniform random integer between zero (inclusive) and
* n (exclusive) given the input double.
* @param datum the given double.
* @param n The upper exclusive bound of the integers produced. Must be > 1.
* @return deterministic uniform random integer
*/
public static int asInt(final double datum, final int n) {
final double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0
final long[] data = { Double.doubleToLongBits(d) };//canonicalize all NaN forms
return asInteger(data, n); //data is long[]
}
/**
* Returns a deterministic uniform random integer between zero (inclusive) and
* n (exclusive) given the input datum.
* @param datum the given String.
* @param n The upper exclusive bound of the integers produced. Must be > 1.
* @return deterministic uniform random integer
*/
public static int asInt(final String datum, final int n) {
if ((datum == null) || datum.isEmpty()) {
throw new SketchesArgumentException("Input is null or empty.");
}
final byte[] data = datum.getBytes(UTF_8);
return asInteger(toLongArray(data), n); //data is byte[]
}
/**
* Returns a deterministic uniform random integer with a minimum inclusive value of zero and a
* maximum exclusive value of n given the input data.
*
* The integer values produced are only as random as the MurmurHash3 algorithm, which may be
* adequate for many applications. However, if you are looking for high guarantees of randomness
* you should turn to more sophisticated random generators such as Mersenne Twister or Well19937c
* algorithms.
*
* @param data The input data (key)
* @param n The upper exclusive bound of the integers produced. Must be > 1.
* @return deterministic uniform random integer
*/
private static int asInteger(final long[] data, final int n) {
int t;
int cnt = 0;
long seed = 0;
if (n < 2) {
throw new SketchesArgumentException("Given value of n must be > 1.");
}
if (n > (1 << 30)) {
while (++cnt < 10000) {
final long[] h = MurmurHash3.hash(data, seed);
t = (int) (h[0] & INT_MASK);
if (t < n) {
return t;
}
t = (int) ((h[0] >>> 33));
if (t < n) {
return t;
}
t = (int) (h[1] & INT_MASK);
if (t < n) {
return t;
}
t = (int) ((h[1] >>> 33));
if (t < n) {
return t;
}
seed += PRIME;
} // end while
throw new SketchesStateException(
"Internal Error: Failed to find integer < n within 10000 iterations.");
}
final long mask = ceilingPowerOf2(n) - 1;
while (++cnt < 10000) {
final long[] h = MurmurHash3.hash(data, seed);
t = (int) (h[0] & mask);
if (t < n) {
return t;
}
t = (int) ((h[0] >>> 33) & mask);
if (t < n) {
return t;
}
t = (int) (h[1] & mask);
if (t < n) {
return t;
}
t = (int) ((h[1] >>> 33) & mask);
if (t < n) {
return t;
}
seed += PRIME;
} // end while
throw new SketchesStateException(
"Internal Error: Failed to find integer < n within 10000 iterations.");
}
/**
* Returns a uniform random double with a minimum inclusive value of zero and a maximum exclusive
* value of 1.0.
*
*
The double values produced are only as random as the MurmurHash3 algorithm, which may be
* adequate for many applications. However, if you are looking for high guarantees of randomness
* you should turn to more sophisticated random generators such as Mersenne Twister or Well
* algorithms.
*
* @param hash The output of the MurmurHash3.
* @return the uniform random double.
*/
public static double asDouble(final long[] hash) {
return (hash[0] >>> 12) * 0x1.0p-52d;
}
/**
* Returns the remainder from the modulo division of the 128-bit output of the murmurHash3 by the
* divisor.
*
* @param h0 The lower 64-bits of the 128-bit MurmurHash3 hash.
* @param h1 The upper 64-bits of the 128-bit MurmurHash3 hash.
* @param divisor Must be positive and greater than zero.
* @return the modulo result.
*/
public static int modulo(final long h0, final long h1, final int divisor) {
final long d = divisor;
final long modH0 = (h0 < 0L) ? addRule(mulRule(BIT62, 2L, d), (h0 & MAX_LONG), d) : h0 % d;
final long modH1 = (h1 < 0L) ? addRule(mulRule(BIT62, 2L, d), (h1 & MAX_LONG), d) : h1 % d;
final long modTop = mulRule(mulRule(BIT62, 4L, d), modH1, d);
return (int) addRule(modTop, modH0, d);
}
/**
* Returns the remainder from the modulo division of the 128-bit output of the murmurHash3 by the
* divisor.
*
* @param hash The size 2 long array from the MurmurHash3.
* @param divisor Must be positive and greater than zero.
* @return the modulo result
*/
public static int modulo(final long[] hash, final int divisor) {
return modulo(hash[0], hash[1], divisor);
}
private static long addRule(final long a, final long b, final long d) {
return ((a % d) + (b % d)) % d;
}
private static long mulRule(final long a, final long b, final long d) {
return ((a % d) * (b % d)) % d;
}
private static byte[] toByteArray(final long[] hash) { //Assumes Big Endian
final byte[] bArr = new byte[16];
final ByteBuffer bb = ByteBuffer.wrap(bArr);
bb.putLong(hash[0]);
bb.putLong(hash[1]);
return bArr;
}
private static long[] toLongArray(final byte[] data) {
final int dataLen = data.length;
final int longLen = (dataLen + 7) / 8;
final long[] longArr = new long[longLen];
for (int bi = 0; bi < dataLen; bi++) {
final int li = bi / 8;
longArr[li] |= (((long)data[bi]) << ((bi * 8) % 64));
}
return longArr;
}
private static long[] toLongArray(final int[] data) {
final int dataLen = data.length;
final int longLen = (dataLen + 1) / 2;
final long[] longArr = new long[longLen];
for (int ii = 0; ii < dataLen; ii++) {
final int li = ii / 2;
longArr[li] |= (((long)data[ii]) << ((ii * 32) % 64));
}
return longArr;
}
}