All downloads are free. The search and download functionality uses the official Maven repository.

com.yahoo.sketches.hash.MurmurHash3Adaptor Maven / Gradle / Ivy

There is a newer version: 0.13.4
Show newest version
/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.hash;

import static com.yahoo.sketches.hash.MurmurHash3.hash;
import static java.nio.charset.StandardCharsets.UTF_8;

import java.nio.ByteBuffer;

import com.yahoo.sketches.SketchesArgumentException;
import com.yahoo.sketches.SketchesStateException;

/**
 * A general purpose wrapper for the MurmurHash3.
 * 
    *
  • Inputs can be long, long[], int[], byte[], double or String.
  • *
  • Returns null if arrays or String is null or empty.
  • *
  • Provides methods for returning the 128-bit result as either an array of 2 longs or as a byte * array of 16 bytes.
  • *
  • Provides modulo, asDouble and asInt functions.
  • *
* * @author Lee Rhodes */ public final class MurmurHash3Adaptor { private static final long BIT62 = 1L << 62; private static final long MAX_LONG = Long.MAX_VALUE; private static final long INT_MASK = 0x7FFFFFFFL; private static final long PRIME = 9219741426499971445L; //from P. L'Ecuyer and R. Simard private MurmurHash3Adaptor() {} /** * Hash a long and long seed. * * @param datum the input long value * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(long datum, long seed) { long[] data = { datum }; return toByteArray(hash(data, seed)); } /** * Hash a long[] and long seed. * * @param data the input long array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(long[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash an int[] and long seed. * * @param data the input int array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(int[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash a char[] and long seed. * * @param data the input char array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(char[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash a byte[] and long seed. * * @param data the input byte array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. 
*/ public static byte[] hashToBytes(byte[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash a double and long seed. * * @param datum the input double * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(double datum, long seed) { double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0 long[] data = { Double.doubleToLongBits(d) }; //canonicalize all NaN forms return toByteArray(hash(data, seed)); } /** * Hash a String and long seed. * * @param datum the input String * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(String datum, long seed) { if ((datum == null) || datum.isEmpty()) { return null; } byte[] data = datum.getBytes(UTF_8); return toByteArray(hash(data, seed)); } /** * Hash a long and long seed. * * @param datum the input long * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(long datum, long seed) { long[] data = { datum }; return hash(data, seed); } /** * Hash a long[] and long seed. * * @param data the input long array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(long[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a int[] and long seed. * * @param data the input int array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(int[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a char[] and long seed. * * @param data the input char array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. 
*/ public static long[] hashToLongs(char[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a byte[] and long seed. * * @param data the input byte array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(byte[] data, long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a double and long seed. * * @param datum the input double. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(double datum, long seed) { double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0 long[] data = { Double.doubleToLongBits(d) };//canonicalize all NaN forms return hash(data, seed); } /** * Hash a String and long seed. * * @param datum the input String. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(String datum, long seed) { if ((datum == null) || datum.isEmpty()) { return null; } byte[] data = datum.getBytes(UTF_8); return hash(data, seed); } //As Integer functions /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input data. * @param data the input long array. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(long[] data, int n) { if ((data == null) || (data.length == 0)) { throw new SketchesArgumentException("Input is null or empty."); } return asInteger(data, n); //data is long[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input data. * @param data the input int array. * @param n The upper exclusive bound of the integers produced. Must be > 1. 
* @return deterministic uniform random integer */ public static int asInt(int[] data, int n) { if ((data == null) || (data.length == 0)) { throw new SketchesArgumentException("Input is null or empty."); } return asInteger(toLongArray(data), n); //data is int[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input data. * @param data the input byte array. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer. */ public static int asInt(byte[] data, int n) { if ((data == null) || (data.length == 0)) { throw new SketchesArgumentException("Input is null or empty."); } return asInteger(toLongArray(data), n); //data is byte[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input datum. * @param datum the input long * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(long datum, int n) { long[] data = { datum }; return asInteger(data, n); //data is long[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input double. * @param datum the given double. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(double datum, int n) { double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0 long[] data = { Double.doubleToLongBits(d) };//canonicalize all NaN forms return asInteger(data, n); //data is long[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input datum. * @param datum the given String. * @param n The upper exclusive bound of the integers produced. Must be > 1. 
* @return deterministic uniform random integer */ public static int asInt(String datum, int n) { if ((datum == null) || datum.isEmpty()) { throw new SketchesArgumentException("Input is null or empty."); } byte[] data = datum.getBytes(UTF_8); return asInteger(toLongArray(data), n); //data is byte[] } /** * Returns a deterministic uniform random integer with a minimum inclusive value of zero and a * maximum exclusive value of n given the input data. * *

The integer values produced are only as random as the MurmurHash3 algorithm, which may be * adequate for many applications. However, if you are looking for high guarantees of randomness * you should turn to more sophisticated random generators such as Mersenne Twister or Well19937c * algorithms. * * @param data The input data (key) * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ private static int asInteger(long[] data, int n) { int t; int cnt = 0; long seed = 0; if (n < 2) { throw new SketchesArgumentException("Given value of n must be > 1."); } if (n > (1 << 30)) { while (++cnt < 10000) { long[] h = MurmurHash3.hash(data, seed); t = (int) (h[0] & INT_MASK); if (t < n) { return t; } t = (int) ((h[0] >>> 33)); if (t < n) { return t; } t = (int) (h[1] & INT_MASK); if (t < n) { return t; } t = (int) ((h[1] >>> 33)); if (t < n) { return t; } seed += PRIME; } // end while throw new SketchesStateException( "Internal Error: Failed to find integer < n within 10000 iterations."); } long mask = ceilingPowerOf2(n) - 1; while (++cnt < 10000) { long[] h = MurmurHash3.hash(data, seed); t = (int) (h[0] & mask); if (t < n) { return t; } t = (int) ((h[0] >>> 33) & mask); if (t < n) { return t; } t = (int) (h[1] & mask); if (t < n) { return t; } t = (int) ((h[1] >>> 33) & mask); if (t < n) { return t; } seed += PRIME; } // end while throw new SketchesStateException( "Internal Error: Failed to find integer < n within 10000 iterations."); } /** * Returns a uniform random double with a minimum inclusive value of zero and a maximum exclusive * value of 1.0. * *

The double values produced are only as random as the MurmurHash3 algorithm, which may be * adequate for many applications. However, if you are looking for high guarantees of randomness * you should turn to more sophisticated random generators such as Mersenne Twister or Well * algorithms. * * @param hash The output of the MurmurHash3. * @return the uniform random double. */ public static double asDouble(long[] hash) { return (hash[0] >>> 12) * 0x1.0p-52d; } /** * Returns the remainder from the modulo division of the 128-bit output of the murmurHash3 by the * divisor. * * @param h0 The lower 64-bits of the 128-bit MurmurHash3 hash. * @param h1 The upper 64-bits of the 128-bit MurmurHash3 hash. * @param divisor Must be positive and greater than zero. * @return the modulo result. */ public static int modulo(long h0, long h1, int divisor) { long d = divisor; long modH0 = (h0 < 0L) ? addRule(mulRule(BIT62, 2L, d), (h0 & MAX_LONG), d) : h0 % d; long modH1 = (h1 < 0L) ? addRule(mulRule(BIT62, 2L, d), (h1 & MAX_LONG), d) : h1 % d; long modTop = mulRule(mulRule(BIT62, 4L, d), modH1, d); return (int) addRule(modTop, modH0, d); } /** * Returns the remainder from the modulo division of the 128-bit output of the murmurHash3 by the * divisor. * * @param hash The size 2 long array from the MurmurHash3. * @param divisor Must be positive and greater than zero. 
* @return the modulo result */ public static int modulo(long[] hash, int divisor) { return modulo(hash[0], hash[1], divisor); } private static long addRule(long a, long b, long d) { return ((a % d) + (b % d)) % d; } private static long mulRule(long a, long b, long d) { return ((a % d) * (b % d)) % d; } private static byte[] toByteArray(long[] hash) { //Assumes Big Endian byte[] bArr = new byte[16]; ByteBuffer bb = ByteBuffer.wrap(bArr); bb.putLong(hash[0]); bb.putLong(hash[1]); return bArr; } private static long[] toLongArray(byte[] data) { int dataLen = data.length; int longLen = (dataLen + 7) / 8; long[] longArr = new long[longLen]; for (int bi = 0; bi < dataLen; bi++) { int li = bi / 8; longArr[li] |= (((long)data[bi]) << (bi * 8) % 64); } return longArr; } private static long[] toLongArray(int[] data) { int dataLen = data.length; int longLen = (dataLen + 1) / 2; long[] longArr = new long[longLen]; for (int ii = 0; ii < dataLen; ii++) { int li = ii / 2; longArr[li] |= (((long)data[ii]) << (ii * 32) % 64); } return longArr; } /** * Computes the ceiling power of 2 within the range [1, 2^30]. This is the smallest positive power * of 2 that equal to or greater than the given n.
* For: *

    *
  • n ≤ 1: returns 1
  • *
  • 2^30 ≤ n ≤ 2^31 -1 : returns 2^30
  • *
  • n == a power of 2 : returns n
  • *
  • otherwise returns the smallest power of 2 greater than n
  • *
* * @param n The input argument. * @return the ceiling power of 2. */ private static int ceilingPowerOf2(int n) { if (n <= 1) { return 1; } int topPwrOf2 = 1 << 30; return (n >= topPwrOf2) ? topPwrOf2 : Integer.highestOneBit((n - 1) << 1); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy