All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.hash.MurmurHash3Adaptor Maven / Gradle / Ivy

Go to download

Core sketch algorithms used alone and by other Java repositories in the DataSketches library.

There is a newer version: 7.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.hash;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.Util.ceilingPowerOf2;
import static org.apache.datasketches.hash.MurmurHash3.hash;

import java.nio.ByteBuffer;

import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.SketchesStateException;

/**
 * A general purpose wrapper for the MurmurHash3.
 * 
    *
  • Inputs can be long, long[], int[], char[], byte[], double or String.
  • *
  • Returns null if arrays or String is null or empty.
  • *
  • Provides methods for returning the 128-bit result as either an array of 2 longs or as a byte * array of 16 bytes.
  • *
  • Provides modulo, asDouble and asInt functions.
  • *
* * @author Lee Rhodes */ public final class MurmurHash3Adaptor { private static final long BIT62 = 1L << 62; private static final long MAX_LONG = Long.MAX_VALUE; private static final long INT_MASK = 0x7FFFFFFFL; private static final long PRIME = 9219741426499971445L; //from P. L'Ecuyer and R. Simard private MurmurHash3Adaptor() {} /** * Hash a long and long seed. * * @param datum the input long value * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(final long datum, final long seed) { final long[] data = { datum }; return toByteArray(hash(data, seed)); } /** * Hash a long[] and long seed. * * @param data the input long array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(final long[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash an int[] and long seed. * * @param data the input int array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(final int[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash a char[] and long seed. * * @param data the input char array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(final char[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash a byte[] and long seed. * * @param data the input byte array * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(final byte[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return toByteArray(hash(data, seed)); } /** * Hash a double and long seed. * * @param datum the input double * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(final double datum, final long seed) { final double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0 final long[] data = { Double.doubleToLongBits(d) }; //canonicalize all NaN forms return toByteArray(hash(data, seed)); } /** * Hash a String and long seed. * * @param datum the input String * @param seed A long valued seed. * @return The 128-bit hash as a byte[16] in Big Endian order from 2 64-bit longs. */ public static byte[] hashToBytes(final String datum, final long seed) { if ((datum == null) || datum.isEmpty()) { return null; } final byte[] data = datum.getBytes(UTF_8); return toByteArray(hash(data, seed)); } /** * Hash a long and long seed. * * @param datum the input long * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(final long datum, final long seed) { final long[] data = { datum }; return hash(data, seed); } /** * Hash a long[] and long seed. * * @param data the input long array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(final long[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a int[] and long seed. * * @param data the input int array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(final int[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a char[] and long seed. * * @param data the input char array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(final char[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a byte[] and long seed. * * @param data the input byte array. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(final byte[] data, final long seed) { if ((data == null) || (data.length == 0)) { return null; } return hash(data, seed); } /** * Hash a double and long seed. * * @param datum the input double. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(final double datum, final long seed) { final double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0 final long[] data = { Double.doubleToLongBits(d) };//canonicalize all NaN forms return hash(data, seed); } /** * Hash a String and long seed. * * @param datum the input String. * @param seed A long valued seed. * @return The 128-bit hash as a long[2]. */ public static long[] hashToLongs(final String datum, final long seed) { if ((datum == null) || datum.isEmpty()) { return null; } final byte[] data = datum.getBytes(UTF_8); return hash(data, seed); } //As Integer functions /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input data. * @param data the input long array. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(final long[] data, final int n) { if ((data == null) || (data.length == 0)) { throw new SketchesArgumentException("Input is null or empty."); } return asInteger(data, n); //data is long[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input data. * @param data the input int array. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(final int[] data, final int n) { if ((data == null) || (data.length == 0)) { throw new SketchesArgumentException("Input is null or empty."); } return asInteger(toLongArray(data), n); //data is int[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input data. * @param data the input byte array. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer. */ public static int asInt(final byte[] data, final int n) { if ((data == null) || (data.length == 0)) { throw new SketchesArgumentException("Input is null or empty."); } return asInteger(toLongArray(data), n); //data is byte[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input datum. * @param datum the input long * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(final long datum, final int n) { final long[] data = { datum }; return asInteger(data, n); //data is long[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input double. * @param datum the given double. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(final double datum, final int n) { final double d = (datum == 0.0) ? 0.0 : datum; //canonicalize -0.0, 0.0 final long[] data = { Double.doubleToLongBits(d) };//canonicalize all NaN forms return asInteger(data, n); //data is long[] } /** * Returns a deterministic uniform random integer between zero (inclusive) and * n (exclusive) given the input datum. * @param datum the given String. * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ public static int asInt(final String datum, final int n) { if ((datum == null) || datum.isEmpty()) { throw new SketchesArgumentException("Input is null or empty."); } final byte[] data = datum.getBytes(UTF_8); return asInteger(toLongArray(data), n); //data is byte[] } /** * Returns a deterministic uniform random integer with a minimum inclusive value of zero and a * maximum exclusive value of n given the input data. * *

The integer values produced are only as random as the MurmurHash3 algorithm, which may be * adequate for many applications. However, if you are looking for high guarantees of randomness * you should turn to more sophisticated random generators such as Mersenne Twister or Well19937c * algorithms. * * @param data The input data (key) * @param n The upper exclusive bound of the integers produced. Must be > 1. * @return deterministic uniform random integer */ private static int asInteger(final long[] data, final int n) { int t; int cnt = 0; long seed = 0; if (n < 2) { throw new SketchesArgumentException("Given value of n must be > 1."); } if (n > (1 << 30)) { while (++cnt < 10000) { final long[] h = MurmurHash3.hash(data, seed); t = (int) (h[0] & INT_MASK); if (t < n) { return t; } t = (int) ((h[0] >>> 33)); if (t < n) { return t; } t = (int) (h[1] & INT_MASK); if (t < n) { return t; } t = (int) ((h[1] >>> 33)); if (t < n) { return t; } seed += PRIME; } // end while throw new SketchesStateException( "Internal Error: Failed to find integer < n within 10000 iterations."); } final long mask = ceilingPowerOf2(n) - 1; while (++cnt < 10000) { final long[] h = MurmurHash3.hash(data, seed); t = (int) (h[0] & mask); if (t < n) { return t; } t = (int) ((h[0] >>> 33) & mask); if (t < n) { return t; } t = (int) (h[1] & mask); if (t < n) { return t; } t = (int) ((h[1] >>> 33) & mask); if (t < n) { return t; } seed += PRIME; } // end while throw new SketchesStateException( "Internal Error: Failed to find integer < n within 10000 iterations."); } /** * Returns a uniform random double with a minimum inclusive value of zero and a maximum exclusive * value of 1.0. * *

The double values produced are only as random as the MurmurHash3 algorithm, which may be * adequate for many applications. However, if you are looking for high guarantees of randomness * you should turn to more sophisticated random generators such as Mersenne Twister or Well * algorithms. * * @param hash The output of the MurmurHash3. * @return the uniform random double. */ public static double asDouble(final long[] hash) { return (hash[0] >>> 12) * 0x1.0p-52d; } /** * Returns the remainder from the modulo division of the 128-bit output of the murmurHash3 by the * divisor. * * @param h0 The lower 64-bits of the 128-bit MurmurHash3 hash. * @param h1 The upper 64-bits of the 128-bit MurmurHash3 hash. * @param divisor Must be positive and greater than zero. * @return the modulo result. */ public static int modulo(final long h0, final long h1, final int divisor) { final long d = divisor; final long modH0 = (h0 < 0L) ? addRule(mulRule(BIT62, 2L, d), (h0 & MAX_LONG), d) : h0 % d; final long modH1 = (h1 < 0L) ? addRule(mulRule(BIT62, 2L, d), (h1 & MAX_LONG), d) : h1 % d; final long modTop = mulRule(mulRule(BIT62, 4L, d), modH1, d); return (int) addRule(modTop, modH0, d); } /** * Returns the remainder from the modulo division of the 128-bit output of the murmurHash3 by the * divisor. * * @param hash The size 2 long array from the MurmurHash3. * @param divisor Must be positive and greater than zero. * @return the modulo result */ public static int modulo(final long[] hash, final int divisor) { return modulo(hash[0], hash[1], divisor); } private static long addRule(final long a, final long b, final long d) { return ((a % d) + (b % d)) % d; } private static long mulRule(final long a, final long b, final long d) { return ((a % d) * (b % d)) % d; } private static byte[] toByteArray(final long[] hash) { //Assumes Big Endian final byte[] bArr = new byte[16]; final ByteBuffer bb = ByteBuffer.wrap(bArr); bb.putLong(hash[0]); bb.putLong(hash[1]); return bArr; } private static long[] toLongArray(final byte[] data) { final int dataLen = data.length; final int longLen = (dataLen + 7) / 8; final long[] longArr = new long[longLen]; for (int bi = 0; bi < dataLen; bi++) { final int li = bi / 8; longArr[li] |= (((long)data[bi]) << ((bi * 8) % 64)); } return longArr; } private static long[] toLongArray(final int[] data) { final int dataLen = data.length; final int longLen = (dataLen + 1) / 2; final long[] longArr = new long[longLen]; for (int ii = 0; ii < dataLen; ii++) { final int li = ii / 2; longArr[li] |= (((long)data[ii]) << ((ii * 32) % 64)); } return longArr; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy