org.apache.ignite.ml.math.MurmurHash Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.ml.math;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * This is a very fast, non-cryptographic hash suitable for general hash-based lookup.
 * 
 * See http://murmurhash.googlepages.com/ for mre details.
 */
public class MurmurHash {
    /** Hide it. */
    private MurmurHash() {
    }

    /**
     * This produces exactly the same hash values as the final C+ version of MurmurHash3 and is
     * thus suitable for producing the same hash values across platforms.
     * 
     * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like IDs.
     * 
     * Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms are
     * optimized for their respective platforms.
     * 
     * See also http://github.com/yonik/java_util for future updates to this method.
     *
     * @param data Data to hash.
     * @param off Where to start munging.
     * @param len How many bytes to process.
     * @param seed The seed to start with.
     * @return 32 bit hash platform compatible with C++ MurmurHash3 implementation on x86.
     */
    public static int hash3X86(byte[] data, int off, int len, int seed) {
        int c1 = 0xcc9e2d51;
        int c2 = 0x1b873593;

        int h1 = seed;
        int roundedEnd = off + (len & 0xfffffffc);  // Round down to 4 byte block.

        for (int i = off; i < roundedEnd; i += 4) {
            int k1 = (data[i] & 0xff) | ((data[i + 1] & 0xff) << 8) | ((data[i + 2] & 0xff) << 16) | (data[i + 3] << 24);

            k1 *= c1;
            k1 = (k1 << 15) | (k1 >>> 17);
            k1 *= c2;

            h1 ^= k1;
            h1 = (h1 << 13) | (h1 >>> 19);
            h1 = h1 * 5 + 0xe6546b64;
        }

        // Tail.
        int k1 = 0;

        switch (len & 0x03) {
            case 3:
                k1 = (data[roundedEnd + 2] & 0xff) << 16;
                // Fallthrough - WTF?
            case 2:
                k1 |= (data[roundedEnd + 1] & 0xff) << 8;
                // Fallthrough - WTF?
            case 1:
                k1 |= data[roundedEnd] & 0xff;
                k1 *= c1;
                k1 = (k1 << 15) | (k1 >>> 17);
                k1 *= c2;
                h1 ^= k1;
            default:
        }

        // Finalization.
        h1 ^= len;

        h1 ^= h1 >>> 16;
        h1 *= 0x85ebca6b;
        h1 ^= h1 >>> 13;
        h1 *= 0xc2b2ae35;
        h1 ^= h1 >>> 16;

        return h1;
    }

    /**
     * Hashes an int.
     *
     * @param data The int to hash.
     * @param seed The seed to start with.
     * @return The 32 bit hash of the bytes in question.
     */
    public static int hash(int data, int seed) {
        byte[] arr = new byte[] {
            (byte)(data >>> 24),
            (byte)(data >>> 16),
            (byte)(data >>> 8),
            (byte)data
        };

        return hash(ByteBuffer.wrap(arr), seed);
    }

    /**
     * Hashes bytes in an array.
     *
     * @param data The bytes to hash.
     * @param seed The seed to start with.
     * @return The 32 bit hash of the bytes in question.
     */
    public static int hash(byte[] data, int seed) {
        return hash(ByteBuffer.wrap(data), seed);
    }

    /**
     * Hashes bytes in part of an array.
     *
     * @param data The data to hash.
     * @param off Where to start munging.
     * @param len How many bytes to process.
     * @param seed The seed to start with.
     * @return The 32-bit hash of the data in question.
     */
    public static int hash(byte[] data, int off, int len, int seed) {
        return hash(ByteBuffer.wrap(data, off, len), seed);
    }

    /**
     * Hashes the bytes in a buffer from the current position to the limit.
     *
     * @param buf The bytes to hash.
     * @param seed The seed to start with.
     * @return The 32 bit murmur hash of the bytes in the buffer.
     */
    public static int hash(ByteBuffer buf, int seed) {
        ByteOrder byteOrder = buf.order();
        buf.order(ByteOrder.LITTLE_ENDIAN);

        int m = 0x5bd1e995;
        int r = 24;

        int h = seed ^ buf.remaining();

        while (buf.remaining() >= 4) {
            int k = buf.getInt();

            k *= m;
            k ^= k >>> r;
            k *= m;

            h *= m;
            h ^= k;
        }

        if (buf.remaining() > 0) {
            ByteBuffer finish = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);

            finish.put(buf).rewind();

            h ^= finish.getInt();
            h *= m;
        }

        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        buf.order(byteOrder);

        return h;
    }

    /**
     * @param data The data to hash.
     * @param seed The seed to start with.
     * @return Hash value for given data and seed.
     */
    public static long hash64A(byte[] data, int seed) {
        return hash64A(ByteBuffer.wrap(data), seed);
    }

    /**
     * @param data The data to hash.
     * @param off Where to start munging.
     * @param len How many bytes to process.
     * @param seed The seed to start with.
     */
    public static long hash64A(byte[] data, int off, int len, int seed) {
        return hash64A(ByteBuffer.wrap(data, off, len), seed);
    }

    /**
     * @param buf The data to hash.
     * @param seed The seed to start with.
     */
    public static long hash64A(ByteBuffer buf, int seed) {
        ByteOrder byteOrder = buf.order();
        buf.order(ByteOrder.LITTLE_ENDIAN);

        long m = 0xc6a4a7935bd1e995L;
        int r = 47;

        long h = seed ^ (buf.remaining() * m);

        while (buf.remaining() >= 8) {
            long k = buf.getLong();

            k *= m;
            k ^= k >>> r;
            k *= m;

            h ^= k;
            h *= m;
        }

        if (buf.remaining() > 0) {
            ByteBuffer finish = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN);

            finish.put(buf).rewind();

            h ^= finish.getLong();
            h *= m;
        }

        h ^= h >>> r;
        h *= m;
        h ^= h >>> r;

        buf.order(byteOrder);

        return h;
    }
}