All downloads are free. Search and download functionality uses the official Maven repository.

org.elasticsearch.xpack.core.ml.inference.preprocessing.customwordembedding.Hash32 Maven / Gradle / Ivy

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 *
 * This Java port of CLD3 was derived from Google's CLD3 project at https://github.com/google/cld3
 */
package org.elasticsearch.xpack.core.ml.inference.preprocessing.customwordembedding;

import java.nio.charset.StandardCharsets;

/**
 * Seeded 32-bit hash used when hashing nGrams.
 *
 * The algorithm is a port of the {@code Hash32} routine from Google's CLD3, see
 * https://github.com/google/cld3/blob/06f695f1c8ee530104416aab5dcf2d6a1414a56a/src/utils.cc#L137
 */
final class Hash32 {

    /** Seed applied when the no-argument constructor is used. */
    private static final int DEFAULT_SEED = 0xBEEF;

    // Mixing constants generated offline. They are not really 'magic',
    // they just happen to work well.
    private static final int MIX_MULTIPLIER = 0x5bd1e995;
    private static final int MIX_SHIFT = 24;

    /** Mask that keeps the low 8 bits, i.e. reads a byte as an unsigned value. */
    private static final int BYTE_MASK = 0xFF;

    private final int seed;

    /** Creates a hasher that uses {@link #DEFAULT_SEED}. */
    Hash32() {
        this(DEFAULT_SEED);
    }

    /**
     * Creates a hasher with an explicit seed.
     *
     * @param seed value folded into the initial hash state
     */
    Hash32(int seed) {
        this.seed = seed;
    }

    /**
     * Hashes the UTF-8 encoding of the given string.
     *
     * @param input the text to hash
     * @return the 32-bit hash widened to a {@code long} without sign extension,
     *         so the result always lies in {@code [0, 2^32 - 1]}
     */
    public long hash(String input) {
        final int signedHash = mix(input.getBytes(StandardCharsets.UTF_8));
        return signedHash & 0xFFFFFFFFL;
    }

    /**
     * Computes the raw (signed) 32-bit hash of a byte array.
     *
     * It is critical that this exact hash is used, as it determines which weight and
     * quantile column/row is chosen when building the feature array.
     *
     * @param data the bytes to hash
     * @return the hash as a signed {@code int}
     */
    private int mix(byte[] data) {
        final int length = data.length;
        // Index of the first byte that is not part of a complete 4-byte word.
        final int tailStart = length & ~3;

        // Initialize the state to a 'random' value derived from seed and length.
        int state = seed ^ length;

        // Fold every complete little-endian 4-byte word into the state.
        for (int pos = 0; pos < tailStart; pos += 4) {
            int word = (data[pos] & BYTE_MASK)
                | (data[pos + 1] & BYTE_MASK) << 8
                | (data[pos + 2] & BYTE_MASK) << 16
                | (data[pos + 3] & BYTE_MASK) << 24;
            word *= MIX_MULTIPLIER;
            word ^= word >>> MIX_SHIFT; // unsigned shift, the reference uses uint32
            word *= MIX_MULTIPLIER;
            state = (state * MIX_MULTIPLIER) ^ word;
        }

        // Fold in the 1 to 3 trailing bytes; each case deliberately falls through.
        switch (length - tailStart) {
            case 3:
                state ^= (data[tailStart + 2] & BYTE_MASK) << 16;
                // fall through
            case 2:
                state ^= (data[tailStart + 1] & BYTE_MASK) << 8;
                // fall through
            case 1:
                state ^= data[tailStart] & BYTE_MASK;
                state *= MIX_MULTIPLIER;
                break;
            default:
                // no trailing bytes
                break;
        }

        // A few final mixes so that the last bytes are well incorporated.
        state ^= state >>> 13; // unsigned shift
        state *= MIX_MULTIPLIER;
        state ^= state >>> 15; // unsigned shift
        return state;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy