All Downloads are FREE. Search and download functionalities are using the official Maven repository.

info.debatty.java.lsh.MinHash Maven / Gradle / Ivy

package info.debatty.java.lsh;

import java.io.Serializable;
import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

/**
 * MinHash is a hashing scheme that tents to produce similar signatures for sets
 * that have a high Jaccard similarity.
 *
 * The Jaccard similarity between two sets is the relative number of elements
 * these sets have in common: J(A, B) = |A ∩ B| / |A ∪ B| A MinHash signature is
 * a sequence of numbers produced by multiple hash functions hi. It can be shown
 * that the Jaccard similarity between two sets is also the probability that
 * this hash result is the same for the two sets: J(A, B) = Pr[hi(A) = hi(B)].
 * Therefore, MinHash signatures can be used to estimate Jaccard similarity
 * between two sets. Moreover, it can be shown that the expected estimation
 * error is O(1 / sqrt(n)), where n is the size of the signature (the number of
 * hash functions that are used to produce the signature).
 *
 * @author Thibault Debatty http://www.debatty.info
 */
public class MinHash implements Serializable {

    private static final int LARGE_PRIME = 2147483647; // = 2^31 - 1 !

    /**
     * Compute the jaccard index between two sets.
     * @param s1
     * @param s2
     * @return
     */
    public static double jaccardIndex(
            final Set s1, final Set s2) {

        Set intersection = new HashSet(s1);
        intersection.retainAll(s2);

        Set union = new HashSet(s1);
        union.addAll(s2);

        if (union.isEmpty()) {
            return 0;
        }

        return (double) intersection.size() / union.size();
    }

    /**
     * Compute the exact jaccard index between two sets, represented as
     * arrays of booleans.
     * @param s1
     * @param s2
     * @return
     */
    public static double jaccardIndex(final boolean[] s1, final boolean[] s2) {
        if (s1.length != s2.length) {
            throw new InvalidParameterException("sets must be same size!");
        }
        return jaccardIndex(convert2Set(s1), convert2Set(s2));
    }

    /**
     * Convert a set represented as an array of booleans to a set of integer.
     *
     * @param array
     * @return
     */
    public static Set convert2Set(final boolean[] array) {
        Set set = new TreeSet();
        for (int i = 0; i < array.length; i++) {
            if (array[i]) {
                set.add(i);
            }
        }
        return set;
    }

    /**
     * Computes the size of the signature required to achieve a given error in
     * similarity estimation. (1 / error^2)
     *
     * @param error
     * @return size of the signature
     */
    public static int size(final double error) {
        if (error < 0 && error > 1) {
            throw new IllegalArgumentException("error should be in [0 .. 1]");
        }
        return (int) (1 / (error * error));
    }

    /**
     * Signature size.
     */
    private int n;

    /**
     * Random a and b coefficients for the random hash functions.
     */
    private long[][] hash_coefs;

    /**
     * Dictionary size (is also the size of vectors if the sets are provided
     * as vectors).
     */
    private int dict_size;

    /**
     * Initializes hash functions to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements.
     *
     * @param size the number of hash functions (and the size of resulting
     * signatures)
     * @param dict_size
     */
    public MinHash(final int size, final int dict_size) {
        init(size, dict_size, new Random());
    }

    /**
     * Initializes hash function to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements, with a given similarity
     * estimation error.
     *
     * @param error
     * @param dict_size
     */
    public MinHash(final double error, final int dict_size) {
        init(size(error), dict_size, new Random());
    }

    /**
     * Initializes hash functions to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements.
     *
     * @param size the number of hash functions (and the size of resulting
     * signatures)
     * @param dict_size
     * @param seed random number generator seed. using the same value will
     * guarantee identical hashes across object instantiations
     */
    public MinHash(final int size, final int dict_size, final long seed) {
        init(size, dict_size, new Random(seed));
    }

    /**
     * Initializes hash function to compute MinHash signatures for sets built
     * from a dictionary of dict_size elements, with a given similarity
     * estimation error.
     *
     * @param error
     * @param dict_size
     * @param seed random number generator seed. using the same value will
     * guarantee identical hashes across object instantiations
     */
    public MinHash(final double error, final int dict_size, final long seed) {
        init(size(error), dict_size, new Random(seed));
    }

    /**
     * Computes the signature for this set The input set is represented as an
     * vector of booleans.
     * For example the array [true, false, true, true, false]
     * corresponds to the set {0, 2, 3}
     *
     * @param vector
     * @return the signature
     */
    public final int[] signature(final boolean[] vector) {
        if (vector.length != dict_size) {
            throw new IllegalArgumentException(
                    "Size of array should be dict_size");
        }

        return signature(convert2Set(vector));
    }

    /**
     * Computes the signature for this set. For example set = {0, 2, 3}
     *
     * @param set
     * @return the signature
     */
    public final int[] signature(final Set set) {
        int[] sig = new int[n];

        for (int i = 0; i < n; i++) {
            sig[i] = Integer.MAX_VALUE;
        }

        // For each row r:
        //for (int r = 0; r < dict_size; r++) {
        // if set has 0 in row r, do nothing
        //    if (!set.contains(r)) {
        //        continue;
        //    }
        // Loop over true values, instead of loop over all values of dictionary
        // to speedup computation
        final List list = new ArrayList(set);
        Collections.sort(list);

        for (final int r : list) {

            // However, if c has 1 in row r, then for each i = 1, 2, . . . ,n
            // set SIG(i, c) to the smaller of the current value of
            // SIG(i, c) and hi(r)
            for (int i = 0; i < n; i++) {
                sig[i] = Math.min(
                        sig[i],
                        h(i, r));
            }
        }

        return sig;
    }

    /**
     * Computes an estimation of Jaccard similarity (the number of elements in
     * common) between two sets, using the MinHash signatures of these two sets.
     *
     * @param sig1 MinHash signature of set1
     * @param sig2 MinHash signature of set2 (produced using the same
     * coefficients)
     * @return the estimated similarity
     */
    public final double similarity(final int[] sig1, final int[] sig2) {
        if (sig1.length != sig2.length) {
            throw new IllegalArgumentException(
                    "Size of signatures should be the same");
        }

        double sim = 0;
        for (int i = 0; i < sig1.length; i++) {
            if (sig1[i] == sig2[i]) {
                sim += 1;
            }
        }

        return sim / sig1.length;
    }

    /**
     * Computes the expected error of similarity computed using signatures.
     *
     * @return the expected error
     */
    public final double error() {
        return 1.0 / Math.sqrt(n);
    }

    /**
     * Compute hash function coefficients using provided Random.
     * @param size
     * @param dict_size
     * @param r
     */
    private void init(final int size, final int dict_size, final Random r) {
        if (size <= 0) {
            throw new InvalidParameterException(
                    "Signature size should be positive");
        }

        if (dict_size <= 0) {
            throw new InvalidParameterException(
                    "Dictionary size (or vector size) should be positive");
        }

        // In function h(i, x) the largest value could be
        // dict_size * dict_size + dict_size
        // throw an error if dict_size * dict_size + dict_size > Long.MAX_VALUE
        if (dict_size > (Long.MAX_VALUE - dict_size) / dict_size) {
            throw new InvalidParameterException(
                    "Dictionary size (or vector size) is too big and will "
                            + "cause a multiplication overflow");
        }

        this.dict_size = dict_size;
        this.n = size;

        // h = (a * x) + b
        // a and b should be randomly generated in [1,PRIME-1]
        hash_coefs = new long[n][2];
        for (int i = 0; i < n; i++) {
            hash_coefs[i][0] = r.nextInt(LARGE_PRIME - 1) + 1; // a
            hash_coefs[i][1] = r.nextInt(LARGE_PRIME - 1) + 1; // b
        }
    }

    /**
     * Computes hi(x) as (a_i * x + b_i) % LARGE_PRIME .
     *
     * @param i
     * @param x
     * @return the hashed value of x, using ith hash function
     */
    private int h(final int i, final int x) {
        return (int)
                ((hash_coefs[i][0] * (long) x + hash_coefs[i][1])
                 % LARGE_PRIME);
    }

    /**
     * Get the coefficients used by hash function hi.
     * @return
     */
    public final long[][] getCoefficients() {
        return hash_coefs;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy