All downloads are free. The search and download functionality uses the official Maven repository.

info.debatty.java.lsh.LSH Maven / Gradle / Ivy

package info.debatty.java.lsh;

import java.io.Serializable;

/**
 * Implementation of Locality Sensitive Hashing (LSH) principle, as described in
 * Leskovec, Rajaraman and Ullman (2014), "Mining of Massive Datasets",
 * Cambridge University Press.
 *
 * A signature is cut into {@code stages} contiguous bands and every band is
 * hashed to one of {@code buckets} buckets.
 *
 * @author Thibault Debatty http://www.debatty.info
 */
public abstract class LSH implements Serializable {

    /** Multiplier applied to every signature element before bucketing. */
    protected static final long LARGE_PRIME =  433494437;
    private static final int DEFAULT_STAGES = 3;
    private static final int DEFAULT_BUCKETS = 10;

    /** Number of stages (bands) a signature is divided into. */
    private int stages = DEFAULT_STAGES;
    /** Number of buckets available in each stage. */
    private int buckets = DEFAULT_BUCKETS;

    /**
     * Instantiates a LSH instance with s stages (or bands) and b buckets (per
     * stage).
     *
     * @param stages number of stages (bands) the signature is divided into
     * @param buckets number of buckets (per stage)
     */
    public LSH(final int stages, final int buckets) {
        this.stages = stages;
        this.buckets = buckets;
    }

    /**
     * Instantiate an empty LSH instance (useful only for serialization).
     * The default number of stages and buckets is used.
     */
    public LSH() {

    }

    /**
     * Hash a signature.
     * The signature is divided in s stages (or bands). Each stage is hashed to
     * one of the b buckets. Elements that do not fit in an even split are
     * assigned to the last stage. If the signature holds fewer elements than
     * there are stages, each element gets its own stage and the remaining
     * stages are left at bucket 0.
     *
     * @param signature the signature to hash (values may be negative)
     * @return A vector of s integers (between 0 and b-1)
     */
    public final int[] hashSignature(final int[] signature) {

        // One running bucket index per stage
        int[] hash = new int[stages];

        int rows = rowsPerStage(signature.length);

        for (int i = 0; i < signature.length; i++) {
            int stage = Math.min(i / rows, stages - 1);

            // Widen to long before multiplying: the product does not fit
            // in an int.
            long sum = hash[stage] + (long) signature[i] * LARGE_PRIME;

            // Java's % keeps the sign of the dividend, so a negative
            // signature value would yield a negative bucket index. Shift the
            // remainder back into [0, buckets - 1].
            hash[stage] = (int) (((sum % buckets) + buckets) % buckets);
        }

        return hash;
    }

    /**
     * Hash a signature.
     * The signature is divided in s stages (or bands). Each stage is hashed to
     * one of the b buckets. Elements that do not fit in an even split are
     * assigned to the last stage. If the signature holds fewer elements than
     * there are stages, each element gets its own stage and the remaining
     * stages are left at bucket 0.
     *
     * @param signature the boolean signature to hash
     * @return A vector of s integers (between 0 and b-1)
     */
    public final int[] hashSignature(final boolean[] signature) {

        // One accumulator per stage (Java zero-initialises the array)
        long[] acc = new long[stages];

        int rows = rowsPerStage(signature.length);

        for (int i = 0; i < signature.length; i++) {
            // Only positions that are set contribute, weighted by their
            // 1-based index.
            long v = 0;
            if (signature[i]) {
                v = (i + 1) * LARGE_PRIME;
            }

            // current stage
            int j = Math.min(i / rows, stages - 1);
            acc[j] = (acc[j] + v) % Integer.MAX_VALUE;
        }

        // Reduce every accumulator to a bucket index
        int[] r = new int[stages];
        for (int i = 0; i < stages; i++) {
            r[i] = (int) (acc[i] % buckets);
        }

        return r;
    }

    /**
     * Number of signature elements assigned to each stage.
     * Never returns less than 1, so that a signature shorter than the number
     * of stages does not cause a division by zero when the stage index is
     * computed.
     *
     * @param length number of elements in the signature
     * @return rows per stage, at least 1
     */
    private int rowsPerStage(final int length) {
        return Math.max(1, length / stages);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy