info.debatty.java.lsh.examples.SimpleLSHMinHashExample Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright 2015 tibo.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package info.debatty.java.lsh.examples;
import info.debatty.java.lsh.LSHMinHash;
import java.util.Random;
/**
*
* @author tibo
*/
public class SimpleLSHMinHashExample {
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
// proportion of 0's in the vectors
// if the vectors are dense (lots of 1's), the average jaccard similarity
// will be very high (especially for large vectors), and LSH
// won't be able to distinguish them
// as a result, all vectors will be binned in the same bucket...
double sparsity = 0.75;
// Number of sets
int count = 10000;
// Size of vectors
int n = 100;
// LSH parameters
// the number of stages is also sometimes called thge number of bands
int stages = 2;
// Attention: to get relevant results, the number of elements per bucket
// should be at least 100
int buckets = 10;
// Let's generate some random sets
boolean[][] vectors = new boolean[count][n];
Random rand = new Random();
for (int i = 0; i < count; i++) {
for (int j = 0; j < n; j++) {
vectors[i][j] = rand.nextDouble() > sparsity;
}
}
// Create and configure LSH algorithm
LSHMinHash lsh = new LSHMinHash(stages, buckets, n);
int[][] counts = new int[stages][buckets];
// Perform hashing
for (boolean[] vector : vectors) {
int[] hash = lsh.hash(vector);
for (int i = 0; i < hash.length; i++) {
counts[i][hash[i]]++;
}
print(vector);
System.out.print(" : ");
print(hash);
System.out.print("\n");
}
System.out.println("Number of elements per bucket at each stage:");
for (int i = 0; i < stages; i++) {
print(counts[i]);
System.out.print("\n");
}
}
static void print(int[] array) {
System.out.print("[");
for (int v : array) {
System.out.print("" + v + " ");
}
System.out.print("]");
}
static void print(boolean[] array) {
System.out.print("[");
for (boolean v : array) {
System.out.print(v ? "1" : "0");
}
System.out.print("]");
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy