// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
package info.debatty.java.lsh;
import java.security.InvalidParameterException;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
/**
* MinHash is a hashing scheme that tends to produce similar signatures for sets
* that have a high Jaccard similarity.
*
* The Jaccard similarity between two sets is the relative number of elements
* these sets have in common: J(A, B) = |A ∩ B| / |A ∪ B|
* A MinHash signature is a sequence of numbers produced by multiple hash
* functions hi. It can be shown that the Jaccard similarity between two sets is
* also the probability that this hash result is the same for the two sets:
* J(A, B) = Pr[hi(A) = hi(B)]. Therefore, MinHash signatures can be used to
* estimate Jaccard similarity between two sets. Moreover, it can be shown that
* the expected estimation error is O(1 / sqrt(n)), where n is the size of the
* signature (the number of hash functions that are used to produce the
* signature).
*
* @author Thibault Debatty http://www.debatty.info
*/
public class MinHash {
public static double JaccardIndex(Set s1, Set s2) {
Set intersection = new HashSet(s1);
intersection.retainAll(s2);
Set union = new HashSet(s1);
union.addAll(s2);
if (union.isEmpty()) {
return 0;
}
return (double) intersection.size() / union.size();
}
public static double JaccardIndex(boolean[] s1, boolean[] s2) {
if (s1.length != s2.length) {
throw new InvalidParameterException("sets must be same size!");
}
return JaccardIndex(Convert2Set(s1), Convert2Set(s2));
}
public static Set Convert2Set(boolean[] array) {
Set set = new TreeSet();
for (int i = 0; i < array.length; i++) {
if (array[i]) {
set.add(i);
}
}
return set;
}
/**
* Computes the size of the signature required to achieve a given error
* in similarity estimation (1 / error^2)
* @param error
* @return size of the signature
*/
public static int size(double error) {
if (error < 0 && error > 1) {
throw new IllegalArgumentException("error should be in [0 .. 1]");
}
return (int) (1 / (error * error));
}
/**
* Signature size
*/
private int n;
/**
* Random a and b coefficients for the random hash functions
*/
private int[][] hash_coefs;
/**
* Dictionary size
*/
private int dict_size;
/**
* Initializes hash functions to compute MinHash signatures for sets built
* from a dictionary of dict_size elements
*
* @param size the number of hash functions (and the size of resulting signatures)
* @param dict_size
*/
public MinHash (int size, int dict_size) {
init(size, dict_size);
}
/**
* Initializes hash function to compute MinHash signatures for sets built
* from a dictionary of dict_size elements, with a given similarity
* estimation error
* @param error
* @param dict_size
*/
public MinHash (double error, int dict_size) {
init(size(error), dict_size);
}
/**
* Computes the signature for this set
* The input set is represented as an vector of booleans
* For example the array [true, false, true, true, false]
* corresponds to the set {0, 2, 3}
*
* @param vector
* @return the signature
*/
public int[] signature(boolean[] vector) {
if (vector.length != dict_size) {
throw new IllegalArgumentException("Size of array should be dict_size");
}
return signature(Convert2Set(vector));
}
/**
* Computes the signature for this set
* For example set = {0, 2, 3}
* @param set
* @return the signature
*/
public int[] signature(Set set) {
int[] sig = new int[n];
for (int i = 0; i < n; i++) {
sig[i] = Integer.MAX_VALUE;
}
for (int r = 0; r < dict_size; r++) {
if (set.contains(r)) {
for (int i = 0; i < n; i++) {
sig[i] = Math.min(
sig[i],
h(i, r));
}
}
}
return sig;
}
/**
* Computes an estimation of Jaccard similarity (the number of elements in
* common) between two sets, using the MinHash signatures of these two sets
* @param sig1 MinHash signature of set1
* @param sig2 MinHash signature of set2 (produced using the same coefficients)
* @return the estimated similarity
*/
public double similarity(int[] sig1, int[] sig2) {
if (sig1.length != sig2.length) {
throw new IllegalArgumentException("Size of signatures should be the same");
}
double sim = 0;
for (int i = 0; i < sig1.length; i++) {
if (sig1[i] == sig2[i]) {
sim += 1;
}
}
return sim / sig1.length;
}
/**
* Computes the expected error of similarity computed using signatures
* @return the expected error
*/
public double error() {
return 1.0 / Math.sqrt(n);
}
private void init(int size, int dict_size) {
this.dict_size = dict_size;
n = size;
// h = (a * x) + b
// a and b should be randomly generated
Random r = new Random();
hash_coefs = new int[n][2];
for (int i = 0; i < n; i++) {
hash_coefs[i][0] = r.nextInt(Integer.MAX_VALUE); // a
hash_coefs[i][1] = r.nextInt(Integer.MAX_VALUE); // b
}
}
/**
* Computes hi(x) as (a_i * x + b_i) % dict_size.
* Computations are executed using long, then returned as an int
*
* @param i
* @param x
* @return the hashed value of x, using ith hash function
*/
private int h(int i, int x) {
return (int) ((((long)hash_coefs[i][0]) * x +
((long)hash_coefs[i][1])) % dict_size);
}
}