smile.neighbor.SNLSH Maven / Gradle / Ivy
The newest version!
/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.neighbor;
import smile.hash.MurmurHash;
import smile.math.distance.HammingDistance;
import smile.sort.HeapSelect;
import java.lang.reflect.Array;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Arrays;
import java.util.Set;
import java.util.LinkedHashMap;
import java.util.HashSet;
import static smile.neighbor.SNLSH.SimHash.simhash64;
/**
* Locality-Sensitive Hashing for Signatures.
* LSH is an efficient algorithm for approximate nearest neighbor search
* in high dimensional spaces by performing probabilistic dimension reduction of data.
* The basic idea is to hash the input items so that similar items are mapped to the same
* buckets with high probability (the number of buckets being much smaller
* than the universe of possible input items).
* To avoid computing the similarity of every pair of sets or their signatures,
* we divide the given signatures into bands and only measure the similarity
* of a pair of sets if they are identical in at least one band.
* By choosing the size of bands appropriately, we can eliminate from
* consideration most of the pairs that do not meet our threshold of similarity.
*
* References
*
* - Moses S. Charikar. Similarity Estimation Techniques from Rounding Algorithms
*
*
* @see LSH
* @author Qiyang Zuo
*/
public class SNLSH implements NearestNeighborSearch, KNNSearch, RNNSearch {
private final int bandSize;
private final long mask;
private static final int BITS = 64;
/**
* Signature fractions
*/
private Band[] bands;
/**
* The data objects.
*/
private List data;
/**
* The keys of data objects.
*/
private List keys;
/**
* signatures generated by simhash
*/
private List signs;
/**
* Whether to exclude query object self from the neighborhood.
*/
private boolean identicalExcluded = true;
@SuppressWarnings("unchecked")
public SNLSH(int bandSize) {
if (bandSize < 2 || bandSize > 32) {
throw new IllegalArgumentException("Invalid band size!");
}
this.bandSize = bandSize;
bands = (Band[]) Array.newInstance(Band.class, bandSize);
Arrays.fill(bands, new Band());
this.mask = -1 >>> (BITS / bandSize * (bandSize - 1));
data = new ArrayList();
keys = new ArrayList();
signs = new ArrayList();
}
public void put(AbstractSentence sentence, E v) {
int index = data.size();
data.add(v);
keys.add(sentence);
long sign = simhash64(sentence.tokens);
signs.add(sign);
for (int i = 0; i < bands.length; i++) {
long bandKey = bandHash(sign, i);
Bucket bucket = bands[i].get(bandKey);
if (bucket == null) {
bucket = new Bucket();
}
bucket.add(index);
bands[i].put(bandKey, bucket);
}
}
@SuppressWarnings("unchecked")
public Neighbor[] knn(AbstractSentence q, int k) {
if(k < 1) {
throw new IllegalArgumentException("Invalid k: " + k);
}
long fpq = simhash64(q.tokens);
Set candidates = obtainCandidates(q.tokens);
Neighbor[] neighbors = (Neighbor[])Array.newInstance(Neighbor.class, k);
HeapSelect> heap = new HeapSelect>(neighbors);
Neighbor neighbor = new Neighbor(null, null, 0, Double.MAX_VALUE);
for (int i = 0; i < k; i++) {
heap.add(neighbor);
}
int hit = 0;
for (int index : candidates) {
AbstractSentence key = keys.get(index);
if (q.line.equals(key.line) && identicalExcluded) {
continue;
}
long sign = signs.get(index);
double distance = HammingDistance.d(fpq, sign);
if (distance < heap.peek().distance) {
heap.add(new Neighbor(keys.get(index), data.get(index), index, distance));
hit++;
}
}
heap.sort();
if (hit < k) {
Neighbor[] n2 = (Neighbor[])Array.newInstance(Neighbor.class, hit);
int start = k - hit;
for (int i = 0; i < hit; i++) {
n2[i] = neighbors[i + start];
}
neighbors = n2;
}
return neighbors;
}
public Neighbor nearest(AbstractSentence q) {
Neighbor[] ns = knn(q, 1);
if(ns.length>0) {
return ns[0];
}
return new Neighbor(null, null, -1, Double.MAX_VALUE);
}
public void range(AbstractSentence q, double radius, List> neighbors) {
if (radius <= 0.0) {
throw new IllegalArgumentException("Invalid radius: " + radius);
}
long fpq = simhash64(q.tokens);
Set candidates = obtainCandidates(q.tokens);
for (int index : candidates) {
double distance = HammingDistance.d(fpq, signs.get(index));
if (distance <= radius) {
if (keys.get(index).line.equals(q.line) && identicalExcluded) {
continue;
}
neighbors.add(new Neighbor(keys.get(index), data.get(index), index, distance));
}
}
}
@SuppressWarnings("serial")
private class Band extends LinkedHashMap {}
@SuppressWarnings("serial")
private class Bucket extends LinkedList {}
private long bandHash(long hash, int bandNum) {
return hash >>> ((bandNum * (BITS / this.bandSize))) & mask;
}
private Set obtainCandidates(List q) {
Set candidates = new HashSet();
long sign = simhash64(q);
for (int i = 0; i < bands.length; i++) {
long bandKey = bandHash(sign, i);
Bucket bucket = bands[i].get(bandKey);
if (bucket != null) {
candidates.addAll(bucket);
}
}
return candidates;
}
public static class SimHash {
private static final long seed = 0; //do not change seed
public static long simhash64(List tokens) {
final int BITS = 64;
if (tokens == null || tokens.isEmpty()) {
return 0;
}
int[] bits = new int[BITS];
for (String s : tokens) {
ByteBuffer buffer = ByteBuffer.wrap(s.getBytes());
long hc = MurmurHash.hash2_64(buffer, 0, buffer.array().length, seed);
for (int i = 0; i < BITS; i++) {
if (((hc >>> i) & 1) == 1) {
bits[i]++;
} else {
bits[i]--;
}
}
}
long hash = 0;
long one = 1;
for (int i = 0; i < BITS; i++) {
if (bits[i] >= 0) {
hash |= one;
}
one <<= 1;
}
return hash;
}
}
public static abstract class AbstractSentence {
public String line;
public List tokens;
abstract List tokenize(String line);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy