// de.lmu.ifi.dbs.elki.index.lsh.InMemoryLSHIndex (source listing; Maven / Gradle / Ivy)
/*
 * This file is part of ELKI:
 * Environment for Developing KDD-Applications Supported by Index-Structures
 *
 * Copyright (C) 2019
 * ELKI Development Team
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.lmu.ifi.dbs.elki.index.lsh;
import java.util.ArrayList;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.KNNHeap;
import de.lmu.ifi.dbs.elki.database.ids.KNNList;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList;
import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.index.AbstractRefiningIndex;
import de.lmu.ifi.dbs.elki.index.IndexFactory;
import de.lmu.ifi.dbs.elki.index.KNNIndex;
import de.lmu.ifi.dbs.elki.index.RangeIndex;
import de.lmu.ifi.dbs.elki.index.lsh.hashfamilies.LocalitySensitiveHashFunctionFamily;
import de.lmu.ifi.dbs.elki.index.lsh.hashfunctions.LocalitySensitiveHashFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
* Locality Sensitive Hashing.
* @author Erich Schubert
* @since 0.6.0
* @has - - - LocalitySensitiveHashFunctionFamily
* @has - - - Instance
* @param Object type to index
public class InMemoryLSHIndex implements IndexFactory {
* Class logger
private static final Logging LOG = Logging.getLogger(InMemoryLSHIndex.class);
* LSH hash function family to use.
LocalitySensitiveHashFunctionFamily super V> family;
* Number of hash tables to use.
int l;
* Number of buckets to use.
int numberOfBuckets;
* Constructor.
* @param family Projection family
* @param l Number of hash tables to use
* @param numberOfBuckets Number of buckets to use.
public InMemoryLSHIndex(LocalitySensitiveHashFunctionFamily super V> family, int l, int numberOfBuckets) {
this.family = family;
this.l = l;
this.numberOfBuckets = numberOfBuckets;
public Instance instantiate(Relation relation) {
return new Instance(relation, family.generateHashFunctions(relation, l), numberOfBuckets);
public TypeInformation getInputTypeRestriction() {
return family.getInputTypeRestriction();
* Instance of a LSH index for a single relation.
* @author Erich Schubert
* @has - - - LocalitySensitiveHashFunction
public class Instance extends AbstractRefiningIndex implements KNNIndex, RangeIndex {
* Hash functions to use.
ArrayList extends LocalitySensitiveHashFunction super V>> hashfunctions;
* The actual table
ArrayList> hashtables;
* Number of buckets to use.
private int numberOfBuckets;
* Constructor.
* @param relation Relation to index.
* @param hashfunctions Hash functions.
public Instance(Relation relation, ArrayList extends LocalitySensitiveHashFunction super V>> hashfunctions, int numberOfBuckets) {
this.hashfunctions = hashfunctions;
this.numberOfBuckets = numberOfBuckets;
public String getLongName() {
return "LSH index";
public String getShortName() {
return "lsh-index";
public void initialize() {
final int numhash = hashfunctions.size();
hashtables = new ArrayList<>(numhash);
for(int i = 0; i < numhash; i++) {
hashtables.add(new Int2ObjectOpenHashMap(numberOfBuckets));
// TODO: We assume all hash functions have the same dimensionality.
double[] buf = new double[hashfunctions.get(0).getNumberOfProjections()];
FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Building LSH index", relation.size(), LOG) : null;
int expect = Math.max(2, (int) Math.ceil(relation.size() / (double) numberOfBuckets));
for(DBIDIter iter = relation.getDBIDs().iter(); iter.valid(); iter.advance()) {
V obj = relation.get(iter);
for(int i = 0; i < numhash; i++) {
final Int2ObjectOpenHashMap table = hashtables.get(i);
final LocalitySensitiveHashFunction super V> hashfunc = hashfunctions.get(i);
// Get the initial (unbounded) hash code:
int hash = hashfunc.hashObject(obj, buf);
// Reduce to hash table size
int bucket = hash % numberOfBuckets;
DBIDs cur = table.get(bucket);
if(cur == null) {
table.put(bucket, DBIDUtil.deref(iter));
else if(cur.size() > 1) {
((ModifiableDBIDs) cur).add(iter);
else {
ModifiableDBIDs newbuck = DBIDUtil.newArray(expect);
table.put(bucket, newbuck);
if(LOG.isStatistics()) {
int min = Integer.MAX_VALUE, max = 0;
for(int i = 0; i < numhash; i++) {
final Int2ObjectOpenHashMap table = hashtables.get(i);
for(DBIDs set : table.values()) {
final int size = set.size();
min = size < min ? size : min;
max = size > max ? size : max;
LOG.statistics(new LongStatistic(this.getClass().getName() + ".fill.min", min));
LOG.statistics(new LongStatistic(this.getClass().getName() + ".fill.max", max));
LOG.statistics(new LongStatistic(this.getClass().getName() + ".hashtables", hashtables.size()));
public Logging getLogger() {
return LOG;
public KNNQuery getKNNQuery(DistanceQuery distanceQuery, Object... hints) {
for(Object hint : hints) {
if(DatabaseQuery.HINT_EXACT.equals(hint)) {
return null;
DistanceFunction super V> df = distanceQuery.getDistanceFunction();
if(!family.isCompatible(df)) {
return null;
return new LSHKNNQuery(distanceQuery);
public RangeQuery getRangeQuery(DistanceQuery distanceQuery, Object... hints) {
for(Object hint : hints) {
if(DatabaseQuery.HINT_EXACT.equals(hint)) {
return null;
DistanceFunction super V> df = distanceQuery.getDistanceFunction();
if(!family.isCompatible(df)) {
return null;
return new LSHRangeQuery(distanceQuery);
* Get the candidates: points which have at least one hash bucket in common.
* @param obj Query object
* @return Candidates
protected DBIDs getCandidates(V obj) {
ModifiableDBIDs candidates = null;
final int numhash = hashtables.size();
double[] buf = new double[hashfunctions.get(0).getNumberOfProjections()];
for(int i = 0; i < numhash; i++) {
final Int2ObjectOpenHashMap table = hashtables.get(i);
final LocalitySensitiveHashFunction super V> hashfunc = hashfunctions.get(i);
// Get the initial (unbounded) hash code:
int hash = hashfunc.hashObject(obj, buf);
// Reduce to hash table size
int bucket = hash % numberOfBuckets;
DBIDs cur = table.get(bucket);
if(cur != null) {
if(candidates == null) {
candidates = DBIDUtil.newHashSet(cur.size() * numhash);
return (candidates == null) ? DBIDUtil.EMPTYDBIDS : candidates;
* Class for handling kNN queries against the LSH index.
* @author Erich Schubert
protected class LSHKNNQuery extends AbstractKNNQuery {
* Constructor.
* @param distanceQuery
public LSHKNNQuery(DistanceQuery distanceQuery) {
public KNNList getKNNForObject(V obj, int k) {
DBIDs candidates = getCandidates(obj);
// Refine.
KNNHeap heap = DBIDUtil.newHeap(k);
for(DBIDIter iter = candidates.iter(); iter.valid(); iter.advance()) {
final double dist = distanceQuery.distance(obj, iter);
heap.insert(dist, iter);
return heap.toKNNList();
* Class for handling kNN queries against the LSH index.
* @author Erich Schubert
protected class LSHRangeQuery extends AbstractRangeQuery {
* Constructor.
* @param distanceQuery
public LSHRangeQuery(DistanceQuery distanceQuery) {
public void getRangeForObject(V obj, double range, ModifiableDoubleDBIDList result) {
DBIDs candidates = getCandidates(obj);
// Refine.
for(DBIDIter iter = candidates.iter(); iter.valid(); iter.advance()) {
final double dist = distanceQuery.distance(obj, iter);
if(dist <= range) {
result.add(dist, iter);
* Parameterization class.
* @author Erich Schubert
public static class Parameterizer extends AbstractParameterizer {
* Hash function family parameter.
public static final OptionID FAMILY_ID = new OptionID("lsh.family", "Hash function family to use for LSH.");
* Number of hash tables to use for LSH.
public static final OptionID L_ID = new OptionID("lsh.tables", "Number of hash tables to use.");
* Number of hash tables to use for LSH.
public static final OptionID BUCKETS_ID = new OptionID("lsh.buckets", "Number of hash buckets to use.");
* LSH hash function family to use.
LocalitySensitiveHashFunctionFamily super V> family;
* Number of hash functions for each table.
int l;
* Number of buckets to use.
int numberOfBuckets;
protected void makeOptions(Parameterization config) {
ObjectParameter> familyP = new ObjectParameter<>(FAMILY_ID, LocalitySensitiveHashFunctionFamily.class);
if(config.grab(familyP)) {
family = familyP.instantiateClass(config);
IntParameter lP = new IntParameter(L_ID) //
if(config.grab(lP)) {
l = lP.intValue();
IntParameter bucketsP = new IntParameter(BUCKETS_ID) //
.setDefaultValue(7919); // Primes work best, apparently.
if(config.grab(bucketsP)) {
numberOfBuckets = bucketsP.intValue();
protected InMemoryLSHIndex makeInstance() {
return new InMemoryLSHIndex<>(family, l, numberOfBuckets);
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy