org.elasticsearch.common.util.SetBackedScalingCuckooFilter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.util;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.hash.MurmurHash3;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.function.Consumer;
/**
* An approximate set membership datastructure that scales as more unique values are inserted.
* Can definitively say if a member does not exist (no false negatives), but may say an item exists
* when it does not (has false positives). Similar in usage to a Bloom Filter.
*
* Internally, the datastructure maintains a Set of hashes up to a specified threshold. This provides
* 100% accurate membership queries.
*
* When the threshold is breached, a list of CuckooFilters is created and used to track membership.
* These filters are approximate, similar to Bloom Filters.
*
* This datastructure scales as more values are inserted by growing the list of CuckooFilters.
* Final size is dependent on the cardinality of data inserted, and the precision specified.
*/
public class SetBackedScalingCuckooFilter implements Writeable {
/**
 * This is the estimated insertion capacity for each individual internal CuckooFilter.
 */
private static final int FILTER_CAPACITY = 1000000;

/**
 * This set is used to track the insertions before we convert over to an approximate
 * filter. This gives us 100% accuracy for small cardinalities. It stores the 64-bit
 * hashes of the inserted values, not the values themselves. This will be null
 * if isSetMode = false;
 *
 * package-private for testing
 */
Set<Long> hashes;

/**
 * This list holds our approximate filters, after we have migrated out of a set.
 * New values are always inserted into the last filter of the list.
 * This will be null if isSetMode = true;
 */
List<CuckooFilter> filters;

// Number of distinct hashes the set may hold before it is converted to cuckoo filters
private final int threshold;
// Handed to every CuckooFilter this instance creates
private final Random rng;
// Insertion capacity requested for each CuckooFilter
private final int capacity;
// False-positive rate requested for each CuckooFilter
private final double fpp;

// Receives byte deltas whenever the footprint of this structure changes; a no-op until
// a real consumer is installed through registerBreaker
private Consumer<Long> breaker = aLong -> {
    //noop
};

// cached here for performance reasons
private int numBuckets = 0;
private int bitsPerEntry = 0;
private int fingerprintMask = 0;

// Reusable output object passed to MurmurHash3.hash128 on every BytesRef hash.
// NOTE(review): sharing this instance presumably makes the BytesRef methods unsafe for
// concurrent use - confirm against callers.
private final MurmurHash3.Hash128 scratchHash = new MurmurHash3.Hash128();

// True if we are tracking inserts with a set, false otherwise
private boolean isSetMode = true;
/**
 * Creates an empty filter that starts out in exact, set-backed mode.
 *
 * @param threshold The number of distinct values that should be tracked
 *                  before converting to an approximate representation
 * @param rng A random number generator needed for the cuckoo hashing process
 * @param fpp the false-positive rate that should be used for the cuckoo filters.
 * @throws IllegalArgumentException if {@code threshold} is not positive, if {@code threshold}
 *                                  is larger than half of the per-filter capacity, or if
 *                                  {@code fpp} is negative
 */
public SetBackedScalingCuckooFilter(int threshold, Random rng, double fpp) {
    if (threshold <= 0) {
        throw new IllegalArgumentException("[threshold] must be a positive integer");
    }

    // We have to ensure that, in the worst case, two full sets can be converted into
    // one cuckoo filter without overflowing. This keeps merging logic simpler.
    // The bound is checked against half the capacity instead of doubling the threshold:
    // "threshold * 2" wraps around to a negative int for very large thresholds, which
    // would let them pass this check and reach the HashSet allocation below.
    if (threshold > FILTER_CAPACITY / 2) {
        throw new IllegalArgumentException("[threshold] must be smaller than [" + (FILTER_CAPACITY / 2) + "]");
    }
    if (fpp < 0) {
        throw new IllegalArgumentException("[fpp] must be a positive double");
    }

    this.hashes = new HashSet<>(threshold);
    this.threshold = threshold;
    this.rng = rng;
    this.capacity = FILTER_CAPACITY;
    this.fpp = fpp;
}
public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) {
this.threshold = other.threshold;
this.isSetMode = other.isSetMode;
this.rng = other.rng;
this.breaker = other.breaker;
this.capacity = other.capacity;
this.fpp = other.fpp;
if (isSetMode) {
this.hashes = new HashSet<>(other.hashes);
} else {
this.filters = new ArrayList<>(other.filters);
this.numBuckets = filters.get(0).getNumBuckets();
this.fingerprintMask = filters.get(0).getFingerprintMask();
this.bitsPerEntry = filters.get(0).getBitsPerEntry();
}
}
public SetBackedScalingCuckooFilter(StreamInput in, Random rng) throws IOException {
this.threshold = in.readVInt();
this.isSetMode = in.readBoolean();
this.rng = rng;
this.capacity = in.readVInt();
this.fpp = in.readDouble();
if (isSetMode) {
this.hashes = in.readSet(StreamInput::readZLong);
} else {
this.filters = in.readList(in12 -> new CuckooFilter(in12, rng));
this.numBuckets = filters.get(0).getNumBuckets();
this.fingerprintMask = filters.get(0).getFingerprintMask();
this.bitsPerEntry = filters.get(0).getBitsPerEntry();
}
}
/**
 * Serializes this filter: threshold, mode flag, capacity and fpp first, followed by either
 * the set of hashes (set mode) or the list of cuckoo filters (cuckoo mode). The random
 * generator and the breaker are not written.
 *
 * @param out the stream to write to
 * @throws IOException if writing to the stream fails
 */
@Override
public void writeTo(StreamOutput out) throws IOException {
    out.writeVInt(threshold);
    out.writeBoolean(isSetMode);
    out.writeVInt(capacity);
    out.writeDouble(fpp);

    if (isSetMode == false) {
        out.writeList(filters);
        return;
    }
    out.writeCollection(hashes, StreamOutput::writeZLong);
}
/**
 * Registers a circuit breaker with the datastructure.
 *
 * CuckooFilter's can "saturate" and refuse to accept any new values. When this happens,
 * the datastructure scales by adding a new filter. This new filter's bytes will be tracked
 * in the registered breaker when configured.
 *
 * The current size of the datastructure is reported to the breaker immediately on registration.
 *
 * @param breaker consumer that receives byte deltas; the value is negative when memory is released
 * @throws NullPointerException if {@code breaker} is null
 */
public void registerBreaker(Consumer<Long> breaker) {
    this.breaker = Objects.requireNonNull(breaker, "Circuit Breaker Consumer cannot be null");
    // Account for everything that was allocated before the breaker was attached
    breaker.accept(getSizeInBytes());
}
/**
 * Membership test for a byte-sequence value. A {@code false} answer is always correct,
 * a {@code true} answer may be a false-positive.
 *
 * @param value the bytes to look up (the slice described by offset and length is hashed)
 * @return true if the value might have been added, false if it definitely was not
 */
public boolean mightContain(BytesRef value) {
    // Only the h1 component of the Hash128 result is used as the lookup key
    return mightContainHash(MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash).h1);
}
/**
 * Membership test for a numeric value. A {@code false} answer is always correct,
 * a {@code true} answer may be a false-positive.
 *
 * @param value the number to look up; it is hashed with murmur64 before the lookup
 * @return true if the value might have been added, false if it definitely was not
 */
public boolean mightContain(long value) {
    return mightContainHash(MurmurHash3.murmur64(value));
}
/**
 * Membership test on an already-computed 64-bit hash. In set mode this is an exact lookup;
 * in cuckoo mode every filter in the list is consulted, so a {@code true} answer may be a
 * false-positive while a {@code false} answer is always correct.
 *
 * @param hash the hash of the value being looked up
 * @return true if the hash might be present, false if it definitely is not
 */
private boolean mightContainHash(long hash) {
    if (isSetMode == false) {
        // Bucket and fingerprint are derived once, from the low and high 32 bits of the hash
        // respectively, and then checked against every filter through the expert API
        final int bucket = CuckooFilter.hashToIndex((int) hash, numBuckets);
        final int fingerprint = CuckooFilter.fingerprint((int) (hash >> 32), bitsPerEntry, fingerprintMask);
        return mightContainFingerprint(bucket, fingerprint);
    }
    return hashes.contains(hash);
}
/**
 * Checks whether any filter in the list holds the given fingerprint in the given bucket or
 * in that fingerprint's alternate bucket.
 * This is an expert-level API since it works on buckets and fingerprints rather than on the
 * raw values being hashed.
 *
 * @param bucket      primary bucket index of the fingerprint
 * @param fingerprint the fingerprint to look for
 * @return true as soon as one filter reports a possible match, false if none does
 */
private boolean mightContainFingerprint(int bucket, int fingerprint) {
    // The alternate bucket depends only on the cached numBuckets, so compute it once
    final int alternate = CuckooFilter.alternateIndex(bucket, fingerprint, numBuckets);
    final int filterCount = filters.size();
    for (int i = 0; i < filterCount; i++) {
        if (filters.get(i).mightContainFingerprint(bucket, fingerprint, alternate)) {
            return true;
        }
    }
    return false;
}
/**
 * Adds the provided byte-sequence value to the set for tracking.
 *
 * @param value the bytes to add (the slice described by offset and length is hashed)
 */
public void add(BytesRef value) {
    // Only the h1 component of the Hash128 result is tracked
    addHash(MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash).h1);
}
/**
 * Adds the provided numeric value to the set for tracking.
 *
 * @param value the number to add; it is hashed with murmur64 before being stored
 */
public void add(long value) {
    final long hashed = MurmurHash3.murmur64(value);
    addHash(hashed);
}
/**
 * Records an already-computed hash. In set mode the hash goes into the set, which may
 * trigger the conversion to cuckoo filters. In cuckoo mode the hash is inserted into the
 * last filter of the list; if that filter rejects it, a new filter is appended, the hash
 * is inserted there, and the new filter's size is reported to the breaker.
 *
 * @param hash the hash of the value being added
 */
private void addHash(long hash) {
    if (isSetMode == false) {
        final CuckooFilter newest = filters.get(filters.size() - 1);
        if (newest.add(hash)) {
            return;
        }

        // filter is full, create a new one and insert there
        final CuckooFilter replacement = new CuckooFilter(capacity, fpp, rng);
        replacement.add(hash);
        filters.add(replacement);
        // make sure we account for the new filter
        breaker.accept(replacement.getSizeInBytes());
        return;
    }

    hashes.add(hash);
    maybeConvert();
}
/**
 * Converts to cuckoo mode if we are still in set mode and the set has grown
 * past the configured threshold. Does nothing otherwise.
 */
private void maybeConvert() {
    if (isSetMode == false) {
        return;
    }
    if (hashes.size() > threshold) {
        convert();
    }
}
/**
 * Switches this datastructure from the exact set to the approximate, cuckoo-backed
 * representation. A single CuckooFilter is created and filled with all hashes of the set,
 * the set is nulled out, and the breaker is told about the released and the newly
 * occupied bytes.
 *
 * @throws IllegalStateException if the conversion has already happened
 */
void convert() {
    if (isSetMode == false) {
        throw new IllegalStateException("Cannot convert SetBackedScalingCuckooFilter to approximate " +
            "when it has already been converted.");
    }

    // Must be measured before anything is changed, it is handed back to the breaker below
    final long sizeBeforeConversion = getSizeInBytes();

    filters = new ArrayList<>();
    final CuckooFilter firstFilter = new CuckooFilter(capacity, fpp, rng);

    // Cache the geometry chosen by the filter for later use
    numBuckets = firstFilter.getNumBuckets();
    fingerprintMask = firstFilter.getFingerprintMask();
    bitsPerEntry = firstFilter.getBitsPerEntry();

    for (Long trackedHash : hashes) {
        firstFilter.add(trackedHash);
    }
    filters.add(firstFilter);

    hashes = null;
    isSetMode = false;

    // this zeros out the overhead of the set
    breaker.accept(-sizeBeforeConversion);
    // this adds back in the new overhead of the cuckoo filters
    breaker.accept(getSizeInBytes());
}
/**
 * Get the approximate size of this datastructure. Approximate because only the Set occupants
 * are tracked, not the overhead of the Set itself.
 *
 * The result is a fixed 13 bytes for the scalar fields, plus 16 bytes per tracked hash while
 * in set mode, plus the reported size of every cuckoo filter once converted.
 *
 * @return the estimated number of bytes used
 */
public long getSizeInBytes() {
    long bytes = 13; // fpp (double), threshold (int), isSetMode (boolean)
    if (hashes != null) {
        // Accumulate on top of the fixed overhead instead of overwriting it, and multiply
        // as long so the arithmetic never happens in int.
        // NOTE(review): 16 bytes per entry is presumably an estimate for a boxed Long - confirm.
        bytes += hashes.size() * 16L;
    }
    if (filters != null) {
        bytes += filters.stream().mapToLong(CuckooFilter::getSizeInBytes).sum();
    }
    return bytes;
}
/**
 * Merge `other` cuckoo filter into this cuckoo. After merging, this filter's state will
 * be the union of the two. During the merging process, the internal Set may be upgraded
 * to a cuckoo if it goes over threshold. `other` is only read, never modified.
 *
 * @param other the filter whose contents should be added to this one
 * @throws IllegalStateException if the two filters were configured with a different
 *                               threshold, capacity or false-positive rate
 */
public void merge(SetBackedScalingCuckooFilter other) {
    // Some basic sanity checks to make sure we can merge
    if (this.threshold != other.threshold) {
        throw new IllegalStateException("Cannot merge other CuckooFilter because thresholds do not match: ["
            + this.threshold + "] vs [" + other.threshold + "]");
    }
    if (this.capacity != other.capacity) {
        throw new IllegalStateException("Cannot merge other CuckooFilter because capacities do not match: ["
            + this.capacity + "] vs [" + other.capacity + "]");
    }
    if (this.fpp != other.fpp) {
        throw new IllegalStateException("Cannot merge other CuckooFilter because precisions do not match: ["
            + this.fpp + "] vs [" + other.fpp + "]");
    }

    if (isSetMode && other.isSetMode) {
        // Both in sets, merge collections then see if we need to convert to cuckoo
        hashes.addAll(other.hashes);
        maybeConvert();
    } else if (isSetMode && other.isSetMode == false) {
        // Other is in cuckoo mode, so we convert our set to a cuckoo, then
        // call the merge function again. Since both are now in cuckoo mode
        // this will fall through to the last conditional and do a cuckoo-cuckoo merge
        convert();
        merge(other);
    } else if (isSetMode == false && other.isSetMode) {
        // Rather than converting the other to a cuckoo first, we can just
        // replay the values directly into our filter. The other set already holds hashes,
        // so they must go through addHash: add(long) would run murmur64 over them a second
        // time and store them under a key that mightContain() never looks up.
        other.hashes.forEach(this::addHash);
    } else {
        // Both are in cuckoo mode, merge raw fingerprints
        CuckooFilter currentFilter = filters.get(filters.size() - 1);
        for (CuckooFilter otherFilter : other.filters) {
            // The iterator returns an array of longs corresponding to the
            // fingerprints for buckets at the current position
            Iterator<long[]> iter = otherFilter.getBuckets();
            int bucket = 0;
            while (iter.hasNext()) {
                long[] fingerprints = iter.next();
                // We check to see if the fingerprint is present in any of the existing filters
                // (in the same bucket/alternate bucket), or if the fingerprint is empty. In these cases
                // we can skip the fingerprint
                for (long fingerprint : fingerprints) {
                    if (fingerprint == CuckooFilter.EMPTY || mightContainFingerprint(bucket, (int) fingerprint)) {
                        continue;
                    }
                    // Try to insert into the last filter in our list
                    if (currentFilter.mergeFingerprint(bucket, (int) fingerprint) == false) {
                        // if we failed, the filter is now saturated and we need to create a new one
                        CuckooFilter t = new CuckooFilter(capacity, fpp, rng);
                        // Retry the insert on the fresh filter, the same way addHash does after
                        // a rejected add, so the fingerprint cannot be dropped by the merge.
                        // NOTE(review): if CuckooFilter keeps a rejected fingerprint internally
                        // this only stores a harmless duplicate - confirm against CuckooFilter.
                        t.mergeFingerprint(bucket, (int) fingerprint);
                        filters.add(t);
                        breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter
                        currentFilter = t;
                    }
                }
                bucket += 1;
            }
        }
    }
}
/**
 * Hash code over the same six fields that {@link #equals(Object)} compares: hashes, filters,
 * threshold, isSetMode, capacity and fpp, combined with the usual 31-multiplier scheme.
 */
@Override
public int hashCode() {
    int result = 1;
    result = 31 * result + Objects.hashCode(hashes);
    result = 31 * result + Objects.hashCode(filters);
    result = 31 * result + Integer.hashCode(threshold);
    result = 31 * result + Boolean.hashCode(isSetMode);
    result = 31 * result + Integer.hashCode(capacity);
    result = 31 * result + Double.hashCode(fpp);
    return result;
}
/**
 * Two filters are equal when they are of the same class and agree on threshold, capacity,
 * fpp, mode, the tracked hashes and the list of cuckoo filters. The random generator and
 * the registered breaker do not take part in the comparison.
 */
@Override
public boolean equals(Object other) {
    if (this == other) {
        return true;
    }
    if (other == null) {
        return false;
    }
    if (getClass() != other.getClass()) {
        return false;
    }

    final SetBackedScalingCuckooFilter that = (SetBackedScalingCuckooFilter) other;
    // Cheap scalar comparisons first. Double.compare treats NaN as equal to itself and
    // distinguishes 0.0 from -0.0, which matches the semantics of comparing boxed Doubles.
    if (this.threshold != that.threshold || this.capacity != that.capacity) {
        return false;
    }
    if (this.isSetMode != that.isSetMode || Double.compare(this.fpp, that.fpp) != 0) {
        return false;
    }
    return Objects.equals(this.hashes, that.hashes) && Objects.equals(this.filters, that.filters);
}
}