All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.common.util.SetBackedScalingCuckooFilter Maven / Gradle / Ivy

There is a newer version: 8.13.2
Show newest version
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.common.util;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.hash.MurmurHash3;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.function.Consumer;

/**
 * An approximate set membership datastructure that scales as more unique values are inserted.
 * Can definitively say if a member does not exist (no false negatives), but may say an item exists
 * when it does not (has false positives).  Similar in usage to a Bloom Filter.
 *
 * Internally, the datastructure maintains a Set of hashes up to a specified threshold.  This provides
 * 100% accurate membership queries.
 *
 * When the threshold is breached, a list of CuckooFilters is created and used to track membership.
 * These filters are approximate, similar to Bloom Filters.
 *
 * This datastructure scales as more values are inserted by growing the list of CuckooFilters.
 * Final size is dependent on the cardinality of data inserted, and the precision specified.
 */
public class SetBackedScalingCuckooFilter implements Writeable {

    /**
     * This is the estimated insertion capacity for each individual internal CuckooFilter.
     */
    private static final int FILTER_CAPACITY = 1000000;

    /**
     * This set is used to track the insertions before we convert over to an approximate
     * filter. This gives us 100% accuracy for small cardinalities.  This will be null
     * if isSetMode = false.  The set holds hashes of the inserted values, never the raw values.
     *
     * package-private for testing
     */
    Set<Long> hashes;

    /**
     * This list holds our approximate filters, after we have migrated out of a set.
     * This will be null if isSetMode = true.  New inserts always go to the last filter in the list.
     */
    List<CuckooFilter> filters;

    private final int threshold;
    private final Random rng;
    private final int capacity;
    private final double fpp;

    // Receives byte deltas (positive or negative) whenever the tracked size changes; no-op until registered
    private Consumer<Long> breaker = aLong -> {
        //noop
    };

    // cached here for performance reasons; only meaningful once we are in cuckoo mode
    private int numBuckets = 0;
    private int bitsPerEntry = 0;
    private int fingerprintMask = 0;

    // Reused across calls to avoid allocating a new hash holder per BytesRef lookup/insert
    private final MurmurHash3.Hash128 scratchHash = new MurmurHash3.Hash128();

    // True if we are tracking inserts with a set, false otherwise
    private boolean isSetMode = true;

    /**
     * @param threshold The number of distinct values that should be tracked
     *                  before converting to an approximate representation
     * @param rng A random number generator needed for the cuckoo hashing process
     * @param fpp the false-positive rate that should be used for the cuckoo filters.
     * @throws IllegalArgumentException if threshold is not positive, if threshold is larger than
     *                                  half the per-filter capacity, or if fpp is negative
     */
    public SetBackedScalingCuckooFilter(int threshold, Random rng, double fpp) {
        if (threshold <= 0) {
            throw new IllegalArgumentException("[threshold] must be a positive integer");
        }

        // We have to ensure that, in the worst case, two full sets can be converted into
        // one cuckoo filter without overflowing.  This keeps merging logic simpler.
        // Compare against capacity/2 instead of doubling the threshold: "threshold * 2" wraps
        // around for very large ints and would let them slip past this check.
        if (threshold > FILTER_CAPACITY / 2) {
            throw new IllegalArgumentException("[threshold] must be smaller than [" + (FILTER_CAPACITY / 2) + "]");
        }
        // NOTE(review): this accepts 0 and NaN even though the message says "positive" --
        // confirm against what CuckooFilter tolerates before tightening.
        if (fpp < 0) {
            throw new IllegalArgumentException("[fpp] must be a positive double");
        }
        this.hashes = new HashSet<>(threshold);
        this.threshold = threshold;
        this.rng = rng;
        this.capacity = FILTER_CAPACITY;
        this.fpp = fpp;
    }

    /**
     * Copy constructor.  Configuration, rng and the registered breaker are shared with {@code other}.
     * In set mode the hashes are copied into a fresh set.  In cuckoo mode only the list is copied.
     *
     * NOTE(review): the CuckooFilter instances themselves are shared with {@code other} (shallow copy),
     * so inserts into one copy's existing filters look like they are visible through the other --
     * confirm this is intended.
     */
    public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) {
        this.threshold = other.threshold;
        this.isSetMode = other.isSetMode;
        this.rng = other.rng;
        this.breaker = other.breaker;
        this.capacity = other.capacity;
        this.fpp = other.fpp;
        if (isSetMode) {
            this.hashes = new HashSet<>(other.hashes);
        } else {
            this.filters = new ArrayList<>(other.filters);
            cacheFilterParameters(filters.get(0));
        }
    }

    /**
     * Reads a filter previously written with {@link #writeTo(StreamOutput)}.
     *
     * @param in  the stream to read from
     * @param rng random number generator to use; it is not part of the serialized form
     */
    public SetBackedScalingCuckooFilter(StreamInput in, Random rng) throws IOException {
        this.threshold = in.readVInt();
        this.isSetMode = in.readBoolean();
        this.rng = rng;
        this.capacity = in.readVInt();
        this.fpp = in.readDouble();

        if (isSetMode) {
            this.hashes = in.readSet(StreamInput::readZLong);
        } else {
            this.filters = in.readList(in12 -> new CuckooFilter(in12, rng));
            cacheFilterParameters(filters.get(0));
        }
    }

    /**
     * Writes threshold, mode, capacity and fpp, followed by either the set of hashes (set mode)
     * or the list of filters (cuckoo mode).  The field order must match the StreamInput constructor.
     */
    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeVInt(threshold);
        out.writeBoolean(isSetMode);
        out.writeVInt(capacity);
        out.writeDouble(fpp);
        if (isSetMode) {
            out.writeCollection(hashes, StreamOutput::writeZLong);
        } else {
            out.writeList(filters);
        }
    }

    /**
     * Registers a circuit breaker with the datastructure.
     *
     * CuckooFilters can "saturate" and refuse to accept any new values.  When this happens,
     * the datastructure scales by adding a new filter.  This new filter's bytes will be tracked
     * in the registered breaker when configured.
     *
     * The current size of the datastructure is reported to the breaker immediately on registration.
     *
     * @throws NullPointerException if breaker is null
     */
    public void registerBreaker(Consumer<Long> breaker) {
        this.breaker = Objects.requireNonNull(breaker, "Circuit Breaker Consumer cannot be null");
        breaker.accept(getSizeInBytes());
    }

    /**
     * Returns true if the set might contain the provided value, false otherwise.  False values are
     * 100% accurate, while true values may be a false-positive.
     */
    public boolean mightContain(BytesRef value) {
        MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash);
        return mightContainHash(hash.h1);
    }

    /**
     * Returns true if the set might contain the provided value, false otherwise.  False values are
     * 100% accurate, while true values may be a false-positive.
     */
    public boolean mightContain(long value) {
        long hash = MurmurHash3.murmur64(value);
        return mightContainHash(hash);
    }

    /**
     * Returns true if the set might contain the provided (already computed) hash, false otherwise.
     * False values are 100% accurate, while true values may be a false-positive.
     */
    private boolean mightContainHash(long hash) {
        if (isSetMode) {
            return hashes.contains(hash);
        }

        // We calculate these once up front for all the filters and use the expert API.
        // The low 32 bits pick the bucket, the high 32 bits produce the fingerprint.
        int bucket = CuckooFilter.hashToIndex((int) hash, numBuckets);
        int fingerprint = CuckooFilter.fingerprint((int) (hash >> 32), bitsPerEntry, fingerprintMask);
        return mightContainFingerprint(bucket, fingerprint);
    }

    /**
     * Returns true if any of the filters contain this fingerprint at the specified bucket.
     * This is an expert-level API since it is dealing with buckets and fingerprints, not raw values
     * being hashed.  Only valid in cuckoo mode.
     */
    private boolean mightContainFingerprint(int bucket, int fingerprint) {
        int alternateIndex = CuckooFilter.alternateIndex(bucket, fingerprint, numBuckets);
        for (CuckooFilter filter : filters) {
            if (filter.mightContainFingerprint(bucket, fingerprint, alternateIndex)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds the provided value to the set for tracking
     */
    public void add(BytesRef value) {
        MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash);
        addHash(hash.h1);
    }

    /**
     * Adds the provided value to the set for tracking
     */
    public void add(long value) {
        addHash(MurmurHash3.murmur64(value));
    }

    /**
     * Adds an already computed hash.  In set mode the hash goes into the set, which may trigger
     * the conversion to cuckoo filters.  In cuckoo mode the hash goes into the last filter, and a
     * new filter is appended if that one rejects it.
     */
    private void addHash(long hash) {
        if (isSetMode) {
            hashes.add(hash);
            maybeConvert();
            return;
        }

        boolean success = filters.get(filters.size() - 1).add(hash);
        if (success == false) {
            // filter is full, create a new one and insert there
            CuckooFilter t = new CuckooFilter(capacity, fpp, rng);
            t.add(hash);
            filters.add(t);
            breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter
        }
    }

    /**
     * Converts to cuckoo mode if we are in set mode and hold more than {@code threshold} hashes.
     */
    private void maybeConvert() {
        if (isSetMode && hashes.size() > threshold) {
            convert();
        }
    }

    /**
     * If we are still holding values in a set, convert this filter into an approximate, cuckoo-backed filter.
     * This will create a list of CuckooFilters, and null out the set of hashes
     *
     * @throws IllegalStateException if this filter has already been converted
     */
    void convert() {
        if (isSetMode == false) {
            throw new IllegalStateException("Cannot convert SetBackedScalingCuckooFilter to approximate " +
                "when it has already been converted.");
        }
        // Must be captured before the set is dropped so it can be released from the breaker below
        long oldSize = getSizeInBytes();

        filters = new ArrayList<>();
        CuckooFilter t = new CuckooFilter(capacity, fpp, rng);
        // Cache the chosen numBuckets (and friends) for later use
        cacheFilterParameters(t);

        hashes.forEach(t::add);
        filters.add(t);

        hashes = null;
        isSetMode = false;

        // NOTE(review): inserts made in set mode after registerBreaker() are not reported to the breaker,
        // so oldSize may be larger than what the breaker was actually told about -- confirm.
        breaker.accept(-oldSize); // this zeros out the overhead of the set
        breaker.accept(getSizeInBytes()); // this adds back in the new overhead of the cuckoo filters

    }

    /**
     * Caches the bucket count, fingerprint mask and bits-per-entry of the given filter so that
     * lookups can compute bucket/fingerprint once and reuse them for every filter in the list.
     */
    private void cacheFilterParameters(CuckooFilter filter) {
        numBuckets = filter.getNumBuckets();
        fingerprintMask = filter.getFingerprintMask();
        bitsPerEntry = filter.getBitsPerEntry();
    }

    /**
     * Get the approximate size of this datastructure.  Approximate because only the Set occupants
     * are tracked, not the overhead of the Set itself.
     *
     * @return a fixed 13 bytes for the scalar fields, plus 16 bytes per tracked hash in set mode,
     *         or plus the reported size of every CuckooFilter in cuckoo mode
     */
    public long getSizeInBytes() {
        long bytes = 13; // fpp (double), threshold (int), isSetMode (boolean)
        if (hashes != null) {
            // Accumulate on top of the fixed overhead (it used to be overwritten here),
            // and multiply as a long so the estimate can never wrap.
            bytes += hashes.size() * 16L;
        }
        if (filters != null) {
            bytes += filters.stream().mapToLong(CuckooFilter::getSizeInBytes).sum();
        }
        return bytes;
    }


    /**
     * Merge `other` cuckoo filter into this cuckoo.  After merging, this filter's state will
     * be the union of the two.  During the merging process, the internal Set may be upgraded
     * to a cuckoo if it goes over threshold.  `other` is only read, never modified.
     *
     * @throws IllegalStateException if the two filters were configured with a different
     *                               threshold, capacity or fpp
     */
    public void merge(SetBackedScalingCuckooFilter other) {
        // Some basic sanity checks to make sure we can merge
        if (this.threshold != other.threshold) {
            throw new IllegalStateException("Cannot merge other CuckooFilter because thresholds do not match: ["
                + this.threshold + "] vs [" + other.threshold + "]");
        }
        if (this.capacity != other.capacity) {
            throw new IllegalStateException("Cannot merge other CuckooFilter because capacities do not match: ["
                + this.capacity + "] vs [" + other.capacity + "]");
        }
        if (this.fpp != other.fpp) {
            throw new IllegalStateException("Cannot merge other CuckooFilter because precisions do not match: ["
                + this.fpp + "] vs [" + other.fpp + "]");
        }

        if (isSetMode && other.isSetMode) {
            // Both in sets, merge collections then see if we need to convert to cuckoo
            hashes.addAll(other.hashes);
            maybeConvert();
            return;
        }

        if (other.isSetMode) {
            // We are in cuckoo mode, other is still a set.  Rather than converting the other to a
            // cuckoo first, we can just replay its values directly into our filter.  The set already
            // holds *hashes*, so they must go through addHash(); routing them through add(long)
            // would hash them a second time and the merged values could never be found again.
            other.hashes.forEach(this::addHash);
            return;
        }

        if (isSetMode) {
            // Other is in cuckoo mode, so we convert our set to a cuckoo first.
            // Both sides are then in cuckoo mode and we can do a cuckoo-cuckoo merge below.
            convert();
        }
        mergeFilters(other);
    }

    /**
     * Cuckoo-cuckoo merge: copies every raw fingerprint held by {@code other} that we do not
     * already (possibly) contain into our last filter, appending new filters as they fill up.
     * Both sides must be in cuckoo mode.
     */
    private void mergeFilters(SetBackedScalingCuckooFilter other) {
        CuckooFilter currentFilter = filters.get(filters.size() - 1);

        for (CuckooFilter otherFilter : other.filters) {

            // The iterator returns an array of longs corresponding to the
            // fingerprints for buckets at the current position
            Iterator<long[]> iter = otherFilter.getBuckets();
            int bucket = 0;
            while (iter.hasNext()) {
                long[] fingerprints = iter.next();

                // We check to see if the fingerprint is present in any of the existing filters
                // (in the same bucket/alternate bucket), or if the fingerprint is empty.  In these cases
                // we can skip the fingerprint
                for (long fingerprint : fingerprints) {
                    if (fingerprint == CuckooFilter.EMPTY || mightContainFingerprint(bucket, (int) fingerprint)) {
                        continue;
                    }
                    // Try to insert into the last filter in our list
                    if (currentFilter.mergeFingerprint(bucket, (int) fingerprint) == false) {
                        // if we failed, the filter is now saturated and we need to create a new one.
                        // Insert the rejected fingerprint into the fresh filter, exactly like addHash()
                        // does, so that a saturated filter can never cause the value to be dropped.
                        CuckooFilter t = new CuckooFilter(capacity, fpp, rng);
                        t.mergeFingerprint(bucket, (int) fingerprint);
                        filters.add(t);
                        breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter

                        currentFilter = t;
                    }
                }
                bucket += 1;
            }
        }
    }


    @Override
    public int hashCode() {
        return Objects.hash(hashes, filters, threshold, isSetMode, capacity, fpp);
    }

    /**
     * Two filters are equal when they have the same configuration (threshold, capacity, fpp),
     * are in the same mode, and hold equal hashes / filters.  The rng and the registered
     * breaker do not take part in equality.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null || getClass() != other.getClass()) {
            return false;
        }

        final SetBackedScalingCuckooFilter that = (SetBackedScalingCuckooFilter) other;
        return Objects.equals(this.hashes, that.hashes)
            && Objects.equals(this.filters, that.filters)
            && Objects.equals(this.threshold, that.threshold)
            && Objects.equals(this.isSetMode, that.isSetMode)
            && Objects.equals(this.capacity, that.capacity)
            && Objects.equals(this.fpp, that.fpp);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy