com.github.jparkie.deduplicator.impl.BSBFSDDeDuplicator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of DeDuplicator Show documentation
Advanced Bloom Filter Based Algorithms for Efficient Approximate Data De-Duplication in Streams
The newest version!
package com.github.jparkie.deduplicator.impl;

import com.github.jparkie.deduplicator.BitArray;
import com.github.jparkie.deduplicator.Murmur3_x86_32;
import com.github.jparkie.deduplicator.Platform;
import com.github.jparkie.deduplicator.ProbabilisticDeDuplicator;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;

/**
 * De-Duplication by a Biased Sampling based Bloom Filter with Single Deletion (BSBFSD).
 * <p>
 * Described by Suman K. Bera, Sourav Dutta, Ankur Narang, Souvik Bhattacherjee in
 * <i>Advanced Bloom Filter Based Algorithms for Efficient Approximate Data De-Duplication in Streams</i>:
 * <a href="https://arxiv.org/abs/1212.3964">https://arxiv.org/abs/1212.3964</a>
 * <p>
 * The structure keeps one {@link BitArray} per hash function, each created with
 * {@code numBits / numHashFunctions} bits (integer division). An element is probed at exactly one
 * position in every filter.
 * <p>
 * This class performs no synchronization and reuses a single scratch hash buffer across calls to
 * {@link #classifyDistinct(byte[])} and {@link #peekDistinct(byte[])}; concurrent use of one instance
 * needs external synchronization.
 */
public class BSBFSDDeDuplicator implements ProbabilisticDeDuplicator, Serializable {
    /** Shared helper that performs the custom (de)serialization used by writeObject/readObject. */
    private static final BSBFSDDeDuplicatorSerializer SERIALIZER = new BSBFSDDeDuplicatorSerializer();

    // These fields are package-private and non-final on purpose: readObject reassigns them, and code
    // in the same package may read them directly. Their visibility must not be changed.
    long numBits;
    int numHashFunctions;
    BitArray[] bloomFilters;

    /** Scratch space holding one non-negative combined hash per Bloom filter; rebuilt on every probe. */
    private transient int[] hashBuffer;

    /**
     * Creates a de-duplicator with freshly allocated Bloom filters.
     *
     * @param numBits          total memory budget in bits; split evenly across the filters
     * @param numHashFunctions number of hash functions, which is also the number of Bloom filters
     * @throws IllegalArgumentException if either argument is not positive, or if {@code numBits} is
     *                                  smaller than {@code numHashFunctions}
     */
    public BSBFSDDeDuplicator(long numBits, int numHashFunctions) {
        this(numBits, numHashFunctions, bloomFilters(numBits, numHashFunctions));
    }

    /**
     * Wraps already constructed Bloom filters without validating the arguments.
     *
     * @param numBits          value reported by {@link #numBits()}
     * @param numHashFunctions value reported by {@link #numHashFunctions()}
     * @param bloomFilters     filters to use as-is (not copied); the scratch hash buffer is sized to
     *                         this array's length
     */
    BSBFSDDeDuplicator(long numBits, int numHashFunctions, BitArray[] bloomFilters) {
        this.numBits = numBits;
        this.numHashFunctions = numHashFunctions;
        this.bloomFilters = bloomFilters;
        this.hashBuffer = new int[this.bloomFilters.length];
    }

    /** Leaves every field at its default value; nothing in this class calls it. */
    private BSBFSDDeDuplicator() {
    }

    /**
     * Creates a de-duplicator whose number of hash functions is derived from a threshold false
     * positive rate, following the trade-off suggested by the paper.
     *
     * @param numBits total memory budget in bits
     * @param fpp     threshold false positive rate, strictly between 0 and 1
     * @return a new, empty de-duplicator
     * @throws IllegalArgumentException if {@code fpp} is outside (0, 1), if {@code numBits} is not
     *                                  positive, or if {@code numBits} is smaller than the derived
     *                                  number of hash functions
     */
    public static BSBFSDDeDuplicator create(long numBits, double fpp) {
        return new BSBFSDDeDuplicator(numBits, optimalNumOfHashFunctions(fpp));
    }

    /**
     * Computes the number of hash functions k for a threshold false positive rate.
     *
     * @param fpp threshold false positive rate, strictly between 0 and 1
     * @return {@code ceil((ln(fpp) / ln(1 - 1/e) + 1) / 2)}
     * @throws IllegalArgumentException if {@code fpp} is outside (0, 1)
     */
    private static int optimalNumOfHashFunctions(double fpp) {
        if (fpp <= 0D || fpp >= 1D) {
            final String error = String.format("fpp must be in the range (0, 1), but got %f", fpp);
            throw new IllegalArgumentException(error);
        }
        /*
         * From Advanced Bloom Filter Based Algorithms for Efficient Approximate Data De-Duplication in Streams:
         * As a trade-off we set k as the arithmetic mean of 1 and ln(fpp) / ln(1 - 1/e).
         */
        final double upperBound = Math.log(fpp) / Math.log(1D - (1D / Math.E));
        return (int) Math.ceil((upperBound + 1D) / 2D);
    }

    /**
     * Allocates {@code numHashFunctions} Bloom filters of {@code numBits / numHashFunctions} bits each.
     *
     * @param numBits          total memory budget in bits; must be positive
     * @param numHashFunctions number of filters to allocate; must be positive
     * @return the newly allocated filters
     * @throws IllegalArgumentException if either argument is not positive, or if {@code numBits} is
     *                                  smaller than {@code numHashFunctions}
     */
    private static BitArray[] bloomFilters(long numBits, int numHashFunctions) {
        if (numBits <= 0L) {
            final String error = String.format("numBits must be positive, but got %d", numBits);
            throw new IllegalArgumentException(error);
        }
        if (numHashFunctions <= 0) {
            final String error = String.format("numHashFunctions must be positive, but got %d", numHashFunctions);
            throw new IllegalArgumentException(error);
        }
        final long bloomFilterBits = numBits / numHashFunctions;
        if (bloomFilterBits <= 0L) {
            // Integer division rounded down to zero. A zero-bit filter cannot be probed, because the
            // probe position is the hash modulo the filter size, so reject the combination up front.
            final String error = String.format(
                    "numBits must be at least numHashFunctions, but got numBits %d and numHashFunctions %d",
                    numBits, numHashFunctions);
            throw new IllegalArgumentException(error);
        }
        final BitArray[] bloomFilters = new BitArray[numHashFunctions];
        for (int index = 0; index < numHashFunctions; index++) {
            bloomFilters[index] = new BitArray(bloomFilterBits);
        }
        return bloomFilters;
    }

    /**
     * @return the total bit budget this instance was constructed with
     */
    @Override
    public long numBits() {
        return numBits;
    }

    /**
     * @return the number of hash functions this instance was constructed with
     */
    @Override
    public int numHashFunctions() {
        return numHashFunctions;
    }

    /**
     * Classifies {@code element} and, when it is reported distinct, records it.
     *
     * @param element bytes of the element to classify; must not be {@code null}
     * @return {@code true} if at least one probed bit was unset (reported DISTINCT), {@code false}
     *         if every probed bit was already set (reported DUPLICATE)
     */
    @Override
    public boolean classifyDistinct(byte[] element) {
        /*
         * Algorithm 3: BSBFSD(S)
         * Require: Threshold FPR (FPRt), Memory in bits (M), and Stream (S)
         * Ensure: Detecting duplicate and distinct elements in S
         *
         * Compute the value of k from FPRt.
         * Construct k Bloom filters each having M/k bits of memory.
         *
         * for each element e of S do
         *   Hash e into k bit positions, H = h1, ..., hk.
         *   if all bits at positions H are set then
         *     Result <- DUPLICATE
         *   else
         *     Result <- DISTINCT
         *   end if
         *   if e is DISTINCT then
         *     Randomly select a Bloom filter Bi.
         *     Randomly select a bit position h^ in Bi.
         *     Reset h^ to 0.
         *     Set all the bits in H to 1.
         *   end if
         * end for
         */
        fillHashBuffer(element, hashBuffer);
        final boolean temporaryIsDistinct = !containsHashBuffer(bloomFilters, hashBuffer);
        if (temporaryIsDistinct) {
            setHashBuffer(bloomFilters, hashBuffer);
        }
        return temporaryIsDistinct;
    }

    /**
     * Classifies {@code element} without modifying any Bloom filter. The scratch hash buffer is
     * still overwritten.
     *
     * @param element bytes of the element to classify; must not be {@code null}
     * @return {@code true} if at least one probed bit is unset, {@code false} otherwise
     */
    @Override
    public boolean peekDistinct(byte[] element) {
        fillHashBuffer(element, hashBuffer);
        return !containsHashBuffer(bloomFilters, hashBuffer);
    }

    /**
     * Replaces every Bloom filter with a newly constructed {@link BitArray} of the same
     * {@code bitSize()}.
     */
    @Override
    public void reset() {
        final int bloomFiltersLength = bloomFilters.length;
        for (int index = 0; index < bloomFiltersLength; index++) {
            bloomFilters[index] = new BitArray(bloomFilters[index].bitSize());
        }
    }

    /**
     * Fills {@code hashBuffer} with one non-negative combined hash per slot, derived from two
     * Murmur3 hashes of {@code element}.
     *
     * @param element    bytes to hash
     * @param hashBuffer destination; every slot is overwritten
     */
    private void fillHashBuffer(byte[] element, int[] hashBuffer) {
        /*
         * Adam Kirsch and Michael Mitzenmacher. 2008. Less hashing, same performance: Building a better Bloom filter.
         * Random Struct. Algorithms 33, 2 (September 2008), 187-218. DOI=http://dx.doi.org/10.1002/rsa.v33:2
         */
        final int hashBufferLength = hashBuffer.length;
        final int hash1 = Murmur3_x86_32.hashUnsafeBytes(element, Platform.BYTE_ARRAY_OFFSET, element.length, 0);
        // The second hash is seeded with the first one.
        final int hash2 = Murmur3_x86_32.hashUnsafeBytes(element, Platform.BYTE_ARRAY_OFFSET, element.length, hash1);
        for (int index = 0; index < hashBufferLength; index++) {
            // int overflow simply wraps here; only the bit pattern matters.
            int combinedHash = hash1 + ((index + 1) * hash2);
            if (combinedHash < 0) {
                // Bitwise complement maps every negative int (including Integer.MIN_VALUE) to a
                // non-negative one, so the later modulo never yields a negative position.
                combinedHash = ~combinedHash;
            }
            hashBuffer[index] = combinedHash;
        }
    }

    /**
     * Tests whether the bit selected by {@code hashBuffer[i]} is set in {@code bloomFilters[i]} for
     * every {@code i}.
     *
     * @param bloomFilters filters to probe, one per hash
     * @param hashBuffer   non-negative combined hashes, one per filter
     * @return {@code true} only if every probed bit is set
     */
    private boolean containsHashBuffer(BitArray[] bloomFilters, int[] hashBuffer) {
        final int hashBufferLength = hashBuffer.length;
        for (int index = 0; index < hashBufferLength; index++) {
            final int combinedHash = hashBuffer[index];
            final BitArray bloomFilter = bloomFilters[index];
            if (!bloomFilter.get(combinedHash % bloomFilter.bitSize())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Performs the "single deletion" followed by the insertion: clears one uniformly chosen bit in
     * one uniformly chosen filter, then sets the bit selected by {@code hashBuffer[i]} in
     * {@code bloomFilters[i]} for every {@code i}.
     *
     * @param bloomFilters filters to update, one per hash
     * @param hashBuffer   non-negative combined hashes, one per filter
     */
    private void setHashBuffer(BitArray[] bloomFilters, int[] hashBuffer) {
        final int hashBufferLength = hashBuffer.length;
        final BitArray randomBloomFilter = bloomFilters[ThreadLocalRandom.current().nextInt(hashBufferLength)];
        // The deletion happens before the insertion, so a bit cleared here may be set again below.
        randomBloomFilter.clear(ThreadLocalRandom.current().nextLong(randomBloomFilter.bitSize()));
        for (int index = 0; index < hashBufferLength; index++) {
            final int combinedHash = hashBuffer[index];
            final BitArray bloomFilter = bloomFilters[index];
            bloomFilter.set(combinedHash % bloomFilter.bitSize());
        }
    }

    /**
     * Two instances are equal when they are of the same class and agree on {@code numBits},
     * {@code numHashFunctions} and, element by element, on their Bloom filters.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null || getClass() != other.getClass()) {
            return false;
        }
        final BSBFSDDeDuplicator that = (BSBFSDDeDuplicator) other;
        return numBits == that.numBits
                && numHashFunctions == that.numHashFunctions
                && Arrays.equals(bloomFilters, that.bloomFilters);
    }

    /**
     * Hash code over the same three fields that {@link #equals(Object)} compares.
     */
    @Override
    public int hashCode() {
        int result;
        result = (int) (numBits ^ (numBits >>> 32));
        result = 31 * result + numHashFunctions;
        result = 31 * result + Arrays.hashCode(bloomFilters);
        return result;
    }

    // http://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html
    // Serialization hook: the whole state is written by the shared serializer.
    private void writeObject(ObjectOutputStream out) throws IOException {
        SERIALIZER.writeTo(this, out);
    }

    // http://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html
    // Deserialization hook: reads a temporary instance through the shared serializer, copies its
    // state into this object, and rebuilds the transient scratch buffer.
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        final BSBFSDDeDuplicator tempDeDuplicator = SERIALIZER.readFrom(in);
        this.numBits = tempDeDuplicator.numBits;
        this.numHashFunctions = tempDeDuplicator.numHashFunctions;
        this.bloomFilters = tempDeDuplicator.bloomFilters;
        this.hashBuffer = new int[this.bloomFilters.length];
    }
}