All downloads are free. The search and download functionality uses the official Maven repository.

com.github.jparkie.deduplicator.impl.BSBFDeDuplicator Maven / Gradle / Ivy

Go to download

Advanced Bloom Filter Based Algorithms for Efficient Approximate Data De-Duplication in Streams

The newest version!
package com.github.jparkie.deduplicator.impl;

import com.github.jparkie.deduplicator.BitArray;
import com.github.jparkie.deduplicator.Murmur3_x86_32;
import com.github.jparkie.deduplicator.Platform;
import com.github.jparkie.deduplicator.ProbabilisticDeDuplicator;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;

/**
 * De-Duplication by a Biased Sampling based Bloom Filter (BSBF).
 * 

* Described by Suman K. Bera, Sourav Dutta, Ankur Narang, Souvik Bhattacherjee in * Advanced Bloom Filter Based Algorithms for Efficient Approximate Data De-Duplication in Streams: *

* https://arxiv.org/abs/1212.3964 */ public class BSBFDeDuplicator implements ProbabilisticDeDuplicator, Serializable { private static final BSBFDeDuplicatorSerializer serializer = new BSBFDeDuplicatorSerializer(); long numBits; int numHashFunctions; BitArray[] bloomFilters; private transient int[] hashBuffer; public BSBFDeDuplicator(long numBits, int numHashFunctions) { this(numBits, numHashFunctions, bloomFilters(numBits, numHashFunctions)); } BSBFDeDuplicator(long numBits, int numHashFunctions, BitArray[] bloomFilters) { this.numBits = numBits; this.numHashFunctions = numHashFunctions; this.bloomFilters = bloomFilters; this.hashBuffer = new int[this.bloomFilters.length]; } private BSBFDeDuplicator() { } public static BSBFDeDuplicator create(long numBits, double fpp) { return new BSBFDeDuplicator(numBits, optimalNumOfHashFunctions(fpp)); } private static int optimalNumOfHashFunctions(double fpp) { if (fpp <= 0D || fpp >= 1D) { final String error = String.format("fpp must be in the range (0, 1), but got %f", fpp); throw new IllegalArgumentException(error); } /* * From Advanced Bloom Filter Based Algorithms for Efficient Approximate Data De-Duplication in Streams: * As a trade-off we set k as the arithmetic mean of 1 and ln(fpp) / ln(1 - 1/e). 
*/ return (int) Math.ceil(((Math.log(fpp) / Math.log(1D - (1D / Math.E))) + 1D) / 2D); } private static BitArray[] bloomFilters(long numBits, int numHashFunctions) { if (numBits <= 0L) { final String error = String.format("numBits must be positive, but got %d", numBits); throw new IllegalArgumentException(error); } if (numHashFunctions <= 0) { final String error = String.format("numHashFunctions must be positive, but got %d", numHashFunctions); throw new IllegalArgumentException(error); } final long bloomFilterBits = numBits / numHashFunctions; final BitArray[] bloomFilters = new BitArray[numHashFunctions]; for (int index = 0; index < numHashFunctions; index++) { bloomFilters[index] = new BitArray(bloomFilterBits); } return bloomFilters; } @Override public long numBits() { return numBits; } @Override public int numHashFunctions() { return numHashFunctions; } @Override public boolean classifyDistinct(byte[] element) { /* * Algorithm 2: BSBF (S) * Require: Threshold FPR (FPRt), Memory in bits (M), and Stream (S) * Ensure: Detecting duplicate and distinct elements in S * * Compute the value of k from FPRt. * Construct k Bloom filters each having M/k bits of memory. * * for each element e of S do * Hash e into k bit positions, H = h1,··· ,hk. * if all bit at positions H are set then * Result ← DISTINCT * else * Result ← DUPLICATE * end if * if e is DISTINCT then * Randomly select k bit positions hatH = hˆ1, hˆ2, ..., hˆk one each from the k Bloom filters. * Reset all bits in Hˆ to 0. * Set all the bits in H to 1. 
* end if * end for */ fillHashBuffer(element, hashBuffer); final boolean temporaryIsDistinct = !containsHashBuffer(bloomFilters, hashBuffer); if (temporaryIsDistinct) { setHashBuffer(bloomFilters, hashBuffer); } return temporaryIsDistinct; } @Override public boolean peekDistinct(byte[] element) { fillHashBuffer(element, hashBuffer); return !containsHashBuffer(bloomFilters, hashBuffer); } @Override public void reset() { final int bloomFiltersLength = bloomFilters.length; for (int index = 0; index < bloomFiltersLength; index++) { bloomFilters[index] = new BitArray(bloomFilters[index].bitSize()); } } private void fillHashBuffer(byte[] element, int[] hashBuffer) { /* * Adam Kirsch and Michael Mitzenmacher. 2008. Less hashing, same performance: Building a better Bloom filter. * Random Struct. Algorithms 33, 2 (September 2008), 187-218. DOI=http://dx.doi.org/10.1002/rsa.v33:2 */ final int hashBufferLength = hashBuffer.length; final int hash1 = Murmur3_x86_32.hashUnsafeBytes(element, Platform.BYTE_ARRAY_OFFSET, element.length, 0); final int hash2 = Murmur3_x86_32.hashUnsafeBytes(element, Platform.BYTE_ARRAY_OFFSET, element.length, hash1); for (int index = 0; index < hashBufferLength; index++) { int combinedHash = hash1 + ((index + 1) * hash2); if (combinedHash < 0) { combinedHash = ~combinedHash; } hashBuffer[index] = combinedHash; } } private boolean containsHashBuffer(BitArray[] bloomFilters, int[] hashBuffer) { final int hashBufferLength = hashBuffer.length; for (int index = 0; index < hashBufferLength; index++) { final int combinedHash = hashBuffer[index]; final BitArray bloomFilter = bloomFilters[index]; if (!bloomFilter.get(combinedHash % bloomFilter.bitSize())) { return false; } } return true; } private void setHashBuffer(BitArray[] bloomFilters, int[] hashBuffer) { final int hashBufferLength = hashBuffer.length; for (int index = 0; index < hashBufferLength; index++) { final int combinedHash = hashBuffer[index]; final BitArray bloomFilter = bloomFilters[index]; 
bloomFilter.clear(ThreadLocalRandom.current().nextLong(bloomFilter.bitSize())); bloomFilter.set(combinedHash % bloomFilter.bitSize()); } } @Override public boolean equals(Object other) { if (this == other) { return true; } if (other == null || getClass() != other.getClass()) { return false; } final BSBFDeDuplicator that = (BSBFDeDuplicator) other; if (numBits != that.numBits) { return false; } if (numHashFunctions != that.numHashFunctions) { return false; } if (!Arrays.equals(bloomFilters, that.bloomFilters)) { return false; } return true; } @Override public int hashCode() { int result; result = (int) (numBits ^ (numBits >>> 32)); result = 31 * result + numHashFunctions; result = 31 * result + Arrays.hashCode(bloomFilters); return result; } // http://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html private void writeObject(ObjectOutputStream out) throws IOException { serializer.writeTo(this, out); } // http://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { final BSBFDeDuplicator tempDeDuplicator = serializer.readFrom(in); this.numBits = tempDeDuplicator.numBits; this.numHashFunctions = tempDeDuplicator.numHashFunctions; this.bloomFilters = tempDeDuplicator.bloomFilters; this.hashBuffer = new int[this.bloomFilters.length]; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy