// Source listing of orestes.bloomfilter.FilterBuilder from the bloom-filter artifact (available via Maven / Gradle / Ivy).
package orestes.bloomfilter;
import orestes.bloomfilter.HashProvider.HashFunction;
import orestes.bloomfilter.HashProvider.HashMethod;
import orestes.bloomfilter.memory.BloomFilterMemory;
import orestes.bloomfilter.memory.CountingBloomFilter16;
import orestes.bloomfilter.memory.CountingBloomFilter32;
import orestes.bloomfilter.memory.CountingBloomFilter64;
import orestes.bloomfilter.memory.CountingBloomFilter8;
import orestes.bloomfilter.memory.CountingBloomFilterMemory;
import orestes.bloomfilter.redis.BloomFilterRedis;
import orestes.bloomfilter.redis.CountingBloomFilterRedis;
import orestes.bloomfilter.redis.helper.RedisPool;
import redis.clients.jedis.Protocol;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.AbstractMap.SimpleEntry;
import java.util.HashSet;
import java.util.Map.Entry;
import java.util.Set;
/**
* Builder for Bloom Filters.
*/
public class FilterBuilder implements Cloneable, Serializable {
private boolean redisBacked = false;
private boolean overwriteIfExists = false;
private Integer expectedElements;
private Integer size;
private Integer hashes;
private Integer countingBits = 16;
private Double falsePositiveProbability;
private String name = "";
private String redisHost = "localhost";
private Integer redisPort = 6379;
private Integer redisConnections = 10;
private boolean redisSsl = false;
private HashMethod hashMethod = HashMethod.Murmur3KirschMitzenmacher;
private HashFunction hashFunction = HashMethod.Murmur3KirschMitzenmacher.getHashFunction();
private Set> slaves = new HashSet<>();
private static transient Charset defaultCharset = Charset.forName("UTF-8");
private boolean done = false;
private String password = null;
private RedisPool pool;
private int database = Protocol.DEFAULT_DATABASE;
/**
* Constructs a new builder for Bloom filters and counting Bloom filters.
*/
public FilterBuilder() {
}
/**
* Constructs a new Bloom Filter Builder by specifying the expected size of the filter and the tolerable false
* positive probability. The size of the BLoom filter in in bits and the optimal number of hash functions will be
* inferred from this.
*
* @param expectedElements expected elements in the filter
* @param falsePositiveProbability tolerable false positive probability
*/
public FilterBuilder(int expectedElements, double falsePositiveProbability) {
this.expectedElements(expectedElements).falsePositiveProbability(falsePositiveProbability);
}
/**
* Constructs a new Bloom Filter Builder using the specified size in bits and the specified number of hash
* functions.
*
* @param size bit size of the Bloom filter
* @param hashes number of hash functions to use
*/
public FilterBuilder(int size, int hashes) {
this.size(size).hashes(hashes);
}
/**
* Sets the number of expected elements. In combination with the tolerable false positive probability, this is used
* to infer the optimal size and optimal number of hash functions of the filter.
*
* @param expectedElements number of expected elements.
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder expectedElements(int expectedElements) {
this.expectedElements = expectedElements;
return this;
}
/**
* Sets the size of the filter in bits.
*
* @param size size of the filter in bits
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder size(int size) {
this.size = size;
return this;
}
/**
* Sets the tolerable false positive probability. In combination with the number of expected elements, this is used
* to infer the optimal size and optimal number of hash functions of the filter.
*
* @param falsePositiveProbability the tolerable false
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder falsePositiveProbability(double falsePositiveProbability) {
this.falsePositiveProbability = falsePositiveProbability;
return this;
}
/**
* Set the number of hash functions to be used.
*
* @param numberOfHashes number of hash functions used by the filter.
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder hashes(int numberOfHashes) {
this.hashes = numberOfHashes;
return this;
}
/**
* Sets the number of bits used for counting in case of a counting Bloom filter. For non-counting Bloom filters this
* setting has no effect. Default: 16
*
* @param countingBits Number of counting bits used by the counting Bloom filter
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder countingBits(int countingBits) {
this.countingBits = countingBits;
return this;
}
/**
* Sets the name of the Bloom filter. If a redis-backed Bloom filter with the provided name exists and it is
* compatible to this FilterBuilder configuration, it will be loaded and used. This behaviour can be changed by
* {@link #overwriteIfExists(boolean)}. Default: ""
*
* @param name The name of the filter
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder name(String name) {
this.name = name;
return this;
}
/**
* Sets a password authentication for Redis.
*
* @param password The Redis PW
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder password(String password) {
this.password = password;
return this;
}
/**
* Sets an existing RedisPool for reuse
*
* @param pool The RedisPool
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder pool(RedisPool pool) {
this.redisBacked(true);
this.pool = pool;
return this;
}
/**
* Instructs the FilterBuilder to build a Redis-Backed Bloom filters. Default: false
*
* @param redisBacked a boolean indicating whether redis should be used
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder redisBacked(boolean redisBacked) {
this.redisBacked = redisBacked;
return this;
}
/**
* Sets the host of the backing Redis instance. Default: localhost
*
* @param host the Redis host
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder redisHost(String host) {
this.redisBacked = true;
this.redisHost = host;
return this;
}
/**
* Sets the port of the backing Redis instance. Default: 6379
*
* @param port the Redis port
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder redisPort(int port) {
this.redisBacked = true;
this.redisPort = port;
return this;
}
/**
* Sets the number of connections to use for Redis. Default: 10
*
* @param numConnections the number of connections to use for Redis
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder redisConnections(int numConnections) {
this.redisBacked = true;
this.redisConnections = numConnections;
return this;
}
/**
* Enables or disables SSL connection to Redis. Default: false
*
* @param ssl enables or disables SSL connection to Redis
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder redisSsl(boolean ssl) {
this.redisBacked = true;
this.redisSsl = ssl;
return this;
}
/**
* Sets whether any existing Bloom filter with same name should be overwritten in Redis. Default:
* false
*
* @param overwrite boolean indicating whether to overwrite any existing filter with the same name
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder overwriteIfExists(boolean overwrite) {
this.overwriteIfExists = overwrite;
return this;
}
/**
* Adds a read slave to speed up reading access (e.g. contains or getEstimatedCount) to normal and counting
* Redis-backed Bloom filters. The read slave has to be a slave of the main Redis instance (this can be done in the
* redis-cli using the SLAVEOF command). This setting might cause stale reads since Redis replication is
* asynchronous. However anecdotally, in our experiments, we were unable to read any stale data - the replication
* lag between both Redis instances was small than one round-trip time to Redis.
*
* @param host host of the Redis read slave
* @param port port of the Redis read slave
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder addReadSlave(String host, int port) {
slaves.add(new SimpleEntry<>(host, port));
return this;
}
/**
* Sets the method used to generate hash values. Possible hash methods are documented in the corresponding enum
* {@link HashProvider.HashMethod}. Default: MD5
*
* For the generation of hash values the String representation of objects is used.
*
* @param hashMethod the method used to generate hash values
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder hashFunction(HashMethod hashMethod) {
this.hashMethod = hashMethod;
this.hashFunction = hashMethod.getHashFunction();
return this;
}
/**
* Uses a given custom hash function.
*
* @param hf the custom hash function
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder hashFunction(HashFunction hf) {
this.hashFunction = hf;
return this;
}
/**
* Use a given database number.
*
* @param database number
* @return the modified FilterBuilder (fluent interface)
*/
public FilterBuilder database(int database) {
this.database = database;
return this;
}
public int database() {
return database;
}
/**
* Constructs a Bloom filter using the specified parameters and computing missing parameters if possible (e.g. the
* optimal Bloom filter bit size).
*
* @param the type of element contained in the Bloom filter.
* @return the constructed Bloom filter
*/
public BloomFilter buildBloomFilter() {
complete();
if (redisBacked) {
return new BloomFilterRedis<>(this);
} else {
return new BloomFilterMemory<>(this);
}
}
/**
* Constructs a Counting Bloom filter using the specified parameters and by computing missing parameters if possible
* (e.g. the optimal Bloom filter bit size).
*
* @param the type of element contained in the Counting Bloom filter.
* @return the constructed Counting Bloom filter
*/
public CountingBloomFilter buildCountingBloomFilter() {
complete();
if (redisBacked) {
return new CountingBloomFilterRedis<>(this);
} else {
if (countingBits == 32) {
return new CountingBloomFilter32<>(this);
} else if (countingBits == 16) {
return new CountingBloomFilter16<>(this);
} else if (countingBits == 8) {
return new CountingBloomFilter8<>(this);
} else if (countingBits == 64) {
return new CountingBloomFilter64<>(this);
} else {
return new CountingBloomFilterMemory<>(this);
}
}
}
/**
* Checks if all necessary parameters were set and tries to infer optimal parameters (e.g. size and hashes from
* given expectedElements and falsePositiveProbability). This is done automatically.
*
* @return the completed FilterBuilder
*/
public FilterBuilder complete() {
if (done) { return this; }
if (size == null && expectedElements != null && falsePositiveProbability != null) {
size = optimalM(expectedElements, falsePositiveProbability);
}
if (hashes == null && expectedElements != null && size != null) { hashes = optimalK(expectedElements, size); }
if (size == null || hashes == null) {
throw new NullPointerException("Neither (expectedElements, falsePositiveProbability) nor (size, hashes) were specified.");
}
if (expectedElements == null) { expectedElements = optimalN(hashes, size); }
if (falsePositiveProbability == null) { falsePositiveProbability = optimalP(hashes, size, expectedElements); }
done = true;
return this;
}
@Override
public FilterBuilder clone() {
Object clone;
try {
clone = super.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException("Cloning failed.");
}
return (FilterBuilder) clone;
}
/**
* @return {@code true} if the Bloom Filter will be Redis-backed
*/
public boolean redisBacked() {
return redisBacked;
}
/**
* @return the number of expected elements for the Bloom filter
*/
public int expectedElements() {
return expectedElements;
}
/**
* @return the size of the Bloom filter in bits
*/
public int size() {
return size;
}
/**
* @return the number of hashes used by the Bloom filter
*/
public int hashes() {
return hashes;
}
/**
* @return The number of bits used for counting in case of a counting Bloom filter
*/
public int countingBits() {
return countingBits;
}
/**
* @return the tolerable false positive probability of the Bloom filter
*/
public double falsePositiveProbability() {
return falsePositiveProbability;
}
/**
* @return the name of the Bloom filter
*/
public String name() {
return name;
}
/**
* @return the host name of the Redis server backing the Bloom filter
*/
public String redisHost() {
return redisHost;
}
/**
* @return the port used by the Redis server backing the Bloom filter
*/
public int redisPort() {
return redisPort;
}
/**
* @return the number of connections used by the Redis Server backing the Bloom filter
*/
public int redisConnections() {
return redisConnections;
}
/**
* @return if SSL is enabled for Redis connection
*/
public boolean redisSsl() {
return redisSsl;
}
/**
* @return The hash method to be used by the Bloom filter
*/
public HashMethod hashMethod() {
return hashMethod;
}
/**
* @return the actual hash function to be used by the Bloom filter
*/
public HashFunction hashFunction() {
return hashFunction;
}
/**
* @return Return the default Charset used for conversion of String values into byte arrays used for hashing
*/
public static Charset defaultCharset() {
return defaultCharset;
}
/**
* @return {@code true} if the Bloom filter that is to be built should overwrite any existing Bloom filter with the
* same name
*/
public boolean overwriteIfExists() {
return overwriteIfExists;
}
/**
* @return return the list of all read slaves to be used by the Redis-backed Bloom filter
*/
public Set> getReadSlaves() {
return slaves;
}
/**
* Checks whether a configuration is compatible to another configuration based on the size of the Bloom filter and
* its hash functions.
*
* @param other the other configuration
* @return {@code true} if the configurations are compatible
*/
public boolean isCompatibleTo(FilterBuilder other) {
return this.size() == other.size() && this.hashes() == other.hashes() && this.hashMethod() == other.hashMethod();
}
/**
* Calculates the optimal size size of the bloom filter in bits given expectedElements (expected
* number of elements in bloom filter) and falsePositiveProbability (tolerable false positive rate).
*
* @param n Expected number of elements inserted in the bloom filter
* @param p Tolerable false positive rate
* @return the optimal size size of the bloom filter in bits
*/
public static int optimalM(long n, double p) {
return (int) Math.ceil(-1 * (n * Math.log(p)) / Math.pow(Math.log(2), 2));
}
/**
* Calculates the optimal hashes (number of hash function) given expectedElements (expected number of
* elements in bloom filter) and size (size of bloom filter in bits).
*
* @param n Expected number of elements inserted in the bloom filter
* @param m The size of the bloom filter in bits.
* @return the optimal amount of hash functions hashes
*/
public static int optimalK(long n, long m) {
return (int) Math.ceil((Math.log(2) * m) / n);
}
/**
* Calculates the amount of elements a Bloom filter for which the given configuration of size and hashes is
* optimal.
*
* @param k number of hashes
* @param m The size of the bloom filter in bits.
* @return amount of elements a Bloom filter for which the given configuration of size and hashes is optimal.
*/
public static int optimalN(long k, long m) {
return (int) Math.ceil((Math.log(2) * m) / k);
}
/**
* Calculates the best-case (uniform hash function) false positive probability.
*
* @param k number of hashes
* @param m The size of the bloom filter in bits.
* @param insertedElements number of elements inserted in the filter
* @return The calculated false positive probability
*/
public static double optimalP(long k, long m, double insertedElements) {
return Math.pow((1 - Math.exp(-k * insertedElements / (double) m)), k);
}
public String password() {
return password;
}
public RedisPool pool() {
if(done && pool == null) {
pool = RedisPool.builder()
.host(redisHost())
.port(redisPort())
.readSlaves(getReadSlaves())
.password(password())
.database(database())
.redisConnections(redisConnections())
.build();
}
return pool;
}
}