// org.apache.flink.runtime.state.gemini.engine.page.bloomfilter.PageBloomFilter Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.state.gemini.engine.page.bloomfilter;
import org.apache.flink.runtime.state.gemini.engine.page.DataPage;
import org.apache.flink.runtime.state.gemini.engine.page.bmap.GBinaryHashMap;
import java.util.Arrays;
import static org.apache.flink.util.Preconditions.checkArgument;
/**
 * Bloom filter over 32-bit hash codes, backed by a {@code long[]} on the Java heap.
 *
 * <p>Copied from table.runtime.util. Part of this class refers to the implementation from the
 * Apache Hive project:
 * https://github.com/apache/hive/blob/master/common/src/java/org/apache/hive/common/util/BloomFilter.java.
 *
 * <p>A filter is sized once at construction time from the expected number of entries and the
 * target false positive probability; it never grows afterwards.
 */
public class PageBloomFilter {

    /**
     * Default false positive probability for BloomFilter.
     *
     * <p>The literal is a {@code float} widened to {@code double}, so the effective value is
     * slightly below 0.03. It is kept as-is so that filter sizing stays unchanged.
     */
    public static final double DEFAULT_FPP = 0.03f;

    /** Pages holding fewer keys than this get no filter from {@link #createByPage(DataPage)}. */
    private static final int MIN_KEY_COUNT_FOR_FILTER = 16;

    /** Number of addressable bits; always a positive multiple of {@link Long#SIZE}. */
    private final int numBits;

    /** Number of bit positions probed per hash; at least 1. */
    private final int numHashFunctions;

    private final BitSet bitSet;

    /**
     * Builds a filter containing the hash code of every key stored in the given page.
     *
     * @param dataPage page whose {@link GBinaryHashMap} supplies the key hash codes.
     * @return the populated filter, or {@code null} when the page exposes no
     *     {@link GBinaryHashMap} or holds fewer than 16 keys.
     */
    public static PageBloomFilter createByPage(DataPage dataPage) {
        GBinaryHashMap gBinaryHashMap = dataPage.getGBinaryHashMap();
        if (gBinaryHashMap == null) {
            return null;
        }
        int indexLen = gBinaryHashMap.indexCount();
        int keyCount = gBinaryHashMap.keyCount();
        if (keyCount < MIN_KEY_COUNT_FOR_FILTER) {
            return null;
        }
        PageBloomFilter pageBloomFilter = new PageBloomFilter(keyCount);
        // NOTE(review): getHashCode(indexLen, i) is presumably the hash of the i-th key; confirm
        // against GBinaryHashMap.
        for (int i = 0; i < keyCount; i++) {
            pageBloomFilter.addHash(gBinaryHashMap.getHashCode(indexLen, i));
        }
        return pageBloomFilter;
    }

    /**
     * Creates a filter sized for {@code maxNumEntries} entries at {@link #DEFAULT_FPP}.
     *
     * @param maxNumEntries max number entries in this bloomFilter; must be positive.
     * @throws IllegalArgumentException if {@code maxNumEntries} is not positive or the resulting
     *     filter would need more bits than an {@code int} can index.
     */
    public PageBloomFilter(long maxNumEntries) {
        this(maxNumEntries, DEFAULT_FPP);
    }

    /**
     * Constructor. MaxNumEntries and fpp together determine the size of bloomFilter.
     *
     * @param maxNumEntries max number entries in this bloomFilter; must be positive.
     * @param fpp false positive probability; must be strictly between 0 and 1.
     * @throws IllegalArgumentException if an argument is out of range or the resulting filter
     *     would need more bits than an {@code int} can index.
     */
    public PageBloomFilter(long maxNumEntries, double fpp) {
        checkArgument(maxNumEntries > 0, "expectedEntries should be > 0");
        // Expressed as a positive range test so that NaN is rejected too. Values outside (0, 1)
        // previously produced a saturated or negative bit count and failed during allocation.
        checkArgument(fpp > 0.0 && fpp < 1.0, "fpp should be > 0.0 and < 1.0");

        int nb = optimalNumOfBits(maxNumEntries, fpp);
        // Pad to a multiple of 64 so the bits fill whole longs. A value that is already aligned
        // still gains one extra word (original sizing rule, kept for identical layout).
        // The sum is done in long arithmetic because it can exceed Integer.MAX_VALUE.
        long paddedBits = (long) nb + (Long.SIZE - (nb % Long.SIZE));
        checkArgument(
            paddedBits <= Integer.MAX_VALUE,
            "bloom filter for " + maxNumEntries + " entries with fpp " + fpp
                + " needs more bits than an int can index");

        this.numBits = (int) paddedBits;
        this.numHashFunctions = optimalNumOfHashFunctions(maxNumEntries, numBits);
        this.bitSet = new BitSet(this.numBits);
    }

    /**
     * Returns a size estimate in bytes: two ints, a constant 16, and the backing long array.
     *
     * <p>NOTE(review): the constant 16 presumably accounts for object/array header overhead;
     * confirm with the callers that consume this value.
     */
    public int byteSize() {
        return 2 * Integer.BYTES + 16 + bitSet.getData().length * Long.BYTES;
    }

    /**
     * Records a 32-bit hash in the filter by setting {@code numHashFunctions} bit positions.
     *
     * @param hash32 hash code of the entry to add.
     */
    public void addHash(int hash32) {
        int hash2 = hash32 >>> 16;
        for (int i = 1; i <= numHashFunctions; i++) {
            bitSet.set(bitPosition(hash32, hash2, i));
        }
    }

    /**
     * Tests whether a hash may have been added before.
     *
     * @param hash32 hash code to look up.
     * @return {@code false} if at least one probed bit is unset, i.e. the hash was never passed
     *     to {@link #addHash(int)}; {@code true} if all probed bits are set (which may be a false
     *     positive).
     */
    public boolean mightContain(int hash32) {
        int hash2 = hash32 >>> 16;
        for (int i = 1; i <= numHashFunctions; i++) {
            if (!bitSet.get(bitPosition(hash32, hash2, i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes the bit position probed in the given round. Shared by {@link #addHash(int)} and
     * {@link #mightContain(int)} so both always agree on the probed positions.
     *
     * @param hash1 the full 32-bit hash.
     * @param hash2 the upper 16 bits of the hash, shifted down.
     * @param round 1-based probe round.
     * @return a position in {@code [0, numBits)}.
     */
    private int bitPosition(int hash1, int hash2, int round) {
        int combinedHash = hash1 + ((round + 1) * hash2);
        // hashcode should be positive, flip all the bits if it's negative
        if (combinedHash < 0) {
            combinedHash = ~combinedHash;
        }
        return combinedHash % numBits;
    }

    /**
     * Returns the backing long array of the bit set. This is the live array, not a copy, so
     * changes made through it are visible to this filter.
     */
    public long[] getBitSet() {
        return bitSet.getData();
    }

    @Override
    public String toString() {
        return "numBits: " + numBits + " numHashFunctions: " + numHashFunctions;
    }

    /**
     * Computes round(m / n * ln 2), with a lower bound of 1.
     *
     * @param n expected number of entries.
     * @param m number of bits in the filter.
     */
    private static int optimalNumOfHashFunctions(long n, long m) {
        return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
    }

    /**
     * Computes -n * ln(fpp) / (ln 2)^2, truncated to an int. The double-to-int cast saturates
     * at {@link Integer#MAX_VALUE}; the constructor detects that case after rounding.
     */
    private static int optimalNumOfBits(long maxNumEntries, double fpp) {
        return (int) (-maxNumEntries * Math.log(fpp) / (Math.log(2) * Math.log(2)));
    }

    /**
     * Bare metal bit set implementation. For performance reasons, this implementation does not
     * check for index bounds nor expand the bit set size if the specified index is greater than
     * the size.
     */
    public static class BitSet {

        private final long[] data;

        /**
         * Allocates a zeroed bit set with room for at least {@code bits} bits.
         *
         * @param bits number of bits required.
         */
        BitSet(long bits) {
            this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
        }

        /**
         * Deserialize long array as bit set. The array is used directly, not copied.
         *
         * @param data - bit array
         */
        BitSet(long[] data) {
            assert data.length > 0 : "data length is zero!";
            this.data = data;
        }

        /**
         * Sets the bit at specified index.
         *
         * @param index - position
         */
        public void set(int index) {
            // index >>> 6 selects the word; the long shift uses only the low 6 bits of index.
            data[index >>> 6] |= (1L << index);
        }

        /**
         * Returns true if the bit is set in the specified index.
         *
         * @param index - position
         * @return - value at the bit position
         */
        public boolean get(int index) {
            return (data[index >>> 6] & (1L << index)) != 0;
        }

        /** Returns the backing array itself (not a copy). */
        public long[] getData() {
            return data;
        }

        /**
         * Combines the two BitArrays using bitwise OR. Both must have the same length; this is
         * only verified when assertions are enabled.
         */
        public void putAll(BitSet array) {
            assert data.length == array.data.length : "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
            for (int i = 0; i < data.length; i++) {
                data[i] |= array.data[i];
            }
        }

        /**
         * Clear the bit set.
         */
        public void clear() {
            Arrays.fill(data, 0);
        }
    }
}