All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.jparkie.deduplicator.BitArray Maven / Gradle / Ivy

Go to download

Advanced Bloom Filter Based Algorithms for Efficient Approximate Data De-Duplication in Streams

The newest version!
package com.github.jparkie.deduplicator;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;

/**
 * Adapted From: https://github.com/apache/spark/blob/branch-2.0/common/sketch/src/main/java/org/apache/spark/util/sketch/BitArray.java
 */
public final class BitArray {
    private final long[] data;
    private long bitCount;

    public BitArray(long numBits) {
        this(new long[numWords(numBits)]);
    }

    private BitArray(long[] data) {
        this.data = data;
        long bitCount = 0;
        for (long word : data) {
            bitCount += Long.bitCount(word);
        }
        this.bitCount = bitCount;
    }

    private static int numWords(long numBits) {
        if (numBits <= 0) {
            final String error = String.format("numBits must be positive, but got %d", numBits);
            throw new IllegalArgumentException(error);
        }
        final long numWords = (long) Math.ceil(numBits / 64D);
        if (numWords > Integer.MAX_VALUE) {
            final String error = String.format("Cannot allocate enough space for %d bits", numBits);
            throw new IllegalArgumentException(error);
        }
        return (int) numWords;
    }

    public boolean get(long index) {
        return (data[(int) (index >>> 6)] & (1L << index)) != 0;
    }

    public boolean set(long index) {
        if (!get(index)) {
            data[(int) (index >>> 6)] |= (1L << index);
            bitCount++;
            return true;
        }
        return false;
    }

    public boolean clear(long index) {
        if (get(index)) {
            data[(int) (index >>> 6)] &= ~(1L << index);
            bitCount--;
            return true;
        }
        return false;
    }

    public long bitSize() {
        return (long) data.length * Long.SIZE;
    }

    public long bitCount() {
        return bitCount;
    }

    public void putAll(BitArray array) {
        if (data.length != array.data.length) {
            final String error = String.format(
                    "BitArrays must be of equal length (%d != %d)",
                    data.length,
                    array.data.length);
            throw new IllegalArgumentException(error);
        }
        long bitCount = 0;
        for (int i = 0; i < data.length; i++) {
            data[i] |= array.data[i];
            bitCount += Long.bitCount(data[i]);
        }
        this.bitCount = bitCount;
    }

    // @formatter:off
    public void writeTo(DataOutputStream out) throws IOException {
        out.writeInt(data.length);
        for (long datum : data) {
            out.writeLong(datum);
        }
    }

    public static BitArray readFrom(DataInputStream in) throws IOException {
        final int numWords = in.readInt();
        long[] data = new long[numWords];
        for (int i = 0; i < numWords; i++) {
            data[i] = in.readLong();
        }
        return new BitArray(data);
    }
    // @formatter:on

    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null || getClass() != other.getClass()) {
            return false;
        }
        final BitArray that = (BitArray) other;
        return Arrays.equals(data, that.data);
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(data);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy