All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.util.BloomFilter64bit Maven / Gradle / Ivy

Go to download

The Archive Commons Code Libraries project contains general Java utility libraries, as used by the Heritrix crawler and other projects.

There is a newer version: 3.4.0-20220727
Show newest version
/* BloomFilter
*
* $Id$
*
* Created on Jun 21, 2005
*
* Copyright (C) 2005 Internet Archive; a slight adaptation of
* LGPL work (C) Sebastiano Vigna
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

package org.archive.util;

import java.io.Serializable;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.security.SecureRandom;
import java.util.Random;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.hash.Funnels;
import com.google.common.primitives.Ints;

/**
 * A {@link BloomFilter} over character sequences that delegates storage and
 * hashing to Guava's {@code com.google.common.hash.BloomFilter}.
 *
 * <p>The delegate's bit size, hash-function count and bit array are looked up
 * by name via reflection. A Guava release that renames or removes those
 * members will therefore make construction (or {@link #getBit(long)}) fail
 * with a {@link RuntimeException}.
 */
public class BloomFilter64bit implements Serializable, BloomFilter {
    private static final long serialVersionUID = 3L;

    /** The expected number of inserts, exactly as passed to the constructor. */
    private final long expectedInserts;

    /**
     * The number of additions for which the delegate reported a change. It
     * may be smaller than the actual number of additions of distinct
     * character sequences because of false positives.
     */
    private int size;

    /** The Guava filter that holds the bits and performs the hashing. */
    private final com.google.common.hash.BloomFilter<CharSequence> delegate;

    /** Bit size of the delegate, read reflectively at construction time. */
    private final long bitSize;

    /** Hash-function count of the delegate, read reflectively at construction time. */
    private final int numHashFunctions;

    /**
     * Creates a new Bloom filter for the given expected number of elements
     * and false-positive exponent.
     *
     * @param n the expected number of elements.
     * @param d false-positive exponent; if no more than {@code n} elements
     * are added, false positives will happen with probability
     * 2<sup>-d</sup>.
     */
    public BloomFilter64bit(final long n, final int d) {
        this(n, d, new SecureRandom(), false);
    }

    /**
     * Creates a new Bloom filter for the given expected number of elements
     * and false-positive exponent.
     *
     * @param n the expected number of elements.
     * @param d false-positive exponent; the target false-positive
     * probability is 2<sup>-d</sup>.
     * @param roundUp currently ignored by this implementation.
     */
    public BloomFilter64bit(final long n, final int d, boolean roundUp) {
        this(n, d, new SecureRandom(), roundUp);
    }

    /**
     * Creates a new Bloom filter for the given expected number of elements
     * and false-positive exponent.
     *
     * @param n the expected number of elements. Values above
     * {@link Integer#MAX_VALUE} are clamped to it when sizing the delegate;
     * {@link #getExpectedInserts()} still reports the unclamped value.
     * @param d false-positive exponent; the delegate is created with a
     * target false-positive probability of 2<sup>-d</sup>. The number of
     * hash functions actually in use is whatever the delegate reports (see
     * {@link #getHashCount()}).
     * @param weightsGenerator currently ignored by this implementation.
     * @param roundUp currently ignored by this implementation.
     * @throws RuntimeException if the delegate's {@code bitSize()} method or
     * {@code numHashFunctions} field cannot be accessed reflectively.
     */
    public BloomFilter64bit(final long n, final int d, Random weightsGenerator, boolean roundUp) {
        this.delegate = com.google.common.hash.BloomFilter.create(
                Funnels.unencodedCharsFunnel(), Ints.saturatedCast(n), Math.pow(2, -d));
        this.expectedInserts = n;
        try {
            // Neither value is reachable through a normal call from this
            // package, so both are fetched by name with access checks disabled.
            Method bitSizeMethod = delegate.getClass().getDeclaredMethod("bitSize");
            bitSizeMethod.setAccessible(true);
            this.bitSize = (long) bitSizeMethod.invoke(delegate);

            Field numHashFunctionsField = delegate.getClass().getDeclaredField("numHashFunctions");
            numHashFunctionsField.setAccessible(true);
            this.numHashFunctions = numHashFunctionsField.getInt(delegate);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * The number of character sequences in the filter.
     *
     * @return the number of character sequences in the filter (but
     * see {@link #contains(CharSequence)}).
     */
    public int size() {
        return size;
    }

    /**
     * Checks whether the given character sequence is in this filter.
     *
     * <p>Note that this method may return true on a character sequence that
     * has not been added to the filter. This will happen with probability
     * 2<sup>-d</sup>, where <var>d</var> is the exponent specified at
     * creation time, if the number of elements in the filter is less than
     * <var>n</var>, the number of expected elements specified at creation
     * time.
     *
     * @param s a character sequence.
     * @return true if the sequence is in the filter (or if a sequence with
     * the same hash sequence is in the filter).
     */
    public boolean contains(final CharSequence s) {
        return delegate.mightContain(s);
    }

    /**
     * Adds a character sequence to the filter.
     *
     * @param s a character sequence.
     * @return true if the character sequence was not in the filter (but see
     * {@link #contains(CharSequence)}).
     */
    public boolean add(final CharSequence s) {
        boolean added = delegate.put(s);
        if (added) {
            // Only additions the delegate reports as new are counted.
            size++;
        }
        return added;
    }

    /* (non-Javadoc)
     * @see org.archive.util.BloomFilter#getSizeBytes()
     */
    public long getSizeBytes() {
        return bitSize / 8;
    }

    @Override
    public long getExpectedInserts() {
        return expectedInserts;
    }

    @Override
    public long getHashCount() {
        return numHashFunctions;
    }

    /**
     * Reads a single bit of the delegate's underlying bit array.
     *
     * <p>The delegate's {@code bits} field and that object's
     * {@code get(long)} method are both looked up by name via reflection on
     * every call.
     *
     * @param bitIndex index of the bit to read.
     * @return the value of the bit at {@code bitIndex}.
     * @throws RuntimeException if the reflective lookup or invocation fails.
     */
    @VisibleForTesting
    public boolean getBit(long bitIndex) {
        try {
            Field bitsField = delegate.getClass().getDeclaredField("bits");
            bitsField.setAccessible(true);
            Object bitarray = bitsField.get(delegate);

            Method getBitMethod = bitarray.getClass().getDeclaredMethod("get", long.class);
            getBitMethod.setAccessible(true);
            return (boolean) getBitMethod.invoke(bitarray, bitIndex);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy