org.apache.lucene.codecs.bloom.FuzzySet Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-codecs Show documentation
Codecs and postings formats for Apache Lucene.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.bloom;

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * A class used to represent a set of many, potentially large, values (e.g. many long strings such
 * as URLs), using a significantly smaller amount of memory.
 *
 * <p>The set is "lossy" in that it cannot definitively state that it does contain a value but it
 * <em>can</em> definitively say if a value is <em>not</em> in the set. It can therefore be used as
 * a Bloom Filter. Another application of the set is that it can be used to perform fuzzy counting
 * because it can estimate reasonably accurately how many unique values are contained in the set.
 *
 * <p>This class is NOT threadsafe.
 *
 * <p>Internally a Bitset is used to record values and once a client has finished recording a stream
 * of values the {@link #downsize(float)} method can be used to create a suitably smaller set that
 * is sized appropriately for the number of values recorded and desired saturation levels.
 *
 * @lucene.experimental
 */
public class FuzzySet implements Accountable {

  /**
   * Result from {@link FuzzySet#contains(BytesRef)}: can never return definitively YES (always
   * MAYBE), but can sometimes definitely return NO.
   */
  public enum ContainsResult {
    MAYBE,
    NO
  }

  private final HashFunction hashFunction;
  private final FixedBitSet filter;
  // Bit mask (all ones in binary) applied to hashes; the bitset itself holds bloomSize + 1 bits.
  private final int bloomSize;
  // Number of bit positions (k) derived from each value's 64-bit hash.
  private final int hashCount;

  // The sizes of BitSet used are all numbers that, when expressed in binary form,
  // are all ones. This is to enable fast downsizing from one bitset to another
  // by simply ANDing each set index in one bitset with the size of the target bitset
  // - this provides a fast modulo of the number. Values previously accumulated in
  // a large bitset and then mapped to a smaller set can be looked up using a single
  // AND operation of the query term's hash rather than needing to perform a 2-step
  // translation of the query term that mirrors the stored content's reprojections.
  static final int[] usableBitSetSizes;

  static {
    // Masks run from 2^6 - 1 up to 2^30 - 1, in ascending order. A mask of 2^31 - 1 is
    // deliberately left out: every set is allocated with (mask + 1) bits, and 2^31 bits cannot be
    // expressed as an int bit count, so such a set could never be constructed.
    usableBitSetSizes = new int[25];
    for (int i = 0; i < usableBitSetSizes.length; i++) {
      usableBitSetSizes[i] = (1 << (i + 6)) - 1;
    }
  }

  /**
   * Rounds down required maxNumberOfBits to the nearest number that is made up of all ones as a
   * binary number. Use this method where controlling memory use is paramount.
   *
   * @param maxNumberOfBits upper bound on the size of the set, in bits
   * @return the largest usable set size not exceeding {@code maxNumberOfBits}, or the smallest
   *     usable size (63) if even that exceeds {@code maxNumberOfBits}
   */
  public static int getNearestSetSize(int maxNumberOfBits) {
    int result = usableBitSetSizes[0];
    for (int candidate : usableBitSetSizes) {
      if (candidate > maxNumberOfBits) {
        // Sizes are ascending, so no later candidate can fit either.
        break;
      }
      result = candidate;
    }
    return result;
  }

  /**
   * Use this method to choose a set size where accuracy (low content saturation) is more important
   * than deciding how much memory to throw at the problem.
   *
   * @param maxNumberOfValuesExpected the number of unique values the set is expected to hold
   * @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values
   *     have been recorded
   * @return The size of the set nearest to the required size, or -1 if no usable size can hold
   *     that many values at the requested saturation
   */
  public static int getNearestSetSize(int maxNumberOfValuesExpected, float desiredSaturation) {
    // Iterate around the various scales of bitset from smallest to largest looking for the first
    // that satisfies value volumes at the chosen saturation level
    for (int candidateSize : usableBitSetSizes) {
      int numSetBitsAtDesiredSaturation = (int) (candidateSize * desiredSaturation);
      int estimatedNumUniqueValues =
          getEstimatedNumberUniqueValuesAllowingForCollisions(
              candidateSize, numSetBitsAtDesiredSaturation);
      if (estimatedNumUniqueValues > maxNumberOfValuesExpected) {
        return candidateSize;
      }
    }
    return -1;
  }

  /**
   * Creates an empty single-hash set sized from a memory budget.
   *
   * <p>Note: {@code maxNumBytes} is handed unchanged to {@link #getNearestSetSize(int)}, which
   * interprets its argument as a number of <em>bits</em>. The allocated bitset therefore occupies
   * roughly one eighth of the stated byte budget. This long-standing behavior is preserved so that
   * existing callers keep getting sets of the same size.
   *
   * @param maxNumBytes the memory budget (see note above)
   * @return a new, empty set using one hash per value
   */
  public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes) {
    int setSize = getNearestSetSize(maxNumBytes);
    return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
  }

  /**
   * Creates an empty single-hash set large enough to hold the expected number of values without
   * exceeding the desired saturation.
   *
   * @param maxNumUniqueValues the number of unique values the set is expected to hold
   * @param desiredMaxSaturation A number between 0 and 1 expressing the % of bits set once all
   *     values have been recorded
   * @param version currently unused by this method; retained for signature compatibility
   * @return a new, empty set using one hash per value
   * @throws IllegalArgumentException if no usable set size can satisfy the request
   */
  public static FuzzySet createSetBasedOnQuality(
      int maxNumUniqueValues, float desiredMaxSaturation, int version) {
    int setSize = getNearestSetSize(maxNumUniqueValues, desiredMaxSaturation);
    if (setSize < 0) {
      // Without this check a set with a zero-length bitset and a mask of -1 would be built,
      // which fails on the first addValue/contains call with a far less helpful error.
      throw new IllegalArgumentException(
          "No usable set size can hold maxNumUniqueValues="
              + maxNumUniqueValues
              + " at desiredMaxSaturation="
              + desiredMaxSaturation);
    }
    return new FuzzySet(new FixedBitSet(setSize + 1), setSize, 1);
  }

  /**
   * Creates an empty set whose size and number of hash functions are derived from the standard
   * Bloom filter formulas for the expected number of values and a target false-positive
   * probability.
   *
   * @param maxNumUniqueValues the number of unique values the set is expected to hold
   * @param targetMaxFpp the target false-positive probability, expected to lie strictly between 0
   *     and 1; values at or near 0 request the largest usable set
   * @return a new, empty set that always uses at least one hash per value
   */
  public static FuzzySet createOptimalSet(int maxNumUniqueValues, float targetMaxFpp) {
    // m = -n * ln(p) / (ln 2)^2; the denominator below evaluates to -(ln 2)^2.
    int setSize =
        (int)
            Math.ceil(
                (maxNumUniqueValues * Math.log(targetMaxFpp))
                    / Math.log(1 / Math.pow(2, Math.log(2))));
    // Double the ideal size using long arithmetic: an int multiplication wraps negative for
    // large requests, which previously collapsed the set to the smallest possible size.
    int requestedBits = (int) Math.min(2L * setSize, Integer.MAX_VALUE);
    setSize = getNearestSetSize(requestedBits);
    // k = (m / n) * ln 2, but never fewer than one hash: with k <= 0 nothing would ever be
    // recorded and every lookup would answer MAYBE.
    int optimalK = 1;
    if (maxNumUniqueValues > 0) {
      optimalK =
          (int) Math.max(1L, Math.round(((double) setSize / maxNumUniqueValues) * Math.log(2)));
    }
    return new FuzzySet(new FixedBitSet(setSize + 1), setSize, optimalK);
  }

  private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) {
    this.filter = filter;
    this.bloomSize = bloomSize;
    this.hashFunction = MurmurHash64.INSTANCE;
    this.hashCount = hashCount;
  }

  /**
   * The main method required for a Bloom filter which, given a value determines set membership.
   * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. Hash
   * generation follows the same principles as {@link #addValue(BytesRef)}
   *
   * @param value the key value to look up
   * @return NO or MAYBE
   */
  public ContainsResult contains(BytesRef value) {
    long hash = hashFunction.hash(value);
    int msb = (int) (hash >>> Integer.SIZE);
    int lsb = (int) hash;
    for (int i = 0; i < hashCount; i++) {
      // A single unset bit proves the value was never added.
      if (!mayContainValue(lsb + i * msb)) {
        return ContainsResult.NO;
      }
    }
    return ContainsResult.MAYBE;
  }

  /**
   * Serializes the data set to file using the following format:
   *
   * <ul>
   *   <li>FuzzySet --&gt;hashCount,BloomSize, NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup>
   *   <li>hashCount --&gt; {@link DataOutput#writeVInt VInt} The number of hash functions (k).
   *   <li>BloomSize --&gt; {@link DataOutput#writeInt Uint32} The modulo value used to project
   *       hashes into the field's Bitset
   *   <li>NumBitSetWords --&gt; {@link DataOutput#writeInt Uint32} The number of longs (as returned
   *       from {@link FixedBitSet#getBits})
   *   <li>BitSetWord --&gt; {@link DataOutput#writeLong Long} A long from the array returned by
   *       {@link FixedBitSet#getBits}
   * </ul>
   *
   * @param out Data output stream
   * @throws IOException If there is a low-level I/O error
   */
  public void serialize(DataOutput out) throws IOException {
    out.writeVInt(hashCount);
    out.writeInt(bloomSize);
    long[] bits = filter.getBits();
    out.writeInt(bits.length);
    for (long word : bits) {
      // Can't use VLong encoding because it can't cope with the negative numbers
      // output by FixedBitSet
      out.writeLong(word);
    }
  }

  /**
   * Reads a set previously written by {@link #serialize(DataOutput)}.
   *
   * @param in Data input stream positioned at the start of a serialized set
   * @return the deserialized set
   * @throws IOException If there is a low-level I/O error
   */
  public static FuzzySet deserialize(DataInput in) throws IOException {
    int hashCount = in.readVInt();
    int bloomSize = in.readInt();
    int numLongs = in.readInt();
    long[] longs = new long[numLongs];
    in.readLongs(longs, 0, numLongs);
    FixedBitSet bits = new FixedBitSet(longs, bloomSize + 1);
    return new FuzzySet(bits, bloomSize, hashCount);
  }

  private boolean mayContainValue(int aHash) {
    // bloomSize is always one less than a power of two (all ones in binary), so ANDing with it
    // is a fast modulo that also discards the sign bit.
    int pos = aHash & bloomSize;
    return filter.get(pos);
  }

  /**
   * Records a value in the set. The referenced bytes are hashed. From the 64-bit generated hash,
   * two 32-bit hashes are derived from the msb and lsb which can be used to derive more hashes (see
   * https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf). Finally, each generated hash
   * is modulo n'd where n is the chosen size of the internal bitset.
   *
   * @param value the key value to be hashed
   * @throws IOException If there is a low-level I/O error
   */
  public void addValue(BytesRef value) throws IOException {
    long hash = hashFunction.hash(value);
    int msb = (int) (hash >>> Integer.SIZE);
    int lsb = (int) hash;
    for (int i = 0; i < hashCount; i++) {
      // Bitmasking using bloomSize is effectively a modulo operation.
      int bloomPos = (lsb + i * msb) & bloomSize;
      filter.set(bloomPos);
    }
  }

  /**
   * Projects the recorded bits into the smallest usable set that keeps saturation at or below the
   * target.
   *
   * @param targetMaxSaturation A number between 0 and 1 describing the % of bits that would ideally
   *     be set in the result. Lower values have better accuracy but require more space.
   * @return a smaller FuzzySet, or null if no usable size smaller than the current one satisfies
   *     the target (for example because the current set is already over-saturated or is already
   *     the right size)
   */
  public FuzzySet downsize(float targetMaxSaturation) {
    int numBitsSet = filter.cardinality();
    int rightSizedBitSetSize = bloomSize;
    // Hopefully find a smaller size bitset into which we can project accumulated values while
    // maintaining desired saturation level
    for (int candidateBitsetSize : usableBitSetSizes) {
      float candidateSaturation = (float) numBitsSet / (float) candidateBitsetSize;
      if (candidateSaturation <= targetMaxSaturation) {
        rightSizedBitSetSize = candidateBitsetSize;
        break;
      }
    }
    if (rightSizedBitSetSize >= bloomSize) {
      // Nothing smaller is suitable; callers are expected to keep using this set.
      return null;
    }
    // Re-project the numbers to the smaller space
    FixedBitSet rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1);
    // Map across the bits from the large set to the smaller one
    int bitIndex = 0;
    do {
      bitIndex = filter.nextSetBit(bitIndex);
      if (bitIndex != DocIdSetIterator.NO_MORE_DOCS) {
        // Project the larger number into a smaller one effectively
        // modulo-ing by using the target bitset size as a mask
        int downSizedBitIndex = bitIndex & rightSizedBitSetSize;
        rightSizedBitSet.set(downSizedBitIndex);
        bitIndex++;
      }
      // The upper bound stops before nextSetBit is asked about an index past the last bit.
    } while ((bitIndex >= 0) && (bitIndex <= bloomSize));
    return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashCount);
  }

  /**
   * Estimates how many unique values have been recorded, based on the current number of set bits.
   *
   * @return the estimated number of unique values
   */
  public int getEstimatedUniqueValues() {
    return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality());
  }

  /**
   * Given a set size and the number of set bits, produces an estimate of the number of unique
   * values recorded.
   *
   * @param setSize the size of the set
   * @param numRecordedBits the number of bits currently set
   * @return the estimate, or {@link Integer#MAX_VALUE} when the set is fully saturated and no
   *     meaningful estimate is possible
   */
  public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
      int setSize, int numRecordedBits) {
    if (setSize > 0 && numRecordedBits >= setSize) {
      // Saturation >= 1: the formula below would take log(0) or log(negative). The former already
      // yielded MAX_VALUE; the latter produced NaN, which was silently cast to 0.
      return Integer.MAX_VALUE;
    }
    double setSizeAsDouble = setSize;
    double numRecordedBitsAsDouble = numRecordedBits;
    double saturation = numRecordedBitsAsDouble / setSizeAsDouble;
    double logInverseSaturation = Math.log(1 - saturation) * -1;
    return (int) (setSizeAsDouble * logInverseSaturation);
  }

  /**
   * Returns the saturation level this set aims to stay under.
   *
   * @return the fixed target of 0.5
   */
  public float getTargetMaxSaturation() {
    return 0.5f;
  }

  /**
   * Returns the fraction of bits currently set, computed as set bits divided by the bloom size
   * mask. Because the bitset holds one more bit than the mask value, a completely full set reports
   * marginally more than 1.
   *
   * @return the current saturation
   */
  public float getSaturation() {
    int numBitsSet = filter.cardinality();
    return (float) numBitsSet / (float) bloomSize;
  }

  /** Returns the memory used by the backing array of the underlying bitset. */
  @Override
  public long ramBytesUsed() {
    return RamUsageEstimator.sizeOf(filter.getBits());
  }

  @Override
  public String toString() {
    return getClass().getSimpleName()
        + "(hash="
        + hashFunction
        + ", k="
        + hashCount
        + ", bits="
        + filter.cardinality()
        + "/"
        + filter.length()
        + ")";
  }
}