All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.util.RoaringDocIdSet Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;


import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

/**
 * {@link DocIdSet} implementation inspired from http://roaringbitmap.org/
 *
 * The space is divided into blocks of 2^16 bits and each block is encoded
 * independently. In each block, if less than 2^12 bits are set, then
 * documents are simply stored in a short[]. If more than 2^16-2^12 bits are
 * set, then the inverse of the set is encoded in a simple short[]. Otherwise
 * a {@link FixedBitSet} is used.
 *
 * @lucene.internal
 */
public class RoaringDocIdSet extends DocIdSet {

  // Number of documents in a block
  private static final int BLOCK_SIZE = 1 << 16;
  // The maximum length for an array, beyond that point we switch to a bitset
  private static final int MAX_ARRAY_LENGTH = 1 << 12;
  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(RoaringDocIdSet.class);

  /** A builder of {@link RoaringDocIdSet}s. */
  public static class Builder {

    private final int maxDoc;
    private final DocIdSet[] sets;

    private int cardinality;
    private int lastDocId;
    private int currentBlock;
    private int currentBlockCardinality;

    // We start by filling the buffer and when it's full we copy the content of
    // the buffer to the FixedBitSet and put further documents in that bitset
    private final short[] buffer;
    private FixedBitSet denseBuffer;

    /** Sole constructor. */
    public Builder(int maxDoc) {
      this.maxDoc = maxDoc;
      sets = new DocIdSet[(maxDoc + (1 << 16) - 1) >>> 16];
      lastDocId = -1;
      currentBlock = -1;
      buffer = new short[MAX_ARRAY_LENGTH];
    }

    private void flush() {
      assert currentBlockCardinality <= BLOCK_SIZE;
      if (currentBlockCardinality <= MAX_ARRAY_LENGTH) {
        // Use sparse encoding
        assert denseBuffer == null;
        if (currentBlockCardinality > 0) {
          sets[currentBlock] = new ShortArrayDocIdSet(Arrays.copyOf(buffer, currentBlockCardinality));
        }
      } else {
        assert denseBuffer != null;
        assert denseBuffer.cardinality() == currentBlockCardinality;
        if (denseBuffer.length() == BLOCK_SIZE && BLOCK_SIZE - currentBlockCardinality < MAX_ARRAY_LENGTH) {
          // Doc ids are very dense, inverse the encoding
          final short[] excludedDocs = new short[BLOCK_SIZE - currentBlockCardinality];
          denseBuffer.flip(0, denseBuffer.length());
          int excludedDoc = -1;
          for (int i = 0; i < excludedDocs.length; ++i) {
            excludedDoc = denseBuffer.nextSetBit(excludedDoc + 1);
            assert excludedDoc != DocIdSetIterator.NO_MORE_DOCS;
            excludedDocs[i] = (short) excludedDoc;
          }
          assert excludedDoc + 1 == denseBuffer.length() || denseBuffer.nextSetBit(excludedDoc + 1) == DocIdSetIterator.NO_MORE_DOCS;
          sets[currentBlock] = new NotDocIdSet(BLOCK_SIZE, new ShortArrayDocIdSet(excludedDocs));
        } else {
          // Neither sparse nor super dense, use a fixed bit set
          sets[currentBlock] = new BitDocIdSet(denseBuffer, currentBlockCardinality);
        }
        denseBuffer = null;
      }

      cardinality += currentBlockCardinality;
      denseBuffer = null;
      currentBlockCardinality = 0;
    }

    /**
     * Add a new doc-id to this builder.
     * NOTE: doc ids must be added in order.
     */
    public Builder add(int docId) {
      if (docId <= lastDocId) {
        throw new IllegalArgumentException("Doc ids must be added in-order, got " + docId + " which is <= lastDocID=" + lastDocId);
      }
      final int block = docId >>> 16;
      if (block != currentBlock) {
        // we went to a different block, let's flush what we buffered and start from fresh
        flush();
        currentBlock = block;
      }

      if (currentBlockCardinality < MAX_ARRAY_LENGTH) {
        buffer[currentBlockCardinality] = (short) docId;
      } else {
        if (denseBuffer == null) {
          // the buffer is full, let's move to a fixed bit set
          final int numBits = Math.min(1 << 16, maxDoc - (block << 16));
          denseBuffer = new FixedBitSet(numBits);
          for (short doc : buffer) {
            denseBuffer.set(doc & 0xFFFF);
          }
        }
        denseBuffer.set(docId & 0xFFFF);
      }

      lastDocId = docId;
      currentBlockCardinality += 1;
      return this;
    }

    /** Add the content of the provided {@link DocIdSetIterator}. */
    public Builder add(DocIdSetIterator disi) throws IOException {
      for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
        add(doc);
      }
      return this;
    }

    /** Build an instance. */
    public RoaringDocIdSet build() {
      flush();
      return new RoaringDocIdSet(sets, cardinality);
    }

  }

  /**
   * {@link DocIdSet} implementation that can store documents up to 2^16-1 in a short[].
   */
  private static class ShortArrayDocIdSet extends DocIdSet {

    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(ShortArrayDocIdSet.class);

    private final short[] docIDs;

    private ShortArrayDocIdSet(short[] docIDs) {
      this.docIDs = docIDs;
    }

    @Override
    public long ramBytesUsed() {
      return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(docIDs);
    }

    @Override
    public DocIdSetIterator iterator() throws IOException {
      return new DocIdSetIterator() {

        int i = -1; // this is the index of the current document in the array
        int doc = -1;

        private int docId(int i) {
          return docIDs[i] & 0xFFFF;
        }

        @Override
        public int nextDoc() throws IOException {
          if (++i >= docIDs.length) {
            return doc = NO_MORE_DOCS;
          }
          return doc = docId(i);
        }

        @Override
        public int docID() {
          return doc;
        }

        @Override
        public long cost() {
          return docIDs.length;
        }

        @Override
        public int advance(int target) throws IOException {
          // binary search
          int lo = i + 1;
          int hi = docIDs.length - 1;
          while (lo <= hi) {
            final int mid = (lo + hi) >>> 1;
            final int midDoc = docId(mid);
            if (midDoc < target) {
              lo = mid + 1;
            } else {
              hi = mid - 1;
            }
          }
          if (lo == docIDs.length) {
            i = docIDs.length;
            return doc = NO_MORE_DOCS;
          } else {
            i = lo;
            return doc = docId(i);
          }
        }
      };
    }

  }

  private final DocIdSet[] docIdSets;
  private final int cardinality;
  private final long ramBytesUsed;

  private RoaringDocIdSet(DocIdSet[] docIdSets, int cardinality) {
    this.docIdSets = docIdSets;
    long ramBytesUsed = BASE_RAM_BYTES_USED + RamUsageEstimator.shallowSizeOf(docIdSets);
    for (DocIdSet set : this.docIdSets) {
      if (set != null) {
        ramBytesUsed += set.ramBytesUsed();
      }
    }
    this.ramBytesUsed = ramBytesUsed;
    this.cardinality = cardinality;
  }

  @Override
  public boolean isCacheable() {
    return true;
  }

  @Override
  public long ramBytesUsed() {
    return ramBytesUsed;
  }

  @Override
  public DocIdSetIterator iterator() throws IOException {
    if (cardinality == 0) {
      return null;
    }
    return new Iterator();
  }

  private class Iterator extends DocIdSetIterator {

    int block;
    DocIdSetIterator sub = null;
    int doc;

    Iterator() throws IOException {
      doc = -1;
      block = -1;
      sub = DocIdSetIterator.empty();
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() throws IOException {
      final int subNext = sub.nextDoc();
      if (subNext == NO_MORE_DOCS) {
        return firstDocFromNextBlock();
      }
      return doc = (block << 16) | subNext;
    }

    @Override
    public int advance(int target) throws IOException {
      final int targetBlock = target >>> 16;
      if (targetBlock != block) {
        block = targetBlock;
        if (block >= docIdSets.length) {
          sub = null;
          return doc = NO_MORE_DOCS;
        }
        if (docIdSets[block] == null) {
          return firstDocFromNextBlock();
        }
        sub = docIdSets[block].iterator();
      }
      final int subNext = sub.advance(target & 0xFFFF);
      if (subNext == NO_MORE_DOCS) {
        return firstDocFromNextBlock();
      }
      return doc = (block << 16) | subNext;
    }

    private int firstDocFromNextBlock() throws IOException {
      while (true) {
        block += 1;
        if (block >= docIdSets.length) {
          sub = null;
          return doc = NO_MORE_DOCS;
        } else if (docIdSets[block] != null) {
          sub = docIdSets[block].iterator();
          final int subNext = sub.nextDoc();
          assert subNext != NO_MORE_DOCS;
          return doc = (block << 16) | subNext;
        }
      }
    }

    @Override
    public long cost() {
      return cardinality;
    }

  }

  /** Return the exact number of documents that are contained in this set. */
  public int cardinality() {
    return cardinality;
  }

  @Override
  public String toString() {
    return "RoaringDocIdSet(cardinality=" + cardinality + ")";
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy