org.apache.lucene.util.DocIdSetBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hivemall-all
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;


import java.io.IOException;

import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

/**
 * A builder of {@link DocIdSet}s.  At first it uses a sparse structure to gather
 * documents, and then upgrades to a non-sparse bit set once enough hits match.
 *
 * @lucene.internal
 */
public final class DocIdSetBuilder {

  /** Size passed to the {@link FixedBitSet} constructor once the builder goes dense. */
  private final int maxDoc;
  /** Number of buffered doc IDs at which the builder switches from the int[] to a bit set. */
  private final int threshold;

  /** Sparse storage: unsorted doc IDs, possibly with duplicates. {@code null} once dense or built. */
  private int[] buffer;
  /** Number of valid entries in {@link #buffer}. */
  private int bufferSize;

  /** Dense storage; {@code null} while the builder is still in sparse mode. */
  private BitSet bitSet;

  /**
   * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
   *
   * @param maxDoc the size given to the dense bit set when the builder upgrades
   */
  public DocIdSetBuilder(int maxDoc) {
    this.maxDoc = maxDoc;
    // For ridiculously small sets, we'll just use a sorted int[]
    // maxDoc >>> 7 is a good value if you want to save memory, lower values
    // such as maxDoc >>> 11 should provide faster building but at the expense
    // of using a full bitset even for quite sparse data
    this.threshold = maxDoc >>> 7;

    this.buffer = new int[0];
    this.bufferSize = 0;
    this.bitSet = null;
  }

  /**
   * Switches from the sparse int[] representation to a {@link FixedBitSet},
   * replaying every buffered doc ID into it and releasing the buffer.
   */
  private void upgradeToBitSet() {
    assert bitSet == null;
    bitSet = new FixedBitSet(maxDoc);
    for (int i = 0; i < bufferSize; ++i) {
      bitSet.set(buffer[i]);
    }
    this.buffer = null;
    this.bufferSize = 0;
  }

  /** Grows the buffer to at least minSize, but never larger than threshold. */
  private void growBuffer(int minSize) {
    assert minSize < threshold;
    if (buffer.length < minSize) {
      int nextSize = Math.min(threshold, ArrayUtil.oversize(minSize, RamUsageEstimator.NUM_BYTES_INT));
      int[] newBuffer = new int[nextSize];
      System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
      buffer = newBuffer;
    }
  }

  /**
   * Add the content of the provided {@link DocIdSetIterator} to this builder.
   * NOTE: if you need to build a {@link DocIdSet} out of a single
   * {@link DocIdSetIterator}, you should rather use {@link RoaringDocIdSet.Builder}.
   *
   * @param iter the iterator to drain into this builder
   * @throws IOException if the iterator fails while advancing
   */
  public void add(DocIdSetIterator iter) throws IOException {
    // Use the iterator's cost as a size hint; this may already upgrade to the bit set.
    grow((int) Math.min(Integer.MAX_VALUE, iter.cost()));

    if (bitSet != null) {
      bitSet.or(iter);
    } else {
      while (true) {
        assert buffer.length <= threshold;
        // Fill whatever room is left in the current buffer.
        final int end = buffer.length;
        while (bufferSize < end) {
          final int doc = iter.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            return;
          }
          buffer[bufferSize++] = doc;
        }

        // The buffer is full: either give up on the sparse form or make more room.
        if (bufferSize + 1 >= threshold) {
          break;
        }

        growBuffer(bufferSize + 1);
      }

      // Too many docs for the sparse form: go dense and drain the rest of the iterator.
      upgradeToBitSet();
      for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
        bitSet.set(doc);
      }
    }
  }

  /**
   * Reserve space so that this builder can hold {@code numDocs} MORE documents.
   * If the resulting size would reach the sparse threshold, the builder
   * upgrades to a bit set immediately instead of growing the buffer.
   *
   * @param numDocs number of additional documents the caller expects to add
   */
  public void grow(int numDocs) {
    if (bitSet == null) {
      // Widen BEFORE adding: bufferSize + numDocs computed in int arithmetic
      // wraps to a negative value for large hints (add(DocIdSetIterator) passes
      // up to Integer.MAX_VALUE), which would make the hint look tiny and
      // silently skip the upgrade to the bit set.
      final long newLength = (long) bufferSize + numDocs;
      if (newLength < threshold) {
        growBuffer((int) newLength);
      } else {
        upgradeToBitSet();
      }
    }
  }

  /**
   * Add a document to this builder.
   * NOTE: doc IDs do not need to be provided in order.
   * NOTE: if you plan on adding several docs at once, look into using
   * {@link #grow(int)} to reserve space.
   *
   * @param doc the doc ID to add
   */
  public void add(int doc) {
    if (bitSet != null) {
      bitSet.set(doc);
    } else {
      if (bufferSize + 1 > buffer.length) {
        // No room left: upgrade if one more entry would reach the threshold,
        // otherwise enlarge the buffer.
        if (bufferSize + 1 >= threshold) {
          upgradeToBitSet();
          bitSet.set(doc);
          return;
        }
        growBuffer(bufferSize + 1);
      }
      buffer[bufferSize++] = doc;
    }
  }

  /**
   * Removes consecutive duplicates from the first {@code length} entries of a
   * sorted array, compacting the distinct values to the front in place.
   *
   * @param arr    array sorted in non-decreasing order over {@code [0, length)}
   * @param length number of valid entries in {@code arr}
   * @return the number of distinct values now stored at the front of {@code arr}
   */
  private static int dedup(int[] arr, int length) {
    if (length == 0) {
      return 0;
    }
    int l = 1;
    int previous = arr[0];
    for (int i = 1; i < length; ++i) {
      final int value = arr[i];
      assert value >= previous;
      if (value != previous) {
        arr[l++] = value;
        previous = value;
      }
    }
    return l;
  }

  /**
   * Build a {@link DocIdSet} from the accumulated doc IDs.
   * Equivalent to {@link #build(long)} with a cost hint of {@code -1}.
   */
  public DocIdSet build() {
    return build(-1);
  }

  /**
   * Expert: build a {@link DocIdSet} with a hint on the cost that the resulting
   * {@link DocIdSet} would have.
   *
   * <p>The hint is only passed on when the builder is in dense (bit set) mode;
   * {@code -1} means "no hint". In sparse mode the buffered IDs are sorted and
   * de-duplicated before being wrapped. The builder releases its internal state
   * on return and must not be used afterwards.
   *
   * @param costHint expected cost of the resulting set, or {@code -1} if unknown
   * @return a set containing every doc ID added to this builder
   */
  public DocIdSet build(long costHint) {
    try {
      if (bitSet != null) {
        if (costHint == -1) {
          return new BitDocIdSet(bitSet);
        } else {
          return new BitDocIdSet(bitSet, costHint);
        }
      } else {
        LSBRadixSorter sorter = new LSBRadixSorter();
        sorter.sort(buffer, 0, bufferSize);
        final int l = dedup(buffer, bufferSize);
        assert l <= bufferSize;
        // Make room for, and append, the NO_MORE_DOCS sentinel after the last doc ID.
        buffer = ArrayUtil.grow(buffer, l + 1);
        buffer[l] = DocIdSetIterator.NO_MORE_DOCS;
        return new IntArrayDocIdSet(buffer, l);
      }
    } finally {
      // Drop all references so the builder does not retain (or later mutate) the built set's storage.
      this.buffer = null;
      this.bufferSize = 0;
      this.bitSet = null;
    }
  }

}