// org.apache.solr.search.DocSetBuilder Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search;

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LSBRadixSorter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Adapted from DocIdSetBuilder to build DocSets.
 *
 * <p>Doc ids are collected in an unsorted {@code int[]} buffer while the set is small. Once the
 * buffered (or estimated) number of ids gets close to the threshold of {@code (maxDoc >>> 7) + 4},
 * the builder switches to a {@link FixedBitSet} of {@code maxDoc} bits and stays there.
 *
 * @lucene.internal
 */
public final class DocSetBuilder {

  /** Number of bits in the bit set; also bounds the values handed to the radix sorter. */
  private final int maxDoc;
  /** Upper bound on the buffer length; at or beyond it the bit set is used instead. */
  private final int threshold;

  /** Sparse representation: ids in insertion order, possibly with duplicates. Null once upgraded. */
  private int[] buffer;
  /** Number of valid entries in {@link #buffer}. */
  private int pos;

  /** Dense representation. Null while the sparse buffer is still in use. */
  private FixedBitSet bitSet;


  /**
   * Creates a builder, choosing the initial representation from the cost estimate.
   *
   * @param maxDoc size (in bits) of the bit set used for the dense representation
   * @param costEst estimated number of ids that will be added; if it exceeds the threshold the
   *     bit set is allocated up front, otherwise a buffer of that size (at least 1) is allocated
   */
  public DocSetBuilder(int maxDoc, long costEst) {
    this.maxDoc = maxDoc;
    // For ridiculously small sets, we'll just use a sorted int[]
    // maxDoc >>> 7 is a good value if you want to save memory, lower values
    // such as maxDoc >>> 11 should provide faster building but at the expense
    // of using a full bitset even for quite sparse data
    this.threshold = (maxDoc >>> 7) + 4; // the +4 is for better testing on small indexes

    if (costEst > threshold) {
      bitSet = new FixedBitSet(maxDoc);
    } else {
      // costEst <= threshold here, so the narrowing cast cannot overflow
      this.buffer = new int[Math.max((int) costEst, 1)];
    }
  }

  /** Moves every buffered id into a freshly allocated bit set and releases the buffer. */
  private void upgradeToBitSet() {
    assert bitSet == null;
    bitSet = new FixedBitSet(maxDoc);
    for (int i = 0; i < pos; ++i) {
      bitSet.set(buffer[i]);
    }
    this.buffer = null;
    this.pos = 0;
  }

  /**
   * Ensures the buffer can hold at least {@code minSize} entries by repeatedly doubling its
   * length, capped at {@link #threshold}. Callers in this class only pass values below the
   * threshold, so the cap never leaves the buffer smaller than requested.
   *
   * @param minSize required capacity
   */
  private void growBuffer(int minSize) {
    // Already large enough (including an exact fit): do not reallocate and copy for nothing.
    if (minSize <= buffer.length) {
      return;
    }

    int newSize = buffer.length;
    while (newSize < minSize) {
      newSize = newSize << 1;
    }
    newSize = Math.min(newSize, threshold);

    int[] newBuffer = new int[newSize];
    System.arraycopy(buffer, 0, newBuffer, 0, pos);
    buffer = newBuffer;
  }

  /**
   * Adds every doc produced by {@code iter}, offset by {@code base}.
   *
   * @param iter iterator to drain via {@code nextDoc()}
   * @param base value added to each doc id before it is stored
   * @throws IOException if the iterator fails
   */
  public void add(DocIdSetIterator iter, int base) throws IOException {
    // Reserve room (or upgrade to the bit set) based on the iterator's own cost estimate.
    grow((int) Math.min(Integer.MAX_VALUE, iter.cost()));

    if (bitSet != null) {
      add(bitSet, iter, base);
    } else {
      while (true) {
        for (int i = pos; i < buffer.length; ++i) {
          final int doc = iter.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            pos = i; // update pos
            return;
          }
          buffer[i] = doc + base;  // using the loop counter may help with removal of bounds checking
        }

        pos = buffer.length; // update pos
        if (pos + 1 >= threshold) {
          // The buffer is full and may not grow further: fall through to the bit set.
          break;
        }

        growBuffer(pos + 1);
      }

      upgradeToBitSet();
      add(bitSet, iter, base);
    }
  }


  /**
   * Sets the bit {@code doc + base} in {@code bitSet} for every remaining doc of {@code iter}.
   *
   * @param bitSet destination bit set
   * @param iter iterator to drain via {@code nextDoc()}
   * @param base value added to each doc id before the bit is set
   * @throws IOException if the iterator fails
   */
  public static void add(FixedBitSet bitSet, DocIdSetIterator iter, int base) throws IOException {
    for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
      bitSet.set(doc + base);
    }
  }

  /**
   * Adds the postings of every remaining term of {@code te}, offset by {@code base}.
   *
   * @param te terms enum, advanced with {@code next()} until exhausted
   * @param base value added to each doc id before it is stored
   * @return the number of terms visited
   * @throws IOException if reading terms or postings fails
   */
  public int add(TermsEnum te, int base) throws IOException {
    PostingsEnum postings = null;

    int termCount = 0;
    for (;;) {
      BytesRef term = te.next();
      if (term == null) {
        break;
      }
      termCount++;
      // Hand the previous enum back to postings() so it may be reused.
      postings = te.postings(postings, PostingsEnum.NONE);
      add(postings, base);
    }

    return termCount;
  }


  /**
   * Prepares the builder for {@code numDocs} more ids: grows the buffer if the total stays below
   * the threshold, otherwise upgrades to the bit set. Does nothing once a bit set is in use.
   *
   * @param numDocs number of additional ids expected
   */
  public void grow(int numDocs) {
    if (bitSet == null) {
      // Widen before adding: an int sum wraps negative for large numDocs (e.g.
      // Integer.MAX_VALUE) and would wrongly select the buffer branch.
      final long newLength = (long) pos + numDocs;
      if (newLength < threshold) {
        growBuffer((int) newLength);
      } else {
        upgradeToBitSet();
      }
    }
  }


  /**
   * Adds a single doc id.
   *
   * @param doc the id to add (no base offset is applied)
   */
  public void add(int doc) {
    if (bitSet != null) {
      bitSet.set(doc);
    } else {
      if (pos >= buffer.length) {
        if (pos + 1 >= threshold) {
          upgradeToBitSet();
          bitSet.set(doc);
          return;
        }
        growBuffer(pos + 1);
      }
      buffer[pos++] = doc;
    }
  }

  /**
   * Compacts the first {@code length} entries of {@code arr} in place. An entry is kept when it
   * differs from the most recently kept entry and, if {@code acceptDocs} is non-null, its bit is
   * set there. On sorted input this removes all duplicates.
   *
   * @param arr values to compact
   * @param length number of leading entries to consider
   * @param acceptDocs optional filter; null accepts everything
   * @return the number of entries kept
   */
  private static int dedup(int[] arr, int length, FixedBitSet acceptDocs) {
    int pos = 0;
    int previous = -1;
    for (int i = 0; i < length; ++i) {
      final int value = arr[i];
      // assert value >= previous;
      if (value != previous && (acceptDocs == null || acceptDocs.get(value))) {
        arr[pos++] = value;
        previous = value;
      }
    }
    return pos;
  }



  /**
   * Builds the DocSet from everything added so far.
   *
   * <p>In the dense case the internal bit set is intersected with {@code filter} and wrapped
   * directly. In the sparse case the buffer is sorted, de-duplicated and filtered in place.
   *
   * @param filter optional set of accepted docs; null accepts everything
   * @return a {@link BitDocSet} or {@link SortedIntDocSet} with the collected docs
   */
  public DocSet build(FixedBitSet filter) {
    if (bitSet != null) {
      if (filter != null) {
        bitSet.and(filter);
      }
      return new BitDocSet(bitSet);
      // TODO - if this set will be cached, should we make it smaller if it's below DocSetUtil.smallSetSize?
    } else {
      // PackedInts.bitsRequired rejects negative values; clamp so maxDoc == 0 (an empty index)
      // yields an empty set instead of an exception. Unchanged for maxDoc >= 1.
      final int maxDocId = Math.max(maxDoc - 1, 0);
      LSBRadixSorter sorter = new LSBRadixSorter();
      sorter.sort(PackedInts.bitsRequired(maxDocId), buffer, pos);
      final int l = dedup(buffer, pos, filter);
      assert l <= pos;
      return new SortedIntDocSet(buffer, l);  // TODO: have option to not shrink in the future if it will be a temporary set
    }
  }

  /**
   * Only use this if you know there were no duplicates and that docs were collected in-order!
   *
   * <p>Same as {@link #build(FixedBitSet)} except that the sparse buffer is not sorted.
   *
   * @param filter optional set of accepted docs; null accepts everything
   * @return a {@link BitDocSet} or {@link SortedIntDocSet} with the collected docs
   */
  public DocSet buildUniqueInOrder(FixedBitSet filter) {
    if (bitSet != null) {
      if (filter != null) {
        bitSet.and(filter);
      }
      return new BitDocSet(bitSet);
    } else {
      // don't need to sort, but still need to remove non accepted docs
      int l = pos;
      if (filter != null) {
        l = dedup(buffer, pos, filter);
      }
      return new SortedIntDocSet(buffer, l);  // TODO: have option to not shrink in the future if it will be a temporary set
    }
  }

}