All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.lucene70.IndexedDISI Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene70;

import java.io.DataInput;
import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RoaringDocIdSet;

/**
 * Disk-based implementation of a {@link DocIdSetIterator} which can return
 * the index of the current document, i.e. the ordinal of the current document
 * among the list of documents that this iterator can return. This is useful
 * to implement sparse doc values by only having to encode values for documents
 * that actually have a value.
 * 

Implementation-wise, this {@link DocIdSetIterator} is inspired of * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} * documents independently and picks between 3 encodings depending on the * density of the range:

    *
  • {@code ALL} if the range contains 65536 documents exactly, *
  • {@code DENSE} if the range contains 4096 documents or more; in that * case documents are stored in a bit set, *
  • {@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are * stored in a {@link DataInput#readShort() short}. *
*

Only ranges that contain at least one value are encoded. *

This implementation uses 6 bytes per document in the worst-case, which happens * in the case that all ranges contain exactly one document. * @lucene.internal */ final class IndexedDISI extends DocIdSetIterator { static final int MAX_ARRAY_LENGTH = (1 << 12) - 1; private static void flush(int block, FixedBitSet buffer, int cardinality, IndexOutput out) throws IOException { assert block >= 0 && block < 65536; out.writeShort((short) block); assert cardinality > 0 && cardinality <= 65536; out.writeShort((short) (cardinality - 1)); if (cardinality > MAX_ARRAY_LENGTH) { if (cardinality != 65536) { // all docs are set for (long word : buffer.getBits()) { out.writeLong(word); } } } else { BitSetIterator it = new BitSetIterator(buffer, cardinality); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { out.writeShort((short) doc); } } } static void writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException { int i = 0; final FixedBitSet buffer = new FixedBitSet(1<<16); int prevBlock = -1; for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { final int block = doc >>> 16; if (prevBlock != -1 && block != prevBlock) { flush(prevBlock, buffer, i, out); buffer.clear(0, buffer.length()); prevBlock = block; i = 0; } buffer.set(doc & 0xFFFF); i++; prevBlock = block; } if (i > 0) { flush(prevBlock, buffer, i, out); buffer.clear(0, buffer.length()); } // NO_MORE_DOCS is stored explicitly buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF); flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, out); } /** The slice that stores the {@link DocIdSetIterator}. */ private final IndexInput slice; private final long cost; IndexedDISI(IndexInput in, long offset, long length, long cost) throws IOException { this(in.slice("docs", offset, length), cost); } // This constructor allows to pass the slice directly in case it helps reuse // see eg. Lucene70 norms producer's merge instance IndexedDISI(IndexInput slice, long cost) throws IOException { this.slice = slice; this.cost = cost; } private int block = -1; private long blockEnd; private int nextBlockIndex = -1; Method method; private int doc = -1; private int index = -1; // SPARSE variables boolean exists; // DENSE variables private long word; private int wordIndex = -1; // number of one bits encountered so far, including those of `word` private int numberOfOnes; // ALL variables private int gap; @Override public int docID() { return doc; } @Override public int advance(int target) throws IOException { final int targetBlock = target & 0xFFFF0000; if (block < targetBlock) { advanceBlock(targetBlock); } if (block == targetBlock) { if (method.advanceWithinBlock(this, target)) { return doc; } readBlockHeader(); } boolean found = method.advanceWithinBlock(this, block); assert found; return doc; } public boolean advanceExact(int target) throws IOException { final int targetBlock = target & 0xFFFF0000; if (block < targetBlock) { advanceBlock(targetBlock); } boolean found = block == targetBlock && method.advanceExactWithinBlock(this, target); this.doc = target; return found; } private void advanceBlock(int targetBlock) throws IOException { do { slice.seek(blockEnd); readBlockHeader(); } while (block < targetBlock); } private void readBlockHeader() throws IOException { block = Short.toUnsignedInt(slice.readShort()) << 16; assert block >= 0; final int numValues = 1 + Short.toUnsignedInt(slice.readShort()); index = nextBlockIndex; nextBlockIndex = index + numValues; if (numValues <= MAX_ARRAY_LENGTH) { method = Method.SPARSE; blockEnd = slice.getFilePointer() + (numValues << 1); } else if (numValues == 65536) { method = Method.ALL; blockEnd = slice.getFilePointer(); gap = block - index - 1; } else { method = Method.DENSE; blockEnd = slice.getFilePointer() + (1 << 13); wordIndex = -1; numberOfOnes = index + 1; } } @Override public int nextDoc() throws IOException { return advance(doc + 1); } public int index() { return index; } @Override public long cost() { return cost; } enum Method { SPARSE { @Override boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException { final int targetInBlock = target & 0xFFFF; // TODO: binary search for (; disi.index < disi.nextBlockIndex;) { int doc = Short.toUnsignedInt(disi.slice.readShort()); disi.index++; if (doc >= targetInBlock) { disi.doc = disi.block | doc; disi.exists = true; return true; } } return false; } @Override boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException { final int targetInBlock = target & 0xFFFF; // TODO: binary search if (target == disi.doc) { return disi.exists; } for (; disi.index < disi.nextBlockIndex;) { int doc = Short.toUnsignedInt(disi.slice.readShort()); disi.index++; if (doc >= targetInBlock) { if (doc != targetInBlock) { disi.index--; disi.slice.seek(disi.slice.getFilePointer() - Short.BYTES); break; } disi.exists = true; return true; } } disi.exists = false; return false; } }, DENSE { @Override boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException { final int targetInBlock = target & 0xFFFF; final int targetWordIndex = targetInBlock >>> 6; for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) { disi.word = disi.slice.readLong(); disi.numberOfOnes += Long.bitCount(disi.word); } disi.wordIndex = targetWordIndex; long leftBits = disi.word >>> target; if (leftBits != 0L) { disi.doc = target + Long.numberOfTrailingZeros(leftBits); disi.index = disi.numberOfOnes - Long.bitCount(leftBits); return true; } while (++disi.wordIndex < 1024) { disi.word = disi.slice.readLong(); if (disi.word != 0) { disi.index = disi.numberOfOnes; disi.numberOfOnes += Long.bitCount(disi.word); disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word); return true; } } return false; } @Override boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException { final int targetInBlock = target & 0xFFFF; final int targetWordIndex = targetInBlock >>> 6; for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) { disi.word = disi.slice.readLong(); disi.numberOfOnes += Long.bitCount(disi.word); } disi.wordIndex = targetWordIndex; long leftBits = disi.word >>> target; disi.index = disi.numberOfOnes - Long.bitCount(leftBits); return (leftBits & 1L) != 0; } }, ALL { @Override boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException { disi.doc = target; disi.index = target - disi.gap; return true; } @Override boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException { disi.index = target - disi.gap; return true; } }; /** Advance to the first doc from the block that is equal to or greater than {@code target}. * Return true if there is such a doc and false otherwise. */ abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException; /** Advance the iterator exactly to the position corresponding to the given {@code target} * and return whether this document exists. */ abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy