/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.compressing;
import static org.apache.lucene.util.BitUtil.zigZagEncode;
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;
/**
* Efficient index format for block-based {@link Codec}s.
* This writer generates a file which can be loaded into memory using
* memory-efficient data structures to quickly locate the block that contains
* any document.
*
* In order to have a compact in-memory representation, for every block of
* 1024 chunks, this index computes the average number of bytes per
* chunk and, for every chunk, only stores the difference between
* - ${chunk number} * ${average length of a chunk}
* - and the actual start offset of the chunk
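*
* For example (illustrative numbers, not from a real index): with an average
* chunk size of 100 bytes and actual chunk start offsets of 0, 102 and 197,
* the stored deltas are
* <pre>
*   delta[0] = 0   - 0 * 100 =  0  (ZigZag: 0)
*   delta[1] = 102 - 1 * 100 = +2  (ZigZag: 4)
*   delta[2] = 197 - 2 * 100 = -3  (ZigZag: 5)
* </pre>
* so 3 bits per value are enough, instead of the full width of an absolute
* file offset.
*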
* Data is written as follows:
*
* - PackedIntsVersion, <Block>^BlockCount, BlocksEndMarker
* - PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}
* - BlocksEndMarker --> 0 as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with 0
* - Block --> BlockChunks, <DocBases>, <StartPointers>
* - BlockChunks --> a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block
* - DocBases --> DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas
* - DocBase --> first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}
* - AvgChunkDocs --> average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}
* - BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using ZigZag encoding
* - DocBaseDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using ZigZag encoding.
* - StartPointers --> StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas
* - StartPointerBase --> the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}
* - AvgChunkSize --> the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}
* - BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using ZigZag encoding
* - StartPointerDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using ZigZag encoding
* - Footer --> {@link CodecUtil#writeFooter CodecFooter}
*
* Notes
*
* - For any block, the doc base of the n-th chunk can be restored with
*   DocBase + AvgChunkDocs * n + DocBaseDeltas[n].
* - For any block, the start pointer of the n-th chunk can be restored with
*   StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n].
* - Once data is loaded into memory, you can look up the start pointer of any
* document by performing two binary searches: a first one based on the values
* of DocBase in order to find the right block, and then inside the block based
* on DocBaseDeltas (by reconstructing the doc bases for every chunk), as
* sketched below.
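*
* A minimal reader-side sketch of the second search, assuming the block's
* metadata and ZigZag-decoded delta arrays have already been loaded into
* plain Java variables (all names are illustrative):
* <pre>{@code
* // decoded from one block: docBase, avgChunkDocs, docBaseDeltas[],
* // startPointerBase, avgChunkSize, startPointerDeltas[]
* int lo = 0, hi = blockChunks - 1;
* while (lo < hi) { // find the last chunk whose doc base is <= docID
*   int mid = (lo + hi + 1) >>> 1;
*   if (docBase + avgChunkDocs * mid + docBaseDeltas[mid] <= docID) {
*     lo = mid;
*   } else {
*     hi = mid - 1;
*   }
* }
* long startPointer = startPointerBase + avgChunkSize * lo + startPointerDeltas[lo];
* }</pre>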
*
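* A minimal sketch of the writing side (the chunk loop, Chunk, fieldsStream
* and writeChunk are hypothetical stand-ins for the caller's code, not part
* of this class):
* <pre>{@code
* CompressingStoredFieldsIndexWriter indexWriter =
*     new CompressingStoredFieldsIndexWriter(indexOutput, 1024);
* for (Chunk chunk : chunks) {
*   long startPointer = fieldsStream.getFilePointer(); // where the chunk data starts
*   writeChunk(fieldsStream, chunk);                   // hypothetical helper
*   indexWriter.writeIndex(chunk.numDocs(), startPointer);
* }
* indexWriter.finish(totalDocs, fieldsStream.getFilePointer());
* indexWriter.close();
* }</pre>
*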
* @lucene.internal
*/
public final class CompressingStoredFieldsIndexWriter implements Closeable {
final IndexOutput fieldsIndexOut;
final int blockSize;
int totalDocs;
int blockDocs;
int blockChunks;
long firstStartPointer;
long maxStartPointer;
final int[] docBaseDeltas;
final long[] startPointerDeltas;
CompressingStoredFieldsIndexWriter(IndexOutput indexOutput, int blockSize) throws IOException {
if (blockSize <= 0) {
throw new IllegalArgumentException("blockSize must be positive");
}
this.blockSize = blockSize;
this.fieldsIndexOut = indexOutput;
reset();
totalDocs = 0;
docBaseDeltas = new int[blockSize];
startPointerDeltas = new long[blockSize];
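// the index file starts with the PackedInts version, i.e. the
// PackedIntsVersion header from the format description above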
fieldsIndexOut.writeVInt(PackedInts.VERSION_CURRENT);
}
private void reset() {
blockChunks = 0;
blockDocs = 0;
firstStartPointer = -1; // means unset
}
private void writeBlock() throws IOException {
assert blockChunks > 0;
fieldsIndexOut.writeVInt(blockChunks);
// The trick here is that we only store the difference from the average start
// pointer or doc base, which saves bits per value.
// And in order to prevent a few chunks that are far from the average from
// raising the number of bits per value for all of them, we only encode
// blocks of blockSize chunks (1024 by default) at once.
// See LUCENE-4512
// doc bases
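// The average is computed over the first blockChunks - 1 chunks only:
// docBaseDeltas[i] holds the doc count of chunk i, so
// blockDocs - docBaseDeltas[blockChunks - 1] is the doc base of the last
// chunk, and dividing it by blockChunks - 1 gives the average spacing
// between consecutive doc bases, making avgChunkDocs * i a good predictor
// of the i-th doc base.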
final int avgChunkDocs;
if (blockChunks == 1) {
avgChunkDocs = 0;
} else {
avgChunkDocs = Math.round((float) (blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1));
}
fieldsIndexOut.writeVInt(totalDocs - blockDocs); // docBase
fieldsIndexOut.writeVInt(avgChunkDocs);
int docBase = 0;
long maxDelta = 0;
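// ZigZag-encoded deltas are non-negative and PackedInts.bitsRequired only
// depends on the highest set bit, so OR-ing all encoded deltas together
// yields the same bit count as taking their maximum; the same trick is
// reused below for the start pointer deltas.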
for (int i = 0; i < blockChunks; ++i) {
final int delta = docBase - avgChunkDocs * i;
maxDelta |= zigZagEncode(delta);
docBase += docBaseDeltas[i];
}
final int bitsPerDocBase = PackedInts.bitsRequired(maxDelta);
fieldsIndexOut.writeVInt(bitsPerDocBase);
PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsIndexOut,
PackedInts.Format.PACKED, blockChunks, bitsPerDocBase, 1);
docBase = 0;
for (int i = 0; i < blockChunks; ++i) {
final long delta = docBase - avgChunkDocs * i;
assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue();
writer.add(zigZagEncode(delta));
docBase += docBaseDeltas[i];
}
writer.finish();
// start pointers
fieldsIndexOut.writeVLong(firstStartPointer);
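// avgChunkSize is the average gap between consecutive chunk start pointers:
// maxStartPointer - firstStartPointer spans blockChunks - 1 gaps, so
// firstStartPointer + avgChunkSize * i approximates the i-th start pointer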
final long avgChunkSize;
if (blockChunks == 1) {
avgChunkSize = 0;
} else {
avgChunkSize = (maxStartPointer - firstStartPointer) / (blockChunks - 1);
}
fieldsIndexOut.writeVLong(avgChunkSize);
long startPointer = 0;
maxDelta = 0;
for (int i = 0; i < blockChunks; ++i) {
startPointer += startPointerDeltas[i];
final long delta = startPointer - avgChunkSize * i;
maxDelta |= zigZagEncode(delta);
}
final int bitsPerStartPointer = PackedInts.bitsRequired(maxDelta);
fieldsIndexOut.writeVInt(bitsPerStartPointer);
writer = PackedInts.getWriterNoHeader(fieldsIndexOut, PackedInts.Format.PACKED,
blockChunks, bitsPerStartPointer, 1);
startPointer = 0;
for (int i = 0; i < blockChunks; ++i) {
startPointer += startPointerDeltas[i];
final long delta = startPointer - avgChunkSize * i;
assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue();
writer.add(zigZagEncode(delta));
}
writer.finish();
}
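/**
* Registers one chunk of compressed documents: numDocs is the number of
* documents in the chunk and startPointer is the chunk's start offset in the
* fields data file. Flushes the current block first if it already contains
* blockSize chunks.
*/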
void writeIndex(int numDocs, long startPointer) throws IOException {
if (blockChunks == blockSize) {
writeBlock();
reset();
}
if (firstStartPointer == -1) {
firstStartPointer = maxStartPointer = startPointer;
}
assert firstStartPointer > 0 && startPointer >= firstStartPointer;
docBaseDeltas[blockChunks] = numDocs;
startPointerDeltas[blockChunks] = startPointer - maxStartPointer;
++blockChunks;
blockDocs += numDocs;
totalDocs += numDocs;
maxStartPointer = startPointer;
}
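/**
* Flushes the last (possibly partial) block, then writes the BlocksEndMarker,
* the maximum pointer into the fields data file, and the codec footer.
* numDocs must match the total number of documents added via writeIndex.
*/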
void finish(int numDocs, long maxPointer) throws IOException {
if (numDocs != totalDocs) {
throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs);
}
if (blockChunks > 0) {
writeBlock();
}
fieldsIndexOut.writeVInt(0); // end marker
fieldsIndexOut.writeVLong(maxPointer);
CodecUtil.writeFooter(fieldsIndexOut);
}
@Override
public void close() throws IOException {
fieldsIndexOut.close();
}
}