All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.compressing;


import static org.apache.lucene.util.BitUtil.zigZagEncode;

import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Efficient index format for block-based {@link Codec}s.
 * 

This writer generates a file which can be loaded into memory using * memory-efficient data structures to quickly locate the block that contains * any document. *

In order to have a compact in-memory representation, for every block of * 1024 chunks, this index computes the average number of bytes per * chunk and for every chunk, only stores the difference between

    *
  • ${chunk number} * ${average length of a chunk}
  • *
  • and the actual start offset of the chunk
*

Data is written as follows: *

    *
  • PackedIntsVersion, <Block>BlockCount, BlocksEndMarker
  • *
  • PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}
  • *
  • BlocksEndMarker --> 0 as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with 0
  • *
  • Block --> BlockChunks, <DocBases>, <StartPointers>
  • *
  • BlockChunks --> a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block
  • *
  • DocBases --> DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas
  • *
  • DocBase --> first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}
  • *
  • AvgChunkDocs --> average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}
  • *
  • BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using ZigZag encoding
  • *
  • DocBaseDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using ZigZag encoding.
  • *
  • StartPointers --> StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas
  • *
  • StartPointerBase --> the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}
  • *
  • AvgChunkSize --> the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}
  • *
  • BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using ZigZag encoding
  • *
  • StartPointerDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using ZigZag encoding
  • *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter}
  • *
*

Notes *

    *
  • For any block, the doc base of the n-th chunk can be restored with * DocBase + AvgChunkDocs * n + DocBaseDeltas[n].
  • *
  • For any block, the start pointer of the n-th chunk can be restored with * StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n].
  • *
  • Once data is loaded into memory, you can lookup the start pointer of any * document by performing two binary searches: a first one based on the values * of DocBase in order to find the right block, and then inside the block based * on DocBaseDeltas (by reconstructing the doc bases for every chunk).
  • *
* @lucene.internal */ public final class CompressingStoredFieldsIndexWriter implements Closeable { final IndexOutput fieldsIndexOut; final int blockSize; int totalDocs; int blockDocs; int blockChunks; long firstStartPointer; long maxStartPointer; final int[] docBaseDeltas; final long[] startPointerDeltas; CompressingStoredFieldsIndexWriter(IndexOutput indexOutput, int blockSize) throws IOException { if (blockSize <= 0) { throw new IllegalArgumentException("blockSize must be positive"); } this.blockSize = blockSize; this.fieldsIndexOut = indexOutput; reset(); totalDocs = 0; docBaseDeltas = new int[blockSize]; startPointerDeltas = new long[blockSize]; fieldsIndexOut.writeVInt(PackedInts.VERSION_CURRENT); } private void reset() { blockChunks = 0; blockDocs = 0; firstStartPointer = -1; // means unset } private void writeBlock() throws IOException { assert blockChunks > 0; fieldsIndexOut.writeVInt(blockChunks); // The trick here is that we only store the difference from the average start // pointer or doc base, this helps save bits per value. 
// And in order to prevent a few chunks that would be far from the average to // raise the number of bits per value for all of them, we only encode blocks // of 1024 chunks at once // See LUCENE-4512 // doc bases final int avgChunkDocs; if (blockChunks == 1) { avgChunkDocs = 0; } else { avgChunkDocs = Math.round((float) (blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1)); } fieldsIndexOut.writeVInt(totalDocs - blockDocs); // docBase fieldsIndexOut.writeVInt(avgChunkDocs); int docBase = 0; long maxDelta = 0; for (int i = 0; i < blockChunks; ++i) { final int delta = docBase - avgChunkDocs * i; maxDelta |= zigZagEncode(delta); docBase += docBaseDeltas[i]; } final int bitsPerDocBase = PackedInts.bitsRequired(maxDelta); fieldsIndexOut.writeVInt(bitsPerDocBase); PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsIndexOut, PackedInts.Format.PACKED, blockChunks, bitsPerDocBase, 1); docBase = 0; for (int i = 0; i < blockChunks; ++i) { final long delta = docBase - avgChunkDocs * i; assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue(); writer.add(zigZagEncode(delta)); docBase += docBaseDeltas[i]; } writer.finish(); // start pointers fieldsIndexOut.writeVLong(firstStartPointer); final long avgChunkSize; if (blockChunks == 1) { avgChunkSize = 0; } else { avgChunkSize = (maxStartPointer - firstStartPointer) / (blockChunks - 1); } fieldsIndexOut.writeVLong(avgChunkSize); long startPointer = 0; maxDelta = 0; for (int i = 0; i < blockChunks; ++i) { startPointer += startPointerDeltas[i]; final long delta = startPointer - avgChunkSize * i; maxDelta |= zigZagEncode(delta); } final int bitsPerStartPointer = PackedInts.bitsRequired(maxDelta); fieldsIndexOut.writeVInt(bitsPerStartPointer); writer = PackedInts.getWriterNoHeader(fieldsIndexOut, PackedInts.Format.PACKED, blockChunks, bitsPerStartPointer, 1); startPointer = 0; for (int i = 0; i < blockChunks; ++i) { startPointer += startPointerDeltas[i]; final long delta = startPointer - 
avgChunkSize * i; assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue(); writer.add(zigZagEncode(delta)); } writer.finish(); } void writeIndex(int numDocs, long startPointer) throws IOException { if (blockChunks == blockSize) { writeBlock(); reset(); } if (firstStartPointer == -1) { firstStartPointer = maxStartPointer = startPointer; } assert firstStartPointer > 0 && startPointer >= firstStartPointer; docBaseDeltas[blockChunks] = numDocs; startPointerDeltas[blockChunks] = startPointer - maxStartPointer; ++blockChunks; blockDocs += numDocs; totalDocs += numDocs; maxStartPointer = startPointer; } void finish(int numDocs, long maxPointer) throws IOException { if (numDocs != totalDocs) { throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs); } if (blockChunks > 0) { writeBlock(); } fieldsIndexOut.writeVInt(0); // end marker fieldsIndexOut.writeVLong(maxPointer); CodecUtil.writeFooter(fieldsIndexOut); } @Override public void close() throws IOException { fieldsIndexOut.close(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy