/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.compressing;
import static org.apache.lucene.util.BitUtil.zigZagEncode;
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;
/**
* Efficient index format for block-based {@link Codec}s.
* This writer generates a file which can be loaded into memory using
* memory-efficient data structures to quickly locate the block that contains
* any document.
*
* In order to have a compact in-memory representation, for every block of
* 1024 chunks, this index computes the average number of bytes per
* chunk and, for every chunk, only stores the difference between
* - ${chunk number} * ${average length of a chunk}
* - and the actual start offset of the chunk
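*
* For example (illustrative numbers, not from a real index): with an average
* chunk size of 100 bytes and actual chunk start offsets of 0, 102 and 197,
* the stored deltas are
* <pre>
*   delta[0] = 0   - 0 * 100 =  0  (ZigZag: 0)
*   delta[1] = 102 - 1 * 100 = +2  (ZigZag: 4)
*   delta[2] = 197 - 2 * 100 = -3  (ZigZag: 5)
* </pre>
* so 3 bits per value are enough, instead of the full width of an absolute
* file offset.
*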
* Data is written as follows:
*
* - PackedIntsVersion, <Block>^BlockCount, BlocksEndMarker
* - PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}
* - BlocksEndMarker --> 0 as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with 0
* - Block --> BlockChunks, <DocBases>, <StartPointers>
* - BlockChunks --> a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block
* - DocBases --> DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas
* - DocBase --> first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}
* - AvgChunkDocs --> average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}
* - BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using ZigZag encoding
* - DocBaseDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using ZigZag encoding.
* - StartPointers --> StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas
* - StartPointerBase --> the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}
* - AvgChunkSize --> the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}
* - BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using ZigZag encoding
* - StartPointerDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using ZigZag encoding
* - Footer --> {@link CodecUtil#writeFooter CodecFooter}
*
* Notes
*
* - For any block, the doc base of the n-th chunk can be restored with
*   DocBase + AvgChunkDocs * n + DocBaseDeltas[n].
* - For any block, the start pointer of the n-th chunk can be restored with
*   StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n].
* - Once data is loaded into memory, you can look up the start pointer of any
* document by performing two binary searches: a first one based on the values
* of DocBase in order to find the right block, and then inside the block based
* on DocBaseDeltas (by reconstructing the doc bases for every chunk), as
* sketched below.
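*
* A minimal reader-side sketch of the second search, assuming the block's
* metadata and ZigZag-decoded delta arrays have already been loaded into
* plain Java variables (all names are illustrative):
* <pre>{@code
* // decoded from one block: docBase, avgChunkDocs, docBaseDeltas[],
* // startPointerBase, avgChunkSize, startPointerDeltas[]
* int lo = 0, hi = blockChunks - 1;
* while (lo < hi) { // find the last chunk whose doc base is <= docID
*   int mid = (lo + hi + 1) >>> 1;
*   if (docBase + avgChunkDocs * mid + docBaseDeltas[mid] <= docID) {
*     lo = mid;
*   } else {
*     hi = mid - 1;
*   }
* }
* long startPointer = startPointerBase + avgChunkSize * lo + startPointerDeltas[lo];
* }</pre>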
*
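* A minimal sketch of the writing side (the chunk loop, Chunk, fieldsStream
* and writeChunk are hypothetical stand-ins for the caller's code, not part
* of this class):
* <pre>{@code
* CompressingStoredFieldsIndexWriter indexWriter =
*     new CompressingStoredFieldsIndexWriter(indexOutput, 1024);
* for (Chunk chunk : chunks) {
*   long startPointer = fieldsStream.getFilePointer(); // where the chunk data starts
*   writeChunk(fieldsStream, chunk);                   // hypothetical helper
*   indexWriter.writeIndex(chunk.numDocs(), startPointer);
* }
* indexWriter.finish(totalDocs, fieldsStream.getFilePointer());
* indexWriter.close();
* }</pre>
*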
* @lucene.internal
*/
public final class CompressingStoredFieldsIndexWriter implements Closeable {
final IndexOutput fieldsIndexOut;
final int blockSize;
int totalDocs;
int blockDocs;
int blockChunks;
long firstStartPointer;
long maxStartPointer;
final int[] docBaseDeltas;
final long[] startPointerDeltas;
CompressingStoredFieldsIndexWriter(IndexOutput indexOutput, int blockSize) throws IOException {
if (blockSize <= 0) {
throw new IllegalArgumentException("blockSize must be positive");
}
this.blockSize = blockSize;
this.fieldsIndexOut = indexOutput;
reset();
totalDocs = 0;
docBaseDeltas = new int[blockSize];
startPointerDeltas = new long[blockSize];
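// the index file starts with the PackedInts version, i.e. the
// PackedIntsVersion header from the format description above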
fieldsIndexOut.writeVInt(PackedInts.VERSION_CURRENT);
}
private void reset() {
blockChunks = 0;
blockDocs = 0;
firstStartPointer = -1; // means unset
}
private void writeBlock() throws IOException {
assert blockChunks > 0;
fieldsIndexOut.writeVInt(blockChunks);
// The trick here is that we only store the difference from the average start
// pointer or doc base, which saves bits per value.
// And in order to prevent a few chunks that are far from the average from
// raising the number of bits per value for all of them, we only encode
// blocks of blockSize chunks (1024 by default) at once.
// See LUCENE-4512
// doc bases
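// The average is computed over the first blockChunks - 1 chunks only:
// docBaseDeltas[i] holds the doc count of chunk i, so
// blockDocs - docBaseDeltas[blockChunks - 1] is the doc base of the last
// chunk, and dividing it by blockChunks - 1 gives the average spacing
// between consecutive doc bases, making avgChunkDocs * i a good predictor
// of the i-th doc base.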
final int avgChunkDocs;
if (blockChunks == 1) {
avgChunkDocs = 0;
} else {
avgChunkDocs = Math.round((float) (blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1));
}
fieldsIndexOut.writeVInt(totalDocs - blockDocs); // docBase
fieldsIndexOut.writeVInt(avgChunkDocs);
int docBase = 0;
long maxDelta = 0;
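// ZigZag-encoded deltas are non-negative and PackedInts.bitsRequired only
// depends on the highest set bit, so OR-ing all encoded deltas together
// yields the same bit count as taking their maximum; the same trick is
// reused below for the start pointer deltas.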
for (int i = 0; i < blockChunks; ++i) {
final int delta = docBase - avgChunkDocs * i;
maxDelta |= zigZagEncode(delta);
docBase += docBaseDeltas[i];
}
final int bitsPerDocBase = PackedInts.bitsRequired(maxDelta);
fieldsIndexOut.writeVInt(bitsPerDocBase);
PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsIndexOut,
PackedInts.Format.PACKED, blockChunks, bitsPerDocBase, 1);
docBase = 0;
for (int i = 0; i < blockChunks; ++i) {
final long delta = docBase - avgChunkDocs * i;
assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue();
writer.add(zigZagEncode(delta));
docBase += docBaseDeltas[i];
}
writer.finish();
// start pointers
fieldsIndexOut.writeVLong(firstStartPointer);
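// avgChunkSize is the average gap between consecutive chunk start pointers:
// maxStartPointer - firstStartPointer spans blockChunks - 1 gaps, so
// firstStartPointer + avgChunkSize * i approximates the i-th start pointer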
final long avgChunkSize;
if (blockChunks == 1) {
avgChunkSize = 0;
} else {
avgChunkSize = (maxStartPointer - firstStartPointer) / (blockChunks - 1);
}
fieldsIndexOut.writeVLong(avgChunkSize);
long startPointer = 0;
maxDelta = 0;
for (int i = 0; i < blockChunks; ++i) {
startPointer += startPointerDeltas[i];
final long delta = startPointer - avgChunkSize * i;
maxDelta |= zigZagEncode(delta);
}
final int bitsPerStartPointer = PackedInts.bitsRequired(maxDelta);
fieldsIndexOut.writeVInt(bitsPerStartPointer);
writer = PackedInts.getWriterNoHeader(fieldsIndexOut, PackedInts.Format.PACKED,
blockChunks, bitsPerStartPointer, 1);
startPointer = 0;
for (int i = 0; i < blockChunks; ++i) {
startPointer += startPointerDeltas[i];
final long delta = startPointer - avgChunkSize * i;
assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue();
writer.add(zigZagEncode(delta));
}
writer.finish();
}
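/**
* Registers one chunk of compressed documents: numDocs is the number of
* documents in the chunk and startPointer is the chunk's start offset in the
* fields data file. Flushes the current block first if it already contains
* blockSize chunks.
*/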
void writeIndex(int numDocs, long startPointer) throws IOException {
if (blockChunks == blockSize) {
writeBlock();
reset();
}
if (firstStartPointer == -1) {
firstStartPointer = maxStartPointer = startPointer;
}
assert firstStartPointer > 0 && startPointer >= firstStartPointer;
docBaseDeltas[blockChunks] = numDocs;
startPointerDeltas[blockChunks] = startPointer - maxStartPointer;
++blockChunks;
blockDocs += numDocs;
totalDocs += numDocs;
maxStartPointer = startPointer;
}
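/**
* Flushes the last (possibly partial) block, then writes the BlocksEndMarker,
* the maximum pointer into the fields data file, and the codec footer.
* numDocs must match the total number of documents added via writeIndex.
*/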
void finish(int numDocs, long maxPointer) throws IOException {
if (numDocs != totalDocs) {
throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs);
}
if (blockChunks > 0) {
writeBlock();
}
fieldsIndexOut.writeVInt(0); // end marker
fieldsIndexOut.writeVLong(maxPointer);
CodecUtil.writeFooter(fieldsIndexOut);
}
@Override
public void close() throws IOException {
fieldsIndexOut.close();
}
}