
org.apache.cassandra.index.sai.disk.v1.postings.PostingsWriter Maven / Gradle / Ivy


The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sai.disk.v1.postings;


import java.io.Closeable;
import java.io.IOException;
import javax.annotation.concurrent.NotThreadSafe;

import com.google.common.annotations.VisibleForTesting;

import org.agrona.collections.LongArrayList;
import org.apache.cassandra.index.sai.disk.ResettableByteBuffersIndexOutput;
import org.apache.cassandra.index.sai.disk.format.IndexComponent;
import org.apache.cassandra.index.sai.disk.format.IndexDescriptor;
import org.apache.cassandra.index.sai.utils.IndexIdentifier;
import org.apache.cassandra.index.sai.disk.io.IndexOutputWriter;
import org.apache.cassandra.index.sai.disk.v1.SAICodecUtils;
import org.apache.cassandra.index.sai.postings.PostingList;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.DirectWriter;

import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.Math.max;

/**
 * Encodes, compresses and writes postings lists to disk.
 * <p>
 * All postings in the posting list are delta-encoded, then the deltas are divided into blocks for compression.
 * The deltas are based on the final value of the previous block. For the first block in the posting list,
 * the first value in the block is written as a VLong prior to the block delta encodings.
 * <p>
 * In packed blocks, longs are encoded with the same bit width (FoR compression). The block size (i.e. the number
 * of longs inside a block) is fixed (currently 128). Additionally, blocks in which every value is the same are
 * encoded in an optimized way.
 * <p>
 * In VLong blocks, longs are compressed with {@link DataOutput#writeVLong}. The block size is variable.
 * <p>
 * Packed blocks are favoured: when the postings are long enough, {@link PostingsWriter} will try to encode most
 * data as packed blocks. Taking a term with 259 postings as an example, the first 256 postings are encoded as two
 * packed blocks, while the remaining 3 are encoded as one VLong block.
 * <p>
 * Each posting list ends with a block summary containing metadata and a skip table, written right after all
 * postings blocks. The skip interval is the same as the block size, and each skip entry points to the end of its
 * block. The skip table consists of the block offsets and the last value of each block, compressed as two FoR
 * blocks.
 * <p>
 * Visual representation of the disk format:
 * <pre>
 *
 * +========+========================+=====+==============+===============+===============+=====+========================+========+
 * | HEADER | POSTINGS LIST (TERM 1)                                                      | ... | POSTINGS LIST (TERM N) | FOOTER |
 * +========+========================+=====+==============+===============+===============+=====+========================+========+
 *          | FIRST VALUE| FOR BLOCK (1)| ... | FOR BLOCK (N)| BLOCK SUMMARY              |
 *          +---------------------------+-----+--------------+---------------+------------+
 *                                                           | BLOCK SIZE    |            |
 *                                                           | LIST SIZE     | SKIP TABLE |
 *                                                           +---------------+------------+
 *                                                                           | BLOCKS POS.|
 *                                                                           | MAX VALUES |
 *                                                                           +------------+
 *
 *
 * </pre>
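 * <p>
 * A concrete illustration (the numbers here are illustrative, not from the original documentation): with a block
 * size of 4, the sorted postings {10, 13, 15, 20, 22} are stored as the VLong 10 followed by the deltas
 * {0, 3, 2, 5, 2}. The first four deltas form one packed block written at 3 bits per value (enough for the maximum
 * delta, 5), and the remaining delta goes into a final block that is zero-padded up to the block size.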
 */
@NotThreadSafe
public class PostingsWriter implements Closeable
{
    // from Lucene50PostingsFormat.BLOCK_SIZE
    private static final int BLOCK_SIZE = 128;

    private static final String POSTINGS_MUST_BE_SORTED_ERROR_MSG = "Postings must be sorted ascending, got [%s] after [%s]";

    private final IndexOutput dataOutput;
    private final int blockSize;
    private final long[] deltaBuffer;
    private final LongArrayList blockOffsets = new LongArrayList();
    private final LongArrayList blockMaximumPostings = new LongArrayList();
    private final ResettableByteBuffersIndexOutput inMemoryOutput = new ResettableByteBuffersIndexOutput("blockOffsets");

    private final long startOffset;

    private int bufferUpto;
    private long firstPosting = Long.MIN_VALUE;
    private long lastPosting = Long.MIN_VALUE;
    private long maxDelta;
    private long totalPostings;

    public PostingsWriter(IndexDescriptor indexDescriptor, IndexIdentifier indexIdentifier) throws IOException
    {
        this(indexDescriptor, indexIdentifier, BLOCK_SIZE);
    }

    public PostingsWriter(IndexOutputWriter dataOutput) throws IOException
    {
        this(dataOutput, BLOCK_SIZE);
    }

    @VisibleForTesting
    PostingsWriter(IndexDescriptor indexDescriptor, IndexIdentifier indexIdentifier, int blockSize) throws IOException
    {
        this(indexDescriptor.openPerIndexOutput(IndexComponent.POSTING_LISTS, indexIdentifier, true), blockSize);
    }

    private PostingsWriter(IndexOutputWriter dataOutput, int blockSize) throws IOException
    {
        this.blockSize = blockSize;
        this.dataOutput = dataOutput;
        startOffset = dataOutput.getFilePointer();
        deltaBuffer = new long[blockSize];
        SAICodecUtils.writeHeader(dataOutput);
    }

    /**
     * @return current file pointer
     */
    public long getFilePointer()
    {
        return dataOutput.getFilePointer();
    }

    /**
     * @return file pointer where the index structure begins (before the header)
     */
    public long getStartOffset()
    {
        return startOffset;
    }

    /**
     * Writes the footer to the postings.
     */
    public void complete() throws IOException
    {
        SAICodecUtils.writeFooter(dataOutput);
    }

    @Override
    public void close() throws IOException
    {
        dataOutput.close();
    }

    /**
     * Encodes, compresses and flushes the given posting list to disk.
     *
     * @param postings posting list to write to disk
     *
     * @return file offset to the summary block of this posting list
     */
    public long write(PostingList postings) throws IOException
    {
        checkArgument(postings != null, "Expected non-null posting list.");
        checkArgument(postings.size() > 0, "Expected non-empty posting list.");

        lastPosting = Long.MIN_VALUE;
        resetBlockCounters();
        blockOffsets.clear();
        blockMaximumPostings.clear();

        long posting;
        // When posting lists are merged we don't know the exact size, just an upper bound,
        // so we need to count how many postings we actually added ourselves.
        int size = 0;
        while ((posting = postings.nextPosting()) != PostingList.END_OF_STREAM)
        {
            writePosting(posting);
            size++;
            totalPostings++;
        }
        assert size > 0 : "No postings were written";

        finish();

        final long summaryOffset = dataOutput.getFilePointer();
        writeSummary(size);
        return summaryOffset;
    }

    public long getTotalPostings()
    {
        return totalPostings;
    }

    private void writePosting(long posting) throws IOException
    {
        if (lastPosting == Long.MIN_VALUE)
        {
            // First posting in the list: remember it so it can be written as a VLong
            // ahead of the first block, and store a zero delta in its place.
            firstPosting = posting;
            deltaBuffer[bufferUpto++] = 0;
        }
        else
        {
            if (posting < lastPosting)
                throw new IllegalArgumentException(String.format(POSTINGS_MUST_BE_SORTED_ERROR_MSG, posting, lastPosting));

            long delta = posting - lastPosting;
            maxDelta = max(maxDelta, delta);
            deltaBuffer[bufferUpto++] = delta;
        }
        lastPosting = posting;

        if (bufferUpto == blockSize)
        {
            addBlockToSkipTable();
            writePostingsBlock();
            resetBlockCounters();
        }
    }

    private void finish() throws IOException
    {
        if (bufferUpto > 0)
        {
            addBlockToSkipTable();
            writePostingsBlock();
        }
    }

    private void resetBlockCounters()
    {
        firstPosting = Long.MIN_VALUE;
        bufferUpto = 0;
        maxDelta = 0;
    }

    private void addBlockToSkipTable()
    {
        blockOffsets.add(dataOutput.getFilePointer());
        blockMaximumPostings.add(lastPosting);
    }

    private void writeSummary(int exactSize) throws IOException
    {
        dataOutput.writeVInt(blockSize);
        dataOutput.writeVInt(exactSize);
        writeSkipTable();
    }

    private void writeSkipTable() throws IOException
    {
        assert blockOffsets.size() == blockMaximumPostings.size();
        dataOutput.writeVInt(blockOffsets.size());

        // Compress the offsets in memory first, to know the exact length (with padding).
        inMemoryOutput.reset();
        writeSortedFoRBlock(blockOffsets, inMemoryOutput);
        dataOutput.writeVLong(inMemoryOutput.getFilePointer());
        inMemoryOutput.copyTo(dataOutput);

        writeSortedFoRBlock(blockMaximumPostings, dataOutput);
    }

    private void writePostingsBlock() throws IOException
    {
        final int bitsPerValue = maxDelta == 0 ? 0 : DirectWriter.unsignedBitsRequired(maxDelta);

        // If we have a first posting, indicating that this is the first block in the
        // posting list, then write it prior to the deltas.
        if (firstPosting != Long.MIN_VALUE)
            dataOutput.writeVLong(firstPosting);

        dataOutput.writeByte((byte) bitsPerValue);

        if (bitsPerValue > 0)
        {
            final DirectWriter writer = DirectWriter.getInstance(dataOutput, blockSize, bitsPerValue);
            for (int index = 0; index < bufferUpto; ++index)
            {
                writer.add(deltaBuffer[index]);
            }
            if (bufferUpto < blockSize)
            {
                // Pad the rest of the block with zeros, so we don't write stale
                // values left over from previous blocks.
                for (int index = bufferUpto; index < blockSize; index++)
                {
                    writer.add(0);
                }
            }
            writer.finish();
        }
    }

    private void writeSortedFoRBlock(LongArrayList values, IndexOutput output) throws IOException
    {
        assert values.size() > 0;
        // The values are sorted ascending, so the last one is the maximum.
        final long maxValue = values.getLong(values.size() - 1);

        final int bitsPerValue = maxValue == 0 ? 0 : DirectWriter.unsignedBitsRequired(maxValue);
        output.writeByte((byte) bitsPerValue);
        if (bitsPerValue > 0)
        {
            final DirectWriter writer = DirectWriter.getInstance(output, values.size(), bitsPerValue);
            for (int i = 0; i < values.size(); ++i)
            {
                writer.add(values.getLong(i));
            }
            writer.finish();
        }
    }
}
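
For readers who want to see the core encoding idea in isolation, below is a minimal, self-contained sketch. It is hypothetical code (not part of Cassandra or Lucene) that only mirrors the delta and bit-width bookkeeping performed by writePosting() and writePostingsBlock(); bitsRequired() stands in for Lucene's DirectWriter.unsignedBitsRequired(), and the real writer additionally packs the deltas to disk and zero-pads the final block to the fixed block size.

import java.util.Arrays;

// Hypothetical, standalone sketch of the delta + FoR bookkeeping described in the
// javadoc above; it is NOT the on-disk format and is not part of Cassandra.
public class PostingsDeltaSketch
{
    static final int BLOCK_SIZE = 4; // the real writer uses 128

    // Stand-in for DirectWriter.unsignedBitsRequired(), guarded for zero like the writer.
    static int bitsRequired(long maxValue)
    {
        return maxValue == 0 ? 0 : 64 - Long.numberOfLeadingZeros(maxValue);
    }

    public static void main(String[] args)
    {
        long[] postings = { 10, 13, 15, 20, 22 }; // must be sorted ascending

        // Delta-encode: the first value is kept aside (the writer emits it as a VLong)
        // and contributes a zero delta, exactly as writePosting() does.
        long[] deltas = new long[postings.length];
        for (int i = 1; i < postings.length; i++)
            deltas[i] = postings[i] - postings[i - 1];

        System.out.println("first value (written as VLong): " + postings[0]);

        // Split the deltas into fixed-size blocks and report the bit width each block
        // would be packed with (the real writer pads the last block with zeros).
        for (int start = 0; start < deltas.length; start += BLOCK_SIZE)
        {
            long[] block = Arrays.copyOfRange(deltas, start, Math.min(start + BLOCK_SIZE, deltas.length));
            long maxDelta = Arrays.stream(block).max().orElse(0);
            System.out.printf("block %s -> %d bits per value%n", Arrays.toString(block), bitsRequired(maxDelta));
        }
    }
}

Running the sketch prints the first value followed by one line per block with its deltas and packed bit width, matching the layout sketched in the diagram above.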



