org.apache.cassandra.io.sstable.IndexSummaryBuilder

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.io.sstable;

import java.io.IOException;
import java.nio.ByteOrder;
import java.util.Map;
import java.util.TreeMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.io.util.Memory;
import org.apache.cassandra.io.util.SafeMemoryWriter;

import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;

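/**
 * Builds an IndexSummary by sampling every minIndexInterval-th key appended (thinned further according
 * to the given samplingLevel), while tracking the most recent summary boundary that is fully covered by
 * the flushed portions of both the index and data files.
 */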
public class IndexSummaryBuilder implements AutoCloseable
{
    private static final Logger logger = LoggerFactory.getLogger(IndexSummaryBuilder.class);

    // for each sampled entry, the 4-byte offset into the entries region at which that entry begins
    private final SafeMemoryWriter offsets;
    // the sampled keys themselves, each followed by the 8-byte position of its record in the index file
    private final SafeMemoryWriter entries;

    private final int minIndexInterval;
    private final int samplingLevel;
    private final int[] startPoints;
    private long keysWritten = 0;
    private long indexIntervalMatches = 0;
    private long nextSamplePosition;

    // for each ReadableBoundary, we map its dataLength property to itself, permitting us to lookup the
    // last readable boundary from the perspective of the data file
    // [data file position limit] => [ReadableBoundary]
    private TreeMap<Long, ReadableBoundary> lastReadableByData = new TreeMap<>();
    // for each ReadableBoundary, we map its indexLength property to itself, permitting us to lookup the
    // last readable boundary from the perspective of the index file
    // [index file position limit] => [ReadableBoundary]
    private TreeMap<Long, ReadableBoundary> lastReadableByIndex = new TreeMap<>();
    // the last synced data file position
    private long dataSyncPosition;
    // the last synced index file position
    private long indexSyncPosition;

    // the last summary interval boundary that is fully readable in both data and index files
    private ReadableBoundary lastReadableBoundary;

    /**
     * Represents a boundary that is guaranteed fully readable in the summary, index file and data file.
     * The key contained is the last key readable if the index and data files have been flushed to the
     * stored lengths.
     */
    public static class ReadableBoundary
    {
        public final DecoratedKey lastKey;
        public final long indexLength;
        public final long dataLength;
        public final int summaryCount;
        public final long entriesLength;
        public ReadableBoundary(DecoratedKey lastKey, long indexLength, long dataLength, int summaryCount, long entriesLength)
        {
            this.lastKey = lastKey;
            this.indexLength = indexLength;
            this.dataLength = dataLength;
            this.summaryCount = summaryCount;
            this.entriesLength = entriesLength;
        }
    }

    public IndexSummaryBuilder(long expectedKeys, int minIndexInterval, int samplingLevel)
    {
        this.samplingLevel = samplingLevel;
        this.startPoints = Downsampling.getStartPoints(BASE_SAMPLING_LEVEL, samplingLevel);

        long maxExpectedEntries = expectedKeys / minIndexInterval;
        if (maxExpectedEntries > Integer.MAX_VALUE)
        {
            // that's a _lot_ of keys, and a very low min index interval
            int effectiveMinInterval = (int) Math.ceil(expectedKeys / (double) Integer.MAX_VALUE);
            maxExpectedEntries = expectedKeys / effectiveMinInterval;
            assert maxExpectedEntries <= Integer.MAX_VALUE : maxExpectedEntries;
            logger.warn("min_index_interval of {} is too low for {} expected keys; using interval of {} instead",
                        minIndexInterval, expectedKeys, effectiveMinInterval);
            this.minIndexInterval = effectiveMinInterval;
        }
        else
        {
            this.minIndexInterval = minIndexInterval;
        }

        // for initializing data structures, adjust our estimates based on the sampling level
        maxExpectedEntries = Math.max(1, (maxExpectedEntries * samplingLevel) / BASE_SAMPLING_LEVEL);
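        // each offset is a 4-byte int into the entries region; the 40-bytes-per-entry estimate below
        // assumes roughly 32 bytes of key plus the 8-byte index position per sampled entry
        // (SafeMemoryWriter grows beyond these initial sizes if needed)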
        offsets = new SafeMemoryWriter(4 * maxExpectedEntries).order(ByteOrder.nativeOrder());
        entries = new SafeMemoryWriter(40 * maxExpectedEntries).order(ByteOrder.nativeOrder());

        // the summary will always contain the first index entry (downsampling will never remove it)
        nextSamplePosition = 0;
        indexIntervalMatches++;
    }

    // the index file has been flushed to the provided position; stash it and use that to recalculate our max readable boundary
    public void markIndexSynced(long upToPosition)
    {
        indexSyncPosition = upToPosition;
        refreshReadableBoundary();
    }

    // the data file has been flushed to the provided position; stash it and use that to recalculate our max readable boundary
    public void markDataSynced(long upToPosition)
    {
        dataSyncPosition = upToPosition;
        refreshReadableBoundary();
    }

    private void refreshReadableBoundary()
    {
        // grab the readable boundary prior to the given position in either the data or index file
        Map.Entry<Long, ReadableBoundary> byData = lastReadableByData.floorEntry(dataSyncPosition);
        Map.Entry<Long, ReadableBoundary> byIndex = lastReadableByIndex.floorEntry(indexSyncPosition);
        if (byData == null || byIndex == null)
            return;

        // take the lowest of the two, and stash it
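        // (boundaries are recorded in append order, so their indexLength values are monotonic and
        // comparing indexLength picks the earlier of the two)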
        lastReadableBoundary = byIndex.getValue().indexLength < byData.getValue().indexLength
                               ? byIndex.getValue() : byData.getValue();

        // clear our data prior to this, since we no longer need it
        lastReadableByData.headMap(lastReadableBoundary.dataLength, false).clear();
        lastReadableByIndex.headMap(lastReadableBoundary.indexLength, false).clear();
    }

    public ReadableBoundary getLastReadableBoundary()
    {
        return lastReadableBoundary;
    }

    public IndexSummaryBuilder maybeAddEntry(DecoratedKey decoratedKey, long indexStart) throws IOException
    {
        return maybeAddEntry(decoratedKey, indexStart, 0, 0);
    }

    /**
     * Adds an entry for the given key if it falls on the next sample position; otherwise, if it is the
     * last key before the next sample and boundaries are being tracked, records it as a readable boundary.
     *
     * @param decoratedKey the key for this record
     * @param indexStart the position in the index file at which this record begins
     * @param indexEnd the position in the index file we need to be able to read to (exclusive) to read this record
     * @param dataEnd the position in the data file we need to be able to read to (exclusive) to read this record;
     *                a value of 0 indicates we are not tracking readable boundaries
     * @return this builder
     */
    public IndexSummaryBuilder maybeAddEntry(DecoratedKey decoratedKey, long indexStart, long indexEnd, long dataEnd) throws IOException
    {
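        // keysWritten is the ordinal of this key; sample it only when it matches the next scheduled
        // sample position, recording the offset at which its entry begins, the key bytes themselves,
        // and the position of its record in the index file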
        if (keysWritten == nextSamplePosition)
        {
            assert entries.length() <= Integer.MAX_VALUE;
            offsets.writeInt((int) entries.length());
            entries.write(decoratedKey.getKey());
            entries.writeLong(indexStart);
            setNextSamplePosition(keysWritten);
        }
        else if (dataEnd != 0 && keysWritten + 1 == nextSamplePosition)
        {
            // this is the last key in this summary interval, so stash it
            ReadableBoundary boundary = new ReadableBoundary(decoratedKey, indexEnd, dataEnd, (int)(offsets.length() / 4), entries.length());
            lastReadableByData.put(dataEnd, boundary);
            lastReadableByIndex.put(indexEnd, boundary);
        }
        keysWritten++;

        return this;
    }

    // calculate the next key we will store to our summary
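    // each candidate position is minIndexInterval keys after the previous one; candidates whose ordinal
    // (indexIntervalMatches) falls on the start point of a downsampling round removed at this
    // samplingLevel are skipped, so only samplingLevel out of every BASE_SAMPLING_LEVEL candidates are
    // kept. For example, at samplingLevel == BASE_SAMPLING_LEVEL nothing is skipped and every
    // minIndexInterval-th key is sampled; at samplingLevel == BASE_SAMPLING_LEVEL / 2 half the
    // candidates are skipped, for an effective interval of 2 * minIndexInterval
    // (see calculateSamplingLevel below)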
    private void setNextSamplePosition(long position)
    {
        tryAgain: while (true)
        {
            position += minIndexInterval;
            long test = indexIntervalMatches++;
            for (int start : startPoints)
                if ((test - start) % BASE_SAMPLING_LEVEL == 0)
                    continue tryAgain;

            nextSamplePosition = position;
            return;
        }
    }

    public void prepareToCommit()
    {
        // this method should only be called when we've finished appending records, so we truncate the
        // memory we're using to the exact amount required to represent it before building our summary
        entries.setCapacity(entries.length());
        offsets.setCapacity(offsets.length());
    }

    public IndexSummary build(IPartitioner partitioner)
    {
        return build(partitioner, null);
    }

    // build the summary up to the provided boundary; this is backed by shared memory between
    // multiple invocations of this build method
    public IndexSummary build(IPartitioner partitioner, ReadableBoundary boundary)
    {
        assert entries.length() > 0;

        int count = (int) (offsets.length() / 4);
        long entriesLength = entries.length();
        if (boundary != null)
        {
            count = boundary.summaryCount;
            entriesLength = boundary.entriesLength;
        }

        int sizeAtFullSampling = (int) Math.ceil(keysWritten / (double) minIndexInterval);
        assert count > 0;
        return new IndexSummary(partitioner, offsets.currentBuffer().sharedCopy(),
                                count, entries.currentBuffer().sharedCopy(), entriesLength,
                                sizeAtFullSampling, minIndexInterval, samplingLevel);
    }

    // close the builder and release any associated memory
    public void close()
    {
        entries.close();
        offsets.close();
    }

    public Throwable close(Throwable accumulate)
    {
        accumulate = entries.close(accumulate);
        accumulate = offsets.close(accumulate);
        return accumulate;
    }

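    // the number of summary entries that would be retained at the given samplingLevel, for a summary
    // holding maxSummarySize entries at full (BASE_SAMPLING_LEVEL) sampling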
    public static int entriesAtSamplingLevel(int samplingLevel, int maxSummarySize)
    {
        return (int) Math.ceil((samplingLevel * maxSummarySize) / (double) BASE_SAMPLING_LEVEL);
    }

    public static int calculateSamplingLevel(int currentSamplingLevel, int currentNumEntries, long targetNumEntries, int minIndexInterval, int maxIndexInterval)
    {
        // effective index interval == (BASE_SAMPLING_LEVEL / samplingLevel) * minIndexInterval
        // so we can just solve for minSamplingLevel here:
        // maxIndexInterval == (BASE_SAMPLING_LEVEL / minSamplingLevel) * minIndexInterval
        int effectiveMinSamplingLevel = Math.max(1, (int) Math.ceil((BASE_SAMPLING_LEVEL * minIndexInterval) / (double) maxIndexInterval));

        // Algebraic explanation for calculating the new sampling level (solve for newSamplingLevel):
        // originalNumEntries = (baseSamplingLevel / currentSamplingLevel) * currentNumEntries
        // newSpaceUsed = (newSamplingLevel / baseSamplingLevel) * originalNumEntries
        // newSpaceUsed = (newSamplingLevel / baseSamplingLevel) * (baseSamplingLevel / currentSamplingLevel) * currentNumEntries
        // newSpaceUsed = (newSamplingLevel / currentSamplingLevel) * currentNumEntries
        // (newSpaceUsed * currentSamplingLevel) / currentNumEntries = newSamplingLevel
        int newSamplingLevel = (int) (targetNumEntries * currentSamplingLevel) / currentNumEntries;
        return Math.min(BASE_SAMPLING_LEVEL, Math.max(effectiveMinSamplingLevel, newSamplingLevel));
    }

    /**
     * Downsamples an existing index summary to a new sampling level.
     * @param existing an existing IndexSummary
     * @param newSamplingLevel the target level for the new IndexSummary.  This must be less than the current sampling
     *                         level for {@code existing}.
     * @param minIndexInterval the min index interval, which must match that of {@code existing}
     * @param partitioner the partitioner used for the index summary
     * @return a new IndexSummary
     */
    @SuppressWarnings("resource")
    public static IndexSummary downsample(IndexSummary existing, int newSamplingLevel, int minIndexInterval, IPartitioner partitioner)
    {
        // To downsample the old index summary, we'll go through (potentially) several rounds of downsampling.
        // Conceptually, each round starts at position X and then removes every Nth item.  The value of X follows
        // a particular pattern to evenly space out the items that we remove.  The value of N decreases by one each
        // round.

        int currentSamplingLevel = existing.getSamplingLevel();
        assert currentSamplingLevel > newSamplingLevel;
        assert minIndexInterval == existing.getMinIndexInterval();

        // calculate starting indexes for downsampling rounds
        int[] startPoints = Downsampling.getStartPoints(currentSamplingLevel, newSamplingLevel);

        // calculate new off-heap size
        int newKeyCount = existing.size();
        long newEntriesLength = existing.getEntriesLength();
        for (int start : startPoints)
        {
            for (int j = start; j < existing.size(); j += currentSamplingLevel)
            {
                newKeyCount--;
                long length = existing.getEndInSummary(j) - existing.getPositionInSummary(j);
                newEntriesLength -= length;
            }
        }

        Memory oldEntries = existing.getEntries();
        Memory newOffsets = Memory.allocate(newKeyCount * 4);
        Memory newEntries = Memory.allocate(newEntriesLength);

        // Copy old entries to our new Memory.
        int i = 0;
        int newEntriesOffset = 0;
        outer:
        for (int oldSummaryIndex = 0; oldSummaryIndex < existing.size(); oldSummaryIndex++)
        {
            // to determine if we can skip this entry, go through the starting points for our downsampling rounds
            // and see if the entry's index is covered by that round
            for (int start : startPoints)
            {
                if ((oldSummaryIndex - start) % currentSamplingLevel == 0)
                    continue outer;
            }

            // write the position of the actual entry in the index summary (4 bytes)
            newOffsets.setInt(i * 4, newEntriesOffset);
            i++;
            long start = existing.getPositionInSummary(oldSummaryIndex);
            long length = existing.getEndInSummary(oldSummaryIndex) - start;
            newEntries.put(newEntriesOffset, oldEntries, start, length);
            newEntriesOffset += length;
        }
        assert newEntriesOffset == newEntriesLength;
        return new IndexSummary(partitioner, newOffsets, newKeyCount, newEntries, newEntriesLength,
                                existing.getMaxNumberOfEntries(), minIndexInterval, newSamplingLevel);
    }
}
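
A minimal usage sketch follows. It is illustrative only and not part of the Cassandra source: the
example class name, the synthetic sequential keys, and the per-entry index sizes are assumptions, and
a real SSTable writer supplies keys in token-sorted order rather than generating them in a loop.

import java.nio.ByteBuffer;

import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.dht.Murmur3Partitioner;
import org.apache.cassandra.io.sstable.IndexSummary;
import org.apache.cassandra.io.sstable.IndexSummaryBuilder;

import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;

public class IndexSummaryBuilderExample
{
    public static void main(String[] args) throws Exception
    {
        Murmur3Partitioner partitioner = new Murmur3Partitioner();
        long expectedKeys = 1_000_000;

        // sample every 128th key at the full (BASE) sampling level
        try (IndexSummaryBuilder builder = new IndexSummaryBuilder(expectedKeys, 128, BASE_SAMPLING_LEVEL))
        {
            long indexPosition = 0;
            for (long i = 0; i < expectedKeys; i++)
            {
                // synthetic keys for illustration; a real writer passes keys in token order
                DecoratedKey key = partitioner.decorateKey(ByteBuffer.allocate(8).putLong(0, i));
                builder.maybeAddEntry(key, indexPosition);
                indexPosition += 32; // pretend each index entry occupies 32 bytes
            }

            builder.prepareToCommit();
            IndexSummary summary = builder.build(partitioner);
            System.out.println("summary entries: " + summary.size()); // roughly expectedKeys / 128
            summary.close();
        }
    }
}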



