
org.apache.cassandra.io.sstable.indexsummary.IndexSummary

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.io.sstable.indexsummary;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.PartitionPosition;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.io.sstable.Downsampling;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.io.util.Memory;
import org.apache.cassandra.io.util.MemoryOutputStream;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Pair;
import org.apache.cassandra.utils.concurrent.Ref;
import org.apache.cassandra.utils.concurrent.WrappedSharedCloseable;
import org.apache.cassandra.utils.memory.MemoryUtil;

import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;

/*
 * Layout of Memory for index summaries:
 *
 * There are two sections:
 *  1. A "header" containing the offset into `entries` of each entry in the summary, consisting of
 *     one four-byte position per entry.  This allows us to do simple math in getPositionInSummary()
 *     to find the position in the Memory at which the actual index summary entry starts.
 *     (This is necessary because keys can have different lengths.)
 *  2. A sequence of (DecoratedKey, position) pairs, where position is the offset into the actual index file.
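 *
 * For example (illustrative numbers, not taken from a real file): with three entries the header is
 * 12 bytes of offsets into `entries`; entry i spans [offset(i), offset(i + 1)) (the last entry ends
 * at entriesLength), and the final 8 bytes of each entry hold that key's position in the index file.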
 */
public class IndexSummary extends WrappedSharedCloseable
{
    private static final Logger logger = LoggerFactory.getLogger(IndexSummary.class);
    public static final IndexSummarySerializer serializer = new IndexSummarySerializer();

    /**
     * A lower bound for the average number of partitions in between each index summary entry. A lower value means
     * that more partitions will have an entry in the index summary when at the full sampling level.
     */
    private final int minIndexInterval;

    private final IPartitioner partitioner;
    private final int sizeAtFullSampling;
    // we permit the memory to span a range larger than we use,
    // so we have an accompanying count and length for each part
    // we split our data into two ranges: offsets (indexing into entries),
    // and entries containing the summary data
    private final Memory offsets;
    private final int offsetCount;
    // entries is a list of (partition key, index file offset) pairs
    private final Memory entries;
    private final long entriesLength;

    /**
     * A value between 1 and BASE_SAMPLING_LEVEL that represents how many of the original
     * index summary entries ((1 / indexInterval) * numKeys) have been retained.
     *
     * Thus, this summary contains (samplingLevel / BASE_SAMPLING_LEVEL) * (1 / indexInterval) * numKeys entries.
     */
    private final int samplingLevel;

    public IndexSummary(IPartitioner partitioner, Memory offsets, int offsetCount, Memory entries, long entriesLength,
                        int sizeAtFullSampling, int minIndexInterval, int samplingLevel)
    {
        super(new Memory[] { offsets, entries });
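        // in memory, offsets are relative to the start of the entries region (the serializer rebases
        // them when writing to disk), so the first offset must be zero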
        assert offsets.getInt(0) == 0;
        this.partitioner = partitioner;
        this.minIndexInterval = minIndexInterval;
        this.offsetCount = offsetCount;
        this.entriesLength = entriesLength;
        this.sizeAtFullSampling = sizeAtFullSampling;
        this.offsets = offsets;
        this.entries = entries;
        this.samplingLevel = samplingLevel;
        assert samplingLevel > 0;
    }

    private IndexSummary(IndexSummary copy)
    {
        super(copy);
        this.partitioner = copy.partitioner;
        this.minIndexInterval = copy.minIndexInterval;
        this.offsetCount = copy.offsetCount;
        this.entriesLength = copy.entriesLength;
        this.sizeAtFullSampling = copy.sizeAtFullSampling;
        this.offsets = copy.offsets;
        this.entries = copy.entries;
        this.samplingLevel = copy.samplingLevel;
    }

    // binary search is notoriously more difficult to get right than it looks; this is lifted from
    // Harmony's Collections implementation.  As with Arrays.binarySearch, a non-negative result is the
    // index of an exact match, and a negative result encodes the insertion point (the index of the
    // first summary entry greater than the key) as -(insertionPoint + 1).
    public int binarySearch(PartitionPosition key)
    {
        // We will be comparing non-native Keys, so use a buffer with appropriate byte order
        ByteBuffer hollow = MemoryUtil.getHollowDirectByteBuffer().order(ByteOrder.BIG_ENDIAN);
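        // fillTemporaryKey repoints this hollow buffer at each probed key in off-heap memory,
        // avoiding an allocation and copy per comparison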
        int low = 0, mid = offsetCount, high = mid - 1, result = -1;
        while (low <= high)
        {
            mid = (low + high) >> 1;
            fillTemporaryKey(mid, hollow);
            result = -DecoratedKey.compareTo(partitioner, hollow, key);
            if (result > 0)
            {
                low = mid + 1;
            }
            else if (result == 0)
            {
                return mid;
            }
            else
            {
                high = mid - 1;
            }
        }

        return -mid - (result < 0 ? 1 : 2);
    }

    /**
     * Gets the position of the actual index summary entry in our Memory attribute, 'entries'.
     * @param index The index of the entry or key to get the position for
     * @return an offset into our Memory attribute where the actual entry resides
     */
    public int getPositionInSummary(int index)
    {
        // The offsets section holds a four-byte position for each entry in the summary, so just multiply by 4.
        return offsets.getInt(index << 2);
    }

    public byte[] getKey(int index)
    {
        long start = getPositionInSummary(index);
        // the last 8 bytes of each entry hold the index-file position, so the key is everything before them
        int keySize = (int) (calculateEnd(index) - start - 8L);
        byte[] key = new byte[keySize];
        entries.getBytes(start, key, 0, keySize);
        return key;
    }

    private void fillTemporaryKey(int index, ByteBuffer buffer)
    {
        long start = getPositionInSummary(index);
        int keySize = (int) (calculateEnd(index) - start - 8L);
        entries.setByteBuffer(buffer, start, keySize);
    }

    public void addTo(Ref.IdentityCollection identities)
    {
        super.addTo(identities);
        identities.add(offsets);
        identities.add(entries);
    }

    public long getPosition(int index)
    {
        return entries.getLong(calculateEnd(index) - 8);
    }

    public long getEndInSummary(int index)
    {
        return calculateEnd(index);
    }

    private long calculateEnd(int index)
    {
        return index == (offsetCount - 1) ? entriesLength : getPositionInSummary(index + 1);
    }

    public int getMinIndexInterval()
    {
        return minIndexInterval;
    }

    public double getEffectiveIndexInterval()
    {
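        // e.g., at samplingLevel == BASE_SAMPLING_LEVEL / 2, half of the original entries have been
        // dropped, so on average one summary entry covers 2 * minIndexInterval partitions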
        return (BASE_SAMPLING_LEVEL / (double) samplingLevel) * minIndexInterval;
    }

    /**
     * Returns an estimate of the total number of keys in the SSTable.
     */
    public long getEstimatedKeyCount()
    {
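        // at full sampling there is roughly one summary entry per minIndexInterval partitions; the +1
        // rounds the estimate up to cover the final, possibly partial, interval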
        return ((long) getMaxNumberOfEntries() + 1) * minIndexInterval;
    }

    public int size()
    {
        return offsetCount;
    }

    public int getSamplingLevel()
    {
        return samplingLevel;
    }

    /**
     * Returns the number of entries this summary would have if it were at the full sampling level, which is equal
     * to the number of entries in the primary on-disk index divided by the min index interval.
     */
    public int getMaxNumberOfEntries()
    {
        return sizeAtFullSampling;
    }

    /**
     * Returns the amount of off-heap memory used for the entries portion of this summary.
     * @return size in bytes
     */
    long getEntriesLength()
    {
        return entriesLength;
    }

    Memory getOffsets()
    {
        return offsets;
    }

    Memory getEntries()
    {
        return entries;
    }

    public long getOffHeapSize()
    {
        return offsetCount * 4 + entriesLength;
    }

    /**
     * Returns the number of primary (on-disk) index entries between the index summary entry at `index` and the next
     * index summary entry (assuming there is one).  Without any downsampling, this will always be equivalent to
     * the index interval.
     *
     * @param index the index of an index summary entry (between zero and size() - 1)
     *
     * @return the number of partitions after `index` until the next partition with a summary entry
     */
    public int getEffectiveIndexIntervalAfterIndex(int index)
    {
        return Downsampling.getEffectiveIndexIntervalAfterIndex(index, samplingLevel, minIndexInterval);
    }

    public List<SSTableReader.IndexesBounds> getSampleIndexesForRanges(Collection<Range<Token>> ranges)
    {
        // use the index to determine a minimal section for each range
        List<SSTableReader.IndexesBounds> positions = new ArrayList<>();

        for (Range<Token> range : Range.normalize(ranges))
        {
            PartitionPosition leftPosition = range.left.maxKeyBound();
            PartitionPosition rightPosition = range.right.maxKeyBound();

            int left = binarySearch(leftPosition);
            if (left < 0)
                left = (left + 1) * -1;
            else
                // ranges are start-exclusive on the left, so skip the matched key
                left = left + 1;
            if (left == size())
                // left is past the end of the sampling
                continue;

            // a wrap-around range extends to the end of the token space, so it ends at the last summary entry
            int right = Range.isWrapAround(range.left, range.right)
                        ? size() - 1
                        : binarySearch(rightPosition);
            if (right < 0)
            {
                // ranges are end-inclusive, so we use the index before the insertion point binarySearch
                // gives us, since that will be the last index we return
                right = (right + 1) * -1;
                if (right == 0)
                    // means the first key is already strictly greater than the right bound
                    continue;
                right--;
            }

            if (left > right)
                // empty range
                continue;
            positions.add(new SSTableReader.IndexesBounds(left, right));
        }
        return positions;
    }

    public Iterable<byte[]> getKeySamples(final Range<Token> range)
    {
        final List<SSTableReader.IndexesBounds> indexRanges = getSampleIndexesForRanges(Collections.singletonList(range));

        if (indexRanges.isEmpty())
            return Collections.emptyList();

        return () -> new Iterator<byte[]>()
        {
            private Iterator<SSTableReader.IndexesBounds> rangeIter = indexRanges.iterator();
            private SSTableReader.IndexesBounds current;
            private int idx;

            public boolean hasNext()
            {
                if (current == null || idx > current.upperPosition)
                {
                    if (rangeIter.hasNext())
                    {
                        current = rangeIter.next();
                        idx = current.lowerPosition;
                        return true;
                    }
                    return false;
                }

                return true;
            }

            public byte[] next()
            {
                return getKey(idx++);
            }

            public void remove()
            {
                throw new UnsupportedOperationException();
            }
        };
    }

    public long getScanPosition(PartitionPosition key)
    {
        return getScanPositionFromBinarySearchResult(binarySearch(key));
    }

    @VisibleForTesting
    public long getScanPositionFromBinarySearchResult(int binarySearchResult)
    {
        if (binarySearchResult == -1)
            return 0;
        else
            return getPosition(getIndexFromBinarySearchResult(binarySearchResult));
    }

    public static int getIndexFromBinarySearchResult(int binarySearchResult)
    {
        if (binarySearchResult < 0)
        {
            // binary search gives us the first index _greater_ than the key searched for,
            // i.e., its insertion position
            int greaterThan = (binarySearchResult + 1) * -1;
            if (greaterThan == 0)
                return -1;
            return greaterThan - 1;
        }
        else
        {
            return binarySearchResult;
        }
    }

    public IndexSummary sharedCopy()
    {
        return new IndexSummary(this);
    }

    public static class IndexSummarySerializer
    {
        public void serialize(IndexSummary t, DataOutputPlus out) throws IOException
        {
            out.writeInt(t.minIndexInterval);
            out.writeInt(t.offsetCount);
            out.writeLong(t.getOffHeapSize());
            out.writeInt(t.samplingLevel);
            out.writeInt(t.sizeAtFullSampling);
            // our on-disk representation treats the offsets and the summary data as one contiguous structure,
            // in which the offsets are based from the start of the structure. i.e., if the offsets occupy
            // X bytes, the value of the first offset will be X. In memory we split the two regions up, so that
            // the summary values are indexed from zero, so we apply a correction to the offsets when de/serializing.
            // In this case adding X to each of the offsets.
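            // e.g., with offsetCount = 3 the offsets occupy 12 bytes, so an in-memory offset of 0 is written as 12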
            int baseOffset = t.offsetCount * 4;
            for (int i = 0 ; i < t.offsetCount ; i++)
            {
                int offset = t.offsets.getInt(i * 4) + baseOffset;
                // our serialization format for this file uses native byte order, so if this is different to the
                // default Java serialization order (BIG_ENDIAN) we have to reverse our bytes
                offset = Integer.reverseBytes(offset);
                out.writeInt(offset);
            }
            out.write(t.entries, 0, t.entriesLength);
        }

        @SuppressWarnings("resource")
        public <T extends InputStream & DataInputPlus> IndexSummary deserialize(T in, IPartitioner partitioner, int expectedMinIndexInterval, int maxIndexInterval) throws IOException
        {
            int minIndexInterval = in.readInt();
            if (minIndexInterval != expectedMinIndexInterval)
            {
                throw new IOException(String.format("Cannot read index summary because min_index_interval changed from %d to %d.",
                                                    minIndexInterval, expectedMinIndexInterval));
            }

            int offsetCount = in.readInt();
            long offheapSize = in.readLong();
            int samplingLevel = in.readInt();
            int fullSamplingSummarySize = in.readInt();

            int effectiveIndexInterval = (int) Math.ceil((BASE_SAMPLING_LEVEL / (double) samplingLevel) * minIndexInterval);
            if (effectiveIndexInterval > maxIndexInterval)
            {
                throw new IOException(String.format("Rebuilding index summary because the effective index interval (%d) is higher than" +
                                                    " the current max index interval (%d)", effectiveIndexInterval, maxIndexInterval));
            }

            Memory offsets = Memory.allocate(offsetCount * 4);
            Memory entries = Memory.allocate(offheapSize - offsets.size());
            try
            {
                FBUtilities.copy(in, new MemoryOutputStream(offsets), offsets.size());
                FBUtilities.copy(in, new MemoryOutputStream(entries), entries.size());
            }
            catch (IOException ioe)
            {
                offsets.free();
                entries.free();
                throw ioe;
            }
            // our on-disk representation treats the offsets and the summary data as one contiguous structure,
            // in which the offsets are based from the start of the structure. i.e., if the offsets occupy
            // X bytes, the value of the first offset will be X. In memory we split the two regions up, so that
            // the summary values are indexed from zero, so we apply a correction to the offsets when de/serializing.
            // In this case subtracting X from each of the offsets.
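            // e.g., with offsetCount = 3, a serialized first offset of 12 becomes 0 in memory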
            for (int i = 0 ; i < offsets.size() ; i += 4)
                offsets.setInt(i, (int) (offsets.getInt(i) - offsets.size()));
            return new IndexSummary(partitioner, offsets, offsetCount, entries, entries.size(), fullSamplingSummarySize, minIndexInterval, samplingLevel);
        }

        /**
         * Deserializes the first and last key stored in the summary.
         *
         * Only for use by offline tools like SSTableMetadataViewer, otherwise SSTable.first/last should be used.
         */
        public Pair<DecoratedKey, DecoratedKey> deserializeFirstLastKey(DataInputStreamPlus in, IPartitioner partitioner) throws IOException
        {
            in.skipBytes(4); // minIndexInterval
            int offsetCount = in.readInt();
            long offheapSize = in.readLong();
            in.skipBytes(8); // samplingLevel, fullSamplingSummarySize
            in.skipBytes(offsetCount * 4);
            in.skipBytes((int) (offheapSize - offsetCount * 4));

            DecoratedKey first = partitioner.decorateKey(ByteBufferUtil.readWithLength(in));
            DecoratedKey last = partitioner.decorateKey(ByteBufferUtil.readWithLength(in));
            return Pair.create(first, last);
        }
    }
}




