/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.io.util;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.util.function.Supplier;
import com.google.common.util.concurrent.RateLimiter;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.io.compress.CompressedSequentialWriter;
import org.apache.cassandra.io.sstable.Component;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.IndexSummary;
import org.apache.cassandra.io.sstable.IndexSummaryBuilder;
import org.apache.cassandra.io.sstable.format.Version;
import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
import org.apache.cassandra.utils.CLibrary;
import org.apache.cassandra.utils.concurrent.RefCounted;
import org.apache.cassandra.utils.concurrent.SharedCloseableImpl;
import static org.apache.cassandra.utils.Throwables.maybeFail;
/**
* Abstracts a read-only file that has been split into segments, each of which can be represented by an independent
* FileDataInput. Allows for iteration over the FileDataInputs, or random access to the FileDataInput for a given
* position.
*
* The JVM can only map up to 2GB at a time, so each segment is at most that size when using mmap i/o. If a segment
* would need to be longer than 2GB, that segment will not be mmap'd, and a new RandomAccessFile will be created for
* each access to that segment.
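*
* A minimal read sketch, assuming a SegmentedFile {@code data} has already been built (the
* variable names below are hypothetical, for illustration only):
* <pre>{@code
* try (RandomAccessReader reader = data.createReader())
* {
*     reader.seek(position); // random access to any offset within the file
*     // ... read from the file through the reader
* }
* }</pre>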
*/
public abstract class SegmentedFile extends SharedCloseableImpl
{
public final ChannelProxy channel;
public final int bufferSize;
public final long length;
// This differs from length for compressed files (but we still need length for
// SegmentIterator because offsets in the file are relative to the uncompressed size)
public final long onDiskLength;
/**
* Use getBuilder to get a Builder to construct a SegmentedFile.
*/
SegmentedFile(Cleanup cleanup, ChannelProxy channel, int bufferSize, long length)
{
this(cleanup, channel, bufferSize, length, length);
}
protected SegmentedFile(Cleanup cleanup, ChannelProxy channel, int bufferSize, long length, long onDiskLength)
{
super(cleanup);
this.channel = channel;
this.bufferSize = bufferSize;
this.length = length;
this.onDiskLength = onDiskLength;
}
protected SegmentedFile(SegmentedFile copy)
{
super(copy);
channel = copy.channel;
bufferSize = copy.bufferSize;
length = copy.length;
onDiskLength = copy.onDiskLength;
}
public String path()
{
return channel.filePath();
}
protected static class Cleanup implements RefCounted.Tidy
{
final ChannelProxy channel;
protected Cleanup(ChannelProxy channel)
{
this.channel = channel;
}
public String name()
{
return channel.filePath();
}
public void tidy()
{
channel.close();
}
}
public abstract SegmentedFile sharedCopy();
public RandomAccessReader createReader()
{
return new RandomAccessReader.Builder(channel)
.overrideLength(length)
.bufferSize(bufferSize)
.build();
}
public RandomAccessReader createReader(RateLimiter limiter)
{
return new RandomAccessReader.Builder(channel)
.overrideLength(length)
.bufferSize(bufferSize)
.limiter(limiter)
.build();
}
public FileDataInput createReader(long position)
{
RandomAccessReader reader = createReader();
reader.seek(position);
return reader;
}
public void dropPageCache(long before)
{
CLibrary.trySkipCache(channel.getFileDescriptor(), 0, before, path());
}
/**
* @return A SegmentedFile.Builder.
*/
public static Builder getBuilder(Config.DiskAccessMode mode, boolean compressed)
{
return compressed ? new CompressedSegmentedFile.Builder(null)
: mode == Config.DiskAccessMode.mmap ? new MmappedSegmentedFile.Builder()
: new BufferedSegmentedFile.Builder();
}
public static Builder getCompressedBuilder(CompressedSequentialWriter writer)
{
return new CompressedSegmentedFile.Builder(writer);
}
/**
* Collects potential segmentation points in an underlying file, and builds a SegmentedFile to represent it.
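*
* A typical flow (the {@code descriptor} and {@code stats} below are hypothetical, for
* illustration only):
* <pre>{@code
* try (SegmentedFile.Builder builder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode(), false))
* {
*     SegmentedFile data = builder.buildData(descriptor, stats);
*     // hand "data" to readers; closing the builder releases only its own channel reference
* }
* }</pre>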
*/
public static abstract class Builder implements AutoCloseable
{
private ChannelProxy channel;
/**
* Called after all potential boundaries have been added to apply this Builder to a concrete file on disk.
* @param channel The channel to the file on disk.
*/
protected abstract SegmentedFile complete(ChannelProxy channel, int bufferSize, long overrideLength);
@SuppressWarnings("resource") // SegmentedFile owns channel
private SegmentedFile complete(String path, int bufferSize, long overrideLength)
{
ChannelProxy channelCopy = getChannel(path);
try
{
return complete(channelCopy, bufferSize, overrideLength);
}
catch (Throwable t)
{
channelCopy.close();
throw t;
}
}
public SegmentedFile buildData(Descriptor desc, StatsMetadata stats, IndexSummaryBuilder.ReadableBoundary boundary)
{
return complete(desc.filenameFor(Component.DATA), bufferSize(stats), boundary.dataLength);
}
public SegmentedFile buildData(Descriptor desc, StatsMetadata stats)
{
return complete(desc.filenameFor(Component.DATA), bufferSize(stats), -1L);
}
public SegmentedFile buildIndex(Descriptor desc, IndexSummary indexSummary, IndexSummaryBuilder.ReadableBoundary boundary)
{
return complete(desc.filenameFor(Component.PRIMARY_INDEX), bufferSize(desc, indexSummary), boundary.indexLength);
}
public SegmentedFile buildIndex(Descriptor desc, IndexSummary indexSummary)
{
return complete(desc.filenameFor(Component.PRIMARY_INDEX), bufferSize(desc, indexSummary), -1L);
}
private static int bufferSize(StatsMetadata stats)
{
return bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
}
private static int bufferSize(Descriptor desc, IndexSummary indexSummary)
{
File file = new File(desc.filenameFor(Component.PRIMARY_INDEX));
return bufferSize(file.length() / indexSummary.size());
}
/**
* Return the buffer size for a given record size. For spinning disks, always add one page.
* For solid state disks, only add one page if the chance of crossing to the next page is more
* than a predefined value; see {@code Config.disk_optimization_page_cross_chance}.
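*
* For example (illustrative arithmetic only): a 6144-byte record starting at a uniformly random
* offset within a page crosses into an extra page with probability (6144 % 4096) / 4096 = 0.5;
* if that is at least the configured chance, one 4096-byte page is added and the result is
* rounded by {@link #roundBufferSize(long)} to 12288 bytes.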
*/
static int bufferSize(long recordSize)
{
Config.DiskOptimizationStrategy strategy = DatabaseDescriptor.getDiskOptimizationStrategy();
if (strategy == Config.DiskOptimizationStrategy.ssd)
{
// The crossing probability is calculated assuming a uniform distribution of record
// start position in a page, so it's the record size modulo the page size divided by
// the total page size.
double pageCrossProbability = (recordSize % 4096) / 4096.;
// add one page if the cross probability is greater than or equal to
// disk_optimization_page_cross_chance (the -1e-16 tolerance absorbs floating point rounding error)
if ((pageCrossProbability - DatabaseDescriptor.getDiskOptimizationPageCrossChance()) > -1e-16)
recordSize += 4096;
return roundBufferSize(recordSize);
}
else if (strategy == Config.DiskOptimizationStrategy.spinning)
{
return roundBufferSize(recordSize + 4096);
}
else
{
throw new IllegalStateException("Unsupported disk optimization strategy: " + strategy);
}
}
/**
* Round up to the next multiple of 4k, but no more than 64k.
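*
* For example: 1 rounds up to 4096, 4097 to 8192, and anything at or above 65536 is capped at 65536.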
*/
static int roundBufferSize(long size)
{
if (size <= 0)
return 4096;
size = (size + 4095) & ~4095;
return (int)Math.min(size, 1 << 16);
}
public void serializeBounds(DataOutput out, Version version) throws IOException
{
if (!version.hasBoundaries())
return;
out.writeUTF(DatabaseDescriptor.getDiskAccessMode().name());
}
public void deserializeBounds(DataInput in, Version version) throws IOException
{
if (!version.hasBoundaries())
return;
if (!in.readUTF().equals(DatabaseDescriptor.getDiskAccessMode().name()))
throw new IOException("Cannot deserialize SSTable Summary component because the DiskAccessMode was changed!");
}
public Throwable close(Throwable accumulate)
{
if (channel != null)
return channel.close(accumulate);
return accumulate;
}
public void close()
{
maybeFail(close(null));
}
private ChannelProxy getChannel(String path)
{
if (channel != null)
{
// This is really fragile: both path and channel.filePath()
// must agree, i.e. they must both be absolute or both relative.
// Eventually we should pass the filePath to the builder
// constructor and remove this.
if (channel.filePath().equals(path))
return channel.sharedCopy();
else
channel.close();
}
channel = new ChannelProxy(path);
return channel.sharedCopy();
}
}
@Override
public String toString()
{
return getClass().getSimpleName() + "(path='" + path() + '\'' +
", length=" + length +
')';
}
}