All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.cassandra.db.compaction.UnifiedCompactionStrategy Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.db.compaction;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.SerializationHeader;
import org.apache.cassandra.db.commitlog.CommitLogPosition;
import org.apache.cassandra.db.commitlog.IntervalSet;
import org.apache.cassandra.db.compaction.unified.Controller;
import org.apache.cassandra.db.compaction.unified.ShardedMultiWriter;
import org.apache.cassandra.db.compaction.unified.UnifiedCompactionTask;
import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.SSTableMultiWriter;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.Clock;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Overlaps;
import org.apache.cassandra.utils.TimeUUID;

/**
 * The design of the unified compaction strategy is described in the accompanying UnifiedCompactionStrategy.md.
 *
 * See CEP-26: https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-26%3A+Unified+Compaction+Strategy
 */
public class UnifiedCompactionStrategy extends AbstractCompactionStrategy
{
    private static final Logger logger = LoggerFactory.getLogger(UnifiedCompactionStrategy.class);

    // Upper bound on the number of levels. This is enough for a few petabytes of data (with the worst case fan
    // factor at W=0 this leaves room for 2^32 sstables, presumably of at least 1MB each).
    static final int MAX_LEVELS = 32;

    // Accepted spellings of a scaling parameter: "N", "L<digits>", "T<digits>" or a plain, optionally signed, integer.
    private static final Pattern SCALING_PARAMETER_PATTERN = Pattern.compile("(N)|L(\\d+)|T(\\d+)|([+-]?\\d+)");
    // The same pattern with the group parentheses removed and "\d" spelled as "[0-9]"; used in error messages.
    private static final String SCALING_PARAMETER_PATTERN_SIMPLIFIED = SCALING_PARAMETER_PATTERN.pattern()
                                                                                                .replaceAll("[()]", "")
                                                                                                .replace("\\d", "[0-9]");

    private final Controller controller;

    // Created lazily and replaced when out of date; see maybeUpdateShardManager().
    private volatile ShardManager shardManager;

    // Time in milliseconds (Clock.Global.currentTimeMillis()) at which the last expired-sstables check was started.
    private long lastExpiredCheck;

    // Estimate produced by the most recent compaction selection.
    protected volatile int estimatedRemainingTasks;
    // The sstables added to this strategy; access is guarded by synchronized methods.
    @VisibleForTesting
    protected final Set<SSTableReader> sstables = new HashSet<>();

    /**
     * Creates the strategy, building its {@link Controller} from the given compaction options.
     *
     * @param cfs     the table this strategy operates on
     * @param options the compaction options
     */
    public UnifiedCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
    {
        this(cfs, options, Controller.fromOptions(cfs, options));
    }

    /**
     * Creates the strategy with an explicitly supplied controller.
     *
     * @param cfs        the table this strategy operates on
     * @param options    the compaction options, passed on to the superclass
     * @param controller the controller to use for all compaction decisions
     */
    public UnifiedCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options, Controller controller)
    {
        super(cfs, options);
        this.controller = controller;
        estimatedRemainingTasks = 0;
    }

    /**
     * Validates the compaction options, first through {@link AbstractCompactionStrategy#validateOptions} and then
     * through {@link Controller#validateOptions}.
     *
     * @param options the options to validate
     * @return the map returned by the controller's validation
     * @throws ConfigurationException if an option is invalid
     */
    public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
    {
        return Controller.validateOptions(AbstractCompactionStrategy.validateOptions(options));
    }

    /**
     * Converts a scaling parameter into the fanout factor.
     *
     * @param w the scaling parameter
     * @return 2 plus the magnitude of {@code w} (see formula in design doc)
     */
    public static int fanoutFromScalingParameter(int w)
    {
        return 2 + Math.abs(w);
    }

    /**
     * Converts a scaling parameter into the compaction threshold.
     *
     * @param w the scaling parameter
     * @return 2 for non-positive {@code w}, otherwise {@code 2 + w} (see formula in design doc)
     */
    public static int thresholdFromScalingParameter(int w)
    {
        return 2 + Math.max(0, w);
    }

    /**
     * Parses the textual form of a scaling parameter.
     * <p>
     * Accepted forms are {@code N} (yielding 0), {@code L<f>} (yielding {@code 2 - f}), {@code T<f>} (yielding
     * {@code f - 2}) and a plain, optionally signed, integer (taken as is). In the {@code L} and {@code T} forms
     * {@code f} must be at least 2.
     *
     * @param value the string to parse
     * @return the scaling parameter
     * @throws ConfigurationException if the string does not match the accepted forms, if the fan factor is lower
     *                                than 2, or if the number does not fit in an {@code int}
     */
    public static int parseScalingParameter(String value)
    {
        Matcher m = SCALING_PARAMETER_PATTERN.matcher(value);
        if (!m.matches())
            throw new ConfigurationException("Scaling parameter " + value + " must match " + SCALING_PARAMETER_PATTERN_SIMPLIFIED);

        try
        {
            if (m.group(1) != null)
                return 0;
            else if (m.group(2) != null)
                return 2 - atLeast2(Integer.parseInt(m.group(2)), value);
            else if (m.group(3) != null)
                return atLeast2(Integer.parseInt(m.group(3)), value) - 2;
            else
                return Integer.parseInt(m.group(4));
        }
        catch (NumberFormatException e)
        {
            // The pattern accepts arbitrarily long digit sequences; report overflow as a configuration problem
            // rather than letting the raw NumberFormatException escape.
            throw new ConfigurationException("Scaling parameter " + value + " is outside the supported integer range", e);
        }
    }

    /**
     * Checks that a fan factor is at least 2.
     *
     * @param value the fan factor to check
     * @param str   the text the value was parsed from, included in the error message
     * @return {@code value}, unchanged
     * @throws ConfigurationException if {@code value} is lower than 2
     */
    private static int atLeast2(int value, String str)
    {
        if (value >= 2)
            return value;

        throw new ConfigurationException("Fan factor cannot be lower than 2 in " + str);
    }

    /**
     * Formats a scaling parameter in the textual form: {@code N} for zero, {@code L<2 - w>} for negative values
     * and {@code T<w + 2>} for positive ones.
     *
     * @param w the scaling parameter
     * @return the textual form
     */
    public static String printScalingParameter(int w)
    {
        if (w == 0)
            return "N";

        return w < 0 ? "L" + (2 - w)
                     : "T" + (w + 2);
    }

    /**
     * Creates the tasks for a maximal compaction of the sstables of this strategy.
     * <p>
     * One task is created for each non-overlapping group of sstables for which a transaction can be obtained;
     * groups whose sstables cannot be marked for compaction are skipped.
     *
     * @param gcBefore    passed on to the created tasks
     * @param splitOutput not used by this implementation
     * @return the created tasks, possibly empty
     */
    @Override
    public synchronized Collection<AbstractCompactionTask> getMaximalTask(long gcBefore, boolean splitOutput)
    {
        maybeUpdateShardManager();
        // The tasks are split by repair status and disk, as well as in non-overlapping sections to enable some
        // parallelism (to the amount that L0 sstables are split, i.e. at least base_shard_count). The result will be
        // split across shards according to its density. Depending on the parallelism, the operation may require up to
        // 100% extra space to complete.
        List<AbstractCompactionTask> tasks = new ArrayList<>();
        List<Set<SSTableReader>> nonOverlapping = splitInNonOverlappingSets(filterSuspectSSTables(getSSTables()));
        for (Set<SSTableReader> set : nonOverlapping)
        {
            @SuppressWarnings("resource")   // closed by the returned task
            LifecycleTransaction txn = cfs.getTracker().tryModify(set, OperationType.COMPACTION);
            if (txn != null)
                tasks.add(createCompactionTask(txn, gcBefore));
        }
        return tasks;
    }

    private static List> splitInNonOverlappingSets(Collection sstables)
    {
        List> overlapSets = Overlaps.constructOverlapSets(new ArrayList<>(sstables),
                                                                             UnifiedCompactionStrategy::startsAfter,
                                                                             SSTableReader.firstKeyComparator,
                                                                             SSTableReader.lastKeyComparator);
        if (overlapSets.isEmpty())
            return overlapSets;

        Set group = overlapSets.get(0);
        List> groups = new ArrayList<>();
        for (int i = 1; i < overlapSets.size(); ++i)
        {
            Set current = overlapSets.get(i);
            if (Sets.intersection(current, group).isEmpty())
            {
                groups.add(group);
                group = current;
            }
            else
            {
                group.addAll(current);
            }
        }
        groups.add(group);
        return groups;
    }

    /**
     * Creates a user-defined compaction task over the given sstables.
     *
     * @param sstables the sstables to compact; must not be empty
     * @param gcBefore passed on to the created task
     * @return the task, flagged as user defined, or {@code null} if the sstables could not be marked for compaction
     */
    @Override
    @SuppressWarnings("resource")   // transaction closed by the returned task
    public AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, final long gcBefore)
    {
        assert !sstables.isEmpty(); // checked for by CM.submitUserDefined

        LifecycleTransaction transaction = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
        if (transaction == null)
        {
            logger.trace("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
            return null;
        }

        return createCompactionTask(transaction, gcBefore).setUserDefined(true);
    }

    /**
     * Returns a compaction task to run next.
     *
     * Picks are requested repeatedly until one of them can be turned into a task, or no pick is available.
     *
     * This method is synchronized because task creation is significantly more expensive in UCS; the strategy is
     * stateless, therefore it has to compute the shard/bucket structure on each call.
     *
     * @param gcBefore throw away tombstones older than this
     * @return the next task, or null if there is nothing to compact
     */
    @Override
    public synchronized UnifiedCompactionTask getNextBackgroundTask(long gcBefore)
    {
        CompactionPick pick;
        while ((pick = getNextCompactionPick(gcBefore)) != null)
        {
            UnifiedCompactionTask task = createCompactionTask(pick, gcBefore);
            if (task != null)
                return task;
        }
        return null;
    }

    /**
     * Turns a compaction pick into a task by marking its sstables as compacting.
     *
     * @param pick     the non-null, non-empty selection of sstables
     * @param gcBefore passed on to the created task
     * @return the task, or null if a transaction over the picked sstables could not be created
     */
    @SuppressWarnings("resource")   // transaction closed by the returned task
    private UnifiedCompactionTask createCompactionTask(CompactionPick pick, long gcBefore)
    {
        Preconditions.checkNotNull(pick);
        Preconditions.checkArgument(!pick.isEmpty());

        LifecycleTransaction txn = cfs.getTracker().tryModify(pick, OperationType.COMPACTION);
        if (txn == null)
        {
            // This can happen e.g. due to a race with upgrade tasks.
            logger.warn("Failed to submit compaction {} because a transaction could not be created. If this happens frequently, it should be reported", pick);
            // This may be an indication of an SSTableReader reference leak. See CASSANDRA-18342.
            return null;
        }

        return createCompactionTask(txn, gcBefore);
    }

    /**
     * Create the sstable writer used for flushing.
     * <p>
     * The flush density is computed from the table's {@code flushSizeOnDisk} metric and the shard manager's
     * coverage figures. The {@code sstableLevel} argument is not passed on to the writer.
     *
     * @return an sstable writer that will split sstables into a number of shards as calculated by the controller for
     *         the expected flush density.
     */
    @Override
    public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                       long keyCount,
                                                       long repairedAt,
                                                       TimeUUID pendingRepair,
                                                       boolean isTransient,
                                                       IntervalSet<CommitLogPosition> commitLogPositions,
                                                       int sstableLevel,
                                                       SerializationHeader header,
                                                       Collection<Index.Group> indexGroups,
                                                       LifecycleNewTracker lifecycleNewTracker)
    {
        ShardManager shardManager = getShardManager();
        double flushDensity = cfs.metric.flushSizeOnDisk.get() * shardManager.shardSetCoverage() / shardManager.localSpaceCoverage();
        ShardTracker boundaries = shardManager.boundaries(controller.getNumShards(flushDensity));
        return new ShardedMultiWriter(cfs,
                                      descriptor,
                                      keyCount,
                                      repairedAt,
                                      pendingRepair,
                                      isTransient,
                                      commitLogPositions,
                                      header,
                                      indexGroups,
                                      lifecycleNewTracker,
                                      boundaries);
    }

    /**
     * Create the task that in turns creates the sstable writer used for compaction.
     *
     * @param transaction the transaction over the sstables to compact
     * @param gcBefore    passed on to the created task
     * @return a sharded compaction task that in turn will create a sharded compaction writer.
     */
    private UnifiedCompactionTask createCompactionTask(LifecycleTransaction transaction, long gcBefore)
    {
        ShardManager currentShardManager = getShardManager();
        return new UnifiedCompactionTask(cfs, this, transaction, gcBefore, currentShardManager);
    }

    /**
     * Makes sure {@link #shardManager} is set and is not out of date for the current ring version, creating a new
     * one with {@link ShardManager#create} if necessary.
     * <p>
     * The check is first done without locking; only when an update appears necessary is the strategy's monitor
     * taken and the check repeated.
     */
    private void maybeUpdateShardManager()
    {
        if (shardManager != null && !shardManager.isOutOfDate(StorageService.instance.getTokenMetadata().getRingVersion()))
            return; // the disk boundaries (and thus the local ranges too) have not changed since the last time we calculated

        synchronized (this)
        {
            // Recheck after entering critical section, another thread may have beaten us to it.
            // The loop repeats the creation if the new manager is itself already out of date.
            while (shardManager == null || shardManager.isOutOfDate(StorageService.instance.getTokenMetadata().getRingVersion()))
                shardManager = ShardManager.create(cfs);
            // Note: this can just as well be done without the synchronization (races would be benign, just doing some
            // redundant work). For the current usages of this blocking is fine and expected to perform no worse.
        }
    }

    /**
     * Returns the shard manager, refreshing it first if it is missing or out of date.
     *
     * @return the current shard manager
     */
    @VisibleForTesting
    ShardManager getShardManager()
    {
        maybeUpdateShardManager();
        return shardManager;
    }

    /**
     * Selects a compaction to run next.
     * <p>
     * Fully expired sstables found by the (rate-limited) expiration check are removed from the candidates and added
     * to the selected pick. If no compaction is selected but expired sstables were found, a pick containing only
     * the expired sstables is returned, with level and overlap set to -1.
     * <p>
     * As a side effect, {@link #estimatedRemainingTasks} is updated from the selection.
     *
     * @param gcBefore used for the expiration check
     * @return the pick, or null if there is nothing to compact
     */
    @VisibleForTesting
    CompactionPick getNextCompactionPick(long gcBefore)
    {
        SelectionContext context = new SelectionContext(controller);
        List<SSTableReader> suitable = getCompactableSSTables(getSSTables(), UnifiedCompactionStrategy::isSuitableForCompaction);
        Set<SSTableReader> expired = maybeGetExpiredSSTables(gcBefore, suitable);
        suitable.removeAll(expired);

        CompactionPick selected = chooseCompactionPick(suitable, context);
        estimatedRemainingTasks = context.estimatedRemainingTasks;
        if (selected == null)
        {
            if (expired.isEmpty())
                return null;
            else
                return new CompactionPick(-1, -1, expired);
        }

        selected.addAll(expired);
        return selected;
    }

    /**
     * Looks for fully expired sstables among the candidates, but only if more time than the controller's
     * expired-sstable check frequency has passed since the previous check.
     *
     * @param gcBefore passed on to the expiration check
     * @param suitable the candidate sstables
     * @return the fully expired sstables, or an empty set if the check was skipped
     */
    private Set<SSTableReader> maybeGetExpiredSSTables(long gcBefore, List<SSTableReader> suitable)
    {
        Set<SSTableReader> expired;
        long ts = Clock.Global.currentTimeMillis();
        if (ts - lastExpiredCheck > controller.getExpiredSSTableCheckFrequency())
        {
            lastExpiredCheck = ts;
            expired = CompactionController.getFullyExpiredSSTables(cfs,
                                                                   suitable,
                                                                   cfs.getOverlappingLiveSSTables(suitable),
                                                                   gcBefore,
                                                                   controller.getIgnoreOverlapsInExpirationCheck());
            if (logger.isTraceEnabled() && !expired.isEmpty())
                logger.trace("Expiration check for {}.{} found {} fully expired SSTables",
                             cfs.getKeyspaceName(),
                             cfs.getTableName(),
                             expired.size());
        }
        else
            expired = Collections.emptySet();
        return expired;
    }

    /**
     * Chooses the compaction to perform among the picks offered by the individual levels.
     * <p>
     * Every level is asked for its pick (which also accumulates the estimated remaining tasks in the context).
     * Levels that offer no pick are ignored; among the others the one with the highest overlap wins.
     *
     * @param suitable the sstables eligible for compaction; sorted in place while forming the levels
     * @param context  the selection context, updated by the levels
     * @return the selected pick, or null if no level has anything to compact
     */
    private CompactionPick chooseCompactionPick(List<SSTableReader> suitable, SelectionContext context)
    {
        // Select the level with the highest overlap; when multiple levels have the same overlap, prefer the lower one
        // (i.e. reduction of RA for bigger token coverage).
        int maxOverlap = -1;
        CompactionPick selected = null;
        for (Level level : formLevels(suitable))
        {
            CompactionPick pick = level.getCompactionPick(context);
            if (pick == null)
                continue;   // Below this level's own threshold. Thresholds can differ between levels, so such a
                            // level may still have a higher overlap than one with a valid pick and must not
                            // displace it.

            int levelOverlap = level.maxOverlap;
            if (levelOverlap > maxOverlap)
            {
                maxOverlap = levelOverlap;
                selected = pick;
            }
        }
        if (logger.isDebugEnabled() && selected != null)
            logger.debug("Selected compaction on level {} overlap {} sstables {}",
                         selected.level, selected.overlap, selected.size());

        return selected;
    }

    /**
     * @return the estimate recorded by the most recent call to {@link #getNextCompactionPick}; 0 before any call
     */
    @Override
    public int getEstimatedRemainingTasks()
    {
        return estimatedRemainingTasks;
    }

    /**
     * @return always {@link Long#MAX_VALUE}
     */
    @Override
    public long getMaxSSTableBytes()
    {
        return Long.MAX_VALUE;
    }

    /**
     * @return the controller this strategy was constructed with
     */
    @VisibleForTesting
    public Controller getController()
    {
        return controller;
    }

    /**
     * Tells whether an sstable may take part in a compaction.
     *
     * @param rdr the sstable to check
     * @return false if the sstable is marked suspect or was opened with reason {@code EARLY}, true otherwise
     */
    public static boolean isSuitableForCompaction(SSTableReader rdr)
    {
        boolean excluded = rdr.isMarkedSuspect() || rdr.openReason == SSTableReader.OpenReason.EARLY;
        return !excluded;
    }

    /**
     * Adds the sstable to the set tracked by this strategy.
     *
     * @param added the sstable to add
     */
    @Override
    public synchronized void addSSTable(SSTableReader added)
    {
        sstables.add(added);
    }

    /**
     * Removes the sstable from the set tracked by this strategy.
     *
     * @param sstable the sstable to remove
     */
    @Override
    public synchronized void removeSSTable(SSTableReader sstable)
    {
        sstables.remove(sstable);
    }

    /**
     * @return an immutable snapshot of the table's live sstables that have also been added to this strategy
     */
    @Override
    protected synchronized Set<SSTableReader> getSSTables()
    {
        // Filter the set of sstables through the live set. This is to ensure no zombie sstables are picked for
        // compaction (see CASSANDRA-18342).
        return ImmutableSet.copyOf(Iterables.filter(cfs.getLiveSSTables(), sstables::contains));
    }

    /**
     * Groups the sstables of this strategy that pass {@link #isSuitableForCompaction} into levels.
     *
     * @return a list of the levels in the compaction hierarchy
     */
    @VisibleForTesting
    List<Level> getLevels()
    {
        return getLevels(getSSTables(), UnifiedCompactionStrategy::isSuitableForCompaction);
    }

    /**
     * Groups the sstables passed in into levels. This is used by the strategy to determine
     * new compactions, and by external tools to analyze the strategy decisions.
     * <p>
     * Sstables that are currently compacting are left out regardless of the filter.
     *
     * @param sstables a collection of the sstables to be assigned to levels
     * @param compactionFilter a filter selecting the sstables to include,
     *                         e.g., {@link #isSuitableForCompaction}
     *
     * @return a list of the levels in the compaction hierarchy
     */
    public List<Level> getLevels(Collection<SSTableReader> sstables,
                                 Predicate<SSTableReader> compactionFilter)
    {
        List<SSTableReader> suitable = getCompactableSSTables(sstables, compactionFilter);
        return formLevels(suitable);
    }

    /**
     * Distributes the given sstables into levels by density.
     * <p>
     * The list is sorted in place by density. Level 0 covers densities below the maximum computed by the controller
     * from the base sstable size; each further level starts at the previous level's maximum. An sstable is placed
     * in the first level whose maximum density is above its own. Empty levels passed on the way are included in the
     * result, but a trailing empty level is not.
     *
     * @param suitable the sstables to distribute; sorted in place
     * @return the levels, in increasing index order
     */
    private List<Level> formLevels(List<SSTableReader> suitable)
    {
        maybeUpdateShardManager();
        List<Level> levels = new ArrayList<>(MAX_LEVELS);
        suitable.sort(shardManager::compareByDensity);

        double maxDensity = controller.getMaxLevelDensity(0, controller.getBaseSstableSize(controller.getFanout(0)) / shardManager.localSpaceCoverage());
        int index = 0;
        Level level = new Level(controller, index, 0, maxDensity);
        for (SSTableReader candidate : suitable)
        {
            final double density = shardManager.density(candidate);
            if (density < level.max)
            {
                level.add(candidate);
                continue;
            }

            level.complete();
            levels.add(level); // add even if empty

            // Advance through the levels until one is found that can hold the candidate.
            while (true)
            {
                ++index;
                double minDensity = maxDensity;
                maxDensity = controller.getMaxLevelDensity(index, minDensity);
                level = new Level(controller, index, minDensity, maxDensity);
                if (density < level.max)
                {
                    level.add(candidate);
                    break;
                }
                else
                {
                    levels.add(level); // add the empty level
                }
            }
        }

        if (!level.sstables.isEmpty())
        {
            level.complete();
            levels.add(level);
        }

        return levels;
    }

    /**
     * Selects the sstables that pass the given filter and are not currently compacting.
     *
     * @param sstables         the sstables to choose from
     * @param compactionFilter the filter an sstable must pass to be included
     * @return a new, modifiable list of the selected sstables
     */
    private List<SSTableReader> getCompactableSSTables(Collection<SSTableReader> sstables,
                                                       Predicate<SSTableReader> compactionFilter)
    {
        Set<SSTableReader> compacting = cfs.getTracker().getCompacting();
        List<SSTableReader> suitable = new ArrayList<>(sstables.size());
        for (SSTableReader rdr : sstables)
        {
            if (compactionFilter.test(rdr) && !compacting.contains(rdr))
                suitable.add(rdr);
        }
        return suitable;
    }

    /**
     * @return the metadata of the table this strategy operates on, as reported by {@code cfs.metadata()}
     */
    public TableMetadata getMetadata()
    {
        return cfs.metadata();
    }

    /**
     * Tells whether sstable {@code a} begins after sstable {@code b} ends.
     *
     * @return true if the first key of {@code a} compares strictly greater than the last key of {@code b}
     */
    private static boolean startsAfter(SSTableReader a, SSTableReader b)
    {
        // Strict comparison because the span is end-inclusive.
        return a.getFirst().compareTo(b.getLast()) > 0;
    }

    /**
     * @return the text "Unified strategy " followed by the table metadata
     */
    @Override
    public String toString()
    {
        return String.format("Unified strategy %s", getMetadata());
    }

    /**
     * A level: index, sstables and some properties.
     */
    public static class Level
    {
        final List sstables;
        final int index;
        final double survivalFactor;
        final int scalingParameter; // scaling parameter used to calculate fanout and threshold
        final int fanout; // fanout factor between levels
        final int threshold; // number of SSTables that trigger a compaction
        final double min; // min density of sstables for this level
        final double max; // max density of sstables for this level
        int maxOverlap = -1; // maximum number of overlapping sstables, i.e. maximum number of sstables that need
                             // to be queried on this level for any given key

        Level(Controller controller, int index, double minSize, double maxSize)
        {
            this.index = index;
            this.survivalFactor = controller.getSurvivalFactor(index);
            this.scalingParameter = controller.getScalingParameter(index);
            this.fanout = controller.getFanout(index);
            this.threshold = controller.getThreshold(index);
            this.sstables = new ArrayList<>(threshold);
            this.min = minSize;
            this.max = maxSize;
        }

        public Collection getSSTables()
        {
            return sstables;
        }

        public int getIndex()
        {
            return index;
        }

        void add(SSTableReader sstable)
        {
            this.sstables.add(sstable);
        }

        void complete()
        {
            if (logger.isTraceEnabled())
                logger.trace("Level: {}", this);
        }

        /**
         * Return the compaction pick for this level.
         * 

* This is done by splitting the level into buckets that we can treat as independent regions for compaction. * We then use the maxOverlap value (i.e. the maximum number of sstables that can contain data for any covered * key) of each bucket to determine if compactions are needed, and to prioritize the buckets that contribute * most to the complexity of queries: if maxOverlap is below the level's threshold, no compaction is needed; * otherwise, we choose one from the buckets that have the highest maxOverlap. */ CompactionPick getCompactionPick(SelectionContext context) { List buckets = getBuckets(context); if (buckets == null) { if (logger.isDebugEnabled()) logger.debug("Level {} sstables {} max overlap {} buckets with compactions {} tasks {}", index, sstables.size(), maxOverlap, 0, 0); return null; // nothing crosses the threshold in this level, nothing to do } int estimatedRemainingTasks = 0; int overlapMatchingCount = 0; Bucket selectedBucket = null; Controller controller = context.controller; for (Bucket bucket : buckets) { // We can have just one pick in each level. Pick one bucket randomly out of the ones with // the highest overlap. // The random() part below implements reservoir sampling with size 1, giving us a uniformly random selection. if (bucket.maxOverlap == maxOverlap && controller.random().nextInt(++overlapMatchingCount) == 0) selectedBucket = bucket; // The estimated remaining tasks is a measure of the remaining amount of work, thus we prefer to // calculate the number of tasks we would do in normal operation, even though we may compact in bigger // chunks when we are late. 
estimatedRemainingTasks += bucket.maxOverlap / threshold; } context.estimatedRemainingTasks += estimatedRemainingTasks; assert selectedBucket != null; if (logger.isDebugEnabled()) logger.debug("Level {} sstables {} max overlap {} buckets with compactions {} tasks {}", index, sstables.size(), maxOverlap, buckets.size(), estimatedRemainingTasks); CompactionPick selected = selectedBucket.constructPick(controller); if (logger.isTraceEnabled()) logger.trace("Returning compaction pick with selected compaction {}", selected); return selected; } /** * Group the sstables in this level into buckets. *

* The buckets are formed by grouping sstables that overlap at some key together, and then expanded to cover * any overlapping sstable according to the overlap inclusion method. With the usual TRANSITIVE method this * results into non-overlapping buckets that can't affect one another and can be compacted in parallel without * any loss of efficiency. *

* Other overlap inclusion methods are provided to cover situations where we may be okay with compacting * sstables partially and doing more than the strictly necessary amount of compaction to solve a problem: e.g. * after an upgrade from LCS where transitive overlap may cause a complete level to be compacted together * (creating an operation that will take a very long time to complete) and we want to make some progress as * quickly as possible at the cost of redoing some work. *

* The number of sstables that overlap at some key defines the "overlap" of a set of sstables. The maximum such * value in the bucket is its "maxOverlap", i.e. the highest number of sstables we need to read to find the * data associated with a given key. */ @VisibleForTesting List getBuckets(SelectionContext context) { List liveSet = sstables; if (logger.isTraceEnabled()) logger.trace("Creating compaction pick with live set {}", liveSet); List> overlaps = Overlaps.constructOverlapSets(liveSet, UnifiedCompactionStrategy::startsAfter, SSTableReader.firstKeyComparator, SSTableReader.lastKeyComparator); for (Set overlap : overlaps) maxOverlap = Math.max(maxOverlap, overlap.size()); if (maxOverlap < threshold) return null; List buckets = Overlaps.assignOverlapsIntoBuckets(threshold, context.controller.overlapInclusionMethod(), overlaps, this::makeBucket); return buckets; } private Bucket makeBucket(List> overlaps, int startIndex, int endIndex) { return endIndex == startIndex + 1 ? new SimpleBucket(this, overlaps.get(startIndex)) : new MultiSetBucket(this, overlaps.subList(startIndex, endIndex)); } @Override public String toString() { return String.format("W: %d, T: %d, F: %d, index: %d, min: %s, max %s, %d sstables, overlap %s", scalingParameter, threshold, fanout, index, densityAsString(min), densityAsString(max), sstables.size(), maxOverlap); } private String densityAsString(double density) { return FBUtilities.prettyPrintBinary(density, "B", " "); } } /** * A compaction bucket, i.e. a selection of overlapping sstables from which a compaction should be selected. 
*/ static abstract class Bucket { final Level level; final List allSSTablesSorted; final int maxOverlap; Bucket(Level level, Collection allSSTablesSorted, int maxOverlap) { // single section this.level = level; this.allSSTablesSorted = new ArrayList<>(allSSTablesSorted); this.allSSTablesSorted.sort(SSTableReader.maxTimestampDescending); // we remove entries from the back this.maxOverlap = maxOverlap; } Bucket(Level level, List> overlapSections) { // multiple sections this.level = level; int maxOverlap = 0; Set all = new HashSet<>(); for (Set section : overlapSections) { maxOverlap = Math.max(maxOverlap, section.size()); all.addAll(section); } this.allSSTablesSorted = new ArrayList<>(all); this.allSSTablesSorted.sort(SSTableReader.maxTimestampDescending); // we remove entries from the back this.maxOverlap = maxOverlap; } /** * Select compactions from this bucket. Normally this would form a compaction out of all sstables in the * bucket, but if compaction is very late we may prefer to act more carefully: * - we should not use more inputs than the permitted maximum * - we should select SSTables in a way that preserves the structure of the compaction hierarchy * These impose a limit on the size of a compaction; to make sure we always reduce the read amplification by * this much, we treat this number as a limit on overlapping sstables, i.e. if A and B don't overlap with each * other but both overlap with C and D, all four will be selected to form a limit-three compaction. A limit-two * one may choose CD, ABC or ABD. * Also, the subset is selected by max timestamp order, oldest first, to avoid violating sstable time order. In * the example above, if B is oldest and C is older than D, the limit-two choice would be ABC (if A is older * than D) or BC (if A is younger, avoiding combining C with A skipping D). * * @param controller The compaction controller. * @return A compaction pick to execute next. 
*/ CompactionPick constructPick(Controller controller) { int count = maxOverlap; int threshold = level.threshold; int fanout = level.fanout; int index = level.index; int maxSSTablesToCompact = Math.max(fanout, controller.maxSSTablesToCompact()); assert count >= threshold; if (count <= fanout) { /** * Happy path. We are not late or (for levelled) we are only so late that a compaction now will * have the same effect as doing levelled compactions one by one. Compact all. We do not cap * this pick at maxSSTablesToCompact due to an assumption that maxSSTablesToCompact is much * greater than F. See {@link Controller#MAX_SSTABLES_TO_COMPACT_OPTION} for more details. */ return new CompactionPick(index, count, allSSTablesSorted); } else if (count <= fanout * controller.getFanout(index + 1) || maxSSTablesToCompact == fanout) { // Compaction is a bit late, but not enough to jump levels via layout compactions. We need a special // case to cap compaction pick at maxSSTablesToCompact. if (count <= maxSSTablesToCompact) return new CompactionPick(index, count, allSSTablesSorted); return new CompactionPick(index, maxSSTablesToCompact, pullOldestSSTables(maxSSTablesToCompact)); } else { // We may, however, have accumulated a lot more than T if compaction is very late. // In this case we pick a compaction in such a way that the result of doing it spreads the data in // a similar way to how compaction would lay them if it was able to keep up. This means: // - for tiered compaction (w >= 0), compact in sets of as many as required to get to a level. // for example, for w=2 and 55 sstables, pick a compaction of 16 sstables (on the next calls, given no // new files, 2 more of 16, 1 of 4, and leaving the other 3 sstables alone). // - for levelled compaction (w < 0), compact all that would reach a level. // for w=-2 and 55, this means pick a compaction of 48 (on the next calls, given no new files, one of // 4, and one of 3 sstables). 
int pickSize = selectPickSize(controller, maxSSTablesToCompact); return new CompactionPick(index, pickSize, pullOldestSSTables(pickSize)); } } private int selectPickSize(Controller controller, int maxSSTablesToCompact) { int pickSize; int fanout = level.fanout; int nextStep = fanout; int index = level.index; int limit = Math.min(maxSSTablesToCompact, maxOverlap); do { pickSize = nextStep; fanout = controller.getFanout(++index); nextStep *= fanout; } while (nextStep <= limit); if (level.scalingParameter < 0) { // For levelled compaction all the sstables that would reach this level need to be compacted to one, // so select the highest multiple of step that fits. pickSize *= limit / pickSize; assert pickSize > 0; } return pickSize; } /** * Pull the oldest sstables to get at most limit-many overlapping sstables to compact in each overlap section. */ abstract Collection pullOldestSSTables(int overlapLimit); } public static class SimpleBucket extends Bucket { public SimpleBucket(Level level, Collection sstables) { super(level, sstables, sstables.size()); } Collection pullOldestSSTables(int overlapLimit) { if (allSSTablesSorted.size() <= overlapLimit) return allSSTablesSorted; return Overlaps.pullLast(allSSTablesSorted, overlapLimit); } } public static class MultiSetBucket extends Bucket { final List> overlapSets; public MultiSetBucket(Level level, List> overlapSets) { super(level, overlapSets); this.overlapSets = overlapSets; } Collection pullOldestSSTables(int overlapLimit) { return Overlaps.pullLastWithOverlapLimit(allSSTablesSorted, overlapSets, overlapLimit); } } /** * Utility class holding a collection of sstables for compaction. 
*/ static class CompactionPick extends ArrayList { final int level; final int overlap; CompactionPick(int level, int overlap, Collection sstables) { super(sstables); this.level = level; this.overlap = overlap; } } static class SelectionContext { final Controller controller; int estimatedRemainingTasks = 0; SelectionContext(Controller controller) { this.controller = controller; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy