/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.compaction;
import java.util.*;
import java.util.Map.Entry;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.primitives.Longs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.cql3.statements.CFPropDefs;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.io.sstable.ColumnNameHelper;
import org.apache.cassandra.io.sstable.SSTableReader;
import org.apache.cassandra.utils.Pair;
public class SizeTieredCompactionStrategy extends AbstractCompactionStrategy
{
private static final Logger logger = LoggerFactory.getLogger(SizeTieredCompactionStrategy.class);
private static final Comparator<Pair<List<SSTableReader>, Double>> bucketsByHotnessComparator = new Comparator<Pair<List<SSTableReader>, Double>>()
{
public int compare(Pair<List<SSTableReader>, Double> o1, Pair<List<SSTableReader>, Double> o2)
{
int comparison = Double.compare(o1.right, o2.right);
if (comparison != 0)
return comparison;
// break ties by compacting the smallest sstables first (this will probably only happen for
// system tables and new/unread sstables)
return Long.compare(avgSize(o1.left), avgSize(o2.left));
}
private long avgSize(List<SSTableReader> sstables)
{
long n = 0;
for (SSTableReader sstable : sstables)
n += sstable.bytesOnDisk();
return n / sstables.size();
}
};
protected SizeTieredCompactionStrategyOptions options;
protected volatile int estimatedRemainingTasks;
private final Set<SSTableReader> sstables = new HashSet<>();
public SizeTieredCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
{
super(cfs, options);
this.estimatedRemainingTasks = 0;
this.options = new SizeTieredCompactionStrategyOptions(options);
}
private List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
{
if (!isEnabled())
return Collections.emptyList();
// make local copies so they can't be changed out from under us mid-method
int minThreshold = cfs.getMinimumCompactionThreshold();
int maxThreshold = cfs.getMaximumCompactionThreshold();
Iterable<SSTableReader> candidates = filterSuspectSSTables(Sets.intersection(cfs.getUncompactingSSTables(), sstables));
candidates = filterColdSSTables(Lists.newArrayList(candidates), options.coldReadsToOmit, cfs.getMinimumCompactionThreshold());
List<List<SSTableReader>> buckets = getBuckets(createSSTableAndLengthPairs(candidates), options.bucketHigh, options.bucketLow, options.minSSTableSize);
logger.debug("Compaction buckets are {}", buckets);
updateEstimatedCompactionsByTasks(buckets);
List<SSTableReader> mostInteresting = mostInterestingBucket(buckets, minThreshold, maxThreshold);
if (!mostInteresting.isEmpty())
return mostInteresting;
// if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
// ratio is greater than threshold.
List<SSTableReader> sstablesWithTombstones = new ArrayList<>();
for (SSTableReader sstable : candidates)
{
if (worthDroppingTombstones(sstable, gcBefore))
sstablesWithTombstones.add(sstable);
}
if (sstablesWithTombstones.isEmpty())
return Collections.emptyList();
Collections.sort(sstablesWithTombstones, new SSTableReader.SizeComparator());
return Collections.singletonList(sstablesWithTombstones.get(0));
}
/**
* Removes as many cold sstables as possible while retaining at least (1 - coldReadsToOmit) of the total
* reads/sec across all sstables.
* @param sstables all sstables to consider
* @param coldReadsToOmit the proportion of total reads/sec that may be omitted (0=omit nothing, 1=omit everything)
* @param minThreshold min compaction threshold
* @return a list of sstables with the coldest sstables excluded until the reads they represent reach the
* coldReadsToOmit fraction of the total
*/
@VisibleForTesting
static List<SSTableReader> filterColdSSTables(List<SSTableReader> sstables, double coldReadsToOmit, int minThreshold)
{
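// Illustrative example (numbers are hypothetical, not from the original source): with coldReadsToOmit = 0.05
// and a combined rate of 100 reads/sec across all candidates, sstables are dropped coldest-first for as long
// as their summed rate stays within the 5 reads/sec budget; the sstable that would exceed the budget, and
// everything hotter than it, is kept.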
if (coldReadsToOmit == 0.0)
return sstables;
// Sort the sstables by hotness (coldest-first). We first build a map because the hotness may change during the sort.
final Map<SSTableReader, Double> hotnessSnapshot = getHotnessMap(sstables);
Collections.sort(sstables, new Comparator<SSTableReader>()
{
public int compare(SSTableReader o1, SSTableReader o2)
{
int comparison = Double.compare(hotnessSnapshot.get(o1), hotnessSnapshot.get(o2));
if (comparison != 0)
return comparison;
// break ties with size on disk (mainly for system tables and cold tables)
comparison = Long.compare(o1.bytesOnDisk(), o2.bytesOnDisk());
if (comparison != 0)
return comparison;
// if there's still a tie, use generation, which is guaranteed to be unique. this ensures that
// our filtering is deterministic, which can be useful when debugging.
return o1.descriptor.generation - o2.descriptor.generation;
}
});
// calculate the total reads/sec across all sstables
double totalReads = 0.0;
for (SSTableReader sstr : sstables)
if (sstr.readMeter != null)
totalReads += sstr.readMeter.twoHourRate();
// if this is a system table with no read meters or we don't have any read rates yet, just return them all
if (totalReads == 0.0)
return sstables;
// iteratively ignore the coldest sstables until ignoring one more would put us over the coldReadsToOmit threshold
double maxColdReads = coldReadsToOmit * totalReads;
double totalColdReads = 0.0;
int cutoffIndex = 0;
while (cutoffIndex < sstables.size())
{
SSTableReader sstable = sstables.get(cutoffIndex);
if (sstable.readMeter == null)
{
throw new AssertionError("If you're seeing this exception, please attach your logs to CASSANDRA-8238 to help us debug. "+sstable);
}
double reads = sstable.readMeter.twoHourRate();
if (totalColdReads + reads > maxColdReads)
break;
totalColdReads += reads;
cutoffIndex++;
}
List<SSTableReader> hotSSTables = new ArrayList<>(sstables.subList(cutoffIndex, sstables.size()));
List<SSTableReader> coldSSTables = sstables.subList(0, cutoffIndex);
logger.debug("hotSSTables={}, coldSSTables={}", hotSSTables.size(), coldSSTables.size());
if (hotSSTables.size() >= minThreshold)
return hotSSTables;
if (coldSSTables.size() < minThreshold)
return Collections.emptyList();
Map<SSTableReader, Set<SSTableReader>> overlapMap = new HashMap<>();
for (int i = 0; i < coldSSTables.size(); i++)
{
SSTableReader sstable = coldSSTables.get(i);
Set<SSTableReader> overlaps = new HashSet<>();
for (int j = 0; j < coldSSTables.size(); j++)
{
SSTableReader innerSSTable = coldSSTables.get(j);
if (ColumnNameHelper.overlaps(sstable.getSSTableMetadata().minColumnNames,
sstable.getSSTableMetadata().maxColumnNames,
innerSSTable.getSSTableMetadata().minColumnNames,
innerSSTable.getSSTableMetadata().maxColumnNames,
sstable.metadata.comparator))
{
overlaps.add(innerSSTable);
}
}
overlapMap.put(sstable, overlaps);
}
List<Set<SSTableReader>> overlapChains = new ArrayList<>();
for (SSTableReader sstable : overlapMap.keySet())
overlapChains.add(createOverlapChain(sstable, overlapMap));
Collections.sort(overlapChains, new Comparator<Set<SSTableReader>>()
{
@Override
public int compare(Set<SSTableReader> o1, Set<SSTableReader> o2)
{
return Longs.compare(SSTableReader.getTotalBytes(o2), SSTableReader.getTotalBytes(o1));
}
});
for (Set<SSTableReader> overlapping : overlapChains)
{
// if we are expecting to only keep 70% of the keys after a compaction, run a compaction on these cold sstables:
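// (reading of the check above: estimateCompactionGain approximates the fraction of keys that would remain
// after merging the overlapping set, so the 0.7 cutoff means "compact only if we expect to shed at least
// roughly 30% of the keys")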
if (SSTableReader.estimateCompactionGain(overlapping) < 0.7)
return new ArrayList<>(overlapping);
}
return Collections.emptyList();
}
/**
* Returns the set of all sstables that transitively overlap with s, including s itself.
* If we have three sstables a, b, c where a overlaps with b (but not c) and b overlaps with c, all three are returned.
*
* m contains an sstable -> all directly overlapping sstables mapping
*/
private static Set<SSTableReader> createOverlapChain(SSTableReader s, Map<SSTableReader, Set<SSTableReader>> m)
{
Deque<SSTableReader> sstables = new ArrayDeque<>();
Set<SSTableReader> overlapChain = new HashSet<>();
sstables.push(s);
while (!sstables.isEmpty())
{
SSTableReader sstable = sstables.pop();
if (overlapChain.add(sstable))
{
if (m.containsKey(sstable))
sstables.addAll(m.get(sstable));
}
}
return overlapChain;
}
/**
* @param buckets list of buckets from which to return the most interesting, where "interesting" is the total hotness for reads
* @param minThreshold minimum number of sstables in a bucket to qualify as interesting
* @param maxThreshold maximum number of sstables to compact at once (the returned bucket will be trimmed down to this)
* @return a bucket (list) of sstables to compact
*/
public static List<SSTableReader> mostInterestingBucket(List<List<SSTableReader>> buckets, int minThreshold, int maxThreshold)
{
// skip buckets containing less than minThreshold sstables, and limit other buckets to maxThreshold sstables
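// For example (illustration only): with minThreshold = 4 and maxThreshold = 32, a 3-sstable bucket is
// skipped, a 40-sstable bucket is trimmed to its 32 hottest members, and the surviving bucket with the
// largest summed hotness is the one returned.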
final List<Pair<List<SSTableReader>, Double>> prunedBucketsAndHotness = new ArrayList<>(buckets.size());
for (List<SSTableReader> bucket : buckets)
{
Pair<List<SSTableReader>, Double> bucketAndHotness = trimToThresholdWithHotness(bucket, maxThreshold);
if (bucketAndHotness != null && bucketAndHotness.left.size() >= minThreshold)
prunedBucketsAndHotness.add(bucketAndHotness);
}
if (prunedBucketsAndHotness.isEmpty())
return Collections.emptyList();
Pair<List<SSTableReader>, Double> hottest = Collections.max(prunedBucketsAndHotness, bucketsByHotnessComparator);
return hottest.left;
}
/**
* Returns a (bucket, hotness) pair for the given bucket.
* If the bucket contains more than maxThreshold sstables, the coldest sstables are trimmed off so that at most
* maxThreshold remain; enforcing minThreshold is left to the caller (see mostInterestingBucket).
*/
@VisibleForTesting
static Pair<List<SSTableReader>, Double> trimToThresholdWithHotness(List<SSTableReader> bucket, int maxThreshold)
{
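// Worked example (illustration only): per-sstable hotness values [5, 4, 3, 2, 1, 0] with maxThreshold = 4
// keep the four hottest sstables and yield a bucket hotness of 5 + 4 + 3 + 2 = 14.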
// Sort by sstable hotness (descending). We first build a map because the hotness may change during the sort.
final Map<SSTableReader, Double> hotnessSnapshot = getHotnessMap(bucket);
Collections.sort(bucket, new Comparator<SSTableReader>()
{
public int compare(SSTableReader o1, SSTableReader o2)
{
return -1 * Double.compare(hotnessSnapshot.get(o1), hotnessSnapshot.get(o2));
}
});
// and then trim the coldest sstables off the end to meet the maxThreshold
List<SSTableReader> prunedBucket = bucket.subList(0, Math.min(bucket.size(), maxThreshold));
// bucket hotness is the sum of the hotness of all sstable members
double bucketHotness = 0.0;
for (SSTableReader sstr : prunedBucket)
bucketHotness += hotness(sstr);
return Pair.create(prunedBucket, bucketHotness);
}
private static Map<SSTableReader, Double> getHotnessMap(Collection<SSTableReader> sstables)
{
Map<SSTableReader, Double> hotness = new HashMap<>();
for (SSTableReader sstable : sstables)
hotness.put(sstable, hotness(sstable));
return hotness;
}
/**
* Returns the reads per second per key for this sstable, or 0.0 if the sstable has no read meter
*/
private static double hotness(SSTableReader sstr)
{
// system tables don't have read meters, just use 0.0 for the hotness
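// e.g. (illustration only) a two-hour rate of 100 reads/sec over 1,000,000 estimated keys gives a hotness
// of 1.0e-4, so a small, frequently-read sstable ranks hotter than a large one with the same read rate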
return sstr.readMeter == null ? 0.0 : sstr.readMeter.twoHourRate() / sstr.estimatedKeys();
}
public synchronized AbstractCompactionTask getNextBackgroundTask(int gcBefore)
{
if (!isEnabled())
return null;
while (true)
{
List<SSTableReader> hottestBucket = getNextBackgroundSSTables(gcBefore);
if (hottestBucket.isEmpty())
return null;
if (cfs.getDataTracker().markCompacting(hottestBucket))
return new CompactionTask(cfs, hottestBucket, gcBefore, false);
}
}
public Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore)
{
Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(sstables);
if (Iterables.isEmpty(filteredSSTables))
return null;
if (!cfs.getDataTracker().markCompacting(filteredSSTables))
return null;
return Arrays.<AbstractCompactionTask>asList(new CompactionTask(cfs, filteredSSTables, gcBefore, false));
}
public AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, final int gcBefore)
{
assert !sstables.isEmpty(); // checked for by CM.submitUserDefined
if (!cfs.getDataTracker().markCompacting(sstables))
{
logger.debug("Unable to mark {} for compaction; probably a background compaction got to it first. You can disable background compactions temporarily if this is a problem", sstables);
return null;
}
return new CompactionTask(cfs, sstables, gcBefore, false).setUserDefined(true);
}
public int getEstimatedRemainingTasks()
{
return estimatedRemainingTasks;
}
public static List<Pair<SSTableReader, Long>> createSSTableAndLengthPairs(Iterable<SSTableReader> sstables)
{
List<Pair<SSTableReader, Long>> sstableLengthPairs = new ArrayList<Pair<SSTableReader, Long>>(Iterables.size(sstables));
for(SSTableReader sstable : sstables)
sstableLengthPairs.add(Pair.create(sstable, sstable.onDiskLength()));
return sstableLengthPairs;
}
/*
* Group files of similar size into buckets.
*/
public static <T> List<List<T>> getBuckets(Collection<Pair<T, Long>> files, double bucketHigh, double bucketLow, long minSSTableSize)
{
// Sort the list in order to get deterministic results during the grouping below
List<Pair<T, Long>> sortedFiles = new ArrayList<Pair<T, Long>>(files);
Collections.sort(sortedFiles, new Comparator<Pair<T, Long>>()
{
public int compare(Pair<T, Long> p1, Pair<T, Long> p2)
{
return p1.right.compareTo(p2.right);
}
});
Map<Long, List<T>> buckets = new HashMap<Long, List<T>>();
outer:
for (Pair<T, Long> pair: sortedFiles)
{
long size = pair.right;
// look for a bucket containing similar-sized files:
// group in the same bucket if it's w/in 50% of the average for this bucket,
// or this file and the bucket are all considered "small" (less than `minSSTableSize`)
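// Worked example (illustration only, with bucketLow = 0.5, bucketHigh = 1.5, minSSTableSize = 1):
// sizes [50, 55, 110, 200] group into [50, 55], [110], [200] -- 55 lies within 50% of the running
// average 50, while 110 and 200 fall outside the range of every existing bucket's average.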
for (Entry<Long, List<T>> entry : buckets.entrySet())
{
List<T> bucket = entry.getValue();
long oldAverageSize = entry.getKey();
if ((size > (oldAverageSize * bucketLow) && size < (oldAverageSize * bucketHigh))
|| (size < minSSTableSize && oldAverageSize < minSSTableSize))
{
// remove and re-add under the new average size
buckets.remove(oldAverageSize);
long totalSize = bucket.size() * oldAverageSize;
long newAverageSize = (totalSize + size) / (bucket.size() + 1);
bucket.add(pair.left);
buckets.put(newAverageSize, bucket);
continue outer;
}
}
// no similar bucket found; put it in a new one
ArrayList<T> bucket = new ArrayList<T>();
bucket.add(pair.left);
buckets.put(size, bucket);
}
return new ArrayList<List<T>>(buckets.values());
}
private void updateEstimatedCompactionsByTasks(List<List<SSTableReader>> tasks)
{
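// e.g. (illustration only) with min/max compaction thresholds of 4/32, a 70-sstable bucket contributes
// ceil(70 / 32) = 3 estimated tasks, while a 3-sstable bucket contributes none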
int n = 0;
for (List<SSTableReader> bucket: tasks)
{
if (bucket.size() >= cfs.getMinimumCompactionThreshold())
n += Math.ceil((double)bucket.size() / cfs.getMaximumCompactionThreshold());
}
estimatedRemainingTasks = n;
}
public long getMaxSSTableBytes()
{
return Long.MAX_VALUE;
}
public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
{
Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
uncheckedOptions = SizeTieredCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
uncheckedOptions.remove(CFPropDefs.KW_MINCOMPACTIONTHRESHOLD);
uncheckedOptions.remove(CFPropDefs.KW_MAXCOMPACTIONTHRESHOLD);
return uncheckedOptions;
}
@Override
public boolean shouldDefragment()
{
return true;
}
@Override
public void addSSTable(SSTableReader added)
{
sstables.add(added);
}
@Override
public void removeSSTable(SSTableReader sstable)
{
sstables.remove(sstable);
}
public String toString()
{
return String.format("SizeTieredCompactionStrategy[%s/%s]",
cfs.getMinimumCompactionThreshold(),
cfs.getMaximumCompactionThreshold());
}
}