org.apache.cassandra.db.compaction.CompactionStrategyManager Maven / Gradle / Ivy
Show all versions of cassandra-all Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.compaction;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.primitives.Longs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Directories;
import org.apache.cassandra.db.DiskBoundaries;
import org.apache.cassandra.db.SerializationHeader;
import org.apache.cassandra.db.commitlog.CommitLogPosition;
import org.apache.cassandra.db.commitlog.IntervalSet;
import org.apache.cassandra.db.compaction.AbstractStrategyHolder.TaskSupplier;
import org.apache.cassandra.db.compaction.PendingRepairManager.CleanupTask;
import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
import org.apache.cassandra.db.lifecycle.SSTableSet;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.ISSTableScanner;
import org.apache.cassandra.io.sstable.SSTable;
import org.apache.cassandra.io.sstable.SSTableMultiWriter;
import org.apache.cassandra.io.sstable.format.SSTableFormat.Components;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
import org.apache.cassandra.io.util.File;
import org.apache.cassandra.notifications.INotification;
import org.apache.cassandra.notifications.INotificationConsumer;
import org.apache.cassandra.notifications.SSTableAddedNotification;
import org.apache.cassandra.notifications.SSTableDeletingNotification;
import org.apache.cassandra.notifications.SSTableListChangedNotification;
import org.apache.cassandra.notifications.SSTableMetadataChanged;
import org.apache.cassandra.notifications.SSTableRepairStatusChanged;
import org.apache.cassandra.repair.consistent.admin.CleanupSummary;
import org.apache.cassandra.schema.CompactionParams;
import org.apache.cassandra.service.ActiveRepairService;
import org.apache.cassandra.utils.TimeUUID;
import static org.apache.cassandra.db.compaction.AbstractStrategyHolder.GroupedSSTableContainer;
/**
* Manages the compaction strategies.
*
* SSTables are isolated from each other based on their incremental repair status (repaired, unrepaired, or pending repair)
* and directory (determined by their starting token). This class handles the routing between {@link AbstractStrategyHolder}
* instances based on repair status, and the {@link AbstractStrategyHolder} instances have separate compaction strategies
* for each directory, which it routes sstables to. Note that {@link PendingRepairHolder} also divides sstables on their
* pending repair id.
*
* Operations on this class are guarded by a {@link ReentrantReadWriteLock}. This lock performs mutual exclusion on
* reads and writes to the following variables: {@link #repaired}, {@link #unrepaired}, {@link #isActive},
* {@link #params}, {@link #currentBoundaries}. Whenever performing reads on these variables,
* the {@link #readLock} should be acquired. Likewise, updates to these variables should be guarded by
* {@link #writeLock}.
*
* Whenever the {@link DiskBoundaries} change, the compaction strategies must be reloaded, so in order to ensure
* the compaction strategy placement reflects the most up-to-date disk boundaries, call {@link #maybeReloadDiskBoundaries()}
* before acquiring the read lock to access the strategies.
*
*/
public class CompactionStrategyManager implements INotificationConsumer
{
private static final Logger logger = LoggerFactory.getLogger(CompactionStrategyManager.class);
/** Compaction logger for this manager; enabled by startup() when the first repaired strategy has logAll set, disabled by shutdown(). */
public final CompactionLogger compactionLogger;
/** The table whose sstables this manager routes to compaction strategies. */
private final ColumnFamilyStore cfs;
/** When false, every sstable is routed to strategy index 0 regardless of disk boundaries. */
private final boolean partitionSSTablesByTokenRange;
/** Source of fresh {@link DiskBoundaries}; read at construction and whenever the cached boundaries are out of date. */
private final Supplier<DiskBoundaries> boundariesSupplier;
/**
 * Performs mutual exclusion on the variables below
 */
private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
private final ReentrantReadWriteLock.ReadLock readLock = lock.readLock();
private final ReentrantReadWriteLock.WriteLock writeLock = lock.writeLock();
/**
 * Variables guarded by read and write lock above
 */
private final PendingRepairHolder transientRepairs;
private final PendingRepairHolder pendingRepairs;
private final CompactionStrategyHolder repaired;
private final CompactionStrategyHolder unrepaired;
/** Every holder, in the order: transient repairs, pending repairs, repaired, unrepaired. */
private final ImmutableList<AbstractStrategyHolder> holders;
/** Locally effective compaction parameters; compared against incoming JMX parameters before a reload. */
private volatile CompactionParams params;
/** Disk boundaries the strategies were last laid out for; replaced by reloadDiskBoundaries(). */
private DiskBoundaries currentBoundaries;
/** Enabled flag; initialised from the schema parameters in the constructor. */
private volatile boolean enabled;
/** Cleared by pause() and shutdown(), set again by resume(). */
private volatile boolean isActive = true;
/*
    We keep a copy of the schema compaction parameters here to be able to decide if we
    should update the compaction strategy in maybeReload() due to an ALTER.
    If a user changes the local compaction strategy and then later ALTERs a compaction parameter,
    we will use the new compaction parameters.
*/
private volatile CompactionParams schemaCompactionParams;
// The four values below are cached by startup() from the first repaired strategy.
private volatile boolean supportsEarlyOpen;
private volatile int fanout;
private volatile long maxSSTableSizeBytes;
private volatile String name;
// NOTE(review): upper bound on TWCS buckets; its use lies outside the visible part of the file.
public static int TWCS_BUCKET_COUNT_MAX = 128;
/**
 * Creates a manager for {@code cfs} that reads its disk boundaries from the table itself and
 * partitions sstables by token range only when the table's partitioner provides a splitter.
 *
 * @param cfs the table whose compaction strategies are managed
 */
public CompactionStrategyManager(ColumnFamilyStore cfs)
{
    this(cfs,
         () -> cfs.getDiskBoundaries(),
         cfs.getPartitioner().splitter().isPresent());
}
/**
 * Creates the manager: builds the four strategy holders, subscribes to the table's tracker,
 * captures the initial disk boundaries and schema compaction parameters, applies the strategy
 * and finally runs {@code startup()}.
 *
 * @param cfs                           the table whose compaction strategies are managed
 * @param boundariesSupplier            source of the current {@link DiskBoundaries}
 * @param partitionSSTablesByTokenRange when false, all sstables are routed to strategy index 0
 */
@VisibleForTesting
public CompactionStrategyManager(ColumnFamilyStore cfs, Supplier<DiskBoundaries> boundariesSupplier,
                                 boolean partitionSSTablesByTokenRange)
{
    // The holders ask this router which strategy index an sstable (or sstable directory) belongs to.
    AbstractStrategyHolder.DestinationRouter router = new AbstractStrategyHolder.DestinationRouter()
    {
        public int getIndexForSSTable(SSTableReader sstable)
        {
            return compactionStrategyIndexFor(sstable);
        }
        public int getIndexForSSTableDirectory(Descriptor descriptor)
        {
            return compactionStrategyIndexForDirectory(descriptor);
        }
    };
    transientRepairs = new PendingRepairHolder(cfs, router, true);
    pendingRepairs = new PendingRepairHolder(cfs, router, false);
    repaired = new CompactionStrategyHolder(cfs, router, true);
    unrepaired = new CompactionStrategyHolder(cfs, router, false);
    holders = ImmutableList.of(transientRepairs, pendingRepairs, repaired, unrepaired);
    // NOTE(review): `this` is handed to the tracker before the remaining fields are assigned;
    // confirm that no notification can be delivered while construction is still in progress.
    cfs.getTracker().subscribe(this);
    logger.trace("{} subscribed to the data tracker.", this);
    this.cfs = cfs;
    this.compactionLogger = new CompactionLogger(cfs, this);
    this.boundariesSupplier = boundariesSupplier;
    this.partitionSSTablesByTokenRange = partitionSSTablesByTokenRange;
    currentBoundaries = boundariesSupplier.get();
    // Local and schema parameters start out identical.
    params = schemaCompactionParams = cfs.metadata().params.compaction;
    enabled = params.isEnabled();
    setStrategy(schemaCompactionParams);
    startup();
}
/**
 * Return the next background task.
 *
 * Finished-repair tasks (pending repairs first, then transient repairs) take priority. Otherwise
 * the task suppliers of every holder are collected, sorted by their natural ordering (per the
 * original comment: remaining tasks, descending) and the first non-null task is returned.
 *
 * @param gcBefore passed through to each holder when it builds its task suppliers
 * @return the next task to run, or {@code null} if compaction is not enabled or nothing is pending
 */
public AbstractCompactionTask getNextBackgroundTask(long gcBefore)
{
    // Must happen before the read lock is taken: a reload may need the write lock.
    maybeReloadDiskBoundaries();
    readLock.lock();
    try
    {
        if (!isEnabled())
            return null;
        int numPartitions = getNumTokenPartitions();
        // first try to promote/demote sstables from completed repairs
        AbstractCompactionTask repairFinishedTask;
        repairFinishedTask = pendingRepairs.getNextRepairFinishedTask();
        if (repairFinishedTask != null)
            return repairFinishedTask;
        repairFinishedTask = transientRepairs.getNextRepairFinishedTask();
        if (repairFinishedTask != null)
            return repairFinishedTask;
        // sort compaction task suppliers by remaining tasks descending
        List<TaskSupplier> suppliers = new ArrayList<>(numPartitions * holders.size());
        for (AbstractStrategyHolder holder : holders)
            suppliers.addAll(holder.getBackgroundTaskSuppliers(gcBefore));
        Collections.sort(suppliers);
        // return the first non-null task
        for (TaskSupplier supplier : suppliers)
        {
            AbstractCompactionTask task = supplier.getTask();
            if (task != null)
                return task;
        }
        return null;
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * Finds the oldest (by data-file modification time) live sstable that is not on the latest
 * sstable version and is not currently compacting, and creates an upgrade task for it.
 *
 * @return an upgrade task for the first candidate that could be locked for modification, or
 *         {@code null} if compaction is not enabled, automatic sstable upgrade is off, or no
 *         candidate could be locked
 */
@VisibleForTesting
@SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
AbstractCompactionTask findUpgradeSSTableTask()
{
    if (!isEnabled() || !DatabaseDescriptor.automaticSSTableUpgrade())
        return null;
    Set<SSTableReader> compacting = cfs.getTracker().getCompacting();
    // Candidates ordered oldest data file first.
    List<SSTableReader> potentialUpgrade = cfs.getLiveSSTables()
                                              .stream()
                                              .filter(s -> !compacting.contains(s) && !s.descriptor.version.isLatestVersion())
                                              .sorted((o1, o2) -> {
                                                  File f1 = o1.descriptor.fileFor(Components.DATA);
                                                  File f2 = o2.descriptor.fileFor(Components.DATA);
                                                  return Longs.compare(f1.lastModified(), f2.lastModified());
                                              }).collect(Collectors.toList());
    for (SSTableReader sstable : potentialUpgrade)
    {
        // A null transaction means the sstable could not be marked for modification; try the next one.
        LifecycleTransaction txn = cfs.getTracker().tryModify(sstable, OperationType.UPGRADE_SSTABLES);
        if (txn != null)
        {
            logger.debug("Running automatic sstable upgrade for {}", sstable);
            return getCompactionStrategyFor(sstable).getCompactionTask(txn, Integer.MIN_VALUE, Long.MAX_VALUE);
        }
    }
    return null;
}
/**
 * Reports whether compaction may currently run.
 *
 * @return {@code true} only when the enabled flag is set and the manager is active
 */
public boolean isEnabled()
{
    if (!enabled)
        return false;
    return isActive;
}
/**
 * @return the active flag, which is cleared by {@code pause()} and {@code shutdown()} and set by
 *         {@code resume()}
 */
public boolean isActive()
{
    return this.isActive;
}
/**
 * Marks the manager active again; the flag is flipped while holding the write lock.
 */
public void resume()
{
    final ReentrantReadWriteLock.WriteLock exclusive = writeLock;
    exclusive.lock();
    try
    {
        this.isActive = true;
    }
    finally
    {
        exclusive.unlock();
    }
}
/**
 * Pauses compaction while all ongoing compactions are cancelled.
 *
 * Kept separate from enable/disable so that the enabled state does not have to be remembered
 * by the caller. The flag is flipped while holding the write lock.
 */
public void pause()
{
    final ReentrantReadWriteLock.WriteLock exclusive = writeLock;
    exclusive.lock();
    try
    {
        this.isActive = false;
    }
    finally
    {
        exclusive.unlock();
    }
}
/**
 * Populates and starts the strategies.
 *
 * Under the write lock: hands every canonical sstable that was not opened EARLY to its strategy,
 * starts each holder, and caches early-open support, fanout, max sstable size and name from the
 * first repaired strategy. After releasing the lock, enables the compaction logger if that
 * strategy has {@code logAll} set.
 */
private void startup()
{
    writeLock.lock();
    try
    {
        for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
        {
            // early-opened readers are skipped
            if (sstable.openReason == SSTableReader.OpenReason.EARLY)
                continue;
            compactionStrategyFor(sstable).addSSTable(sstable);
        }
        holders.forEach(AbstractStrategyHolder::startup);
        supportsEarlyOpen = repaired.first().supportsEarlyOpen();
        // only a leveled strategy carries its own fanout; anything else reports the LCS default
        if (repaired.first() instanceof LeveledCompactionStrategy)
            fanout = ((LeveledCompactionStrategy) repaired.first()).getLevelFanoutSize();
        else
            fanout = LeveledCompactionStrategy.DEFAULT_LEVEL_FANOUT_SIZE;
        maxSSTableSizeBytes = repaired.first().getMaxSSTableBytes();
        name = repaired.first().getName();
    }
    finally
    {
        writeLock.unlock();
    }
    if (repaired.first().logAll)
    {
        compactionLogger.enable();
    }
}
/**
 * Returns the compaction strategy responsible for the given sstable, after first making sure
 * the disk boundaries are up to date.
 *
 * @param sstable the sstable to look up
 * @return the strategy selected by {@link #compactionStrategyFor(SSTableReader)}
 */
public AbstractCompactionStrategy getCompactionStrategyFor(SSTableReader sstable)
{
    // refresh before the lookup, which takes the read lock
    maybeReloadDiskBoundaries();
    final AbstractCompactionStrategy strategy = compactionStrategyFor(sstable);
    return strategy;
}
/**
 * Looks up, under the read lock, the strategy for {@code sstable} via the holder returned by
 * {@code getHolder(sstable)}.
 *
 * Deliberately does not call maybeReloadDiskBoundaries(), because this method may be invoked
 * by a thread that already holds a lock.
 *
 * @param sstable the sstable to look up
 * @return the strategy the matching holder reports for the sstable
 */
@VisibleForTesting
AbstractCompactionStrategy compactionStrategyFor(SSTableReader sstable)
{
    readLock.lock();
    try
    {
        final AbstractCompactionStrategy chosen = getHolder(sstable).getStrategyFor(sstable);
        return chosen;
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * Computes the index of the compaction strategy that should own the given sstable.
 *
 * When sstables are not partitioned by token range there is only a single strategy, so the index
 * is always 0. Otherwise the index is whatever the current disk boundaries report for the sstable
 * (presumably derived from its first token; see {@link DiskBoundaries}).
 *
 * Deliberately does not call maybeReloadDiskBoundaries(), because this method may be invoked
 * by a thread that already holds a lock.
 *
 * @param sstable the sstable to place
 * @return the strategy index for the sstable
 */
int compactionStrategyIndexFor(SSTableReader sstable)
{
    readLock.lock();
    try
    {
        return partitionSSTablesByTokenRange ? currentBoundaries.getDiskIndex(sstable) : 0;
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * Computes, under the read lock, the strategy index for an sstable identified only by its descriptor.
 *
 * @param descriptor descriptor of the sstable
 * @return 0 when sstables are not partitioned by token range, otherwise the index the current
 *         disk boundaries derive from the sstable's directory
 */
private int compactionStrategyIndexForDirectory(Descriptor descriptor)
{
    readLock.lock();
    try
    {
        if (!partitionSSTablesByTokenRange)
            return 0;
        return currentBoundaries.getBoundariesFromSSTableDirectory(descriptor);
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * @return the repaired holder, handed out without taking the read lock
 */
@VisibleForTesting
CompactionStrategyHolder getRepairedUnsafe()
{
    return this.repaired;
}
/**
 * @return the unrepaired holder, handed out without taking the read lock
 */
@VisibleForTesting
CompactionStrategyHolder getUnrepairedUnsafe()
{
    return this.unrepaired;
}
/**
 * @return the pending-repair holder, handed out without taking the read lock
 */
@VisibleForTesting
PendingRepairHolder getPendingRepairsUnsafe()
{
    return this.pendingRepairs;
}
/**
 * @return the transient-repair holder, handed out without taking the read lock
 */
@VisibleForTesting
PendingRepairHolder getTransientRepairsUnsafe()
{
    return this.transientRepairs;
}
/**
 * Checks, under the read lock, whether either repair holder has data for the given session.
 *
 * @param sessionID the repair session to look for
 * @return {@code true} if the pending-repair holder or, failing that, the transient-repair
 *         holder reports data for the session
 */
public boolean hasDataForPendingRepair(TimeUUID sessionID)
{
    readLock.lock();
    try
    {
        if (pendingRepairs.hasDataForSession(sessionID))
            return true;
        return transientRepairs.hasDataForSession(sessionID);
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * Shuts the manager down: under the write lock, marks it inactive, shuts down every holder
 * and disables the compaction logger, in that order.
 */
public void shutdown()
{
    final ReentrantReadWriteLock.WriteLock exclusive = writeLock;
    exclusive.lock();
    try
    {
        this.isActive = false;
        holders.forEach(AbstractStrategyHolder::shutdown);
        this.compactionLogger.disable();
    }
    finally
    {
        exclusive.unlock();
    }
}
/**
 * Reloads the compaction strategies if the schema-level parameters changed. Called after
 * changing configuration.
 *
 * Only the previously seen schema parameters are compared; locally set changes are ignored.
 *
 * @param params the compaction parameters currently defined in the schema
 */
public void maybeReloadParamsFromSchema(CompactionParams params)
{
    // unlocked check first, so the common "nothing changed" case never takes the write lock
    final boolean unchanged = params.equals(schemaCompactionParams);
    if (unchanged)
        return;

    writeLock.lock();
    try
    {
        // check again now that the lock is held; another thread may have reloaded already
        if (params.equals(schemaCompactionParams))
            return;
        reloadParamsFromSchema(params);
    }
    finally
    {
        writeLock.unlock();
    }
}
/**
 * Rebuilds the compaction strategies after the schema-level compaction parameters changed.
 *
 * Records {@code newParams} as the new schema parameters, applies them through
 * {@code setStrategy}, reconciles the enabled flag with the new parameters and finally calls
 * {@code startup()}. The only caller visible in this file, maybeReloadParamsFromSchema,
 * invokes it while holding the write lock.
 *
 * @param newParams new CompactionParams set via CQL
 */
private void reloadParamsFromSchema(CompactionParams newParams)
{
logger.debug("Recreating compaction strategy for {}.{} - compaction parameters changed via CQL",
cfs.getKeyspaceName(), cfs.getTableName());
/*
* It's possible for compaction to be explicitly enabled/disabled
* via JMX when already enabled/disabled via params. In that case,
* if we now toggle enabled/disabled via params, we'll technically
* be overriding JMX-set value with params-set value.
*/
// Captured BEFORE the strategy is replaced: does the current flag disagree with what
// shouldBeEnabled() reports for the old parameters (i.e. presumably a JMX override)?
boolean enabledWithJMX = enabled && !shouldBeEnabled();
boolean disabledWithJMX = !enabled && shouldBeEnabled();
schemaCompactionParams = newParams;
setStrategy(newParams);
// enable/disable via JMX overrides CQL params, but please see the comment above
// shouldBeEnabled() is evaluated again here, after setStrategy(newParams) has run.
if (enabled && !shouldBeEnabled() && !enabledWithJMX)
disable();
else if (!enabled && shouldBeEnabled() && !disabledWithJMX)
enable();
startup();
}
/**
 * Reloads the compaction strategies if the given parameters differ from the local ones.
 *
 * Only the local configuration is compared; the schema parameters are ignored.
 *
 * @param params the compaction parameters requested via JMX
 */
private void maybeReloadParamsFromJMX(CompactionParams params)
{
    // unlocked check first, so the common "nothing changed" case never takes the write lock
    final boolean unchanged = params.equals(this.params);
    if (unchanged)
        return;

    writeLock.lock();
    try
    {
        // check again now that the lock is held; another thread may have reloaded already
        if (params.equals(this.params))
            return;
        reloadParamsFromJMX(params);
    }
    finally
    {
        writeLock.unlock();
    }
}
/**
 * Rebuilds the compaction strategies after the local compaction parameters were changed via JMX.
 *
 * Applies {@code newParams} through {@code setStrategy}, brings the enabled flag in line with
 * {@code shouldBeEnabled()} and finally calls {@code startup()}. Unlike the schema variant,
 * the stored schema parameters are left untouched. The only caller visible in this file,
 * maybeReloadParamsFromJMX, invokes it while holding the write lock.
 *
 * @param newParams new CompactionParams set via JMX
 */
private void reloadParamsFromJMX(CompactionParams newParams)
{
logger.debug("Recreating compaction strategy for {}.{} - compaction parameters changed via JMX",
cfs.getKeyspaceName(), cfs.getTableName());
setStrategy(newParams);
// compaction params set via JMX override enable/disable via JMX
// shouldBeEnabled() is evaluated after setStrategy(newParams) has run.
if (enabled && !shouldBeEnabled())
disable();
else if (!enabled && shouldBeEnabled())
enable();
startup();
}
/**
 * Checks whether the disk boundaries changed and, if so, reloads the compaction strategies
 * so that they reflect the most up-to-date disk boundaries.
 *
 * This is typically called before acquiring the {@link #readLock} to ensure the most up-to-date
 * disk locations and boundaries are used.
 *
 * This must *never* be called by a thread that holds the {@link #readLock}: the method may
 * acquire the {@link #writeLock} to update the compaction strategies, which can deadlock.
 *
 * TODO: improve this to reload after receiving a notification rather than trying to reload on every operation
 */
@VisibleForTesting
protected void maybeReloadDiskBoundaries()
{
    // unlocked check first; the write lock is only taken when a reload looks necessary
    if (currentBoundaries.isOutOfDate())
    {
        writeLock.lock();
        try
        {
            // check again under the lock: another thread may already have reloaded
            if (!currentBoundaries.isOutOfDate())
                return;
            reloadDiskBoundaries(boundariesSupplier.get());
        }
        finally
        {
            writeLock.unlock();
        }
    }
}
/**
 * Installs {@code newBoundaries} as the current disk boundaries and, unless they are equivalent
 * to the previous ones, recreates and restarts the compaction strategies.
 *
 * @param newBoundaries new DiskBoundaries - potentially functionally equivalent to current ones
 */
private void reloadDiskBoundaries(DiskBoundaries newBoundaries)
{
    final DiskBoundaries previous = currentBoundaries;
    // the new boundaries are always installed, even when nothing else has to be rebuilt
    currentBoundaries = newBoundaries;
    if (!newBoundaries.isEquivalentTo(previous))
    {
        logger.debug("Recreating compaction strategy for {}.{} - disk boundaries are out of date",
                     cfs.getKeyspaceName(), cfs.getTableName());
        setStrategy(params);
        startup();
    }
    else
    {
        logger.debug("Not recreating compaction strategy for {}.{} - disk boundaries are equivalent",
                     cfs.getKeyspaceName(), cfs.getTableName());
    }
}
/**
 * Returns a single view over the strategies of every holder, concatenated in holder order.
 *
 * The element type is declared so that callers can iterate the result as
 * {@link AbstractCompactionStrategy} without a cast.
 *
 * @return all compaction strategies of all holders
 */
private Iterable<AbstractCompactionStrategy> getAllStrategies()
{
    return Iterables.concat(Iterables.transform(holders, AbstractStrategyHolder::allStrategies));
}
/**
 * Counts the sstables in level 0 across all strategies.
 *
 * @return the summed level-0 size of every strategy when the first repaired strategy is a
 *         {@link LeveledCompactionStrategy}; 0 otherwise
 */
public int getUnleveledSSTables()
{
    maybeReloadDiskBoundaries();
    readLock.lock();
    try
    {
        if (!(repaired.first() instanceof LeveledCompactionStrategy))
            return 0;

        int total = 0;
        for (AbstractCompactionStrategy strategy : getAllStrategies())
        {
            LeveledCompactionStrategy leveled = (LeveledCompactionStrategy) strategy;
            total += leveled.getLevelSize(0);
        }
        return total;
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * @return the fanout cached by {@code startup()}: the first repaired strategy's level fanout
 *         size when it is a {@link LeveledCompactionStrategy}, otherwise the LCS default
 */
public int getLevelFanoutSize()
{
    return this.fanout;
}
/**
 * Sums the per-level sstable counts of all strategies.
 *
 * @return an array of {@code LeveledGenerations.MAX_LEVEL_COUNT} totals combined via
 *         {@code sumArrays} when the first repaired strategy is a
 *         {@link LeveledCompactionStrategy}; {@code null} otherwise
 */
public int[] getSSTableCountPerLevel()
{
    maybeReloadDiskBoundaries();
    readLock.lock();
    try
    {
        if (!(repaired.first() instanceof LeveledCompactionStrategy))
            return null;

        int[] totals = new int[LeveledGenerations.MAX_LEVEL_COUNT];
        for (AbstractCompactionStrategy strategy : getAllStrategies())
        {
            int[] perLevel = ((LeveledCompactionStrategy) strategy).getAllLevelSize();
            totals = sumArrays(totals, perLevel);
        }
        return totals;
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * Sums the per-level sizes in bytes of all strategies.
 *
 * @return an array of {@code LeveledGenerations.MAX_LEVEL_COUNT} totals combined via
 *         {@code sumArrays} when the first repaired strategy is a
 *         {@link LeveledCompactionStrategy}; {@code null} otherwise
 */
public long[] getPerLevelSizeBytes()
{
    readLock.lock();
    try
    {
        if (!(repaired.first() instanceof LeveledCompactionStrategy))
            return null;

        long[] totals = new long[LeveledGenerations.MAX_LEVEL_COUNT];
        for (AbstractCompactionStrategy strategy : getAllStrategies())
        {
            long[] perLevel = ((LeveledCompactionStrategy) strategy).getAllLevelSizeBytes();
            totals = sumArrays(totals, perLevel);
        }
        return totals;
    }
    finally
    {
        readLock.unlock();
    }
}
/**
 * @return {@code true} if, checked under the read lock, the first repaired strategy is a
 *         {@link LeveledCompactionStrategy}
 */
public boolean isLeveledCompaction()
{
    readLock.lock();
    try
    {
        final boolean leveled = repaired.first() instanceof LeveledCompactionStrategy;
        return leveled;
    }
    finally
    {
        readLock.unlock();
    }
}
public int[] getSSTableCountPerTWCSBucket()
{
readLock.lock();
try
{
List