Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.cassandra.db.ColumnFamilyStore Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.io.*;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.util.*;
import java.util.Optional;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiFunction;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.management.openmbean.*;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.*;
import com.google.common.base.Throwables;
import com.google.common.collect.*;
import com.google.common.util.concurrent.*;
import com.palantir.cassandra.db.ColumnFamilyStoreManager;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.UnsafeArg;
import com.palantir.tracing.CloseableTracer;
import com.palantir.cassandra.db.RowCountOverwhelmingException;
import org.apache.cassandra.FilterExperiment;
import org.apache.cassandra.db.lifecycle.SSTableIntervalTree;
import org.apache.cassandra.db.lifecycle.View;
import org.apache.cassandra.db.lifecycle.Tracker;
import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
import org.apache.cassandra.io.FSWriteError;
import org.json.simple.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.cache.*;
import org.apache.cassandra.concurrent.*;
import org.apache.cassandra.config.*;
import org.apache.cassandra.config.CFMetaData.SpeculativeRetry;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.commitlog.ReplayPosition;
import org.apache.cassandra.db.compaction.*;
import org.apache.cassandra.db.composites.CellName;
import org.apache.cassandra.db.composites.CellNameType;
import org.apache.cassandra.db.composites.Composite;
import org.apache.cassandra.db.filter.ColumnSlice;
import org.apache.cassandra.db.filter.ExtendedFilter;
import org.apache.cassandra.db.filter.IDiskAtomFilter;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.filter.SliceQueryFilter;
import org.apache.cassandra.db.index.SecondaryIndex;
import org.apache.cassandra.db.index.SecondaryIndexManager;
import org.apache.cassandra.dht.*;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.io.FSReadError;
import org.apache.cassandra.io.compress.CompressionParameters;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.cassandra.io.sstable.*;
import org.apache.cassandra.io.sstable.format.*;
import org.apache.cassandra.io.sstable.metadata.CompactionMetadata;
import org.apache.cassandra.io.sstable.metadata.MetadataType;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.metrics.ColumnFamilyMetrics;
import org.apache.cassandra.metrics.ColumnFamilyMetrics.Sampler;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.service.CacheService;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.streaming.StreamLockfile;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.*;
import org.apache.cassandra.utils.concurrent.*;
import org.apache.cassandra.utils.TopKSampler.SamplerResult;
import org.apache.cassandra.utils.memory.MemtableAllocator;
import com.clearspring.analytics.stream.Counter;
import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
import static org.apache.cassandra.utils.ExecutorUtils.shutdown;
import static org.apache.cassandra.utils.Throwables.maybeFail;
public class ColumnFamilyStore implements ColumnFamilyStoreMBean
{
// Class-wide SLF4J logger.
private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyStore.class);
// Read once from a JVM system property. When true and no unfinished compactions were handed to
// removeUnusedSstables, leftover compaction ancestors are only logged instead of being deleted.
private static final boolean DRY_RUN_NON_COMPACTING_UNUSED_SSTABLE_CLEANUP = Boolean.getBoolean(
"palantir_cassandra.dry_run_non_compacting_unused_sstable_cleanup");
// Read once from a JVM system property. When true, removeUnusedSstables logs unfinished compaction
// products instead of deleting them.
private static final boolean DISABLE_COMPACTION_PRODUCT_CLEANUP = Boolean.getBoolean(
"palantir_cassandra.disable_compaction_product_cleanup");
// Executor for memtable flush writers; its pool size comes from DatabaseDescriptor.getFlushWriters().
private static final ExecutorService flushExecutor = new JMXEnabledThreadPoolExecutor(DatabaseDescriptor.getFlushWriters(),
                                                                                      StageManager.KEEPALIVE,
                                                                                      TimeUnit.SECONDS,
                                                                                      new LinkedBlockingQueue<Runnable>(),
                                                                                      new NamedThreadFactory("MemtableFlushWriter"),
                                                                                      "internal");
// post-flush executor is single threaded to provide guarantee that any flush Future on a CF will never return until prior flushes have completed
@VisibleForTesting
static final ExecutorService postFlushExecutor = new JMXEnabledThreadPoolExecutor(1,
                                                                                  StageManager.KEEPALIVE,
                                                                                  TimeUnit.SECONDS,
                                                                                  new LinkedBlockingQueue<Runnable>(),
                                                                                  new NamedThreadFactory("MemtablePostFlush"),
                                                                                  "internal");
// If a flush fails with an error the post-flush is never allowed to continue. This stores the error that caused it
// to be able to show an error on following flushes instead of blindly continuing.
@VisibleForTesting
static volatile FSWriteError previousFlushFailure = null;
// Single-threaded executor whose threads are named "MemtableReclaimMemory".
private static final ExecutorService reclaimExecutor = new JMXEnabledThreadPoolExecutor(1,
                                                                                        StageManager.KEEPALIVE,
                                                                                        TimeUnit.SECONDS,
                                                                                        new LinkedBlockingQueue<Runnable>(),
                                                                                        new NamedThreadFactory("MemtableReclaimMemory"),
                                                                                        "internal");
// Item names of one sampled counter row exposed over JMX; also used as the index names of COUNTER_TYPE.
private static final String[] COUNTER_NAMES = new String[]{"raw", "count", "error", "string"};
// Human-readable descriptions, positionally matching COUNTER_NAMES.
private static final String[] COUNTER_DESCS = new String[]
{ "partition key in raw hex bytes",
"value of this partition for given sampler",
"value is within the error bounds plus or minus of this",
"the partition key turned into a human readable format" };
// Open-MBean row type for a single counter; assigned in the static initializer below.
private static final CompositeType COUNTER_COMPOSITE_TYPE;
// Open-MBean table type holding counter rows; assigned in the static initializer below.
private static final TabularType COUNTER_TYPE;
// Item names of the overall sampling result exposed over JMX.
private static final String[] SAMPLER_NAMES = new String[]{"cardinality", "partitions"};
// Human-readable descriptions, positionally matching SAMPLER_NAMES.
private static final String[] SAMPLER_DESCS = new String[]
{ "cardinality of partitions",
"list of counter results" };
// Type name and description shared by every open type built in the static initializer.
private static final String SAMPLING_RESULTS_NAME = "SAMPLING_RESULTS";
// Open-MBean type of a complete sampling result; assigned in the static initializer below.
private static final CompositeType SAMPLING_RESULT;
static
{
try
{
OpenType>[] counterTypes = new OpenType[] { SimpleType.STRING, SimpleType.LONG, SimpleType.LONG, SimpleType.STRING };
COUNTER_COMPOSITE_TYPE = new CompositeType(SAMPLING_RESULTS_NAME, SAMPLING_RESULTS_NAME, COUNTER_NAMES, COUNTER_DESCS, counterTypes);
COUNTER_TYPE = new TabularType(SAMPLING_RESULTS_NAME, SAMPLING_RESULTS_NAME, COUNTER_COMPOSITE_TYPE, COUNTER_NAMES);
OpenType>[] samplerTypes = new OpenType[] { SimpleType.LONG, COUNTER_TYPE };
SAMPLING_RESULT = new CompositeType(SAMPLING_RESULTS_NAME, SAMPLING_RESULTS_NAME, SAMPLER_NAMES, SAMPLER_DESCS, samplerTypes);
} catch (OpenDataException e)
{
throw Throwables.propagate(e);
}
}
@VisibleForTesting
public static volatile ColumnFamilyStore discardFlushResults;
public final Keyspace keyspace;
public final String name;
public final CFMetaData metadata;
public final IPartitioner partitioner;
private final String mbeanName;
private volatile boolean valid = true;
/**
* Memtables and SSTables on disk for this column family.
*
* We synchronize on the Tracker to ensure isolation when we want to make sure
* that the memtable we're acting on doesn't change out from under us. I.e., flush
* syncronizes on it to make sure it can submit on both executors atomically,
* so anyone else who wants to make sure flush doesn't interfere should as well.
*/
private final Tracker data;
/* The read order, used to track accesses to off-heap memtable storage */
public final OpOrder readOrdering = new OpOrder();
/* This is used to generate the next index for a SSTable */
private final AtomicInteger fileIndexGenerator = new AtomicInteger(0);
public final SecondaryIndexManager indexManager;
/* These are locally held copies to be changed from the config during runtime */
private volatile DefaultInteger minCompactionThreshold;
private volatile DefaultInteger maxCompactionThreshold;
private final WrappingCompactionStrategy compactionStrategyWrapper;
public final Directories directories;
public final ColumnFamilyMetrics metric;
public volatile long sampleLatencyNanos;
private final ScheduledFuture> latencyCalculator;
private volatile boolean compactionSpaceCheck = true;
/**
 * Initiates shutdown of the memtable flush executor and waits up to 60 seconds for it to terminate.
 *
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public static void shutdownFlushExecutor() throws InterruptedException
{
    final ExecutorService executor = flushExecutor;
    executor.shutdown();
    executor.awaitTermination(60, TimeUnit.SECONDS);
}
/**
 * Initiates shutdown of the post-flush executor and waits up to 60 seconds for it to terminate.
 *
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public static void shutdownPostFlushExecutor() throws InterruptedException
{
    final ExecutorService executor = postFlushExecutor;
    executor.shutdown();
    executor.awaitTermination(60, TimeUnit.SECONDS);
}
/**
 * Hands the reclaim, post-flush and flush executors (in that order) to
 * {@code ExecutorUtils.shutdownAndWait} together with the supplied timeout.
 *
 * @param timeout maximum time to wait
 * @param unit    unit of {@code timeout}
 * @throws InterruptedException propagated from {@code ExecutorUtils.shutdownAndWait}
 * @throws TimeoutException     propagated from {@code ExecutorUtils.shutdownAndWait}
 */
public static void shutdownExecutorsAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
{
    ExecutorUtils.shutdownAndWait(timeout,
                                  unit,
                                  reclaimExecutor,
                                  postFlushExecutor,
                                  flushExecutor);
}
/**
 * Reloads this store's settings from its metadata, using "Unknown" as the reason.
 */
public void reload()
{
    final String unspecifiedReason = "Unknown";
    reload(unspecifiedReason);
}
/**
 * Re-synchronizes this store with its (already mutated) metadata object.
 *
 * @param reason passed on to {@code switchMemtable} when the memtable has to be replaced
 */
public void reload(String reason)
{
    // The compaction thresholds can be overridden at runtime; only refresh them from the metadata
    // while no such override is in place.
    if (!minCompactionThreshold.isModified())
    {
        for (ColumnFamilyStore store : concatWithIndexes())
        {
            store.minCompactionThreshold = new DefaultInteger(metadata.getMinCompactionThreshold());
        }
    }
    if (!maxCompactionThreshold.isModified())
    {
        for (ColumnFamilyStore store : concatWithIndexes())
        {
            store.maxCompactionThreshold = new DefaultInteger(metadata.getMaxCompactionThreshold());
        }
    }

    compactionStrategyWrapper.maybeReloadCompactionStrategy(metadata);
    scheduleFlush();
    indexManager.reload();

    // A memtable created under the previous comparator still aliases it, so it must be
    // swapped out once the column family comparator is a different object.
    Memtable activeMemtable = data.getView().getCurrentMemtable();
    boolean comparatorReplaced = activeMemtable.initialComparator != metadata.comparator;
    if (comparatorReplaced)
    {
        switchMemtable(reason);
    }
}
/**
 * Schedules a one-shot task that flushes the current memtable once the table's memtable flush
 * period has elapsed. Does nothing when the configured period is not positive.
 */
void scheduleFlush()
{
    final int period = metadata.getMemtableFlushPeriod();
    if (period <= 0)
        return;

    logger.trace("scheduling flush in {} ms", period);
    WrappedRunnable flushTask = new WrappedRunnable()
    {
        protected void runMayThrow() throws Exception
        {
            synchronized (data)
            {
                Memtable memtable = data.getView().getCurrentMemtable();

                // Not expired: this task was scheduled for a memtable that has already been flushed.
                if (!memtable.isExpired())
                    return;

                if (memtable.isClean())
                {
                    // Nothing to write yet, so just try again later instead of swapping memtables.
                    scheduleFlush();
                }
                else
                {
                    // The constructor of the replacement Memtable takes care of rescheduling.
                    forceFlush("Scheduled flush according to memtableFlushPeriod");
                }
            }
        }
    };
    ScheduledExecutors.scheduledTasks.schedule(flushTask, period, TimeUnit.MILLISECONDS);
}
/**
 * @return a task that, each time it runs, submits every column family store of every keyspace
 *         to the compaction manager for background compaction
 */
public static Runnable getBackgroundCompactionTaskSubmitter()
{
    return () -> {
        for (Keyspace ks : Keyspace.all())
        {
            for (ColumnFamilyStore store : ks.getColumnFamilyStores())
            {
                CompactionManager.instance.submitBackground(store);
            }
        }
    };
}
/**
 * Parses a JSON map of compaction options and applies it via {@link #setCompactionParameters}.
 *
 * @param options compaction options encoded as a JSON map
 */
public void setCompactionParametersJson(String options)
{
    Map<String, String> parsedOptions = FBUtilities.fromJsonMap(options);
    setCompactionParameters(parsedOptions);
}
/**
 * @return the result of {@link #getCompactionParameters()} rendered as JSON
 */
public String getCompactionParametersJson()
{
    final String json = FBUtilities.json(getCompactionParameters());
    return json;
}
public void setCompactionParameters(Map options)
{
try
{
Map optionsCopy = new HashMap<>(options);
Class extends AbstractCompactionStrategy> compactionStrategyClass = CFMetaData.createCompactionStrategy(optionsCopy.get("class"));
optionsCopy.remove("class");
CFMetaData.validateCompactionOptions(compactionStrategyClass, optionsCopy);
compactionStrategyWrapper.setNewLocalCompactionStrategy(compactionStrategyClass, optionsCopy);
}
catch (Throwable t)
{
logger.error("Could not set new local compaction strategy", t);
// dont propagate the ConfigurationException over jmx, user will only see a ClassNotFoundException
throw new IllegalArgumentException("Could not set new local compaction strategy: "+t.getMessage());
}
}
/**
 * @return a fresh, mutable copy of the compaction strategy wrapper's options with the wrapper's
 *         name added under the key "class"
 */
public Map<String, String> getCompactionParameters()
{
    Map<String, String> options = new HashMap<>(compactionStrategyWrapper.options);
    options.put("class", compactionStrategyWrapper.getName());
    return options;
}
/**
 * Resolves the named compaction strategy, stores it in the metadata and asks the strategy wrapper
 * to reload.
 *
 * @param compactionStrategyClass name of the compaction strategy class
 * @throws IllegalArgumentException carrying only the message of the underlying
 *                                  {@code ConfigurationException}
 */
public void setCompactionStrategyClass(String compactionStrategyClass)
{
    try
    {
        metadata.compactionStrategyClass = CFMetaData.createCompactionStrategy(compactionStrategyClass);
        compactionStrategyWrapper.maybeReloadCompactionStrategy(metadata);
    }
    catch (ConfigurationException invalidConfiguration)
    {
        final String reason = invalidConfiguration.getMessage();
        throw new IllegalArgumentException(reason);
    }
}
/**
 * @return the class name of the compaction strategy recorded in the metadata
 */
public String getCompactionStrategyClass()
{
    final Class<?> strategyClass = metadata.compactionStrategyClass;
    return strategyClass.getName();
}
/**
 * @return the table's compression parameters in their thrift-options map form
 */
public Map<String, String> getCompressionParameters()
{
    return metadata.compressionParameters().asThriftOptions();
}
/**
 * Replaces the compression parameters stored in the metadata with ones built from {@code opts}.
 *
 * @param opts compression options
 * @throws IllegalArgumentException carrying only the message of the underlying
 *                                  {@code ConfigurationException} when the options are rejected
 */
public void setCompressionParameters(Map<String, String> opts)
{
    try
    {
        metadata.compressionParameters = CompressionParameters.create(opts);
    }
    catch (ConfigurationException e)
    {
        throw new IllegalArgumentException(e.getMessage());
    }
}
/**
 * Applies the given CRC check chance to the compression parameters of every compressed sstable
 * returned by {@code keyspace.getAllSSTables()}.
 *
 * @param crcCheckChance the new CRC check chance
 * @throws IllegalArgumentException carrying only the message of the underlying
 *                                  {@code ConfigurationException} when the value is rejected
 */
public void setCrcCheckChance(double crcCheckChance)
{
    try
    {
        for (SSTableReader reader : keyspace.getAllSSTables())
        {
            if (!reader.compression)
                continue;

            reader.getCompressionMetadata().parameters.setCrcCheckChance(crcCheckChance);
        }
    }
    catch (ConfigurationException rejected)
    {
        final String reason = rejected.getMessage();
        throw new IllegalArgumentException(reason);
    }
}
/**
 * Creates a store with bookkeeping enabled; delegates to the eight-argument constructor with
 * {@code registerBookkeeping = true}.
 */
public ColumnFamilyStore(Keyspace keyspace,
                         String columnFamilyName,
                         IPartitioner partitioner,
                         int generation,
                         CFMetaData metadata,
                         Directories directories,
                         boolean loadSSTables)
{
    this(keyspace,
         columnFamilyName,
         partitioner,
         generation,
         metadata,
         directories,
         loadSSTables,
         /* registerBookkeeping */ true);
}
/**
 * Builds the store: initializes its fields from the arguments, optionally loads the sstables found on
 * disk, creates the compaction strategy wrapper and secondary indexes, and, when requested, registers
 * the MBean and the periodic speculative-retry latency calculation.
 *
 * @param keyspace            owning keyspace
 * @param columnFamilyName    name of the column family
 * @param partitioner         partitioner used when opening sstables
 * @param generation          initial value of the sstable generation counter
 * @param metadata            table metadata; must not be null
 * @param directories         data directories of the table
 * @param loadSSTables        passed to the Tracker; sstables are opened from disk when the Tracker's
 *                            loadsstables flag is set
 * @param registerBookkeeping when true, registers the MBean and schedules the latency calculator;
 *                            when false, mbeanName stays null and only a no-op task is scheduled
 */
@VisibleForTesting
public ColumnFamilyStore(Keyspace keyspace,
String columnFamilyName,
IPartitioner partitioner,
int generation,
CFMetaData metadata,
Directories directories,
boolean loadSSTables,
boolean registerBookkeeping)
{
assert metadata != null : "null metadata for " + keyspace + ":" + columnFamilyName;
this.keyspace = keyspace;
name = columnFamilyName;
this.metadata = metadata;
this.minCompactionThreshold = new DefaultInteger(metadata.getMinCompactionThreshold());
this.maxCompactionThreshold = new DefaultInteger(metadata.getMaxCompactionThreshold());
this.partitioner = partitioner;
this.directories = directories;
// NOTE(review): 'this' is handed out here before construction has finished.
this.indexManager = new SecondaryIndexManager(this);
this.metric = new ColumnFamilyMetrics(this);
fileIndexGenerator.set(generation);
// Initial speculative-retry latency: half of the read RPC timeout, converted to nanoseconds.
sampleLatencyNanos = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getReadRpcTimeout() / 2);
logger.info("Initializing {}.{}", keyspace.getName(), name);
// Create Memtable only on online
Memtable initialMemtable = null;
if (DatabaseDescriptor.isDaemonInitialized())
initialMemtable = new Memtable(new AtomicReference<>(CommitLog.instance.getContext()), this);
data = new Tracker(initialMemtable, loadSSTables);
// scan for sstables corresponding to this cf and load them
if (data.loadsstables)
{
Directories.SSTableLister sstableFiles = directories.sstableLister().skipTemporary(true);
Collection sstables = SSTableReader.openAll(sstableFiles.list().entrySet(), metadata, this.partitioner);
data.addInitialSSTables(sstables);
}
// compaction strategy should be created after the CFS has been prepared
this.compactionStrategyWrapper = new WrappingCompactionStrategy(this);
if (maxCompactionThreshold.value() <= 0 || minCompactionThreshold.value() <=0)
{
logger.warn("Disabling compaction strategy by setting compaction thresholds to 0 is deprecated, set the compaction option 'enabled' to 'false' instead.");
this.compactionStrategyWrapper.disable();
}
// create the private ColumnFamilyStores for the secondary column indexes
for (ColumnDefinition info : metadata.allColumns())
{
if (info.getIndexType() != null)
indexManager.addIndexedColumn(info);
}
if (registerBookkeeping)
{
// register the mbean
String type = this.partitioner instanceof LocalPartitioner ? "IndexColumnFamilies" : "ColumnFamilies";
mbeanName = "org.apache.cassandra.db:type=" + type + ",keyspace=" + this.keyspace.getName() + ",columnfamily=" + name;
MBeanWrapper.instance.registerMBean(this, mbeanName);
logger.trace("retryPolicy for {} is {}", name, this.metadata.getSpeculativeRetry());
// Recompute sampleLatencyNanos from the speculative retry policy; both the initial delay and the
// delay between runs equal the read RPC timeout (in milliseconds).
latencyCalculator = ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(new Runnable()
{
public void run()
{
SpeculativeRetry retryPolicy = ColumnFamilyStore.this.metadata.getSpeculativeRetry();
switch (retryPolicy.type)
{
case PERCENTILE:
// get percentile in nanos
sampleLatencyNanos = (long) (metric.coordinatorReadLatency.getSnapshot().getValue(retryPolicy.value));
break;
case CUSTOM:
// convert to nanos, since configuration is in millisecond
sampleLatencyNanos = (long) (retryPolicy.value * 1000d * 1000d);
break;
default:
// any other policy type: use the largest possible latency threshold
sampleLatencyNanos = Long.MAX_VALUE;
break;
}
}
}, DatabaseDescriptor.getReadRpcTimeout(), DatabaseDescriptor.getReadRpcTimeout(), TimeUnit.MILLISECONDS);
}
else
{
// No bookkeeping: schedule a no-op so latencyCalculator is still non-null and can be cancelled by invalidate().
latencyCalculator = ScheduledExecutors.optionalTasks.schedule(Runnables.doNothing(), 0, TimeUnit.NANOSECONDS);
mbeanName = null;
}
}
/**
 * Call when dropping or renaming a CF. Performs mbean housekeeping and invalidates this CFS to
 * other operations; equivalent to {@code invalidate(true)}.
 */
public void invalidate()
{
    final boolean expectMBean = true;
    invalidate(expectMBean);
}
/**
 * Marks this store invalid, unregisters its MBean, cancels the latency calculator, removes its
 * truncation record, drops its sstables from the tracker and invalidates indexes and caches.
 *
 * @param expectMBean when true, a failure to unregister the MBean is inspected and logged;
 *                    when false, such a failure is ignored silently
 */
public void invalidate(boolean expectMBean)
{
    // Flag the store as invalid before any teardown work starts.
    valid = false;

    try
    {
        unregisterMBean();
    }
    catch (Exception unregisterFailure)
    {
        if (expectMBean)
        {
            JVMStabilityInspector.inspectThrowable(unregisterFailure);
            // A failed unregistration must not prevent the rest of the teardown.
            logger.warn("Failed unregistering mbean: {}", mbeanName, unregisterFailure);
        }
    }

    latencyCalculator.cancel(false);
    SystemKeyspace.removeTruncationRecord(metadata.cfId);

    data.dropSSTables();
    indexManager.invalidate();
    invalidateCaches();
}
/**
 * Asks the Tracker to remove from its view every SSTable located in the given directory.
 *
 * @param directory the unreadable directory; it may or may not contain SSTables
 */
void maybeRemoveUnreadableSSTables(File directory)
{
    final Tracker tracker = data;
    tracker.removeUnreadableSSTables(directory);
}
/**
 * Unregisters this store's MBean if it is currently registered, then releases its metrics.
 */
void unregisterMBean()
{
    final boolean registered = MBeanWrapper.instance.isRegistered(mbeanName);
    if (registered)
    {
        MBeanWrapper.instance.unregisterMBean(mbeanName);
    }

    // The metrics are released whether or not an MBean was registered.
    metric.release();
}
/**
 * Creates a store named after {@code metadata.cfName}, using the partitioner returned by
 * {@code StorageService.getPartitioner()}.
 */
public static ColumnFamilyStore createColumnFamilyStore(Keyspace keyspace, CFMetaData metadata, boolean loadSSTables)
{
    final String columnFamily = metadata.cfName;
    final IPartitioner partitioner = StorageService.getPartitioner();
    return createColumnFamilyStore(keyspace, columnFamily, partitioner, metadata, loadSSTables);
}
public static synchronized ColumnFamilyStore createColumnFamilyStore(Keyspace keyspace,
String columnFamily,
IPartitioner partitioner,
CFMetaData metadata,
boolean loadSSTables)
{
// get the max generation number, to prevent generation conflicts
Directories directories = new Directories(metadata);
Directories.SSTableLister lister = directories.sstableLister().includeBackups(true);
List generations = new ArrayList();
for (Map.Entry> entry : lister.list().entrySet())
{
Descriptor desc = entry.getKey();
generations.add(desc.generation);
if (!desc.isCompatible())
throw new RuntimeException(String.format("Incompatible SSTable found. Current version %s is unable to read file: %s. Please run upgradesstables.",
desc.getFormat().getLatestVersion(), desc));
}
Collections.sort(generations);
int value = (generations.size() > 0) ? (generations.get(generations.size() - 1)) : 0;
return new ColumnFamilyStore(keyspace, columnFamily, partitioner, value, metadata, directories, loadSSTables);
}
/**
* Removes unnecessary files from the cf directory at startup: these include temp files, orphans, zero-length files
* and compacted sstables. Files that cannot be recognized will be ignored.
*/
public static void scrubDataDirectories(CFMetaData metadata)
{
Directories directories = new Directories(metadata);
// clear ephemeral snapshots that were not properly cleared last session (CASSANDRA-7357)
clearEphemeralSnapshots(directories);
// remove any left-behind SSTables from failed/stalled streaming
FileFilter filter = new FileFilter()
{
public boolean accept(File pathname)
{
return pathname.getPath().endsWith(StreamLockfile.FILE_EXT);
}
};
for (File dir : directories.getCFDirectories())
{
File[] lockfiles = dir.listFiles(filter);
// lock files can be null if I/O error happens
if (lockfiles == null || lockfiles.length == 0)
continue;
logger.info("Removing SSTables from failed streaming session. Found {} files to cleanup.", lockfiles.length);
for (File lockfile : lockfiles)
{
StreamLockfile streamLockfile = new StreamLockfile(lockfile);
streamLockfile.cleanup();
streamLockfile.delete();
}
}
logger.trace("Removing compacted SSTable files from {} (see http://wiki.apache.org/cassandra/MemtableSSTable)", metadata.cfName);
for (Map.Entry> sstableFiles : directories.sstableLister().list().entrySet())
{
Descriptor desc = sstableFiles.getKey();
Set components = sstableFiles.getValue();
if (desc.type.isTemporary)
{
SSTable.delete(desc, components);
continue;
}
File dataFile = new File(desc.filenameFor(Component.DATA));
if (components.contains(Component.DATA) && dataFile.length() > 0)
// everything appears to be in order... moving on.
continue;
// missing the DATA file! all components are orphaned
logger.warn("Removing orphans for {}: {}", desc, components);
for (Component component : components)
{
FileUtils.deleteWithConfirm(desc.filenameFor(component));
}
}
// cleanup incomplete saved caches
Pattern tmpCacheFilePattern = Pattern.compile(metadata.ksName + "-" + metadata.cfName + "-(Key|Row)Cache.*\\.tmp$");
File dir = new File(DatabaseDescriptor.getSavedCachesLocation());
if (dir.exists())
{
assert dir.isDirectory();
for (File file : dir.listFiles())
if (tmpCacheFilePattern.matcher(file.getName()).matches())
if (!file.delete())
logger.warn("could not delete {}", file.getAbsolutePath());
}
// also clean out any index leftovers.
for (ColumnDefinition def : metadata.allColumns())
{
if (def.isIndexed())
{
CellNameType indexComparator = SecondaryIndex.getIndexComparator(metadata, def);
if (indexComparator != null)
{
CFMetaData indexMetadata = CFMetaData.newIndexMetadata(metadata, def, indexComparator);
scrubDataDirectories(indexMetadata);
}
}
}
}
/**
* Replacing compacted sstables is atomic as far as observers of Tracker are concerned, but not on the
* filesystem: first the new sstables are renamed to "live" status (i.e., the tmp marker is removed), then
* their ancestors are removed.
*
* If an unclean shutdown happens at the right time, we can thus end up with both the new ones and their
* ancestors "live" in the system. This is harmless for normal data, but for counters it can cause overcounts.
*
* To prevent this, we record sstables being compacted in the system keyspace. If we find unfinished
* compactions, we remove the new ones (since those may be incomplete -- under LCS, we may create multiple
* sstables from any given ancestor).
*/
public static void removeUnusedSstables(CFMetaData metadata, Map unfinishedCompactions)
{
Directories directories = new Directories(metadata);
Set allGenerations = new HashSet<>();
for (Descriptor desc : directories.sstableLister().list().keySet())
allGenerations.add(desc.generation);
// sanity-check unfinishedCompactions
Set unfinishedGenerations = unfinishedCompactions.keySet();
if (!allGenerations.containsAll(unfinishedGenerations))
{
HashSet missingGenerations = new HashSet<>(unfinishedGenerations);
missingGenerations.removeAll(allGenerations);
logger.info("Unfinished compactions reference missing sstables of generations",
SafeArg.of("keyspace", metadata.ksName), SafeArg.of("cf", metadata.cfName),
SafeArg.of("missingGenerations", missingGenerations));
}
// remove new sstables from compactions that didn't complete, and compute
// set of ancestors that shouldn't exist anymore
Map> allSstableToAncestors = new HashMap<>();
Set completedAncestors = new HashSet<>();
Map> allNonTempSstableFiles = directories.sstableLister().skipTemporary(true).list();
for (Map.Entry> sstableFiles : allNonTempSstableFiles.entrySet())
{
// we rename the Data component last - if it does not exist as a final file, we should ignore this sstable and
// it will be removed during startup
if (!sstableFiles.getValue().contains(Component.DATA))
continue;
Descriptor desc = sstableFiles.getKey();
Set ancestors;
try
{
CompactionMetadata compactionMetadata = (CompactionMetadata) desc.getMetadataSerializer().deserialize(desc, MetadataType.COMPACTION);
ancestors = compactionMetadata.ancestors;
}
catch (IOException e)
{
throw new FSReadError(e, desc.filenameFor(Component.STATS));
}
catch (NullPointerException e)
{
throw new FSReadError(e, "Failed to remove unfinished compaction leftovers (file: " + desc.filenameFor(Component.STATS) + "). See log for details.");
}
allSstableToAncestors.put(desc, ancestors);
}
allSstableToAncestors = ColumnFamilyStoreManager.instance.filterValidAncestors(metadata, allSstableToAncestors, unfinishedCompactions);
SafeArg>> ancestorsArg = SafeArg.of(
"sstableToAncestors",
allSstableToAncestors.entrySet().stream()
.collect(Collectors.toMap(
(Map.Entry> e) -> e.getKey().generation,
Map.Entry::getValue)));
Set cleanedUnfinishedCompactions = new HashSet<>();
for (Map.Entry> sstableToAncestors : allSstableToAncestors.entrySet())
{
Descriptor desc = sstableToAncestors.getKey();
Set ancestors = sstableToAncestors.getValue();
if (!ancestors.isEmpty()
&& unfinishedGenerations.containsAll(ancestors)
&& allGenerations.containsAll(ancestors))
{
// any of the ancestors would work, so we'll just lookup the compaction task ID with the first one
UUID compactionTaskID = unfinishedCompactions.get(ancestors.iterator().next());
assert compactionTaskID != null;
if (DISABLE_COMPACTION_PRODUCT_CLEANUP)
{
logger.info("Would have deleted unfinished compaction product", UnsafeArg.of("desc", desc),
SafeArg.of("keyspace", desc.ksname), SafeArg.of("cf", desc.cfname),
SafeArg.of("generation", desc.generation), ancestorsArg);
}
else
{
logger.info("Going to delete unfinished compaction product", UnsafeArg.of("desc", desc),
SafeArg.of("keyspace", desc.ksname), SafeArg.of("cf", desc.cfname),
SafeArg.of("generation", desc.generation), ancestorsArg);
SSTable.delete(desc, allNonTempSstableFiles.get(desc));
}
cleanedUnfinishedCompactions.add(compactionTaskID);
}
else
{
completedAncestors.addAll(ancestors);
}
}
cleanedUnfinishedCompactions.forEach(SystemKeyspace::finishCompaction);
// remove old sstables from compactions that did complete
for (Map.Entry> sstableFiles : directories.sstableLister().list().entrySet())
{
Descriptor desc = sstableFiles.getKey();
if (completedAncestors.contains(desc.generation))
{
if (DRY_RUN_NON_COMPACTING_UNUSED_SSTABLE_CLEANUP && unfinishedCompactions.isEmpty())
{
logger.warn("Would have deleted leftover compaction ancestor", UnsafeArg.of("desc", desc),
SafeArg.of("keyspace", desc.ksname), SafeArg.of("cf", desc.cfname),
SafeArg.of("generation", desc.generation), ancestorsArg);
} else
{
// if any of the ancestors were participating in a compaction, finish that compaction
logger.warn("Going to delete leftover compaction ancestor", UnsafeArg.of("desc", desc),
SafeArg.of("keyspace", desc.ksname), SafeArg.of("cf", desc.cfname),
SafeArg.of("generation", desc.generation), ancestorsArg);
SSTable.delete(desc, sstableFiles.getValue());
Optional.ofNullable(unfinishedCompactions.get(desc.generation))
.ifPresent(SystemKeyspace::finishCompaction);
}
}
}
}
/**
 * See {@code StorageService#loadNewSSTables(String, String)} for more info.
 * Delegates to the three-argument overload with {@code assumeCfIsEmpty = false}.
 *
 * @param ksName The keyspace name
 * @param cfName The columnFamily name
 */
public static synchronized void loadNewSSTables(String ksName, String cfName)
{
    final boolean assumeCfIsEmpty = false;
    loadNewSSTables(ksName, cfName, assumeCfIsEmpty);
}
/**
 * See {@code StorageService#loadNewSSTables(String, String, boolean)} for more info.
 *
 * @param ksName The keyspace name
 * @param cfName The columnFamily name
 * @param assumeCfIsEmpty Whether or not we can assume the column family is empty before and while loading the new SSTables
 */
public static synchronized void loadNewSSTables(String ksName, String cfName, boolean assumeCfIsEmpty)
{
    // ks/cf existence checks will be done by open and getColumnFamilyStore for us
    ColumnFamilyStore store = Keyspace.open(ksName).getColumnFamilyStore(cfName);
    store.loadNewSSTables(assumeCfIsEmpty);
}
/**
 * {@inheritDoc}
 */
public synchronized void loadNewSSTables()
{
    final boolean assumeCfIsEmpty = false;
    loadNewSSTables(assumeCfIsEmpty);
}
/**
 * Same as {@link #loadNewSSTablesWithCount(boolean)}, discarding the returned count.
 *
 * @param assumeCfIsEmpty forwarded unchanged
 */
public synchronized void loadNewSSTables(boolean assumeCfIsEmpty)
{
    // The number of loaded sstables is intentionally ignored here.
    loadNewSSTablesWithCount(assumeCfIsEmpty);
}
/**
 * See {@code StorageService#loadNewSSTablesWithCount(String, String)} for more info.
 * Delegates to the three-argument overload with {@code assumeCfIsEmpty = false}.
 *
 * @param ksName The keyspace name
 * @param cfName The columnFamily name
 *
 * @return the number of new sstables loaded
 */
public static synchronized int loadNewSSTablesWithCount(String ksName, String cfName)
{
    final boolean assumeCfIsEmpty = false;
    return loadNewSSTablesWithCount(ksName, cfName, assumeCfIsEmpty);
}
/**
 * See {@code StorageService#loadNewSSTablesWithCount(String, String, boolean)} for more info.
 *
 * @param ksName The keyspace name
 * @param cfName The columnFamily name
 * @param assumeCfIsEmpty Whether or not we can assume the column family is empty before and while loading the new SSTables
 *
 * @return the number of new sstables loaded
 */
public static synchronized int loadNewSSTablesWithCount(String ksName, String cfName, boolean assumeCfIsEmpty)
{
    // ks/cf existence checks will be done by open and getColumnFamilyStore for us
    ColumnFamilyStore store = Keyspace.open(ksName).getColumnFamilyStore(cfName);
    return store.loadNewSSTablesWithCount(assumeCfIsEmpty);
}
/**
 * {@inheritDoc}
 */
public synchronized int loadNewSSTablesWithCount()
{
    final boolean assumeCfIsEmpty = false;
    return loadNewSSTablesWithCount(assumeCfIsEmpty);
}
public synchronized int loadNewSSTablesWithCount(boolean assumeCfIsEmpty)
{
if (assumeCfIsEmpty)
{
throw new UnsupportedOperationException("Loading new SSTables is not supported on version 2.2.18-1.165.0+.");
}
logger.info("Loading new SSTables for {}/{}{}...",
keyspace.getName(), name,
assumeCfIsEmpty ? " assuming the columnfamily is empty" : "");
Set currentDescriptors = new HashSet();
for (SSTableReader sstable : data.getView().sstables)
currentDescriptors.add(sstable.descriptor);
Set newSSTables = new HashSet<>();
Directories.SSTableLister lister = directories.sstableLister().skipTemporary(true);
for (Map.Entry> entry : lister.list().entrySet())
{
Descriptor descriptor = entry.getKey();
if (currentDescriptors.contains(descriptor))
continue; // old (initialized) SSTable found, skipping
if (descriptor.type.isTemporary) // in the process of being written
continue;
if (!descriptor.isCompatible())
throw new RuntimeException(String.format("Can't open incompatible SSTable! Current version %s, found file: %s",
descriptor.getFormat().getLatestVersion(),
descriptor));
// force foreign sstables to level 0
try
{
if (!assumeCfIsEmpty && new File(descriptor.filenameFor(Component.STATS)).exists())
descriptor.getMetadataSerializer().mutateLevel(descriptor, 0);
}
catch (IOException e)
{
SSTableReader.logOpenException(entry.getKey(), e);
continue;
}
// Increment the generation until we find a filename that doesn't exist. This is needed because the new
// SSTables that are being loaded might already use these generation numbers.
Descriptor newDescriptor;
do
{
newDescriptor = new Descriptor(descriptor.version,
descriptor.directory,
descriptor.ksname,
descriptor.cfname,
fileIndexGenerator.incrementAndGet(),
Descriptor.Type.FINAL,
descriptor.formatType);
}
while (new File(newDescriptor.filenameFor(Component.DATA)).exists());
logger.info("Renaming new SSTable {} to {}", descriptor, newDescriptor);
SSTableWriter.rename(descriptor, newDescriptor, entry.getValue());
SSTableReader reader;
try
{
reader = SSTableReader.open(newDescriptor, entry.getValue(), metadata, partitioner);
}
catch (IOException e)
{
SSTableReader.logOpenException(entry.getKey(), e);
continue;
}
newSSTables.add(reader);
}
if (newSSTables.isEmpty())
{
logger.info("No new SSTables were found for {}/{}", keyspace.getName(), name);
return 0;
}
logger.info("Loading new SSTables and building secondary indexes for {}/{}: {}", keyspace.getName(), name, newSSTables);
try (Refs refs = Refs.ref(newSSTables))
{
data.addSSTables(newSSTables);
indexManager.maybeBuildSecondaryIndexes(newSSTables, indexManager.allIndexesNames());
}
logger.info("Done loading load new SSTables for {}/{}", keyspace.getName(), name);
return newSSTables.size();
}
/**
 * Rebuilds the named secondary index of this column family by delegating to the static
 * {@code rebuildSecondaryIndex(ksName, cfName, idxNames...)} overload.
 *
 * @param idxName name of the index to rebuild
 */
public void rebuildSecondaryIndex(String idxName)
{
    final String ksName = keyspace.getName();
    final String cfName = metadata.cfName;
    rebuildSecondaryIndex(ksName, cfName, idxName);
}
/**
 * Rebuilds the named secondary indexes of the given column family from its current sstables.
 * <p>
 * The indexes are first marked as removed, then built from the sstables, then marked as built. References
 * on the sstables are held for the whole rebuild.
 *
 * @param ksName   keyspace that owns the column family
 * @param cfName   column family whose indexes are rebuilt
 * @param idxNames names of the indexes to rebuild
 */
public static void rebuildSecondaryIndex(String ksName, String cfName, String... idxNames)
{
    ColumnFamilyStore cfs = Keyspace.open(ksName).getColumnFamilyStore(cfName);

    Set<String> indexes = new HashSet<>(Arrays.asList(idxNames));

    Collection<SSTableReader> sstables = cfs.getSSTables();
    try (Refs<SSTableReader> refs = Refs.ref(sstables))
    {
        cfs.indexManager.setIndexRemoved(indexes);
        // parameterized logging: same text as before, without eagerly formatting the message
        logger.info("User Requested secondary index re-build for {}/{} indexes", ksName, cfName);
        cfs.indexManager.maybeBuildSecondaryIndexes(sstables, indexes);
        cfs.indexManager.setIndexBuilt(indexes);
    }
}
/**
 * @return the value of this store's {@code name} field
 */
public String getColumnFamilyName()
{
    return this.name;
}
/**
 * Builds the data-file path of a new temporary sstable in {@code directory}, using the format returned by
 * {@code DatabaseDescriptor.getSSTableFormat()} and that format's latest version.
 *
 * @param directory directory the sstable will live in
 * @return the path of the DATA component
 */
public String getTempSSTablePath(File directory)
{
    final Version latestVersion = DatabaseDescriptor.getSSTableFormat().info.getLatestVersion();
    final SSTableFormat.Type defaultFormat = DatabaseDescriptor.getSSTableFormat();
    return getTempSSTablePath(directory, latestVersion, defaultFormat);
}
/**
 * Builds the data-file path of a new temporary sstable in {@code directory}, using the latest version of
 * the given format.
 *
 * @param directory directory the sstable will live in
 * @param format    sstable format to use
 * @return the path of the DATA component
 */
public String getTempSSTablePath(File directory, SSTableFormat.Type format)
{
    final Version latestVersion = format.info.getLatestVersion();
    return getTempSSTablePath(directory, latestVersion, format);
}
/**
 * Allocates the next generation number from {@code fileIndexGenerator} and returns the DATA component
 * filename of a TEMP descriptor built from it.
 */
private String getTempSSTablePath(File directory, Version version, SSTableFormat.Type format)
{
    return new Descriptor(version, directory, keyspace.getName(), name,
                          fileIndexGenerator.incrementAndGet(), Descriptor.Type.TEMP, format)
           .filenameFor(Component.DATA);
}
/**
 * Switches the memtable only if the provided memtable is still the live one; otherwise just returns a
 * future from {@code waitForFlushes()}.
 *
 * @param memtable the memtable the caller expects to be current
 * @param reason String description of the cause of the memtable switch
 * @return the future of the switch, or of {@code waitForFlushes()} when no switch was performed
 */
public ListenableFuture switchMemtableIfCurrent(Memtable memtable, String reason)
{
    synchronized (data)
    {
        final boolean stillCurrent = data.getView().getCurrentMemtable() == memtable;
        if (stillCurrent)
            return switchMemtable(reason);
    }
    // somebody else already switched it out; this runs outside the monitor
    return waitForFlushes();
}
/*
* switchMemtable puts Memtable.getSortedContents on the writer executor. When the write is complete,
* we turn the writer into an SSTableReader and add it to ssTables where it is available for reads.
* This method does not block except for synchronizing on Tracker, but the Future it returns will
* not complete until the Memtable (and all prior Memtables) have been successfully flushed, and the CL
* marked clean up to the position owned by the Memtable.
*/
public ListenableFuture switchMemtable(String reason)
{
synchronized (data)
{
if (previousFlushFailure != null)
throw new IllegalStateException("A flush previously failed with the error below. To prevent data loss, "
+ "no flushes can be carried out until the node is restarted.",
previousFlushFailure);
logFlush(reason);
Flush flush = new Flush(false);
ListenableFutureTask flushTask = ListenableFutureTask.create(flush, null);
flushExecutor.execute(flushTask);
ListenableFutureTask task = ListenableFutureTask.create(flush.postFlush);
postFlushExecutor.execute(task);
@SuppressWarnings("unchecked")
ListenableFuture future =
// If either of the two tasks errors out, resulting future must also error out.
// Combine the two futures and only return post-flush result after both have completed.
// Note that flushTask will always yield null, but Futures.allAsList is
// order preserving, which is why the transform function returns the result
// from item 1 in it's input list (i.e. what was yielded by task).
Futures.transform(Futures.allAsList(flushTask, task),
new Function, ReplayPosition>()
{
public ReplayPosition apply(List input)
{
return (ReplayPosition) input.get(1);
}
}, MoreExecutors.directExecutor());
return future;
}
}
/**
 * Logs, at debug level, what the memtables being enqueued own on- and off-heap: this store's current
 * memtable plus the current memtable of every secondary index that has an index CFS.
 *
 * @param flushReason String description of the cause of the flush
 */
private void logFlush(String flushReason)
{
    // reclaiming includes that which we are GC-ing;
    float heapShare = 0, offHeapShare = 0;
    long heapOwned = 0, offHeapOwned = 0;

    MemtableAllocator own = getTracker().getView().getCurrentMemtable().getAllocator();
    heapShare += own.onHeap().ownershipRatio();
    offHeapShare += own.offHeap().ownershipRatio();
    heapOwned += own.onHeap().owns();
    offHeapOwned += own.offHeap().owns();

    for (SecondaryIndex index : indexManager.getIndexes())
    {
        // indexes without a backing CFS have no memtable to account for
        if (index.getIndexCfs() == null)
            continue;

        MemtableAllocator indexAllocator = index.getIndexCfs().getTracker().getView().getCurrentMemtable().getAllocator();
        heapShare += indexAllocator.onHeap().ownershipRatio();
        offHeapShare += indexAllocator.offHeap().ownershipRatio();
        heapOwned += indexAllocator.onHeap().owns();
        offHeapOwned += indexAllocator.offHeap().owns();
    }

    String usage = String.format("%d (%.0f%%) on-heap, %d (%.0f%%) off-heap",
                                 heapOwned, heapShare * 100, offHeapOwned, offHeapShare * 100);
    logger.debug("Enqueuing flush of {} for cause {}: {}", name, flushReason, usage);
}
/**
 * Flushes if this store or any of its index stores has a current memtable that is not clean.
 *
 * @param reason String description of the cause of the force flush
 *
 * @return a Future yielding the commit log position that can be guaranteed to have been successfully written
 *         to sstables for this table once the future completes
 */
public ListenableFuture forceFlush(String reason)
{
    synchronized (data)
    {
        Memtable live = data.getView().getCurrentMemtable();
        for (ColumnFamilyStore cfs : concatWithIndexes())
        {
            boolean dirty = !cfs.data.getView().getCurrentMemtable().isClean();
            if (dirty)
                return switchMemtableIfCurrent(live, reason);
        }
        return waitForFlushes();
    }
}
/**
 * Same as {@code forceFlush(String)} with the reason {@code "Unknown"}.
 */
public ListenableFuture forceFlush()
{
    final String reason = "Unknown";
    return forceFlush(reason);
}
/**
 * Flush if there is unflushed data that was written to the CommitLog before {@code flushIfDirtyBefore}
 * (inclusive).
 *
 * @param flushIfDirtyBefore commit log position the current memtable is checked against
 * @param reason String description of the cause of the force flush
 *
 * @return a Future yielding the commit log position that can be guaranteed to have been successfully written
 *         to sstables for this table once the future completes
 */
public ListenableFuture forceFlush(ReplayPosition flushIfDirtyBefore, String reason)
{
    // we don't loop through the remaining memtables since here we only care about commit log dirtiness
    // and this does not vary between a table and its table-backed indexes
    Memtable live = data.getView().getCurrentMemtable();
    return live.mayContainDataBefore(flushIfDirtyBefore)
           ? switchMemtableIfCurrent(live, reason)
           : waitForFlushes();
}
/**
 * Queues a task on the post-flush executor that returns the commit log lower bound of the memtable that
 * is current at the time of this call.
 *
 * @return a Future yielding the commit log position that can be guaranteed to have been successfully written
 *         to sstables for this table once the future completes
 */
private ListenableFuture waitForFlushes()
{
    // we grab the current memtable; once any preceding memtables have flushed, we know its
    // commitLogLowerBound has been set (as it is set with the upper bound of the preceding memtable)
    final Memtable liveMemtable = data.getView().getCurrentMemtable();
    ListenableFutureTask marker = ListenableFutureTask.create(() -> {
        logger.debug("forceFlush requested but everything is clean in {}", name);
        return liveMemtable.getCommitLogLowerBound();
    });
    postFlushExecutor.execute(marker);
    return marker;
}
/**
 * Calls {@code forceFlush(reason)} and waits on the returned future.
 *
 * @param reason String description of the cause of the flush
 * @return the value yielded by the flush future
 */
public ReplayPosition forceBlockingFlush(String reason)
{
    logger.debug("Flushing memtables on cf {} due to {}", name, reason);
    ListenableFuture pending = forceFlush(reason);
    return FBUtilities.waitOnFuture(pending);
}
/**
 * Same as {@code forceBlockingFlush(String)} with the reason {@code "Unknown"}.
 */
public ReplayPosition forceBlockingFlush()
{
    final String reason = "Unknown";
    return forceBlockingFlush(reason);
}
/**
 * Both synchronises custom secondary indexes and provides ordering guarantees for futures on switchMemtable/flush
 * etc, which expect to be able to wait until the flush (and all prior flushes) requested have completed.
 * <p>
 * {@link #call()} waits for the write barrier, optionally flushes the indexes not backed by a CFS, waits on
 * {@link #latch}, then discards completed commit log segments up to {@link #commitLogUpperBound} and hands
 * each flushed memtable/reader pair to the tracker and compaction strategy.
 */
private final class PostFlush implements Callable<ReplayPosition>
{
    final boolean flushSecondaryIndexes;
    final OpOrder.Barrier writeBarrier;
    // counted down by Flush.run() when its work is done; call() blocks on it
    final CountDownLatch latch = new CountDownLatch(1);
    final ReplayPosition commitLogUpperBound;
    // memtables.get(i) was flushed into readers.get(i); both lists are the ones owned by the Flush
    final List<Memtable> memtables;
    final List<SSTableReader> readers;

    private PostFlush(boolean flushSecondaryIndexes, OpOrder.Barrier writeBarrier, ReplayPosition commitLogUpperBound,
                      List<Memtable> memtables, List<SSTableReader> readers)
    {
        this.writeBarrier = writeBarrier;
        this.flushSecondaryIndexes = flushSecondaryIndexes;
        this.commitLogUpperBound = commitLogUpperBound;
        this.memtables = memtables;
        this.readers = readers;
    }

    /**
     * @return the commit log upper bound this post-flush was created with
     * @throws IllegalStateException if the thread is interrupted while waiting on the latch; the
     *         {@link InterruptedException} is attached as the cause
     */
    public ReplayPosition call()
    {
        // set by simulateFailedFlush(): terminate without touching the commit log
        if (discardFlushResults == ColumnFamilyStore.this)
            return commitLogUpperBound;

        writeBarrier.await();

        /*
         * we can flush 2is as soon as the barrier completes, as they will be consistent with (or ahead of) the
         * flushed memtables and CL position, which is as good as we can guarantee.
         * TODO: SecondaryIndex should support setBarrier(), so custom implementations can co-ordinate exactly
         * with CL as we do with memtables/CFS-backed SecondaryIndexes.
         */
        if (flushSecondaryIndexes)
        {
            for (SecondaryIndex index : indexManager.getIndexesNotBackedByCfs())
            {
                // flush any non-cfs backed indexes
                logger.info("Flushing SecondaryIndex {}", index);
                index.forceBlockingFlush("Flushing secondary index post-flush");
            }
        }

        try
        {
            // we wait on the latch for the commitLogUpperBound to be set, and so that waiters
            // on this task can rely on all prior flushes being complete
            latch.await();
        }
        catch (InterruptedException e)
        {
            // keep the interruption as the cause so the failure can be diagnosed
            throw new IllegalStateException(e);
        }

        CommitLog.instance.discardCompletedSegments(metadata.cfId, commitLogUpperBound);
        for (int i = 0 ; i < memtables.size() ; i++)
        {
            Memtable memtable = memtables.get(i);
            SSTableReader reader = readers.get(i);
            memtable.cfs.data.permitCompactionOfFlushed(reader);
            memtable.cfs.compactionStrategyWrapper.replaceFlushed(memtable, reader);
        }
        metric.pendingFlushes.dec();

        return commitLogUpperBound;
    }
}
/**
* Should only be constructed/used from switchMemtable() or truncate(), with ownership of the Tracker monitor.
* In the constructor the current memtable(s) are swapped, and a barrier on outstanding writes is issued;
* when run by the flushWriter the barrier is waited on to ensure all outstanding writes have completed
* before all memtables are immediately written, and the CL is either immediately marked clean or, if
* there are custom secondary indexes, the post flush clean up is left to update those indexes and mark
* the CL clean
*
* NOTE(review): the collections and the AtomicReference below are declared without type arguments in this
* copy of the file, yet their elements are used as Memtable / SSTableReader / ReplayPosition; the type
* arguments look like they were lost and should be confirmed against the upstream source.
*/
private final class Flush implements Runnable
{
// barrier on keyspace.writeOrder, created and issued in the constructor, awaited in run()
final OpOrder.Barrier writeBarrier;
// memtables switched out by the constructor; run() removes the ones that need no flush
final List memtables = new ArrayList<>();
// sstable readers produced by run(), appended in the same order as the memtables that were flushed
final List readers = new ArrayList<>();
// companion task sharing the two lists above; run() releases it through postFlush.latch
final PostFlush postFlush;
final boolean truncate;
/**
* Switches out the current memtable of this store and of every store returned by concatWithIndexes(),
* fixes the commit log upper bound, issues the write barrier and creates the PostFlush.
*
* @param truncate if true, nothing is written: run() discards every switched-out memtable
*/
private Flush(boolean truncate)
{
// if true, we won't flush, we'll just wait for any outstanding writes, switch the memtable, and discard
this.truncate = truncate;
metric.pendingFlushes.inc();
/**
* To ensure correctness of switch without blocking writes, run() needs to wait for all write operations
* started prior to the switch to complete. We do this by creating a Barrier on the writeOrdering
* that all write operations register themselves with, and assigning this barrier to the memtables,
* after which we *.issue()* the barrier. This barrier is used to direct write operations started prior
* to the barrier.issue() into the memtable we have switched out, and any started after to its replacement.
* In doing so it also tells the write operations to update the commitLogUpperBound of the memtable, so
* that we know the CL position we are dirty to, which can be marked clean when we complete.
*/
writeBarrier = keyspace.writeOrder.newBarrier();
// submit flushes for the memtable for any indexed sub-cfses, and our own
AtomicReference commitLogUpperBound = new AtomicReference<>();
for (ColumnFamilyStore cfs : concatWithIndexes())
{
// switch all memtables, regardless of their dirty status, setting the barrier
// so that we can reach a coordinated decision about cleanliness once they
// are no longer possible to be modified
Memtable newMemtable = new Memtable(commitLogUpperBound, cfs);
Memtable oldMemtable = cfs.data.switchMemtable(truncate, newMemtable);
oldMemtable.setDiscarding(writeBarrier, commitLogUpperBound);
memtables.add(oldMemtable);
}
// we then ensure an atomic decision is made about the upper bound of the continuous range of commit log
// records owned by this memtable
setCommitLogUpperBound(commitLogUpperBound);
// we then issue the barrier; this lets us wait for all operations started prior to the barrier to complete;
// since this happens after wiring up the commitLogUpperBound, we also know all operations with earlier
// replay positions have also completed, i.e. the memtables are done and ready to flush
writeBarrier.issue();
// the post-flush task flushes non-CFS-backed indexes only when this is a real flush (!truncate)
postFlush = new PostFlush(!truncate, writeBarrier, commitLogUpperBound.get(), memtables, readers);
}
/**
* Waits for the write barrier, drops the memtables that are clean (or all of them when truncating),
* flushes the rest to sstables and finally counts down postFlush.latch. On FSWriteError the latch is
* deliberately NOT counted down and previousFlushFailure is recorded before rethrowing.
*/
public void run()
{
// mark writes older than the barrier as blocking progress, permitting them to exceed our memory limit
// if they are stuck waiting on it, then wait for them all to complete
writeBarrier.markBlocking();
writeBarrier.await();
// mark all memtables as flushing, removing them from the live memtable list, and
// remove any memtables that are already clean from the set we need to flush
Iterator iter = memtables.iterator();
while (iter.hasNext())
{
Memtable memtable = iter.next();
memtable.cfs.data.markFlushing(memtable);
if (memtable.isClean() || truncate)
{
// nothing to write for this memtable: report a null reader and reclaim it straight away
memtable.cfs.data.replaceFlushed(memtable, null);
memtable.cfs.compactionStrategyWrapper.replaceFlushed(memtable, null);
reclaim(memtable);
iter.remove();
}
}
if (memtables.isEmpty())
{
// nothing left to flush; release the post-flush task immediately
postFlush.latch.countDown();
return;
}
metric.memtableSwitchCount.inc();
try
{
for (Memtable memtable : memtables)
{
// flush the memtable
SSTableReader reader = memtable.flush();
memtable.cfs.data.replaceFlushed(memtable, reader);
reclaim(memtable);
readers.add(reader);
}
// signal the post-flush we've done our work
// Note: This should not be done in case of error. Read more below.
postFlush.latch.countDown();
}
catch (FSWriteError e)
{
JVMStabilityInspector.inspectThrowable(e);
// The call above may kill the process or the transports, or ignore the error.
// In any case we should not be passing on control to post-flush as a subsequent succeeding flush
// could mask the error and:
// - let the commit log discard unpersisted data, resulting in data loss
// - let truncations proceed, with the possibility of resurrecting the unflushed data
// - let snapshots succeed with incomplete data
// Not passing control on means that all flushes from the moment of failure cannot complete
// (including snapshots).
// If the disk failure policy is ignore, this will cause memtables and the commit log to grow
// unboundedly until the node eventually fails.
previousFlushFailure = e;
throw e;
}
}
/**
* Issues a read barrier on readOrdering and, on the reclaim executor, waits for it before marking the
* memtable as discarded.
*/
private void reclaim(final Memtable memtable)
{
// issue a read barrier for reclaiming the memory, and offload the wait to another thread
final OpOrder.Barrier readBarrier = readOrdering.newBarrier();
readBarrier.issue();
reclaimExecutor.execute(new WrappedRunnable()
{
public void runMayThrow() throws InterruptedException, ExecutionException
{
readBarrier.await();
memtable.setDiscarded();
}
});
}
}
/**
 * Atomically sets the upper bound for the commit log.
 * <p>
 * Retries until the holder has been swapped, via compare-and-set, to a position taken from the current
 * commit log context that is not behind the value previously held.
 */
private static void setCommitLogUpperBound(AtomicReference commitLogUpperBound)
{
    // we attempt to set the holder to the current commit log context. at the same time all writes to the memtables are
    // also maintaining this value, so if somebody sneaks ahead of us somehow (should be rare) we simply retry,
    // so that we know all operations prior to the position have not reached it yet
    for (;;)
    {
        ReplayPosition candidate = new Memtable.LastReplayPosition(CommitLog.instance.getContext());
        ReplayPosition observed = commitLogUpperBound.get();
        boolean notBehind = observed == null || observed.compareTo(candidate) <= 0;
        if (notBehind && commitLogUpperBound.compareAndSet(observed, candidate))
            return;
    }
}
/**
 * This method should ONLY be used for testing commit log behaviour; it discards the current memtable
 * contents without marking the commit log clean, and prevents any proceeding flushes from marking
 * the commit log as done, however they *will* terminate (unlike under typical failures) to ensure
 * progress is made.
 */
@VisibleForTesting
public void simulateFailedFlush()
{
    discardFlushResults = this;
    Memtable replacement = new Memtable(new AtomicReference<>(CommitLog.instance.getContext()), this);
    Memtable switchedOut = data.switchMemtable(false, replacement);
    data.markFlushing(switchedOut);
}
/**
 * Clears {@code discardFlushResults}, undoing the marker set by {@code simulateFailedFlush()}.
 */
public void resumeFlushing()
{
    this.discardFlushResults = null;
}
/**
 * Finds the largest memtable, as a percentage of *either* on- or off-heap memory limits, and immediately
 * queues it for flushing. If the memtable selected is flushed before this completes, no work is done.
 */
public static class FlushLargestColumnFamily implements Runnable
{
    public void run()
    {
        float largestRatio = 0f;
        Memtable largest = null;
        float liveOnHeap = 0, liveOffHeap = 0;
        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
        {
            // we take a reference to the current main memtable for the CF prior to snapping its ownership ratios
            // to ensure we have some ordering guarantee for performing the switchMemtableIf(), i.e. we will only
            // swap if the memtables we are measuring here haven't already been swapped by the time we try to swap them
            Memtable current = cfs.getTracker().getView().getCurrentMemtable();

            // find the total ownership ratio for the memtable and all SecondaryIndexes owned by this CF,
            // both on- and off-heap, and select the largest of the two ratios to weight this CF
            float onHeap = 0f, offHeap = 0f;
            onHeap += current.getAllocator().onHeap().ownershipRatio();
            offHeap += current.getAllocator().offHeap().ownershipRatio();

            for (SecondaryIndex index : cfs.indexManager.getIndexes())
            {
                if (index.getIndexCfs() != null)
                {
                    MemtableAllocator allocator = index.getIndexCfs().getTracker().getView().getCurrentMemtable().getAllocator();
                    onHeap += allocator.onHeap().ownershipRatio();
                    offHeap += allocator.offHeap().ownershipRatio();
                }
            }

            float ratio = Math.max(onHeap, offHeap);
            if (ratio > largestRatio)
            {
                largest = current;
                largestRatio = ratio;
            }

            liveOnHeap += onHeap;
            liveOffHeap += offHeap;
        }

        if (largest != null)
        {
            float usedOnHeap = Memtable.MEMORY_POOL.onHeap.usedRatio();
            float usedOffHeap = Memtable.MEMORY_POOL.offHeap.usedRatio();
            float flushingOnHeap = Memtable.MEMORY_POOL.onHeap.reclaimingRatio();
            float flushingOffHeap = Memtable.MEMORY_POOL.offHeap.reclaimingRatio();
            float thisOnHeap = largest.getAllocator().onHeap().ownershipRatio();
            // read the off-heap allocator here; the on-heap ratio was previously reported twice
            float thisOffHeap = largest.getAllocator().offHeap().ownershipRatio();
            logger.debug("Flushing largest {} to free up room. Used total: {}, live: {}, flushing: {}, this: {}",
                         largest.cfs, ratio(usedOnHeap, usedOffHeap), ratio(liveOnHeap, liveOffHeap),
                         ratio(flushingOnHeap, flushingOffHeap), ratio(thisOnHeap, thisOffHeap));
            // only switches if 'largest' is still the live memtable of its store
            largest.cfs.switchMemtableIfCurrent(largest, "Flushing largest memtable to free up memtable space");
        }
    }
}
/**
 * Formats an on-heap / off-heap pair as {@code "<onHeap>/<offHeap>"}, each with two decimals.
 */
private static String ratio(float onHeap, float offHeap)
{
    final String pattern = "%.2f/%.2f";
    return String.format(pattern, onHeap, offHeap);
}
/**
 * Invalidates the cached row for {@code key} when the row cache is enabled; does nothing otherwise.
 *
 * @param key partition key whose cached row should be invalidated
 */
public void maybeUpdateRowCache(DecoratedKey key)
{
    if (isRowCacheEnabled())
        invalidateCachedRow(new RowCacheKey(metadata.ksAndCFName, key));
}
/**
 * Insert/Update the column family for this key.
 * <p>
 * The update is put into the memtable selected for {@code opGroup}/{@code replayPosition}; afterwards the
 * row cache entry for the key is invalidated (if enabled) and the write metrics are updated.
 *
 * @param key            key for update/insert
 * @param columnFamily   columnFamily changes
 * @param indexer        secondary index updater handed to the memtable
 * @param opGroup        write operation group used to select the memtable
 * @param replayPosition commit log position used to select the memtable
 */
public void apply(DecoratedKey key, ColumnFamily columnFamily, SecondaryIndexManager.Updater indexer, OpOrder.Group opGroup, ReplayPosition replayPosition)
{
    try (CloseableTracer ignored = CloseableTracer.startSpan("ColumnFamilyStore#apply"))
    {
        final long startNanos = System.nanoTime();

        // optional artificial delay taken from DatabaseDescriptor.getWriteDelay()
        final int delaySeconds = DatabaseDescriptor.getWriteDelay();
        if (delaySeconds > 0)
        {
            Tracing.trace("Sleeping for delay of {} seconds before performing write", delaySeconds);
            Uninterruptibles.sleepUninterruptibly(delaySeconds, TimeUnit.SECONDS);
        }

        final Memtable target = data.getMemtableFor(opGroup, replayPosition);
        final long timeDelta = target.put(key, columnFamily, indexer, opGroup);

        maybeUpdateRowCache(key);
        metric.samplers.get(Sampler.WRITES).addSample(key.getKey(), key.hashCode(), 1);
        metric.writeLatency.addNano(System.nanoTime() - startNanos);

        // CASSANDRA-11117 - certain resolution paths on memtable put can result in very
        // large time deltas, either through a variety of sentinel timestamps (used for empty values, ensuring
        // a minimal write, etc). This limits the time delta to the max value the histogram
        // can bucket correctly. This also filters the Long.MAX_VALUE case where there was no previous value
        // to update.
        if (timeDelta != Long.MAX_VALUE)
            metric.colUpdateTimeDeltaHistogram.update(Math.min(18165375903306L, timeDelta));
    }
}
/**
 * Purges gc-able top-level and range tombstones from {@code cf}.
 *
 * @param cf the column family to purge (modified in place)
 * @param gcBefore a timestamp (in seconds); tombstones with a localDeletionTime before this will be purged
 * @return {@code cf} if it still has columns or is still marked for delete, {@code null} otherwise
 */
public static ColumnFamily removeDeletedCF(ColumnFamily cf, int gcBefore)
{
    // purge old top-level and range tombstones
    cf.purgeTombstones(gcBefore);

    // anything left (columns or a deletion marker) means the caller still needs the object
    boolean stillHasContent = cf.hasColumns() || cf.isMarkedForDelete();
    return stillHasContent ? cf : null;
}
/**
 * Removes deleted columns and purges gc-able tombstones, without notifying any secondary index
 * (uses {@code SecondaryIndexManager.nullUpdater}).
 *
 * @return an updated `cf` if any columns or tombstones remain, null otherwise
 */
public static ColumnFamily removeDeleted(ColumnFamily cf, int gcBefore)
{
    final SecondaryIndexManager.Updater noIndexUpdates = SecondaryIndexManager.nullUpdater;
    return removeDeleted(cf, gcBefore, noIndexUpdates);
}
/**
 * Removes deleted columns, then purges gc-able tombstones.
 * <p>
 * This is complicated because we need to preserve deleted columns and columnfamilies
 * until they have been deleted for at least GC_GRACE_IN_SECONDS. But, we do not need to preserve
 * their contents; just the object itself as a "tombstone" that can be used to repair other
 * replicas that do not know about the deletion.
 *
 * @return {@code null} when {@code cf} is {@code null}; otherwise the result of
 *         {@code removeDeletedCF} applied to the column-purged {@code cf}
 */
public static ColumnFamily removeDeleted(ColumnFamily cf, int gcBefore, SecondaryIndexManager.Updater indexer)
{
    return cf == null
           ? null
           : removeDeletedCF(removeDeletedColumnsOnly(cf, gcBefore, indexer), gcBefore);
}
/**
 * Removes only per-cell tombstones, cells that are shadowed by a row-level or range tombstone, or
 * columns that have been dropped from the schema (for CQL3 tables only). Every removed cell is also
 * reported to {@code indexer}.
 *
 * @return the updated ColumnFamily (the same instance that was passed in)
 */
public static ColumnFamily removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore, SecondaryIndexManager.Updater indexer)
{
    BatchRemoveIterator cells = cf.batchRemoveIterator();
    DeletionInfo.InOrderTester shadowTester = cf.inOrderDeletionTester();
    boolean checkDropped = !cf.metadata.getDroppedColumns().isEmpty();

    while (cells.hasNext())
    {
        Cell cell = cells.next();

        // a cell goes away if
        // (a) the column itself is gcable or
        // (b) the column is shadowed by a CF tombstone
        // (c) the column has been dropped from the CF schema (CQL3 tables only)
        // each later check only runs when the earlier ones did not already decide
        boolean purge = cell.getLocalDeletionTime() < gcBefore;
        purge = purge || shadowTester.isDeleted(cell);
        purge = purge || (checkDropped && isDroppedColumn(cell, cf.metadata()));
        if (!purge)
            continue;

        cells.remove();
        indexer.remove(cell);
    }
    cells.commit();
    return cf;
}
// returns true if
// 1. this column has been dropped from schema and
// 2. if it has been re-added since then, this particular column was inserted before the last drop
//    (i.e. the cell timestamp is not after the recorded drop time)
private static boolean isDroppedColumn(Cell c, CFMetaData meta)
{
    Long droppedAt = meta.getDroppedColumns().get(c.name().cql3ColumnName(meta));
    if (droppedAt == null)
        return false;
    return c.timestamp() <= droppedAt;
}
/**
 * Removes from {@code cf} every cell that {@code isDroppedColumn} reports as dropped according to this
 * store's metadata. Does nothing when {@code cf} is null or its metadata lists no dropped columns.
 */
private void removeDroppedColumns(ColumnFamily cf)
{
    boolean nothingToDo = cf == null || cf.metadata.getDroppedColumns().isEmpty();
    if (nothingToDo)
        return;

    BatchRemoveIterator cells = cf.batchRemoveIterator();
    while (cells.hasNext())
    {
        if (!isDroppedColumn(cells.next(), metadata))
            continue;
        cells.remove();
    }
    cells.commit();
}
/**
* Finds the live sstables whose key range overlaps the key range of any of the given sstables.
*
* The given sstables are sorted by first key and their [first, last] ranges merged into a minimal list of
* non-overlapping intervals; the current view's interval tree is then searched once per merged interval.
*
* @param sstables the sstables to find overlaps for; may be empty
* @return sstables whose key range overlaps with that of the given sstables, not including itself.
* (The given sstables may or may not overlap with each other.) An empty set is returned for empty input.
*/
public Collection getOverlappingSSTables(Iterable sstables)
{
logger.trace("Checking for sstables overlapping {}", sstables);
// a normal compaction won't ever have an empty sstables list, but we create a skeleton
// compaction controller for streaming, and that passes an empty list.
if (!sstables.iterator().hasNext())
return ImmutableSet.of();
// work on a sorted copy so the caller's iterable is left untouched
List sortedByFirst = Lists.newArrayList(sstables);
Collections.sort(sortedByFirst, new Comparator()
{
@Override
public int compare(SSTableReader o1, SSTableReader o2)
{
return o1.first.compareTo(o2.first);
}
});
// NOTE(review): the type arguments of this declaration appear to have been lost (stray '>');
// restore them from the upstream source before compiling
List> intervals = new ArrayList<>();
// bounds of the interval currently being merged
DecoratedKey first = null, last = null;
/*
normalize the intervals covered by the sstables
assume we have sstables like this (brackets representing first/last key in the sstable);
[ ] [ ] [ ] [ ]
[ ] [ ]
then we can, instead of searching the interval tree 6 times, normalize the intervals and
only query the tree 2 times, for these intervals;
[ ] [ ]
*/
for (SSTableReader sstable : sortedByFirst)
{
if (first == null)
{
// very first sstable opens the first interval
first = sstable.first;
last = sstable.last;
}
else
{
if (sstable.first.compareTo(last) <= 0) // we do overlap
{
// extend the current interval only if this sstable reaches further
if (sstable.last.compareTo(last) > 0)
last = sstable.last;
}
else
{
// gap found: close the current interval and start a new one
intervals.add(Interval.create(first, last));
first = sstable.first;
last = sstable.last;
}
}
}
// close the last open interval (the input is known to be non-empty here)
intervals.add(Interval.create(first, last));
SSTableIntervalTree tree = data.getView().intervalTree;
Set results = new HashSet<>();
for (Interval interval : intervals)
results.addAll(tree.search(interval));
// exclude the sstables that were passed in
return Sets.difference(results, ImmutableSet.copyOf(sstables));
}
/**
 * like getOverlappingSSTables, but acquires references before returning; the lookup is repeated until
 * {@code Refs.tryRef} succeeds in referencing the whole overlapping set
 */
public Refs getAndReferenceOverlappingSSTables(Iterable sstables)
{
    Refs acquired;
    do
    {
        Iterable overlapped = getOverlappingSSTables(sstables);
        acquired = Refs.tryRef(overlapped);
    }
    while (acquired == null);
    return acquired;
}
/**
 * Adds a single sstable to this store by delegating to {@code addSSTables}.
 * Asserts that the sstable's column family name equals this store's name.
 *
 * @param sstable the sstable to add
 */
public void addSSTable(SSTableReader sstable)
{
    assert sstable.getColumnFamilyName().equals(name);
    Collection<SSTableReader> justThisOne = Arrays.asList(sstable);
    addSSTables(justThisOne);
}
/**
 * Adds the given sstables to the tracker and then submits this store to the compaction manager for
 * background compaction.
 *
 * @param sstables the sstables to add
 */
public void addSSTables(Collection sstables)
{
    this.data.addSSTables(sstables);
    CompactionManager.instance.submitBackground(this);
}
/**
* Calculate expected file size of SSTable after compaction.
*
* If operation type is {@code CLEANUP} and we're not dealing with an index sstable,
* then we calculate expected file size with checking token range to be eliminated.
*
* Otherwise, we just add up all the files' size, which is the worst case file
* size for compaction of all the list of files given.
*
* @param sstables SSTables to calculate expected compacted file size
* @param operation Operation type
* @return Expected file size of SSTable after compaction
*/
public long getExpectedCompactedFileSize(Iterable sstables, OperationType operation)
{
// anything other than a cleanup of a non-index store: worst case is the sum of the inputs
if (operation != OperationType.CLEANUP || isIndex())
{
return SSTableReader.getTotalBytes(sstables);
}
// cleanup size estimation only counts bytes for keys local to this node
long expectedFileSize = 0;
// NOTE(review): the type arguments of 'ranges' and 'positions' below appear to have been lost
// (stray '>'); restore them from the upstream source before compiling
Collection> ranges = StorageService.instance.getLocalRanges(keyspace.getName());
for (SSTableReader sstable : sstables)
{
List> positions = sstable.getPositionsForRanges(ranges);
// each pair is a (left, right) position span; its length is right - left
for (Pair position : positions)
expectedFileSize += position.right - position.left;
}
double compressionRatio = metric.compressionRatio.getValue();
// only scale when a positive ratio is reported; the compound assignment converts the
// double product back to long
if (compressionRatio > 0d)
expectedFileSize *= compressionRatio;
return expectedFileSize;
}
/**
 * Find the maximum size file in the list.
 *
 * @param sstables candidates to inspect
 * @return the sstable with the largest on-disk length (the first one seen on ties), or {@code null} when
 *         no candidate has an on-disk length greater than zero
 */
public SSTableReader getMaxSizeFile(Iterable sstables)
{
    SSTableReader largest = null;
    long largestLength = 0L;
    for (SSTableReader candidate : sstables)
    {
        if (candidate.onDiskLength() <= largestLength)
            continue;
        largestLength = candidate.onDiskLength();
        largest = candidate;
    }
    return largest;
}
/**
 * Asks the compaction manager to perform a cleanup of this store.
 *
 * @param jobs passed through to {@code CompactionManager.performCleanup}
 * @return the status reported by the compaction manager
 */
public CompactionManager.AllSSTableOpStatus forceCleanup(int jobs) throws ExecutionException, InterruptedException
{
    final CompactionManager.AllSSTableOpStatus status = CompactionManager.instance.performCleanup(this, jobs);
    return status;
}
/**
 * Returns true if all sstables in this CF do not need to be cleaned, or false if one or more need to be
 * cleaned or their status cannot be determined. The decision is delegated to
 * {@code CompactionManager.checkIfFullyClean}.
 *
 * @param jobs passed through to the compaction manager
 */
public boolean isFullyClean(int jobs) throws ExecutionException, InterruptedException
{
    final boolean fullyClean = CompactionManager.instance.checkIfFullyClean(this, jobs);
    return fullyClean;
}
/**
 * Scrubs this store; same as the six-argument overload with {@code alwaysFail} set to {@code false}.
 */
public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTLRows, int jobs) throws ExecutionException, InterruptedException
{
    final boolean alwaysFail = false;
    return scrub(disableSnapshot, skipCorrupted, alwaysFail, checkData, reinsertOverflowedTTLRows, jobs);
}
/**
 * Scrubs this store through the compaction manager, optionally taking a "pre-scrub-<millis>" snapshot
 * (without flushing) first.
 * <p>
 * If the scrub throws and {@code rebuildOnFailedScrub} manages to rebuild this store, the failure is
 * absorbed and reported as {@code ABORTED} when {@code alwaysFail} is set, {@code SUCCESSFUL} otherwise;
 * if it cannot rebuild, the original throwable is rethrown.
 */
public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boolean skipCorrupted, boolean alwaysFail, boolean checkData, boolean reinsertOverflowedTTLRows, int jobs) throws ExecutionException, InterruptedException
{
    // skip snapshot creation during scrub, SEE JIRA 5891
    if (!disableSnapshot)
        snapshotWithoutFlush("pre-scrub-" + System.currentTimeMillis());

    try
    {
        return CompactionManager.instance.performScrub(this, skipCorrupted, checkData, reinsertOverflowedTTLRows, jobs);
    }
    catch (Throwable failure)
    {
        if (rebuildOnFailedScrub(failure))
            return alwaysFail ? CompactionManager.AllSSTableOpStatus.ABORTED : CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
        throw failure;
    }
}
/**
 * CASSANDRA-5174 : For an index cfs we may be able to discard everything and just rebuild
 * the index when a scrub fails.
 *
 * @param failure the throwable that made the scrub fail; only its message is logged
 * @return true if we are an index cfs and we successfully rebuilt the index
 */
public boolean rebuildOnFailedScrub(Throwable failure)
{
    if (!isIndex())
        return false;

    // the owning index can only be located when the name carries the index separator
    if (!metadata.cfName.contains(Directories.SECONDARY_INDEX_NAME_SEPARATOR))
        return false;

    String[] parts = metadata.cfName.split("\\" + Directories.SECONDARY_INDEX_NAME_SEPARATOR, 2);
    ColumnFamilyStore parentCfs = keyspace.getColumnFamilyStore(parts[0]);
    SecondaryIndex index = parentCfs.indexManager.getIndexByName(metadata.cfName);
    assert index != null;

    // with assertions disabled a missing index simply means "could not rebuild"
    if (index == null)
        return false;

    truncateBlocking();

    logger.warn("Rebuilding index for {} because of <{}>", name, failure.getMessage());
    index.getBaseCfs().rebuildSecondaryIndex(index.getIndexName());
    return true;
}
/**
 * Asks the compaction manager to verify this store.
 *
 * @param extendedVerify passed through to {@code CompactionManager.performVerify}
 * @return the status reported by the compaction manager
 */
public CompactionManager.AllSSTableOpStatus verify(boolean extendedVerify) throws ExecutionException, InterruptedException
{
    final CompactionManager.AllSSTableOpStatus status = CompactionManager.instance.performVerify(this, extendedVerify);
    return status;
}
/**
 * Asks the compaction manager to rewrite this store's sstables.
 *
 * @param excludeCurrentVersion passed through to {@code CompactionManager.performSSTableRewrite}
 * @param jobs                  passed through to the compaction manager
 * @return the status reported by the compaction manager
 */
public CompactionManager.AllSSTableOpStatus sstablesRewrite(boolean excludeCurrentVersion, int jobs) throws ExecutionException, InterruptedException
{
    final CompactionManager.AllSSTableOpStatus status =
        CompactionManager.instance.performSSTableRewrite(this, excludeCurrentVersion, jobs);
    return status;
}
/**
 * Drops the given sstables from the tracker and fails (via {@code maybeFail}) if the drop reported a
 * problem.
 *
 * @param sstables       the sstables to drop; asserted to be non-empty
 * @param compactionType the operation type passed to the tracker
 */
public void markObsolete(Collection sstables, OperationType compactionType)
{
    assert !sstables.isEmpty();
    maybeFail(
        data.dropSSTables(Predicates.in(sstables), compactionType, null)
    );
}
/**
 * @return the current value of this store's {@code valid} flag
 */
public boolean isValid()
{
    return this.valid;
}
/**
 * Exposes this store's {@link Tracker} (the {@code data} field), e.g. for access from the
 * CompactionManager.
 *
 * @return the tracker instance held by this store
 */
public Tracker getTracker()
{
    return this.data;
}
/**
 * @return the sstables currently reported by this store's tracker
 */
public Collection getSSTables()
{
    return this.data.getSSTables();
}
/**
 * @return the sstables the tracker reports as permitted to compact
 */
public Iterable getPermittedToCompactSSTables()
{
    return this.data.getPermittedToCompact();
}
/**
 * @return the sstables the tracker reports as not currently compacting
 */
public Set getUncompactingSSTables()
{
    return this.data.getUncompacting();
}
/**
 * Convenience overload: builds a slice filter for this column family from the given
 * arguments and delegates to {@link #getColumnFamily(QueryFilter)}.
 */
public ColumnFamily getColumnFamily(DecoratedKey key,
                                    Composite start,
                                    Composite finish,
                                    boolean reversed,
                                    int limit,
                                    long timestamp)
{
    QueryFilter sliceQuery = QueryFilter.getSliceFilter(key, name, start, finish, reversed, limit, timestamp);
    return getColumnFamily(sliceQuery);
}
/**
 * Fetch the row and columns given by filter.key if it is in the cache; if not, read it from disk and cache it
 *
 * If row is cached, and the filter given is within its bounds, we return from cache, otherwise from disk
 *
 * If row is not cached, we figure out what filter is "biggest", read that from disk, then
 * filter the result and either cache that or return it.
 *
 * A sentinel is placed in the row cache before reading; it is replaced by the value to cache
 * on success, and removed again in the finally block when nothing ended up being cached.
 *
 * @param cfId the column family to read the row from (not used by the current implementation)
 * @param filter the columns being queried.
 * @return the requested data for the filter provided; may be null when the underlying read returns null
 */
private ColumnFamily getThroughCache(UUID cfId, QueryFilter filter)
{
    // Use a placeholder rather than concatenating the table name into the format string itself.
    assert isRowCacheEnabled()
    : String.format("Row cache is not enabled on table [%s]", name);
    RowCacheKey key = new RowCacheKey(metadata.ksAndCFName, filter.key);
    // attempt a sentinel-read-cache sequence. if a write invalidates our sentinel, we'll return our
    // (now potentially obsolete) data, but won't cache it. see CASSANDRA-3862
    // TODO: don't evict entire rows on writes (#2864)
    IRowCacheEntry cached = CacheService.instance.rowCache.get(key);
    if (cached != null)
    {
        if (cached instanceof RowCacheSentinel)
        {
            // Some other read is trying to cache the value, just do a normal non-caching read
            Tracing.trace("Row cache miss (race)");
            metric.rowCacheMiss.inc();
            return getTopLevelColumns(filter, Integer.MIN_VALUE, FilterExperiment.USE_LEGACY);
        }
        ColumnFamily cachedCf = (ColumnFamily)cached;
        if (isFilterFullyCoveredBy(filter.filter, cachedCf, filter.timestamp))
        {
            metric.rowCacheHit.inc();
            Tracing.trace("Row cache hit");
            ColumnFamily result = filterColumnFamily(cachedCf, filter);
            metric.updateSSTableIterated(0);
            return result;
        }
        metric.rowCacheHitOutOfRange.inc();
        Tracing.trace("Ignoring row cache as cached value could not satisfy query");
        return getTopLevelColumns(filter, Integer.MIN_VALUE, FilterExperiment.USE_LEGACY);
    }
    metric.rowCacheMiss.inc();
    Tracing.trace("Row cache miss");
    RowCacheSentinel sentinel = new RowCacheSentinel();
    boolean sentinelSuccess = CacheService.instance.rowCache.putIfAbsent(key, sentinel);
    ColumnFamily data = null;
    ColumnFamily toCache = null;
    try
    {
        // If we are explicitly asked to fill the cache with full partitions, we go ahead and query the whole thing
        if (metadata.getCaching().rowCache.cacheFullPartitions())
        {
            data = getTopLevelColumns(QueryFilter.getIdentityFilter(filter.key, name, filter.timestamp), Integer.MIN_VALUE, FilterExperiment.USE_LEGACY);
            toCache = data;
            Tracing.trace("Populating row cache with the whole partition");
            if (sentinelSuccess && toCache != null)
                CacheService.instance.rowCache.replace(key, sentinel, toCache);
            return filterColumnFamily(data, filter);
        }
        // Otherwise, if we want to cache the result of the query we're about to do, we must make sure this query
        // covers what needs to be cached. And if the user filter does not satisfy that, we sometimes extend said
        // filter so we can populate the cache but only if:
        // 1) we can guarantee it is a strict extension, i.e. that we will still fetch the data asked by the user.
        // 2) the extension does not make us query more than getRowsPerPartitionToCache() (as a mean to limit the
        // amount of extra work we'll do on a user query for the purpose of populating the cache).
        //
        // In practice, we can only guarantee those 2 points if the filter is one that queries the head of the
        // partition (and if that filter actually counts CQL3 rows since that's what we cache and it would be
        // bogus to compare the filter count to the 'rows to cache' otherwise).
        if (filter.filter.isHeadFilter() && filter.filter.countCQL3Rows(metadata.comparator))
        {
            SliceQueryFilter sliceFilter = (SliceQueryFilter)filter.filter;
            int rowsToCache = metadata.getCaching().rowCache.rowsToCache;
            SliceQueryFilter cacheSlice = readFilterForCache();
            QueryFilter cacheFilter = new QueryFilter(filter.key, name, cacheSlice, filter.timestamp);
            // If the filter count is less than the number of rows cached, we simply extend it to make sure we do cover the
            // number of rows to cache, and if that count is greater than the number of rows to cache, we simply filter what
            // needs to be cached afterwards.
            if (sliceFilter.count < rowsToCache)
            {
                toCache = getTopLevelColumns(cacheFilter, Integer.MIN_VALUE, FilterExperiment.USE_LEGACY);
                if (toCache != null)
                {
                    Tracing.trace("Populating row cache ({} rows cached)", cacheSlice.lastCounted());
                    data = filterColumnFamily(toCache, filter);
                }
            }
            else
            {
                data = getTopLevelColumns(filter, Integer.MIN_VALUE, FilterExperiment.USE_LEGACY);
                if (data != null)
                {
                    // The filter limit was greater than the number of rows to cache. But, if the filter had a non-empty
                    // finish bound, we may have gotten less than what needs to be cached, in which case we shouldn't cache it
                    // (otherwise a cache hit would assume the whole partition is cached which is not the case).
                    if (sliceFilter.finish().isEmpty() || sliceFilter.lastCounted() >= rowsToCache)
                    {
                        toCache = filterColumnFamily(data, cacheFilter);
                        Tracing.trace("Caching {} rows (out of {} requested)", cacheSlice.lastCounted(), sliceFilter.count);
                    }
                    else
                    {
                        Tracing.trace("Not populating row cache, not enough rows fetched ({} fetched but {} required for the cache)", sliceFilter.lastCounted(), rowsToCache);
                    }
                }
            }
            if (sentinelSuccess && toCache != null)
                CacheService.instance.rowCache.replace(key, sentinel, toCache);
            return data;
        }
        else
        {
            Tracing.trace("Fetching data but not populating cache as query does not query from the start of the partition");
            return getTopLevelColumns(filter, Integer.MIN_VALUE, FilterExperiment.USE_LEGACY);
        }
    }
    finally
    {
        // We own the sentinel but have nothing to cache: remove it so it does not linger in the cache.
        if (sentinelSuccess && toCache == null)
            invalidateCachedRow(key);
    }
}
/**
 * Builds the slice filter used to read the rows that populate the row cache.
 *
 * @return a new filter over all columns, not reversed, limited to the configured rows-to-cache
 */
public SliceQueryFilter readFilterForCache()
{
    // SliceQueryFilter is unfortunately mutable for now, so a fresh instance is created on every call.
    int rowsToCache = metadata.getCaching().rowCache.rowsToCache;
    int clusteringSize = metadata.clusteringColumns().size();
    return new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, rowsToCache, clusteringSize);
}
/**
 * Decides whether a cached partition can answer the given filter, i.e. whether no data the
 * filter could select is missing from the cached value.
 *
 * @param filter the filter being queried
 * @param cachedCf the cached partition
 * @param now the query timestamp, used when asking the filter whether it is covered
 * @return true if the whole partition is cached or the filter is fully covered by the cached data
 */
public boolean isFilterFullyCoveredBy(IDiskAtomFilter filter, ColumnFamily cachedCf, long now)
{
    // Case 1: the whole partition is cached. Expiring columns need care here: counting with a timestamp
    // newer than the one used when the cache was populated could make us believe the whole partition is
    // cached when really some rows just expired since then. Hence Integer.MIN_VALUE.
    if (cachedCf.liveCQL3RowCount(Integer.MIN_VALUE) < metadata.getCaching().rowCache.rowsToCache)
        return true;

    // Case 2: everything the filter selects is in the cached partition. Unlike the check above, this one
    // must honour the query timestamp, otherwise rows that have expired by now could make the cached
    // partition look sufficient when it is not.
    return filter.isFullyCoveredBy(cachedCf, now);
}
/**
 * @param now a timestamp that is divided by 1000 before use
 * @return {@code now / 1000} (truncated to int) minus the table's gc grace seconds
 */
public int gcBefore(long now)
{
    int nowDividedByThousand = (int) (now / 1000);
    return nowDividedByThousand - metadata.getGcGraceSeconds();
}
/**
 * get a list of columns starting from a given column, in a specified order.
 * only the latest version of a column is returned.
 *
 * Reads go through the row cache when it is enabled, otherwise through
 * {@code FilterExperiment.execute}. Read latency is always recorded, and an optional
 * configured read delay is applied before the read.
 *
 * @param filter the query; its column family name is asserted to match this store's name
 * @return null if there is no data and no tombstones; otherwise a ColumnFamily
 */
public ColumnFamily getColumnFamily(QueryFilter filter)
{
assert name.equals(filter.getColumnFamilyName()) : filter.getColumnFamilyName();
ColumnFamily result = null;
// Attach this store's metrics to slice filters so they are available to the filter during the read.
if (filter.filter instanceof SliceQueryFilter) {
((SliceQueryFilter) filter.filter).setMetrics(metric);
}
long start = System.nanoTime();
// Optional artificial delay, in seconds, taken from the database descriptor.
int readDelay = DatabaseDescriptor.getReadDelay();
if (readDelay > 0) {
Tracing.trace("Sleeping for delay of {} seconds before performing read", readDelay);
Uninterruptibles.sleepUninterruptibly(readDelay, TimeUnit.SECONDS);
}
try
{
int gcBefore = gcBefore(filter.timestamp);
if (isRowCacheEnabled())
{
assert !isIndex(); // CASSANDRA-5732
UUID cfId = metadata.cfId;
ColumnFamily cached = getThroughCache(cfId, filter);
if (cached == null)
{
logger.trace("cached row is empty");
return null;
}
result = cached;
}
else
{
// This boolean is not necessary for correctness, but is necessary for the metrics to be updated in the
// same cases, since slice queries skip updating metrics when no data was returned (for some reason).
// While this should be fixed, let's not do this in a PR that changes behaviour.
AtomicBoolean wasNotNull = new AtomicBoolean(false);
// Performs the read for a given experiment and gcBefore, dropping deleted data from non-null results.
BiFunction compute = (experiment, chosenGcBefore) -> {
ColumnFamily retrieved = getTopLevelColumns(filter, chosenGcBefore, experiment);
if (retrieved != null) {
wasNotNull.set(true);
retrieved = removeDeletedCF(retrieved, chosenGcBefore);
}
return retrieved;
};
// The second callback reads with a gcBefore that is 60 lower than the first.
result = FilterExperiment.execute(
experiment -> compute.apply(experiment, gcBefore),
experiment -> compute.apply(experiment, gcBefore - 60));
if (result == null && !wasNotNull.get())
return null;
}
removeDroppedColumns(result);
if (filter.filter instanceof SliceQueryFilter) {
recordMetrics((SliceQueryFilter) filter.filter);
}
}
finally
{
// Runs on every exit path, including the early returns above.
metric.readLatency.addNano(System.nanoTime() - start);
if (filter.filter instanceof SliceQueryFilter
&& ((SliceQueryFilter) filter.filter).hitTombstoneFailureThreshold())
{
metric.tombstoneFailures.inc();
}
}
return result;
}
/**
 * Publishes the counters the given slice filter gathered during its last use into this
 * store's metrics, and bumps the tombstone-warning counter when the filter reports that
 * its warn threshold was hit.
 */
private void recordMetrics(SliceQueryFilter filter)
{
    // Number of tombstones and live cells scanned on single key queries
    metric.tombstoneScannedHistogram.update(filter.lastTombstones());
    metric.liveScannedHistogram.update(filter.lastLive());

    metric.droppableTombstonesReadHistogram.update(filter.lastReadDroppableTombstones());
    metric.droppableTtlsReadHistogram.update(filter.lastReadDroppableTtls());
    metric.liveReadHistogram.update(filter.lastReadLive());
    metric.tombstonesReadHistogram.update(filter.lastReadTombstones());

    if (filter.hitTombstoneWarnThreshold())
    {
        metric.tombstoneWarnings.inc();
    }
}
/**
 * Applies a query filter to a cached row. The cached row itself is left untouched: the
 * result is collated into a shallow clone, from which deleted data is then removed.
 * The returned column family won't be thread safe.
 *
 * @param cached the cached row, may be null
 * @param filter the query to apply
 * @return null when {@code cached} is null, otherwise the result of {@code removeDeletedCF} on the filtered clone
 */
ColumnFamily filterColumnFamily(ColumnFamily cached, QueryFilter filter)
{
    if (cached != null)
    {
        ColumnFamily filtered = cached.cloneMeShallow(ArrayBackedSortedColumns.factory, filter.filter.isReversed());
        int gcThreshold = gcBefore(filter.timestamp);
        filter.collateOnDiskAtom(filtered, filter.getIterator(cached), gcThreshold);
        return removeDeletedCF(filtered, gcThreshold);
    }
    return null;
}
/**
 * @return a new set holding the sstables from {@link #getSSTables()} that are not repaired
 */
public Set getUnrepairedSSTables()
{
    Set<SSTableReader> candidates = new HashSet<>(getSSTables());
    // Drop everything already repaired; what is left is the unrepaired set.
    candidates.removeIf(SSTableReader::isRepaired);
    return candidates;
}
/**
 * @return a new set holding the sstables from {@link #getSSTables()} that are repaired
 */
public Set getRepairedSSTables()
{
    Set<SSTableReader> candidates = new HashSet<>(getSSTables());
    // Drop everything not yet repaired; what is left is the repaired set.
    candidates.removeIf(sstable -> !sstable.isRepaired());
    return candidates;
}
/**
 * Selects a view fragment with the given filter and takes references on all of its
 * sstables, retrying in a loop until the references can be acquired. A rate-limited
 * warning is logged while the loop keeps failing for more than 100ms.
 *
 * @param filter selects the sstables of interest from the current view
 * @return the selected sstables and memtables together with the acquired references
 */
@SuppressWarnings("resource")
public RefViewFragment selectAndReference(Function> filter)
{
// Start of the current failure window in nanoTime; non-positive means no failure recorded yet.
long failingSince = -1L;
while (true)
{
ViewFragment view = select(filter);
Refs refs = Refs.tryRef(view.sstables);
if (refs != null)
return new RefViewFragment(view.sstables, view.memtables, refs);
if (failingSince <= 0)
{
failingSince = System.nanoTime();
}
else if (System.nanoTime() - failingSince > TimeUnit.MILLISECONDS.toNanos(100))
{
// Collect the readers whose global reference count is zero for the log message.
List released = new ArrayList<>();
for (SSTableReader reader : view.sstables)
if (reader.selfRef().globalCount() == 0)
released.add(reader);
NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.SECONDS,
"Spinning trying to capture readers {}, released: {}, ", view.sstables, released);
// Restart the failure window so the next warning is again at least 100ms away.
failingSince = System.nanoTime();
}
}
}
/**
 * Builds a {@link ViewFragment} from the tracker's current view.
 *
 * @param filter applied to the view to pick sstables; not invoked when the view's interval tree is empty
 * @return the picked sstables (an empty list for an empty interval tree) plus all memtables of the view
 */
public ViewFragment select(Function> filter)
{
    View current = data.getView();
    if (current.intervalTree.isEmpty())
        return new ViewFragment(Collections.emptyList(), current.getAllMemtables());
    return new ViewFragment(filter.apply(current), current.getAllMemtables());
}
/**
 * @return a function that, given a view, searches its interval tree for the given
 * @param key and passes the result through the compaction strategy's read filter
 */
public Function> viewFilter(final DecoratedKey key)
{
    assert !key.isMinimum();
    return view -> compactionStrategyWrapper.filterSSTablesForReads(view.intervalTree.search(key));
}
/**
 * @return a function that, given a view, returns the sstables the view reports
 * for rows within @param rowBounds, inclusive.
 */
public Function> viewFilter(final AbstractBounds rowBounds)
{
    assert !AbstractBounds.strictlyWrapsAround(rowBounds.left, rowBounds.right);
    // Note that View.sstablesInBounds always includes its bounds while rowBounds may not. This is ok however
    // because the fact we restrict the sstables returned by this function is an optimization in the first
    // place and the returned sstables will (almost) never cover *exactly* rowBounds anyway. It's also
    // *very* unlikely that a sstable is included *just* because we consider one of the bounds inclusively
    // instead of exclusively, so the performance impact is negligible in practice.
    return view -> view.sstablesInBounds(rowBounds.left, rowBounds.right);
}
/**
 * @return a function that, given a view, returns the sstables the view reports
 * for rows for all of @param rowBoundsCollection, inclusive. Repaired sstables are
 * only included when @param includeRepaired is true.
 */
public Function> viewFilter(final Collection> rowBoundsCollection, final boolean includeRepaired)
{
assert AbstractBounds.noneStrictlyWrapsAround(rowBoundsCollection);
return new Function>()
{
public List apply(View view)
{
// A set, so an sstable matched by several bounds is only returned once.
Set sstables = Sets.newHashSet();
for (AbstractBounds rowBounds : rowBoundsCollection)
{
// Note that View.sstablesInBounds always includes its bounds while rowBounds may not. This is ok however
// because the fact we restrict the sstables returned by this function is an optimization in the first
// place and the returned sstables will (almost) never cover *exactly* rowBounds anyway. It's also
// *very* unlikely that a sstable is included *just* because we consider one of the bounds inclusively
// instead of exclusively, so the performance impact is negligible in practice.
for (SSTableReader sstable : view.sstablesInBounds(rowBounds.left, rowBounds.right))
{
if (includeRepaired || !sstable.isRepaired())
sstables.add(sstable);
}
}
logger.trace("ViewFilter for {}/{} sstables", sstables.size(), getSSTables().size());
return ImmutableList.copyOf(sstables);
}
};
}
/**
 * Lists the file names of the sstables that actually contain the given partition key.
 *
 * @param key the partition key in string form, parsed with the table's key validator
 * @return file names of the sstables in which the key has a position
 */
public List getSSTablesForKey(String key)
{
    final DecoratedKey decorated = partitioner.decorateKey(metadata.getKeyValidator().fromString(key));
    try (OpOrder.Group op = readOrdering.start())
    {
        List<String> fileNames = new ArrayList<>();
        for (SSTableReader candidate : select(viewFilter(decorated)).sstables)
        {
            // check if the key actually exists in this sstable, without updating cache and stats
            boolean present = candidate.getPosition(decorated, SSTableReader.Operator.EQ, false) != null;
            if (present)
            {
                fileNames.add(candidate.getFilename());
            }
        }
        return fileNames;
    }
}
/**
 * Runs a single-partition read through a {@link CollationController} inside a read-ordering group.
 * A non-null result adds a sample to the READS sampler; the number of sstables iterated is
 * always reported to the metrics.
 *
 * @param filter the query to run
 * @param gcBefore passed to the collation controller
 * @param experiment passed to the collation controller's read
 * @return whatever the collation controller returns, possibly null
 */
public ColumnFamily getTopLevelColumns(QueryFilter filter, int gcBefore, FilterExperiment experiment)
{
    Tracing.trace("Executing single-partition query on {}", name);
    final CollationController collation = new CollationController(this, filter, gcBefore);
    final ColumnFamily result;
    try (OpOrder.Group op = readOrdering.start())
    {
        result = collation.getTopLevelColumns(Memtable.MEMORY_POOL.needToCopyOnHeap(), experiment);
    }
    if (result != null)
    {
        metric.samplers.get(Sampler.READS).addSample(filter.key.getKey(), filter.key.hashCode(), 1);
    }
    metric.updateSSTableIterated(collation.getSstablesIterated());
    return result;
}
/**
 * Starts sampling on the named sampler.
 *
 * @param sampler name of a {@code Sampler} constant, resolved with {@code Sampler.valueOf}
 * @param capacity passed to the sampler's {@code beginSampling}
 */
public void beginLocalSampling(String sampler, int capacity)
{
    final Sampler which = Sampler.valueOf(sampler);
    metric.samplers.get(which).beginSampling(capacity);
}
/**
 * Finishes sampling on the named sampler and packages the result as JMX open data.
 *
 * @param sampler name of a {@code Sampler} constant, resolved with {@code Sampler.valueOf}
 * @param count passed to the sampler's {@code finishSampling}
 * @return a composite holding the sampler's cardinality and a table with one row per top-K counter
 */
public CompositeData finishLocalSampling(String sampler, int count) throws OpenDataException
{
SamplerResult samplerResults = metric.samplers.get(Sampler.valueOf(sampler))
.finishSampling(count);
TabularDataSupport result = new TabularDataSupport(COUNTER_TYPE);
for (Counter counter : samplerResults.topK)
{
// NOTE(review): array() returns the whole backing array; presumably the sampled buffers
// are exactly array-backed with no offset - confirm.
byte[] key = counter.getItem().array();
result.put(new CompositeDataSupport(COUNTER_COMPOSITE_TYPE, COUNTER_NAMES, new Object[] {
Hex.bytesToHex(key), // raw
counter.getCount(), // count
counter.getError(), // error
metadata.getKeyValidator().getString(ByteBuffer.wrap(key)) })); // string
}
return new CompositeDataSupport(SAMPLING_RESULT, SAMPLER_NAMES, new Object[]{
samplerResults.cardinality, result});
}
/**
 * @return the current value of the {@code compactionSpaceCheck} flag
 */
public boolean isCompactionDiskSpaceCheckEnabled()
{
    return this.compactionSpaceCheck;
}
/**
 * Sets the {@code compactionSpaceCheck} flag.
 *
 * @param enable the new value of the flag
 */
public void compactionDiskSpaceCheck(boolean enable)
{
    this.compactionSpaceCheck = enable;
}
/**
 * Evicts from the row cache (and, for counter tables, from the counter cache) every entry
 * of this table whose token is not within the local ranges of this keyspace.
 */
public void cleanupCache()
{
Collection> ranges = StorageService.instance.getLocalRanges(keyspace.getName());
// The cache iterators cover all tables, hence the ksAndCFName check on each key.
for (Iterator keyIter = CacheService.instance.rowCache.keyIterator();
keyIter.hasNext(); )
{
RowCacheKey key = keyIter.next();
DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.key));
if (key.ksAndCFName.equals(metadata.ksAndCFName) && !Range.isInRanges(dk.getToken(), ranges))
invalidateCachedRow(dk);
}
if (metadata.isCounter())
{
for (Iterator keyIter = CacheService.instance.counterCache.keyIterator();
keyIter.hasNext(); )
{
CounterCacheKey key = keyIter.next();
DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.partitionKey));
if (key.ksAndCFName.equals(metadata.ksAndCFName) && !Range.isInRanges(dk.getToken(), ranges))
CacheService.instance.counterCache.remove(key);
}
}
}
/**
 * Base class for closeable row scan iterators.
 */
public static abstract class AbstractScanIterator extends AbstractIterator implements CloseableIterator
{
/**
 * @return true by default; {@code filter(AbstractScanIterator, ExtendedFilter)} consults this
 * to decide whether each row must still be run through the extended filter
 */
public boolean needsFiltering()
{
return true;
}
}
/**
 * Iterate over a range of rows and columns from memtables/sstables.
 *
 * The returned iterator stops once a key past the range's stop key is seen and skips rows
 * whose key is not contained in the range.
 *
 * @param range The range of keys and columns within those keys to fetch
 * @param now passed to the underlying row iterator factory
 */
@SuppressWarnings("resource")
private AbstractScanIterator getSequentialIterator(final DataRange range, long now)
{
assert !(range.keyRange() instanceof Range) || !((Range>)range.keyRange()).isWrapAround() || range.keyRange().right.isMinimum() : range.keyRange();
final ViewFragment view = select(viewFilter(range.keyRange()));
Tracing.trace("Executing seq scan across {} sstables for {}", view.sstables.size(), range.keyRange().getString(metadata.getKeyValidator()));
final CloseableIterator iterator = RowIteratorFactory.getIterator(view.memtables, view.sstables, range, this, now);
// todo this could be pushed into SSTableScanner
return new AbstractScanIterator()
{
protected Row computeNext()
{
while (true)
{
// pull a row out of the iterator
if (!iterator.hasNext())
return endOfData();
Row current = iterator.next();
DecoratedKey key = current.key;
// a minimum stop key means there is no upper bound to enforce
if (!range.stopKey().isMinimum() && range.stopKey().compareTo(key) < 0)
return endOfData();
// skipping outside of assigned range
if (!range.contains(key))
continue;
if (logger.isTraceEnabled())
logger.trace("scanned {}", metadata.getKeyValidator().getString(key.getKey()));
return current;
}
}
public void close() throws IOException
{
iterator.close();
}
};
}
/**
 * Test-only overload of the range slice read that uses the current wall-clock time as "now".
 */
@VisibleForTesting
public List getRangeSlice(final AbstractBounds range,
                          List rowFilter,
                          IDiskAtomFilter columnFilter,
                          int maxResults)
{
    final long now = System.currentTimeMillis();
    return getRangeSlice(range, rowFilter, columnFilter, maxResults, now);
}
/**
 * Range slice read without CQL3 row counting and without paging: builds the extended
 * filter and delegates to {@code getRangeSlice(ExtendedFilter)}.
 */
public List getRangeSlice(final AbstractBounds range,
                          List rowFilter,
                          IDiskAtomFilter columnFilter,
                          int maxResults,
                          long now)
{
    ExtendedFilter extended = makeExtendedFilter(range, columnFilter, rowFilter, maxResults, false, false, now);
    return getRangeSlice(extended);
}
/**
 * Allows generic range paging with the slice column filter.
 * Typically, suppose we have rows A, B, C ... Z having each some columns in [1, 100].
 * And suppose we want to page through the query that for all rows returns the columns
 * within [25, 75]. For that, we need to be able to do a range slice starting at (row r, column c)
 * and ending at (row Z, column 75), *but* that only returns columns in [25, 75].
 * That is what this method allows. The columnRange is the "window" of columns we are interested
 * in for each row, and columnStart (resp. columnStop) is the start (resp. end) for the first
 * (resp. last) requested row.
 */
public ExtendedFilter makeExtendedFilter(AbstractBounds keyRange,
                                         SliceQueryFilter columnRange,
                                         Composite columnStart,
                                         Composite columnStop,
                                         List rowFilter,
                                         int maxResults,
                                         boolean countCQL3Rows,
                                         long now)
{
    return ExtendedFilter.create(this,
                                 new DataRange.Paging(keyRange, columnRange, columnStart, columnStop, metadata),
                                 rowFilter,
                                 maxResults,
                                 countCQL3Rows,
                                 now);
}
/**
 * Range slice read with explicit CQL3 row counting and paging flags: builds the extended
 * filter and delegates to {@code getRangeSlice(ExtendedFilter)}.
 */
public List getRangeSlice(AbstractBounds range,
                          List rowFilter,
                          IDiskAtomFilter columnFilter,
                          int maxResults,
                          long now,
                          boolean countCQL3Rows,
                          boolean isPaging)
{
    ExtendedFilter extended = makeExtendedFilter(range, columnFilter, rowFilter, maxResults, countCQL3Rows, isPaging, now);
    return getRangeSlice(extended);
}
/**
 * Builds the extended filter for a range read.
 *
 * When paging, the column filter must be a single-slice {@code SliceQueryFilter}; its start and
 * finish are handed to {@code DataRange.Paging} while the per-row filter selects all cells.
 * Otherwise a plain {@code DataRange} over the given column filter is used.
 */
public ExtendedFilter makeExtendedFilter(AbstractBounds range,
                                         IDiskAtomFilter columnFilter,
                                         List rowFilter,
                                         int maxResults,
                                         boolean countCQL3Rows,
                                         boolean isPaging,
                                         long timestamp)
{
    final DataRange dataRange;
    if (!isPaging)
    {
        dataRange = new DataRange(range, columnFilter);
    }
    else
    {
        assert columnFilter instanceof SliceQueryFilter;
        SliceQueryFilter requested = (SliceQueryFilter)columnFilter;
        assert requested.slices.length == 1;
        // create a new SliceQueryFilter that selects all cells, but pass the original slice start and finish
        // through to DataRange.Paging to be used on the first and last partitions
        SliceQueryFilter allCells = new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, requested.isReversed(), requested.count);
        dataRange = new DataRange.Paging(range, allCells, requested.start(), requested.finish(), metadata);
    }
    return ExtendedFilter.create(this, dataRange, rowFilter, maxResults, countCQL3Rows, timestamp);
}
/**
 * Executes a sequential range scan for the given filter inside a read-ordering group and
 * records the elapsed time in the range latency metric, whether or not the scan succeeds.
 *
 * @param filter the extended filter describing the range read
 * @return the rows produced by {@code filter(AbstractScanIterator, ExtendedFilter)}
 */
public List getRangeSlice(ExtendedFilter filter)
{
    final long startedAt = System.nanoTime();
    try (OpOrder.Group op = readOrdering.start())
    {
        AbstractScanIterator scan = getSequentialIterator(filter.dataRange, filter.timestamp);
        return filter(scan, filter);
    }
    finally
    {
        metric.rangeLatency.addNano(System.nanoTime() - startedAt);
    }
}
/**
 * Test-only overload of the indexed search that uses the current wall-clock time as "now".
 */
@VisibleForTesting
public List search(AbstractBounds range,
                   List clause,
                   IDiskAtomFilter dataFilter,
                   int maxResults)
{
    final long now = System.currentTimeMillis();
    return search(range, clause, dataFilter, maxResults, now);
}
/**
 * Indexed search without CQL3 row counting and without paging: builds the extended filter
 * and delegates to {@code search(ExtendedFilter)}.
 */
public List search(AbstractBounds range,
                   List clause,
                   IDiskAtomFilter dataFilter,
                   int maxResults,
                   long now)
{
    ExtendedFilter extended = makeExtendedFilter(range, dataFilter, clause, maxResults, false, false, now);
    return search(extended);
}
/**
 * Traces the key range being scanned and hands the filter to the index manager.
 *
 * @param filter the extended filter to search with
 * @return the result of the index manager's search
 */
public List search(ExtendedFilter filter)
{
    String scannedRange = filter.dataRange.keyRange().getString(metadata.getKeyValidator());
    Tracing.trace("Executing indexed scan for {}", scannedRange);
    return indexManager.search(filter);
}
/**
 * Drains rows from the given scan iterator, applies the extended filter where the iterator
 * asks for it, and collects the resulting rows until the filter's row or column limit is
 * reached. The iterator is always closed before returning.
 *
 * Throws {@code RowCountOverwhelmingException} as soon as more rows than the configured
 * row count failure threshold have been collected; logs a warning when the final row count
 * exceeds the configured warn threshold.
 *
 * @param rowIterator source of raw rows; closed by this method
 * @param filter limits, expressions and pruning to apply
 * @return the collected rows
 */
public List filter(AbstractScanIterator rowIterator, ExtendedFilter filter)
{
logger.trace("Filtering {} for rows matching {}", rowIterator, filter);
List rows = new ArrayList();
int columnsCount = 0;
int total = 0, matched = 0;
boolean ignoreTombstonedPartitions = filter.ignoreTombstonedPartitions();
int rowCountFailureThreshold = DatabaseDescriptor.getRowCountFailureThreshold();
try
{
// Last row pulled from the iterator; kept outside the loop for the warning message below.
Row rawRow = null;
while (rowIterator.hasNext() && matched < filter.maxRows() && columnsCount < filter.maxColumns())
{
// get the raw columns requested, and additional columns for the expressions if necessary
rawRow = rowIterator.next();
total++;
ColumnFamily data = rawRow.cf;
if (rowIterator.needsFiltering())
{
IDiskAtomFilter queryFilter = filter.columnFilter(rawRow.key.getKey());
if (queryFilter instanceof SliceQueryFilter) {
((SliceQueryFilter) queryFilter).setMetrics(metric);
}
// Fetch any extra columns the filter says it still needs and merge them into the row.
IDiskAtomFilter extraFilter = filter.getExtraFilter(rawRow.key, data);
if (extraFilter != null)
{
ColumnFamily cf = filter.cfs.getColumnFamily(new QueryFilter(rawRow.key, name, extraFilter, filter.timestamp));
if (cf != null)
data.addAll(cf);
}
removeDroppedColumns(data);
if (!filter.isSatisfiedBy(rawRow.key, data, null, null))
continue;
logger.trace("{} satisfies all filter expressions", data);
// cut the resultset back to what was requested, if necessary
data = filter.prune(rawRow.key, data);
if (queryFilter instanceof SliceQueryFilter) {
recordMetrics((SliceQueryFilter) queryFilter);
}
}
else
{
removeDroppedColumns(data);
}
Row row = new Row(rawRow.key, data);
metric.rangeScanBytesRead.mark(Row.serializer.serializedSize(row, MessagingService.current_version));
rows.add(row);
// When tombstoned partitions are ignored, rows holding only tombstones are still returned
// but do not count towards the row limit.
// NOTE(review): data is dereferenced here when ignoreTombstonedPartitions is set, yet
// null-checked on the next statement - confirm whether data can be null at this point.
if (!ignoreTombstonedPartitions || !data.hasOnlyTombstones(filter.timestamp))
matched++;
if (data != null)
columnsCount += filter.lastCounted(data);
// Update the underlying filter to avoid querying more columns per slice than necessary and to handle paging
filter.updateFilter(columnsCount);
if (rows.size() > rowCountFailureThreshold)
{
metric.rowCountFailures.inc();
int numTombstonedRows = countTombstonedRows(rows, filter);
Tracing.trace("Scanned over {} rows ({} tombstoned); query aborted (see rowcount_failure_threshold)",
rows.size(), numTombstonedRows);
throw new RowCountOverwhelmingException(rows.size(),
numTombstonedRows,
filter.maxRows(),
filter.cfs.metadata.ksName,
filter.cfs.metadata.cfName,
rawRow.key.toString(),
filter.dataRange.toString());
}
}
if (logger.isWarnEnabled() && rows.size() > DatabaseDescriptor.getRowCountWarnThreshold())
{
metric.rowCountWarnings.inc();
int numTombstonedRows = countTombstonedRows(rows, filter);
String msg = String.format("Scanned over %d rows (%d tombstoned) in %s.%s; " +
"%d rows were requested (see rowcount_warn_threshold); " +
"lastRow=%s; dataLimits=%s",
rows.size(),
numTombstonedRows,
filter.cfs.metadata.ksName,
filter.cfs.metadata.cfName,
filter.maxRows(),
rawRow == null ? "null" : rawRow.key.toString(),
filter.dataRange.toString());
// NOTE(review): this trace names tombstone_warn_threshold while the log message above
// names rowcount_warn_threshold - confirm which setting is intended.
Tracing.trace("Scanned over {} rows ({} tombstoned) (see tombstone_warn_threshold)",
rows.size(),
numTombstonedRows);
logger.warn(msg);
}
return rows;
}
finally
{
try
{
rowIterator.close();
Tracing.trace("Scanned {} rows and matched {}", total, matched);
}
catch (IOException e)
{
// NOTE(review): throwing from finally replaces any exception raised in the try block above.
throw new RuntimeException(e);
}
}
}
/**
 * Counts the rows whose column family reports holding only tombstones at the filter's timestamp.
 *
 * @param rows the rows to inspect
 * @param filter supplies the timestamp used for the check
 * @return the number of tombstone-only rows
 */
private int countTombstonedRows(List rows, ExtendedFilter filter)
{
    int tombstoned = 0;
    for (Row candidate : rows)
    {
        tombstoned += candidate.cf.hasOnlyTombstones(filter.timestamp) ? 1 : 0;
    }
    return tombstoned;
}
/**
 * @return the comparator held by this store's metadata
 */
public CellNameType getComparator()
{
    return this.metadata.comparator;
}
/**
 * Takes a non-ephemeral snapshot of all sstables (no predicate) without flushing first.
 *
 * @param snapshotName the name of the snapshot
 */
public void snapshotWithoutFlush(String snapshotName)
{
    final boolean ephemeral = false;
    snapshotWithoutFlush(snapshotName, null, ephemeral);
}
/**
 * Snapshots this store and its indexes without flushing: links every selected sstable into the
 * snapshot directory, then writes the snapshot manifest and, if requested, the ephemeral marker.
 *
 * @param snapshotName the name of the snapshot
 * @param predicate selects which sstables to include; null includes all of them
 * @param ephemeral If this flag is set to true, the snapshot will be cleaned during next startup
 * @return the sstables that were included in the snapshot
 */
public Set snapshotWithoutFlush(String snapshotName, Predicate predicate, boolean ephemeral)
{
    logger.debug("Taking snapshot without flush for {}", name);
    final Set<SSTableReader> captured = new HashSet<>();
    final JSONArray manifestEntries = new JSONArray();
    for (ColumnFamilyStore cfs : concatWithIndexes())
    {
        try (RefViewFragment currentView = cfs.selectAndReference(CANONICAL_SSTABLES))
        {
            for (SSTableReader ssTable : currentView.sstables)
            {
                boolean selected = predicate == null || predicate.apply(ssTable);
                if (selected)
                {
                    File snapshotDirectory = Directories.getSnapshotDirectory(ssTable.descriptor, snapshotName);
                    ssTable.createLinks(snapshotDirectory.getPath()); // hard links
                    manifestEntries.add(ssTable.descriptor.relativeFilenameFor(Component.DATA));
                    if (logger.isTraceEnabled())
                        logger.trace("Snapshot for {} keyspace data file {} created in {}", keyspace, ssTable.getFilename(), snapshotDirectory);
                    captured.add(ssTable);
                }
            }
        }
    }
    writeSnapshotManifest(manifestEntries, snapshotName);
    if (ephemeral)
    {
        createEphemeralSnapshotMarkerFile(snapshotName);
    }
    return captured;
}
/**
 * Writes the snapshot manifest: a JSON object with a single "files" entry holding the given array.
 * The parent directory of the manifest is created if it does not exist yet.
 *
 * @param filesJSONArr the file names to record in the manifest
 * @param snapshotName the snapshot whose manifest file is written
 * @throws FSWriteError if the manifest cannot be opened or written
 */
private void writeSnapshotManifest(final JSONArray filesJSONArr, final String snapshotName)
{
    final File manifestFile = directories.getSnapshotManifestFile(snapshotName);
    try
    {
        if (!manifestFile.getParentFile().exists())
            manifestFile.getParentFile().mkdirs();
        try (PrintStream out = new PrintStream(manifestFile))
        {
            final JSONObject manifestJSON = new JSONObject();
            manifestJSON.put("files", filesJSONArr);
            out.println(manifestJSON.toJSONString());
            // PrintStream never throws on write failures; it only records them. Check explicitly
            // so a truncated or missing manifest is reported instead of silently accepted.
            if (out.checkError())
                throw new IOException("Failed to write snapshot manifest " + manifestFile);
        }
    }
    catch (IOException e)
    {
        throw new FSWriteError(e, manifestFile);
    }
}
/**
 * Creates the marker file that flags a snapshot as ephemeral, creating its parent directory
 * first when needed. Failure to create the marker is logged as a warning and otherwise ignored.
 *
 * @param snapshot the name of the ephemeral snapshot
 */
private void createEphemeralSnapshotMarkerFile(final String snapshot)
{
    final File marker = directories.getNewEphemeralSnapshotMarkerFile(snapshot);
    try
    {
        final File parent = marker.getParentFile();
        if (!parent.exists())
        {
            parent.mkdirs();
        }
        Files.createFile(marker.toPath());
        logger.trace("Created ephemeral snapshot marker file on {}.", marker.getAbsolutePath());
    }
    catch (IOException e)
    {
        String warning = String.format("Could not create marker file %s for ephemeral snapshot %s. " +
                                       "In case there is a failure in the operation that created " +
                                       "this snapshot, you may need to clean it manually afterwards.",
                                       marker.getAbsolutePath(), snapshot);
        logger.warn(warning, e);
    }
}
/**
 * Clears every ephemeral snapshot listed by the given directories.
 *
 * @param directories lists the ephemeral snapshots and supplies the column family directories to clear
 */
protected static void clearEphemeralSnapshots(Directories directories)
{
    for (String leftover : directories.listEphemeralSnapshots())
    {
        logger.trace("Clearing ephemeral snapshot {} leftover from previous session.", leftover);
        Directories.clearSnapshot(leftover, directories.getCFDirectories());
    }
}
/**
 * Collects referenced readers for all sstables of the given snapshot. For each snapshot
 * sstable, an active sstable with the same generation is preferred; otherwise the snapshot
 * sstable is opened.
 *
 * @param tag the snapshot name
 * @return references to one reader per snapshot sstable
 * @throws IOException if a snapshot sstable fails to open; references taken so far are released first
 */
public Refs getSnapshotSSTableReader(String tag) throws IOException
{
// Index the active sstables of the current view by generation.
Map active = new HashMap<>();
for (SSTableReader sstable : data.getView().sstables)
active.put(sstable.descriptor.generation, sstable);
Map> snapshots = directories.sstableLister().snapshots(tag).list();
Refs refs = new Refs<>();
try
{
for (Map.Entry> entries : snapshots.entrySet())
{
// Try acquire reference to an active sstable instead of snapshot if it exists,
// to avoid opening new sstables. If it fails, use the snapshot reference instead.
SSTableReader sstable = active.get(entries.getKey().generation);
if (sstable == null || !refs.tryRef(sstable))
{
if (logger.isTraceEnabled())
logger.trace("using snapshot sstable {}", entries.getKey());
// open without tracking hotness
sstable = SSTableReader.open(entries.getKey(), entries.getValue(), metadata, partitioner, true, false);
refs.tryRef(sstable);
// release the self ref as we never add the snapshot sstable to DataTracker where it is otherwise released
sstable.selfRef().release();
}
else if (logger.isTraceEnabled())
{
logger.trace("using active sstable {}", entries.getKey());
}
}
}
catch (IOException | RuntimeException e)
{
// In case one of the snapshot sstables fails to open,
// we must release the references to the ones we opened so far
refs.release();
throw e;
}
return refs;
}
/**
 * Takes a non-ephemeral snapshot of this column family store, including all sstables.
 *
 * @param snapshotName the name associated with the snapshot
 * @return the sstables included in the snapshot
 */
public Set snapshot(String snapshotName)
{
    final boolean ephemeral = false;
    return snapshot(snapshotName, null, ephemeral);
}
/**
 * Flushes this store, then snapshots it.
 *
 * @param snapshotName the name associated with the snapshot
 * @param predicate forwarded to {@code snapshotWithoutFlush}
 * @param ephemeral If this flag is set to true, the snapshot will be cleaned up during next startup
 * @return the sstables included in the snapshot
 */
public Set snapshot(String snapshotName, Predicate predicate, boolean ephemeral)
{
    // Flush first; the snapshot itself is taken without a further flush.
    forceBlockingFlush("Snapshot");
    return snapshotWithoutFlush(snapshotName, predicate, ephemeral);
}
/**
 * @param snapshotName the snapshot to look for
 * @return whether this store's directories report that the snapshot exists
 */
public boolean snapshotExists(String snapshotName)
{
    return this.directories.snapshotExists(snapshotName);
}
/**
 * @param snapshotName the snapshot to inspect
 * @return the creation time reported by this store's directories for the snapshot
 */
public long getSnapshotCreationTime(String snapshotName)
{
    return this.directories.snapshotCreationTime(snapshotName);
}
/**
 * Clear snapshots of this column family.
 *
 * @param snapshotName the user supplied snapshot name. If left empty,
 * all the snapshots will be cleaned.
 */
public void clearSnapshot(String snapshotName)
{
    Directories.clearSnapshot(snapshotName, directories.getCFDirectories());
}
/**
*
* @return Return a map of all snapshots to space being used
* The pair for a snapshot has true size and size on disk.
*/
public Map> getSnapshotDetails()
{
return directories.getSnapshotDetails();
}
/**
 * @return true when the live-disk-space counter is strictly below the total-disk-space counter
 */
public boolean hasUnreclaimedSpace()
{
    long liveBytes = metric.liveDiskSpaceUsed.getCount();
    long totalBytes = metric.totalDiskSpaceUsed.getCount();
    return liveBytes < totalBytes;
}
/**
 * Looks up the cached row for the given key without populating the cache.
 *
 * Unlike getThroughCache, it will not readAndCache the row if it is not present, nor
 * are these calls counted in cache statistics.
 *
 * Note that this WILL cause deserialization of a SerializingCache row, so if all you
 * need to know is whether a row is present or not, use containsCachedRow instead.
 *
 * @param key the partition key to look up
 * @return the cached row, or null if the row cache is disabled, the key is absent,
 *         or the entry is a RowCacheSentinel
 */
public ColumnFamily getRawCachedRow(DecoratedKey key)
{
    if (!isRowCacheEnabled())
        return null;

    RowCacheKey cacheKey = new RowCacheKey(metadata.ksAndCFName, key);
    IRowCacheEntry entry = CacheService.instance.rowCache.getInternal(cacheKey);
    if (entry == null || entry instanceof RowCacheSentinel)
        return null;
    return (ColumnFamily) entry;
}
/**
 * Invalidates this table's key cache and row cache entries, and its counter cache
 * entries too when the table is a counter table.
 */
private void invalidateCaches()
{
    final CacheService caches = CacheService.instance;
    caches.invalidateKeyCacheForCf(metadata.ksAndCFName);
    caches.invalidateRowCacheForCf(metadata.ksAndCFName);
    if (metadata.isCounter())
    {
        caches.invalidateCounterCacheForCf(metadata.ksAndCFName);
    }
}
public int invalidateRowCache(Collection> boundsToInvalidate)
{
int invalidatedKeys = 0;
for (Iterator keyIter = CacheService.instance.rowCache.keyIterator();
keyIter.hasNext(); )
{
RowCacheKey key = keyIter.next();
DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.key));
if (key.ksAndCFName.equals(metadata.ksAndCFName) && Bounds.isInBounds(dk.getToken(), boundsToInvalidate))
{
invalidateCachedRow(dk);
invalidatedKeys++;
}
}
return invalidatedKeys;
}
public int invalidateCounterCache(Collection> boundsToInvalidate)
{
int invalidatedKeys = 0;
for (Iterator keyIter = CacheService.instance.counterCache.keyIterator();
keyIter.hasNext(); )
{
CounterCacheKey key = keyIter.next();
DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.partitionKey));
if (key.ksAndCFName.equals(metadata.ksAndCFName) && Bounds.isInBounds(dk.getToken(), boundsToInvalidate))
{
CacheService.instance.counterCache.remove(key);
invalidatedKeys++;
}
}
return invalidatedKeys;
}
/**
 * @param key the partition key to check
 * @return true if the row cache has non-zero capacity and contains an entry for the key
 */
public boolean containsCachedRow(DecoratedKey key)
{
    if (CacheService.instance.rowCache.getCapacity() == 0)
        return false;
    RowCacheKey cacheKey = new RowCacheKey(metadata.ksAndCFName, key);
    return CacheService.instance.rowCache.containsKey(cacheKey);
}
/**
 * Removes the entry for the given key from the row cache.
 *
 * @param key the row cache key to remove
 */
public void invalidateCachedRow(RowCacheKey key)
{
    final CacheService caches = CacheService.instance;
    caches.rowCache.remove(key);
}
/**
 * Removes the row cache entry for the given partition key, unless the schema has no id
 * for this keyspace/table name (the secondary index case), in which case nothing happens.
 *
 * @param key the partition key whose cached row should be dropped
 */
public void invalidateCachedRow(DecoratedKey key)
{
    boolean knownToSchema = Schema.instance.getId(keyspace.getName(), this.name) != null;
    if (knownToSchema)
    {
        invalidateCachedRow(new RowCacheKey(metadata.ksAndCFName, key));
    }
    // otherwise: secondary index, nothing to invalidate
}
/**
 * @param partitionKey the partition key of the counter
 * @param cellName the cell name of the counter
 * @return the counter cache entry for that cell, or null when the counter cache has zero capacity
 */
public ClockAndCount getCachedCounter(ByteBuffer partitionKey, CellName cellName)
{
    boolean counterCacheDisabled = CacheService.instance.counterCache.getCapacity() == 0L;
    if (counterCacheDisabled)
        return null;

    CounterCacheKey cacheKey = CounterCacheKey.create(metadata.ksAndCFName, partitionKey, cellName);
    return CacheService.instance.counterCache.get(cacheKey);
}
/**
 * Stores a counter value in the counter cache; does nothing when the cache has zero capacity.
 *
 * @param partitionKey the partition key of the counter
 * @param cellName the cell name of the counter
 * @param clockAndCount the value to cache
 */
public void putCachedCounter(ByteBuffer partitionKey, CellName cellName, ClockAndCount clockAndCount)
{
    boolean counterCacheEnabled = CacheService.instance.counterCache.getCapacity() != 0L;
    if (counterCacheEnabled)
    {
        CounterCacheKey cacheKey = CounterCacheKey.create(metadata.ksAndCFName, partitionKey, cellName);
        CacheService.instance.counterCache.put(cacheKey, clockAndCount);
    }
}
/**
 * Runs a major compaction with {@code splitOutput = false}.
 */
public void forceMajorCompaction() throws InterruptedException, ExecutionException
{
    final boolean splitOutput = false;
    forceMajorCompaction(splitOutput);
}
/**
 * Asks the CompactionManager to perform a maximal compaction of this store.
 *
 * @param splitOutput forwarded unchanged to {@code CompactionManager.performMaximal}
 */
public void forceMajorCompaction(boolean splitOutput) throws InterruptedException, ExecutionException
{
    final CompactionManager manager = CompactionManager.instance;
    manager.performMaximal(this, splitOutput);
}
public static Iterable all()
{
List> stores = new ArrayList>(Schema.instance.getKeyspaces().size());
for (Keyspace keyspace : Keyspace.all())
{
stores.add(keyspace.getColumnFamilyStores());
}
return Iterables.concat(stores);
}
/**
 * Collects the key samples of every canonical sstable for the given range.
 *
 * NOTE(review): the sstable references are released when the try block exits, while the
 * returned iterable is a concatenated view that is consumed later by the caller; confirm
 * that the per-sstable samples remain valid after release.
 *
 * @param range the token range to sample
 * @return the concatenation of each sstable's key samples for the range
 */
public Iterable<DecoratedKey> keySamples(Range<Token> range)
{
    try (RefViewFragment view = selectAndReference(CANONICAL_SSTABLES))
    {
        // A typed list instead of a raw Iterable[] avoids unchecked generic array creation.
        List<Iterable<DecoratedKey>> samples = new ArrayList<>(view.sstables.size());
        for (SSTableReader sstable : view.sstables)
        {
            samples.add(sstable.getKeySamples(range));
        }
        return Iterables.concat(samples);
    }
}
/**
 * Sums the per-sstable key estimates for a single token range over the canonical sstables.
 *
 * @param range the token range to estimate
 * @return the sum of {@code estimatedKeysForRanges} over every canonical sstable
 */
public long estimatedKeysForRange(Range<Token> range)
{
    try (RefViewFragment view = selectAndReference(CANONICAL_SSTABLES))
    {
        // The same one-element collection is used for every sstable, so build it once.
        Collection<Range<Token>> ranges = Collections.singleton(range);
        long count = 0;
        for (SSTableReader sstable : view.sstables)
            count += sstable.estimatedKeysForRanges(ranges);
        return count;
    }
}
/**
 * For testing. Equivalent to {@code clearUnsafe(true)}.
 */
@VisibleForTesting
public void clearUnsafe()
{
    final boolean enableCompaction = true;
    clearUnsafe(enableCompaction);
}
/**
 * For testing. No effort is made to clear historical or even the current memtables, nor for
 * thread safety. All we do is wipe the sstable containers clean, while leaving the actual
 * data files present on disk. (This allows tests to easily call loadNewSSTables on them.)
 *
 * @param enableCompaction whether to start each store's compaction strategy again after
 *                         it has been shut down
 */
@VisibleForTesting
public void clearUnsafe(final boolean enableCompaction)
{
    // Applied to this store and to every index-backing store.
    for (final ColumnFamilyStore cfs : concatWithIndexes())
    {
        cfs.runWithCompactionsDisabled(new Callable<Void>()
        {
            @Override
            public Void call()
            {
                cfs.data.reset(new Memtable(new AtomicReference<>(ReplayPosition.NONE), cfs));
                cfs.getCompactionStrategy().shutdown();
                if (enableCompaction)
                    cfs.getCompactionStrategy().startup();
                return null;
            }
        }, true);
    }
}
/**
 * Truncate deletes the entire column family's data with no expensive tombstone creation.
 * Whether a snapshot is taken first follows {@code DatabaseDescriptor.isAutoSnapshot()}.
 */
public void truncateBlocking()
{
    final boolean autoSnapshot = DatabaseDescriptor.isAutoSnapshot();
    truncateBlocking(autoSnapshot);
}
/**
 * Truncate deletes the column family's data with no expensive tombstone creation,
 * optionally snapshotting the data.
 *
 * NOTE(review): runWithCompactionsDisabled returns null without running the task when it
 * cannot cancel in-progress compactions; that result is not inspected here, so
 * "truncate complete" is traced in that case as well.
 *
 * @param takeSnapshot true if a snapshot should be taken before the sstables are discarded,
 *                     false otherwise
 */
public void truncateBlocking(final boolean takeSnapshot)
{
    // We have two goals here:
    // - truncate should delete everything written before truncate was invoked
    // - but not delete anything that isn't part of the snapshot we create.
    // We accomplish this by first flushing manually, then snapshotting, and
    // recording the timestamp IN BETWEEN those actions. Any sstables created
    // with this timestamp or greater time, will not be marked for delete.
    //
    // Bonus complication: since we store replay position in sstable metadata,
    // truncating those sstables means we will replay any CL segments from the
    // beginning if we restart before they [the CL segments] are discarded for
    // normal reasons post-truncate. To prevent this, we store truncation
    // position in the System keyspace.
    logger.trace("truncating {}", name);
    final long truncatedAt;
    final ReplayPosition replayAfter;
    if (keyspace.getMetadata().durableWrites || takeSnapshot)
    {
        replayAfter = forceBlockingFlush("Truncate");
    }
    else
    {
        // just nuke the memtable data w/o writing to disk first
        Future<ReplayPosition> replayAfterFuture;
        synchronized (data)
        {
            final Flush flush = new Flush(true);
            flushExecutor.execute(flush);
            replayAfterFuture = postFlushExecutor.submit(flush.postFlush);
        }
        replayAfter = FBUtilities.waitOnFuture(replayAfterFuture);
    }
    long now = System.currentTimeMillis();
    // make sure none of our sstables are somehow in the future (clock drift, perhaps)
    for (ColumnFamilyStore cfs : concatWithIndexes())
        for (SSTableReader sstable : cfs.data.getSSTables())
            now = Math.max(now, sstable.maxDataAge);
    truncatedAt = now;
    Runnable truncateRunnable = new Runnable()
    {
        public void run()
        {
            logger.debug("Discarding sstable data for truncated CF + indexes");
            data.notifyTruncated(truncatedAt);
            if (takeSnapshot)
                snapshot(Keyspace.getTimestampedSnapshotName(name));
            discardSSTables(truncatedAt);
            for (SecondaryIndex index : indexManager.getIndexes())
                index.truncateBlocking(truncatedAt);
            SystemKeyspace.saveTruncationRecord(ColumnFamilyStore.this, truncatedAt, replayAfter);
            logger.trace("cleaning out row cache");
            invalidateCaches();
        }
    };
    runWithCompactionsDisabled(Executors.callable(truncateRunnable), true);
    logger.trace("truncate complete");
}
/**
 * Runs the given task while compactions for this store and its index stores are paused
 * and in-progress compactions have been interrupted.
 *
 * @param callable the task to run once compactions have stopped
 * @param interruptValidation forwarded to {@code CompactionManager.interruptCompactionForCFs}
 * @param <V> the task's result type
 * @return the task's result, or null if in-progress compactions could not be cancelled
 *         (in which case the task is not run)
 * @throws RuntimeException wrapping any exception thrown by the task
 */
public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptValidation)
{
    // synchronize so that concurrent invocations don't re-enable compactions partway through unexpectedly,
    // and so we only run one major compaction at a time
    synchronized (this)
    {
        logger.trace("Cancelling in-progress compactions for {}", metadata.cfName);
        Iterable<ColumnFamilyStore> selfWithIndexes = concatWithIndexes();
        for (ColumnFamilyStore cfs : selfWithIndexes)
            cfs.getCompactionStrategy().pause();
        try
        {
            // interrupt in-progress compactions
            CompactionManager.instance.interruptCompactionForCFs(selfWithIndexes, interruptValidation);
            CompactionManager.instance.waitForCessation(selfWithIndexes);
            // doublecheck that we finished, instead of timing out
            for (ColumnFamilyStore cfs : selfWithIndexes)
            {
                if (!cfs.getTracker().getCompacting().isEmpty())
                {
                    logger.warn("Unable to cancel in-progress compactions for {}. Perhaps there is an unusually large row in progress somewhere, or the system is simply overloaded.", metadata.cfName);
                    return null;
                }
            }
            logger.trace("Compactions successfully cancelled");
            // run our task
            try
            {
                return callable.call();
            }
            catch (Exception e)
            {
                throw new RuntimeException(e);
            }
        }
        finally
        {
            // always resume, whether the task ran, failed, or was skipped
            for (ColumnFamilyStore cfs : selfWithIndexes)
                cfs.getCompactionStrategy().resume();
        }
    }
}
/**
 * With compactions disabled, marks all permitted, non-suspect sstables as compacting.
 *
 * @param operationType the operation type passed to {@code data.tryModify}
 * @return the transaction over the marked sstables, or null if runWithCompactionsDisabled
 *         could not cancel in-progress compactions
 */
public LifecycleTransaction markAllCompacting(final OperationType operationType)
{
    Callable<LifecycleTransaction> callable = new Callable<LifecycleTransaction>()
    {
        @Override
        public LifecycleTransaction call() throws Exception
        {
            assert data.getCompacting().isEmpty() : data.getCompacting();
            Iterable<SSTableReader> sstables = getPermittedToCompactSSTables();
            sstables = AbstractCompactionStrategy.filterSuspectSSTables(sstables);
            // copy into an immutable list before handing the set of sstables to tryModify
            sstables = ImmutableList.copyOf(sstables);
            LifecycleTransaction modifier = data.tryModify(sstables, operationType);
            assert modifier != null: "something marked things compacting while compactions are disabled";
            return modifier;
        }
    };
    return runWithCompactionsDisabled(callable, false);
}
/**
 * @return a description of the form {@code CFS(Keyspace='<keyspace>', ColumnFamily='<name>')}
 */
@Override
public String toString()
{
    StringBuilder description = new StringBuilder("CFS(");
    description.append("Keyspace='").append(keyspace.getName()).append('\'');
    description.append(", ColumnFamily='").append(name).append('\'');
    description.append(')');
    return description.toString();
}
/**
 * Disables the compaction strategy wrapper.
 */
public void disableAutoCompaction()
{
    // CompactionStrategy.pause is deliberately not used here: we don't want users flipping
    // that on and off during runWithCompactionsDisabled
    compactionStrategyWrapper.disable();
}
/**
 * Enables auto compaction without waiting for the submitted background tasks.
 */
public void enableAutoCompaction()
{
    final boolean waitForFutures = false;
    enableAutoCompaction(waitForFutures);
}
/**
 * Enables the compaction strategy wrapper and submits background compaction for this store.
 * The waiting variant is used for tests - to be able to check things after a minor compaction.
 *
 * @param waitForFutures if we should block until autocompaction is done
 */
@VisibleForTesting
public void enableAutoCompaction(boolean waitForFutures)
{
    this.compactionStrategyWrapper.enable();
    List<Future<?>> futures = CompactionManager.instance.submitBackground(this);
    if (waitForFutures)
        FBUtilities.waitOnFutures(futures);
}
/**
 * @return true when the compaction strategy wrapper reports that it is not enabled
 */
public boolean isAutoCompactionDisabled()
{
    final boolean enabled = compactionStrategyWrapper.isEnabled();
    return !enabled;
}
/*
JMX getters and setters for the Defaults.
- get/set minCompactionThreshold
- get/set maxCompactionThreshold
- get memsize
- get memops
- get/set memtime
*/
/**
 * @return the compaction strategy wrapper held by this store
 */
public AbstractCompactionStrategy getCompactionStrategy()
{
    return this.compactionStrategyWrapper;
}
/**
 * Validates and stores both compaction thresholds, then submits background compaction.
 *
 * @param minThreshold the new minimum compaction threshold
 * @param maxThreshold the new maximum compaction threshold
 */
public void setCompactionThresholds(int minThreshold, int maxThreshold)
{
    // reject inconsistent or zero thresholds before touching either stored value
    validateCompactionThresholds(minThreshold, maxThreshold);

    this.minCompactionThreshold.set(minThreshold);
    this.maxCompactionThreshold.set(maxThreshold);

    final CompactionManager manager = CompactionManager.instance;
    manager.submitBackground(this);
}
/**
 * @return the current value of the minimum compaction threshold
 */
public int getMinimumCompactionThreshold()
{
    final int current = minCompactionThreshold.value();
    return current;
}
/**
 * Validates the new minimum against the current maximum, then stores it.
 *
 * @param minCompactionThreshold the new minimum compaction threshold
 */
public void setMinimumCompactionThreshold(int minCompactionThreshold)
{
    final int currentMax = maxCompactionThreshold.value();
    validateCompactionThresholds(minCompactionThreshold, currentMax);
    this.minCompactionThreshold.set(minCompactionThreshold);
}
/**
 * @return the current value of the maximum compaction threshold
 */
public int getMaximumCompactionThreshold()
{
    final int current = maxCompactionThreshold.value();
    return current;
}
/**
 * Validates the new maximum against the current minimum, then stores it.
 *
 * @param maxCompactionThreshold the new maximum compaction threshold
 */
public void setMaximumCompactionThreshold(int maxCompactionThreshold)
{
    final int currentMin = minCompactionThreshold.value();
    validateCompactionThresholds(currentMin, maxCompactionThreshold);
    this.maxCompactionThreshold.set(maxCompactionThreshold);
}
/**
 * Rejects threshold pairs where the minimum exceeds the maximum or where either is zero.
 *
 * @param minThreshold the candidate minimum compaction threshold
 * @param maxThreshold the candidate maximum compaction threshold
 * @throws RuntimeException if the pair is rejected
 */
private void validateCompactionThresholds(int minThreshold, int maxThreshold)
{
    if (minThreshold > maxThreshold)
    {
        String message = String.format("The min_compaction_threshold cannot be larger than the max_compaction_threshold. " +
                                       "Min is '%d', Max is '%d'.", minThreshold, maxThreshold);
        throw new RuntimeException(message);
    }

    boolean eitherIsZero = maxThreshold == 0 || minThreshold == 0;
    if (eitherIsZero)
    {
        throw new RuntimeException("Disabling compaction by setting min_compaction_threshold or max_compaction_threshold to 0 " +
                                   "is deprecated, set the compaction strategy option 'enabled' to 'false' instead or use the nodetool command 'disableautocompaction'.");
    }
}
// End JMX get/set.
/**
 * @return the mean of the estimated column counts across all sstables, weighted by each
 *         sstable's sample count, or 0 when there are no samples
 */
public int getMeanColumns()
{
    long weightedTotal = 0;
    long samples = 0;
    for (SSTableReader reader : getSSTables())
    {
        long readerSamples = reader.getEstimatedColumnCount().count();
        weightedTotal += reader.getEstimatedColumnCount().mean() * readerSamples;
        samples += readerSamples;
    }
    if (samples > 0)
        return (int) (weightedTotal / samples);
    return 0;
}
/**
 * @return the sum of {@code estimatedKeys()} over all sstables of this store
 */
public long estimateKeys()
{
    long total = 0;
    for (SSTableReader reader : getSSTables())
    {
        total += reader.estimatedKeys();
    }
    return total;
}
/**
 * @return true if this CFS contains secondary index data, which is detected by the
 *         partitioner being a LocalPartitioner
 */
public boolean isIndex()
{
    final boolean usesLocalPartitioner = partitioner instanceof LocalPartitioner;
    return usesLocalPartitioner;
}
/**
 * @return this store followed by the stores backing its secondary indexes
 */
public Iterable<ColumnFamilyStore> concatWithIndexes()
{
    // we return the main CFS first, which we rely on for simplicity in switchMemtable(), for getting the
    // latest replay position
    return Iterables.concat(Collections.singleton(this), indexManager.getIndexesBackedByCfs());
}
/**
 * @return the built indexes as reported by the index manager
 */
public List<String> getBuiltIndexes()
{
    return indexManager.getBuiltIndexes();
}
/**
 * @return the unleveled sstable count reported by the compaction strategy wrapper
 */
public int getUnleveledSSTables()
{
    final int unleveled = compactionStrategyWrapper.getUnleveledSSTables();
    return unleveled;
}
/**
 * @return the per-level sstable counts reported by the compaction strategy wrapper
 */
public int[] getSSTableCountPerLevel()
{
    final int[] countsPerLevel = this.compactionStrategyWrapper.getSSTableCountPerLevel();
    return countsPerLevel;
}
public static class ViewFragment
{
public final List sstables;
public final Iterable memtables;
public ViewFragment(List sstables, Iterable memtables)
{
this.sstables = sstables;
this.memtables = memtables;
}
}
/**
 * A ViewFragment that also carries the references held on its sstables, and releases
 * them when closed, so it can be used in try-with-resources.
 */
public static class RefViewFragment extends ViewFragment implements AutoCloseable
{
    public final Refs<SSTableReader> refs;

    /**
     * @param sstables the sstables of this fragment
     * @param memtables the memtables of this fragment
     * @param refs the references to release on {@link #release()} / {@link #close()}
     */
    public RefViewFragment(List<SSTableReader> sstables, Iterable<Memtable> memtables, Refs<SSTableReader> refs)
    {
        super(sstables, memtables);
        this.refs = refs;
    }

    /** Releases the held references. */
    public void release()
    {
        refs.release();
    }

    /** Same as {@link #release()}; exists to satisfy AutoCloseable. */
    @Override
    public void close()
    {
        release();
    }
}
/**
 * @return true when the current view has no sstables, the current memtable has recorded no
 *         operations, there is at most one live memtable and no memtable is flushing
 */
public boolean isEmpty()
{
    View view = data.getView();
    if (!view.sstables.isEmpty())
        return false;
    if (view.getCurrentMemtable().getOperations() != 0)
        return false;
    if (view.liveMemtables.size() > 1)
        return false;
    return view.flushingMemtables.size() == 0;
}
/**
 * @return true when row caching is enabled in the table's caching options and the row cache
 *         has a positive capacity
 */
public boolean isRowCacheEnabled()
{
    if (!metadata.getCaching().rowCache.isEnabled())
        return false;
    return CacheService.instance.rowCache.getCapacity() > 0;
}
/**
 * @return true when this is a counter table and the counter cache has a positive capacity
 */
public boolean isCounterCacheEnabled()
{
    if (!metadata.isCounter())
        return false;
    return CacheService.instance.counterCache.getCapacity() > 0;
}
/**
 * @return true when key caching is enabled in the table's caching options and the key cache
 *         has a positive capacity
 */
public boolean isKeyCacheEnabled()
{
    if (!metadata.getCaching().keyCache.isEnabled())
        return false;
    return CacheService.instance.keyCache.getCapacity() > 0;
}
/**
 * Discard all SSTables that were created before given timestamp.
 *
 * Caller should first ensure that compactions have quiesced.
 *
 * @param truncatedAt The timestamp of the truncation
 *                    (all SSTables that are not new since that timestamp are going to be
 *                    marked obsolete)
 */
public void discardSSTables(long truncatedAt)
{
    assert data.getCompacting().isEmpty() : data.getCompacting();
    List<SSTableReader> truncatedSSTables = new ArrayList<>();
    for (SSTableReader sstable : getSSTables())
    {
        if (!sstable.newSince(truncatedAt))
            truncatedSSTables.add(sstable);
    }
    // nothing to mark when every sstable is newer than the truncation point
    if (!truncatedSSTables.isEmpty())
        markObsolete(truncatedSSTables, OperationType.UNKNOWN);
}
/**
 * @return the tombstone ratio computed with gc grace taken into account
 */
public double getDroppableTombstoneRatio()
{
    final boolean useGcGrace = true;
    return getDroppableTombstoneRatio(useGcGrace);
}
/**
 * @return the tombstone ratio computed without subtracting gc grace
 */
public double getTombstoneRatio()
{
    final boolean useGcGrace = false;
    return getDroppableTombstoneRatio(useGcGrace);
}
/**
 * @return {@code getTombstoneRatio()} minus {@code getDroppableTombstoneRatio()}
 */
public double getLiveTombstoneRatio()
{
    double allTombstones = getTombstoneRatio();
    double droppable = getDroppableTombstoneRatio();
    return allTombstones - droppable;
}
/**
 * Computes the ratio of tombstones to estimated columns over the current sstables.
 *
 * @param useGcGrace forwarded to getDroppableTombstoneCount
 * @return the tombstone count divided by the estimated column total, or 0 when the
 *         column total is not positive
 */
private double getDroppableTombstoneRatio(boolean useGcGrace)
{
    long allColumns = 0;
    // One sstable collection is used for both the column total and the tombstone count.
    Collection<SSTableReader> sstables = getSSTables();
    for (SSTableReader sstable : sstables)
    {
        allColumns += sstable.getEstimatedColumnCount().mean() * sstable.getEstimatedColumnCount().count();
    }
    double allDroppable = getDroppableTombstoneCount(sstables, useGcGrace);
    return allColumns > 0 ? allDroppable / allColumns : 0;
}
/**
 * @return the tombstone count computed with gc grace taken into account
 */
public double getDroppableTombstoneCount()
{
    final boolean useGcGrace = true;
    return getDroppableTombstoneCount(useGcGrace);
}
/**
 * @return the tombstone count computed without subtracting gc grace
 */
public double getTombstoneCount()
{
    final boolean useGcGrace = false;
    return getDroppableTombstoneCount(useGcGrace);
}
/**
 * @return {@code getTombstoneCount()} minus {@code getDroppableTombstoneCount()}
 */
public double getLiveTombstoneCount()
{
    double allTombstones = getTombstoneCount();
    double droppable = getDroppableTombstoneCount();
    return allTombstones - droppable;
}
/**
 * Counts tombstones over the store's current sstables.
 *
 * @param useGcGrace forwarded to the collection-taking overload
 */
private double getDroppableTombstoneCount(boolean useGcGrace)
{
    final double total = getDroppableTombstoneCount(getSSTables(), useGcGrace);
    return total;
}
/**
 * Sums {@code getDroppableTombstonesBefore(gcBefore)} over the given sstables.
 *
 * @param sstables the sstables to inspect
 * @param useGcGrace when true, gcBefore is the current time minus each sstable's
 *                   gc grace seconds; when false, gcBefore is the current time
 * @return the summed tombstone estimate
 */
private double getDroppableTombstoneCount(Collection<SSTableReader> sstables, boolean useGcGrace)
{
    double allDroppable = 0;
    // current wall-clock time in seconds
    int localTime = (int) (System.currentTimeMillis() / 1000);
    for (SSTableReader sstable : sstables)
    {
        int gcBefore = useGcGrace ? localTime - sstable.metadata.getGcGraceSeconds() : localTime;
        allDroppable += sstable.getDroppableTombstonesBefore(gcBefore);
    }
    return allDroppable;
}
/**
 * @return the value of {@code directories.trueSnapshotsSize()}
 */
public long trueSnapshotsSize()
{
    final long size = directories.trueSnapshotsSize();
    return size;
}
/**
 * Test hook: sets the file index generator back to 0.
 */
@VisibleForTesting
void resetFileIndexGenerator()
{
    final int initialIndex = 0;
    fileIndexGenerator.set(initialIndex);
}
// returns the "canonical" version of any current sstable, i.e. if an sstable is being replaced and is only partially
// visible to reads, this sstable will be returned as its original entirety, and its replacement will not be returned
// (even if it completely replaces it)
public static final Function> CANONICAL_SSTABLES = new Function>()
{
public List apply(View view)
{
List sstables = new ArrayList<>();
for (SSTableReader sstable : view.compacting)
if (sstable.openReason != SSTableReader.OpenReason.EARLY)
sstables.add(sstable);
for (SSTableReader sstable : view.sstables)
if (!view.compacting.contains(sstable) && sstable.openReason != SSTableReader.OpenReason.EARLY)
sstables.add(sstable);
return sstables;
}
};
public static final Function> UNREPAIRED_SSTABLES = new Function>()
{
public List apply(View view)
{
List sstables = new ArrayList<>();
for (SSTableReader sstable : CANONICAL_SSTABLES.apply(view))
{
if (!sstable.isRepaired())
sstables.add(sstable);
}
return sstables;
}
};
/**
 * Returns a ColumnFamilyStore by cfId if it exists, null otherwise.
 * Differently from others, this method does not throw exception if the table does not exist.
 *
 * @param cfId the table id to look up
 * @return the store, or null if the schema does not know the id or the keyspace cannot be opened
 */
public static ColumnFamilyStore getIfExists(UUID cfId)
{
    // left is the keyspace name (it is what gets passed to Keyspace.open)
    Pair<String, String> kscf = Schema.instance.getCF(cfId);
    if (kscf == null)
        return null;
    Keyspace keyspace = Keyspace.open(kscf.left);
    if (keyspace == null)
        return null;
    return keyspace.getColumnFamilyStore(cfId);
}
/**
 * Returns a ColumnFamilyStore by ksname and cfname if it exists, null otherwise.
 * Differently from others, this method does not throw exception if the keyspace or table does not exist.
 *
 * @param ksName the keyspace name; null yields null
 * @param cfName the table name; null yields null
 * @return the store, or null if either name is null, the keyspace cannot be opened,
 *         or the schema has no id for the pair
 */
public static ColumnFamilyStore getIfExists(String ksName, String cfName)
{
    boolean bothNamesGiven = ksName != null && cfName != null;
    if (!bothNamesGiven)
        return null;

    Keyspace ks = Keyspace.open(ksName);
    if (ks == null)
        return null;

    UUID cfId = Schema.instance.getId(ksName, cfName);
    return cfId == null ? null : ks.getColumnFamilyStore(cfId);
}
/**
 * @return map of sstable file path (the relative filename of the DATA component) to repaired at value
 */
public Map<String, Long> getRepairedAtPerSstable()
{
    Collection<SSTableReader> ssTables = getSSTables();
    Map<String, Long> repairedAtPerSstable = new HashMap<>(ssTables.size());
    for (SSTableReader sstable : ssTables)
    {
        repairedAtPerSstable.put(sstable.descriptor.relativeFilenameFor(Component.DATA),
                                 sstable.getSSTableMetadata().repairedAt);
    }
    return repairedAtPerSstable;
}
}
| |