org.apache.cassandra.cache.AutoSavingCache Maven / Gradle / Ivy
Show all versions of cassandra-all Show documentation
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.cache;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.NoSuchFileException;
import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import javax.annotation.concurrent.NotThreadSafe;
import org.cliffc.high_scale_lib.NonBlockingHashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.concurrent.ExecutorPlus;
import org.apache.cassandra.concurrent.ScheduledExecutors;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.compaction.CompactionInfo;
import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.db.compaction.OperationType;
import org.apache.cassandra.io.FSWriteError;
import org.apache.cassandra.io.util.ChecksummedRandomAccessReader;
import org.apache.cassandra.io.util.ChecksummedSequentialWriter;
import org.apache.cassandra.io.util.CorruptFileException;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.io.util.DataOutputStreamPlus;
import org.apache.cassandra.io.util.File;
import org.apache.cassandra.io.util.FileInputStreamPlus;
import org.apache.cassandra.io.util.FileOutputStreamPlus;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.io.util.SequentialWriterOption;
import org.apache.cassandra.io.util.WrappedDataOutputStreamPlus;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.CacheService;
import org.apache.cassandra.utils.JVMStabilityInspector;
import org.apache.cassandra.utils.Pair;
import org.apache.cassandra.utils.concurrent.Future;
import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory;
import static org.apache.cassandra.utils.Clock.Global.nanoTime;
import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID;
/**
 * A cache that can be written to disk and read back later.
 * Loading is done by {@code loadSaved()} / {@code loadSavedAsync()}; saving is done by the
 * {@code Writer} class via {@code saveCache()}, optionally on a schedule set up by
 * {@code scheduleSaving(int, int)}. The on-disk files are located through
 * {@code DatabaseDescriptor.getSerializedCachePath} using this cache's {@code cacheType}.
 *
 * NOTE(review): this listing has lost its braces, generic type parameters and some statements;
 * confirm details against the upstream file before relying on them.
 */
public class AutoSavingCache extends InstrumentingCache
/**
 * Factory for the streams used to read and write a saved cache.
 * Both methods receive the path of the cache data file and the path of its companion CRC file.
 */
public interface IStreamFactory
// Opens an input stream over the saved cache at dataPath, with crcPath as its checksum file.
DataInputStreamPlus getInputStream(File dataPath, File crcPath) throws IOException;
// Opens an output stream that writes cache data to dataPath, with crcPath as its checksum file.
DataOutputStreamPlus getOutputStream(File dataPath, File crcPath);
private static final Logger logger = LoggerFactory.getLogger(AutoSavingCache.class);
/**
 * Tracks cache flushes that are currently executing: only one may execute at a time.
 * NOTE(review): the element type is not visible in this listing (generic parameter lost).
 */
public static final Set flushInProgress = new NonBlockingHashSet();
// Handle of the currently scheduled periodic save, or null when none is scheduled.
// Cancelled and replaced by scheduleSaving.
protected volatile ScheduledFuture> saveTask;
// Identifies which cache this is; used to build file paths and to pick the save operation type.
protected final CacheService.CacheType cacheType;
// Serializer used to write cache entries to disk and read them back.
private final CacheSerializer cacheLoader;
/*
 * CASSANDRA-10155 required a format change to fix 2i indexes and caching.
 * 2.2 is already at version "c" and 3.0 is at "d".
 *
 * Since cache versions match exactly and there is no partial fallback, just add
 * a minor version letter.
 *
 * Sticking with "d" is fine for 3.0 since it has never been released or used by another version.
 *
 * "e" introduced with CASSANDRA-11206, omits IndexInfo from key-cache, stores offset into index-file
 *
 * "f" introduced with CASSANDRA-9425, changes "keyspace.table.index" in cache keys to TableMetadata.id+TableMetadata.indexName
 *
 * "g" introduced an explicit sstable format type ordinal number so that the entry can be skipped regardless of the actual implementation and used serializer
 */
// Version tag used to build the data, CRC and metadata file paths when saving and loading.
private static final String CURRENT_VERSION = "g";
// Default stream factory: reads through ChecksummedRandomAccessReader and writes through
// ChecksummedSequentialWriter. It is static and volatile, and can be replaced via setStreamFactory.
private static volatile IStreamFactory streamFactory = new IStreamFactory()
// Writer options shared by every output stream this factory creates.
// NOTE(review): the end of this builder chain is not visible in this listing.
private final SequentialWriterOption writerOption = SequentialWriterOption.newBuilder()
// The configured interval is in KiB; multiply by 1024 to pass bytes.
.trickleFsyncByteInterval(DatabaseDescriptor.getTrickleFsyncIntervalInKiB() * 1024)
// Opens the saved cache for reading.
public DataInputStreamPlus getInputStream(File dataPath, File crcPath) throws IOException
return ChecksummedRandomAccessReader.open(dataPath, crcPath);
// Opens a writer over the data file and its CRC file; the third argument is passed as null.
public DataOutputStreamPlus getOutputStream(File dataPath, File crcPath)
return new ChecksummedSequentialWriter(dataPath, crcPath, null, writerOption);
// Unused, but exposed for a reason. See CASSANDRA-8096.
/**
 * Replaces the stream factory used to open saved-cache streams.
 * The factory is held in a static field, so the change applies to every AutoSavingCache instance.
 *
 * @param streamFactory the factory to use from now on
 */
public static void setStreamFactory(IStreamFactory streamFactory)
AutoSavingCache.streamFactory = streamFactory;
/**
 * Creates an auto-saving cache.
 *
 * @param cache       passed to the superclass constructor together with {@code cacheType.toString()}
 * @param cacheType   identifies this cache; later used to locate its files on disk
 * @param cacheloader serializer used to save and load the cache entries
 */
public AutoSavingCache(ICache cache, CacheService.CacheType cacheType, CacheSerializer cacheloader)
super(cacheType.toString(), cache);
this.cacheType = cacheType;
this.cacheLoader = cacheloader;
/**
 * Returns the location of this cache's saved data file for the given format version,
 * resolved by DatabaseDescriptor.getSerializedCachePath with "db" as the last argument.
 *
 * @param version cache format version string
 */
public File getCacheDataPath(String version)
return DatabaseDescriptor.getSerializedCachePath(cacheType, version, "db");
/**
 * Returns the location of this cache's saved CRC file for the given format version,
 * resolved by DatabaseDescriptor.getSerializedCachePath with "crc" as the last argument.
 *
 * @param version cache format version string
 */
public File getCacheCrcPath(String version)
return DatabaseDescriptor.getSerializedCachePath(cacheType, version, "crc");
/**
 * Returns the location of this cache's saved metadata file for the given format version,
 * resolved by DatabaseDescriptor.getSerializedCachePath with "metadata" as the last argument.
 *
 * @param version cache format version string
 */
public File getCacheMetadataPath(String version)
return DatabaseDescriptor.getSerializedCachePath(cacheType, version, "metadata");
/**
 * Creates a new Writer for this cache.
 *
 * @param keysToSave number of keys the writer should save; the Writer constructor treats 0,
 *                   or any value at or above the current cache size, as "save all keys"
 */
public Writer getWriter(int keysToSave)
return new Writer(keysToSave);
/**
 * (Re)schedules periodic saving of this cache.
 * Any previously scheduled save task is cancelled first. A new task is scheduled only when
 * savePeriodInSeconds is positive.
 *
 * @param savePeriodInSeconds save period; a value of 0 or less leaves saving unscheduled
 * @param keysToSave          number of keys to save on each run
 *                            NOTE(review): the Runnable body that would use this value is not
 *                            visible in this listing
 */
public void scheduleSaving(int savePeriodInSeconds, final int keysToSave)
// Drop the previous schedule, if any, before deciding whether to create a new one.
if (saveTask != null)
saveTask.cancel(false); // Do not interrupt an in-progress save
saveTask = null;
if (savePeriodInSeconds > 0)
// NOTE(review): the body of run() is missing from this listing.
Runnable runnable = new Runnable()
public void run()
// NOTE(review): the remaining arguments of this call (delays and time unit) are missing from this listing.
saveTask = ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(runnable,
/**
 * Runs {@code loadSaved()} on a sequential executor named "loadSavedCache".
 * A listener on the returned future logs a completion message when the cache is non-empty.
 *
 * @return the future of the submitted load
 */
public Future loadSavedAsync()
final ExecutorPlus es = executorFactory().sequential("loadSavedCache");
// Captured before submission so the logged duration includes any queueing time.
final long start = nanoTime();
Future cacheLoad = es.submit(this::loadSaved);
cacheLoad.addListener(() -> {
if (size() > 0)
// NOTE(review): the remaining log arguments and the end of the listener are missing from this listing.
logger.info("Completed loading ({} ms; {} keys) {} cache",
TimeUnit.NANOSECONDS.toMillis(nanoTime() - start),
return cacheLoad;
/**
 * Loads the saved cache for CURRENT_VERSION from disk into this cache.
 * Nothing is read unless the data, CRC and metadata files all exist. Reading stops when the
 * configured cache load timeout has elapsed or the data stream is exhausted.
 *
 * @return the value of {@code count}
 *         NOTE(review): the statement that increments {@code count} is not visible in this listing
 */
public int loadSaved()
int count = 0;
long start = nanoTime();
// modern format, allows both key and value (so key cache load can be purely sequential)
File dataPath = getCacheDataPath(CURRENT_VERSION);
File crcPath = getCacheCrcPath(CURRENT_VERSION);
File metadataPath = getCacheMetadataPath(CURRENT_VERSION);
// All three files must be present; otherwise the load is skipped.
if (dataPath.exists() && crcPath.exists() && metadataPath.exists())
DataInputStreamPlus in = null;
logger.info("Reading saved cache: {}, {}, {}", dataPath, crcPath, metadataPath);
// The metadata stream is closed by try-with-resources; 'in' is opened separately.
// NOTE(review): no close of 'in' and no use of metadataIn is visible in this listing; confirm upstream.
try (FileInputStreamPlus metadataIn = metadataPath.newInputStream())
in = streamFactory.getInputStream(dataPath, crcPath);
//Check the schema has not changed since CFs are looked up by name which is ambiguous
// The data stream starts with the schema version stored as two longs (most, then least significant bits).
UUID schemaVersion = new UUID(in.readLong(), in.readLong());
if (!schemaVersion.equals(Schema.instance.getVersion()))
throw new RuntimeException("Cache schema version "
+ schemaVersion
+ " does not match current schema version "
+ Schema.instance.getVersion());
// Queue of deserialization futures that have not been applied to the cache yet.
ArrayDeque>> futures = new ArrayDeque<>();
// Deadline: load start plus the configured timeout, which is converted from seconds.
long loadByNanos = start + TimeUnit.SECONDS.toNanos(DatabaseDescriptor.getCacheLoadTimeout());
while (nanoTime() < loadByNanos && in.available() > 0)
Future> entryFuture = cacheLoader.deserialize(in);
// Key cache entry can return null, if the SSTable doesn't exist.
// NOTE(review): the statement guarded by this check, and the code that enqueues entryFuture, are missing from this listing.
if (entryFuture == null)
* Kind of unwise to accrue an unbounded number of pending futures
* So now there is this loop to keep a bounded number pending.
// Apply futures at the head of the queue that have already completed.
while (futures.peek() != null && futures.peek().isDone())
Future> future = futures.poll();
Pair entry = future.get();
// Only entries with a non-null value are inserted.
if (entry != null && entry.right != null)
put(entry.left, entry.right);
// NOTE(review): the statement guarded by this check is missing from this listing.
if (futures.size() > 1000)
} while(futures.size() > 1000);
// Input is finished (or timed out): wait for and apply every future still queued.
Future> future = null;
while ((future = futures.poll()) != null)
Pair entry = future.get();
if (entry != null && entry.right != null)
put(entry.left, entry.right);
// A checksum failure is logged as a warning.
catch (CorruptFileException e)
logger.warn(String.format("Non-fatal checksum error reading saved cache %s", dataPath.absolutePath()), e);
// Any other failure is logged at info level.
// NOTE(review): whether anything else happens in these handlers is not visible in this listing.
catch (Throwable t)
logger.info(String.format("Harmless error reading saved cache %s", dataPath.absolutePath()), t);
if (logger.isTraceEnabled())
logger.trace("completed reading ({} ms; {} keys) saved cache {}",
TimeUnit.NANOSECONDS.toMillis(nanoTime() - start), count, dataPath);
return count;
/**
 * Creates a Writer for the requested number of keys and hands it to
 * CompactionManager.instance.submitCacheWrite.
 *
 * @param keysToSave number of keys to save (see {@code getWriter})
 * @return the future returned by the compaction manager
 */
public Future> submitWrite(int keysToSave)
return CompactionManager.instance.submitCacheWrite(getWriter(keysToSave));
/**
 * Writes the keys of the enclosing cache to disk (see {@code saveCache()}) and reports progress
 * through {@code getCompactionInfo()}.
 */
public class Writer extends CompactionInfo.Holder
// Keys to write; chosen in the constructor (all keys or only the hottest ones).
private final Iterator keyIterator;
// Base compaction info from which progress snapshots are derived.
private final CompactionInfo info;
// Number of keys written so far.
// NOTE(review): the statement that increments it is not visible in this listing.
private long keysWritten;
// Expected number of keys to write; saveCache compares keysWritten against it.
private final long keysEstimate;
/**
 * Selects the keys to save and builds the compaction info for this write.
 *
 * @param keysToSave number of keys to save; 0, or any value at or above the current cache size,
 *                   selects every key, otherwise the hot-key iterator is used
 *
 * NOTE(review): the {@code else} keywords between the branches below are missing from this listing.
 */
protected Writer(int keysToSave)
int size = size();
// Save everything when no limit is given or the limit covers the whole cache.
if (keysToSave >= size || keysToSave == 0)
keyIterator = keyIterator();
keysEstimate = size;
// Otherwise save only the requested number of hot keys.
keyIterator = hotKeyIterator(keysToSave);
keysEstimate = keysToSave;
// Map the cache type to the matching save operation type.
OperationType type;
if (cacheType == CacheService.CacheType.KEY_CACHE)
type = OperationType.KEY_CACHE_SAVE;
else if (cacheType == CacheService.CacheType.ROW_CACHE)
type = OperationType.ROW_CACHE_SAVE;
else if (cacheType == CacheService.CacheType.COUNTER_CACHE)
type = OperationType.COUNTER_CACHE_SAVE;
// Fallback for any other cache type.
type = OperationType.UNKNOWN;
// NOTE(review): the remaining arguments of this call are missing from this listing.
info = CompactionInfo.withoutSSTables(TableMetadata.minimal(SchemaConstants.SYSTEM_KEYSPACE_NAME, cacheType.toString()),
/** Returns the {@code cacheType} field. */
public CacheService.CacheType cacheType()
return cacheType;
/**
 * Returns a progress snapshot derived from {@code info}.
 * The second argument is never smaller than the first: it is the larger of keysWritten and keysEstimate.
 */
public CompactionInfo getCompactionInfo()
// keyset can change in size, thus total can too
// TODO need to check for this one... was: info.forProgress(keysWritten, Math.max(keysWritten, keys.size()));
return info.forProgress(keysWritten, Math.max(keysWritten, keysEstimate));
/**
 * Writes the selected cache keys to temporary data/CRC/metadata files and then moves them over
 * the final files for CURRENT_VERSION.
 *
 * An IOException while writing is rethrown as FSWriteError; FileNotFoundException and
 * NoSuchFileException are rethrown as RuntimeException. A failed move is only logged.
 *
 * NOTE(review): several statements are missing from this listing: the call that deletes old
 * files, the early exit for an empty cache, the writes of the schema version and metadata, and
 * the keysWritten increment. Confirm against upstream.
 */
public void saveCache()
logger.trace("Deleting old {} files.", cacheType);
// Nothing to write when the iterator is empty.
if (!keyIterator.hasNext())
logger.trace("Skipping {} save, cache is empty.", cacheType);
long start = nanoTime();
// Each temp file is created from the name and parent of its final file (see getTempCacheFile).
File dataTmpFile = getTempCacheFile(getCacheDataPath(CURRENT_VERSION));
File crcTmpFile = getTempCacheFile(getCacheCrcPath(CURRENT_VERSION));
File metadataTmpFile = getTempCacheFile(getCacheMetadataPath(CURRENT_VERSION));
// Both writers are closed by try-with-resources before the files are moved.
try (WrappedDataOutputStreamPlus writer = new WrappedDataOutputStreamPlus(streamFactory.getOutputStream(dataTmpFile, crcTmpFile));
FileOutputStreamPlus metadataWriter = metadataTmpFile.newOutputStream(File.WriteMode.OVERWRITE))
//Need to be able to check schema version because CF names are ambiguous
UUID schemaVersion = Schema.instance.getVersion();
while (keyIterator.hasNext())
K key = keyIterator.next();
ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(key.tableId);
if (cfs == null)
continue; // the table or 2i has been dropped.
// For index keys, serialize against the index's backing table instead of the base table.
// NOTE(review): the getIndexByName result is dereferenced without a null check, and orElse(null)
// can leave cfs null before it is passed to serialize; confirm both are safe.
if (key.indexName != null)
cfs = cfs.indexManager.getIndexByName(key.indexName).getBackingTable().orElse(null);
cacheLoader.serialize(key, writer, cfs);
// NOTE(review): the statement guarded by this check is missing from this listing.
if (keysWritten >= keysEstimate)
catch (FileNotFoundException | NoSuchFileException e)
throw new RuntimeException(e);
catch (IOException e)
throw new FSWriteError(e, dataTmpFile);
File dataFile = getCacheDataPath(CURRENT_VERSION);
File crcFile = getCacheCrcPath(CURRENT_VERSION);
File metadataFile = getCacheMetadataPath(CURRENT_VERSION);
dataFile.tryDelete(); // ignore error if it didn't exist
// A failed move is reported at error level but does not abort the save.
if (!dataTmpFile.tryMove(dataFile))
logger.error("Unable to rename {} to {}", dataTmpFile, dataFile);
if (!crcTmpFile.tryMove(crcFile))
logger.error("Unable to rename {} to {}", crcTmpFile, crcFile);
if (!metadataTmpFile.tryMove(metadataFile))
logger.error("Unable to rename {} to {}", metadataTmpFile, metadataFile);
// Size is reported in whole MB: the length is divided by 2^20.
logger.info("Saved {} ({} items) in {} ms to {} : {} MB", cacheType, keysWritten, TimeUnit.NANOSECONDS.toMillis(nanoTime() - start), dataFile.toPath(), dataFile.length() / (1 << 20));
/**
 * Creates a temporary file for the given cache file via FileUtils.createTempFile, passing the
 * cache file's name, null, and the cache file's parent.
 * NOTE(review): presumably (prefix, suffix, directory) as in java.io.File.createTempFile; confirm.
 */
private File getTempCacheFile(File cacheFile)
return FileUtils.createTempFile(cacheFile.name(), null, cacheFile.parent());
/**
 * Deletes previously saved files of this cache type from the saved caches directory.
 * A regular file is deleted when its name ends with "&lt;cacheType&gt;-&lt;CURRENT_VERSION&gt;.db" or
 * with the cache type name. Deletion failures are logged as warnings.
 */
private void deleteOldCacheFiles()
File savedCachesDir = new File(DatabaseDescriptor.getSavedCachesLocation());
// Only checked when assertions are enabled.
assert savedCachesDir.exists() && savedCachesDir.isDirectory();
// tryList is checked for null below.
File[] files = savedCachesDir.tryList();
if (files != null)
String cacheNameFormat = String.format("%s-%s.db", cacheType.toString(), CURRENT_VERSION);
for (File file : files)
if (!file.isFile())
continue; // someone's been messing with our directory. naughty!
if (file.name().endsWith(cacheNameFormat)
|| file.name().endsWith(cacheType.toString()))
if (!file.tryDelete())
logger.warn("Failed to delete {}", file.absolutePath());
// NOTE(review): the 'else' that presumably pairs this warning with the null check above is missing from this listing.
logger.warn("Could not list files in {}", savedCachesDir);
/**
 * Always returns false.
 * NOTE(review): presumably overrides a method of CompactionInfo.Holder; the contract is not visible here.
 */
public boolean isGlobal()
return false;
/**
 * A base cache serializer that is used to serialize/deserialize a cache to/from disk.
 * <p>
 * It expects the following lifecycle:
 * <p>
 * Serializations:
 * <ol>
 * <li>{@link #serialize(CacheKey, DataOutputPlus, ColumnFamilyStore)} is called for each key in the cache.</li>
 * <li>{@link #serializeMetadata(DataOutputPlus)} is called to serialize any metadata.</li>
 * <li>{@link #cleanupAfterSerialize()} is called to clean up any resources allocated for serialization.</li>
 * </ol>
 * Deserializations:
 * <ol>
 * <li>{@link #deserializeMetadata(DataInputPlus)} is called to deserialize any metadata.</li>
 * <li>{@link #deserialize(DataInputPlus)} is called for each key in the cache.</li>
 * <li>{@link #cleanupAfterDeserialize()} is called to clean up any resources allocated for deserialization.</li>
 * </ol>
 * This abstract class provides the default implementation for the metadata serialization/deserialization.
 * The metadata includes a dictionary of column family stores collected during serialization whenever
 * {@link #writeCFS(DataOutputPlus, ColumnFamilyStore)} or {@link #getOrCreateCFSOrdinal(ColumnFamilyStore)}
 * are called. When such metadata is deserialized, the implementation of {@link #deserialize(DataInputPlus)} may
 * use the {@link #readCFS(DataInputPlus)} method to read the ColumnFamilyStore stored with
 * {@link #writeCFS(DataOutputPlus, ColumnFamilyStore)}.
 */
public static abstract class CacheSerializer
// Stores resolved by deserializeMetadata, indexed by the ordinal that readCFS reads.
// Reset to null by cleanupAfterDeserialize.
private ColumnFamilyStore[] cfStores;
// Ordinal assigned to each (table id, index name) pair seen during serialization.
// Each new pair gets the map's size at insertion time, so ordinals follow insertion order.
private final LinkedHashMap, Integer> cfsOrdinals = new LinkedHashMap<>();
/**
 * Returns the ordinal assigned to the given store, assigning the next free one on first use.
 * Stores are keyed by table id plus index name, with an empty string when there is no index name.
 *
 * @param cfs the store to look up or register
 * @return the ordinal of {@code cfs}
 */
protected final int getOrCreateCFSOrdinal(ColumnFamilyStore cfs)
// putIfAbsent returns the existing ordinal, or null when this call inserted a new entry.
Integer ordinal = cfsOrdinals.putIfAbsent(Pair.create(cfs.metadata().id, cfs.metadata().indexName().orElse("")), cfsOrdinals.size());
// Newly inserted: the stored value was the size before insertion, i.e. the size now minus one.
if (ordinal == null)
ordinal = cfsOrdinals.size() - 1;
return ordinal;
/**
 * Reads an unsigned vint ordinal from the input and returns the matching entry of {@code cfStores}.
 * The entry may be null when deserializeMetadata could not resolve that table.
 * NOTE(review): relies on cfStores having been populated by deserializeMetadata beforehand.
 *
 * @param in input positioned at an ordinal
 * @throws IOException if reading the ordinal fails
 */
protected ColumnFamilyStore readCFS(DataInputPlus in) throws IOException
return cfStores[in.readUnsignedVInt32()];
/**
 * Writes a reference to the given store to the output; the counterpart of {@code readCFS}.
 * NOTE(review): the method body is missing from this listing; presumably it writes the ordinal
 * obtained from getOrCreateCFSOrdinal as an unsigned vint. Confirm upstream.
 */
protected void writeCFS(DataOutputPlus out, ColumnFamilyStore cfs) throws IOException
/**
 * Serializes the dictionary of stores collected in {@code cfsOrdinals}, iterating its keys.
 * NOTE(review): the entry-count write and the loop body are missing from this listing.
 * deserializeMetadata reads an unsigned vint count, then a table id and a UTF index name per entry.
 */
public void serializeMetadata(DataOutputPlus out) throws IOException
// write the table ids
for (Pair tableAndIndex : cfsOrdinals.keySet())
/**
 * Reads the dictionary of stores and resolves each entry against the current schema,
 * filling {@code cfStores} so that {@code readCFS} can look stores up by ordinal.
 * An entry whose table cannot be resolved is stored as null.
 *
 * @param in input positioned at the start of the metadata
 * @throws IOException if reading fails
 */
public void deserializeMetadata(DataInputPlus in) throws IOException
// Number of dictionary entries that follow.
int tableEntries = in.readUnsignedVInt32();
// NOTE(review): the statement guarded by this check is missing from this listing (presumably an early return).
if (tableEntries == 0)
cfStores = new ColumnFamilyStore[tableEntries];
for (int i = 0; i < tableEntries; i++)
// Each entry is a table id followed by a UTF index name; an empty name means no index.
TableId tableId = TableId.deserialize(in);
String indexName = in.readUTF();
cfStores[i] = Schema.instance.getColumnFamilyStoreInstance(tableId);
// For index entries, replace the base table with the index's backing table (null if it has none).
// NOTE(review): the getIndexByName result is dereferenced without a null check; confirm it cannot be null.
if (cfStores[i] != null && !indexName.isEmpty())
cfStores[i] = cfStores[i].indexManager.getIndexByName(indexName).getBackingTable().orElse(null);
/**
 * Writes one cache key to the output.
 *
 * @param key the cache key to write
 * @param out destination stream
 * @param cfs the store resolved for the key by the caller
 * @throws IOException if writing fails
 */
public abstract void serialize(K key, DataOutputPlus out, ColumnFamilyStore cfs) throws IOException;
/**
 * Reads one cache entry from the input.
 *
 * @param in source stream
 * @return a future for the entry; loadSaved treats a null return as "nothing to load for this entry"
 * @throws IOException if reading fails
 */
public abstract Future> deserialize(DataInputPlus in) throws IOException;
/**
 * Hook for releasing resources after serialization.
 * NOTE(review): the method body is missing from this listing.
 */
public void cleanupAfterSerialize()
/** Drops the store table built by deserializeMetadata by setting {@code cfStores} to null. */
public void cleanupAfterDeserialize()
cfStores = null;