/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.commitlog;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.StandardOpenOption;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.LockSupport;
import java.util.zip.CRC32;
import com.google.common.annotations.VisibleForTesting;
import org.apache.cassandra.io.util.File;
import org.apache.cassandra.io.util.FileWriter;
import org.cliffc.high_scale_lib.NonBlockingHashMap;
import com.codahale.metrics.Timer;
import org.apache.cassandra.config.*;
import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.db.commitlog.CommitLog.Configuration;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.io.FSWriteError;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.utils.NativeLibrary;
import org.apache.cassandra.utils.IntegerInterval;
import org.apache.cassandra.utils.concurrent.OpOrder;
import org.apache.cassandra.utils.concurrent.WaitQueue;
import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis;
import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
import static org.apache.cassandra.utils.concurrent.WaitQueue.newWaitQueue;
/*
* A single commit log file on disk. Manages creation of the file and writing mutations to disk,
* as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
* files are initially allocated to a fixed size and can grow to accommodate a larger value if necessary.
*/
public abstract class CommitLogSegment
{
private final static long idBase;
private CDCState cdcState = CDCState.PERMITTED;
public enum CDCState
{
PERMITTED,
FORBIDDEN,
CONTAINS
}
final Object cdcStateLock = new Object();
private final static AtomicInteger nextId = new AtomicInteger(1);
private static long replayLimitId;
static
{
long maxId = Long.MIN_VALUE;
for (File file : new File(DatabaseDescriptor.getCommitLogLocation()).tryList())
{
if (CommitLogDescriptor.isValid(file.name()))
maxId = Math.max(CommitLogDescriptor.fromFileName(file.name()).id, maxId);
}
replayLimitId = idBase = Math.max(currentTimeMillis(), maxId + 1);
}
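// Note (illustrative): segment ids are strictly increasing across restarts, since idBase starts above both
// the wall clock and any id already found on disk. Any segment with id < replayLimitId predates this process
// and is a candidate for replay; see shouldReplay(String).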
// The commit log entry overhead in bytes (int: length + int: head checksum + int: tail checksum)
public static final int ENTRY_OVERHEAD_SIZE = 4 + 4 + 4;
// The commit log (chained) sync marker/header size in bytes (int: length + int: checksum [segmentId, position])
static final int SYNC_MARKER_SIZE = 4 + 4;
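// Rough on-disk layout (illustrative sketch):
//   [descriptor header][marker -> p1][entries ...][marker @ p1 -> p2][entries ...] ... [final marker, zeroed]
// Each marker holds the file position of the next marker (effectively this section's length) plus a CRC
// computed over the segment id and the marker's own file position; see writeSyncMarker().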
// The OpOrder used to order appends wrt sync
private final OpOrder appendOrder = new OpOrder();
private final AtomicInteger allocatePosition = new AtomicInteger();
// Everything before this offset has been synced and written. The SYNC_MARKER_SIZE bytes after
// each sync are reserved, and point forwards to the next such offset. The final
// sync marker in a segment will be zeroed out, or point to a position too close to the EOF to fit a marker.
@VisibleForTesting
volatile int lastSyncedOffset;
/**
* Everything before this offset has its markers written into the {@link #buffer}, but has not necessarily
* been flushed to disk. This value should be greater than or equal to {@link #lastSyncedOffset}.
*/
private volatile int lastMarkerOffset;
// The end position of the buffer. Initially set to its capacity and updated to point to the last written position
// as the segment is being closed.
// No need to be volatile as writes are protected by appendOrder barrier.
private int endOfBuffer;
// a signal for writers to wait on to confirm the log message they provided has been written to disk
private final WaitQueue syncComplete = newWaitQueue();
// a map of Cf->dirty interval in this segment; if interval is not covered by the clean set, the log contains unflushed data
private final NonBlockingHashMap<TableId, IntegerInterval> tableDirty = new NonBlockingHashMap<>(1024);
// a map of Cf->clean intervals; separate map from above to permit marking Cfs clean whilst the log is still in use
private final ConcurrentHashMap<TableId, IntegerInterval.Set> tableClean = new ConcurrentHashMap<>();
public final long id;
final File logFile;
final FileChannel channel;
final int fd;
protected final AbstractCommitLogSegmentManager manager;
ByteBuffer buffer;
private volatile boolean headerWritten;
public final CommitLogDescriptor descriptor;
static CommitLogSegment createSegment(CommitLog commitLog, AbstractCommitLogSegmentManager manager)
{
Configuration config = commitLog.configuration;
CommitLogSegment segment = config.useEncryption() ? new EncryptedSegment(commitLog, manager)
: config.useCompression() ? new CompressedSegment(commitLog, manager)
: new MemoryMappedSegment(commitLog, manager);
segment.writeLogHeader();
return segment;
}
/**
* Checks if the segments use a buffer pool.
*
* @param commitLog the commit log
* @return {@code true} if the segments use a buffer pool, {@code false} otherwise.
*/
static boolean usesBufferPool(CommitLog commitLog)
{
Configuration config = commitLog.configuration;
return config.useEncryption() || config.useCompression();
}
static long getNextId()
{
return idBase + nextId.getAndIncrement();
}
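// e.g. (illustrative) with idBase == 1700000000000L the first segments created get ids 1700000000001,
// 1700000000002, ..., and the id is embedded in each segment's file name via CommitLogDescriptor.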
/**
* Constructs a new segment file.
*/
CommitLogSegment(CommitLog commitLog, AbstractCommitLogSegmentManager manager)
{
this.manager = manager;
id = getNextId();
descriptor = new CommitLogDescriptor(id,
commitLog.configuration.getCompressorClass(),
commitLog.configuration.getEncryptionContext());
logFile = new File(manager.storageDirectory, descriptor.fileName());
try
{
channel = FileChannel.open(logFile.toPath(), StandardOpenOption.WRITE, StandardOpenOption.READ, StandardOpenOption.CREATE);
fd = NativeLibrary.getfd(channel);
}
catch (IOException e)
{
throw new FSWriteError(e, logFile);
}
buffer = createBuffer(commitLog);
}
/**
* Deferred writing of the commit log header until subclasses have had a chance to initialize
*/
void writeLogHeader()
{
CommitLogDescriptor.writeHeader(buffer, descriptor, additionalHeaderParameters());
endOfBuffer = buffer.capacity();
lastSyncedOffset = lastMarkerOffset = buffer.position();
allocatePosition.set(lastSyncedOffset + SYNC_MARKER_SIZE);
headerWritten = true;
}
/**
* Provide any additional header data that should be stored in the {@link CommitLogDescriptor}.
*/
protected Map<String, String> additionalHeaderParameters()
{
return Collections.emptyMap();
}
abstract ByteBuffer createBuffer(CommitLog commitLog);
/**
* Allocate space in this buffer for the provided mutation, and return the allocated Allocation object.
* Returns null if there is not enough space in this segment, and a new segment is needed.
*/
@SuppressWarnings("resource") //we pass the op order around
Allocation allocate(Mutation mutation, int size)
{
final OpOrder.Group opGroup = appendOrder.start();
try
{
int position = allocate(size);
if (position < 0)
{
opGroup.close();
return null;
}
for (PartitionUpdate update : mutation.getPartitionUpdates())
coverInMap(tableDirty, update.metadata().id, position);
return new Allocation(this, opGroup, position, (ByteBuffer) buffer.duplicate().position(position).limit(position + size));
}
catch (Throwable t)
{
opGroup.close();
throw t;
}
}
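/*
* Minimal sketch of the allocation lifecycle (illustrative only; in practice CommitLog.add() drives this,
* and serializedSize/commitLogMetrics below are assumed placeholders):
*
*   Allocation alloc = segment.allocate(mutation, serializedSize);
*   if (alloc == null)
*   {
*       // segment full: the manager must advance to a fresh segment and retry
*   }
*   else
*   {
*       // ... serialize the mutation into alloc.getBuffer() ...
*       alloc.markWritten();                                   // exactly once, or sync() waits forever
*       alloc.awaitDiskSync(commitLogMetrics.waitingOnCommit); // optionally block until durable
*   }
*/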
static boolean shouldReplay(String name)
{
return CommitLogDescriptor.fromFileName(name).id < replayLimitId;
}
/**
* FOR TESTING PURPOSES.
*/
static void resetReplayLimit()
{
replayLimitId = getNextId();
}
// allocate bytes in the segment, or return -1 if not enough space
private int allocate(int size)
{
while (true)
{
int prev = allocatePosition.get();
int next = prev + size;
if (next >= endOfBuffer)
return -1;
if (allocatePosition.compareAndSet(prev, next))
{
assert buffer != null;
return prev;
}
LockSupport.parkNanos(1); // ConstantBackoffCAS Algorithm from https://arxiv.org/pdf/1305.5800.pdf
}
}
// ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
void discardUnusedTail()
{
// We guard this with the OpOrdering instead of synchronised due to potential dead-lock with ACLSM.advanceAllocatingFrom()
// Ensures endOfBuffer update is reflected in the buffer end position picked up by sync().
// This actually isn't strictly necessary, as currently all calls to discardUnusedTail are executed either by the thread
// running sync or within a mutation already protected by this OpOrdering, but to prevent future potential mistakes,
// we duplicate the protection here so that the contract between discardUnusedTail() and sync() is more explicit.
try (OpOrder.Group group = appendOrder.start())
{
while (true)
{
int prev = allocatePosition.get();
int next = endOfBuffer + 1;
if (prev >= next)
{
// Already stopped allocating, might also be closed.
assert buffer == null || prev == buffer.capacity() + 1;
return;
}
if (allocatePosition.compareAndSet(prev, next))
{
// Stopped allocating now. Can only succeed once, no further allocation or discardUnusedTail can succeed.
endOfBuffer = prev;
assert buffer != null && next == buffer.capacity() + 1;
return;
}
}
}
}
/**
* Wait for any appends or discardUnusedTail() operations started before this method was called
*/
void waitForModifications()
{
// issue a barrier and wait for it
appendOrder.awaitNewBarrier();
}
/**
* Update the chained markers in the commit log buffer and possibly force a disk flush for this segment file.
*
* @param flush true if the segment should flush to disk; else, false for just updating the chained markers.
*/
synchronized void sync(boolean flush)
{
if (!headerWritten)
throw new IllegalStateException("commit log header has not been written");
assert lastMarkerOffset >= lastSyncedOffset : String.format("commit log segment positions are incorrect: last marked = %d, last synced = %d",
lastMarkerOffset, lastSyncedOffset);
// check we have more work to do
final boolean needToMarkData = allocatePosition.get() > lastMarkerOffset + SYNC_MARKER_SIZE;
final boolean hasDataToFlush = lastSyncedOffset != lastMarkerOffset;
if (!(needToMarkData || hasDataToFlush))
return;
// Note: Even if the very first allocation of this sync section failed, we still want to enter this
// to ensure the segment is closed. As allocatePosition is set to 1 beyond the capacity of the buffer,
// this will always be entered when a mutation allocation has been attempted after the marker allocation
// succeeded in the previous sync.
assert buffer != null; // Only close once.
boolean close = false;
int startMarker = lastMarkerOffset;
int nextMarker, sectionEnd;
if (needToMarkData)
{
// Allocate a new sync marker; this is both necessary in itself, but also serves to demarcate
// the point at which we can safely consider records to have been completely written to.
nextMarker = allocate(SYNC_MARKER_SIZE);
if (nextMarker < 0)
{
// Ensure no more of this CLS is writeable, and mark ourselves for closing.
discardUnusedTail();
close = true;
// We use the buffer size as the synced position after a close instead of the end of the actual data
// to make sure we only close the buffer once.
// The endOfBuffer position may be incorrect at this point (to be written by another stalled thread).
nextMarker = buffer.capacity();
}
// Wait for mutations to complete as well as endOfBuffer to have been written.
waitForModifications();
sectionEnd = close ? endOfBuffer : nextMarker;
// Possibly perform compression or encryption and update the chained markers
write(startMarker, sectionEnd);
lastMarkerOffset = sectionEnd;
}
else
{
// note: we don't need to waitForModifications() as, once we get to this block, we are only doing the flush
// and any mutations have already been fully written into the segment (as we wait for it in the previous block).
nextMarker = lastMarkerOffset;
sectionEnd = nextMarker;
}
if (flush || close)
{
try (Timer.Context ignored = CommitLog.instance.metrics.waitingOnFlush.time())
{
flush(startMarker, sectionEnd);
}
if (cdcState == CDCState.CONTAINS)
writeCDCIndexFile(descriptor, sectionEnd, close);
lastSyncedOffset = lastMarkerOffset = nextMarker;
if (close)
internalClose();
syncComplete.signalAll();
}
}
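// In short: sync() first chains a new marker past all completed appends (waiting on the OpOrder barrier for
// in-flight writers), then, when flushing or closing, forces that section to disk, advances lastSyncedOffset
// and wakes any threads parked in waitForSync()/waitForFinalSync() via syncComplete.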
/**
* We persist the offset of the last data synced to disk so clients can parse only durable data if they choose. Data
* in shared / memory-mapped buffers reflects un-synced data so we need an external sentinel for clients to read to
* determine actual durable data persisted.
*/
public static void writeCDCIndexFile(CommitLogDescriptor desc, int offset, boolean complete)
{
try(FileWriter writer = new FileWriter(new File(DatabaseDescriptor.getCDCLogLocation(), desc.cdcIndexFileName())))
{
writer.write(String.valueOf(offset));
if (complete)
writer.write("\nCOMPLETED");
writer.flush();
}
catch (IOException e)
{
if (!CommitLog.instance.handleCommitError("Failed to sync CDC Index: " + desc.cdcIndexFileName(), e))
throw new RuntimeException(e);
}
}
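// Resulting index file contents (illustrative example): a single decimal offset, with "COMPLETED" appended
// on its own line once the segment is done, e.g.
//   33554432
//   COMPLETED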
/**
* Create a sync marker to delineate sections of the commit log, typically created on each sync of the file.
* The sync marker consists of a file pointer to where the next sync marker should be (effectively declaring the length
* of this section), as well as a CRC value.
*
* @param buffer buffer in which to write out the sync marker.
* @param offset Offset into the {@code buffer} at which to write the sync marker.
* @param filePos The current position in the target file where the sync marker will be written (most likely different from the buffer position).
* @param nextMarker The file position of where the next sync marker should be.
*/
protected static void writeSyncMarker(long id, ByteBuffer buffer, int offset, int filePos, int nextMarker)
{
if (filePos > nextMarker)
throw new IllegalArgumentException(String.format("commit log sync marker's current file position %d is greater than next file position %d", filePos, nextMarker));
CRC32 crc = new CRC32();
updateChecksumInt(crc, (int) (id & 0xFFFFFFFFL));
updateChecksumInt(crc, (int) (id >>> 32));
updateChecksumInt(crc, filePos);
buffer.putInt(offset, nextMarker);
buffer.putInt(offset + 4, (int) crc.getValue());
}
abstract void write(int lastSyncedOffset, int nextMarker);
abstract void flush(int startMarker, int nextMarker);
public boolean isStillAllocating()
{
return allocatePosition.get() < endOfBuffer;
}
/**
* Discards a segment file when the log no longer requires it. The file may be left on disk if the archive script
* requires it. (Potentially blocking operation)
*/
void discard(boolean deleteFile)
{
close();
if (deleteFile)
FileUtils.deleteWithConfirm(logFile);
manager.addSize(-onDiskSize());
}
/**
* @return the current CommitLogPosition for this log segment
*/
public CommitLogPosition getCurrentCommitLogPosition()
{
return new CommitLogPosition(id, allocatePosition.get());
}
/**
* @return the file path to this segment
*/
public String getPath()
{
return logFile.path();
}
/**
* @return the file name of this segment
*/
public String getName()
{
return logFile.name();
}
/**
* @return a File object representing the CDC directory and this file name for hard-linking
*/
public File getCDCFile()
{
return new File(DatabaseDescriptor.getCDCLogLocation(), logFile.name());
}
/**
* @return a File object representing the CDC Index file holding the offset and completion status of this segment
*/
public File getCDCIndexFile()
{
return new File(DatabaseDescriptor.getCDCLogLocation(), descriptor.cdcIndexFileName());
}
void waitForFinalSync()
{
while (true)
{
WaitQueue.Signal signal = syncComplete.register();
if (lastSyncedOffset < endOfBuffer)
{
signal.awaitUninterruptibly();
}
else
{
signal.cancel();
break;
}
}
}
void waitForSync(int position)
{
while (lastSyncedOffset < position)
{
WaitQueue.Signal signal = syncComplete.register();
if (lastSyncedOffset < position)
signal.awaitThrowUncheckedOnInterrupt();
else
signal.cancel();
}
}
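// Note: lastSyncedOffset only advances in sync(), which signals syncComplete after each flush; re-checking
// the offset between register() and await ensures a concurrent sync cannot be missed.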
/**
* Stop writing to this file, sync and close it. Does nothing if the file is already closed.
*/
synchronized void close()
{
discardUnusedTail();
sync(true);
assert buffer == null;
}
/**
* Close the segment file. Do not call from outside this class, use syncAndClose() instead.
*/
protected void internalClose()
{
try
{
channel.close();
buffer = null;
}
catch (IOException e)
{
throw new FSWriteError(e, getPath());
}
}
public static <K> void coverInMap(ConcurrentMap<K, IntegerInterval> map, K key, int value)
{
IntegerInterval i = map.get(key);
if (i == null)
{
i = map.putIfAbsent(key, new IntegerInterval(value, value));
if (i == null)
// success
return;
}
i.expandToCover(value);
}
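// e.g. (illustrative, assuming IntegerInterval.expandToCover widens both bounds): coverInMap(m, id, 100)
// followed by coverInMap(m, id, 40) leaves m.get(id) covering [40, 100].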
/**
* Marks the ColumnFamily specified by id as clean for this log segment. If the
* given context argument is contained in this file, it will only mark the CF as
* clean if no newer writes have taken place.
*
* @param tableId the table that is now clean
* @param startPosition the start of the range that is clean
* @param endPosition the end of the range that is clean
*/
public synchronized void markClean(TableId tableId, CommitLogPosition startPosition, CommitLogPosition endPosition)
{
if (startPosition.segmentId > id || endPosition.segmentId < id)
return;
if (!tableDirty.containsKey(tableId))
return;
int start = startPosition.segmentId == id ? startPosition.position : 0;
int end = endPosition.segmentId == id ? endPosition.position : Integer.MAX_VALUE;
tableClean.computeIfAbsent(tableId, k -> new IntegerInterval.Set()).add(start, end);
removeCleanFromDirty();
}
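// Illustrative example: for a segment with id 5, markClean(t, new CommitLogPosition(5, 100),
// new CommitLogPosition(7, 0)) records [100, Integer.MAX_VALUE) as clean for table t, i.e. everything
// in this segment from position 100 onwards.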
private void removeCleanFromDirty()
{
// if we're still allocating from this segment, don't touch anything since it can't be done thread-safely
if (isStillAllocating())
return;
Iterator<Map.Entry<TableId, IntegerInterval.Set>> iter = tableClean.entrySet().iterator();
while (iter.hasNext())
{
Map.Entry<TableId, IntegerInterval.Set> clean = iter.next();
TableId tableId = clean.getKey();
IntegerInterval.Set cleanSet = clean.getValue();
IntegerInterval dirtyInterval = tableDirty.get(tableId);
if (dirtyInterval != null && cleanSet.covers(dirtyInterval))
{
tableDirty.remove(tableId);
iter.remove();
}
}
}
/**
* @return a collection of dirty CFIDs for this segment file.
*/
public synchronized Collection<TableId> getDirtyTableIds()
{
if (tableClean.isEmpty() || tableDirty.isEmpty())
return tableDirty.keySet();
List<TableId> r = new ArrayList<>(tableDirty.size());
for (Map.Entry<TableId, IntegerInterval> dirty : tableDirty.entrySet())
{
TableId tableId = dirty.getKey();
IntegerInterval dirtyInterval = dirty.getValue();
IntegerInterval.Set cleanSet = tableClean.get(tableId);
if (cleanSet == null || !cleanSet.covers(dirtyInterval))
r.add(dirty.getKey());
}
return r;
}
/**
* @return true if this segment is unused and safe to recycle or delete
*/
public synchronized boolean isUnused()
{
// if room to allocate, we're still in use as the active allocatingFrom,
// so we don't want to race with updates to tableClean with removeCleanFromDirty
if (isStillAllocating())
return false;
removeCleanFromDirty();
return tableDirty.isEmpty();
}
/**
* Check to see if a certain CommitLogPosition is contained by this segment file.
*
* @param context the commit log segment position to be checked
* @return true if the commit log segment position is contained by this segment file.
*/
public boolean contains(CommitLogPosition context)
{
return context.segmentId == id;
}
// For debugging, not fast
public String dirtyString()
{
StringBuilder sb = new StringBuilder();
for (TableId tableId : getDirtyTableIds())
{
TableMetadata m = Schema.instance.getTableMetadata(tableId);
sb.append(m == null ? "<deleted>" : m.name).append(" (").append(tableId)
.append(", dirty: ").append(tableDirty.get(tableId))
.append(", clean: ").append(tableClean.get(tableId))
.append("), ");
}
return sb.toString();
}
abstract public long onDiskSize();
public long contentSize()
{
return lastSyncedOffset;
}
@Override
public String toString()
{
return "CommitLogSegment(" + getPath() + ')';
}
public static class CommitLogSegmentFileComparator implements Comparator<File>
{
public int compare(File f, File f2)
{
return Long.compare(CommitLogDescriptor.idFromFileName(f.name()),
CommitLogDescriptor.idFromFileName(f2.name()));
}
}
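// Usage sketch (illustrative): Arrays.sort(segmentFiles, new CommitLogSegmentFileComparator()) orders raw
// segment files by the id embedded in their names, i.e. in creation order for replay.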
public CDCState getCDCState()
{
return cdcState;
}
/**
* Change the current cdcState on this CommitLogSegment. There are some restrictions on state transitions and this
* method is idempotent.
*
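* Allowed transitions, as enforced below: PERMITTED may move to any state; FORBIDDEN may only move to
* PERMITTED; CONTAINS is terminal (re-setting CONTAINS is a no-op).
*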
* @return the old cdc state
*/
public CDCState setCDCState(CDCState newState)
{
if (newState == cdcState)
return cdcState;
// Also synchronized in CDCSizeTracker.processNewSegment and .processDiscardedSegment
synchronized(cdcStateLock)
{
// Need duplicate CONTAINS to be idempotent since 2 threads can race on this lock
if (cdcState == CDCState.CONTAINS && newState != CDCState.CONTAINS)
throw new IllegalArgumentException("Cannot transition from CONTAINS to any other state.");
if (cdcState == CDCState.FORBIDDEN && newState != CDCState.PERMITTED)
throw new IllegalArgumentException("Only transition from FORBIDDEN to PERMITTED is allowed.");
CDCState oldState = cdcState;
cdcState = newState;
return oldState;
}
}
/**
* A simple class for tracking information about the portion of a segment that has been allocated to a log write.
*/
protected static class Allocation
{
private final CommitLogSegment segment;
private final OpOrder.Group appendOp;
private final int position;
private final ByteBuffer buffer;
Allocation(CommitLogSegment segment, OpOrder.Group appendOp, int position, ByteBuffer buffer)
{
this.segment = segment;
this.appendOp = appendOp;
this.position = position;
this.buffer = buffer;
}
CommitLogSegment getSegment()
{
return segment;
}
ByteBuffer getBuffer()
{
return buffer;
}
// markWritten() MUST be called once we are done with the segment or the CL will never flush
// but must not be called more than once
void markWritten()
{
appendOp.close();
}
void awaitDiskSync(Timer waitingOnCommit)
{
try (Timer.Context ignored = waitingOnCommit.time())
{
segment.waitForSync(position);
}
}
/**
* Returns the position in the CommitLogSegment at the end of this allocation.
*/
public CommitLogPosition getCommitLogPosition()
{
return new CommitLogPosition(segment.id, buffer.limit());
}
}
}