org.neo4j.kernel.impl.transaction.log.EnvelopeWriteChannel Maven / Gradle / Ivy
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [https://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
package org.neo4j.kernel.impl.transaction.log;
import static java.lang.Math.min;
import static java.util.Objects.requireNonNull;
import static org.neo4j.kernel.impl.transaction.log.entry.LogEnvelopeHeader.HEADER_SIZE;
import static org.neo4j.kernel.impl.transaction.log.entry.LogEnvelopeHeader.IGNORE_KERNEL_VERSION;
import static org.neo4j.kernel.impl.transaction.log.entry.LogEnvelopeHeader.MAX_ZERO_PADDING_SIZE;
import static org.neo4j.storageengine.api.LogVersionRepository.UNKNOWN_LOG_OFFSET;
import static org.neo4j.util.Preconditions.checkArgument;
import static org.neo4j.util.Preconditions.checkState;
import static org.neo4j.util.Preconditions.requireMultipleOf;
import static org.neo4j.util.Preconditions.requireNonNegative;
import static org.neo4j.util.Preconditions.requirePowerOfTwo;
import java.io.Flushable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ClosedChannelException;
import java.util.zip.Checksum;
import org.neo4j.io.fs.PhysicalLogChannel;
import org.neo4j.io.fs.StoreChannel;
import org.neo4j.io.memory.ScopedBuffer;
import org.neo4j.kernel.impl.transaction.log.entry.LogEnvelopeHeader;
import org.neo4j.kernel.impl.transaction.log.entry.LogEnvelopeHeader.EnvelopeType;
import org.neo4j.kernel.impl.transaction.log.rotation.LogRotation;
import org.neo4j.kernel.impl.transaction.tracing.DatabaseTracer;
import org.neo4j.util.VisibleForTesting;
/**
* A channel that will write data in segments.
* Data will be wrapped in "envelopes" as defined by {@link LogEnvelopeHeader}.
*
* The reason for doing this so to allow the data to be chunked, and safety span multiple files.
* It will also allow one to start reading from and arbitrary position in the files, which can be
* beneficial when searching for a specific transaction.
*
* This gets a bit complex, so lets sum up.
*
* - Each file will be divided into x numbers of "buffer windows".
* - Each "buffer window" will contain one or more segments.
* - Each segment contains one or more envelopes, with optional padding at the end.
* -
* One or more envelopes are used to represent "logical units", entries, that are
* separated by calls to {@link #endCurrentEntry()}. Entries are e.g. transactions.
*
* - The first segment of each file is reserved for the file header.
*
*
* | <--- file size ---> |
* | <--- buffer window ---> | <--- buffer window ---> |
* | <--- segment ---> | <--- segment ---> | <--- segment ---> | <--- segment ---> |
* | <- file header -> | [###][###][###]00 | [###############] | [####] |
* | "envelope type" FULL FULL FULL 00 BEGIN END |
* | "transactions" |tx1||tx2||tx3| | <--- tx 4 ---> | |
* ↑ ↑
* Padding Initial position
*
* The reason for keeping the {@code buffer window} aligned to the file boundaries is to better support direct IO.
*
* Since we write the envelope header as part of completing an envelope, calling {@link #prepareForFlush()} will
* only flush up until the last completed envelope.
*/
public class EnvelopeWriteChannel implements PhysicalLogChannel {
@VisibleForTesting
static final String ERROR_MSG_TEMPLATE_OFFSET_SIZE_TOO_SMALL =
"offset size must be at least envelope header size (%d).";
@VisibleForTesting
static final String ERROR_MSG_TEMPLATE_OFFSET_SIZE_TOO_LARGE =
"offset cannot be bigger than the segment size (%d) and must leave enough space for at least one "
+ "envelope after it.";
@VisibleForTesting
static final String ERROR_MSG_TEMPLATE_OFFSET_MUST_BE_FIRST_IN_THE_FIRST_SEGMENT =
"START_OFFSET envelopes can only be inserted at the start of the first segment";
@VisibleForTesting
static final String ERROR_MSG_TEMPLATE_OFFSET_MUST_NOT_BE_INSIDE_ANOTHER_ENVELOPE =
"START_OFFSET cannot be inserted while another envelope is still open. Close the current entry first.";
public static final long START_INDEX = 0;
private static final byte[] PADDING_ZEROES = new byte[MAX_ZERO_PADDING_SIZE];
private final Checksum checksum = CHECKSUM_FACTORY.get();
private final ScopedBuffer scopedBuffer;
private final LogRotation logRotation;
private final DatabaseTracer databaseTracer;
private final ByteBuffer buffer;
private final ByteBuffer checksumView;
private final int segmentBlockSize;
private StoreChannel channel;
private int currentEnvelopeStart;
private byte currentVersion = IGNORE_KERNEL_VERSION;
// The index of the current entry. See LogEnvelopeHeader.index.
private long currentIndex;
private int lastWrittenPosition;
private boolean begin = true;
private int nextSegmentOffset;
private int previousChecksum;
private long rotateAtSize;
private long appendedBytes;
private volatile boolean closed;
public EnvelopeWriteChannel(
StoreChannel channel,
ScopedBuffer scopedBuffer,
int segmentBlockSize,
int initialChecksum,
long currentIndex,
DatabaseTracer databaseTracer,
LogRotation logRotation)
throws IOException {
this.channel = requireNonNull(channel);
this.scopedBuffer = requireNonNull(scopedBuffer);
this.previousChecksum = initialChecksum;
requirePowerOfTwo(segmentBlockSize);
this.segmentBlockSize = segmentBlockSize;
this.logRotation = requireNonNull(logRotation);
this.databaseTracer = requireNonNull(databaseTracer);
this.buffer = scopedBuffer.getBuffer();
this.checksumView = buffer.duplicate().order(buffer.order());
this.currentIndex = currentIndex;
requireMultipleOf("Buffer", buffer.capacity(), "segment block size", segmentBlockSize);
initialPositions(channel.position());
}
public int currentChecksum() {
return previousChecksum;
}
public void endCurrentEntry() throws IOException {
checkState(currentPayloadLength() > 0, "Closing empty envelope is not allowed.");
completeEnvelope(true);
prepareNextEnvelope();
}
public void prepareNextEnvelope() throws IOException {
if ((buffer.position() + LogEnvelopeHeader.HEADER_SIZE) >= nextSegmentOffset) {
padSegmentAndGoToNext();
}
beginNewEnvelope();
}
@Override
public void resetAppendedBytesCounter() {
appendedBytes = 0;
}
@Override
public long getAppendedBytes() {
return appendedBytes;
}
/**
* @param channel a newly allocated channel that should already contain a header. The channel must
* be position at the end of the header.
*/
@Override
public void setChannel(StoreChannel channel) throws IOException {
checkArgument(
channel != this.channel,
"Must NOT update the channel to the same instance otherwise we're overwriting data!");
this.channel = channel;
checkState(channel.position() == segmentBlockSize, "must be positioned on first segment");
initialPositions(segmentBlockSize);
}
/**
* This value is only valid when called after a call to {@link #endCurrentEntry()}
*
* @return the position in the channel.
* @throws IOException when unable to determine the position in the underlying log channel
*/
@Override
public long position() throws IOException {
checkState(
buffer.position() == currentEnvelopeStart + LogEnvelopeHeader.HEADER_SIZE,
"position() must be called right after endCurrentEntry()");
long bufferViewStart = channel.position() - lastWrittenPosition;
return bufferViewStart + currentEnvelopeStart;
}
@Override
public void beginChecksumForWriting() {
// nothing
}
@Override
public Flushable prepareForFlush() throws IOException {
checkChannelClosed(null);
if (lastWrittenPosition == currentEnvelopeStart) {
return channel; // Nothing to flush
}
int oldPosition = buffer.position();
// Since we write the header last, we can only flush until the start of the current envelope
buffer.position(lastWrittenPosition).limit(currentEnvelopeStart);
try {
channel.writeAll(buffer);
} catch (ClosedChannelException e) {
handleClosedChannelException(e);
}
buffer.clear().position(oldPosition);
lastWrittenPosition = currentEnvelopeStart;
if (channel.position() >= rotateAtSize) {
rotateLogFile();
buffer.position(currentEnvelopeStart);
// NOTE! 'channel' will be updated by 'setChannel'.
// 'setChannel' will also update buffer and positions.
} else if (currentEnvelopeStart >= buffer.capacity()) {
// Buffer is exhausted, reset and start over
buffer.clear();
lastWrittenPosition = 0;
currentEnvelopeStart = 0;
nextSegmentOffset = segmentBlockSize;
}
return channel;
}
@Override
public EnvelopeWriteChannel put(byte value) throws IOException {
nextSegmentOnOverflow(Byte.BYTES);
buffer.put(value);
return updateBytesWritten(Byte.BYTES);
}
@Override
public EnvelopeWriteChannel putShort(short value) throws IOException {
nextSegmentOnOverflow(Short.BYTES);
buffer.putShort(value);
return updateBytesWritten(Short.BYTES);
}
@Override
public EnvelopeWriteChannel putInt(int value) throws IOException {
nextSegmentOnOverflow(Integer.BYTES);
buffer.putInt(value);
return updateBytesWritten(Integer.BYTES);
}
@Override
public EnvelopeWriteChannel putLong(long value) throws IOException {
nextSegmentOnOverflow(Long.BYTES);
buffer.putLong(value);
return updateBytesWritten(Long.BYTES);
}
@Override
public EnvelopeWriteChannel putFloat(float value) throws IOException {
nextSegmentOnOverflow(Float.BYTES);
buffer.putFloat(value);
return updateBytesWritten(Float.BYTES);
}
@Override
public EnvelopeWriteChannel putDouble(double value) throws IOException {
nextSegmentOnOverflow(Double.BYTES);
buffer.putDouble(value);
return updateBytesWritten(Double.BYTES);
}
@Override
public EnvelopeWriteChannel put(byte[] value, int length) throws IOException {
return put(value, 0, length);
}
@Override
public EnvelopeWriteChannel put(byte[] src, int offset, int length) throws IOException {
int srcIndex = offset;
while (srcIndex < length) {
int remainingPayloadSpace = nextSegmentOffset - buffer.position();
int payloadChunk = min(length - srcIndex, remainingPayloadSpace);
buffer.put(src, srcIndex, payloadChunk);
srcIndex += payloadChunk;
if (srcIndex != length) {
// Still have data left to put
completeEnvelopeAndGoToNextSegment();
}
}
appendedBytes += length;
return this;
}
/**
* Writes all remaining data in the {@link ByteBuffer} to this channel.
* The channel will handle chunking the data into envelopes if needed.
*
* @param src buffer with data to write to this channel.
* @return this channel, for fluent usage.
* @throws IOException if I/O error occurs.
*/
@Override
public EnvelopeWriteChannel putAll(ByteBuffer src) throws IOException {
int length = src.remaining();
int srcIndex = src.position();
int srcLimit = src.limit();
while (srcIndex < srcLimit) {
int remainingPayloadSpace = nextSegmentOffset - buffer.position();
int payloadChunk = min(srcLimit - srcIndex, remainingPayloadSpace);
buffer.put(buffer.position(), src, srcIndex, payloadChunk);
buffer.position(buffer.position() + payloadChunk);
srcIndex += payloadChunk;
if (srcIndex != srcLimit) {
// Still have data left to put
completeEnvelopeAndGoToNextSegment();
}
}
appendedBytes += length;
return this;
}
/**
* Writes all remaining data in the {@link ByteBuffer} to this channel overriding the channel's envelope chunking.
* The data in the buffer must be already finished envelopes from another source, and it must be put on the same
* offset within a segment for the envelope boundaries to become correct.
* The method takes care of inserting a start offset envelope if necessary.
*
* @param src buffer with data to write to this channel.
* @param offset offset of the data on the origin. Will be used to figure out the offset to start on within the
* segment. -1 if the data should be put directly after previously written data.
* @return this channel, for fluent usage.
* @throws IOException if I/O error occurs.
*/
@Override
public PhysicalLogChannel directPutAll(ByteBuffer src, long offset) throws IOException {
// Some magic to get around the buffer already being positioned one header after any previous envelope.
if (offset != UNKNOWN_LOG_OFFSET) {
int offsetIntoSegment =
(int) (offset & (segmentBlockSize - 1)); // segmentBlockSize is guaranteed power of 2
// Should write a start offset envelope if there is not already data up to the offset
if (offsetIntoSegment != 0 && ((buffer.position() - HEADER_SIZE) % segmentBlockSize != offsetIntoSegment)) {
insertStartOffset(offsetIntoSegment);
}
buffer.position(nextSegmentOffset - segmentBlockSize + offsetIntoSegment);
} else {
// The position is always one header in, but direct writes should be directly after
// previous data if no offset specified.
buffer.position(buffer.position() - HEADER_SIZE);
}
int length = src.remaining();
int srcIndex = src.position();
int srcEnd = srcIndex + length;
while (srcIndex < srcEnd) {
int remainingPayloadSpace = nextSegmentOffset - buffer.position();
int payloadChunk = min(srcEnd - srcIndex, remainingPayloadSpace);
buffer.put(buffer.position(), src, srcIndex, payloadChunk);
buffer.position(buffer.position() + payloadChunk);
srcIndex += payloadChunk;
if (srcIndex != srcEnd) {
// Still have data left to put. Make sure we flush if buffer is full
padSegmentAndGoToNext();
}
}
appendedBytes += length;
// Update envelope start so that everything we have written will be flushed on next flush call
currentEnvelopeStart = buffer.position();
// TODO MERGELOG - preparing for a new envelope so getPosition will be happy in truncate in
// RemoteStore/TxLogCatchupSession.
// Some chance that it will push us to the next segment even though it shouldn't.
// Switch to something better here later
prepareNextEnvelope();
return this;
}
@Override
public EnvelopeWriteChannel putVersion(byte version) {
currentVersion = version;
return this;
}
@Override
public int putChecksum() throws IOException {
endCurrentEntry();
return previousChecksum;
}
@Override
public boolean isOpen() {
return !closed;
}
public void truncateToPosition(long position, int previousChecksum, long currentIndex) throws IOException {
requireNonNegative(position);
checkArgument(position <= channel.position(), "Can only truncate written data.");
checkArgument(position >= segmentBlockSize, "Truncating the first segment is not possible");
this.previousChecksum = previousChecksum;
channel.truncate(position);
rotateLogFile();
this.currentIndex = currentIndex;
}
@Override
public void close() throws IOException {
if (!closed) {
prepareForFlush().flush();
this.closed = true;
this.channel.close();
this.scopedBuffer.close();
}
}
/**
* @param initialPosition initial position where we should start appending.
*/
private void initialPositions(long initialPosition) {
int bufferWindowOffset = (int) (initialPosition % buffer.capacity());
currentEnvelopeStart = bufferWindowOffset;
lastWrittenPosition = bufferWindowOffset;
nextSegmentOffset = (bufferWindowOffset / segmentBlockSize + 1) * segmentBlockSize; // Round up to next
rotateAtSize = logRotation.rotationSize();
requireMultipleOf("Rotation size", rotateAtSize, "segment size", segmentBlockSize);
if (rotateAtSize == 0) {
rotateAtSize = Long.MAX_VALUE; // Rotation disabled
}
buffer.clear().position(bufferWindowOffset + LogEnvelopeHeader.HEADER_SIZE);
}
private void completeEnvelopeAndGoToNextSegment() throws IOException {
completeEnvelope(false);
padSegmentAndGoToNext();
beginNewEnvelope();
}
/**
* @param end if this is the last entry
*/
private void completeEnvelope(boolean end) {
EnvelopeType type = completedEnvelopeType(begin, end);
final int payLoadLength = currentPayloadLength();
if (payLoadLength == 0) {
checkState(
(nextSegmentOffset - currentEnvelopeStart) <= MAX_ZERO_PADDING_SIZE,
"Empty envelopes can only be discarded at the end of the segment.");
// Nothing to complete. This will be the case when we try to start a new entry at the end of the segment.
// Reset back position to last start and let the padding zero out the rest.
buffer.position(currentEnvelopeStart);
return;
}
writeHeader(type, payLoadLength);
begin = end;
if (end) {
currentIndex++;
}
}
private void writeHeader(EnvelopeType type, int payloadLength) {
final int payloadEndOffset = buffer.position();
// Fill in the header
final int checksumStartOffset = currentEnvelopeStart + Integer.BYTES;
buffer.position(checksumStartOffset);
assert currentVersion != -1;
buffer.put(type.typeValue)
.putInt(payloadLength)
// START_OFFSET envelopes do not have an index, as they are skipped automatically when reading
.putLong(type != EnvelopeType.START_OFFSET ? currentIndex : 0)
.put(currentVersion)
// START_OFFSET envelopes do not participate in the checksum chain
.putInt(type != EnvelopeType.START_OFFSET ? previousChecksum : 0);
// Calculate the checksum and insert
checksum.reset();
checksum.update(checksumView.clear().limit(payloadEndOffset).position(checksumStartOffset));
final int thisEnvelopeChecksum = (int) checksum.getValue();
buffer.putInt(currentEnvelopeStart, thisEnvelopeChecksum);
if (type != EnvelopeType.START_OFFSET) {
previousChecksum = thisEnvelopeChecksum;
}
// Now we're ready to position the buffer to start writing the next envelope.
buffer.position(payloadEndOffset);
currentEnvelopeStart = payloadEndOffset;
}
private static EnvelopeType completedEnvelopeType(boolean begin, boolean end) {
if (begin && end) {
return EnvelopeType.FULL;
} else if (begin) {
return EnvelopeType.BEGIN;
} else if (end) {
return EnvelopeType.END;
} else {
return EnvelopeType.MIDDLE;
}
}
private EnvelopeWriteChannel updateBytesWritten(int count) {
appendedBytes += count;
return this;
}
private void nextSegmentOnOverflow(int spaceInBytes) throws IOException {
if ((buffer.position() + spaceInBytes) > nextSegmentOffset) {
// Add data would overflow the segment, write out the current envelop and continue in next segment
completeEnvelopeAndGoToNextSegment();
}
}
private void beginNewEnvelope() {
currentEnvelopeStart = buffer.position();
buffer.position(currentEnvelopeStart + LogEnvelopeHeader.HEADER_SIZE);
}
private void padSegmentAndGoToNext() throws IOException {
int position = buffer.position();
if (position < nextSegmentOffset) {
buffer.put(PADDING_ZEROES, 0, nextSegmentOffset - position);
}
assert buffer.position() == nextSegmentOffset;
currentEnvelopeStart = nextSegmentOffset;
if (currentEnvelopeStart == buffer.capacity()
|| (channel.position() + currentEnvelopeStart - lastWrittenPosition) == rotateAtSize) {
prepareForFlush();
} else {
nextSegmentOffset += segmentBlockSize;
}
}
private void rotateLogFile() throws IOException {
try (var logAppendEvent = databaseTracer.logAppend()) {
logRotation.rotateLogFile(logAppendEvent);
// and notify the event tracer
logAppendEvent.setLogRotated(true);
}
}
private void handleClosedChannelException(ClosedChannelException e) throws ClosedChannelException {
// We don't want to check the closed flag every time we empty, instead we can avoid unnecessary the
// volatile read and catch ClosedChannelException where we see if the channel being closed was
// deliberate or not. If it was deliberately closed then throw IllegalStateException instead so
// that callers won't treat this as a kernel panic.
checkChannelClosed(e);
// OK, this channel was closed without us really knowing about it, throw exception as is.
throw e;
}
private void checkChannelClosed(ClosedChannelException e) throws IllegalStateException {
if (closed) {
throw new IllegalStateException("This log channel has been closed", e);
}
}
@Override
public int write(ByteBuffer src) throws IOException {
int remaining = src.remaining();
putAll(src);
return remaining;
}
private int currentPayloadLength() {
return buffer.position() - (currentEnvelopeStart + LogEnvelopeHeader.HEADER_SIZE);
}
/**
* Writes a START_OFFSET envelope to the current segment, which will shift all
* following envelopes by `size` bytes. This can be used to align the following
* envelopes if necessary while replicating envelopes from other machines.
*
* This method can only be called to insert an offset envelope at the beginning
* of a segment and cannot be called while writing of another envelope (so, make
* sure to close the current envelope with {@link #endCurrentEntry()}/{@link #putChecksum()}
* before calling this method.
*
* @param size must be at least the length of one envelope header (see {@link LogEnvelopeHeader#HEADER_SIZE})
* and must leave enough space for another envelope to be added after it in the current segment.
*/
public void insertStartOffset(int size) throws IOException {
checkArgument(size >= HEADER_SIZE, ERROR_MSG_TEMPLATE_OFFSET_SIZE_TOO_SMALL, HEADER_SIZE);
checkArgument(
size < segmentBlockSize - HEADER_SIZE, ERROR_MSG_TEMPLATE_OFFSET_SIZE_TOO_LARGE, segmentBlockSize);
checkState(
currentEnvelopeStart == 0 && channel.position() == segmentBlockSize,
ERROR_MSG_TEMPLATE_OFFSET_MUST_BE_FIRST_IN_THE_FIRST_SEGMENT);
checkState(
(currentEnvelopeStart + HEADER_SIZE) == buffer.position(),
ERROR_MSG_TEMPLATE_OFFSET_MUST_NOT_BE_INSIDE_ANOTHER_ENVELOPE);
final int payloadLength = size - HEADER_SIZE;
put(new byte[payloadLength], payloadLength);
writeHeader(EnvelopeType.START_OFFSET, payloadLength);
prepareNextEnvelope();
}
}