org.apache.kafka.clients.producer.internals.ProducerBatch Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jena-fmod-kafka Show documentation
Show all versions of jena-fmod-kafka Show documentation
Apache Jena Fuseki server Kafka connector
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kafka.clients.producer.internals;
import java.util.OptionalInt;
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.RecordBatchTooLargeException;
import org.apache.kafka.common.header.Header;
import org.apache.kafka.common.record.AbstractRecords;
import org.apache.kafka.common.record.CompressionRatioEstimator;
import org.apache.kafka.common.record.CompressionType;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.record.MemoryRecordsBuilder;
import org.apache.kafka.common.record.MutableRecordBatch;
import org.apache.kafka.common.record.Record;
import org.apache.kafka.common.record.RecordBatch;
import org.apache.kafka.common.record.TimestampType;
import org.apache.kafka.common.requests.ProduceResponse;
import org.apache.kafka.common.utils.ProducerIdAndEpoch;
import org.apache.kafka.common.utils.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import static org.apache.kafka.common.record.RecordBatch.MAGIC_VALUE_V2;
import static org.apache.kafka.common.record.RecordBatch.NO_TIMESTAMP;
/**
* A batch of records that is or will be sent.
*
* This class is not thread safe and external synchronization must be used when modifying it
*/
public final class ProducerBatch {
private static final Logger log = LoggerFactory.getLogger(ProducerBatch.class);
private enum FinalState { ABORTED, FAILED, SUCCEEDED }
final long createdMs;
final TopicPartition topicPartition;
final ProduceRequestResult produceFuture;
private final List thunks = new ArrayList<>();
private final MemoryRecordsBuilder recordsBuilder;
private final AtomicInteger attempts = new AtomicInteger(0);
private final boolean isSplitBatch;
private final AtomicReference finalState = new AtomicReference<>(null);
int recordCount;
int maxRecordSize;
private long lastAttemptMs;
private long lastAppendTime;
private long drainedMs;
private boolean retry;
private boolean reopened;
// Tracks the current-leader's epoch to which this batch would be sent, in the current to produce the batch.
private OptionalInt currentLeaderEpoch;
// Tracks the attempt in which leader was changed to currentLeaderEpoch for the 1st time.
private int attemptsWhenLeaderLastChanged;
public ProducerBatch(TopicPartition tp, MemoryRecordsBuilder recordsBuilder, long createdMs) {
this(tp, recordsBuilder, createdMs, false);
}
public ProducerBatch(TopicPartition tp, MemoryRecordsBuilder recordsBuilder, long createdMs, boolean isSplitBatch) {
this.createdMs = createdMs;
this.lastAttemptMs = createdMs;
this.recordsBuilder = recordsBuilder;
this.topicPartition = tp;
this.lastAppendTime = createdMs;
this.produceFuture = new ProduceRequestResult(topicPartition);
this.retry = false;
this.isSplitBatch = isSplitBatch;
float compressionRatioEstimation = CompressionRatioEstimator.estimation(topicPartition.topic(),
recordsBuilder.compression().type());
this.currentLeaderEpoch = OptionalInt.empty();
this.attemptsWhenLeaderLastChanged = 0;
recordsBuilder.setEstimatedCompressionRatio(compressionRatioEstimation);
}
/**
* It will update the leader to which this batch will be produced for the ongoing attempt, if a newer leader is known.
* @param latestLeaderEpoch latest leader's epoch.
*/
void maybeUpdateLeaderEpoch(OptionalInt latestLeaderEpoch) {
if (latestLeaderEpoch.isPresent()
&& (!currentLeaderEpoch.isPresent() || currentLeaderEpoch.getAsInt() < latestLeaderEpoch.getAsInt())) {
log.trace("For {}, leader will be updated, currentLeaderEpoch: {}, attemptsWhenLeaderLastChanged:{}, latestLeaderEpoch: {}, current attempt: {}",
this, currentLeaderEpoch, attemptsWhenLeaderLastChanged, latestLeaderEpoch, attempts);
attemptsWhenLeaderLastChanged = attempts();
currentLeaderEpoch = latestLeaderEpoch;
} else {
log.trace("For {}, leader wasn't updated, currentLeaderEpoch: {}, attemptsWhenLeaderLastChanged:{}, latestLeaderEpoch: {}, current attempt: {}",
this, currentLeaderEpoch, attemptsWhenLeaderLastChanged, latestLeaderEpoch, attempts);
}
}
/**
* It will return true, for a when batch is being retried, it will be retried to a newer leader.
*/
boolean hasLeaderChangedForTheOngoingRetry() {
int attempts = attempts();
boolean isRetry = attempts >= 1;
if (!isRetry)
return false;
return attempts == attemptsWhenLeaderLastChanged;
}
/**
* Append the record to the current record set and return the relative offset within that record set
*
* @return The RecordSend corresponding to this record or null if there isn't sufficient room.
*/
public FutureRecordMetadata tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers, Callback callback, long now) {
if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) {
return null;
} else {
this.recordsBuilder.append(timestamp, key, value, headers);
this.maxRecordSize = Math.max(this.maxRecordSize, AbstractRecords.estimateSizeInBytesUpperBound(magic(),
recordsBuilder.compression().type(), key, value, headers));
this.lastAppendTime = now;
FutureRecordMetadata future = new FutureRecordMetadata(this.produceFuture, this.recordCount,
timestamp,
key == null ? -1 : key.length,
value == null ? -1 : value.length,
Time.SYSTEM);
// we have to keep every future returned to the users in case the batch needs to be
// split to several new batches and resent.
thunks.add(new Thunk(callback, future));
this.recordCount++;
return future;
}
}
/**
* This method is only used by {@link #split(int)} when splitting a large batch to smaller ones.
* @return true if the record has been successfully appended, false otherwise.
*/
private boolean tryAppendForSplit(long timestamp, ByteBuffer key, ByteBuffer value, Header[] headers, Thunk thunk) {
if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) {
return false;
} else {
// No need to get the CRC.
this.recordsBuilder.append(timestamp, key, value, headers);
this.maxRecordSize = Math.max(this.maxRecordSize, AbstractRecords.estimateSizeInBytesUpperBound(magic(),
recordsBuilder.compression().type(), key, value, headers));
FutureRecordMetadata future = new FutureRecordMetadata(this.produceFuture, this.recordCount,
timestamp,
key == null ? -1 : key.remaining(),
value == null ? -1 : value.remaining(),
Time.SYSTEM);
// Chain the future to the original thunk.
thunk.future.chain(future);
this.thunks.add(thunk);
this.recordCount++;
return true;
}
}
/**
* Abort the batch and complete the future and callbacks.
*
* @param exception The exception to use to complete the future and awaiting callbacks.
*/
public void abort(RuntimeException exception) {
if (!finalState.compareAndSet(null, FinalState.ABORTED))
throw new IllegalStateException("Batch has already been completed in final state " + finalState.get());
log.trace("Aborting batch for partition {}", topicPartition, exception);
completeFutureAndFireCallbacks(ProduceResponse.INVALID_OFFSET, RecordBatch.NO_TIMESTAMP, index -> exception);
}
/**
* Check if the batch has been completed (either successfully or exceptionally).
* @return `true` if the batch has been completed, `false` otherwise.
*/
public boolean isDone() {
return finalState() != null;
}
/**
* Complete the batch successfully.
* @param baseOffset The base offset of the messages assigned by the server
* @param logAppendTime The log append time or -1 if CreateTime is being used
* @return true if the batch was completed as a result of this call, and false
* if it had been completed previously
*/
public boolean complete(long baseOffset, long logAppendTime) {
return done(baseOffset, logAppendTime, null, null);
}
/**
* Complete the batch exceptionally. The provided top-level exception will be used
* for each record future contained in the batch.
*
* @param topLevelException top-level partition error
* @param recordExceptions Record exception function mapping batchIndex to the respective record exception
* @return true if the batch was completed as a result of this call, and false
* if it had been completed previously
*/
public boolean completeExceptionally(
RuntimeException topLevelException,
Function recordExceptions
) {
Objects.requireNonNull(topLevelException);
Objects.requireNonNull(recordExceptions);
return done(ProduceResponse.INVALID_OFFSET, RecordBatch.NO_TIMESTAMP, topLevelException, recordExceptions);
}
/**
* Finalize the state of a batch. Final state, once set, is immutable. This function may be called
* once or twice on a batch. It may be called twice if
* 1. An inflight batch expires before a response from the broker is received. The batch's final
* state is set to FAILED. But it could succeed on the broker and second time around batch.done() may
* try to set SUCCEEDED final state.
* 2. If a transaction abortion happens or if the producer is closed forcefully, the final state is
* ABORTED but again it could succeed if broker responds with a success.
*
* Attempted transitions from [FAILED | ABORTED] --> SUCCEEDED are logged.
* Attempted transitions from one failure state to the same or a different failed state are ignored.
* Attempted transitions from SUCCEEDED to the same or a failed state throw an exception.
*
* @param baseOffset The base offset of the messages assigned by the server
* @param logAppendTime The log append time or -1 if CreateTime is being used
* @param topLevelException The exception that occurred (or null if the request was successful)
* @param recordExceptions Record exception function mapping batchIndex to the respective record exception
* @return true if the batch was completed successfully and false if the batch was previously aborted
*/
private boolean done(
long baseOffset,
long logAppendTime,
RuntimeException topLevelException,
Function recordExceptions
) {
final FinalState tryFinalState = (topLevelException == null) ? FinalState.SUCCEEDED : FinalState.FAILED;
if (tryFinalState == FinalState.SUCCEEDED) {
log.trace("Successfully produced messages to {} with base offset {}.", topicPartition, baseOffset);
} else {
log.trace("Failed to produce messages to {} with base offset {}.", topicPartition, baseOffset, topLevelException);
}
if (this.finalState.compareAndSet(null, tryFinalState)) {
completeFutureAndFireCallbacks(baseOffset, logAppendTime, recordExceptions);
return true;
}
if (this.finalState.get() != FinalState.SUCCEEDED) {
if (tryFinalState == FinalState.SUCCEEDED) {
// Log if a previously unsuccessful batch succeeded later on.
log.debug("ProduceResponse returned {} for {} after batch with base offset {} had already been {}.",
tryFinalState, topicPartition, baseOffset, this.finalState.get());
} else {
// FAILED --> FAILED and ABORTED --> FAILED transitions are ignored.
log.debug("Ignored state transition {} -> {} for {} batch with base offset {}",
this.finalState.get(), tryFinalState, topicPartition, baseOffset);
}
} else {
// A SUCCESSFUL batch must not attempt another state change.
throw new IllegalStateException("A " + this.finalState.get() + " batch must not attempt another state change to " + tryFinalState);
}
return false;
}
private void completeFutureAndFireCallbacks(
long baseOffset,
long logAppendTime,
Function recordExceptions
) {
// Set the future before invoking the callbacks as we rely on its state for the `onCompletion` call
produceFuture.set(baseOffset, logAppendTime, recordExceptions);
// execute callbacks
for (int i = 0; i < thunks.size(); i++) {
try {
Thunk thunk = thunks.get(i);
if (thunk.callback != null) {
if (recordExceptions == null) {
RecordMetadata metadata = thunk.future.value();
thunk.callback.onCompletion(metadata, null);
} else {
RuntimeException exception = recordExceptions.apply(i);
thunk.callback.onCompletion(null, exception);
}
}
} catch (Exception e) {
log.error("Error executing user-provided callback on message for topic-partition '{}'", topicPartition, e);
}
}
produceFuture.done();
}
public Deque split(int splitBatchSize) {
Deque batches = new ArrayDeque<>();
MemoryRecords memoryRecords = recordsBuilder.build();
Iterator recordBatchIter = memoryRecords.batches().iterator();
if (!recordBatchIter.hasNext())
throw new IllegalStateException("Cannot split an empty producer batch.");
RecordBatch recordBatch = recordBatchIter.next();
if (recordBatch.magic() < MAGIC_VALUE_V2 && !recordBatch.isCompressed())
throw new IllegalArgumentException("Batch splitting cannot be used with non-compressed messages " +
"with version v0 and v1");
if (recordBatchIter.hasNext())
throw new IllegalArgumentException("A producer batch should only have one record batch.");
Iterator thunkIter = thunks.iterator();
// We always allocate batch size because we are already splitting a big batch.
// And we also Retain the create time of the original batch.
ProducerBatch batch = null;
for (Record record : recordBatch) {
assert thunkIter.hasNext();
Thunk thunk = thunkIter.next();
if (batch == null)
batch = createBatchOffAccumulatorForRecord(record, splitBatchSize);
// A newly created batch can always host the first message.
if (!batch.tryAppendForSplit(record.timestamp(), record.key(), record.value(), record.headers(), thunk)) {
batches.add(batch);
batch.closeForRecordAppends();
batch = createBatchOffAccumulatorForRecord(record, splitBatchSize);
batch.tryAppendForSplit(record.timestamp(), record.key(), record.value(), record.headers(), thunk);
}
}
// Close the last batch and add it to the batch list after split.
if (batch != null) {
batches.add(batch);
batch.closeForRecordAppends();
}
produceFuture.set(ProduceResponse.INVALID_OFFSET, NO_TIMESTAMP, index -> new RecordBatchTooLargeException());
produceFuture.done();
if (hasSequence()) {
int sequence = baseSequence();
ProducerIdAndEpoch producerIdAndEpoch = new ProducerIdAndEpoch(producerId(), producerEpoch());
for (ProducerBatch newBatch : batches) {
newBatch.setProducerState(producerIdAndEpoch, sequence, isTransactional());
sequence += newBatch.recordCount;
}
}
return batches;
}
private ProducerBatch createBatchOffAccumulatorForRecord(Record record, int batchSize) {
int initialSize = Math.max(AbstractRecords.estimateSizeInBytesUpperBound(magic(),
recordsBuilder.compression().type(), record.key(), record.value(), record.headers()), batchSize);
ByteBuffer buffer = ByteBuffer.allocate(initialSize);
// Note that we intentionally do not set producer state (producerId, epoch, sequence, and isTransactional)
// for the newly created batch. This will be set when the batch is dequeued for sending (which is consistent
// with how normal batches are handled).
MemoryRecordsBuilder builder = MemoryRecords.builder(buffer, magic(), recordsBuilder.compression(),
TimestampType.CREATE_TIME, 0L);
return new ProducerBatch(topicPartition, builder, this.createdMs, true);
}
public boolean isCompressed() {
return recordsBuilder.compression().type() != CompressionType.NONE;
}
/**
* A callback and the associated FutureRecordMetadata argument to pass to it.
*/
final private static class Thunk {
final Callback callback;
final FutureRecordMetadata future;
Thunk(Callback callback, FutureRecordMetadata future) {
this.callback = callback;
this.future = future;
}
}
@Override
public String toString() {
return "ProducerBatch(topicPartition=" + topicPartition + ", recordCount=" + recordCount + ")";
}
boolean hasReachedDeliveryTimeout(long deliveryTimeoutMs, long now) {
return deliveryTimeoutMs <= now - this.createdMs;
}
public FinalState finalState() {
return this.finalState.get();
}
int attempts() {
return attempts.get();
}
void reenqueued(long now) {
attempts.getAndIncrement();
lastAttemptMs = Math.max(lastAppendTime, now);
lastAppendTime = Math.max(lastAppendTime, now);
retry = true;
}
long queueTimeMs() {
return drainedMs - createdMs;
}
long waitedTimeMs(long nowMs) {
return Math.max(0, nowMs - lastAttemptMs);
}
void drained(long nowMs) {
this.drainedMs = Math.max(drainedMs, nowMs);
}
boolean isSplitBatch() {
return isSplitBatch;
}
/**
* Returns if the batch is been retried for sending to kafka
*/
public boolean inRetry() {
return this.retry;
}
public MemoryRecords records() {
return recordsBuilder.build();
}
public int estimatedSizeInBytes() {
return recordsBuilder.estimatedSizeInBytes();
}
public double compressionRatio() {
return recordsBuilder.compressionRatio();
}
public boolean isFull() {
return recordsBuilder.isFull();
}
public void setProducerState(ProducerIdAndEpoch producerIdAndEpoch, int baseSequence, boolean isTransactional) {
recordsBuilder.setProducerState(producerIdAndEpoch.producerId, producerIdAndEpoch.epoch, baseSequence, isTransactional);
}
public void resetProducerState(ProducerIdAndEpoch producerIdAndEpoch, int baseSequence) {
log.info("Resetting sequence number of batch with current sequence {} for partition {} to {}",
this.baseSequence(), this.topicPartition, baseSequence);
reopened = true;
recordsBuilder.reopenAndRewriteProducerState(producerIdAndEpoch.producerId, producerIdAndEpoch.epoch, baseSequence, isTransactional());
}
/**
* Release resources required for record appends (e.g. compression buffers). Once this method is called, it's only
* possible to update the RecordBatch header.
*/
public void closeForRecordAppends() {
recordsBuilder.closeForRecordAppends();
}
public void close() {
recordsBuilder.close();
if (!recordsBuilder.isControlBatch()) {
CompressionRatioEstimator.updateEstimation(topicPartition.topic(),
recordsBuilder.compression().type(),
(float) recordsBuilder.compressionRatio());
}
reopened = false;
}
/**
* Abort the record builder and reset the state of the underlying buffer. This is used prior to aborting
* the batch with {@link #abort(RuntimeException)} and ensures that no record previously appended can be
* read. This is used in scenarios where we want to ensure a batch ultimately gets aborted, but in which
* it is not safe to invoke the completion callbacks (e.g. because we are holding a lock, such as
* when aborting batches in {@link RecordAccumulator}).
*/
public void abortRecordAppends() {
recordsBuilder.abort();
}
public boolean isClosed() {
return recordsBuilder.isClosed();
}
public ByteBuffer buffer() {
return recordsBuilder.buffer();
}
public int initialCapacity() {
return recordsBuilder.initialCapacity();
}
public boolean isWritable() {
return !recordsBuilder.isClosed();
}
public byte magic() {
return recordsBuilder.magic();
}
public long producerId() {
return recordsBuilder.producerId();
}
public short producerEpoch() {
return recordsBuilder.producerEpoch();
}
public int baseSequence() {
return recordsBuilder.baseSequence();
}
public int lastSequence() {
return recordsBuilder.baseSequence() + recordsBuilder.numRecords() - 1;
}
public boolean hasSequence() {
return baseSequence() != RecordBatch.NO_SEQUENCE;
}
public boolean isTransactional() {
return recordsBuilder.isTransactional();
}
public boolean sequenceHasBeenReset() {
return reopened;
}
// VisibleForTesting
OptionalInt currentLeaderEpoch() {
return currentLeaderEpoch;
}
// VisibleForTesting
int attemptsWhenLeaderLastChanged() {
return attemptsWhenLeaderLastChanged;
}
}