/**
* Copyright (c) 2019 - 2024 StreamNative, Inc.. All Rights Reserved.
*/
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.streamnative.pulsar.handlers.kop.storage;
import static com.google.common.base.Preconditions.checkArgument;
import static io.streamnative.pulsar.handlers.kop.AdminManager.KOP_KAFKA_PROPERTY_PREFIX;
import static io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils.isBrokerIndexMetadataInterceptorConfigured;
import static org.apache.kafka.common.config.TopicConfig.CLEANUP_POLICY_COMPACT;
import static org.apache.kafka.common.config.TopicConfig.CLEANUP_POLICY_CONFIG;
import static org.apache.kafka.common.internals.Topic.GROUP_METADATA_TOPIC_NAME;
import com.google.common.annotations.VisibleForTesting;
import io.netty.buffer.ByteBuf;
import io.netty.util.Recycler;
import io.netty.util.concurrent.EventExecutor;
import io.streamnative.pulsar.handlers.kop.KafkaServiceConfiguration;
import io.streamnative.pulsar.handlers.kop.KafkaTopicConsumerManager;
import io.streamnative.pulsar.handlers.kop.KafkaTopicLookupService;
import io.streamnative.pulsar.handlers.kop.KafkaTopicManager;
import io.streamnative.pulsar.handlers.kop.MessageFetchContext;
import io.streamnative.pulsar.handlers.kop.MessagePublishContext;
import io.streamnative.pulsar.handlers.kop.PendingTopicFutures;
import io.streamnative.pulsar.handlers.kop.RequestStats;
import io.streamnative.pulsar.handlers.kop.exceptions.MetadataCorruptedException;
import io.streamnative.pulsar.handlers.kop.format.DecodeResult;
import io.streamnative.pulsar.handlers.kop.format.EncodeRequest;
import io.streamnative.pulsar.handlers.kop.format.EncodeResult;
import io.streamnative.pulsar.handlers.kop.format.EntryFormatter;
import io.streamnative.pulsar.handlers.kop.format.EntryFormatterFactory;
import io.streamnative.pulsar.handlers.kop.format.KafkaMixedEntryFormatter;
import io.streamnative.pulsar.handlers.kop.format.PulsarEntryFormatter;
import io.streamnative.pulsar.handlers.kop.topic.KopPersistentTopic;
import io.streamnative.pulsar.handlers.kop.utils.KopLogValidator;
import io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils;
import io.streamnative.pulsar.handlers.kop.utils.MetadataUtils;
import io.streamnative.pulsar.handlers.kop.utils.TopicNameUtils;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.StringJoiner;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Consumer;
import javax.annotation.Nullable;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.Getter;
import lombok.ToString;
import lombok.experimental.Accessors;
import lombok.extern.slf4j.Slf4j;
import org.apache.bookkeeper.common.util.MathUtils;
import org.apache.bookkeeper.common.util.OrderedExecutor;
import org.apache.bookkeeper.mledger.AsyncCallbacks;
import org.apache.bookkeeper.mledger.Entry;
import org.apache.bookkeeper.mledger.ManagedCursor;
import org.apache.bookkeeper.mledger.ManagedLedger;
import org.apache.bookkeeper.mledger.ManagedLedgerException;
import org.apache.bookkeeper.mledger.Position;
import org.apache.bookkeeper.mledger.impl.ManagedLedgerImpl;
import org.apache.bookkeeper.mledger.impl.NonDurableCursorImpl;
import org.apache.bookkeeper.mledger.impl.PositionImpl;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.kafka.common.InvalidRecordException;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.CorruptRecordException;
import org.apache.kafka.common.errors.KafkaStorageException;
import org.apache.kafka.common.errors.NotLeaderOrFollowerException;
import org.apache.kafka.common.errors.RecordBatchTooLargeException;
import org.apache.kafka.common.errors.UnknownServerException;
import org.apache.kafka.common.message.DescribeProducersResponseData;
import org.apache.kafka.common.message.FetchRequestData;
import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.protocol.Errors;
import org.apache.kafka.common.record.CompressionType;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.record.Record;
import org.apache.kafka.common.record.RecordBatch;
import org.apache.kafka.common.requests.FetchResponse;
import org.apache.kafka.common.utils.Time;
import org.apache.pulsar.broker.service.persistent.PersistentTopic;
import org.apache.pulsar.broker.service.plugin.EntryFilter;
import org.apache.pulsar.common.naming.TopicName;
import org.apache.pulsar.common.util.FutureUtil;
import org.apache.pulsar.compaction.CompactedTopicUtils;
import org.apache.pulsar.compaction.Compactor;
import org.apache.pulsar.compaction.TopicCompactionService;
/**
* Analyze result.
*/
record AnalyzeResult(Map<Long, ProducerAppendInfo> updatedProducers, List<CompletedTxn> completedTxns) {
@Override
public boolean equals(Object obj) {
if (obj instanceof AnalyzeResult that) {
return Objects.equals(this.updatedProducers, that.updatedProducers)
&& Objects.equals(this.completedTxns, that.completedTxns);
}
return false;
}
}
/**
* An append-only log for storing messages. Mapping to Kafka Log.scala.
*/
@Slf4j
public class PartitionLog {
public static final String KAFKA_TOPIC_UUID_PROPERTY_NAME = "kafkaTopicUUID";
public static final String KAFKA_ENTRY_FORMATTER_PROPERTY_NAME = "kafkaEntryFormat";
private static final String PID_PREFIX = "KOP-PID-PREFIX";
private static final KopLogValidator.CompressionCodec DEFAULT_COMPRESSION =
new KopLogValidator.CompressionCodec(CompressionType.NONE.name, CompressionType.NONE.id);
private final KafkaServiceConfiguration kafkaConfig;
private final RequestStats requestStats;
private final Time time;
private final String fullPartitionName;
// We can use fullPartitionName as the key of StatsLogger in future to avoid using TopicPartition internally
private final TopicPartition statsTopicPartition;
@Getter
@VisibleForTesting
volatile ProducerStateManager producerStateManager;
private final List<EntryFilter> entryFilters;
private final KafkaTopicLookupService kafkaTopicLookupService;
private final ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer;
private final ExecutorService recoveryExecutor;
@VisibleForTesting
volatile PersistentTopic persistentTopic;
private final CompletableFuture<PartitionLog> initFuture = new CompletableFuture<>();
private volatile Map<String, String> topicProperties;
private final boolean isOffsetTopic;
private volatile EntryFormatter entryFormatter;
private volatile String kafkaTopicUUID;
private volatile boolean unloaded = false;
@Getter
private final LongAdder pendingPublishOps = new LongAdder();
public PartitionLog(KafkaServiceConfiguration kafkaConfig,
RequestStats requestStats,
Time time,
String fullPartitionName,
List<EntryFilter> entryFilters,
KafkaTopicLookupService kafkaTopicLookupService,
ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer,
OrderedExecutor recoveryExecutor) {
this.kafkaConfig = kafkaConfig;
this.entryFilters = entryFilters;
this.requestStats = requestStats;
this.time = time;
this.fullPartitionName = fullPartitionName;
final var prefix = "persistent://" + kafkaConfig.getDefaultNamespacePrefix();
if (fullPartitionName.startsWith(prefix)) {
this.statsTopicPartition = TopicNameUtils.splitTopicPartition(fullPartitionName.substring(prefix.length()));
} else {
this.statsTopicPartition = TopicNameUtils.splitTopicPartition(fullPartitionName);
}
final int index = fullPartitionName.indexOf(TopicName.PARTITIONED_TOPIC_SUFFIX);
this.isOffsetTopic = (index >= 0 && fullPartitionName.substring(0, index).endsWith(
kafkaConfig.getKafkaMetadataNamespace() + "/" + GROUP_METADATA_TOPIC_NAME));
this.kafkaTopicLookupService = kafkaTopicLookupService;
this.producerStateManagerSnapshotBuffer = producerStateManagerSnapshotBuffer;
this.recoveryExecutor = recoveryExecutor.chooseThread(fullPartitionName);
}
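/**
* Initialise this partition log: load the topic properties, build the entry formatter and, if the
* Kafka transaction coordinator is enabled, recover the producer state. The outcome is exposed via
* {@link #awaitInitialisation()}; on failure the future completes with a NotLeaderOrFollowerException.
*/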
public void initialise() {
loadTopicProperties().whenComplete((___, errorLoadTopic) -> {
if (errorLoadTopic != null) {
log.warn("Failed to load {}", fullPartitionName, errorLoadTopic);
initFuture.completeExceptionally(new NotLeaderOrFollowerException());
return;
}
CompletableFuture<?> recoverFuture = CompletableFuture.completedFuture(null);
if (kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
recoverFuture = producerStateManager.recover(this, recoveryExecutor);
}
recoverFuture.thenRun(() -> {
if (persistentTopic instanceof KopPersistentTopic kopPersistentTopic) {
kopPersistentTopic.updateKafkaTopicUUID(kafkaTopicUUID != null ? kafkaTopicUUID : "");
}
}).thenRun(() -> initFuture.complete(this))
.exceptionally(error -> {
log.warn("Failed to recover {}", fullPartitionName, error);
initFuture.completeExceptionally(new NotLeaderOrFollowerException());
return null;
});
});
}
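/**
* @return a future that completes with this PartitionLog once {@link #initialise()} has finished,
* or exceptionally if initialisation failed.
*/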
public CompletableFuture<PartitionLog> awaitInitialisation() {
return initFuture;
}
public boolean isInitialised() {
return initFuture.isDone() && !initFuture.isCompletedExceptionally();
}
public boolean isInitialisationFailed() {
return initFuture.isDone() && initFuture.isCompletedExceptionally();
}
private CompletableFuture<Void> loadTopicProperties() {
CompletableFuture<Optional<PersistentTopic>> persistentTopicFuture =
kafkaTopicLookupService.getTopic(fullPartitionName, this);
return persistentTopicFuture
.thenCompose(this::fetchTopicProperties)
.thenAccept(properties -> {
this.topicProperties = properties;
log.info("Topic properties for {} are {}", fullPartitionName, properties);
if (isOffsetTopic) {
this.entryFormatter = PulsarEntryFormatter.offsetControlRecordFormatter();
} else {
this.entryFormatter = buildEntryFormatter(topicProperties);
}
this.kafkaTopicUUID = properties.getOrDefault(KAFKA_TOPIC_UUID_PROPERTY_NAME, "");
if (!MetadataUtils.isSystemTopic(persistentTopic, kafkaConfig)) {
checkArgument(this.kafkaTopicUUID != null);
}
// TODO: load the HW from the metadata store
this.producerStateManager =
new ProducerStateManager(
fullPartitionName,
kafkaTopicUUID,
producerStateManagerSnapshotBuffer,
kafkaConfig.getKafkaTxnProducerStateTopicSnapshotIntervalSeconds(),
kafkaConfig.getKafkaTxnPurgeAbortedTxnIntervalSeconds(),
kafkaConfig.getKafkaTxnMaxDifferentMessageToSnapshotThreshold(),
this);
});
}
private CompletableFuture<Map<String, String>> fetchTopicProperties(Optional<PersistentTopic> persistentTopic) {
if (!persistentTopic.isPresent()) {
log.info("Topic {} not loaded here", fullPartitionName);
return FutureUtil.failedFuture(new NotLeaderOrFollowerException());
}
this.persistentTopic = persistentTopic.get();
TopicName logicalName = TopicName.get(persistentTopic.get().getName());
TopicName actualName;
if (logicalName.isPartitioned()) {
actualName = TopicName.getPartitionedTopicName(persistentTopic.get().getName());
} else {
actualName = logicalName;
}
return persistentTopic.get().getBrokerService()
.fetchPartitionedTopicMetadataAsync(actualName, true)
.thenApply(metadata -> {
if (metadata.partitions > 0) {
return metadata.properties;
} else {
return persistentTopic.get().getManagedLedger().getProperties();
}
})
.thenApply(map -> map != null ? map : Collections.emptyMap());
}
private EntryFormatter buildEntryFormatter(Map<String, String> topicProperties) {
final String entryFormat;
if (topicProperties != null) {
entryFormat = topicProperties
.getOrDefault(KAFKA_ENTRY_FORMATTER_PROPERTY_NAME, kafkaConfig.getEntryFormat());
} else {
entryFormat = kafkaConfig.getEntryFormat();
}
if (log.isDebugEnabled()) {
log.debug("entryFormat for {} is {} (topicProperties {})", fullPartitionName,
entryFormat, topicProperties);
}
return EntryFormatterFactory.create(kafkaConfig, entryFilters, entryFormat);
}
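/**
* Summary of a batch of records to append, produced by {@link #analyzeAndValidateRecords(MemoryRecords)}.
*/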
@Data
@Accessors(fluent = true)
@AllArgsConstructor
public static class LogAppendInfo {
private Optional<Long> firstOffset;
private Optional<Long> producerId;
private String producerName;
private short producerEpoch;
private int numMessages;
private int shallowCount;
private boolean isTransaction;
private boolean isControlBatch;
private int validBytes;
private int firstSequence;
private int lastSequence;
private KopLogValidator.CompressionCodec sourceCodec;
private KopLogValidator.CompressionCodec targetCodec;
private boolean compactedTopic;
}
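/**
* Recyclable holder for the result of a read: the decoded records plus the watermarks and the last
* read position, or an error code.
*/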
@Data
@ToString
@Accessors(fluent = true)
@AllArgsConstructor
public static class ReadRecordsResult {
private static final Recycler<ReadRecordsResult> RECYCLER = new Recycler<ReadRecordsResult>() {
protected ReadRecordsResult newObject(Handle<ReadRecordsResult> handle) {
return new ReadRecordsResult(handle);
}
};
private final Recycler.Handle<ReadRecordsResult> recyclerHandle;
private DecodeResult decodeResult;
private List<FetchResponseData.AbortedTransaction> abortedTransactions;
private long highWatermark;
private long lastStableOffset;
private Position lastPosition;
private Errors errors;
private PartitionLog partitionLog;
private ReadRecordsResult(Recycler.Handle<ReadRecordsResult> recyclerHandle) {
this.recyclerHandle = recyclerHandle;
}
public Errors errors() {
return errors == null ? Errors.NONE : errors;
}
public static ReadRecordsResult get(DecodeResult decodeResult,
List<FetchResponseData.AbortedTransaction> abortedTransactions,
long highWatermark,
long lastStableOffset,
Position lastPosition,
PartitionLog partitionLog) {
return ReadRecordsResult.get(
decodeResult,
abortedTransactions,
highWatermark,
lastStableOffset,
lastPosition,
null,
partitionLog);
}
public static ReadRecordsResult get(DecodeResult decodeResult,
List<FetchResponseData.AbortedTransaction> abortedTransactions,
long highWatermark,
long lastStableOffset,
Position lastPosition,
Errors errors,
PartitionLog partitionLog) {
ReadRecordsResult readRecordsResult = RECYCLER.get();
readRecordsResult.decodeResult = decodeResult;
readRecordsResult.abortedTransactions = abortedTransactions;
readRecordsResult.highWatermark = highWatermark;
readRecordsResult.lastStableOffset = lastStableOffset;
readRecordsResult.lastPosition = lastPosition;
readRecordsResult.errors = errors;
readRecordsResult.partitionLog = partitionLog;
return readRecordsResult;
}
public static ReadRecordsResult empty(long highWatermark,
long lastStableOffset,
Position lastPosition,
PartitionLog partitionLog) {
return ReadRecordsResult.get(
DecodeResult.get(MemoryRecords.EMPTY),
Collections.emptyList(),
highWatermark,
lastStableOffset,
lastPosition,
partitionLog);
}
public static ReadRecordsResult error(Errors errors, PartitionLog partitionLog) {
return ReadRecordsResult.error(PositionImpl.EARLIEST, errors, partitionLog);
}
public static ReadRecordsResult error(Position position, Errors errors, PartitionLog partitionLog) {
return ReadRecordsResult.get(null,
null,
-1,
-1,
position,
errors,
partitionLog);
}
public FetchResponseData.PartitionData toPartitionData() {
// There are three cases:
//
// 1. errors == null: the decode result contains records.
// 2. errors == Errors.NONE: the result is empty.
// 3. errors == any other error: return that error.
if (errors != null) {
return new FetchResponseData.PartitionData()
.setErrorCode(errors.code())
.setHighWatermark(FetchResponse.INVALID_HIGH_WATERMARK)
.setLastStableOffset(FetchResponse.INVALID_LAST_STABLE_OFFSET)
.setLogStartOffset(FetchResponse.INVALID_LOG_START_OFFSET)
.setRecords(MemoryRecords.EMPTY);
}
return new FetchResponseData.PartitionData()
.setErrorCode(Errors.NONE.code())
.setHighWatermark(highWatermark)
.setLastStableOffset(lastStableOffset)
.setLogStartOffset(highWatermark) // TODO: should it be changed to the logStartOffset?
.setAbortedTransactions(abortedTransactions)
.setRecords(decodeResult.getRecords());
}
public void recycle() {
this.errors = null;
this.lastPosition = null;
this.lastStableOffset = -1;
this.highWatermark = -1;
this.abortedTransactions = null;
this.partitionLog = null;
if (this.decodeResult != null) {
this.decodeResult.recycle();
this.decodeResult = null;
}
}
}
/**
* AppendOrigin is used to mark the data origin.
*/
public enum AppendOrigin {
Coordinator,
Client,
Log
}
// KSN does not modify the records from the client, so the base offset and last offset fields are both not set.
// Hence, here we pass the first offset and last offset explicitly.
private AnalyzeResult analyzeAndValidateProducerState(long firstOffset, long lastOffset, MemoryRecords records,
AppendOrigin origin) {
final var updatedProducers = new HashMap<Long, ProducerAppendInfo>();
final var completedTxns = new ArrayList<CompletedTxn>();
records.batches().forEach(batch -> {
if (batch.hasProducerId()) {
updateProducers(batch, updatedProducers, firstOffset, lastOffset, origin)
.ifPresent(completedTxns::add);
}
});
return new AnalyzeResult(updatedProducers, completedTxns);
}
private Optional<CompletedTxn> updateProducers(RecordBatch batch, Map<Long, ProducerAppendInfo> producers,
long firstOffset, long lastOffset,
AppendOrigin origin) {
Long producerId = batch.producerId();
ProducerAppendInfo appendInfo =
producers.computeIfAbsent(producerId, pid -> producerStateManager.prepareUpdate(pid, origin));
return appendInfo.append(batch, firstOffset, lastOffset, null);
}
// Note: this method must be called after initFuture completes successfully
public Optional<Long> firstUndecidedOffset() {
return producerStateManager.firstUndecidedOffset();
}
private List<FetchResponseData.AbortedTransaction> getAbortedIndexList(long fetchOffset) {
return producerStateManager.getAbortedIndexList(fetchOffset);
}
/**
* Append this message to pulsar.
*
* @param records The log records to append
* @param origin Declares the origin of the append, which affects the required validations
* @param appendRecordsContext See {@link AppendRecordsContext}
*/
public CompletableFuture<Long> appendRecords(final MemoryRecords records,
final AppendOrigin origin,
final AppendRecordsContext appendRecordsContext) {
CompletableFuture<Long> appendFuture = new CompletableFuture<>();
KafkaTopicManager topicManager = appendRecordsContext.getTopicManager();
if (topicManager == null) {
log.error("topicManager is null for {}", fullPartitionName,
new Exception("topicManager is null for " + fullPartitionName).fillInStackTrace());
return CompletableFuture
.failedFuture(new KafkaStorageException("topicManager is null for " + fullPartitionName));
}
final long beforeRecordsProcess = time.nanoseconds();
try {
final LogAppendInfo appendInfo = analyzeAndValidateRecords(records);
// return if we have no valid messages or if this is a duplicate of the last appended entry
if (appendInfo.shallowCount() == 0) {
appendFuture.complete(appendInfo.firstOffset().orElse(-1L));
return appendFuture;
}
MemoryRecords validRecords = trimInvalidBytes(records, appendInfo);
// Append Message into pulsar
final long startEnqueueNanos = MathUtils.nowInNano();
final Consumer sequentialExecutor = __ -> {
long messageQueuedLatencyNanos = MathUtils.elapsedNanos(startEnqueueNanos);
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getMessageQueuedLatencyStats().registerSuccessfulEvent(
messageQueuedLatencyNanos, TimeUnit.NANOSECONDS);
});
if (persistentTopic.isSystemTopic()) {
log.error("Not support producing message to system topic: {}", persistentTopic);
appendFuture.completeExceptionally(Errors.INVALID_TOPIC_EXCEPTION.exception());
return;
}
final ManagedLedger managedLedger = persistentTopic.getManagedLedger();
if (entryFormatter instanceof KafkaMixedEntryFormatter) {
final long logEndOffset = MessageMetadataUtils.getLogEndOffset(managedLedger);
appendInfo.firstOffset(Optional.of(logEndOffset));
}
String cleanupPolicy = PartitionLog.this.topicProperties
.get(KOP_KAFKA_PROPERTY_PREFIX + CLEANUP_POLICY_CONFIG);
if (cleanupPolicy != null && cleanupPolicy.contains(CLEANUP_POLICY_COMPACT)) {
appendInfo.compactedTopic = true;
}
final EncodeRequest encodeRequest = EncodeRequest.get(validRecords, appendInfo);
long pendingTopicLatencyNanos = time.nanoseconds() - beforeRecordsProcess;
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getPendingTopicLatencyStats().registerSuccessfulEvent(
pendingTopicLatencyNanos, TimeUnit.NANOSECONDS);
});
long beforeEncodingStarts = time.nanoseconds();
final EncodeResult encodeResult = entryFormatter.encode(encodeRequest);
encodeRequest.recycle();
// Check the converted buffer again
if (encodeResult.getEncodedByteBuf().readableBytes() > kafkaConfig.getMaxMessageSize()) {
appendFuture.completeExceptionally(new RecordBatchTooLargeException(String.format("Converted buffer"
+ " size is %d in append to partition %s which exceeds the maximum configured size of %d.",
encodeResult.getEncodedByteBuf().readableBytes(), fullPartitionName,
kafkaConfig.getMaxMessageSize())));
encodeResult.recycle();
return;
}
long encodeLatencyNanos = time.nanoseconds() - beforeEncodingStarts;
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getProduceEncodeStats().registerSuccessfulEvent(
encodeLatencyNanos, TimeUnit.NANOSECONDS);
});
appendRecordsContext.getStartSendOperationForThrottling()
.accept(encodeResult.getEncodedByteBuf().readableBytes());
appendRecordsContext
.getTopicManager()
.registerProducerInPersistentTopic(fullPartitionName, persistentTopic)
.ifPresent((producer) -> {
// collect metrics
encodeResult.updateProducerStats(statsTopicPartition, requestStats, producer,
appendRecordsContext.getEventExecutor());
});
publishMessages(appendFuture, appendInfo, encodeResult, appendRecordsContext);
};
appendRecordsContext.getPendingTopicFuturesMap()
.computeIfAbsent(fullPartitionName, ignored -> new PendingTopicFutures())
.addListener(initFuture, sequentialExecutor, throwable -> {
long messageQueuedLatencyNanos = MathUtils.elapsedNanos(startEnqueueNanos);
appendRecordsContext.getCtx().executor().execute(() -> {
requestStats.getMessageQueuedLatencyStats().registerFailedEvent(
messageQueuedLatencyNanos, TimeUnit.NANOSECONDS);
});
appendFuture.completeExceptionally(throwable);
});
} catch (Exception exception) {
log.error("Failed to handle produce request for {}", fullPartitionName, exception);
appendFuture.completeExceptionally(exception);
}
return appendFuture;
}
// Note: this method must be called after initFuture completes successfully
public Position getLastPosition() {
return persistentTopic.getLastPosition();
}
// Note: this method must be called after initFuture completes successfully
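/**
* Read records from the partition starting at the fetch offset of the given partition data.
* The returned future is always completed normally; failures are reported through
* {@link ReadRecordsResult#errors()}.
*/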
public CompletableFuture<ReadRecordsResult> readRecords(final FetchRequestData.FetchPartition partitionData,
final boolean readCommitted,
final AtomicLong limitBytes,
final int maxReadEntriesNum,
final MessageFetchContext context) {
final long startPrepareMetadataNanos = MathUtils.nowInNano();
final CompletableFuture<ReadRecordsResult> future = new CompletableFuture<>();
final long offset = partitionData.fetchOffset();
KafkaTopicManager topicManager = context.getTopicManager();
// The future that is returned by getTopicConsumerManager is always completed normally
topicManager.getTopicConsumerManager(fullPartitionName).thenAccept(tcm -> {
if (tcm == null) {
registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
// remove null future cache
context.getSharedState().getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
if (log.isDebugEnabled()) {
log.debug("Fetch for {}: no tcm return NOT_LEADER_FOR_PARTITION.", fullPartitionName);
}
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return;
}
if (checkOffsetOutOfRange(tcm, offset, startPrepareMetadataNanos, context.getEventExecutor())) {
future.complete(ReadRecordsResult.error(Errors.OFFSET_OUT_OF_RANGE, this));
return;
}
if (log.isDebugEnabled()) {
log.debug("Fetch for {}: remove tcm to get cursor for fetch offset: {} .", fullPartitionName, offset);
}
final CompletableFuture<Pair<ManagedCursor, Long>> cursorFuture = tcm.removeCursorFuture(offset);
if (cursorFuture == null) {
// tcm is closed, just return a NONE error because the channel may still be active
log.warn("KafkaTopicConsumerManager is closed, remove TCM of {}", fullPartitionName);
registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
context.getSharedState().getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
future.complete(ReadRecordsResult.error(Errors.NONE, this));
return;
}
cursorFuture.thenAccept((cursorLongPair) -> {
if (cursorLongPair == null) {
log.warn("KafkaTopicConsumerManager.remove({}) return null for topic {}. "
+ "Fetch for topic return error.", offset, fullPartitionName);
registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return;
}
final ManagedCursor cursor = cursorLongPair.getLeft();
final AtomicLong cursorOffset = new AtomicLong(cursorLongPair.getRight());
final var startReadEntriesNanos = MathUtils.nowInNano();
requestStats.getPrepareMetadataStats().registerSuccessfulEvent(
startReadEntriesNanos - startPrepareMetadataNanos, TimeUnit.NANOSECONDS);
long adjustedMaxBytes = Math.min(partitionData.partitionMaxBytes(), limitBytes.get());
if (readCommitted) {
long firstUndecidedOffset = producerStateManager.firstUndecidedOffset().orElse(-1L);
if (firstUndecidedOffset >= 0 && firstUndecidedOffset <= offset) {
long highWaterMark = MessageMetadataUtils.getHighWatermark(cursor.getManagedLedger());
future.complete(
ReadRecordsResult.empty(
highWaterMark,
firstUndecidedOffset,
tcm.getManagedLedger().getLastConfirmedEntry(), this
)
);
return;
}
}
readEntries(tcm, cursor, cursorOffset, maxReadEntriesNum,
adjustedMaxBytes,
fullPartitionName -> {
topicManager.invalidateCacheForFencedManagerLedgerOnTopic(fullPartitionName);
})
.whenComplete((entries, throwable) -> {
final var messageReadStats = requestStats.getMessageReadStats();
final var startHandleEntriesNanos = MathUtils.nowInNano();
if (throwable != null) {
messageReadStats.registerFailedEvent(startHandleEntriesNanos - startReadEntriesNanos,
TimeUnit.NANOSECONDS);
tcm.deleteOneCursorAsync(cursorLongPair.getLeft(),
"cursor.readEntry fail. deleteCursor");
if (throwable instanceof ManagedLedgerException.CursorAlreadyClosedException
|| throwable instanceof ManagedLedgerException.ManagedLedgerFencedException) {
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return;
}
log.error("Read entry error on {}", partitionData, throwable);
future.complete(ReadRecordsResult.error(Errors.UNKNOWN_SERVER_ERROR, this));
return;
}
messageReadStats.registerSuccessfulEvent(startHandleEntriesNanos - startReadEntriesNanos,
TimeUnit.NANOSECONDS);
long readSize = entries.stream().mapToLong(Entry::getLength).sum();
limitBytes.addAndGet(-1 * readSize);
// Add new offset back to TCM after entries are read successfully
tcm.add(cursorOffset.get(), Pair.of(cursor, cursorOffset.get()));
handleEntries(future, entries, partitionData, tcm, cursor, readCommitted, context,
startHandleEntriesNanos);
});
}).exceptionally(ex -> {
registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
context.getSharedState()
.getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return null;
});
});
return future;
}
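// Returns true if the requested offset is beyond the current log end offset. Reads from before the
// log start offset are not detected here (see the TODO below).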
private boolean checkOffsetOutOfRange(KafkaTopicConsumerManager tcm,
long offset,
long startPrepareMetadataNanos,
EventExecutor eventExecutor) {
// handle offset out-of-range exception
ManagedLedgerImpl managedLedger = (ManagedLedgerImpl) tcm.getManagedLedger();
long logEndOffset = MessageMetadataUtils.getLogEndOffset(managedLedger);
// TODO: Offset out-of-range checks are still incomplete.
// We only check the case of `offset > logEndOffset`; the case of `offset < logStartOffset`
// is currently not handled.
// That is because obtaining `logStartOffset` requires reading from disk,
// and such a time-consuming operation is likely to harm the performance of FETCH requests.
// For more discussion please refer to https://github.com/streamnative/kop/pull/531
if (offset > logEndOffset) {
log.error("Received request for offset {} for partition {}, "
+ "but we only have entries less than {}.",
offset, fullPartitionName, logEndOffset);
if (startPrepareMetadataNanos > 0) {
registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
}
return true;
}
return false;
}
private void registerPrepareMetadataFailedEvent(long startPrepareMetadataNanos) {
long prepareMetadataNanos = MathUtils.elapsedNanos(startPrepareMetadataNanos);
this.requestStats.getPrepareMetadataStats().registerFailedEvent(prepareMetadataNanos, TimeUnit.NANOSECONDS);
}
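// Decode the entries that were read, apply the read-committed filtering if needed and complete the
// future with a ReadRecordsResult. The heavyweight decoding runs on the decode executor, not on the
// managed ledger thread.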
private void handleEntries(final CompletableFuture<ReadRecordsResult> future,
final List<Entry> entries,
final FetchRequestData.FetchPartition partitionData,
final KafkaTopicConsumerManager tcm,
final ManagedCursor cursor,
final boolean readCommitted,
final MessageFetchContext context,
final long startHandleEntriesNanos) {
final long highWatermark = MessageMetadataUtils.getHighWatermark(cursor.getManagedLedger());
final long lso = (readCommitted
? this.firstUndecidedOffset().orElse(highWatermark) : highWatermark);
final List<Entry> committedEntries = readCommitted ? getCommittedEntries(entries, lso) : entries;
if (log.isDebugEnabled()) {
log.debug("Read {} entries but only {} entries are committed, lso {}, highWatermark {}",
entries.size(), committedEntries.size(), lso, highWatermark);
}
if (committedEntries.isEmpty()) {
future.complete(ReadRecordsResult.error(tcm.getManagedLedger().getLastConfirmedEntry(), Errors.NONE,
this));
return;
}
// use compatible magic value by apiVersion
final byte magic = getCompatibleMagic(context.getHeader().apiVersion());
// this part is heavyweight, and we should not execute in the ManagedLedger Ordered executor thread
final CompletableFuture<String> groupNameFuture = kafkaConfig.isKopEnableGroupLevelConsumerMetrics()
? context.getCurrentConnectedGroupNameAsync() : CompletableFuture.completedFuture(null);
groupNameFuture.whenCompleteAsync((groupName, ex) -> {
if (ex != null) {
log.error("Get groupId failed.", ex);
groupName = "";
}
// Get the last entry position for delayed fetch.
Position lastPosition = this.getLastPositionFromEntries(committedEntries);
final long startDecodingEntriesNanos = MathUtils.nowInNano();
final DecodeResult decodeResult = entryFormatter.decode(committedEntries, magic);
long fetchDecodeLatencyNanos = MathUtils.elapsedNanos(startDecodingEntriesNanos);
requestStats.getFetchDecodeStats().registerSuccessfulEvent(fetchDecodeLatencyNanos, TimeUnit.NANOSECONDS);
// collect consumer metrics
decodeResult.updateConsumerStats(statsTopicPartition, committedEntries.size(),
groupName, requestStats);
List<FetchResponseData.AbortedTransaction> abortedTransactions = null;
if (readCommitted) {
abortedTransactions = this.getAbortedIndexList(partitionData.fetchOffset());
}
if (log.isDebugEnabled()) {
log.debug("Partition {} read entry completed in {} ns",
fullPartitionName, MathUtils.nowInNano() - startDecodingEntriesNanos);
}
requestStats.getHandleEntriesStats().registerSuccessfulEvent(
MathUtils.elapsedNanos(startHandleEntriesNanos), TimeUnit.NANOSECONDS);
future.complete(ReadRecordsResult
.get(decodeResult, abortedTransactions, highWatermark, lso, lastPosition, this));
}, context.getDecodeExecutor()).exceptionally(ex -> {
log.error("Partition {} read entry exceptionally. ", fullPartitionName, ex);
requestStats.getHandleEntriesStats().registerFailedEvent(
MathUtils.elapsedNanos(startHandleEntriesNanos), TimeUnit.NANOSECONDS);
future.complete(ReadRecordsResult.error(Errors.KAFKA_STORAGE_ERROR, this));
return null;
});
}
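// Map the Fetch request apiVersion to the highest record-batch magic value the client can understand.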
private static byte getCompatibleMagic(short apiVersion) {
final byte magic;
if (apiVersion <= 1) {
magic = RecordBatch.MAGIC_VALUE_V0;
} else if (apiVersion <= 3) {
magic = RecordBatch.MAGIC_VALUE_V1;
} else {
magic = RecordBatch.CURRENT_MAGIC_VALUE;
}
return magic;
}
private Position getLastPositionFromEntries(List<Entry> entries) {
if (entries == null || entries.isEmpty()) {
return PositionImpl.EARLIEST;
}
return entries.get(entries.size() - 1).getPosition();
}
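// Keep only the entries whose base offset is below the last stable offset and release the rest,
// so that read-committed consumers never see records from transactions that are still open.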
private List<Entry> getCommittedEntries(List<Entry> entries, long lso) {
final List<Entry> committedEntries = new ArrayList<>();
for (Entry entry : entries) {
try {
if (lso > MessageMetadataUtils.peekBaseOffsetFromEntry(entry)) {
committedEntries.add(entry);
} else {
break;
}
} catch (MetadataCorruptedException e) {
log.error("[{}:{}] Failed to peek base offset from entry.",
entry.getLedgerId(), entry.getEntryId());
}
}
// Release all the entries that are not in the result
for (int i = committedEntries.size(); i < entries.size(); i++) {
entries.get(i).release();
}
return committedEntries;
}
/**
* Read Entries by cursor.
*
* @return a {@code CompletableFuture<List<Entry>>};
* when the completable future completes normally, the list of entries will never be null.
*/
private CompletableFuture<List<Entry>> readEntries(final KafkaTopicConsumerManager tcm,
final ManagedCursor cursor,
final AtomicLong cursorOffset,
final int maxReadEntriesNum,
final long adjustedMaxBytes,
final Consumer<String> invalidateCacheOnTopic) {
final CompletableFuture<List<Entry>> readFuture = new CompletableFuture<>();
if (adjustedMaxBytes <= 0) {
readFuture.complete(Collections.emptyList());
return readFuture;
}
final long originalOffset = cursorOffset.get();
AsyncCallbacks.ReadEntriesCallback readEntriesCallback = new AsyncCallbacks.ReadEntriesCallback() {
@Override
public void readEntriesComplete(List<Entry> entries, Object ctx) {
if (!entries.isEmpty()) {
final Entry lastEntry = entries.get(entries.size() - 1);
final PositionImpl currentPosition = PositionImpl.get(
lastEntry.getLedgerId(), lastEntry.getEntryId());
try {
final long lastOffset = MessageMetadataUtils.peekOffsetFromEntry(lastEntry);
// commit the offset so that the backlog is not affected by this cursor.
commitOffset((NonDurableCursorImpl) cursor, currentPosition);
// and add back to TCM when all read complete.
cursorOffset.set(lastOffset + 1);
if (log.isDebugEnabled()) {
log.debug("Topic {} success read entry: ledgerId: {}, entryId: {}, size: {},"
+ " ConsumerManager original offset: {}, lastEntryPosition: {}, "
+ "nextOffset: {}",
fullPartitionName, lastEntry.getLedgerId(), lastEntry.getEntryId(),
lastEntry.getLength(), originalOffset, currentPosition,
cursorOffset.get());
}
} catch (MetadataCorruptedException e) {
log.error("[{}] Failed to peekOffsetFromEntry from position {}: {}",
fullPartitionName, currentPosition, e.getMessage());
readFuture.completeExceptionally(e);
return;
}
}
readFuture.complete(entries);
}
@Override
public void readEntriesFailed(ManagedLedgerException exception, Object ctx) {
log.error("Error read entry for topic: {}", fullPartitionName, exception);
if (exception instanceof ManagedLedgerException.ManagedLedgerFencedException) {
invalidateCacheOnTopic.accept(fullPartitionName);
}
readFuture.completeExceptionally(exception);
}
};
if (((NonDurableCursorImpl) cursor).isReadCompacted()) {
CompactedTopicUtils.asyncReadCompactedEntries(tcm.getTopic().getTopicCompactionService(), cursor,
maxReadEntriesNum, adjustedMaxBytes, PositionImpl.LATEST, false, readEntriesCallback,
false, null);
} else {
cursor.asyncReadEntries(maxReadEntriesNum, adjustedMaxBytes, readEntriesCallback, null,
PositionImpl.LATEST);
}
return readFuture;
}
// commit the offset so that the backlog is not affected by this cursor.
private static void commitOffset(NonDurableCursorImpl cursor, PositionImpl currentPosition) {
cursor.asyncMarkDelete(currentPosition, new AsyncCallbacks.MarkDeleteCallback() {
@Override
public void markDeleteComplete(Object ctx) {
if (log.isDebugEnabled()) {
log.debug("Mark delete success for position: {}", currentPosition);
}
}
// This is OK since the mark-delete works like a cumulative ack; a following commit will come.
@Override
public void markDeleteFailed(ManagedLedgerException e, Object ctx) {
log.warn("Mark delete failed for position: {} with error:",
currentPosition, e);
}
}, null);
}
@VisibleForTesting
void publishMessages(final CompletableFuture<Long> appendFuture, final LogAppendInfo appendInfo,
final EncodeResult encodeResult, final AppendRecordsContext appendRecordsContext) {
final ByteBuf byteBuf = encodeResult.getEncodedByteBuf();
final int byteBufSize = byteBuf.readableBytes();
final long beforePublish = time.nanoseconds();
final AnalyzeResult analyzeResult;
final var firstOffset = MessageMetadataUtils.getLogEndOffset(persistentTopic.getManagedLedger());
final var lastOffset = firstOffset + appendInfo.numMessages - 1;
try {
analyzeResult = analyzeAndValidateProducerState(firstOffset, lastOffset, encodeResult.getRecords(),
AppendOrigin.Client);
} catch (Throwable throwable) {
requestStats.getMessagePublishStats().registerFailedEvent(
MathUtils.elapsedNanos(beforePublish), TimeUnit.NANOSECONDS);
appendFuture.completeExceptionally(throwable);
return;
}
pendingPublishOps.increment();
publishMessage(persistentTopic, byteBuf, appendInfo)
.whenComplete((result, e) -> {
appendRecordsContext.getCompleteSendOperationForThrottling().accept(byteBufSize);
try {
if (e == null) {
analyzeResult.updatedProducers().values().stream().map(ProducerAppendInfo::startedTransactions)
.flatMap(Collection::stream).forEach(txn -> txn.firstPosition = result.position);
analyzeResult.updatedProducers().values().forEach(producerStateManager::update);
analyzeResult.completedTxns().forEach(completedTxn -> {
final var lastStableOffset = producerStateManager.lastStableOffset(completedTxn);
producerStateManager.updateTxnIndex(completedTxn, lastStableOffset);
producerStateManager.completeTxn(completedTxn);
});
producerStateManager.updateMapEndOffset(result.offset);
producerStateManager.updateMaxReadPosition(result.position);
// Only trigger the snapshot persistence when the records are persisted
producerStateManager.maybeTakeSnapshot(recoveryExecutor);
producerStateManager.maybePurgeAbortedTx();
requestStats.getMessagePublishStats().registerSuccessfulEvent(
MathUtils.elapsedNanos(beforePublish), TimeUnit.NANOSECONDS);
appendFuture.complete(result.offset);
} else {
// `producerStateManager.producers` might have already been updated. It's okay because even if
// the record failed to be persisted, it's still an exceptional case to see a new record whose
// producer epoch is smaller.
log.error("publishMessages for topic partition: {} failed when write.", fullPartitionName, e);
requestStats.getMessagePublishStats().registerFailedEvent(
MathUtils.elapsedNanos(beforePublish), TimeUnit.NANOSECONDS);
appendFuture.completeExceptionally(e);
}
} catch (Throwable throwable) {
log.error("[{}] Failed to handle the publish for offset {}", fullPartitionName,
appendInfo.firstOffset.orElse(-1L), throwable);
} finally {
pendingPublishOps.decrement();
encodeResult.recycle();
}
});
if (appendFuture.isCompletedExceptionally()) {
// The topic is fenced due to ownership transfer or message duplication happens
return;
}
// Update the `producers` field for producer epoch validation in next `analyzeAndValidateProducerState` call
analyzeResult.updatedProducers().values().forEach(producerStateManager::updateProducers);
}
public record PublishResult(long offset, PositionImpl position) {
@Override
public boolean equals(Object obj) {
if (!(obj instanceof PublishResult other)) {
return false;
}
return offset == other.offset && position.equals(other.position);
}
}
/**
* Publish the message to BookKeeper.
* When the message is a control message, message deduplication is skipped.
*
* @param persistentTopic The persistentTopic, used to publish the message and check message deduplication.
* @param byteBuf Message byteBuf
* @param appendInfo Pre-analyzed record info, from which we can get the sequence, message count, etc.
* @return a future holding the publish result (offset and position)
*/
private CompletableFuture<PublishResult> publishMessage(final PersistentTopic persistentTopic,
final ByteBuf byteBuf,
final LogAppendInfo appendInfo) {
final CompletableFuture<PublishResult> publishFuture = new CompletableFuture<>();
persistentTopic.publishMessage(byteBuf,
MessagePublishContext.get(
publishFuture,
persistentTopic,
appendInfo.producerName(),
appendInfo.producerId().isPresent() && !appendInfo.isControlBatch(),
appendInfo.firstSequence(),
appendInfo.lastSequence(),
appendInfo.numMessages(),
time.nanoseconds()));
return publishFuture;
}
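/**
* Validate the record batches of a produce request (batch size, CRC, base offset) and collect the
* information needed for the append: message count, producer id/epoch, sequence range, compression
* codecs and whether the batches are transactional or control batches.
*/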
@VisibleForTesting
public LogAppendInfo analyzeAndValidateRecords(MemoryRecords records) {
int numMessages = 0;
int shallowMessageCount = 0;
Optional<Long> firstOffset = Optional.empty();
boolean readFirstMessage = false;
boolean isTransaction = false;
boolean isControlBatch = false;
int validBytesCount = 0;
int firstSequence = Integer.MAX_VALUE;
int lastSequence = -1;
Optional<Long> producerId = Optional.empty();
short producerEpoch = -1;
KopLogValidator.CompressionCodec sourceCodec = DEFAULT_COMPRESSION;
for (RecordBatch batch : records.batches()) {
if (batch.magic() >= RecordBatch.MAGIC_VALUE_V2 && batch.baseOffset() != 0) {
throw new InvalidRecordException("The baseOffset of the record batch in the append to "
+ fullPartitionName + " should be 0, but it is " + batch.baseOffset());
}
if (!readFirstMessage) {
if (batch.magic() >= RecordBatch.MAGIC_VALUE_V2) {
firstOffset = Optional.of(batch.baseOffset());
}
readFirstMessage = true;
}
int batchSize = batch.sizeInBytes();
if (batchSize > kafkaConfig.getMaxMessageSize()) {
// Kafka throws RecordTooLargeException here and Kafka clients will try to split the batch and
// send again until it succeeds. However, there is no way to let Kafka clients know the max message size
// so the client might never split the large batch.
// To avoid Kafka clients resending the same large batch infinitely, here we return
// RecordBatchTooLargeException so that the Kafka clients will fail immediately.
throw new RecordBatchTooLargeException(String.format("Message batch size is %s "
+ "in append to partition %s which exceeds the maximum configured size of %s .",
batchSize, fullPartitionName, kafkaConfig.getMaxMessageSize()));
}
batch.ensureValid();
shallowMessageCount += 1;
validBytesCount += batchSize;
int numMessagesInBatch = (int) (batch.lastOffset() - batch.baseOffset() + 1);
if (numMessagesInBatch <= 1) {
// The lastOffset field might not be set. We need to iterate the records.
for (Record record : batch) {
numMessages++;
}
} else {
numMessages += numMessagesInBatch;
}
isTransaction = batch.isTransactional();
isControlBatch = batch.isControlBatch();
// We assume batches producerId are same.
if (batch.hasProducerId()) {
producerId = Optional.of(batch.producerId());
producerEpoch = batch.producerEpoch();
}
if (batch.compressionType().id != CompressionType.NONE.id) {
CompressionType compressionType = CompressionType.forId(batch.compressionType().id);
sourceCodec = new KopLogValidator.CompressionCodec(
compressionType.name, compressionType.id);
}
if (firstSequence > batch.baseSequence()) {
firstSequence = batch.baseSequence();
}
if (lastSequence < batch.lastSequence()) {
lastSequence = batch.lastSequence();
}
}
if (validBytesCount < 0) {
throw new CorruptRecordException("Cannot append record batch with illegal length "
+ validBytesCount + " to log for " + fullPartitionName
+ ". A possible cause is corrupted produce request.");
}
KopLogValidator.CompressionCodec targetCodec =
KopLogValidator.getTargetCodec(sourceCodec, kafkaConfig.getKafkaCompressionType());
// This producerName is only used for the message deduplication check.
// Kafka reuses the same PID when the transactionId is the same but increases the producerEpoch,
// so the epoch is included to ensure the producerName differs across epochs.
String producerName = new StringJoiner("-")
.add(PID_PREFIX)
.add(String.valueOf(producerId.orElse(-1L)))
.add(String.valueOf(producerEpoch)).toString();
return new LogAppendInfo(firstOffset, producerId, producerName, producerEpoch, numMessages, shallowMessageCount,
isTransaction, isControlBatch, validBytesCount, firstSequence, lastSequence, sourceCodec, targetCodec,
false);
}
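// Drop any trailing bytes that were not accounted for by analyzeAndValidateRecords, mirroring
// Kafka's Log.trimInvalidBytes.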
private MemoryRecords trimInvalidBytes(MemoryRecords records, LogAppendInfo info) {
int validBytes = info.validBytes();
if (validBytes < 0) {
throw new CorruptRecordException(String.format("Cannot append record batch with illegal length %s to "
+ "log for %s. A possible cause is a corrupted produce request.", validBytes, fullPartitionName));
} else if (validBytes == records.sizeInBytes()) {
return records;
} else {
ByteBuffer validByteBuffer = records.buffer().duplicate();
validByteBuffer.limit(validBytes);
return MemoryRecords.readableRecords(validByteBuffer);
}
}
/**
* Remove all the AbortedTxn entries that are no longer referenced by existing data on the topic.
*
* @return a future that completes once the aborted-transactions purge offset has been updated
*/
public CompletableFuture<?> updatePurgeAbortedTxnsOffset() {
if (!kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
// no need to scan the topic, because transactions are disabled
return CompletableFuture.completedFuture(null);
}
if (!producerStateManager.hasSomeAbortedTransactions()) {
// nothing to do
return CompletableFuture.completedFuture(null);
}
if (unloaded) {
// nothing to do
return CompletableFuture.completedFuture(null);
}
return fetchOldestAvailableIndexFromTopic()
.thenAccept(offset ->
producerStateManager.updateAbortedTxnsPurgeOffset(offset));
}
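/**
* Find the oldest Kafka offset that is still available on this topic, by reading the first entry
* with broker entry metadata and taking the last compacted offset into account when topic compaction
* has run.
*/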
@VisibleForTesting
public CompletableFuture<Long> fetchOldestAvailableIndexFromTopic() {
if (unloaded) {
return FutureUtil.failedFuture(new NotLeaderOrFollowerException());
}
final CompletableFuture<Long> future = new CompletableFuture<>();
// The future that is returned by getTopicConsumerManager is always completed normally
KafkaTopicConsumerManager tcm = new KafkaTopicConsumerManager("purge-aborted-tx",
true, persistentTopic, true);
future.whenComplete((___, error) -> {
// release resources in any case
try {
tcm.close();
} catch (Exception err) {
log.error("Cannot safely close the temporary KafkaTopicConsumerManager for {}",
fullPartitionName, err);
}
});
ManagedLedgerImpl managedLedger = (ManagedLedgerImpl) persistentTopic.getManagedLedger();
long numberOfEntries = managedLedger.getNumberOfEntries();
if (numberOfEntries == 0) {
long currentOffset = MessageMetadataUtils.getCurrentOffset(managedLedger);
log.info("First offset for topic {} is {} as the topic is empty (numberOfEntries=0)",
fullPartitionName, currentOffset);
future.complete(currentOffset);
return future;
}
// this is a DUMMY entry with -1
PositionImpl firstPosition = managedLedger.getFirstPosition();
// look for the first entry with data
PositionImpl nextValidPosition = managedLedger.getNextValidPosition(firstPosition);
fetchOldestAvailableIndexFromTopicReadNext(future, managedLedger, nextValidPosition);
return future.thenCompose(offset -> {
if (persistentTopic.getSubscriptions().containsKey(Compactor.COMPACTION_SUBSCRIPTION)) {
return getLastCompactedOffset(persistentTopic.getTopicCompactionService()).thenApply(
lastCompactedOffset -> {
return Math.max(lastCompactedOffset + 1, offset);
});
} else {
return CompletableFuture.completedFuture(offset);
}
});
}
private CompletableFuture<Long> getLastCompactedOffset(TopicCompactionService topicCompactionService) {
return topicCompactionService.readLastCompactedEntry().thenApply(entry -> {
if (entry == null) {
return -1L;
}
try {
return MessageMetadataUtils.peekOffsetFromEntry(entry);
} catch (MetadataCorruptedException e) {
if (e instanceof MetadataCorruptedException.NoBrokerEntryMetadata) {
return -1L;
} else {
throw new RuntimeException(e);
}
} finally {
entry.release();
}
});
}
private void fetchOldestAvailableIndexFromTopicReadNext(CompletableFuture<Long> future,
ManagedLedgerImpl managedLedger, PositionImpl position) {
managedLedger.asyncReadEntry(position, new AsyncCallbacks.ReadEntryCallback() {
@Override
public void readEntryComplete(Entry entry, Object ctx) {
try {
long startOffset = MessageMetadataUtils.peekBaseOffsetFromEntry(entry);
log.info("First offset for topic {} is {} - position {}", fullPartitionName,
startOffset, entry.getPosition());
future.complete(startOffset);
} catch (MetadataCorruptedException.NoBrokerEntryMetadata noBrokerEntryMetadata) {
long currentOffset = MessageMetadataUtils.getCurrentOffset(managedLedger);
log.info("Legacy entry for topic {} - position {} - returning current offset {}",
fullPartitionName,
entry.getPosition(),
currentOffset);
future.complete(currentOffset);
} catch (Exception err) {
future.completeExceptionally(err);
} finally {
entry.release();
}
}
@Override
public void readEntryFailed(ManagedLedgerException exception, Object ctx) {
future.completeExceptionally(exception);
}
}, null);
}
@VisibleForTesting
public CompletableFuture<?> takeProducerSnapshot() {
return initFuture.thenCompose((___) -> {
// snapshot can be taken only on the same thread that is used for writes
ManagedLedgerImpl ml = (ManagedLedgerImpl) getPersistentTopic().getManagedLedger();
Executor executorService = ml.getExecutor();
return this
.getProducerStateManager()
.takeSnapshot(executorService);
});
}
@VisibleForTesting
public CompletableFuture<?> forcePurgeAbortTx() {
return initFuture.thenCompose((___) -> {
// purge can be taken only on the same thread that is used for writes
ManagedLedgerImpl ml = (ManagedLedgerImpl) getPersistentTopic().getManagedLedger();
ExecutorService executorService = ml.getScheduledExecutor().chooseThread(ml.getName());
return updatePurgeAbortedTxnsOffset()
.thenApplyAsync((____) -> {
return getProducerStateManager().executePurgeAbortedTx();
}, executorService);
});
}
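/**
* Rebuild the producer state by replaying the topic entries from the given offset (bounded by the
* oldest available offset). Returns the number of records that were replayed.
*/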
CompletableFuture<Long> recoverTxEntries(long offset, Executor executor) {
if (!kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
// no need to scan the topic, because transactions are disabled
return CompletableFuture.completedFuture(0L);
}
if (!isBrokerIndexMetadataInterceptorConfigured(persistentTopic.getBrokerService())) {
// The `UpgradeTest` will set the interceptor to null,
// which would cause an NPE in `fetchOldestAvailableIndexFromTopic`,
// but we can't disable Kafka transactions because the transaction coordinator
// currently must be enabled (required by newer Kafka clients).
// TODO: Actually, if the AppendIndexMetadataInterceptor is not set, Kafka transactions can't work,
// so we should throw an exception; maybe we need to add a new configuration for the ProducerId.
log.error("The broker index metadata interceptor is not configured for topic {}, skip recovering txn entries.",
fullPartitionName);
return CompletableFuture.completedFuture(0L);
}
final EventExecutor statsExecutor = getPersistentTopic().getBrokerService().executor().next();
return fetchOldestAvailableIndexFromTopic().thenCompose((minOffset -> {
log.info("start recoverTxEntries for {} at offset {} minOffset {}",
fullPartitionName, offset, minOffset);
final CompletableFuture<Long> future = new CompletableFuture<>();
// The future that is returned by getTopicConsumerManager is always completed normally
KafkaTopicConsumerManager tcm = new KafkaTopicConsumerManager("recover-tx",
true, persistentTopic, true);
future.whenComplete((___, error) -> {
// release resources in any case
try {
tcm.close();
} catch (Exception err) {
log.error("Cannot safely close the temporary KafkaTopicConsumerManager for {}",
fullPartitionName, err);
}
});
final long offsetToStart;
if (checkOffsetOutOfRange(tcm, offset, -1, statsExecutor)) {
offsetToStart = 0;
log.info("recoverTxEntries for {}: offset {} is out-of-range, "
+ "maybe the topic has been deleted/recreated, "
+ "starting recovery from {}",
fullPartitionName, offset, offsetToStart);
} else {
offsetToStart = Math.max(offset, minOffset);
}
producerStateManager.handleMissingDataBeforeRecovery(minOffset, offset);
if (log.isDebugEnabled()) {
log.debug("recoverTxEntries for {}: remove tcm to get cursor for fetch offset: {} .",
fullPartitionName, offsetToStart);
}
final CompletableFuture<Pair<ManagedCursor, Long>> cursorFuture = tcm.removeCursorFuture(offsetToStart);
if (cursorFuture == null) {
// tcm is closed: the topic is no longer owned here, so fail the recovery
log.warn("KafkaTopicConsumerManager is closed, remove TCM of {}", fullPartitionName);
future.completeExceptionally(new NotLeaderOrFollowerException());
return future;
}
cursorFuture.thenAccept((cursorLongPair) -> {
if (cursorLongPair == null) {
log.warn("KafkaTopicConsumerManager.remove({}) return null for topic {}. "
+ "Fetch for topic return error.", offsetToStart, fullPartitionName);
future.completeExceptionally(new NotLeaderOrFollowerException());
return;
}
final ManagedCursor cursor = cursorLongPair.getLeft();
final AtomicLong cursorOffset = new AtomicLong(cursorLongPair.getRight());
AtomicLong entryCounter = new AtomicLong();
readNextEntriesForRecovery(cursor, cursorOffset, tcm, entryCounter,
future, executor, statsExecutor);
}).exceptionally(ex -> {
future.completeExceptionally(new NotLeaderOrFollowerException());
return null;
});
return future;
}));
}
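// Read the next chunk of entries (up to 200) for producer-state recovery, feed them through
// updateProducerState and recurse until there is nothing left to read.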
private void readNextEntriesForRecovery(ManagedCursor cursor, AtomicLong cursorOffset,
KafkaTopicConsumerManager tcm,
AtomicLong entryCounter,
CompletableFuture<Long> future, Executor executor,
EventExecutor statsExecutor) {
if (log.isDebugEnabled()) {
log.debug("readNextEntriesForRecovery {} cursorOffset {}", fullPartitionName, cursorOffset);
}
int maxReadEntriesNum = 200;
long adjustedMaxBytes = Long.MAX_VALUE;
readEntries(tcm, cursor, cursorOffset, maxReadEntriesNum, adjustedMaxBytes, (partitionName) -> {})
.whenCompleteAsync((entries, throwable) -> {
if (throwable != null) {
log.error("Read entry error on {}", fullPartitionName, throwable);
tcm.deleteOneCursorAsync(cursor,
"cursor.readEntry fail. deleteCursor");
if (throwable instanceof ManagedLedgerException.CursorAlreadyClosedException
|| throwable instanceof ManagedLedgerException.ManagedLedgerFencedException) {
future.completeExceptionally(new NotLeaderOrFollowerException());
return;
}
future.completeExceptionally(new UnknownServerException(throwable));
return;
}
// Add new offset back to TCM after entries are read successfully
tcm.add(cursorOffset.get(), Pair.of(cursor, cursorOffset.get()));
if (entries.isEmpty()) {
if (log.isDebugEnabled()) {
log.debug("No more entries to recover for {}", fullPartitionName);
}
future.completeAsync(() -> entryCounter.get(), executor);
return;
}
CompletableFuture<DecodeResult> decodedEntries = new CompletableFuture<>();
decodeEntriesForRecovery(decodedEntries, entries, statsExecutor);
decodedEntries.thenAccept((decodeResult) -> {
try {
final var positions = decodeResult.getPositions();
final var index = new MutableInt(0);
decodeResult.getRecords().batches().forEach(batch -> {
final var i = index.getAndIncrement();
if (i < positions.size()) {
entryCounter.addAndGet(batch.lastOffset() - batch.baseOffset() + 1);
updateProducerState(batch, (PositionImpl) positions.get(i));
} else {
// It should never happen
log.error("[{}] Position index {} out of range for batch {}", fullPartitionName,
i, batch.baseOffset());
}
});
producerStateManager.updateMaxReadPosition((PositionImpl)
positions.get(positions.size() - 1));
} catch (Throwable e) {
log.error("Failed to handle the decode result {}", decodeResult.getPositions(), e);
} finally {
decodeResult.recycle();
}
readNextEntriesForRecovery(cursor, cursorOffset, tcm, entryCounter, future, executor,
statsExecutor);
}).exceptionally(error -> {
log.error("Bad error while recovering {}", fullPartitionName, error);
future.completeExceptionally(error);
return null;
});
}, executor);
}
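// Replay a single record batch against the producer state manager: register the producer append,
// complete any transaction that ends in this batch and advance the map end offset.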
private void updateProducerState(RecordBatch batch, PositionImpl position) {
if (batch.hasProducerId()) {
final var producerId = batch.producerId();
// TODO: the origin should be REPLICATION but it's never used in this project
ProducerAppendInfo appendInfo = producerStateManager.prepareUpdate(producerId, AppendOrigin.Client);
// The base offset and last offset fields are set when taking the snapshot
final var maybeCompletedTxn = appendInfo.append(batch, batch.baseOffset(), batch.lastOffset(), position);
producerStateManager.update(appendInfo);
if (maybeCompletedTxn.isPresent()) {
CompletedTxn completedTxn = maybeCompletedTxn.get();
long lastStableOffset = producerStateManager.lastStableOffset(completedTxn);
producerStateManager.updateTxnIndex(completedTxn, lastStableOffset);
producerStateManager.completeTxn(completedTxn);
}
}
producerStateManager.updateMapEndOffset(batch.lastOffset() + 1);
}
private void decodeEntriesForRecovery(final CompletableFuture<DecodeResult> future,
final List<Entry> entries,
final EventExecutor statsExecutor) {
if (log.isDebugEnabled()) {
log.debug("Read {} entries", entries.size());
}
final byte magic = RecordBatch.CURRENT_MAGIC_VALUE;
final long startDecodingEntriesNanos = MathUtils.nowInNano();
try {
DecodeResult decodeResult = entryFormatter.decode(entries, magic);
long fetchDecodeLatencyNanos = MathUtils.elapsedNanos(startDecodingEntriesNanos);
statsExecutor.execute(() -> {
requestStats.getFetchDecodeStats().registerSuccessfulEvent(
fetchDecodeLatencyNanos, TimeUnit.NANOSECONDS);
});
future.complete(decodeResult);
} catch (Exception error) {
future.completeExceptionally(error);
}
}
@VisibleForTesting
public boolean isUnloaded() {
return unloaded;
}
public @Nullable PersistentTopic getPersistentTopic() {
return persistentTopic;
}
@Override
public String toString() {
return "PartitionLog(" + fullPartitionName + ")";
}
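/**
* Build the DescribeProducers response for this partition from the in-memory producer state.
*/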
public DescribeProducersResponseData.PartitionResponse activeProducerState() {
var producerState = new DescribeProducersResponseData.PartitionResponse()
.setPartitionIndex(statsTopicPartition.partition())
.setErrorCode(Errors.NONE.code())
.setActiveProducers(new ArrayList<>());
// this utility is only for monitoring, it is fine to access this structure directly from any thread
var producers = producerStateManager.getProducers();
producers.values().forEach(producerStateEntry -> {
producerState.activeProducers().add(new DescribeProducersResponseData.ProducerState()
.setProducerId(producerStateEntry.producerId)
.setLastSequence(-1) // NOT HANDLED YET
.setProducerEpoch(producerStateEntry.producerEpoch != null
? producerStateEntry.producerEpoch.intValue() : -1)
.setLastTimestamp(producerStateEntry.lastTimestamp != null
? producerStateEntry.lastTimestamp : -1)
.setCoordinatorEpoch(producerStateEntry.coordinatorEpoch)
.setCurrentTxnStartOffset(producerStateEntry.currentTxnFirstOffset.orElse(-1L)));
});
return producerState;
}
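/**
* Mark this partition log as unloaded and, if it was successfully initialised, take a final producer
* state snapshot before the topic is closed.
*/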
public CompletableFuture<Void> close() {
if (this.unloaded) {
return CompletableFuture.completedFuture(null);
}
this.unloaded = true;
// Take a snapshot before closing the topic
if (this.initFuture.isDone() && !this.initFuture.isCompletedExceptionally()) {
if (recoveryExecutor.isShutdown()) {
log.warn("[{}] Failed to take snapshot after shutdown", fullPartitionName);
return CompletableFuture.completedFuture(null);
}
return this.producerStateManager.takeSnapshot(recoveryExecutor).thenApply(__ -> (Void) null)
.exceptionally(e -> {
if (e != null) {
log.warn("[{}] Failed to take snapshot: {}", fullPartitionName, e.getMessage());
}
return null;
});
} else {
return CompletableFuture.completedFuture(null);
}
}
}