io.streamnative.pulsar.handlers.kop.storage.PartitionLog

/**
 * Copyright (c) 2019 - 2024 StreamNative, Inc.. All Rights Reserved.
 */
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamnative.pulsar.handlers.kop.storage;

import static com.google.common.base.Preconditions.checkArgument;
import static io.streamnative.pulsar.handlers.kop.AdminManager.KOP_KAFKA_PROPERTY_PREFIX;
import static io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils.isBrokerIndexMetadataInterceptorConfigured;
import static org.apache.kafka.common.config.TopicConfig.CLEANUP_POLICY_COMPACT;
import static org.apache.kafka.common.config.TopicConfig.CLEANUP_POLICY_CONFIG;
import static org.apache.kafka.common.internals.Topic.GROUP_METADATA_TOPIC_NAME;

import com.google.common.annotations.VisibleForTesting;
import io.netty.buffer.ByteBuf;
import io.netty.util.Recycler;
import io.netty.util.concurrent.EventExecutor;
import io.streamnative.pulsar.handlers.kop.KafkaServiceConfiguration;
import io.streamnative.pulsar.handlers.kop.KafkaTopicConsumerManager;
import io.streamnative.pulsar.handlers.kop.KafkaTopicLookupService;
import io.streamnative.pulsar.handlers.kop.KafkaTopicManager;
import io.streamnative.pulsar.handlers.kop.MessageFetchContext;
import io.streamnative.pulsar.handlers.kop.MessagePublishContext;
import io.streamnative.pulsar.handlers.kop.PendingTopicFutures;
import io.streamnative.pulsar.handlers.kop.RequestStats;
import io.streamnative.pulsar.handlers.kop.exceptions.MetadataCorruptedException;
import io.streamnative.pulsar.handlers.kop.format.DecodeResult;
import io.streamnative.pulsar.handlers.kop.format.EncodeRequest;
import io.streamnative.pulsar.handlers.kop.format.EncodeResult;
import io.streamnative.pulsar.handlers.kop.format.EntryFormatter;
import io.streamnative.pulsar.handlers.kop.format.EntryFormatterFactory;
import io.streamnative.pulsar.handlers.kop.format.KafkaMixedEntryFormatter;
import io.streamnative.pulsar.handlers.kop.format.PulsarEntryFormatter;
import io.streamnative.pulsar.handlers.kop.topic.KopPersistentTopic;
import io.streamnative.pulsar.handlers.kop.utils.KopLogValidator;
import io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils;
import io.streamnative.pulsar.handlers.kop.utils.MetadataUtils;
import io.streamnative.pulsar.handlers.kop.utils.TopicNameUtils;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.StringJoiner;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Consumer;
import javax.annotation.Nullable;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.Getter;
import lombok.ToString;
import lombok.experimental.Accessors;
import lombok.extern.slf4j.Slf4j;
import org.apache.bookkeeper.common.util.MathUtils;
import org.apache.bookkeeper.common.util.OrderedExecutor;
import org.apache.bookkeeper.mledger.AsyncCallbacks;
import org.apache.bookkeeper.mledger.Entry;
import org.apache.bookkeeper.mledger.ManagedCursor;
import org.apache.bookkeeper.mledger.ManagedLedger;
import org.apache.bookkeeper.mledger.ManagedLedgerException;
import org.apache.bookkeeper.mledger.Position;
import org.apache.bookkeeper.mledger.impl.ManagedLedgerImpl;
import org.apache.bookkeeper.mledger.impl.NonDurableCursorImpl;
import org.apache.bookkeeper.mledger.impl.PositionImpl;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.kafka.common.InvalidRecordException;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.CorruptRecordException;
import org.apache.kafka.common.errors.KafkaStorageException;
import org.apache.kafka.common.errors.NotLeaderOrFollowerException;
import org.apache.kafka.common.errors.RecordBatchTooLargeException;
import org.apache.kafka.common.errors.UnknownServerException;
import org.apache.kafka.common.message.DescribeProducersResponseData;
import org.apache.kafka.common.message.FetchRequestData;
import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.protocol.Errors;
import org.apache.kafka.common.record.CompressionType;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.record.Record;
import org.apache.kafka.common.record.RecordBatch;
import org.apache.kafka.common.requests.FetchResponse;
import org.apache.kafka.common.utils.Time;
import org.apache.pulsar.broker.service.persistent.PersistentTopic;
import org.apache.pulsar.broker.service.plugin.EntryFilter;
import org.apache.pulsar.common.naming.TopicName;
import org.apache.pulsar.common.util.FutureUtil;
import org.apache.pulsar.compaction.CompactedTopicUtils;
import org.apache.pulsar.compaction.Compactor;
import org.apache.pulsar.compaction.TopicCompactionService;

/**
 * Analyze result.
 */
record AnalyzeResult(Map<Long, ProducerAppendInfo> updatedProducers, List<CompletedTxn> completedTxns) {

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof AnalyzeResult that) {
            return Objects.equals(this.updatedProducers, that.updatedProducers)
                && Objects.equals(this.completedTxns, that.completedTxns);
        }
        return false;
    }
}

/**
 * An append-only log for storing messages. Mapping to Kafka Log.scala.
 */
@Slf4j
public class PartitionLog {

    public static final String KAFKA_TOPIC_UUID_PROPERTY_NAME = "kafkaTopicUUID";
    public static final String KAFKA_ENTRY_FORMATTER_PROPERTY_NAME = "kafkaEntryFormat";
    private static final String PID_PREFIX = "KOP-PID-PREFIX";

    private static final KopLogValidator.CompressionCodec DEFAULT_COMPRESSION =
            new KopLogValidator.CompressionCodec(CompressionType.NONE.name, CompressionType.NONE.id);

    private final KafkaServiceConfiguration kafkaConfig;
    private final RequestStats requestStats;
    private final Time time;
    private final String fullPartitionName;
    // We can use fullPartitionName as the key of StatsLogger in future to avoid using TopicPartition internally
    private final TopicPartition statsTopicPartition;
    @Getter
    @VisibleForTesting
    volatile ProducerStateManager producerStateManager;

    private final List<EntryFilter> entryFilters;
    private final KafkaTopicLookupService kafkaTopicLookupService;

    private final ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer;

    private final ExecutorService recoveryExecutor;

    @VisibleForTesting
    volatile PersistentTopic persistentTopic;

    private final CompletableFuture<PartitionLog> initFuture = new CompletableFuture<>();

    private volatile Map<String, String> topicProperties;

    private final boolean isOffsetTopic;
    private volatile EntryFormatter entryFormatter;

    private volatile String kafkaTopicUUID;

    private volatile boolean unloaded = false;

    @Getter
    private final LongAdder pendingPublishOps = new LongAdder();

    public PartitionLog(KafkaServiceConfiguration kafkaConfig,
                        RequestStats requestStats,
                        Time time,
                        String fullPartitionName,
                        List<EntryFilter> entryFilters,
                        KafkaTopicLookupService kafkaTopicLookupService,
                        ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer,
                        OrderedExecutor recoveryExecutor) {
        this.kafkaConfig = kafkaConfig;
        this.entryFilters = entryFilters;
        this.requestStats = requestStats;
        this.time = time;
        this.fullPartitionName = fullPartitionName;
        final var prefix = "persistent://" + kafkaConfig.getDefaultNamespacePrefix();
        if (fullPartitionName.startsWith(prefix)) {
            this.statsTopicPartition = TopicNameUtils.splitTopicPartition(fullPartitionName.substring(prefix.length()));
        } else {
            this.statsTopicPartition = TopicNameUtils.splitTopicPartition(fullPartitionName);
        }
        final int index = fullPartitionName.indexOf(TopicName.PARTITIONED_TOPIC_SUFFIX);
        this.isOffsetTopic = (index >= 0 && fullPartitionName.substring(0, index).endsWith(
                kafkaConfig.getKafkaMetadataNamespace() + "/" + GROUP_METADATA_TOPIC_NAME));
        this.kafkaTopicLookupService = kafkaTopicLookupService;
        this.producerStateManagerSnapshotBuffer = producerStateManagerSnapshotBuffer;
        this.recoveryExecutor = recoveryExecutor.chooseThread(fullPartitionName);
    }

    public void initialise() {
        loadTopicProperties().whenComplete((___, errorLoadTopic) -> {
            if (errorLoadTopic != null) {
                log.warn("Failed to load {}", fullPartitionName, errorLoadTopic);
                initFuture.completeExceptionally(new NotLeaderOrFollowerException());
                return;
            }
            CompletableFuture<Void> recoverFuture = CompletableFuture.completedFuture(null);
            if (kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
                recoverFuture = producerStateManager.recover(this, recoveryExecutor);
            }
            recoverFuture.thenRun(() -> {
                if (persistentTopic instanceof KopPersistentTopic kopPersistentTopic) {
                    kopPersistentTopic.updateKafkaTopicUUID(kafkaTopicUUID != null ? kafkaTopicUUID : "");
                }
            }).thenRun(() -> initFuture.complete(this))
                .exceptionally(error -> {
                    log.warn("Failed to recover {}", fullPartitionName, error);
                    initFuture.completeExceptionally(new NotLeaderOrFollowerException());
                    return null;
                });
        });
    }

    public CompletableFuture<PartitionLog> awaitInitialisation() {
        return initFuture;
    }

    public boolean isInitialised() {
        return initFuture.isDone() && !initFuture.isCompletedExceptionally();
    }

    public boolean isInitialisationFailed() {
        return initFuture.isDone() && initFuture.isCompletedExceptionally();
    }
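
    // Rough lifecycle sketch (illustrative, not part of the original source; variable names are
    // assumptions): callers construct the log, kick off initialisation, and wait on the returned
    // future before appending or reading.
    //
    //   PartitionLog log = new PartitionLog(kafkaConfig, requestStats, Time.SYSTEM, fullPartitionName,
    //           entryFilters, lookupService, snapshotBuffer, recoveryExecutor);
    //   log.initialise();
    //   log.awaitInitialisation().thenAccept(ready -> {
    //       // appendRecords(...) / readRecords(...) are safe to call from here on
    //   });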

    private CompletableFuture<Void> loadTopicProperties() {
        CompletableFuture<Optional<PersistentTopic>> persistentTopicFuture =
                kafkaTopicLookupService.getTopic(fullPartitionName, this);
        return persistentTopicFuture
                .thenCompose(this::fetchTopicProperties)
                .thenAccept(properties -> {
                    this.topicProperties = properties;
                    log.info("Topic properties for {} are {}", fullPartitionName, properties);
                    if (isOffsetTopic) {
                        this.entryFormatter = PulsarEntryFormatter.offsetControlRecordFormatter();
                    } else {
                        this.entryFormatter = buildEntryFormatter(topicProperties);
                    }
                    this.kafkaTopicUUID = properties.getOrDefault(KAFKA_TOPIC_UUID_PROPERTY_NAME, "");
                    if (!MetadataUtils.isSystemTopic(persistentTopic, kafkaConfig)) {
                        checkArgument(this.kafkaTopicUUID != null);
                    }
                    // TODO: load the HW from the metadata store
                    this.producerStateManager =
                        new ProducerStateManager(
                            fullPartitionName,
                            kafkaTopicUUID,
                            producerStateManagerSnapshotBuffer,
                            kafkaConfig.getKafkaTxnProducerStateTopicSnapshotIntervalSeconds(),
                            kafkaConfig.getKafkaTxnPurgeAbortedTxnIntervalSeconds(),
                            kafkaConfig.getKafkaTxnMaxDifferentMessageToSnapshotThreshold(),
                            this);
                });
    }

    private CompletableFuture<Map<String, String>> fetchTopicProperties(Optional<PersistentTopic> persistentTopic) {
        if (!persistentTopic.isPresent()) {
            log.info("Topic {} not loaded here", fullPartitionName);
            return FutureUtil.failedFuture(new NotLeaderOrFollowerException());
        }
        this.persistentTopic = persistentTopic.get();
        TopicName logicalName = TopicName.get(persistentTopic.get().getName());
        TopicName actualName;
        if (logicalName.isPartitioned()) {
            actualName = TopicName.getPartitionedTopicName(persistentTopic.get().getName());
        } else {
            actualName = logicalName;
        }
        return persistentTopic.get().getBrokerService()
                .fetchPartitionedTopicMetadataAsync(actualName, true)
                .thenApply(metadata -> {
                    if (metadata.partitions > 0) {
                        return metadata.properties;
                    } else {
                        return persistentTopic.get().getManagedLedger().getProperties();
                    }
                })
                .thenApply(map -> map != null ? map : Collections.emptyMap());
    }

    private EntryFormatter buildEntryFormatter(Map<String, String> topicProperties) {
        final String entryFormat;
        if (topicProperties != null) {
            entryFormat = topicProperties
                    .getOrDefault(KAFKA_ENTRY_FORMATTER_PROPERTY_NAME, kafkaConfig.getEntryFormat());
        } else {
            entryFormat = kafkaConfig.getEntryFormat();
        }
        if (log.isDebugEnabled()) {
            log.debug("entryFormat for {} is {} (topicProperties {})", fullPartitionName,
                    entryFormat, topicProperties);
        }
        return EntryFormatterFactory.create(kafkaConfig, entryFilters, entryFormat);
    }
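
    // Example (informational, values are assumptions): a topic property such as
    // {"kafkaEntryFormat": "pulsar"} overrides the broker-level entryFormat setting for this
    // partition; when the property is absent, kafkaConfig.getEntryFormat() is used.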

    @Data
    @Accessors(fluent = true)
    @AllArgsConstructor
    public static class LogAppendInfo {
        private Optional<Long> firstOffset;
        private Optional<Long> producerId;
        private String producerName;
        private short producerEpoch;
        private int numMessages;
        private int shallowCount;
        private boolean isTransaction;
        private boolean isControlBatch;
        private int validBytes;
        private int firstSequence;
        private int lastSequence;
        private KopLogValidator.CompressionCodec sourceCodec;
        private KopLogValidator.CompressionCodec targetCodec;
        private boolean compactedTopic;
    }

    @Data
    @ToString
    @Accessors(fluent = true)
    @AllArgsConstructor
    public static class ReadRecordsResult {

        private static final Recycler<ReadRecordsResult> RECYCLER = new Recycler<ReadRecordsResult>() {
            protected ReadRecordsResult newObject(Handle<ReadRecordsResult> handle) {
                return new ReadRecordsResult(handle);
            }
        };

        private final Recycler.Handle<ReadRecordsResult> recyclerHandle;

        private DecodeResult decodeResult;
        private List<FetchResponseData.AbortedTransaction> abortedTransactions;
        private long highWatermark;
        private long lastStableOffset;
        private Position lastPosition;
        private Errors errors;

        private PartitionLog partitionLog;

        private ReadRecordsResult(Recycler.Handle<ReadRecordsResult> recyclerHandle) {
            this.recyclerHandle = recyclerHandle;
        }

        public Errors errors() {
            return errors == null ? Errors.NONE : errors;
        }

        public static ReadRecordsResult get(DecodeResult decodeResult,
                                            List<FetchResponseData.AbortedTransaction> abortedTransactions,
                                            long highWatermark,
                                            long lastStableOffset,
                                            Position lastPosition,
                                            PartitionLog partitionLog) {
            return ReadRecordsResult.get(
                    decodeResult,
                    abortedTransactions,
                    highWatermark,
                    lastStableOffset,
                    lastPosition,
                    null,
                    partitionLog);
        }

        public static ReadRecordsResult get(DecodeResult decodeResult,
                                            List<FetchResponseData.AbortedTransaction> abortedTransactions,
                                            long highWatermark,
                                            long lastStableOffset,
                                            Position lastPosition,
                                            Errors errors,
                                            PartitionLog partitionLog) {
            ReadRecordsResult readRecordsResult = RECYCLER.get();
            readRecordsResult.decodeResult = decodeResult;
            readRecordsResult.abortedTransactions = abortedTransactions;
            readRecordsResult.highWatermark = highWatermark;
            readRecordsResult.lastStableOffset = lastStableOffset;
            readRecordsResult.lastPosition = lastPosition;
            readRecordsResult.errors = errors;
            readRecordsResult.partitionLog = partitionLog;
            return readRecordsResult;
        }

        public static ReadRecordsResult empty(long highWatermark,
                                              long lastStableOffset,
                                              Position lastPosition,
                                              PartitionLog partitionLog) {
            return ReadRecordsResult.get(
                    DecodeResult.get(MemoryRecords.EMPTY),
                    Collections.emptyList(),
                    highWatermark,
                    lastStableOffset,
                    lastPosition,
                    partitionLog);
        }

        public static ReadRecordsResult error(Errors errors, PartitionLog partitionLog) {
            return ReadRecordsResult.error(PositionImpl.EARLIEST, errors, partitionLog);
        }

        public static ReadRecordsResult error(Position position, Errors errors, PartitionLog partitionLog) {
            return ReadRecordsResult.get(null,
                    null,
                    -1,
                    -1,
                    position,
                    errors,
                    partitionLog);
        }

        public FetchResponseData.PartitionData toPartitionData() {

            // There are three cases:
            //
            // 1. errors == null:
            //        The decode result has records (count > 0).
            // 2. errors == Errors.NONE:
            //        The result is empty.
            // 3. errors == any other error:
            //        Return that error.
            if (errors != null) {
                return new FetchResponseData.PartitionData()
                        .setErrorCode(errors.code())
                        .setHighWatermark(FetchResponse.INVALID_HIGH_WATERMARK)
                        .setLastStableOffset(FetchResponse.INVALID_LAST_STABLE_OFFSET)
                        .setLogStartOffset(FetchResponse.INVALID_LOG_START_OFFSET)
                        .setRecords(MemoryRecords.EMPTY);
            }
            return new FetchResponseData.PartitionData()
                    .setErrorCode(Errors.NONE.code())
                    .setHighWatermark(highWatermark)
                    .setLastStableOffset(lastStableOffset)
                    .setLogStartOffset(highWatermark) // TODO: should it be changed to the logStartOffset?
                    .setAbortedTransactions(abortedTransactions)
                    .setRecords(decodeResult.getRecords());
        }

        public void recycle() {
            this.errors = null;
            this.lastPosition = null;
            this.lastStableOffset = -1;
            this.highWatermark = -1;
            this.abortedTransactions = null;
            this.partitionLog = null;
            if (this.decodeResult != null) {
                this.decodeResult.recycle();
                this.decodeResult = null;
            }
        }
    }
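
    // Usage sketch (illustrative, not part of the original source): a ReadRecordsResult is turned
    // into the fetch response payload and then recycled so that its pooled DecodeResult (and the
    // buffers it holds) are released, e.g.:
    //
    //   readRecordsFuture.thenAccept(result -> {
    //       FetchResponseData.PartitionData data = result.toPartitionData();
    //       // ... write the fetch response using `data` ...
    //       result.recycle();
    //   });
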
    /**
     * AppendOrigin is used to mark the origin of the appended data.
     */
    public enum AppendOrigin {
        Coordinator,
        Client,
        Log
    }

    // KSN does not modify the records from the client, so the base offset and last offset fields are both not set.
    // Hence, here we pass the first offset and last offset explicitly.
    private AnalyzeResult analyzeAndValidateProducerState(long firstOffset, long lastOffset, MemoryRecords records,
                                                          AppendOrigin origin) {
        final var updatedProducers = new HashMap<Long, ProducerAppendInfo>();
        final var completedTxns = new ArrayList<CompletedTxn>();
        records.batches().forEach(batch -> {
            if (batch.hasProducerId()) {
                updateProducers(batch, updatedProducers, firstOffset, lastOffset, origin)
                    .ifPresent(completedTxns::add);
            }
        });
        return new AnalyzeResult(updatedProducers, completedTxns);
    }

    private Optional<CompletedTxn> updateProducers(RecordBatch batch, Map<Long, ProducerAppendInfo> producers,
                                                   long firstOffset, long lastOffset,
                                                   AppendOrigin origin) {
        Long producerId = batch.producerId();
        ProducerAppendInfo appendInfo =
            producers.computeIfAbsent(producerId, pid -> producerStateManager.prepareUpdate(pid, origin));
        return appendInfo.append(batch, firstOffset, lastOffset, null);
    }

    // Note: this method must be called after initFuture completes successfully
    public Optional<Long> firstUndecidedOffset() {
        return producerStateManager.firstUndecidedOffset();
    }

    private List<FetchResponseData.AbortedTransaction> getAbortedIndexList(long fetchOffset) {
        return producerStateManager.getAbortedIndexList(fetchOffset);
    }

    /**
     * Append these records to Pulsar.
     *
     * @param records The log records to append
     * @param origin  Declares the origin of the append, which affects required validations
     * @param appendRecordsContext See {@link AppendRecordsContext}
     */
    public CompletableFuture<Long> appendRecords(final MemoryRecords records,
                                                 final AppendOrigin origin,
                                                 final AppendRecordsContext appendRecordsContext) {
        CompletableFuture<Long> appendFuture = new CompletableFuture<>();
        KafkaTopicManager topicManager = appendRecordsContext.getTopicManager();
        if (topicManager == null) {
            log.error("topicManager is null for {}", fullPartitionName,
                    new Exception("topicManager is null for " + fullPartitionName).fillInStackTrace());
            return CompletableFuture
                    .failedFuture(new KafkaStorageException("topicManager is null for " + fullPartitionName));
        }
        final long beforeRecordsProcess = time.nanoseconds();
        try {
            final LogAppendInfo appendInfo = analyzeAndValidateRecords(records);

            // return if we have no valid messages or if this is a duplicate of the last appended entry
            if (appendInfo.shallowCount() == 0) {
                appendFuture.complete(appendInfo.firstOffset().orElse(-1L));
                return appendFuture;
            }
            MemoryRecords validRecords = trimInvalidBytes(records, appendInfo);

            // Append Message into pulsar
            final long startEnqueueNanos = MathUtils.nowInNano();
            final Consumer<PartitionLog> sequentialExecutor = __ -> {
                long messageQueuedLatencyNanos = MathUtils.elapsedNanos(startEnqueueNanos);
                appendRecordsContext.getEventExecutor().execute(() -> {
                    requestStats.getMessageQueuedLatencyStats().registerSuccessfulEvent(
                            messageQueuedLatencyNanos, TimeUnit.NANOSECONDS);
                });

                if (persistentTopic.isSystemTopic()) {
                    log.error("Not support producing message to system topic: {}", persistentTopic);
                    appendFuture.completeExceptionally(Errors.INVALID_TOPIC_EXCEPTION.exception());
                    return;
                }

                final ManagedLedger managedLedger = persistentTopic.getManagedLedger();
                if (entryFormatter instanceof KafkaMixedEntryFormatter) {
                    final long logEndOffset = MessageMetadataUtils.getLogEndOffset(managedLedger);
                    appendInfo.firstOffset(Optional.of(logEndOffset));
                }

                String cleanupPolicy = PartitionLog.this.topicProperties
                        .get(KOP_KAFKA_PROPERTY_PREFIX + CLEANUP_POLICY_CONFIG);
                if (cleanupPolicy != null && cleanupPolicy.contains(CLEANUP_POLICY_COMPACT)) {
                    appendInfo.compactedTopic = true;
                }

                final EncodeRequest encodeRequest = EncodeRequest.get(validRecords, appendInfo);

                long pendingTopicLatencyNanos = time.nanoseconds() - beforeRecordsProcess;
                appendRecordsContext.getEventExecutor().execute(() -> {
                    requestStats.getPendingTopicLatencyStats().registerSuccessfulEvent(
                            pendingTopicLatencyNanos, TimeUnit.NANOSECONDS);
                });

                long beforeEncodingStarts = time.nanoseconds();
                final EncodeResult encodeResult = entryFormatter.encode(encodeRequest);
                encodeRequest.recycle();

                // Check the converted buffer again
                if (encodeResult.getEncodedByteBuf().readableBytes() > kafkaConfig.getMaxMessageSize()) {
                    appendFuture.completeExceptionally(new RecordBatchTooLargeException(String.format("Converted buffer"
                            + " size is %d in append to partition %s which exceeds the maximum configured size of %d.",
                        encodeResult.getEncodedByteBuf().readableBytes(), fullPartitionName,
                        kafkaConfig.getMaxMessageSize())));
                    encodeResult.recycle();
                    return;
                }

                long encodeLatencyNanos = time.nanoseconds() - beforeEncodingStarts;
                appendRecordsContext.getEventExecutor().execute(() -> {
                    requestStats.getProduceEncodeStats().registerSuccessfulEvent(
                            encodeLatencyNanos, TimeUnit.NANOSECONDS);
                });

                appendRecordsContext.getStartSendOperationForThrottling()
                        .accept(encodeResult.getEncodedByteBuf().readableBytes());
                appendRecordsContext
                    .getTopicManager()
                    .registerProducerInPersistentTopic(fullPartitionName, persistentTopic)
                    .ifPresent((producer) -> {
                        // collect metrics
                        encodeResult.updateProducerStats(statsTopicPartition, requestStats, producer,
                            appendRecordsContext.getEventExecutor());
                    });
                publishMessages(appendFuture, appendInfo, encodeResult, appendRecordsContext);
            };

            appendRecordsContext.getPendingTopicFuturesMap()
                    .computeIfAbsent(fullPartitionName, ignored -> new PendingTopicFutures())
                    .addListener(initFuture, sequentialExecutor, throwable -> {
                        long messageQueuedLatencyNanos = MathUtils.elapsedNanos(startEnqueueNanos);
                        appendRecordsContext.getCtx().executor().execute(() -> {
                            requestStats.getMessageQueuedLatencyStats().registerFailedEvent(
                                    messageQueuedLatencyNanos, TimeUnit.NANOSECONDS);
                        });
                        appendFuture.completeExceptionally(throwable);
                    });
        } catch (Exception exception) {
            log.error("Failed to handle produce request for {}", fullPartitionName, exception);
            appendFuture.completeExceptionally(exception);
        }

        return appendFuture;
    }
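
    // Caller-side sketch (illustrative, not part of the original source): the returned future
    // completes with an offset for the appended batch (or -1 when nothing was appended) and
    // completes exceptionally on failure, e.g.:
    //
    //   partitionLog.appendRecords(memoryRecords, AppendOrigin.Client, appendRecordsContext)
    //       .whenComplete((offset, error) -> {
    //           if (error != null) {
    //               // map `error` to a Kafka error code for the produce response
    //           } else {
    //               // use `offset` in the produce response
    //           }
    //       });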

    // Note: this method must be called after initFuture completes successfully
    public Position getLastPosition() {
        return persistentTopic.getLastPosition();
    }

    // Note: this method must be called after initFuture completes successfully
    public CompletableFuture<ReadRecordsResult> readRecords(final FetchRequestData.FetchPartition partitionData,
                                                            final boolean readCommitted,
                                                            final AtomicLong limitBytes,
                                                            final int maxReadEntriesNum,
                                                            final MessageFetchContext context) {
        final long startPrepareMetadataNanos = MathUtils.nowInNano();
        final CompletableFuture<ReadRecordsResult> future = new CompletableFuture<>();
        final long offset = partitionData.fetchOffset();
        KafkaTopicManager topicManager = context.getTopicManager();
        // The future that is returned by getTopicConsumerManager is always completed normally
        topicManager.getTopicConsumerManager(fullPartitionName).thenAccept(tcm -> {
            if (tcm == null) {
                registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
                // remove null future cache
                context.getSharedState().getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
                if (log.isDebugEnabled()) {
                    log.debug("Fetch for {}: no tcm return NOT_LEADER_FOR_PARTITION.", fullPartitionName);
                }
                future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
                return;
            }
            if (checkOffsetOutOfRange(tcm, offset, startPrepareMetadataNanos, context.getEventExecutor())) {
                future.complete(ReadRecordsResult.error(Errors.OFFSET_OUT_OF_RANGE, this));
                return;
            }

            if (log.isDebugEnabled()) {
                log.debug("Fetch for {}: remove tcm to get cursor for fetch offset: {} .", fullPartitionName, offset);
            }

            final CompletableFuture<Pair<ManagedCursor, Long>> cursorFuture = tcm.removeCursorFuture(offset);

            if (cursorFuture == null) {
                // tcm is closed, just return a NONE error because the channel may still be active
                log.warn("KafkaTopicConsumerManager is closed, remove TCM of {}", fullPartitionName);
                registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
                context.getSharedState().getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
                future.complete(ReadRecordsResult.error(Errors.NONE, this));
                return;
            }
            cursorFuture.thenAccept((cursorLongPair) -> {

                if (cursorLongPair == null) {
                    log.warn("KafkaTopicConsumerManager.remove({}) return null for topic {}. "
                            + "Fetch for topic return error.", offset, fullPartitionName);
                    registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
                    future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
                    return;
                }
                final ManagedCursor cursor = cursorLongPair.getLeft();
                final AtomicLong cursorOffset = new AtomicLong(cursorLongPair.getRight());

                final var startReadEntriesNanos = MathUtils.nowInNano();
                requestStats.getPrepareMetadataStats().registerSuccessfulEvent(
                    startReadEntriesNanos - startPrepareMetadataNanos, TimeUnit.NANOSECONDS);
                long adjustedMaxBytes = Math.min(partitionData.partitionMaxBytes(), limitBytes.get());
                if (readCommitted) {
                    long firstUndecidedOffset = producerStateManager.firstUndecidedOffset().orElse(-1L);
                    if (firstUndecidedOffset >= 0 && firstUndecidedOffset <= offset) {
                        long highWaterMark = MessageMetadataUtils.getHighWatermark(cursor.getManagedLedger());
                        future.complete(
                                ReadRecordsResult.empty(
                                        highWaterMark,
                                        firstUndecidedOffset,
                                        tcm.getManagedLedger().getLastConfirmedEntry(), this
                                )
                        );
                        return;
                    }
                }
                readEntries(tcm, cursor, cursorOffset, maxReadEntriesNum,
                        adjustedMaxBytes,
                        fullPartitionName -> {
                            topicManager.invalidateCacheForFencedManagerLedgerOnTopic(fullPartitionName);
                        })
                    .whenComplete((entries, throwable) -> {
                        final var messageReadStats = requestStats.getMessageReadStats();
                        final var startHandleEntriesNanos = MathUtils.nowInNano();
                        if (throwable != null) {
                            messageReadStats.registerFailedEvent(startHandleEntriesNanos - startReadEntriesNanos,
                                TimeUnit.NANOSECONDS);
                            tcm.deleteOneCursorAsync(cursorLongPair.getLeft(),
                                "cursor.readEntry fail. deleteCursor");
                            if (throwable instanceof ManagedLedgerException.CursorAlreadyClosedException
                                || throwable instanceof ManagedLedgerException.ManagedLedgerFencedException) {
                                future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
                                return;
                            }
                            log.error("Read entry error on {}", partitionData, throwable);
                            future.complete(ReadRecordsResult.error(Errors.UNKNOWN_SERVER_ERROR, this));
                            return;
                        }
                        messageReadStats.registerSuccessfulEvent(startHandleEntriesNanos - startReadEntriesNanos,
                            TimeUnit.NANOSECONDS);
                        long readSize = entries.stream().mapToLong(Entry::getLength).sum();
                        limitBytes.addAndGet(-1 * readSize);
                        // Add new offset back to TCM after entries are read successfully
                        tcm.add(cursorOffset.get(), Pair.of(cursor, cursorOffset.get()));
                        handleEntries(future, entries, partitionData, tcm, cursor, readCommitted, context,
                            startHandleEntriesNanos);
                    });
            }).exceptionally(ex -> {
                registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
                context.getSharedState()
                        .getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
                future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
                return null;
            });
        });

        return future;
    }

    private boolean checkOffsetOutOfRange(KafkaTopicConsumerManager tcm,
                                          long offset,
                                          long startPrepareMetadataNanos,
                                          EventExecutor eventExecutor) {
        // handle offset out-of-range exception
        ManagedLedgerImpl managedLedger = (ManagedLedgerImpl) tcm.getManagedLedger();
        long logEndOffset = MessageMetadataUtils.getLogEndOffset(managedLedger);
        // TODO: Offset out-of-range checks are still incomplete.
        // We only check the `offset > logEndOffset` case; the `offset < logStartOffset` case is
        // currently not handled, because obtaining `logStartOffset` requires reading from disk,
        // and such a time-consuming operation would likely hurt the performance of FETCH requests.
        // For more discussion, see https://github.com/streamnative/kop/pull/531
        if (offset > logEndOffset) {
            log.error("Received request for offset {} for partition {}, "
                            + "but we only have entries less than {}.",
                    offset, fullPartitionName, logEndOffset);
            if (startPrepareMetadataNanos > 0) {
                registerPrepareMetadataFailedEvent(startPrepareMetadataNanos);
            }
            return true;
        }
        return false;
    }

    private void registerPrepareMetadataFailedEvent(long startPrepareMetadataNanos) {
        long prepareMetadataNanos = MathUtils.elapsedNanos(startPrepareMetadataNanos);
        this.requestStats.getPrepareMetadataStats().registerFailedEvent(prepareMetadataNanos, TimeUnit.NANOSECONDS);
    }

    private void handleEntries(final CompletableFuture<ReadRecordsResult> future,
                               final List<Entry> entries,
                               final FetchRequestData.FetchPartition partitionData,
                               final KafkaTopicConsumerManager tcm,
                               final ManagedCursor cursor,
                               final boolean readCommitted,
                               final MessageFetchContext context,
                               final long startHandleEntriesNanos) {
        final long highWatermark = MessageMetadataUtils.getHighWatermark(cursor.getManagedLedger());
        final long lso = (readCommitted
                ? this.firstUndecidedOffset().orElse(highWatermark) : highWatermark);
        final List<Entry> committedEntries = readCommitted ? getCommittedEntries(entries, lso) : entries;

        if (log.isDebugEnabled()) {
            log.debug("Read {} entries but only {} entries are committed, lso {}, highWatermark {}",
                    entries.size(), committedEntries.size(), lso, highWatermark);
        }
        if (committedEntries.isEmpty()) {
            future.complete(ReadRecordsResult.error(tcm.getManagedLedger().getLastConfirmedEntry(), Errors.NONE,
                    this));
            return;
        }

        // use compatible magic value by apiVersion
        final byte magic = getCompatibleMagic(context.getHeader().apiVersion());

        // this part is heavyweight, and we should not execute in the ManagedLedger Ordered executor thread
        final CompletableFuture<String> groupNameFuture = kafkaConfig.isKopEnableGroupLevelConsumerMetrics()
                ? context.getCurrentConnectedGroupNameAsync() : CompletableFuture.completedFuture(null);

        groupNameFuture.whenCompleteAsync((groupName, ex) -> {
            if (ex != null) {
                log.error("Get groupId failed.", ex);
                groupName = "";
            }

            // Get the last entry position for delayed fetch.
            Position lastPosition = this.getLastPositionFromEntries(committedEntries);
            final long startDecodingEntriesNanos = MathUtils.nowInNano();
            final DecodeResult decodeResult = entryFormatter.decode(committedEntries, magic);
            long fetchDecodeLatencyNanos = MathUtils.elapsedNanos(startDecodingEntriesNanos);
            requestStats.getFetchDecodeStats().registerSuccessfulEvent(fetchDecodeLatencyNanos, TimeUnit.NANOSECONDS);
            // collect consumer metrics
            decodeResult.updateConsumerStats(statsTopicPartition, committedEntries.size(),
                    groupName, requestStats);
            List<FetchResponseData.AbortedTransaction> abortedTransactions = null;
            if (readCommitted) {
                abortedTransactions = this.getAbortedIndexList(partitionData.fetchOffset());
            }
            if (log.isDebugEnabled()) {
                log.debug("Partition {} read entry completed in {} ns",
                        fullPartitionName, MathUtils.nowInNano() - startDecodingEntriesNanos);
            }
            requestStats.getHandleEntriesStats().registerSuccessfulEvent(
                MathUtils.elapsedNanos(startHandleEntriesNanos), TimeUnit.NANOSECONDS);
            future.complete(ReadRecordsResult
                    .get(decodeResult, abortedTransactions, highWatermark, lso, lastPosition, this));
        }, context.getDecodeExecutor()).exceptionally(ex -> {
            log.error("Partition {} read entry exceptionally. ", fullPartitionName, ex);
            requestStats.getHandleEntriesStats().registerFailedEvent(
                MathUtils.elapsedNanos(startHandleEntriesNanos), TimeUnit.NANOSECONDS);
            future.complete(ReadRecordsResult.error(Errors.KAFKA_STORAGE_ERROR, this));
            return null;
        });
    }

    private static byte getCompatibleMagic(short apiVersion) {
        final byte magic;
        if (apiVersion <= 1) {
            magic = RecordBatch.MAGIC_VALUE_V0;
        } else if (apiVersion <= 3) {
            magic = RecordBatch.MAGIC_VALUE_V1;
        } else {
            magic = RecordBatch.CURRENT_MAGIC_VALUE;
        }
        return magic;
    }

    private Position getLastPositionFromEntries(List<Entry> entries) {
        if (entries == null || entries.isEmpty()) {
            return PositionImpl.EARLIEST;
        }
        return entries.get(entries.size() - 1).getPosition();
    }

    private List<Entry> getCommittedEntries(List<Entry> entries, long lso) {
        final List<Entry> committedEntries = new ArrayList<>();
        for (Entry entry : entries) {
            try {
                if (lso > MessageMetadataUtils.peekBaseOffsetFromEntry(entry)) {
                    committedEntries.add(entry);
                } else {
                    break;
                }
            } catch (MetadataCorruptedException e) {
                log.error("[{}:{}] Failed to peek base offset from entry.",
                        entry.getLedgerId(), entry.getEntryId());
            }
        }
        // Release all the entries that are not in the result
        for (int i = committedEntries.size(); i < entries.size(); i++) {
            entries.get(i).release();
        }
        return committedEntries;
    }
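
    // Example (informational): with lso = 100, entries whose base offset is below 100 are kept;
    // the first entry at or above 100 stops the scan, and every remaining entry is released.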

    /**
     * Read entries by cursor.
     *
     * @return a {@code CompletableFuture<List<Entry>>} that, when completed normally,
     *     never holds a null list of entries
     */
    private CompletableFuture<List<Entry>> readEntries(final KafkaTopicConsumerManager tcm,
                                                       final ManagedCursor cursor,
                                                       final AtomicLong cursorOffset,
                                                       final int maxReadEntriesNum,
                                                       final long adjustedMaxBytes,
                                                       final Consumer<String> invalidateCacheOnTopic) {
        final CompletableFuture<List<Entry>> readFuture = new CompletableFuture<>();
        if (adjustedMaxBytes <= 0) {
            readFuture.complete(Collections.emptyList());
            return readFuture;
        }

        final long originalOffset = cursorOffset.get();

        AsyncCallbacks.ReadEntriesCallback readEntriesCallback = new AsyncCallbacks.ReadEntriesCallback() {


            @Override
            public void readEntriesComplete(List<Entry> entries, Object ctx) {
                if (!entries.isEmpty()) {
                    final Entry lastEntry = entries.get(entries.size() - 1);
                    final PositionImpl currentPosition = PositionImpl.get(
                            lastEntry.getLedgerId(), lastEntry.getEntryId());

                    try {
                        final long lastOffset = MessageMetadataUtils.peekOffsetFromEntry(lastEntry);

                        // commit the offset so the backlog is not affected by this cursor.
                        commitOffset((NonDurableCursorImpl) cursor, currentPosition);

                        // and add the next offset back to the TCM when the read completes.
                        cursorOffset.set(lastOffset + 1);

                        if (log.isDebugEnabled()) {
                            log.debug("Topic {} success read entry: ledgerId: {}, entryId: {}, size: {},"
                                            + " ConsumerManager original offset: {}, lastEntryPosition: {}, "
                                            + "nextOffset: {}",
                                    fullPartitionName, lastEntry.getLedgerId(), lastEntry.getEntryId(),
                                    lastEntry.getLength(), originalOffset, currentPosition,
                                    cursorOffset.get());
                        }
                    } catch (MetadataCorruptedException e) {
                        log.error("[{}] Failed to peekOffsetFromEntry from position {}: {}",
                            fullPartitionName, currentPosition, e.getMessage());
                        readFuture.completeExceptionally(e);
                        return;
                    }
                }
                readFuture.complete(entries);
            }

            @Override
            public void readEntriesFailed(ManagedLedgerException exception, Object ctx) {
                log.error("Error read entry for topic: {}", fullPartitionName, exception);
                if (exception instanceof ManagedLedgerException.ManagedLedgerFencedException) {
                    invalidateCacheOnTopic.accept(fullPartitionName);
                }
                readFuture.completeExceptionally(exception);
            }
        };

        if (((NonDurableCursorImpl) cursor).isReadCompacted()) {
            CompactedTopicUtils.asyncReadCompactedEntries(tcm.getTopic().getTopicCompactionService(), cursor,
                    maxReadEntriesNum, adjustedMaxBytes, PositionImpl.LATEST, false, readEntriesCallback,
                    false, null);
        } else {
            cursor.asyncReadEntries(maxReadEntriesNum, adjustedMaxBytes, readEntriesCallback, null,
                    PositionImpl.LATEST);
        }

        return readFuture;
    }

    // Commit the offset so the backlog is not affected by this cursor.
    private static void commitOffset(NonDurableCursorImpl cursor, PositionImpl currentPosition) {
        cursor.asyncMarkDelete(currentPosition, new AsyncCallbacks.MarkDeleteCallback() {
            @Override
            public void markDeleteComplete(Object ctx) {
                if (log.isDebugEnabled()) {
                    log.debug("Mark delete success for position: {}", currentPosition);
                }
            }

            // This is OK: mark-delete is a cumulative ack, so a later commit will cover this position.
            @Override
            public void markDeleteFailed(ManagedLedgerException e, Object ctx) {
                log.warn("Mark delete failed for position: {} with error:",
                        currentPosition, e);
            }
        }, null);
    }

    @VisibleForTesting
    void publishMessages(final CompletableFuture<Long> appendFuture, final LogAppendInfo appendInfo,
                         final EncodeResult encodeResult, final AppendRecordsContext appendRecordsContext) {
        final ByteBuf byteBuf = encodeResult.getEncodedByteBuf();
        final int byteBufSize = byteBuf.readableBytes();
        final long beforePublish = time.nanoseconds();

        final AnalyzeResult analyzeResult;
        final var firstOffset = MessageMetadataUtils.getLogEndOffset(persistentTopic.getManagedLedger());
        final var lastOffset = firstOffset + appendInfo.numMessages - 1;
        try {
            analyzeResult = analyzeAndValidateProducerState(firstOffset, lastOffset, encodeResult.getRecords(),
                AppendOrigin.Client);
        } catch (Throwable throwable) {
            requestStats.getMessagePublishStats().registerFailedEvent(
                MathUtils.elapsedNanos(beforePublish), TimeUnit.NANOSECONDS);
            appendFuture.completeExceptionally(throwable);
            return;
        }

        pendingPublishOps.increment();
        publishMessage(persistentTopic, byteBuf, appendInfo)
            .whenComplete((result, e) -> {
                appendRecordsContext.getCompleteSendOperationForThrottling().accept(byteBufSize);

                try {
                    if (e == null) {
                        analyzeResult.updatedProducers().values().stream().map(ProducerAppendInfo::startedTransactions)
                            .flatMap(Collection::stream).forEach(txn -> txn.firstPosition = result.position());
                        analyzeResult.updatedProducers().values().forEach(producerStateManager::update);

                        analyzeResult.completedTxns().forEach(completedTxn -> {
                            final var lastStableOffset = producerStateManager.lastStableOffset(completedTxn);
                            producerStateManager.updateTxnIndex(completedTxn, lastStableOffset);
                            producerStateManager.completeTxn(completedTxn);
                        });

                        producerStateManager.updateMapEndOffset(result.offset());
                        producerStateManager.updateMaxReadPosition(result.position());

                        // Only trigger the snapshot persistence when the records are persisted
                        producerStateManager.maybeTakeSnapshot(recoveryExecutor);
                        producerStateManager.maybePurgeAbortedTx();

                        requestStats.getMessagePublishStats().registerSuccessfulEvent(
                            MathUtils.elapsedNanos(beforePublish), TimeUnit.NANOSECONDS);
                        appendFuture.complete(result.offset());
                    } else {
                        // `producerStateManager.producers` might have already been updated. It's okay because even if
                        // the record failed to be persisted, it's still an exceptional case to see a new record whose
                        // producer epoch is smaller.
                        log.error("publishMessages for topic partition: {} failed when write.", fullPartitionName, e);
                        requestStats.getMessagePublishStats().registerFailedEvent(
                            MathUtils.elapsedNanos(beforePublish), TimeUnit.NANOSECONDS);
                        appendFuture.completeExceptionally(e);
                    }
                } catch (Throwable throwable) {
                    log.error("[{}] Failed to handle the publish for offset {}", fullPartitionName,
                        appendInfo.firstOffset.orElse(-1L), throwable);
                } finally {
                    pendingPublishOps.decrement();
                    encodeResult.recycle();
                }
            });
        if (appendFuture.isCompletedExceptionally()) {
            // The publish already failed synchronously, e.g. the topic is fenced due to an ownership
            // transfer or message deduplication rejected the write
            return;
        }
        // Update the `producers` field for producer epoch validation in next `analyzeAndValidateProducerState` call
        analyzeResult.updatedProducers().values().forEach(producerStateManager::updateProducers);
    }

    public record PublishResult(long offset, PositionImpl position) {
        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof PublishResult other)) {
                return false;
            }
            return offset == other.offset && position.equals(other.position);
        }
    }

    /**
     * Publish the message to BookKeeper.
     * When the message is a control message, message deduplication is skipped.
     *
     * @param persistentTopic The persistentTopic used to publish the message and check message deduplication.
     * @param byteBuf Message byteBuf
     * @param appendInfo Pre-analyzed record info, from which we can get the sequence numbers, message count, etc.
     * @return a future holding the publish result (offset and position)
     */
    private CompletableFuture<PublishResult> publishMessage(final PersistentTopic persistentTopic,
                                                            final ByteBuf byteBuf,
                                                            final LogAppendInfo appendInfo) {
        final CompletableFuture<PublishResult> publishFuture = new CompletableFuture<>();

        persistentTopic.publishMessage(byteBuf,
                MessagePublishContext.get(
                        publishFuture,
                        persistentTopic,
                        appendInfo.producerName(),
                        appendInfo.producerId().isPresent() && !appendInfo.isControlBatch(),
                        appendInfo.firstSequence(),
                        appendInfo.lastSequence(),
                        appendInfo.numMessages(),
                        time.nanoseconds()));
        return publishFuture;
    }

    @VisibleForTesting
    public LogAppendInfo analyzeAndValidateRecords(MemoryRecords records) {
        int numMessages = 0;
        int shallowMessageCount = 0;
        Optional<Long> firstOffset = Optional.empty();
        boolean readFirstMessage = false;
        boolean isTransaction = false;
        boolean isControlBatch = false;
        int validBytesCount = 0;
        int firstSequence = Integer.MAX_VALUE;
        int lastSequence = -1;
        Optional<Long> producerId = Optional.empty();
        short producerEpoch = -1;
        KopLogValidator.CompressionCodec sourceCodec = DEFAULT_COMPRESSION;

        for (RecordBatch batch : records.batches()) {
            if (batch.magic() >= RecordBatch.MAGIC_VALUE_V2 && batch.baseOffset() != 0) {
                throw new InvalidRecordException("The baseOffset of the record batch in the append to "
                        + fullPartitionName + " should be 0, but it is " + batch.baseOffset());
            }
            if (!readFirstMessage) {
                if (batch.magic() >= RecordBatch.MAGIC_VALUE_V2) {
                    firstOffset = Optional.of(batch.baseOffset());
                }
                readFirstMessage = true;
            }

            int batchSize = batch.sizeInBytes();
            if (batchSize > kafkaConfig.getMaxMessageSize()) {
                // Kafka throws RecordTooLargeException here and Kafka clients will try to split the batch and
                // send again until it succeeds. However, there is no way to let Kafka clients know the max message size
                // so the client might never split the large batch.
                // To avoid Kafka clients resending the same large batch infinitely, here we return
                // RecordBatchTooLargeException so that the Kafka clients will fail immediately.
                throw new RecordBatchTooLargeException(String.format("Message batch size is %s "
                                + "in append to partition %s which exceeds the maximum configured size of %s .",
                        batchSize, fullPartitionName, kafkaConfig.getMaxMessageSize()));
            }
            batch.ensureValid();
            shallowMessageCount += 1;
            validBytesCount += batchSize;

            int numMessagesInBatch = (int) (batch.lastOffset() - batch.baseOffset() + 1);
            if (numMessagesInBatch <= 1) {
                // The lastOffset field might not be set, so iterate the records to count them.
                for (Record record : batch) {
                    numMessages++;
                }
            } else {
                numMessages += numMessagesInBatch;
            }

            isTransaction = batch.isTransactional();
            isControlBatch = batch.isControlBatch();

            // We assume all batches share the same producerId.
            if (batch.hasProducerId()) {
                producerId = Optional.of(batch.producerId());
                producerEpoch = batch.producerEpoch();
            }

            if (batch.compressionType().id != CompressionType.NONE.id) {
                CompressionType compressionType = CompressionType.forId(batch.compressionType().id);
                sourceCodec = new KopLogValidator.CompressionCodec(
                        compressionType.name, compressionType.id);
            }
            if (firstSequence > batch.baseSequence()) {
                firstSequence = batch.baseSequence();
            }
            if (lastSequence < batch.lastSequence()) {
                lastSequence = batch.lastSequence();
            }
        }

        if (validBytesCount < 0) {
            throw new CorruptRecordException("Cannot append record batch with illegal length "
                    + validBytesCount + " to log for " + fullPartitionName
                    + ". A possible cause is corrupted produce request.");
        }

        KopLogValidator.CompressionCodec targetCodec =
                KopLogValidator.getTargetCodec(sourceCodec, kafkaConfig.getKafkaCompressionType());

        // This producerName is only used for the message deduplication check.
        // Kafka reuses the producer id (pid) for the same transactionId but increases the producerEpoch,
        // so the epoch is included to make sure the producerName differs between epochs.
        String producerName = new StringJoiner("-")
                .add(PID_PREFIX)
                .add(String.valueOf(producerId.orElse(-1L)))
                .add(String.valueOf(producerEpoch)).toString();
        return new LogAppendInfo(firstOffset, producerId, producerName, producerEpoch, numMessages,
                shallowMessageCount, isTransaction, isControlBatch, validBytesCount, firstSequence, lastSequence,
                sourceCodec, targetCodec, false);
    }
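
    // Worked example for analyzeAndValidateRecords (hypothetical values, for illustration only): an idempotent,
    // uncompressed v2 batch with producerId=9000, producerEpoch=1, baseSequence=0, lastSequence=2 and three
    // records yields a LogAppendInfo with numMessages=3, shallowMessageCount=1, firstSequence=0, lastSequence=2,
    // isTransaction=false and a producerName of the form PID_PREFIX + "-9000-1", which is the key used by the
    // message deduplication check.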

    private MemoryRecords trimInvalidBytes(MemoryRecords records, LogAppendInfo info) {
        int validBytes = info.validBytes();
        if (validBytes < 0){
            throw new CorruptRecordException(String.format("Cannot append record batch with illegal length %s to "
                    + "log for %s. A possible cause is a corrupted produce request.", validBytes, fullPartitionName));
        } else if (validBytes == records.sizeInBytes()) {
            return records;
        } else {
            ByteBuffer validByteBuffer = records.buffer().duplicate();
            validByteBuffer.limit(validBytes);
            return MemoryRecords.readableRecords(validByteBuffer);
        }
    }
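
    // For illustration of trimInvalidBytes (hypothetical sizes): if records.sizeInBytes() is 1064 but the
    // complete batches accounted for by analyzeAndValidateRecords add up to validBytes=1024, the trailing
    // 40 bytes are treated as garbage from a partial or corrupted produce request and are dropped by limiting
    // the duplicated buffer to validBytes.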

    /**
     * Remove all the AbortedTxn entries that are no longer referenced by existing data on the topic.
     *
     * @return a future that completes once the aborted-transactions purge offset has been updated
     */
    public CompletableFuture<Void> updatePurgeAbortedTxnsOffset() {
        if (!kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
            // no need to scan the topic, because transactions are disabled
            return CompletableFuture.completedFuture(null);
        }
        if (!producerStateManager.hasSomeAbortedTransactions()) {
            // nothing to do
            return CompletableFuture.completedFuture(null);
        }
        if (unloaded) {
            // nothing to do
            return CompletableFuture.completedFuture(null);
        }
        return fetchOldestAvailableIndexFromTopic()
                .thenAccept(offset ->
                    producerStateManager.updateAbortedTxnsPurgeOffset(offset));

    }

    @VisibleForTesting
    public CompletableFuture<Long> fetchOldestAvailableIndexFromTopic() {
        if (unloaded) {
            return FutureUtil.failedFuture(new NotLeaderOrFollowerException());
        }

        final CompletableFuture<Long> future = new CompletableFuture<>();

        // Create a temporary KafkaTopicConsumerManager and make sure it is closed once the future completes
        KafkaTopicConsumerManager tcm = new KafkaTopicConsumerManager("purge-aborted-tx",
                true, persistentTopic, true);
        future.whenComplete((___, error) -> {
            // release resources in any case
            try {
                tcm.close();
            } catch (Exception err) {
                log.error("Cannot safely close the temporary KafkaTopicConsumerManager for {}",
                        fullPartitionName, err);
            }
        });

        ManagedLedgerImpl managedLedger = (ManagedLedgerImpl) persistentTopic.getManagedLedger();
        long numberOfEntries = managedLedger.getNumberOfEntries();
        if (numberOfEntries == 0) {
            long currentOffset = MessageMetadataUtils.getCurrentOffset(managedLedger);
            log.info("First offset for topic {} is {} as the topic is empty (numberOfEntries=0)",
                    fullPartitionName, currentOffset);
            future.complete(currentOffset);

            return future;
        }

        // getFirstPosition() returns a dummy position whose entry id is -1
        PositionImpl firstPosition = managedLedger.getFirstPosition();
        // look for the first position that actually contains data
        PositionImpl nextValidPosition = managedLedger.getNextValidPosition(firstPosition);

        fetchOldestAvailableIndexFromTopicReadNext(future, managedLedger, nextValidPosition);

        return future.thenCompose(offset -> {
            if (persistentTopic.getSubscriptions().containsKey(Compactor.COMPACTION_SUBSCRIPTION)) {
                return getLastCompactedOffset(persistentTopic.getTopicCompactionService()).thenApply(
                        lastCompactedOffset -> {
                            return Math.max(lastCompactedOffset + 1, offset);
                        });
            } else {
                return CompletableFuture.completedFuture(offset);
            }
        });

    }
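
    // For illustration of the compaction handling above (hypothetical offsets): if the compaction cursor exists
    // and the last compacted entry carries offset 41 while the first entry still present in the managed ledger
    // starts at offset 30, the method resolves to max(41 + 1, 30) = 42, i.e. the oldest offset still covered by
    // data on the topic.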

    private CompletableFuture<Long> getLastCompactedOffset(TopicCompactionService topicCompactionService) {
        return topicCompactionService.readLastCompactedEntry().thenApply(entry -> {
            if (entry == null) {
                return -1L;
            }
            try {
                return MessageMetadataUtils.peekOffsetFromEntry(entry);
            } catch (MetadataCorruptedException e) {
                if (e instanceof MetadataCorruptedException.NoBrokerEntryMetadata) {
                    return -1L;
                } else {
                    throw new RuntimeException(e);
                }
            } finally {
                entry.release();
            }
        });
    }

    private void fetchOldestAvailableIndexFromTopicReadNext(CompletableFuture<Long> future,
                                                            ManagedLedgerImpl managedLedger, PositionImpl position) {
        managedLedger.asyncReadEntry(position, new AsyncCallbacks.ReadEntryCallback() {
            @Override
            public void readEntryComplete(Entry entry, Object ctx) {
                try {
                    long startOffset = MessageMetadataUtils.peekBaseOffsetFromEntry(entry);
                    log.info("First offset for topic {} is {} - position {}", fullPartitionName,
                            startOffset, entry.getPosition());
                    future.complete(startOffset);
                } catch (MetadataCorruptedException.NoBrokerEntryMetadata noBrokerEntryMetadata) {
                    long currentOffset = MessageMetadataUtils.getCurrentOffset(managedLedger);
                    log.info("Legacy entry for topic {} - position {} - returning current offset {}",
                            fullPartitionName,
                            entry.getPosition(),
                            currentOffset);
                    future.complete(currentOffset);
                } catch (Exception err) {
                    future.completeExceptionally(err);
                } finally {
                    entry.release();
                }
            }

            @Override
            public void readEntryFailed(ManagedLedgerException exception, Object ctx) {
                future.completeExceptionally(exception);
            }
        }, null);
    }

    @VisibleForTesting
    public CompletableFuture<?> takeProducerSnapshot() {
        return initFuture.thenCompose((___)  -> {
            // snapshot can be taken only on the same thread that is used for writes
            ManagedLedgerImpl ml = (ManagedLedgerImpl) getPersistentTopic().getManagedLedger();
            Executor executorService = ml.getExecutor();
            return this
                    .getProducerStateManager()
                    .takeSnapshot(executorService);
        });
    }
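
    // Typical test usage (illustrative, `partitionLog` is a hypothetical variable): calling
    // partitionLog.takeProducerSnapshot().get() after the log has been initialized forces a producer-state
    // snapshot; it runs on the managed ledger's executor because snapshots must be taken on the same thread
    // that performs the writes.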

    @VisibleForTesting
    public CompletableFuture<?> forcePurgeAbortTx() {
        return initFuture.thenCompose((___)  -> {
            // purge can be taken only on the same thread that is used for writes
            ManagedLedgerImpl ml = (ManagedLedgerImpl) getPersistentTopic().getManagedLedger();
            ExecutorService executorService = ml.getScheduledExecutor().chooseThread(ml.getName());

            return updatePurgeAbortedTxnsOffset()
                    .thenApplyAsync((____) -> {
                        return getProducerStateManager().executePurgeAbortedTx();
                    }, executorService);
        });
    }

    CompletableFuture<Long> recoverTxEntries(long offset, Executor executor) {
        if (!kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
            // no need to scan the topic, because transactions are disabled
            return CompletableFuture.completedFuture(0L);
        }
        if (!isBrokerIndexMetadataInterceptorConfigured(persistentTopic.getBrokerService())) {
            // The `UpgradeTest` sets the interceptor to null, which would cause an NPE in
            // `fetchOldestAvailableIndexFromTopic`. We cannot simply disable Kafka transactions here,
            // because newer Kafka clients require the transaction coordinator to be enabled.
            // TODO: if the AppendIndexMetadataInterceptor is not set, Kafka transactions cannot work at all,
            //  so we should throw an exception instead; maybe a new configuration for the ProducerId is needed.
            log.error("The broker index metadata interceptor is not configured for topic {}, "
                            + "skipping recovery of txn entries.", fullPartitionName);
            return CompletableFuture.completedFuture(0L);
        }
        final EventExecutor statsExecutor = getPersistentTopic().getBrokerService().executor().next();
        return fetchOldestAvailableIndexFromTopic().thenCompose((minOffset -> {
            log.info("start recoverTxEntries for {} at offset {} minOffset {}",
                    fullPartitionName, offset, minOffset);
            final CompletableFuture<Long> future = new CompletableFuture<>();

            // Create a temporary KafkaTopicConsumerManager and make sure it is closed once the future completes
            KafkaTopicConsumerManager tcm = new KafkaTopicConsumerManager("recover-tx",
                    true, persistentTopic, true);
            future.whenComplete((___, error) -> {
                // release resources in any case
                try {
                    tcm.close();
                } catch (Exception err) {
                    log.error("Cannot safely close the temporary KafkaTopicConsumerManager for {}",
                            fullPartitionName, err);
                }
            });

            final long offsetToStart;
            if (checkOffsetOutOfRange(tcm, offset, -1, statsExecutor)) {
                offsetToStart = 0;
                log.info("recoverTxEntries for {}: offset {} is out-of-range, "
                                + "maybe the topic has been deleted/recreated, "
                                + "starting recovery from {}",
                        fullPartitionName, offset, offsetToStart);
            } else {
                offsetToStart = Math.max(offset, minOffset);
            }

            producerStateManager.handleMissingDataBeforeRecovery(minOffset, offset);

            if (log.isDebugEnabled()) {
                log.debug("recoverTxEntries for {}: remove tcm to get cursor for fetch offset: {} .",
                        fullPartitionName, offsetToStart);
            }


            final CompletableFuture<Pair<ManagedCursor, Long>> cursorFuture = tcm.removeCursorFuture(offsetToStart);

            if (cursorFuture == null) {
                // tcm is closed, just return a NONE error because the channel may still be active
                log.warn("KafkaTopicConsumerManager is closed, removing TCM of {}", fullPartitionName);
                future.completeExceptionally(new NotLeaderOrFollowerException());
                return future;
            }

            cursorFuture.thenAccept((cursorLongPair) -> {

                if (cursorLongPair == null) {
                    log.warn("KafkaTopicConsumerManager.remove({}) return null for topic {}. "
                            + "Fetch for topic return error.", offsetToStart, fullPartitionName);
                    future.completeExceptionally(new NotLeaderOrFollowerException());
                    return;
                }
                final ManagedCursor cursor = cursorLongPair.getLeft();
                final AtomicLong cursorOffset = new AtomicLong(cursorLongPair.getRight());

                AtomicLong entryCounter = new AtomicLong();
                readNextEntriesForRecovery(cursor, cursorOffset, tcm, entryCounter,
                        future, executor, statsExecutor);

            }).exceptionally(ex -> {
                future.completeExceptionally(new NotLeaderOrFollowerException());
                return null;
            });
            return future;
        }));
    }
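
    // Recovery flow in short: recoverTxEntries resolves the oldest readable offset, clamps the requested start
    // offset to it (or restarts from 0 when the offset is out of range), borrows a cursor from a temporary
    // KafkaTopicConsumerManager and then reads entries in chunks of maxReadEntriesNum (200) via
    // readNextEntriesForRecovery until no entries are left; the returned future carries the number of Kafka
    // records that were replayed into the ProducerStateManager.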

    private void readNextEntriesForRecovery(ManagedCursor cursor, AtomicLong cursorOffset,
                                            KafkaTopicConsumerManager tcm,
                                            AtomicLong entryCounter,
                                            CompletableFuture<Long> future, Executor executor,
                                            EventExecutor statsExecutor) {
        if (log.isDebugEnabled()) {
            log.debug("readNextEntriesForRecovery {} cursorOffset {}", fullPartitionName, cursorOffset);
        }
        int maxReadEntriesNum = 200;
        long adjustedMaxBytes = Long.MAX_VALUE;
        readEntries(tcm, cursor, cursorOffset, maxReadEntriesNum, adjustedMaxBytes, (partitionName) -> {})
                .whenCompleteAsync((entries, throwable) -> {
                    if (throwable != null) {
                        log.error("Read entry error on {}", fullPartitionName, throwable);
                        tcm.deleteOneCursorAsync(cursor,
                                "cursor.readEntry fail. deleteCursor");
                        if (throwable instanceof ManagedLedgerException.CursorAlreadyClosedException
                                || throwable instanceof ManagedLedgerException.ManagedLedgerFencedException) {
                            future.completeExceptionally(new NotLeaderOrFollowerException());
                            return;
                        }
                        future.completeExceptionally(new UnknownServerException(throwable));
                        return;
                    }

                    // Add new offset back to TCM after entries are read successfully
                    tcm.add(cursorOffset.get(), Pair.of(cursor, cursorOffset.get()));

                    if (entries.isEmpty()) {
                        if (log.isDebugEnabled()) {
                            log.debug("No more entries to recover for {}", fullPartitionName);
                        }
                        future.completeAsync(() -> entryCounter.get(), executor);
                        return;
                    }

                    CompletableFuture<DecodeResult> decodedEntries = new CompletableFuture<>();
                    decodeEntriesForRecovery(decodedEntries, entries, statsExecutor);

                    decodedEntries.thenAccept((decodeResult) -> {
                        try {
                            final var positions = decodeResult.getPositions();
                            final var index = new MutableInt(0);
                            decodeResult.getRecords().batches().forEach(batch -> {
                                final var i = index.getAndIncrement();
                                if (i < positions.size()) {
                                    entryCounter.addAndGet(batch.lastOffset() - batch.baseOffset() + 1);
                                    updateProducerState(batch, (PositionImpl) positions.get(i));
                                } else {
                                    // It should never happen
                                    log.error("[{}] Position index {} out of range for batch {}", fullPartitionName,
                                        i, batch.baseOffset());
                                }
                            });
                            producerStateManager.updateMaxReadPosition((PositionImpl)
                                positions.get(positions.size() - 1));
                        } catch (Throwable e) {
                            log.error("Failed to handle the decode result {}", decodeResult.getPositions(), e);
                        } finally {
                            decodeResult.recycle();
                        }

                        readNextEntriesForRecovery(cursor, cursorOffset, tcm, entryCounter, future, executor,
                                statsExecutor);

                    }).exceptionally(error -> {
                        log.error("Bad error while recovering {}", fullPartitionName, error);
                        future.completeExceptionally(error);
                        return null;
                    });
                }, executor);
    }

    private void updateProducerState(RecordBatch batch, PositionImpl position) {
        if (batch.hasProducerId()) {
            final var producerId = batch.producerId();
            // TODO: the origin should be REPLICATION but it's never used in this project
            ProducerAppendInfo appendInfo = producerStateManager.prepareUpdate(producerId, AppendOrigin.Client);
            // The base offset and last offset fields are set when taking the snapshot
            final var maybeCompletedTxn = appendInfo.append(batch, batch.baseOffset(), batch.lastOffset(), position);
            producerStateManager.update(appendInfo);
            if (maybeCompletedTxn.isPresent()) {
                CompletedTxn completedTxn = maybeCompletedTxn.get();
                long lastStableOffset = producerStateManager.lastStableOffset(completedTxn);
                producerStateManager.updateTxnIndex(completedTxn, lastStableOffset);
                producerStateManager.completeTxn(completedTxn);
            }
        }
        producerStateManager.updateMapEndOffset(batch.lastOffset() + 1);
    }
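
    // For illustration of updateProducerState (hypothetical offsets): when recovery replays a control batch at
    // offset 120 that ends a transaction of producerId 9000 started at offset 100, appendInfo.append(...) yields
    // a CompletedTxn; the last stable offset is recomputed, the transaction index is updated with it, the txn is
    // marked complete, and mapEndOffset advances to 121 (batch.lastOffset() + 1).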

    private void decodeEntriesForRecovery(final CompletableFuture<DecodeResult> future,
                                          final List<Entry> entries,
                                          final EventExecutor statsExecutor) {

        if (log.isDebugEnabled()) {
            log.debug("Read {} entries", entries.size());
        }
        final byte magic = RecordBatch.CURRENT_MAGIC_VALUE;
        final long startDecodingEntriesNanos = MathUtils.nowInNano();
        try {
            DecodeResult decodeResult = entryFormatter.decode(entries, magic);
            long fetchDecodeLatencyNanos = MathUtils.elapsedNanos(startDecodingEntriesNanos);
            statsExecutor.execute(() -> {
                requestStats.getFetchDecodeStats().registerSuccessfulEvent(
                        fetchDecodeLatencyNanos, TimeUnit.NANOSECONDS);
            });
            future.complete(decodeResult);
        } catch (Exception error) {
            future.completeExceptionally(error);
        }
    }

    @VisibleForTesting
    public boolean isUnloaded() {
        return unloaded;
    }

    public @Nullable PersistentTopic getPersistentTopic() {
        return persistentTopic;
    }

    @Override
    public String toString() {
        return "PartitionLog(" + fullPartitionName + ")";
    }

    public DescribeProducersResponseData.PartitionResponse activeProducerState() {
        var producerState = new DescribeProducersResponseData.PartitionResponse()
                .setPartitionIndex(statsTopicPartition.partition())
                .setErrorCode(Errors.NONE.code())
                .setActiveProducers(new ArrayList<>());

        // this utility is only for monitoring, so it is fine to access this structure directly from any thread
        var producers = producerStateManager.getProducers();
        producers.values().forEach(producerStateEntry -> {
            producerState.activeProducers().add(new DescribeProducersResponseData.ProducerState()
                    .setProducerId(producerStateEntry.producerId)
                    .setLastSequence(-1) // NOT HANDLED YET
                    .setProducerEpoch(producerStateEntry.producerEpoch != null
                            ? producerStateEntry.producerEpoch.intValue() : -1)
                    .setLastTimestamp(producerStateEntry.lastTimestamp != null
                            ? producerStateEntry.lastTimestamp : -1)
                    .setCoordinatorEpoch(producerStateEntry.coordinatorEpoch)
                    .setCurrentTxnStartOffset(producerStateEntry.currentTxnFirstOffset.orElse(-1L)));
        });
        return producerState;
    }
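
    // Note on activeProducerState (assumption: this response feeds the Kafka DescribeProducers API, e.g.
    // AdminClient#describeProducers): an active idempotent producer shows up with its producerId and epoch,
    // while lastSequence is always reported as -1 because it is not tracked yet (see the NOT HANDLED YET note
    // above).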

    public CompletableFuture<Void> close() {
        if (this.unloaded) {
            return CompletableFuture.completedFuture(null);
        }
        this.unloaded = true;
        // Take a snapshot before closing the topic
        if (this.initFuture.isDone() && !this.initFuture.isCompletedExceptionally()) {
            if (recoveryExecutor.isShutdown()) {
                log.warn("[{}] Failed to take snapshot after shutdown", fullPartitionName);
                return CompletableFuture.completedFuture(null);
            }
            return this.producerStateManager.takeSnapshot(recoveryExecutor).thenApply(__ -> (Void) null)
                .exceptionally(e -> {
                    if (e != null) {
                        log.warn("[{}] Failed to take snapshot: {}", fullPartitionName, e.getMessage());
                    }
                    return null;
                });
        } else {
            return CompletableFuture.completedFuture(null);
        }
    }
}



