/**
 * Copyright (c) 2019 - 2024 StreamNative, Inc. All Rights Reserved.
 */
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamnative.pulsar.handlers.kop.storage;

import com.google.common.annotations.VisibleForTesting;
import io.streamnative.pulsar.handlers.kop.DelayedFetch;
import io.streamnative.pulsar.handlers.kop.KafkaServiceConfiguration;
import io.streamnative.pulsar.handlers.kop.KafkaTopicLookupService;
import io.streamnative.pulsar.handlers.kop.KsnTopicPartition;
import io.streamnative.pulsar.handlers.kop.MessageFetchContext;
import io.streamnative.pulsar.handlers.kop.RequestStats;
import io.streamnative.pulsar.handlers.kop.utils.KopTopic;
import io.streamnative.pulsar.handlers.kop.utils.delayed.DelayedOperation;
import io.streamnative.pulsar.handlers.kop.utils.delayed.DelayedOperationKey;
import io.streamnative.pulsar.handlers.kop.utils.delayed.DelayedOperationPurgatory;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.bookkeeper.common.util.MathUtils;
import org.apache.bookkeeper.common.util.OrderedExecutor;
import org.apache.commons.lang3.mutable.MutableBoolean;
import org.apache.commons.lang3.mutable.MutableLong;
import org.apache.kafka.common.IsolationLevel;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.InvalidTopicException;
import org.apache.kafka.common.message.DescribeProducersResponseData;
import org.apache.kafka.common.message.FetchRequestData;
import org.apache.kafka.common.protocol.Errors;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.requests.ProduceResponse;
import org.apache.kafka.common.utils.SystemTime;
import org.apache.kafka.common.utils.Time;
import org.apache.pulsar.broker.service.BrokerServiceException;
import org.apache.pulsar.broker.service.plugin.EntryFilter;
import org.apache.pulsar.client.api.PulsarClientException;
import org.apache.pulsar.common.util.FutureUtil;

/**
 * Manages appends to and reads from the partition logs. The KoP counterpart of Kafka's
 * ReplicaManager.scala.
 */
@Slf4j
public class ReplicaManager {
    @Getter
    private final PartitionLogManager logManager;
    private final DelayedOperationPurgatory<DelayedOperation> fetchPurgatory;

    private final String metadataNamespace;

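    /**
     * Create a ReplicaManager backed by a fresh {@link PartitionLogManager}. The fetch purgatory is
     * injected so that it can be shared with the components that complete delayed fetches.
     */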
    public ReplicaManager(KafkaServiceConfiguration kafkaConfig,
                          RequestStats requestStats,
                          Time time,
                          List<EntryFilter> entryFilters,
                          DelayedOperationPurgatory<DelayedOperation> fetchPurgatory,
                          KafkaTopicLookupService kafkaTopicLookupService,
                          ProducerStateManagerSnapshotBufferFactory producerStateManagerSnapshotBufferFactory,
                          OrderedExecutor recoveryExecutor) {
        this.logManager = new PartitionLogManager(kafkaConfig, requestStats, entryFilters,
                time, kafkaTopicLookupService, producerStateManagerSnapshotBufferFactory, recoveryExecutor);
        this.fetchPurgatory = fetchPurgatory;
        this.metadataNamespace = kafkaConfig.getKafkaMetadataNamespace();
    }

    public PartitionLog getPartitionLog(String fullPartitionName) {
        // TODO: Make `getPartitionLog` async to avoid blocking the current thread
        return logManager.getLog(fullPartitionName);
    }

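    /**
     * Remove the partition log of the given topic from the manager and close it, if present.
     */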
    public void removePartitionLog(String topicName) {
        PartitionLog partitionLog = logManager.removeAndCloseLog(topicName);
        if (log.isDebugEnabled() && partitionLog != null) {
            log.debug("PartitionLog: {} has bean removed.", topicName);
        }
    }

    @VisibleForTesting
    public int size() {
        return logManager.size();
    }

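    /**
     * Append memory records to the partition logs, producing one {@link ProduceResponse.PartitionResponse}
     * per partition. Appends to internal topics are rejected unless {@code internalTopicsAllowed} is set,
     * and when {@code requiredAcks == 0} the per-partition responses are dropped because the producer does
     * not wait for them.
     *
     * <p>A minimal caller-side sketch (the partition, records and context below are illustrative
     * placeholders, not values produced by this class):
     * <pre>{@code
     * Map<KsnTopicPartition, MemoryRecords> entries = Map.of(partition, records);
     * replicaManager.appendRecords((short) -1, false, entries,
     *                 PartitionLog.AppendOrigin.Client, appendRecordsContext)
     *         .thenAccept(responses -> responses.forEach((tp, response) ->
     *                 log.info("Produced to {}: {}", tp, response.error)));
     * }</pre>
     */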
    public CompletableFuture<Map<TopicPartition, ProduceResponse.PartitionResponse>> appendRecords(
            final short requiredAcks,
            final boolean internalTopicsAllowed,
            final Map<KsnTopicPartition, MemoryRecords> entriesPerPartition,
            final PartitionLog.AppendOrigin origin,
            final AppendRecordsContext appendRecordsContext) {
        CompletableFuture<Map<TopicPartition, ProduceResponse.PartitionResponse>> completableFuture =
                new CompletableFuture<>();
        try {
            final Map<TopicPartition, CompletableFuture<ProduceResponse.PartitionResponse>> responseMap =
                    new HashMap<>();

            for (Map.Entry<KsnTopicPartition, MemoryRecords> entry : entriesPerPartition.entrySet()) {
                KsnTopicPartition ksnTopicPartition = entry.getKey();
                MemoryRecords memoryRecords = entry.getValue();
                final var topicPartition = ksnTopicPartition.topicPartition();
                final var fullPartitionName = ksnTopicPartition.fullPartitionName();
                // reject appending to internal topics if it is not allowed
                if (!internalTopicsAllowed && KopTopic.isInternalTopic(fullPartitionName, metadataNamespace)) {
                    responseMap.put(topicPartition,
                            CompletableFuture.completedFuture(new ProduceResponse.PartitionResponse(
                                    Errors.forException(new InvalidTopicException(
                                            String.format("Cannot append to internal topic %s",
                                                    topicPartition.topic()))))));
                } else {
                    PartitionLog partitionLog = getPartitionLog(fullPartitionName);
                    CompletableFuture<ProduceResponse.PartitionResponse> appendRecordsFuture =
                            partitionLog.appendRecords(memoryRecords, origin, appendRecordsContext)
                                    .thenApply(offset -> new ProduceResponse.PartitionResponse(
                                            Errors.NONE, offset, -1L, -1L))
                                    .exceptionally(ex -> {
                                        Throwable cause = FutureUtil.unwrapCompletionException(ex);
                                        if (cause instanceof BrokerServiceException.PersistenceException
                                            || cause instanceof BrokerServiceException.ServiceUnitNotReadyException) {
                                            log.error(
                                                    "Encountered NotLeaderOrFollower error while handling append "
                                                            + "for {}",
                                                    fullPartitionName, ex);
                                            // Typical cause chain: BrokerServiceException$PersistenceException
                                            //   -> org.apache.bookkeeper.mledger.ManagedLedgerException
                                            //   -> ManagedLedgerException$BadVersionException
                                            //   -> org.apache.pulsar.metadata.api.MetadataStoreException
                                            return new ProduceResponse.PartitionResponse(
                                                    Errors.NOT_LEADER_OR_FOLLOWER);
                                        } else if (cause instanceof PulsarClientException) {
                                            log.error("Error on Pulsar Client while handling append for {}",
                                                    fullPartitionName, ex);
                                            return new ProduceResponse.PartitionResponse(
                                                    Errors.BROKER_NOT_AVAILABLE);
                                        } else {
                                            final var errors = Errors.forException(cause);
                                            if (errors == Errors.UNKNOWN_SERVER_ERROR) {
                                                log.error("System error while handling append for {}",
                                                    fullPartitionName, cause);
                                            }
                                            return new ProduceResponse.PartitionResponse(errors);
                                        }
                                    });
                    // if requiredAcks==0, the response of appendRecords will be ignored (logging only)
                    if (requiredAcks != 0) {
                        responseMap.put(topicPartition, appendRecordsFuture);
                    }
                }
            }

            // wait for all futures to complete
            FutureUtil.waitForAll(responseMap.values()).whenComplete((__, ___) -> {
                Map<TopicPartition, ProduceResponse.PartitionResponse> result =
                        responseMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, entry ->
                                entry.getValue().handle((partitionResponse, t) -> {
                                    if (t != null) {
                                        partitionResponse = new ProduceResponse.PartitionResponse(
                                                Errors.forException(t.getCause()));
                                    }
                                    return partitionResponse;
                                }).join()));
                completableFuture.complete(result);
            });
        } catch (Throwable error) {
            log.error("Internal error", error);
            completableFuture.completeExceptionally(error);
        }
        return completableFuture;
    }

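    /**
     * Serve a Kafka fetch request. Records are first read from the local logs; if the accumulated bytes
     * stay below {@code fetchMinBytes}, no partition reported an error, and the deadline has not passed,
     * the request is parked in the fetch purgatory as a {@link DelayedFetch} until enough data arrives or
     * the remaining wait time elapses.
     *
     * <p>Illustrative call (the timeout and size limits are example values only):
     * <pre>{@code
     * replicaManager.fetchMessage(500L, 1, 1024 * 1024, fetchInfos,
     *                 IsolationLevel.READ_UNCOMMITTED, fetchContext)
     *         .thenAccept(results -> results.forEach((tp, result) ->
     *                 log.info("Fetched from {}: {}", tp, result.errors())));
     * }</pre>
     */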
    public CompletableFuture<Map<TopicPartition, PartitionLog.ReadRecordsResult>> fetchMessage(
            final long timeout,
            final int fetchMinBytes,
            final int fetchMaxBytes,
            final ConcurrentHashMap<KsnTopicPartition, FetchRequestData.FetchPartition> fetchInfos,
            final IsolationLevel isolationLevel,
            final MessageFetchContext context) {
        CompletableFuture<Map<TopicPartition, PartitionLog.ReadRecordsResult>> future =
                new CompletableFuture<>();
        final boolean readCommitted =
                (context.getTc() != null && isolationLevel.equals(IsolationLevel.READ_COMMITTED));
        final long startTime = SystemTime.SYSTEM.hiResClockMs();

        readFromLocalLog(readCommitted, fetchMaxBytes, context.getMaxReadEntriesNum(), fetchInfos, context)
                .thenAccept(readResults -> {
                    final MutableLong bytesReadable = new MutableLong(0);
                    final MutableBoolean errorReadingData = new MutableBoolean(false);
                    readResults.forEach((topicPartition, readRecordsResult) -> {
                        if (readRecordsResult.errors() != Errors.NONE) {
                            errorReadingData.setTrue();
                        }
                        if (readRecordsResult.decodeResult() != null) {
                            bytesReadable.addAndGet(readRecordsResult.decodeResult().getRecords().sizeInBytes());
                        }
                    });

                    long now = SystemTime.SYSTEM.hiResClockMs();
                    long currentWait = now - startTime;
                    long remainingMaxWait = timeout - currentWait;
                    long maxWait = Math.min(remainingMaxWait, timeout);
                    if (maxWait <= 0 || fetchInfos.isEmpty()
                            || bytesReadable.longValue() >= fetchMinBytes || errorReadingData.booleanValue()) {
                        future.complete(readResults);
                        return;
                    }
                    List<DelayedOperationKey> delayedFetchKeys = fetchInfos.keySet().stream()
                            .map(DelayedOperationKey.TopicPartitionOperationKey::new).collect(Collectors.toList());
                    DelayedFetch delayedFetch = new DelayedFetch(
                            maxWait,
                            fetchMaxBytes,
                            bytesReadable.getValue(),
                            readCommitted,
                            context,
                            this,
                            fetchInfos,
                            readResults,
                            future
                    );
                    fetchPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys);
                });

        return future;
    }

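    /**
     * Read records for every requested partition directly from the local partition logs, with no
     * purgatory wait. The {@code limitBytes} budget is shared across partitions, so earlier reads shrink
     * the bytes available to later ones. An unexpected partition-lookup failure is downgraded to
     * {@link Errors#NOT_LEADER_OR_FOLLOWER} so that the client refreshes its metadata and retries.
     */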
    public CompletableFuture<Map<TopicPartition, PartitionLog.ReadRecordsResult>> readFromLocalLog(
            final boolean readCommitted,
            final int fetchMaxBytes,
            final int maxReadEntriesNum,
            final Map<KsnTopicPartition, FetchRequestData.FetchPartition> readPartitionInfo,
            final MessageFetchContext context) {
        AtomicLong limitBytes = new AtomicLong(fetchMaxBytes);
        CompletableFuture<Map<TopicPartition, PartitionLog.ReadRecordsResult>> resultFuture =
                new CompletableFuture<>();
        ConcurrentHashMap<TopicPartition, PartitionLog.ReadRecordsResult> result = new ConcurrentHashMap<>();
        AtomicInteger restTopicPartitionNeedRead = new AtomicInteger(readPartitionInfo.size());

        final var startFetchNanos = MathUtils.nowInNano();
        Runnable complete = () -> {
            if (restTopicPartitionNeedRead.decrementAndGet() == 0) {
                context.getStatsLogger().getTotalHandleFetchStats().registerSuccessfulEvent(
                    MathUtils.elapsedNanos(startFetchNanos), TimeUnit.NANOSECONDS);
                resultFuture.complete(result);
            }
        };
        readPartitionInfo.forEach((ksnTopicPartition, fetchInfo) -> {
            getPartitionLog(ksnTopicPartition.fullPartitionName())
                    .awaitInitialisation()
                    .whenComplete((partitionLog, failed) -> {
                        if (failed != null) {
                            var error = Errors.forException(failed);
                            if (error == Errors.UNKNOWN_SERVER_ERROR) {
                                // It should never reach here
                                log.error("Unexpected exception when getPartitionLog {}", resultFuture, failed);
                                error = Errors.NOT_LEADER_OR_FOLLOWER;
                            }
                            result.put(ksnTopicPartition.topicPartition(), PartitionLog.ReadRecordsResult.error(
                                error, null));
                            complete.run();
                            return;
                        }
                        partitionLog
                                .readRecords(fetchInfo, readCommitted,
                                        limitBytes, maxReadEntriesNum, context
                                )
                                .thenAccept(readResult -> {
                                    result.put(ksnTopicPartition.topicPartition(), readResult);
                                    complete.run();
                                });
                    });

        });
        return resultFuture;
    }

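    /**
     * Re-check the delayed fetches watching the given key (typically after a produce to the matching
     * partition) and complete any that can now be satisfied.
     */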
    public void tryCompleteDelayedFetch(DelayedOperationKey key) {
        int completed = fetchPurgatory.checkAndComplete(key);
        if (log.isDebugEnabled()) {
            log.debug("Request key {} unblocked {} fetch requests.", key.keyLabel(), completed);
        }
    }

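    /**
     * Trigger an update of the purge-aborted-transactions offsets on all partition logs managed by the
     * {@link PartitionLogManager}.
     */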
    public CompletableFuture<Void> updatePurgeAbortedTxnsOffsets() {
        return logManager.updatePurgeAbortedTxnsOffsets();
    }

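    /**
     * Describe the active idempotent/transactional producers of the given partition once its
     * {@link PartitionLog} has finished initialisation; backs the Kafka DescribeProducers API.
     */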
    public CompletableFuture<DescribeProducersResponseData.PartitionResponse> activeProducerState(
            KsnTopicPartition ksnTopicPartition) {
        PartitionLog partitionLog = getPartitionLog(ksnTopicPartition.fullPartitionName());
        return partitionLog.awaitInitialisation().thenApply(PartitionLog::activeProducerState);
    }

    public CompletableFuture<Void> closeAsync() {
        return logManager.closeAsync();
    }
}