/**
 * Copyright (c) 2019 - 2024 StreamNative, Inc. All Rights Reserved.
 */
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamnative.pulsar.handlers.kop.storage;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.bookkeeper.mledger.Position;
import org.apache.bookkeeper.mledger.impl.ManagedLedgerImpl;
import org.apache.bookkeeper.mledger.impl.PositionImpl;
import org.apache.bookkeeper.util.SafeRunnable;
import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.record.RecordBatch;
import org.apache.pulsar.broker.service.persistent.PersistentTopic;
import org.apache.pulsar.common.util.FutureUtil;

/**
 * Manages the producer state for a single partition: idempotent producer
 * entries, ongoing transactions, and the index of aborted transactions.
 * The state is periodically snapshotted to a snapshot buffer so that
 * recovery does not have to replay the whole log.
 */
@Slf4j
public class ProducerStateManager {

    @Getter
    private final String topicPartition;
    private final String kafkaTopicUUID;

    @Getter
    private final Map<Long, ProducerStateEntry> producers = Maps.newConcurrentMap();

    // ongoing transactions sorted by the first offset of the transaction
    private final TreeMap<Long, TxnMetadata> ongoingTxns = Maps.newTreeMap();
    private final List<AbortedTxn> abortedIndexList = new ArrayList<>();

    private final ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer;

    private final int kafkaTxnProducerStateTopicSnapshotIntervalSeconds;
    private final int kafkaTxnPurgeAbortedTxnIntervalSeconds;
    private final int kafkaTxnMaxDifferentMessageToSnapshotThreshold;

    private volatile long lastMapOffset = -1L;
    private volatile long lastSnapOffset = -1L;

    private long lastSnapshotTime;
    private long lastPurgeAbortedTxnTime;

    private volatile long abortedTxnsPurgeOffset = -1;

    private PositionImpl maxReadPosition;

    @Getter
    private final PartitionLog partitionLog;

    public ProducerStateManager(String topicPartition,
                                String kafkaTopicUUID,
                                ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer,
                                int kafkaTxnProducerStateTopicSnapshotIntervalSeconds,
                                int kafkaTxnPurgeAbortedTxnIntervalSeconds,
                                int kafkaTxnMaxDifferentMessageToSnapshotThreshold,
                                PartitionLog partitionLog) {
        this.topicPartition = topicPartition;
        this.kafkaTopicUUID = kafkaTopicUUID;
        this.producerStateManagerSnapshotBuffer = producerStateManagerSnapshotBuffer;
        this.kafkaTxnProducerStateTopicSnapshotIntervalSeconds = kafkaTxnProducerStateTopicSnapshotIntervalSeconds;
        this.kafkaTxnPurgeAbortedTxnIntervalSeconds = kafkaTxnPurgeAbortedTxnIntervalSeconds;
        this.kafkaTxnMaxDifferentMessageToSnapshotThreshold = kafkaTxnMaxDifferentMessageToSnapshotThreshold;
        this.lastSnapshotTime = System.currentTimeMillis();
        this.lastPurgeAbortedTxnTime = System.currentTimeMillis();
        this.maxReadPosition = PositionImpl.EARLIEST;
        this.partitionLog = partitionLog;
    }

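    /**
     * Recovers the producer state, preferring the latest snapshot from the
     * snapshot buffer and falling back to a full log replay when no usable
     * snapshot exists.
     */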
    public CompletableFuture<Void> recover(PartitionLog partitionLog, Executor executor) {
        return producerStateManagerSnapshotBuffer
            .readLatestSnapshot(topicPartition)
            .exceptionally(e -> {
                log.error("Failed to read snapshot for {} from storage, trying to replay the log to recover.",
                    topicPartition, e);
                return null;
            })
            .thenCompose(snapshot -> applySnapshotAndRecover(snapshot, partitionLog, executor));
    }

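    /**
     * Applies the given snapshot (if it matches the current topic UUID) and
     * then replays the log from the snapshot offset to rebuild the state.
     */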
    private CompletableFuture<Void> applySnapshotAndRecover(ProducerStateManagerSnapshot snapshot,
                                                            PartitionLog partitionLog,
                                                            Executor executor) {
        if (snapshot != null && kafkaTopicUUID != null
                && !kafkaTopicUUID.equals(snapshot.topicUUID())) {
            log.info("The latest snapshot for topic {} was for UUID {}, which differs from the current UUID {}. "
                    + "Ignoring it (the topic has been re-created)", topicPartition, snapshot.topicUUID(),
                    kafkaTopicUUID);
            snapshot = null;
        }
        long offSetPosition = 0;
        synchronized (abortedIndexList) {
            this.abortedIndexList.clear();
            this.producers.clear();
            this.ongoingTxns.clear();
            if (snapshot != null) {
                this.abortedIndexList.addAll(snapshot.abortedIndexList());
                this.producers.putAll(snapshot.producers());
                this.ongoingTxns.putAll(snapshot.ongoingTxns());
                this.lastSnapOffset = snapshot.offset();
                this.lastMapOffset = lastSnapOffset;
                offSetPosition = snapshot.offset();
                log.info("Recover topic {} from offset {}", topicPartition, offSetPosition);
                log.info("Ongoing transactions after recovery: {}", snapshot.ongoingTxns());
                log.info("Aborted transactions after recovery: {}", snapshot.abortedIndexList());
            } else {
                log.info("No snapshot found for topic {}, recovering from the beginning", topicPartition);
            }
        }
        long startRecovery = System.currentTimeMillis();
        // recover from log
        long finalOffSetPosition = offSetPosition;

        return recoverFirstPositionInOngoingTxns(partitionLog.getPersistentTopic()).thenCompose(__ -> {
            return partitionLog.recoverTxEntries(finalOffSetPosition, executor)
                .thenCompose(numEntries -> {
                    updateMaxReadPosition((PositionImpl) partitionLog.getLastPosition());
                    log.info("Recovery of {} finished. Scanned {} entries, time {} ms, new lastMapOffset {}",
                        topicPartition,
                        numEntries,
                        System.currentTimeMillis() - startRecovery,
                        lastMapOffset);
                    // Take a snapshot after recovering the producer state from the log
                    if (numEntries == 0) {
                        return CompletableFuture.completedFuture(null);
                    }
                    return takeSnapshot(executor).exceptionally(e -> {
                        log.error("Failed to take snapshot after recovery for {} from log", topicPartition, e);
                        return null;
                    }).thenApply(___ -> null);
                });
        });
    }

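    /**
     * Resolves the managed-ledger position of the first message of every
     * ongoing transaction, which is needed to compute the max read position.
     */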
    private CompletableFuture<Void> recoverFirstPositionInOngoingTxns(@Nullable PersistentTopic persistentTopic) {
        if (persistentTopic == null) {
            return CompletableFuture.failedFuture(
                new IllegalStateException("PersistentTopic is null in recoverFirstPositionInOngoingTxns"));
        }
        List<CompletableFuture<Void>> futures = new ArrayList<>();
        this.ongoingTxns.forEach((__, txnMetadata) -> {
            if (txnMetadata.firstPosition != null) {
                return;
            }
            CompletableFuture<Void> future = MessageMetadataUtils.asyncFindPosition(persistentTopic.getManagedLedger(),
                            txnMetadata.firstOffset, true)
                    .thenAccept(position -> txnMetadata.firstPosition = position);
            futures.add(future);
        });

        return FutureUtil.waitForAll(futures);
    }

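    /**
     * Writes the current in-memory state to the snapshot buffer, unless no
     * new data has been appended since the last snapshot.
     */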
    @VisibleForTesting
    public CompletableFuture<ProducerStateManagerSnapshot> takeSnapshot(Executor executor) {
        CompletableFuture<ProducerStateManagerSnapshot> result = new CompletableFuture<>();
        executor.execute(new SafeRunnable() {
            @Override
            public void safeRun() {
                if (lastMapOffset == -1) {
                    result.complete(null);
                    return;
                }
                // If no new offset has been written since the last snapshot, it is not worth taking another one
                if (lastMapOffset <= lastSnapOffset) {
                    result.complete(null);
                    return;
                }
                ProducerStateManagerSnapshot snapshot = getProducerStateManagerSnapshot();
                log.info("Taking snapshot for {}: {}", topicPartition, snapshot);
                producerStateManagerSnapshotBuffer
                        .write(snapshot)
                        .whenComplete((res, error) -> {
                            if (error != null) {
                                result.completeExceptionally(error);
                            } else {
                                if (log.isDebugEnabled()) {
                                    log.debug("Snapshot for {} ({}) taken at offset {}",
                                            topicPartition, kafkaTopicUUID, snapshot.offset());
                                }
                                lastSnapOffset = snapshot.offset();
                                result.complete(snapshot);
                            }
                        });
            }
        });
        return result;
    }

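    /**
     * Takes a snapshot if either the number of messages since the last
     * snapshot exceeds the configured threshold or the snapshot interval
     * has elapsed.
     */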
    void maybeTakeSnapshot(Executor executor) {
        if (lastMapOffset == -1 || kafkaTxnProducerStateTopicSnapshotIntervalSeconds <= 0) {
            return;
        }
        if (lastMapOffset - lastSnapOffset >= kafkaTxnMaxDifferentMessageToSnapshotThreshold) {
            takeSnapshot(executor);
            return;
        }
        long now = System.currentTimeMillis();
        long deltaFromLast = (now - lastSnapshotTime) / 1000;
        if (log.isDebugEnabled()) {
            log.debug("maybeTakeSnapshot deltaFromLast {} vs kafkaTxnProducerStateTopicSnapshotIntervalSeconds {} ",
                    deltaFromLast, kafkaTxnProducerStateTopicSnapshotIntervalSeconds);
        }
        if (deltaFromLast < kafkaTxnProducerStateTopicSnapshotIntervalSeconds) {
            return;
        }
        lastSnapshotTime = now;

        takeSnapshot(executor);
    }

    void updateAbortedTxnsPurgeOffset(long abortedTxnsPurgeOffset) {
        if (log.isDebugEnabled()) {
            log.debug("{} updateAbortedTxnsPurgeOffset offset={}", topicPartition, abortedTxnsPurgeOffset);
        }
        if (abortedTxnsPurgeOffset < 0) {
            return;
        }
        this.abortedTxnsPurgeOffset = abortedTxnsPurgeOffset;
    }

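    /**
     * Purges aborted transactions below {@link #abortedTxnsPurgeOffset} if the
     * configured purge interval has elapsed. Returns the number of purged entries.
     */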
    long maybePurgeAbortedTx() {
        if (lastMapOffset == -1 || kafkaTxnPurgeAbortedTxnIntervalSeconds <= 0) {
            return 0;
        }
        long now = System.currentTimeMillis();
        long deltaFromLast = (now - lastPurgeAbortedTxnTime) / 1000;
        if (log.isDebugEnabled()) {
            log.debug("maybePurgeAbortedTx deltaFromLast {} vs kafkaTxnPurgeAbortedTxnIntervalSeconds {} ",
                    deltaFromLast, kafkaTxnPurgeAbortedTxnIntervalSeconds);
        }
        if (deltaFromLast < kafkaTxnPurgeAbortedTxnIntervalSeconds) {
            return 0;
        }
        lastPurgeAbortedTxnTime = now;
        return executePurgeAbortedTx();
    }

    @VisibleForTesting
    long executePurgeAbortedTx() {
        return purgeAbortedTxns(abortedTxnsPurgeOffset);
    }

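    /**
     * Creates a consistent copy of the current state while holding the
     * {@code abortedIndexList} lock, for writing to the snapshot buffer.
     */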
    private ProducerStateManagerSnapshot getProducerStateManagerSnapshot() {
        ProducerStateManagerSnapshot snapshot;
        synchronized (abortedIndexList) {
            snapshot = new ProducerStateManagerSnapshot(
                    topicPartition,
                    kafkaTopicUUID,
                    lastMapOffset,
                    new HashMap<>(producers),
                    new TreeMap<>(ongoingTxns),
                    new ArrayList<>(abortedIndexList));
        }
        if (log.isDebugEnabled()) {
            log.debug("Snapshot for {}: {}", topicPartition, snapshot);
        }
        return snapshot;
    }

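    /**
     * Creates a {@link ProducerAppendInfo} seeded with the last known state
     * of the given producer, to validate and accumulate an incoming append.
     */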
    public ProducerAppendInfo prepareUpdate(Long producerId, PartitionLog.AppendOrigin origin) {
        ProducerStateEntry currentEntry = lastEntry(producerId).orElse(ProducerStateEntry.empty(producerId));
        return new ProducerAppendInfo(topicPartition, producerId, currentEntry, origin);
    }

    /**
     * Compute the last stable offset of a completed transaction, but do not yet mark the transaction complete.
     * That will be done in `completeTxn` below. This is used to compute the LSO that will be appended to the
     * transaction index, but the completion must be done only after successfully appending to the index.
     */
    public long lastStableOffset(CompletedTxn completedTxn) {
        for (TxnMetadata txnMetadata : ongoingTxns.values()) {
            if (completedTxn.producerId() != txnMetadata.producerId) {
                return txnMetadata.firstOffset;
            }
        }
        return completedTxn.lastOffset() + 1;
    }

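    /**
     * Returns the first offset of the earliest ongoing transaction, if any.
     */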
    public Optional<Long> firstUndecidedOffset() {
        Map.Entry<Long, TxnMetadata> entry = ongoingTxns.firstEntry();
        if (log.isDebugEnabled()) {
            log.debug("firstUndecidedOffset {} (ongoingTxns {})", entry, ongoingTxns);
        }
        if (entry == null) {
            return Optional.empty();
        }
        return Optional.of(entry.getValue().firstOffset);
    }

    /**
     * Get the last written entry for the given producer id.
     */
    public Optional<ProducerStateEntry> lastEntry(Long producerId) {
        return Optional.ofNullable(producers.get(producerId));
    }

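    /**
     * Merges the state accumulated in {@code appendInfo} into the per-producer map.
     */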
    void updateProducers(ProducerAppendInfo appendInfo) {
        if (log.isDebugEnabled()) {
            log.debug("Updated producer {} state to {}", appendInfo.producerId, appendInfo);
        }
        if (appendInfo.producerId == RecordBatch.NO_PRODUCER_ID) {
            throw new IllegalArgumentException(String.format("Invalid producer id %s passed to update for %s",
                appendInfo.producerId, topicPartition));
        }

        ProducerStateEntry updatedEntry = appendInfo.toEntry();

        producers.compute(appendInfo.producerId, (pid, stateEntry) -> {
            if (stateEntry == null) {
                stateEntry = updatedEntry;
            } else {
                stateEntry.update(updatedEntry);
            }
            return stateEntry;
        });
    }

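    /**
     * Applies a validated append: updates the producer map and registers any
     * transactions started by this append as ongoing.
     */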
    public void update(ProducerAppendInfo appendInfo) {
        updateProducers(appendInfo);
        synchronized (ongoingTxns) {
            for (TxnMetadata txn : appendInfo.startedTransactions()) {
                ongoingTxns.put(txn.firstOffset, txn);
            }
        }
    }

    public void updateMapEndOffset(long lastOffset) {
        this.lastMapOffset = lastOffset;
    }

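    /**
     * Records a completed transaction in the aborted-transaction index if it
     * was aborted; committed transactions leave the index untouched.
     */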
    public void updateTxnIndex(CompletedTxn completedTxn, long lastStableOffset) {
        if (completedTxn.isAborted()) {
            AbortedTxn abortedTxn = new AbortedTxn(completedTxn.producerId(), completedTxn.firstOffset(),
                    completedTxn.lastOffset(), lastStableOffset);
            synchronized (abortedIndexList) {
                abortedIndexList.add(abortedTxn);
            }
        }
    }

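    /**
     * Removes a completed transaction from the ongoing set. Throws if the
     * transaction was never registered as ongoing.
     */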
    public void completeTxn(CompletedTxn completedTxn) {
        TxnMetadata txnMetadata;
        synchronized (ongoingTxns) {
            txnMetadata = ongoingTxns.remove(completedTxn.firstOffset());
        }
        if (txnMetadata == null) {
            String msg = String.format("Attempted to complete transaction %s on partition "
                    + "%s which was not started.", completedTxn, topicPartition);
            throw new IllegalArgumentException(msg);
        }
    }

    public boolean hasSomeAbortedTransactions() {
        return !abortedIndexList.isEmpty();
    }

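    /**
     * Removes all aborted transactions whose last offset is below the given
     * offset and returns how many were removed.
     */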
    public long purgeAbortedTxns(long offset) {
        AtomicLong count = new AtomicLong();
        synchronized (abortedIndexList) {
            abortedIndexList.removeIf(tx -> {
                boolean toRemove = tx.lastOffset() < offset;
                if (toRemove) {
                    log.info("Transaction {} can be removed (lastOffset {} < {})", tx, tx.lastOffset(), offset);
                    count.incrementAndGet();
                } else {
                    if (log.isDebugEnabled()) {
                        log.debug("Transaction {} cannot be removed ({} >= {})", tx, tx.lastOffset(), offset);
                    }
                }
                return toRemove;
            });
        }
        return count.get();
    }

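    /**
     * Returns the aborted transactions that may overlap a fetch starting at
     * {@code fetchOffset}, in the wire format used by the FETCH response.
     */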
    public List<FetchResponseData.AbortedTransaction> getAbortedIndexList(long fetchOffset) {
        synchronized (abortedIndexList) {
            List<FetchResponseData.AbortedTransaction> abortedTransactions = new ArrayList<>();
            for (AbortedTxn abortedTxn : abortedIndexList) {
                if (abortedTxn.lastOffset() >= fetchOffset) {
                    abortedTransactions.add(
                            new FetchResponseData.AbortedTransaction()
                                    .setProducerId(abortedTxn.producerId())
                                    .setFirstOffset(abortedTxn.firstOffset()));
                }
            }
            return abortedTransactions;
        }
    }

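    /**
     * Returns true if the given offset falls inside an aborted transaction of
     * the given producer.
     */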
    public boolean isTxnAborted(long producerId, long offset) {
        for (AbortedTxn abortedTxn : abortedIndexList) {
            if (producerId == abortedTxn.producerId() && abortedTxn.lastOffset() >= offset
                    && abortedTxn.firstOffset() <= offset) {
                return true;
            }
        }
        return false;
    }
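
    /**
     * Returns the highest position that is safe for transactional consumers to
     * read. The position only advances to the last-add-confirmed position when
     * there are no ongoing transactions and no in-flight publishes.
     */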
    public PositionImpl getMaxReadPosition() {
        final PersistentTopic persistentTopic = partitionLog.getPersistentTopic();
        if (persistentTopic == null) {
            log.warn("[{}] getMaxReadPosition when PersistentTopic is null", partitionLog);
            return PositionImpl.EARLIEST;
        }
        synchronized (ongoingTxns) {
            // Guard against the case where a Pulsar producer has published new messages
            // but maxReadPosition has not been updated yet.
            final PositionImpl lac = (PositionImpl) persistentTopic.getLastPosition();
            // If partitionLog has pending publish operations, skip updating maxReadPosition,
            // since we cannot be sure whether the last message is a transaction message.
            if (ongoingTxns.isEmpty() && partitionLog.getPendingPublishOps().intValue() == 0) {
                maxReadPosition = lac;
            }
            return maxReadPosition;
        }
    }

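    /**
     * Advances the max read position: up to (but excluding) the first message
     * of the earliest ongoing transaction, or to {@code lastPosition} when no
     * transaction is ongoing.
     */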
    public void updateMaxReadPosition(PositionImpl lastPosition) {
        synchronized (ongoingTxns) {
            if (!ongoingTxns.isEmpty()) {
                Position firstPosition = ongoingTxns.firstEntry().getValue().firstPosition;
                ManagedLedgerImpl managedLedger =
                        (ManagedLedgerImpl) partitionLog.getPersistentTopic().getManagedLedger();
                maxReadPosition = managedLedger.getPreviousPosition((PositionImpl) firstPosition);
            } else {
                maxReadPosition = lastPosition;
            }
        }
    }

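    /**
     * Resets the in-memory state when the snapshot offset is below the first
     * offset still available in the log (i.e. the topic has been trimmed),
     * so recovery starts from scratch instead of from a stale snapshot.
     */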
    public void handleMissingDataBeforeRecovery(long minOffset, long snapshotOffset) {
        if (lastMapOffset == -1) {
            // empty topic
            return;
        }
        // topic has been trimmed
        if (snapshotOffset < minOffset) {
            log.info("{} handleMissingDataBeforeRecovery lastMapOffset {} snapshotOffset "
                            + "{} minOffset {} RESETTING STATE",
                    topicPartition, lastMapOffset, snapshotOffset, minOffset);
            // topic was not empty (mapEndOffset has some value)
            // but there is no more data on the topic (trimmed?)
            ongoingTxns.clear();
            abortedIndexList.clear();
            producers.clear();
            lastMapOffset = -1;
        }
    }

}