io.streamnative.pulsar.handlers.kop.storage.ProducerStateManager
Kafka on Pulsar implemented using Pulsar Protocol Handler (pulsar-protocol-handler-kafka)
/**
* Copyright (c) 2019 - 2024 StreamNative, Inc.. All Rights Reserved.
*/
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.streamnative.pulsar.handlers.kop.storage;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.bookkeeper.mledger.Position;
import org.apache.bookkeeper.mledger.impl.ManagedLedgerImpl;
import org.apache.bookkeeper.mledger.impl.PositionImpl;
import org.apache.bookkeeper.util.SafeRunnable;
import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.record.RecordBatch;
import org.apache.pulsar.broker.service.persistent.PersistentTopic;
import org.apache.pulsar.common.util.FutureUtil;
/**
 * Producer state manager: tracks the per-producer state, the ongoing transactions and the aborted
 * transaction index of a single partition, periodically writes snapshots to the
 * ProducerStateManagerSnapshotBuffer, and recovers its state from the latest snapshot plus a replay
 * of the log.
 */
@Slf4j
public class ProducerStateManager {
@Getter
private final String topicPartition;
private final String kafkaTopicUUID;
@Getter
private final Map<Long, ProducerStateEntry> producers = Maps.newConcurrentMap();
// ongoing transactions sorted by the first offset of the transaction
private final TreeMap<Long, TxnMetadata> ongoingTxns = Maps.newTreeMap();
private final List<AbortedTxn> abortedIndexList = new ArrayList<>();
private final ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer;
private final int kafkaTxnProducerStateTopicSnapshotIntervalSeconds;
private final int kafkaTxnPurgeAbortedTxnIntervalSeconds;
private final int kafkaTxnMaxDifferentMessageToSnapshotThreshold;
private volatile long lastMapOffset = -1L;
private volatile long lastSnapOffset = -1L;
private long lastSnapshotTime;
private long lastPurgeAbortedTxnTime;
private volatile long abortedTxnsPurgeOffset = -1;
private PositionImpl maxReadPosition;
@Getter
private final PartitionLog partitionLog;
public ProducerStateManager(String topicPartition,
String kafkaTopicUUID,
ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer,
int kafkaTxnProducerStateTopicSnapshotIntervalSeconds,
int kafkaTxnPurgeAbortedTxnIntervalSeconds,
int kafkaTxnMaxDifferentMessageToSnapshotThreshold,
PartitionLog partitionLog) {
this.topicPartition = topicPartition;
this.kafkaTopicUUID = kafkaTopicUUID;
this.producerStateManagerSnapshotBuffer = producerStateManagerSnapshotBuffer;
this.kafkaTxnProducerStateTopicSnapshotIntervalSeconds = kafkaTxnProducerStateTopicSnapshotIntervalSeconds;
this.kafkaTxnPurgeAbortedTxnIntervalSeconds = kafkaTxnPurgeAbortedTxnIntervalSeconds;
this.kafkaTxnMaxDifferentMessageToSnapshotThreshold = kafkaTxnMaxDifferentMessageToSnapshotThreshold;
this.lastSnapshotTime = System.currentTimeMillis();
this.lastPurgeAbortedTxnTime = System.currentTimeMillis();
this.maxReadPosition = PositionImpl.EARLIEST;
this.partitionLog = partitionLog;
}
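/**
 * Recover the producer state: read the latest snapshot from the snapshot buffer (falling back to a
 * full replay of the log if the read fails) and then replay the topic entries after the snapshot
 * offset. A minimal, hypothetical caller sketch (the real call site lives in PartitionLog; the
 * executor name is illustrative):
 * <pre>{@code
 * CompletableFuture<Void> ready = producerStateManager.recover(partitionLog, recoveryExecutor);
 * ready.thenRun(() -> log.info("producer state ready"));
 * }</pre>
 */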
public CompletableFuture<Void> recover(PartitionLog partitionLog, Executor executor) {
return producerStateManagerSnapshotBuffer
.readLatestSnapshot(topicPartition)
.exceptionally(e -> {
log.error("Failed to read snapshot for {} from storage, trying to replay the log to recover.",
topicPartition, e);
return null;
})
.thenCompose(snapshot -> applySnapshotAndRecover(snapshot, partitionLog, executor));
}
private CompletableFuture<Void> applySnapshotAndRecover(ProducerStateManagerSnapshot snapshot,
PartitionLog partitionLog,
Executor executor) {
if (snapshot != null && kafkaTopicUUID != null
&& !kafkaTopicUUID.equals(snapshot.topicUUID())) {
log.info("The latest snapshot for topic {} was for UUID {} that is different from {}. "
+ "Ignoring it (topic has been re-created)", topicPartition, snapshot.topicUUID(),
kafkaTopicUUID);
snapshot = null;
}
long offSetPosition = 0;
synchronized (abortedIndexList) {
this.abortedIndexList.clear();
this.producers.clear();
this.ongoingTxns.clear();
if (snapshot != null) {
this.abortedIndexList.addAll(snapshot.abortedIndexList());
this.producers.putAll(snapshot.producers());
this.ongoingTxns.putAll(snapshot.ongoingTxns());
this.lastSnapOffset = snapshot.offset();
this.lastMapOffset = lastSnapOffset;
offSetPosition = snapshot.offset();
log.info("Recover topic {} from offset {}", topicPartition, offSetPosition);
log.info("ongoingTxns transactions after recovery {}", snapshot.ongoingTxns());
log.info("Aborted transactions after recovery {}", snapshot.abortedIndexList());
} else {
log.info("No snapshot found for topic {}, recovering from the beginning", topicPartition);
}
}
long startRecovery = System.currentTimeMillis();
// recover from log
long finalOffSetPosition = offSetPosition;
return recoverFirstPositionInOngoingTxns(partitionLog.getPersistentTopic()).thenCompose(__ -> {
return partitionLog.recoverTxEntries(finalOffSetPosition, executor)
.thenCompose(numEntries -> {
updateMaxReadPosition((PositionImpl) partitionLog.getLastPosition());
log.info("Recovery of {} finished. Scanned {} entries, time {} ms, new lastMapOffset {}",
topicPartition,
numEntries,
System.currentTimeMillis() - startRecovery,
lastMapOffset);
// Take a snapshot after recovering the producer state from the log
if (numEntries == 0) {
return CompletableFuture.completedFuture(null);
}
return takeSnapshot(executor).exceptionally(e -> {
log.error("Failed to take snapshot after recovery for {} from log", topicPartition, e);
return null;
}).thenApply(___ -> null);
});
});
}
private CompletableFuture<Void> recoverFirstPositionInOngoingTxns(@Nullable PersistentTopic persistentTopic) {
if (persistentTopic == null) {
return CompletableFuture.failedFuture(
new IllegalStateException("PersistentTopic is null in recoverFirstPositionInOngoingTxns"));
}
List<CompletableFuture<Void>> futures = new ArrayList<>();
this.ongoingTxns.forEach((__, txnMetadata) -> {
if (txnMetadata.firstPosition != null) {
return;
}
CompletableFuture<Void> future = MessageMetadataUtils.asyncFindPosition(persistentTopic.getManagedLedger(),
txnMetadata.firstOffset, true)
.thenAccept(position -> txnMetadata.firstPosition = position);
futures.add(future);
});
return FutureUtil.waitForAll(futures);
}
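/**
 * Take a snapshot of the current producer state and write it to the snapshot buffer on the given
 * executor. Completes with {@code null} when there is nothing new to snapshot (no map offset yet,
 * or no offset newer than the last snapshot), otherwise completes with the written snapshot.
 */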
@VisibleForTesting
public CompletableFuture<ProducerStateManagerSnapshot> takeSnapshot(Executor executor) {
CompletableFuture<ProducerStateManagerSnapshot> result = new CompletableFuture<>();
executor.execute(new SafeRunnable() {
@Override
public void safeRun() {
if (lastMapOffset == -1) {
result.complete(null);
return;
}
// If not a new offset, then it is not worth taking another snapshot
if (lastMapOffset <= lastSnapOffset) {
result.complete(null);
return;
}
ProducerStateManagerSnapshot snapshot = getProducerStateManagerSnapshot();
log.info("Taking snapshot for {} at {}", topicPartition, snapshot);
producerStateManagerSnapshotBuffer
.write(snapshot)
.whenComplete((res, error) -> {
if (error != null) {
result.completeExceptionally(error);
} else {
if (log.isDebugEnabled()) {
log.debug("Snapshot for {} ({}) taken at offset {}",
topicPartition, kafkaTopicUUID, snapshot.offset());
}
lastSnapOffset = snapshot.offset();
result.complete(snapshot);
}
});
}
});
return result;
}
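/**
 * Take a snapshot if the configured thresholds are met: either enough new offsets have accumulated
 * since the last snapshot ({@code kafkaTxnMaxDifferentMessageToSnapshotThreshold}) or the snapshot
 * interval ({@code kafkaTxnProducerStateTopicSnapshotIntervalSeconds}) has elapsed.
 */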
void maybeTakeSnapshot(Executor executor) {
if (lastMapOffset == -1 || kafkaTxnProducerStateTopicSnapshotIntervalSeconds <= 0) {
return;
}
if (lastMapOffset - lastSnapOffset >= kafkaTxnMaxDifferentMessageToSnapshotThreshold) {
takeSnapshot(executor);
return;
}
long now = System.currentTimeMillis();
long deltaFromLast = (now - lastSnapshotTime) / 1000;
if (log.isDebugEnabled()) {
log.debug("maybeTakeSnapshot deltaFromLast {} vs kafkaTxnProducerStateTopicSnapshotIntervalSeconds {} ",
deltaFromLast, kafkaTxnProducerStateTopicSnapshotIntervalSeconds);
}
if (deltaFromLast < kafkaTxnProducerStateTopicSnapshotIntervalSeconds) {
return;
}
lastSnapshotTime = now;
takeSnapshot(executor);
}
void updateAbortedTxnsPurgeOffset(long abortedTxnsPurgeOffset) {
if (log.isDebugEnabled()) {
log.debug("{} updateAbortedTxnsPurgeOffset offset={}", topicPartition, abortedTxnsPurgeOffset);
}
if (abortedTxnsPurgeOffset < 0) {
return;
}
this.abortedTxnsPurgeOffset = abortedTxnsPurgeOffset;
}
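/**
 * Purge aborted transactions below {@code abortedTxnsPurgeOffset} if the purge interval
 * ({@code kafkaTxnPurgeAbortedTxnIntervalSeconds}) has elapsed. Returns the number of purged
 * entries, or 0 if no purge was attempted.
 */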
long maybePurgeAbortedTx() {
if (lastMapOffset == -1 || kafkaTxnPurgeAbortedTxnIntervalSeconds <= 0) {
return 0;
}
long now = System.currentTimeMillis();
long deltaFromLast = (now - lastPurgeAbortedTxnTime) / 1000;
if (log.isDebugEnabled()) {
log.debug("maybePurgeAbortedTx deltaFromLast {} vs kafkaTxnPurgeAbortedTxnIntervalSeconds {} ",
deltaFromLast, kafkaTxnPurgeAbortedTxnIntervalSeconds);
}
if (deltaFromLast < kafkaTxnPurgeAbortedTxnIntervalSeconds) {
return 0;
}
lastPurgeAbortedTxnTime = now;
return executePurgeAbortedTx();
}
@VisibleForTesting
long executePurgeAbortedTx() {
return purgeAbortedTxns(abortedTxnsPurgeOffset);
}
private ProducerStateManagerSnapshot getProducerStateManagerSnapshot() {
ProducerStateManagerSnapshot snapshot;
synchronized (abortedIndexList) {
snapshot = new ProducerStateManagerSnapshot(
topicPartition,
kafkaTopicUUID,
lastMapOffset,
new HashMap<>(producers),
new TreeMap<>(ongoingTxns),
new ArrayList<>(abortedIndexList));
}
if (log.isDebugEnabled()) {
log.debug("Snapshot for {}: {}", topicPartition, snapshot);
}
return snapshot;
}
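/**
 * Build a {@link ProducerAppendInfo} seeded with the last known state of the given producer, to be
 * used when applying a new append for that producer.
 */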
public ProducerAppendInfo prepareUpdate(Long producerId, PartitionLog.AppendOrigin origin) {
ProducerStateEntry currentEntry = lastEntry(producerId).orElse(ProducerStateEntry.empty(producerId));
return new ProducerAppendInfo(topicPartition, producerId, currentEntry, origin);
}
/**
* Compute the last stable offset of a completed transaction, but do not yet mark the transaction complete.
* That will be done in `completeTxn` below. This is used to compute the LSO that will be appended to the
* transaction index, but the completion must be done only after successfully appending to the index.
*/
public long lastStableOffset(CompletedTxn completedTxn) {
for (TxnMetadata txnMetadata : ongoingTxns.values()) {
if (completedTxn.producerId() != txnMetadata.producerId) {
return txnMetadata.firstOffset;
}
}
return completedTxn.lastOffset() + 1;
}
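/**
 * Return the first offset of the earliest ongoing (undecided) transaction, if any. This bounds the
 * last stable offset exposed to READ_COMMITTED consumers.
 */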
public Optional<Long> firstUndecidedOffset() {
Map.Entry<Long, TxnMetadata> entry = ongoingTxns.firstEntry();
if (log.isDebugEnabled()) {
log.debug("firstUndecidedOffset {} (ongoingTxns {})", entry, ongoingTxns);
}
if (entry == null) {
return Optional.empty();
}
return Optional.of(entry.getValue().firstOffset);
}
/**
* Get the last written entry for the given producer id.
*/
public Optional<ProducerStateEntry> lastEntry(Long producerId) {
if (!producers.containsKey(producerId)) {
return Optional.empty();
}
return Optional.of(producers.get(producerId));
}
void updateProducers(ProducerAppendInfo appendInfo) {
if (log.isDebugEnabled()) {
log.debug("Updated producer {} state to {}", appendInfo.producerId, appendInfo);
}
if (appendInfo.producerId == RecordBatch.NO_PRODUCER_ID) {
throw new IllegalArgumentException(String.format("Invalid producer id %s passed to update for %s",
appendInfo.producerId, topicPartition));
}
ProducerStateEntry updatedEntry = appendInfo.toEntry();
producers.compute(appendInfo.producerId, (pid, stateEntry) -> {
if (stateEntry == null) {
stateEntry = updatedEntry;
} else {
stateEntry.update(updatedEntry);
}
return stateEntry;
});
}
public void update(ProducerAppendInfo appendInfo) {
updateProducers(appendInfo);
synchronized (ongoingTxns) {
for (TxnMetadata txn : appendInfo.startedTransactions()) {
ongoingTxns.put(txn.firstOffset, txn);
}
}
}
public void updateMapEndOffset(long lastOffset) {
this.lastMapOffset = lastOffset;
}
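/**
 * Record a completed transaction in the aborted-transaction index when it was aborted; committed
 * transactions leave the index untouched.
 */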
public void updateTxnIndex(CompletedTxn completedTxn, long lastStableOffset) {
if (completedTxn.isAborted()) {
AbortedTxn abortedTxn = new AbortedTxn(completedTxn.producerId(), completedTxn.firstOffset(),
completedTxn.lastOffset(), lastStableOffset);
synchronized (abortedIndexList) {
abortedIndexList.add(abortedTxn);
}
}
}
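/**
 * Remove a completed transaction from the ongoing-transaction map. Throws if the transaction was
 * never tracked as ongoing for this partition.
 */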
public void completeTxn(CompletedTxn completedTxn) {
TxnMetadata txnMetadata;
synchronized (ongoingTxns) {
txnMetadata = ongoingTxns.remove(completedTxn.firstOffset());
}
if (txnMetadata == null) {
String msg = String.format("Attempted to complete transaction %s on partition "
+ "%s which was not started.", completedTxn, topicPartition);
throw new IllegalArgumentException(msg);
}
}
public boolean hasSomeAbortedTransactions() {
return !abortedIndexList.isEmpty();
}
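/**
 * Remove every aborted transaction whose last offset is below the given offset and return how many
 * entries were removed.
 */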
public long purgeAbortedTxns(long offset) {
AtomicLong count = new AtomicLong();
synchronized (abortedIndexList) {
abortedIndexList.removeIf(tx -> {
boolean toRemove = tx.lastOffset() < offset;
if (toRemove) {
log.info("Transaction {} can be removed (lastOffset {} < {})", tx, tx.lastOffset(), offset);
count.incrementAndGet();
} else {
if (log.isDebugEnabled()) {
log.info("Transaction {} cannot be removed ({} >= {})", tx, tx.lastOffset(), offset);
}
}
return toRemove;
});
}
return count.get();
}
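/**
 * Return the aborted transactions whose last offset is at or beyond {@code fetchOffset}, in the
 * shape expected by the Kafka FETCH response.
 */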
public List<FetchResponseData.AbortedTransaction> getAbortedIndexList(long fetchOffset) {
synchronized (abortedIndexList) {
List<FetchResponseData.AbortedTransaction> abortedTransactions = new ArrayList<>();
for (AbortedTxn abortedTxn : abortedIndexList) {
if (abortedTxn.lastOffset() >= fetchOffset) {
abortedTransactions.add(
new FetchResponseData.AbortedTransaction()
.setProducerId(abortedTxn.producerId())
.setFirstOffset(abortedTxn.firstOffset()));
}
}
return abortedTransactions;
}
}
public boolean isTxnAborted(long producerId, long offset) {
for (AbortedTxn abortedTxn : abortedIndexList) {
if (producerId == abortedTxn.producerId() && abortedTxn.lastOffset() >= offset
&& abortedTxn.firstOffset() <= offset) {
return true;
}
}
return false;
}
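/**
 * Return the max read position for this partition. When there are no ongoing transactions and no
 * pending publish operations, it is first advanced to the topic's last position.
 */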
public PositionImpl getMaxReadPosition() {
final PersistentTopic persistentTopic = partitionLog.getPersistentTopic();
if (persistentTopic == null) {
log.warn("[{}] getMaxReadPosition when PersistentTopic is null", partitionLog);
return PositionImpl.EARLIEST;
}
synchronized (ongoingTxns) {
// Guard against the case where Pulsar producers keep publishing new messages while maxReadPosition is never updated
final PositionImpl lac = (PositionImpl) persistentTopic.getLastPosition();
// If partitionLog has pending publish operations, skip updating maxReadPosition,
// since we cannot be sure whether the last message is a transaction message.
if (ongoingTxns.isEmpty() && partitionLog.getPendingPublishOps().intValue() == 0) {
maxReadPosition = lac;
}
return maxReadPosition;
}
}
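/**
 * Advance the max read position: to the position just before the earliest ongoing transaction when
 * one exists, otherwise to the given last position.
 */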
public void updateMaxReadPosition(PositionImpl lastPosition) {
synchronized (ongoingTxns) {
if (!ongoingTxns.isEmpty()) {
Position firstPosition = ongoingTxns.firstEntry().getValue().firstPosition;
ManagedLedgerImpl managedLedger =
(ManagedLedgerImpl) partitionLog.getPersistentTopic().getManagedLedger();
maxReadPosition = managedLedger.getPreviousPosition((PositionImpl) firstPosition);
} else {
maxReadPosition = lastPosition;
}
}
}
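/**
 * Reset the in-memory state when the snapshot offset points before the first offset still present
 * in the topic (for example after the topic was trimmed), so recovery does not rely on data that
 * no longer exists.
 */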
public void handleMissingDataBeforeRecovery(long minOffset, long snapshotOffset) {
if (lastMapOffset == -1) {
// empty topic
return;
}
// topic has been trimmed
if (snapshotOffset < minOffset) {
log.info("{} handleMissingDataBeforeRecovery lastMapOffset {} snapshotOffset "
+ "{} minOffset {} RESETTING STATE",
topicPartition, lastMapOffset, snapshotOffset, minOffset);
// topic was not empty (lastMapOffset has some value)
// but there is no more data on the topic (trimmed?)
ongoingTxns.clear();
abortedIndexList.clear();
producers.clear();
lastMapOffset = -1;
}
}
}