/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.streamnative.pulsar.handlers.kop.storage;
import static com.google.common.base.Preconditions.checkArgument;
import static io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils.isBrokerIndexMetadataInterceptorConfigured;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import io.netty.buffer.ByteBuf;
import io.netty.channel.ChannelHandlerContext;
import io.netty.util.Recycler;
import io.netty.util.concurrent.EventExecutor;
import io.streamnative.pulsar.handlers.kop.KafkaServiceConfiguration;
import io.streamnative.pulsar.handlers.kop.KafkaTopicConsumerManager;
import io.streamnative.pulsar.handlers.kop.KafkaTopicLookupService;
import io.streamnative.pulsar.handlers.kop.KafkaTopicManager;
import io.streamnative.pulsar.handlers.kop.MessageFetchContext;
import io.streamnative.pulsar.handlers.kop.MessagePublishContext;
import io.streamnative.pulsar.handlers.kop.PendingTopicFutures;
import io.streamnative.pulsar.handlers.kop.RequestStats;
import io.streamnative.pulsar.handlers.kop.exceptions.KoPTopicInitializeException;
import io.streamnative.pulsar.handlers.kop.exceptions.MetadataCorruptedException;
import io.streamnative.pulsar.handlers.kop.format.DecodeResult;
import io.streamnative.pulsar.handlers.kop.format.EncodeRequest;
import io.streamnative.pulsar.handlers.kop.format.EncodeResult;
import io.streamnative.pulsar.handlers.kop.format.EntryFormatter;
import io.streamnative.pulsar.handlers.kop.format.EntryFormatterFactory;
import io.streamnative.pulsar.handlers.kop.format.KafkaMixedEntryFormatter;
import io.streamnative.pulsar.handlers.kop.format.PulsarEntryFormatter;
import io.streamnative.pulsar.handlers.kop.utils.KopLogValidator;
import io.streamnative.pulsar.handlers.kop.utils.MessageMetadataUtils;
import io.streamnative.pulsar.handlers.kop.utils.MetadataUtils;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.StringJoiner;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.Getter;
import lombok.ToString;
import lombok.experimental.Accessors;
import lombok.extern.slf4j.Slf4j;
import org.apache.bookkeeper.common.util.MathUtils;
import org.apache.bookkeeper.common.util.OrderedExecutor;
import org.apache.bookkeeper.mledger.AsyncCallbacks;
import org.apache.bookkeeper.mledger.Entry;
import org.apache.bookkeeper.mledger.ManagedCursor;
import org.apache.bookkeeper.mledger.ManagedLedger;
import org.apache.bookkeeper.mledger.ManagedLedgerException;
import org.apache.bookkeeper.mledger.Position;
import org.apache.bookkeeper.mledger.impl.ManagedLedgerImpl;
import org.apache.bookkeeper.mledger.impl.NonDurableCursorImpl;
import org.apache.bookkeeper.mledger.impl.PositionImpl;
import org.apache.bookkeeper.stats.OpStatsLogger;
import org.apache.commons.compress.utils.Lists;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.kafka.common.InvalidRecordException;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.CorruptRecordException;
import org.apache.kafka.common.errors.InvalidProducerEpochException;
import org.apache.kafka.common.errors.KafkaStorageException;
import org.apache.kafka.common.errors.NotLeaderOrFollowerException;
import org.apache.kafka.common.errors.RecordTooLargeException;
import org.apache.kafka.common.errors.UnknownServerException;
import org.apache.kafka.common.message.FetchRequestData;
import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.protocol.Errors;
import org.apache.kafka.common.record.CompressionType;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.record.Record;
import org.apache.kafka.common.record.RecordBatch;
import org.apache.kafka.common.requests.FetchResponse;
import org.apache.kafka.common.utils.Time;
import org.apache.pulsar.broker.service.Topic;
import org.apache.pulsar.broker.service.persistent.PersistentTopic;
import org.apache.pulsar.broker.service.plugin.EntryFilter;
import org.apache.pulsar.common.naming.TopicName;
import org.apache.pulsar.common.util.FutureUtil;
/**
* Result of analyzing and validating producer state: the updated producer append info
* and the transactions completed by the analyzed records.
*/
@Data
@Accessors(fluent = true)
@AllArgsConstructor
class AnalyzeResult {
private Map<Long, ProducerAppendInfo> updatedProducers;
private List<CompletedTxn> completedTxns;
}
/**
* An append-only log for storing messages. Mapping to Kafka Log.scala.
*/
@Slf4j
public class PartitionLog {
public static final String KAFKA_TOPIC_UUID_PROPERTY_NAME = "kafkaTopicUUID";
public static final String KAFKA_ENTRY_FORMATTER_PROPERTY_NAME = "kafkaEntryFormat";
private static final String PID_PREFIX = "KOP-PID-PREFIX";
private static final KopLogValidator.CompressionCodec DEFAULT_COMPRESSION =
new KopLogValidator.CompressionCodec(CompressionType.NONE.name, CompressionType.NONE.id);
private final KafkaServiceConfiguration kafkaConfig;
private final RequestStats requestStats;
private final Time time;
private final TopicPartition topicPartition;
private final String fullPartitionName;
@Getter
private volatile ProducerStateManager producerStateManager;
private final List<EntryFilter> entryFilters;
private final boolean preciseTopicPublishRateLimitingEnable;
private final KafkaTopicLookupService kafkaTopicLookupService;
private final ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer;
private final ExecutorService recoveryExecutor;
@Getter
private volatile PersistentTopic persistentTopic;
private final CompletableFuture<PartitionLog> initFuture = new CompletableFuture<>();
private volatile Map<String, String> topicProperties;
private volatile EntryFormatter entryFormatter;
private volatile String kafkaTopicUUID;
private volatile AtomicBoolean unloaded = new AtomicBoolean();
public PartitionLog(KafkaServiceConfiguration kafkaConfig,
RequestStats requestStats,
Time time,
TopicPartition topicPartition,
String fullPartitionName,
List<EntryFilter> entryFilters,
KafkaTopicLookupService kafkaTopicLookupService,
ProducerStateManagerSnapshotBuffer producerStateManagerSnapshotBuffer,
OrderedExecutor recoveryExecutor) {
this.kafkaConfig = kafkaConfig;
this.entryFilters = entryFilters;
this.requestStats = requestStats;
this.time = time;
this.topicPartition = topicPartition;
this.fullPartitionName = fullPartitionName;
this.preciseTopicPublishRateLimitingEnable = kafkaConfig.isPreciseTopicPublishRateLimiterEnable();
this.kafkaTopicLookupService = kafkaTopicLookupService;
this.producerStateManagerSnapshotBuffer = producerStateManagerSnapshotBuffer;
this.recoveryExecutor = recoveryExecutor.chooseThread(fullPartitionName);
}
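/**
 * Initialise this partition log: load the topic properties, pick the entry formatter and,
 * when the Kafka transaction coordinator is enabled, recover the producer state before the
 * returned future completes.
 */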
public CompletableFuture<PartitionLog> initialise() {
loadTopicProperties().whenComplete((___, errorLoadTopic) -> {
if (errorLoadTopic != null) {
initFuture.completeExceptionally(new KoPTopicInitializeException(errorLoadTopic));
return;
}
if (kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
producerStateManager
.recover(this, recoveryExecutor)
.thenRun(() -> initFuture.complete(this))
.exceptionally(error -> {
initFuture.completeExceptionally(new KoPTopicInitializeException(error));
return null;
});
} else {
initFuture.complete(this);
}
});
return initFuture;
}
public CompletableFuture<PartitionLog> awaitInitialisation() {
return initFuture;
}
public boolean isInitialised() {
return initFuture.isDone() && !initFuture.isCompletedExceptionally();
}
public boolean isInitialisationFailed() {
return initFuture.isDone() && initFuture.isCompletedExceptionally();
}
public void markAsUnloaded() {
unloaded.set(true);
}
private CompletableFuture<Void> loadTopicProperties() {
CompletableFuture<Optional<PersistentTopic>> persistentTopicFuture =
kafkaTopicLookupService.getTopic(fullPartitionName, this);
return persistentTopicFuture
.thenCompose(this::fetchTopicProperties)
.thenAccept(properties -> {
this.topicProperties = properties;
log.info("Topic properties for {} are {}", fullPartitionName, properties);
if (topicPartition.topic().endsWith(kafkaConfig.getKafkaMetadataNamespace()
+ "/" + org.apache.kafka.common.internals.Topic.GROUP_METADATA_TOPIC_NAME)) {
this.entryFormatter = PulsarEntryFormatter.offsetControlRecordFormatter();
} else {
this.entryFormatter = buildEntryFormatter(topicProperties);
}
this.kafkaTopicUUID = properties.get(KAFKA_TOPIC_UUID_PROPERTY_NAME);
if (!MetadataUtils.isSystemTopic(persistentTopic, kafkaConfig)) {
checkArgument(this.kafkaTopicUUID != null);
}
this.producerStateManager =
new ProducerStateManager(
fullPartitionName,
kafkaTopicUUID,
producerStateManagerSnapshotBuffer,
kafkaConfig.getKafkaTxnProducerStateTopicSnapshotIntervalSeconds(),
kafkaConfig.getKafkaTxnPurgeAbortedTxnIntervalSeconds());
});
}
private CompletableFuture<Map<String, String>> fetchTopicProperties(Optional<PersistentTopic> persistentTopic) {
if (!persistentTopic.isPresent()) {
log.info("Topic {} not loaded here", fullPartitionName);
return FutureUtil.failedFuture(new NotLeaderOrFollowerException());
}
this.persistentTopic = persistentTopic.get();
TopicName logicalName = TopicName.get(persistentTopic.get().getName());
TopicName actualName;
if (logicalName.isPartitioned()) {
actualName = TopicName.getPartitionedTopicName(persistentTopic.get().getName());
} else {
actualName = logicalName;
}
return persistentTopic.get().getBrokerService()
.fetchPartitionedTopicMetadataAsync(actualName)
.thenApply(metadata -> {
if (metadata.partitions > 0) {
return metadata.properties;
} else {
return persistentTopic.get().getManagedLedger().getProperties();
}
})
.thenApply(map -> map != null ? map : Collections.emptyMap());
}
private EntryFormatter buildEntryFormatter(Map<String, String> topicProperties) {
final String entryFormat;
if (topicProperties != null) {
entryFormat = topicProperties
.getOrDefault(KAFKA_ENTRY_FORMATTER_PROPERTY_NAME, kafkaConfig.getEntryFormat());
} else {
entryFormat = kafkaConfig.getEntryFormat();
}
if (log.isDebugEnabled()) {
log.debug("entryFormat for {} is {} (topicProperties {})", fullPartitionName,
entryFormat, topicProperties);
}
return EntryFormatterFactory.create(kafkaConfig, entryFilters, entryFormat);
}
@Data
@Accessors(fluent = true)
@AllArgsConstructor
public static class LogAppendInfo {
private Optional<Long> firstOffset;
private Optional<Long> producerId;
private short producerEpoch;
private int numMessages;
private int shallowCount;
private boolean isTransaction;
private boolean isControlBatch;
private int validBytes;
private int firstSequence;
private int lastSequence;
private KopLogValidator.CompressionCodec sourceCodec;
private KopLogValidator.CompressionCodec targetCodec;
}
@Data
@ToString
@Accessors(fluent = true)
@AllArgsConstructor
public static class ReadRecordsResult {
private static final Recycler<ReadRecordsResult> RECYCLER = new Recycler<ReadRecordsResult>() {
protected ReadRecordsResult newObject(Handle<ReadRecordsResult> handle) {
return new ReadRecordsResult(handle);
}
};
private final Recycler.Handle<ReadRecordsResult> recyclerHandle;
private DecodeResult decodeResult;
private List<FetchResponseData.AbortedTransaction> abortedTransactions;
private long highWatermark;
private long lastStableOffset;
private Position lastPosition;
private Errors errors;
private PartitionLog partitionLog;
private ReadRecordsResult(Recycler.Handle<ReadRecordsResult> recyclerHandle) {
this.recyclerHandle = recyclerHandle;
}
public Errors errors() {
return errors == null ? Errors.NONE : errors;
}
public static ReadRecordsResult get(DecodeResult decodeResult,
List<FetchResponseData.AbortedTransaction> abortedTransactions,
long highWatermark,
long lastStableOffset,
Position lastPosition,
PartitionLog partitionLog) {
return ReadRecordsResult.get(
decodeResult,
abortedTransactions,
highWatermark,
lastStableOffset,
lastPosition,
null,
partitionLog);
}
public static ReadRecordsResult get(DecodeResult decodeResult,
List<FetchResponseData.AbortedTransaction> abortedTransactions,
long highWatermark,
long lastStableOffset,
Position lastPosition,
Errors errors,
PartitionLog partitionLog) {
ReadRecordsResult readRecordsResult = RECYCLER.get();
readRecordsResult.decodeResult = decodeResult;
readRecordsResult.abortedTransactions = abortedTransactions;
readRecordsResult.highWatermark = highWatermark;
readRecordsResult.lastStableOffset = lastStableOffset;
readRecordsResult.lastPosition = lastPosition;
readRecordsResult.errors = errors;
readRecordsResult.partitionLog = partitionLog;
return readRecordsResult;
}
public static ReadRecordsResult empty(long highWatermark,
long lastStableOffset,
Position lastPosition,
PartitionLog partitionLog) {
return ReadRecordsResult.get(
DecodeResult.get(MemoryRecords.EMPTY),
Collections.emptyList(),
highWatermark,
lastStableOffset,
lastPosition,
partitionLog);
}
public static ReadRecordsResult error(Errors errors, PartitionLog partitionLog) {
return ReadRecordsResult.error(PositionImpl.EARLIEST, errors, partitionLog);
}
public static ReadRecordsResult error(Position position, Errors errors, PartitionLog partitionLog) {
return ReadRecordsResult.get(null,
null,
-1,
-1,
position,
errors,
partitionLog);
}
public FetchResponseData.PartitionData toPartitionData() {
// There are three cases:
//
// 1. errors == null :
// The decode result count > 0
// 2. errors == ERROR.NONE :
// Get the empty result.
// 3. errors == Others error :
// Get errors.
if (errors != null) {
return new FetchResponseData.PartitionData()
.setErrorCode(errors.code())
.setHighWatermark(FetchResponse.INVALID_HIGH_WATERMARK)
.setLastStableOffset(FetchResponse.INVALID_LAST_STABLE_OFFSET)
.setLogStartOffset(FetchResponse.INVALID_LOG_START_OFFSET)
.setRecords(MemoryRecords.EMPTY);
}
return new FetchResponseData.PartitionData()
.setErrorCode(Errors.NONE.code())
.setHighWatermark(highWatermark)
.setLastStableOffset(lastStableOffset)
.setHighWatermark(highWatermark) // TODO: should it be changed to the logStartOffset?
.setAbortedTransactions(abortedTransactions)
.setRecords(decodeResult.getRecords());
}
public void recycle() {
this.errors = null;
this.lastPosition = null;
this.lastStableOffset = -1;
this.highWatermark = -1;
this.abortedTransactions = null;
this.partitionLog = null;
if (this.decodeResult != null) {
this.decodeResult.recycle();
this.decodeResult = null;
}
}
}
/**
* AppendOrigin is used to mark the origin of the data being appended.
*/
public enum AppendOrigin {
Coordinator,
Client,
Log
}
// TODO: the first and last offset only make sense here if there is only a single completed txn.
// It might make sense to refactor this method.
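/**
 * Collect the producer state updates for every batch that carries a producer id, together
 * with the transactions completed by these records.
 */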
public AnalyzeResult analyzeAndValidateProducerState(MemoryRecords records,
Optional<Long> firstOffset,
Long lastOffset,
AppendOrigin origin) {
Map<Long, ProducerAppendInfo> updatedProducers = Maps.newHashMap();
return analyzeAndValidateProducerState(records, updatedProducers, firstOffset, lastOffset, origin);
}
public AnalyzeResult analyzeAndValidateProducerState(MemoryRecords records,
Map<Long, ProducerAppendInfo> updatedProducers,
Optional<Long> firstOffset,
Long lastOffset,
AppendOrigin origin) {
List<CompletedTxn> completedTxns = Lists.newArrayList();
for (RecordBatch batch : records.batches()) {
if (batch.hasProducerId()) {
// We cache offset metadata for the start of each transaction. This allows us to
// compute the last stable offset without relying on additional index lookups.
Optional<CompletedTxn> maybeCompletedTxn =
updateProducers(batch, updatedProducers, firstOffset, origin);
maybeCompletedTxn.ifPresent(txn -> {
if (lastOffset != null) {
txn.lastOffset(lastOffset);
}
completedTxns.add(txn);
});
}
}
return new AnalyzeResult(updatedProducers, completedTxns);
}
public Map<Long, ProducerAppendInfo> analyzeAndCheckProducerEpochBeforeUpdate(
MemoryRecords records, AppendOrigin origin) throws InvalidProducerEpochException {
Map<Long, ProducerAppendInfo> updatedProducers = Maps.newHashMap();
for (RecordBatch batch : records.batches()) {
if (batch.hasProducerId()) {
checkProducerEpochBeforeUpdate(batch, updatedProducers, origin);
}
}
return updatedProducers;
}
private Optional<CompletedTxn> updateProducers(
RecordBatch batch,
Map<Long, ProducerAppendInfo> producers,
Optional<Long> firstOffset,
AppendOrigin origin) {
Long producerId = batch.producerId();
ProducerAppendInfo appendInfo =
producers.computeIfAbsent(producerId, pid -> producerStateManager.prepareUpdate(pid, origin));
return appendInfo.append(batch, firstOffset);
}
private void checkProducerEpochBeforeUpdate(RecordBatch batch,
Map<Long, ProducerAppendInfo> producers,
AppendOrigin origin) throws InvalidProducerEpochException {
Long producerId = batch.producerId();
ProducerAppendInfo appendInfo =
producers.computeIfAbsent(producerId, pid -> producerStateManager.prepareUpdate(pid, origin));
if (appendInfo != null) {
appendInfo.checkProducerEpoch(batch.producerEpoch());
}
}
public Optional<Long> firstUndecidedOffset() {
return producerStateManager.firstUndecidedOffset();
}
public List<FetchResponseData.AbortedTransaction> getAbortedIndexList(long fetchOffset) {
return producerStateManager.getAbortedIndexList(fetchOffset);
}
/**
* Append this message to pulsar.
*
* @param records The log records to append
* @param origin Declares the origin of the append, which affects the required validations
* @param appendRecordsContext See {@link AppendRecordsContext}
*/
public CompletableFuture<Long> appendRecords(final MemoryRecords records,
final AppendOrigin origin,
final AppendRecordsContext appendRecordsContext) {
CompletableFuture<Long> appendFuture = new CompletableFuture<>();
KafkaTopicManager topicManager = appendRecordsContext.getTopicManager();
if (topicManager == null) {
log.error("topicManager is null for {}", fullPartitionName,
new Exception("topicManager is null for " + fullPartitionName).fillInStackTrace());
return CompletableFuture
.failedFuture(new KafkaStorageException("topicManager is null for " + fullPartitionName));
}
final long beforeRecordsProcess = time.nanoseconds();
try {
final LogAppendInfo appendInfo = analyzeAndValidateRecords(records);
// return if we have no valid messages or if this is a duplicate of the last appended entry
if (appendInfo.shallowCount() == 0) {
appendFuture.complete(appendInfo.firstOffset().orElse(-1L));
return appendFuture;
}
MemoryRecords validRecords = trimInvalidBytes(records, appendInfo);
// Append Message into pulsar
final long startEnqueueNanos = MathUtils.nowInNano();
final Consumer<PartitionLog> sequentialExecutor = __ -> {
long messageQueuedLatencyNanos = MathUtils.elapsedNanos(startEnqueueNanos);
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getMessageQueuedLatencyStats().registerSuccessfulEvent(
messageQueuedLatencyNanos, TimeUnit.NANOSECONDS);
});
Map<Long, ProducerAppendInfo> updatedProducers;
try {
updatedProducers = analyzeAndCheckProducerEpochBeforeUpdate(validRecords, origin);
} catch (InvalidProducerEpochException t) {
appendFuture.completeExceptionally(t);
return;
} catch (Throwable t) {
log.error("Failed to analyzeAndCheckProducerEpochBeforeUpdate for {}", topicPartition, t);
appendFuture.completeExceptionally(t);
return;
}
final ManagedLedger managedLedger = persistentTopic.getManagedLedger();
if (entryFormatter instanceof KafkaMixedEntryFormatter) {
final long logEndOffset = MessageMetadataUtils.getLogEndOffset(managedLedger);
appendInfo.firstOffset(Optional.of(logEndOffset));
}
final EncodeRequest encodeRequest = EncodeRequest.get(validRecords, appendInfo);
long pendingTopicLatencyNanos = time.nanoseconds() - beforeRecordsProcess;
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getPendingTopicLatencyStats().registerSuccessfulEvent(
pendingTopicLatencyNanos, TimeUnit.NANOSECONDS);
});
long beforeEncodingStarts = time.nanoseconds();
final EncodeResult encodeResult = entryFormatter.encode(encodeRequest);
encodeRequest.recycle();
long encodeLatencyNanos = time.nanoseconds() - beforeEncodingStarts;
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getProduceEncodeStats().registerSuccessfulEvent(
encodeLatencyNanos, TimeUnit.NANOSECONDS);
});
appendRecordsContext.getStartSendOperationForThrottling()
.accept(encodeResult.getEncodedByteBuf().readableBytes());
publishMessages(appendFuture, appendInfo, encodeResult, updatedProducers, appendRecordsContext);
};
appendRecordsContext.getPendingTopicFuturesMap()
.computeIfAbsent(topicPartition, ignored -> new PendingTopicFutures())
.addListener(initFuture, sequentialExecutor, throwable -> {
long messageQueuedLatencyNanos = MathUtils.elapsedNanos(startEnqueueNanos);
appendRecordsContext.getCtx().executor().execute(() -> {
requestStats.getMessageQueuedLatencyStats().registerFailedEvent(
messageQueuedLatencyNanos, TimeUnit.NANOSECONDS);
});
appendFuture.completeExceptionally(throwable);
});
} catch (Exception exception) {
log.error("Failed to handle produce request for {}", topicPartition, exception);
appendFuture.completeExceptionally(exception);
}
return appendFuture;
}
public Position getLastPosition() {
return persistentTopic.getLastPosition();
}
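/**
 * Read records for a single partition of a FETCH request.
 *
 * @param partitionData the fetch offset and max bytes requested for this partition
 * @param readCommitted whether only committed records (below the last stable offset) may be returned
 * @param limitBytes the remaining byte budget shared by the whole FETCH request
 * @param maxReadEntriesNum the maximum number of entries to read from the cursor
 * @return a future holding the decoded records or an error code
 */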
public CompletableFuture<ReadRecordsResult> readRecords(final FetchRequestData.FetchPartition partitionData,
final boolean readCommitted,
final AtomicLong limitBytes,
final int maxReadEntriesNum,
final MessageFetchContext context) {
final long startPrepareMetadataNanos = MathUtils.nowInNano();
final CompletableFuture<ReadRecordsResult> future = new CompletableFuture<>();
final long offset = partitionData.fetchOffset();
KafkaTopicManager topicManager = context.getTopicManager();
// The future that is returned by getTopicConsumerManager is always completed normally
topicManager.getTopicConsumerManager(fullPartitionName).thenAccept(tcm -> {
if (tcm == null) {
registerPrepareMetadataFailedEvent(context.getEventExecutor(), startPrepareMetadataNanos);
// remove null future cache
context.getSharedState().getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
if (log.isDebugEnabled()) {
log.debug("Fetch for {}: no tcm for topic {} return NOT_LEADER_FOR_PARTITION.",
topicPartition, fullPartitionName);
}
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return;
}
if (checkOffsetOutOfRange(tcm, offset, topicPartition, startPrepareMetadataNanos,
context.getEventExecutor())) {
future.complete(ReadRecordsResult.error(Errors.OFFSET_OUT_OF_RANGE, this));
return;
}
if (log.isDebugEnabled()) {
log.debug("Fetch for {}: remove tcm to get cursor for fetch offset: {} .", topicPartition, offset);
}
final CompletableFuture<Pair<ManagedCursor, Long>> cursorFuture = tcm.removeCursorFuture(offset);
if (cursorFuture == null) {
// tcm is closed, just return a NONE error because the channel may still be active
log.warn("KafkaTopicConsumerManager is closed, remove TCM of {}", fullPartitionName);
registerPrepareMetadataFailedEvent(context.getEventExecutor(), startPrepareMetadataNanos);
context.getSharedState().getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
future.complete(ReadRecordsResult.error(Errors.NONE, this));
return;
}
cursorFuture.thenAccept((cursorLongPair) -> {
if (cursorLongPair == null) {
log.warn("KafkaTopicConsumerManager.remove({}) return null for topic {}. "
+ "Fetch for topic return error.", offset, topicPartition);
registerPrepareMetadataFailedEvent(context.getEventExecutor(), startPrepareMetadataNanos);
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return;
}
final ManagedCursor cursor = cursorLongPair.getLeft();
final AtomicLong cursorOffset = new AtomicLong(cursorLongPair.getRight());
registerPrepareMetadataFailedEvent(context.getEventExecutor(), startPrepareMetadataNanos);
long adjustedMaxBytes = Math.min(partitionData.partitionMaxBytes(), limitBytes.get());
if (readCommitted) {
long firstUndecidedOffset = producerStateManager.firstUndecidedOffset().orElse(-1L);
if (firstUndecidedOffset >= 0 && firstUndecidedOffset <= offset) {
long highWaterMark = MessageMetadataUtils.getHighWatermark(cursor.getManagedLedger());
future.complete(
ReadRecordsResult.empty(
highWaterMark,
firstUndecidedOffset,
tcm.getManagedLedger().getLastConfirmedEntry(), this
)
);
return;
}
}
readEntries(cursor, topicPartition, cursorOffset, maxReadEntriesNum, adjustedMaxBytes,
fullPartitionName -> {
topicManager.invalidateCacheForFencedManagerLedgerOnTopic(fullPartitionName);
}, context.getEventExecutor())
.whenComplete((entries, throwable) -> {
if (throwable != null) {
tcm.deleteOneCursorAsync(cursorLongPair.getLeft(),
"cursor.readEntry fail. deleteCursor");
if (throwable instanceof ManagedLedgerException.CursorAlreadyClosedException
|| throwable instanceof ManagedLedgerException.ManagedLedgerFencedException) {
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return;
}
log.error("Read entry error on {}", partitionData, throwable);
future.complete(ReadRecordsResult.error(Errors.UNKNOWN_SERVER_ERROR, this));
return;
}
long readSize = entries.stream().mapToLong(Entry::getLength).sum();
limitBytes.addAndGet(-1 * readSize);
// Add new offset back to TCM after entries are read successfully
tcm.add(cursorOffset.get(), Pair.of(cursor, cursorOffset.get()));
handleEntries(future, entries, partitionData, tcm, cursor, readCommitted, context);
});
}).exceptionally(ex -> {
registerPrepareMetadataFailedEvent(context.getEventExecutor(), startPrepareMetadataNanos);
context.getSharedState()
.getKafkaTopicConsumerManagerCache().removeAndCloseByTopic(fullPartitionName);
future.complete(ReadRecordsResult.error(Errors.NOT_LEADER_OR_FOLLOWER, this));
return null;
});
});
return future;
}
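/**
 * Return true if the requested fetch offset is larger than the current log end offset.
 * Offsets below the log start offset are intentionally not checked here; see the TODO
 * inside this method.
 */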
private boolean checkOffsetOutOfRange(KafkaTopicConsumerManager tcm,
long offset,
TopicPartition topicPartition,
long startPrepareMetadataNanos,
EventExecutor eventExecutor) {
// handle offset out-of-range exception
ManagedLedgerImpl managedLedger = (ManagedLedgerImpl) tcm.getManagedLedger();
long logEndOffset = MessageMetadataUtils.getLogEndOffset(managedLedger);
// TODO: Offset out-of-range checks are still incomplete
// We only check the case of `offset > logEndOffset` and `offset < LogStartOffset`
// is currently not handled.
// Because we found that the operation of obtaining `LogStartOffset`
// requires reading from disk,
// and such a time-consuming operation is likely to harm the performance of FETCH request.
// More discussions please refer to https://github.com/streamnative/kop/pull/531
if (offset > logEndOffset) {
log.error("Received request for offset {} for partition {}, "
+ "but we only have entries less than {}.",
offset, topicPartition, logEndOffset);
if (startPrepareMetadataNanos > 0) {
registerPrepareMetadataFailedEvent(eventExecutor, startPrepareMetadataNanos);
}
return true;
}
return false;
}
private void registerPrepareMetadataFailedEvent(EventExecutor eventExecutor, long startPrepareMetadataNanos) {
long prepareMetadataNanos = MathUtils.elapsedNanos(startPrepareMetadataNanos);
eventExecutor.execute(() -> {
this.requestStats.getPrepareMetadataStats().registerFailedEvent(
prepareMetadataNanos, TimeUnit.NANOSECONDS);
});
}
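/**
 * Decode the entries that were read from the cursor and complete the future with a
 * {@link ReadRecordsResult}. For read_committed fetches, only entries below the last
 * stable offset are decoded and the aborted transactions are attached to the result.
 */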
private void handleEntries(final CompletableFuture<ReadRecordsResult> future,
final List<Entry> entries,
final FetchRequestData.FetchPartition partitionData,
final KafkaTopicConsumerManager tcm,
final ManagedCursor cursor,
final boolean readCommitted,
final MessageFetchContext context) {
final long highWatermark = MessageMetadataUtils.getHighWatermark(cursor.getManagedLedger());
final long lso = (readCommitted
? this.firstUndecidedOffset().orElse(highWatermark) : highWatermark);
final List<Entry> committedEntries = readCommitted ? getCommittedEntries(entries, lso) : entries;
if (log.isDebugEnabled()) {
log.debug("Read {} entries but only {} entries are committed, lso {}, highWatermark {}",
entries.size(), committedEntries.size(), lso, highWatermark);
}
if (committedEntries.isEmpty()) {
future.complete(ReadRecordsResult.error(tcm.getManagedLedger().getLastConfirmedEntry(), Errors.NONE,
this));
return;
}
// use compatible magic value by apiVersion
final byte magic = getCompatibleMagic(context.getHeader().apiVersion());
// this part is heavyweight, and we should not execute in the ManagedLedger Ordered executor thread
final CompletableFuture<String> groupNameFuture = kafkaConfig.isKopEnableGroupLevelConsumerMetrics()
? context.getCurrentConnectedGroupNameAsync() : CompletableFuture.completedFuture(null);
groupNameFuture.whenCompleteAsync((groupName, ex) -> {
if (ex != null) {
log.error("Get groupId failed.", ex);
groupName = "";
}
final long startDecodingEntriesNanos = MathUtils.nowInNano();
// Get the last entry position for delayed fetch.
Position lastPosition = this.getLastPositionFromEntries(committedEntries);
final DecodeResult decodeResult = entryFormatter.decode(committedEntries, magic);
long fetchDecodeLatencyNanos = MathUtils.elapsedNanos(startDecodingEntriesNanos);
context.getEventExecutor().execute(() -> {
requestStats.getFetchDecodeStats().registerSuccessfulEvent(
fetchDecodeLatencyNanos, TimeUnit.NANOSECONDS);
});
// collect consumer metrics
decodeResult.updateConsumerStats(topicPartition, committedEntries.size(),
groupName, requestStats, context.getEventExecutor());
List<FetchResponseData.AbortedTransaction> abortedTransactions = null;
if (readCommitted) {
abortedTransactions = this.getAbortedIndexList(partitionData.fetchOffset());
}
if (log.isDebugEnabled()) {
log.debug("Partition {} read entry completed in {} ns",
topicPartition, MathUtils.nowInNano() - startDecodingEntriesNanos);
}
future.complete(ReadRecordsResult
.get(decodeResult, abortedTransactions, highWatermark, lso, lastPosition, this));
}, context.getDecodeExecutor()).exceptionally(ex -> {
log.error("Partition {} read entry exceptionally. ", topicPartition, ex);
future.complete(ReadRecordsResult.error(Errors.KAFKA_STORAGE_ERROR, this));
return null;
});
}
private static byte getCompatibleMagic(short apiVersion) {
final byte magic;
if (apiVersion <= 1) {
magic = RecordBatch.MAGIC_VALUE_V0;
} else if (apiVersion <= 3) {
magic = RecordBatch.MAGIC_VALUE_V1;
} else {
magic = RecordBatch.CURRENT_MAGIC_VALUE;
}
return magic;
}
private Position getLastPositionFromEntries(List<Entry> entries) {
if (entries == null || entries.isEmpty()) {
return PositionImpl.EARLIEST;
}
return entries.get(entries.size() - 1).getPosition();
}
private List<Entry> getCommittedEntries(List<Entry> entries, long lso) {
List<Entry> committedEntries = new ArrayList<>();
for (Entry entry : entries) {
try {
if (lso > MessageMetadataUtils.peekBaseOffsetFromEntry(entry)) {
committedEntries.add(entry);
} else {
break;
}
} catch (MetadataCorruptedException e) {
log.error("[{}:{}] Failed to peek base offset from entry.",
entry.getLedgerId(), entry.getEntryId());
}
}
// Release all the entries that are not in the result
for (int i = committedEntries.size(); i < entries.size(); i++) {
entries.get(i).release();
}
return committedEntries;
}
/**
* Read Entries by cursor.
*
* @return a {@code CompletableFuture<List<Entry>>}; when the future completes normally,
* the list of entries is never null.
*/
private CompletableFuture<List<Entry>> readEntries(final ManagedCursor cursor,
final TopicPartition topicPartition,
final AtomicLong cursorOffset,
final int maxReadEntriesNum,
final long adjustedMaxBytes,
final Consumer<String> invalidateCacheOnTopic,
final EventExecutor eventExecutor) {
final OpStatsLogger messageReadStats = requestStats.getMessageReadStats();
// read at most maxReadEntriesNum entries.
final long startReadingMessagesNanos = MathUtils.nowInNano();
final CompletableFuture<List<Entry>> readFuture = new CompletableFuture<>();
if (adjustedMaxBytes <= 0) {
readFuture.complete(Collections.emptyList());
return readFuture;
}
final long originalOffset = cursorOffset.get();
cursor.asyncReadEntries(maxReadEntriesNum, adjustedMaxBytes, new AsyncCallbacks.ReadEntriesCallback() {
@Override
public void readEntriesComplete(List<Entry> entries, Object ctx) {
if (!entries.isEmpty()) {
final Entry lastEntry = entries.get(entries.size() - 1);
final PositionImpl currentPosition = PositionImpl.get(
lastEntry.getLedgerId(), lastEntry.getEntryId());
try {
final long lastOffset = MessageMetadataUtils.peekOffsetFromEntry(lastEntry);
// commit the offset, so the backlog is not affected by this cursor.
commitOffset((NonDurableCursorImpl) cursor, currentPosition);
// and add it back to the TCM when all reads complete.
cursorOffset.set(lastOffset + 1);
if (log.isDebugEnabled()) {
log.debug("Topic {} success read entry: ledgerId: {}, entryId: {}, size: {},"
+ " ConsumerManager original offset: {}, lastEntryPosition: {}, "
+ "nextOffset: {}",
topicPartition, lastEntry.getLedgerId(), lastEntry.getEntryId(),
lastEntry.getLength(), originalOffset, currentPosition,
cursorOffset.get());
}
} catch (MetadataCorruptedException e) {
log.error("[{}] Failed to peekOffsetFromEntry from position {}: {}",
topicPartition, currentPosition, e.getMessage());
long failedLatencyNanos = MathUtils.elapsedNanos(startReadingMessagesNanos);
eventExecutor.execute(() -> {
messageReadStats.registerFailedEvent(failedLatencyNanos, TimeUnit.NANOSECONDS);
});
readFuture.completeExceptionally(e);
return;
}
}
long successLatencyNanos = MathUtils.elapsedNanos(startReadingMessagesNanos);
eventExecutor.execute(() -> {
messageReadStats.registerSuccessfulEvent(
successLatencyNanos, TimeUnit.NANOSECONDS);
});
readFuture.complete(entries);
}
@Override
public void readEntriesFailed(ManagedLedgerException exception, Object ctx) {
log.error("Error read entry for topic: {}", fullPartitionName, exception);
if (exception instanceof ManagedLedgerException.ManagedLedgerFencedException) {
invalidateCacheOnTopic.accept(fullPartitionName);
}
long failedLatencyNanos = MathUtils.elapsedNanos(startReadingMessagesNanos);
eventExecutor.execute(() -> {
messageReadStats.registerFailedEvent(failedLatencyNanos, TimeUnit.NANOSECONDS);
});
readFuture.completeExceptionally(exception);
}
}, null, PositionImpl.LATEST);
return readFuture;
}
// commit the offset, so the backlog is not affected by this cursor.
private static void commitOffset(NonDurableCursorImpl cursor, PositionImpl currentPosition) {
cursor.asyncMarkDelete(currentPosition, new AsyncCallbacks.MarkDeleteCallback() {
@Override
public void markDeleteComplete(Object ctx) {
if (log.isDebugEnabled()) {
log.debug("Mark delete success for position: {}", currentPosition);
}
}
// This is OK: mark-delete is a cumulative ack, so a following commit will cover it.
@Override
public void markDeleteFailed(ManagedLedgerException e, Object ctx) {
log.warn("Mark delete failed for position: {} with error:",
currentPosition, e);
}
}, null);
}
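/**
 * Publish the encoded records to the underlying {@link PersistentTopic}, record the publish
 * quota usage, then update the producer state manager and complete the append future with
 * the base offset of the published batch.
 */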
private void publishMessages(final CompletableFuture<Long> appendFuture,
final LogAppendInfo appendInfo,
final EncodeResult encodeResult,
final Map<Long, ProducerAppendInfo> updatedProducers,
final AppendRecordsContext appendRecordsContext) {
checkAndRecordPublishQuota(persistentTopic, appendInfo.validBytes(),
appendInfo.numMessages(), appendRecordsContext);
if (persistentTopic.isSystemTopic()) {
encodeResult.recycle();
log.error("Not support producing message to system topic: {}", persistentTopic);
appendFuture.completeExceptionally(Errors.INVALID_TOPIC_EXCEPTION.exception());
return;
}
appendRecordsContext
.getTopicManager()
.registerProducerInPersistentTopic(fullPartitionName, persistentTopic)
.ifPresent((producer) -> {
// collect metrics
encodeResult.updateProducerStats(topicPartition, requestStats, producer,
appendRecordsContext.getEventExecutor());
});
final int numMessages = encodeResult.getNumMessages();
final ByteBuf byteBuf = encodeResult.getEncodedByteBuf();
final int byteBufSize = byteBuf.readableBytes();
final long beforePublish = time.nanoseconds();
publishMessage(persistentTopic, byteBuf, appendInfo)
.whenComplete((offset, e) -> {
appendRecordsContext.getCompleteSendOperationForThrottling().accept(byteBufSize);
if (e == null) {
long publishLatencyNanos = time.nanoseconds() - beforePublish;
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getMessagePublishStats().registerSuccessfulEvent(
publishLatencyNanos, TimeUnit.NANOSECONDS);
});
final long lastOffset = offset + numMessages - 1;
try {
AnalyzeResult analyzeResult = analyzeAndValidateProducerState(
encodeResult.getRecords(),
updatedProducers,
Optional.of(offset),
lastOffset,
AppendOrigin.Client);
updateProducerStateManager(lastOffset, analyzeResult);
} catch (Throwable t) {
appendFuture.completeExceptionally(t);
return;
}
appendFuture.complete(offset);
} else {
log.error("publishMessages for topic partition: {} failed when write.", fullPartitionName, e);
long publishLatencyNanos = time.nanoseconds() - beforePublish;
appendRecordsContext.getEventExecutor().execute(() -> {
requestStats.getMessagePublishStats().registerFailedEvent(
publishLatencyNanos, TimeUnit.NANOSECONDS);
});
appendFuture.completeExceptionally(e);
}
encodeResult.recycle();
});
}
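/**
 * Check the topic, broker and resource-group publish rate limits for this publish and
 * disable auto-read on the connection when a limit is exceeded.
 */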
private void checkAndRecordPublishQuota(Topic topic, int msgSize, int numMessages,
AppendRecordsContext appendRecordsContext) {
final boolean isPublishRateExceeded;
if (preciseTopicPublishRateLimitingEnable) {
boolean isPreciseTopicPublishRateExceeded =
topic.isTopicPublishRateExceeded(numMessages, msgSize);
if (isPreciseTopicPublishRateExceeded) {
topic.disableCnxAutoRead();
return;
}
isPublishRateExceeded = topic.isBrokerPublishRateExceeded();
} else {
if (topic.isResourceGroupRateLimitingEnabled()) {
final boolean resourceGroupPublishRateExceeded =
topic.isResourceGroupPublishRateExceeded(numMessages, msgSize);
if (resourceGroupPublishRateExceeded) {
topic.disableCnxAutoRead();
return;
}
}
isPublishRateExceeded = topic.isPublishRateExceeded();
}
if (isPublishRateExceeded) {
ChannelHandlerContext ctx = appendRecordsContext.getCtx();
if (ctx != null && ctx.channel().config().isAutoRead()) {
ctx.channel().config().setAutoRead(false);
}
}
}
/**
* Publish message to bookkeeper.
* When the message is a control message, message deduplication is skipped.
*
* @param persistentTopic The persistent topic used to publish the message and check message deduplication.
* @param byteBuf Message byteBuf
* @param appendInfo Pre-analyzed record info (sequence numbers, message count, ...)
* @return offset
*/
private CompletableFuture<Long> publishMessage(final PersistentTopic persistentTopic,
final ByteBuf byteBuf,
final LogAppendInfo appendInfo) {
final CompletableFuture<Long> offsetFuture = new CompletableFuture<>();
// This producerName is only used to check the message deduplication.
// Kafka will reuse pid when transactionId is the same but will increase the producerEpoch.
// So we need to ensure the producerName is not the same.
String producerName = new StringJoiner("-")
.add(PID_PREFIX)
.add(String.valueOf(appendInfo.producerId().orElse(-1L)))
.add(String.valueOf(appendInfo.producerEpoch())).toString();
persistentTopic.publishMessage(byteBuf,
MessagePublishContext.get(
offsetFuture,
persistentTopic,
producerName,
appendInfo.producerId().isPresent() && !appendInfo.isControlBatch(),
appendInfo.firstSequence(),
appendInfo.lastSequence(),
appendInfo.numMessages(),
time.nanoseconds()));
return offsetFuture;
}
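/**
 * Validate every record batch (base offset, size limit, CRC) and summarize the batches into
 * a {@link LogAppendInfo} that is used by the append path.
 */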
@VisibleForTesting
public LogAppendInfo analyzeAndValidateRecords(MemoryRecords records) {
int numMessages = 0;
int shallowMessageCount = 0;
Optional<Long> firstOffset = Optional.empty();
boolean readFirstMessage = false;
boolean isTransaction = false;
boolean isControlBatch = false;
int validBytesCount = 0;
int firstSequence = Integer.MAX_VALUE;
int lastSequence = -1;
Optional<Long> producerId = Optional.empty();
short producerEpoch = -1;
KopLogValidator.CompressionCodec sourceCodec = DEFAULT_COMPRESSION;
for (RecordBatch batch : records.batches()) {
if (batch.magic() >= RecordBatch.MAGIC_VALUE_V2 && batch.baseOffset() != 0) {
throw new InvalidRecordException("The baseOffset of the record batch in the append to "
+ topicPartition + " should be 0, but it is " + batch.baseOffset());
}
if (!readFirstMessage) {
if (batch.magic() >= RecordBatch.MAGIC_VALUE_V2) {
firstOffset = Optional.of(batch.baseOffset());
}
readFirstMessage = true;
}
int batchSize = batch.sizeInBytes();
if (batchSize > kafkaConfig.getMaxMessageSize()) {
throw new RecordTooLargeException(String.format("Message batch size is %s "
+ "in append to partition %s which exceeds the maximum configured size of %s .",
batchSize, topicPartition, kafkaConfig.getMaxMessageSize()));
}
batch.ensureValid();
shallowMessageCount += 1;
validBytesCount += batchSize;
int numMessagesInBatch = (int) (batch.lastOffset() - batch.baseOffset() + 1);
if (numMessagesInBatch <= 1) {
// The lastOffset field might not be set correctly, so count by iterating the records.
for (Record record : batch) {
numMessages++;
}
} else {
numMessages += numMessagesInBatch;
}
isTransaction = batch.isTransactional();
isControlBatch = batch.isControlBatch();
// We assume all batches carry the same producerId.
if (batch.hasProducerId()) {
producerId = Optional.of(batch.producerId());
producerEpoch = batch.producerEpoch();
}
if (batch.compressionType().id != CompressionType.NONE.id) {
CompressionType compressionType = CompressionType.forId(batch.compressionType().id);
sourceCodec = new KopLogValidator.CompressionCodec(
compressionType.name, compressionType.id);
}
if (firstSequence > batch.baseSequence()) {
firstSequence = batch.baseSequence();
}
if (lastSequence < batch.lastSequence()) {
lastSequence = batch.lastSequence();
}
}
if (validBytesCount < 0) {
throw new CorruptRecordException("Cannot append record batch with illegal length "
+ validBytesCount + " to log for " + topicPartition
+ ". A possible cause is corrupted produce request.");
}
KopLogValidator.CompressionCodec targetCodec =
KopLogValidator.getTargetCodec(sourceCodec, kafkaConfig.getKafkaCompressionType());
return new LogAppendInfo(firstOffset, producerId, producerEpoch, numMessages, shallowMessageCount,
isTransaction, isControlBatch, validBytesCount, firstSequence, lastSequence, sourceCodec, targetCodec);
}
private MemoryRecords trimInvalidBytes(MemoryRecords records, LogAppendInfo info) {
int validBytes = info.validBytes();
if (validBytes < 0){
throw new CorruptRecordException(String.format("Cannot append record batch with illegal length %s to "
+ "log for %s. A possible cause is a corrupted produce request.", validBytes, topicPartition));
} else if (validBytes == records.sizeInBytes()) {
return records;
} else {
ByteBuffer validByteBuffer = records.buffer().duplicate();
validByteBuffer.limit(validBytes);
return MemoryRecords.readableRecords(validByteBuffer);
}
}
/**
* Remove all the AbortedTxn entries that are no longer referenced by existing data on the topic.
* @return a future that completes once the aborted-transaction purge offset has been updated
*/
public CompletableFuture<?> updatePurgeAbortedTxnsOffset() {
if (!kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
// no need to scan the topic, because transactions are disabled
return CompletableFuture.completedFuture(null);
}
if (!producerStateManager.hasSomeAbortedTransactions()) {
// nothing to do
return CompletableFuture.completedFuture(null);
}
if (unloaded.get()) {
// nothing to do
return CompletableFuture.completedFuture(null);
}
return fetchOldestAvailableIndexFromTopic()
.thenAccept(offset ->
producerStateManager.updateAbortedTxnsPurgeOffset(offset));
}
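/**
 * Fetch the oldest offset that is still available on this topic by reading the first valid
 * entry of the managed ledger; for an empty topic the current offset is returned.
 */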
public CompletableFuture<Long> fetchOldestAvailableIndexFromTopic() {
if (unloaded.get()) {
return FutureUtil.failedFuture(new NotLeaderOrFollowerException());
}
final CompletableFuture<Long> future = new CompletableFuture<>();
// Create a temporary KafkaTopicConsumerManager; it is closed below once the future completes
KafkaTopicConsumerManager tcm = new KafkaTopicConsumerManager("purge-aborted-tx",
true, persistentTopic);
future.whenComplete((___, error) -> {
// release resources in any case
try {
tcm.close();
} catch (Exception err) {
log.error("Cannot safely close the temporary KafkaTopicConsumerManager for {}",
fullPartitionName, err);
}
});
ManagedLedgerImpl managedLedger = (ManagedLedgerImpl) persistentTopic.getManagedLedger();
long numberOfEntries = managedLedger.getNumberOfEntries();
if (numberOfEntries == 0) {
long currentOffset = MessageMetadataUtils.getCurrentOffset(managedLedger);
log.info("First offset for topic {} is {} as the topic is empty (numberOfEntries=0)",
fullPartitionName, currentOffset);
future.complete(currentOffset);
return future;
}
// this is a DUMMY entry with -1
PositionImpl firstPosition = managedLedger.getFirstPosition();
// look for the first entry with data
PositionImpl nextValidPosition = managedLedger.getNextValidPosition(firstPosition);
fetchOldestAvailableIndexFromTopicReadNext(future, managedLedger, nextValidPosition);
return future;
}
private void fetchOldestAvailableIndexFromTopicReadNext(CompletableFuture<Long> future,
ManagedLedgerImpl managedLedger, PositionImpl position) {
managedLedger.asyncReadEntry(position, new AsyncCallbacks.ReadEntryCallback() {
@Override
public void readEntryComplete(Entry entry, Object ctx) {
try {
long startOffset = MessageMetadataUtils.peekBaseOffsetFromEntry(entry);
log.info("First offset for topic {} is {} - position {}", fullPartitionName,
startOffset, entry.getPosition());
future.complete(startOffset);
} catch (MetadataCorruptedException.NoBrokerEntryMetadata noBrokerEntryMetadata) {
long currentOffset = MessageMetadataUtils.getCurrentOffset(managedLedger);
log.info("Legacy entry for topic {} - position {} - returning current offset {}",
fullPartitionName,
entry.getPosition(),
currentOffset);
future.complete(currentOffset);
} catch (Exception err) {
future.completeExceptionally(err);
} finally {
entry.release();
}
}
@Override
public void readEntryFailed(ManagedLedgerException exception, Object ctx) {
future.completeExceptionally(exception);
}
}, null);
}
@VisibleForTesting
public CompletableFuture<?> takeProducerSnapshot() {
return initFuture.thenCompose((___) -> {
// snapshot can be taken only on the same thread that is used for writes
ManagedLedgerImpl ml = (ManagedLedgerImpl) getPersistentTopic().getManagedLedger();
Executor executorService = ml.getExecutor();
return this
.getProducerStateManager()
.takeSnapshot(executorService);
});
}
@VisibleForTesting
public CompletableFuture forcePurgeAbortTx() {
return initFuture.thenCompose((___) -> {
// purge can be taken only on the same thread that is used for writes
ManagedLedgerImpl ml = (ManagedLedgerImpl) getPersistentTopic().getManagedLedger();
ExecutorService executorService = ml.getScheduledExecutor().chooseThread(ml.getName());
return updatePurgeAbortedTxnsOffset()
.thenApplyAsync((____) -> {
return getProducerStateManager().executePurgeAbortedTx();
}, executorService);
});
}
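/**
 * Recover the transaction state by replaying the topic from the given offset and feeding
 * every batch into the {@link ProducerStateManager}.
 *
 * @param offset the offset to start the recovery from
 * @return a future holding the number of records that were replayed
 */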
public CompletableFuture<Long> recoverTxEntries(
long offset,
Executor executor) {
if (!kafkaConfig.isKafkaTransactionCoordinatorEnabled()) {
// no need to scan the topic, because transactions are disabled
return CompletableFuture.completedFuture(0L);
}
if (!isBrokerIndexMetadataInterceptorConfigured(persistentTopic.getBrokerService())) {
// The `UpgradeTest` sets the interceptor to null, which would cause an NPE in
// `fetchOldestAvailableIndexFromTopic`. We cannot simply disable Kafka transactions,
// because newer Kafka clients require the transaction coordinator to be enabled.
// TODO: if the AppendIndexMetadataInterceptor is not set, Kafka transactions cannot work,
// so we should throw an exception here; we may also need a new configuration for ProducerId.
log.error("The broker index metadata interceptor is not configured for topic {}, skip recover txn entries.",
fullPartitionName);
return CompletableFuture.completedFuture(0L);
}
final EventExecutor statsExecutor = getPersistentTopic().getBrokerService().executor().next();
return fetchOldestAvailableIndexFromTopic().thenCompose((minOffset -> {
log.info("start recoverTxEntries for {} at offset {} minOffset {}",
fullPartitionName, offset, minOffset);
final CompletableFuture<Long> future = new CompletableFuture<>();
// Create a temporary KafkaTopicConsumerManager for recovery; it is closed below once the future completes
KafkaTopicConsumerManager tcm = new KafkaTopicConsumerManager("recover-tx",
true, persistentTopic);
future.whenComplete((___, error) -> {
// release resources in any case
try {
tcm.close();
} catch (Exception err) {
log.error("Cannot safely close the temporary KafkaTopicConsumerManager for {}",
fullPartitionName, err);
}
});
final long offsetToStart;
if (checkOffsetOutOfRange(tcm, offset, topicPartition, -1, statsExecutor)) {
offsetToStart = 0;
log.info("recoverTxEntries for {}: offset {} is out-of-range, "
+ "maybe the topic has been deleted/recreated, "
+ "starting recovery from {}",
topicPartition, offset, offsetToStart);
} else {
offsetToStart = offset;
}
producerStateManager.handleMissingDataBeforeRecovery(minOffset, offset);
if (log.isDebugEnabled()) {
log.debug("recoverTxEntries for {}: remove tcm to get cursor for fetch offset: {} .",
topicPartition, offsetToStart);
}
final CompletableFuture<Pair<ManagedCursor, Long>> cursorFuture = tcm.removeCursorFuture(offsetToStart);
if (cursorFuture == null) {
// tcm is closed; complete exceptionally so the caller can retry the recovery
log.warn("KafkaTopicConsumerManager is closed, remove TCM of {}", fullPartitionName);
future.completeExceptionally(new NotLeaderOrFollowerException());
return future;
}
cursorFuture.thenAccept((cursorLongPair) -> {
if (cursorLongPair == null) {
log.warn("KafkaTopicConsumerManager.remove({}) return null for topic {}. "
+ "Fetch for topic return error.", offsetToStart, topicPartition);
future.completeExceptionally(new NotLeaderOrFollowerException());
return;
}
final ManagedCursor cursor = cursorLongPair.getLeft();
final AtomicLong cursorOffset = new AtomicLong(cursorLongPair.getRight());
AtomicLong entryCounter = new AtomicLong();
readNextEntriesForRecovery(cursor, cursorOffset, tcm, entryCounter,
future, executor, statsExecutor);
}).exceptionally(ex -> {
future.completeExceptionally(new NotLeaderOrFollowerException());
return null;
});
return future;
}));
}
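/**
 * Read the next chunk of entries during recovery, apply them to the producer state manager
 * and recurse until no more entries are returned.
 */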
private void readNextEntriesForRecovery(ManagedCursor cursor, AtomicLong cursorOffset,
KafkaTopicConsumerManager tcm,
AtomicLong entryCounter,
CompletableFuture<Long> future, Executor executor,
EventExecutor statsExecutor) {
if (log.isDebugEnabled()) {
log.debug("readNextEntriesForRecovery {} cursorOffset {}", fullPartitionName, cursorOffset);
}
int maxReadEntriesNum = 200;
long adjustedMaxBytes = Long.MAX_VALUE;
readEntries(cursor, topicPartition, cursorOffset, maxReadEntriesNum, adjustedMaxBytes,
(partitionName) -> {}, statsExecutor)
.whenCompleteAsync((entries, throwable) -> {
if (throwable != null) {
log.error("Read entry error on {}", fullPartitionName, throwable);
tcm.deleteOneCursorAsync(cursor,
"cursor.readEntry fail. deleteCursor");
if (throwable instanceof ManagedLedgerException.CursorAlreadyClosedException
|| throwable instanceof ManagedLedgerException.ManagedLedgerFencedException) {
future.completeExceptionally(new NotLeaderOrFollowerException());
return;
}
future.completeExceptionally(new UnknownServerException(throwable));
return;
}
// Add new offset back to TCM after entries are read successfully
tcm.add(cursorOffset.get(), Pair.of(cursor, cursorOffset.get()));
if (entries.isEmpty()) {
if (log.isDebugEnabled()) {
log.debug("No more entries to recover for {}", fullPartitionName);
}
future.completeAsync(() -> entryCounter.get(), executor);
return;
}
CompletableFuture<DecodeResult> decodedEntries = new CompletableFuture<>();
decodeEntriesForRecovery(decodedEntries, entries, statsExecutor);
decodedEntries.thenAccept((decodeResult) -> {
try {
MemoryRecords records = decodeResult.getRecords();
// When many entries are retrieved, the first batch's baseOffset is not necessarily
// the base offset of all records.
Optional<Long> firstOffset = Optional
.ofNullable(records.firstBatch())
.map(batch -> batch.baseOffset());
long[] lastOffSetHolder = {-1L};
records.batches().forEach(batch -> {
batch.forEach(record -> {
if (lastOffSetHolder[0] < record.offset()) {
lastOffSetHolder[0] = record.offset();
}
entryCounter.incrementAndGet();
});
});
long lastOffset = lastOffSetHolder[0];
if (log.isDebugEnabled()) {
log.debug("Read some entries while recovering {} firstOffSet {} lastOffset {}",
fullPartitionName,
firstOffset.orElse(null), lastOffset);
}
// Get the relevant offsets from each record
AnalyzeResult analyzeResult = analyzeAndValidateProducerState(records,
Optional.empty(), null, AppendOrigin.Log);
updateProducerStateManager(lastOffset, analyzeResult);
if (log.isDebugEnabled()) {
log.debug("Completed recovery of batch {} {}", analyzeResult, fullPartitionName);
}
} finally {
decodeResult.recycle();
}
readNextEntriesForRecovery(cursor, cursorOffset, tcm, entryCounter, future, executor,
statsExecutor);
}).exceptionally(error -> {
log.error("Bad error while recovering {}", fullPartitionName, error);
future.completeExceptionally(error);
return null;
});
}, executor);
}
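/**
 * Apply an {@link AnalyzeResult} to the producer state manager: update the producers,
 * complete the transactions, advance the map end offset and trigger snapshot/purge housekeeping.
 */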
private void updateProducerStateManager(long lastOffset, AnalyzeResult analyzeResult) {
analyzeResult.updatedProducers().forEach((pid, producerAppendInfo) -> {
if (log.isDebugEnabled()) {
log.debug("Append pid: [{}], appendInfo: [{}], lastOffset: [{}]",
pid, producerAppendInfo, lastOffset);
}
producerStateManager.update(producerAppendInfo);
});
analyzeResult.completedTxns().forEach(completedTxn -> {
long lastStableOffset = producerStateManager.lastStableOffset(completedTxn);
producerStateManager.updateTxnIndex(completedTxn, lastStableOffset);
producerStateManager.completeTxn(completedTxn);
});
producerStateManager.updateMapEndOffset(lastOffset);
// do system clean up stuff in this thread
producerStateManager.maybeTakeSnapshot(recoveryExecutor);
producerStateManager.maybePurgeAbortedTx();
}
private void decodeEntriesForRecovery(final CompletableFuture<DecodeResult> future,
final List<Entry> entries,
final EventExecutor statsExecutor) {
if (log.isDebugEnabled()) {
log.debug("Read {} entries", entries.size());
}
final byte magic = RecordBatch.CURRENT_MAGIC_VALUE;
final long startDecodingEntriesNanos = MathUtils.nowInNano();
try {
DecodeResult decodeResult = entryFormatter.decode(entries, magic);
long fetchDecodeLatencyNanos = MathUtils.elapsedNanos(startDecodingEntriesNanos);
statsExecutor.execute(() -> {
requestStats.getFetchDecodeStats().registerSuccessfulEvent(
fetchDecodeLatencyNanos, TimeUnit.NANOSECONDS);
});
future.complete(decodeResult);
} catch (Exception error) {
future.completeExceptionally(error);
}
}
public boolean isUnloaded() {
return unloaded.get();
}
}