
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package kafka.log.remote;
import kafka.cluster.EndPoint;
import kafka.cluster.Partition;
import kafka.log.UnifiedLog;
import kafka.log.remote.quota.RLMQuotaManager;
import kafka.log.remote.quota.RLMQuotaManagerConfig;
import kafka.log.remote.quota.RLMQuotaMetrics;
import kafka.server.BrokerTopicStats;
import kafka.server.QuotaType;
import kafka.server.StopPartition;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.TopicIdPartition;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.Uuid;
import org.apache.kafka.common.errors.OffsetOutOfRangeException;
import org.apache.kafka.common.errors.RetriableException;
import org.apache.kafka.common.internals.SecurityManagerCompatibility;
import org.apache.kafka.common.message.FetchResponseData;
import org.apache.kafka.common.metrics.Metrics;
import org.apache.kafka.common.metrics.Quota;
import org.apache.kafka.common.metrics.Sensor;
import org.apache.kafka.common.record.FileRecords;
import org.apache.kafka.common.record.MemoryRecords;
import org.apache.kafka.common.record.Record;
import org.apache.kafka.common.record.RecordBatch;
import org.apache.kafka.common.record.RemoteLogInputStream;
import org.apache.kafka.common.requests.FetchRequest;
import org.apache.kafka.common.utils.BufferSupplier;
import org.apache.kafka.common.utils.ChildFirstClassLoader;
import org.apache.kafka.common.utils.CloseableIterator;
import org.apache.kafka.common.utils.KafkaThread;
import org.apache.kafka.common.utils.LogContext;
import org.apache.kafka.common.utils.Time;
import org.apache.kafka.common.utils.Utils;
import org.apache.kafka.server.common.CheckpointFile;
import org.apache.kafka.server.common.OffsetAndEpoch;
import org.apache.kafka.server.config.ServerConfigs;
import org.apache.kafka.server.log.remote.metadata.storage.ClassLoaderAwareRemoteLogMetadataManager;
import org.apache.kafka.server.log.remote.storage.ClassLoaderAwareRemoteStorageManager;
import org.apache.kafka.server.log.remote.storage.LogSegmentData;
import org.apache.kafka.server.log.remote.storage.RemoteLogManagerConfig;
import org.apache.kafka.server.log.remote.storage.RemoteLogMetadataManager;
import org.apache.kafka.server.log.remote.storage.RemoteLogSegmentId;
import org.apache.kafka.server.log.remote.storage.RemoteLogSegmentMetadata;
import org.apache.kafka.server.log.remote.storage.RemoteLogSegmentMetadata.CustomMetadata;
import org.apache.kafka.server.log.remote.storage.RemoteLogSegmentMetadataUpdate;
import org.apache.kafka.server.log.remote.storage.RemoteLogSegmentState;
import org.apache.kafka.server.log.remote.storage.RemoteStorageException;
import org.apache.kafka.server.log.remote.storage.RemoteStorageManager;
import org.apache.kafka.server.metrics.KafkaMetricsGroup;
import org.apache.kafka.storage.internals.checkpoint.LeaderEpochCheckpointFile;
import org.apache.kafka.storage.internals.epoch.LeaderEpochFileCache;
import org.apache.kafka.storage.internals.log.AbortedTxn;
import org.apache.kafka.storage.internals.log.EpochEntry;
import org.apache.kafka.storage.internals.log.FetchDataInfo;
import org.apache.kafka.storage.internals.log.FetchIsolation;
import org.apache.kafka.storage.internals.log.LogOffsetMetadata;
import org.apache.kafka.storage.internals.log.LogSegment;
import org.apache.kafka.storage.internals.log.OffsetIndex;
import org.apache.kafka.storage.internals.log.OffsetPosition;
import org.apache.kafka.storage.internals.log.RemoteIndexCache;
import org.apache.kafka.storage.internals.log.RemoteLogReadResult;
import org.apache.kafka.storage.internals.log.RemoteStorageFetchInfo;
import org.apache.kafka.storage.internals.log.RemoteStorageThreadPool;
import org.apache.kafka.storage.internals.log.TransactionIndex;
import org.apache.kafka.storage.internals.log.TxnIndexSearchResult;
import com.yammer.metrics.core.Timer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.lang.reflect.InvocationTargetException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Objects;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import scala.Option;
import scala.collection.JavaConverters;
import static kafka.log.remote.quota.RLMQuotaManagerConfig.INACTIVE_SENSOR_EXPIRATION_TIME_SECONDS;
import static org.apache.kafka.server.config.ServerLogConfigs.LOG_DIR_CONFIG;
import static org.apache.kafka.server.log.remote.metadata.storage.TopicBasedRemoteLogMetadataManagerConfig.REMOTE_LOG_METADATA_COMMON_CLIENT_PREFIX;
import static org.apache.kafka.server.log.remote.storage.RemoteStorageMetrics.REMOTE_LOG_MANAGER_TASKS_AVG_IDLE_PERCENT_METRIC;
import static org.apache.kafka.server.log.remote.storage.RemoteStorageMetrics.REMOTE_LOG_READER_FETCH_RATE_AND_TIME_METRIC;
/**
* This class is responsible for
* - initializing `RemoteStorageManager` and `RemoteLogMetadataManager` instances,
* - receiving leader/follower replica events and partition-stop events and acting on them,
* - providing APIs to fetch indexes and metadata about remote log segments,
* - copying log segments to the remote storage, and
* - cleaning up segments that have expired based on retention size or retention time.
*/
public class RemoteLogManager implements Closeable {
private static final Logger LOGGER = LoggerFactory.getLogger(RemoteLogManager.class);
private static final String REMOTE_LOG_READER_THREAD_NAME_PREFIX = "remote-log-reader";
private final RemoteLogManagerConfig rlmConfig;
private final int brokerId;
private final String logDir;
private final Time time;
private final Function<TopicPartition, Optional<UnifiedLog>> fetchLog;
private final BiConsumer<TopicPartition, Long> updateRemoteLogStartOffset;
private final BrokerTopicStats brokerTopicStats;
private final Metrics metrics;
private final RemoteStorageManager remoteLogStorageManager;
private final RemoteLogMetadataManager remoteLogMetadataManager;
private final ReentrantLock copyQuotaManagerLock = new ReentrantLock(true);
private final Condition copyQuotaManagerLockCondition = copyQuotaManagerLock.newCondition();
private final RLMQuotaManager rlmCopyQuotaManager;
private final RLMQuotaManager rlmFetchQuotaManager;
private final Sensor fetchThrottleTimeSensor;
private final Sensor copyThrottleTimeSensor;
private final RemoteIndexCache indexCache;
private final RemoteStorageThreadPool remoteStorageReaderThreadPool;
private final RLMScheduledThreadPool rlmCopyThreadPool;
private final RLMScheduledThreadPool rlmExpirationThreadPool;
private final RLMScheduledThreadPool followerThreadPool;
private final long delayInMs;
private final ConcurrentHashMap<TopicIdPartition, RLMTaskWithFuture> leaderCopyRLMTasks = new ConcurrentHashMap<>();
private final ConcurrentHashMap<TopicIdPartition, RLMTaskWithFuture> leaderExpirationRLMTasks = new ConcurrentHashMap<>();
private final ConcurrentHashMap<TopicIdPartition, RLMTaskWithFuture> followerRLMTasks = new ConcurrentHashMap<>();
private final Set<RemoteLogSegmentId> segmentIdsBeingCopied = ConcurrentHashMap.newKeySet();
// Topic ids that are received on leadership changes; this map is cleared on stop-partition events.
private final ConcurrentMap<TopicPartition, Uuid> topicIdByPartitionMap = new ConcurrentHashMap<>();
private final String clusterId;
private final KafkaMetricsGroup metricsGroup = new KafkaMetricsGroup(this.getClass());
// The endpoint for remote log metadata manager to connect to
private Optional<EndPoint> endpoint = Optional.empty();
private boolean closed = false;
private volatile boolean remoteLogManagerConfigured = false;
private final Timer remoteReadTimer;
/**
* Creates RemoteLogManager instance with the given arguments.
*
* @param rlmConfig Configuration required for remote logging subsystem(tiered storage) at the broker level.
* @param brokerId id of the current broker.
* @param logDir directory of Kafka log segments.
* @param time Time instance.
* @param clusterId The cluster id.
* @param fetchLog function to get UnifiedLog instance for a given topic.
* @param updateRemoteLogStartOffset function to update the log-start-offset for a given topic partition.
* @param brokerTopicStats BrokerTopicStats instance to update the respective metrics.
* @param metrics Metrics instance
*/
public RemoteLogManager(RemoteLogManagerConfig rlmConfig,
int brokerId,
String logDir,
String clusterId,
Time time,
Function<TopicPartition, Optional<UnifiedLog>> fetchLog,
BiConsumer<TopicPartition, Long> updateRemoteLogStartOffset,
BrokerTopicStats brokerTopicStats,
Metrics metrics) throws IOException {
this.rlmConfig = rlmConfig;
this.brokerId = brokerId;
this.logDir = logDir;
this.clusterId = clusterId;
this.time = time;
this.fetchLog = fetchLog;
this.updateRemoteLogStartOffset = updateRemoteLogStartOffset;
this.brokerTopicStats = brokerTopicStats;
this.metrics = metrics;
remoteLogStorageManager = createRemoteStorageManager();
remoteLogMetadataManager = createRemoteLogMetadataManager();
rlmCopyQuotaManager = createRLMCopyQuotaManager();
rlmFetchQuotaManager = createRLMFetchQuotaManager();
fetchThrottleTimeSensor = new RLMQuotaMetrics(metrics, "remote-fetch-throttle-time", RemoteLogManager.class.getSimpleName(),
"The %s time in millis remote fetches was throttled by a broker", INACTIVE_SENSOR_EXPIRATION_TIME_SECONDS).sensor();
copyThrottleTimeSensor = new RLMQuotaMetrics(metrics, "remote-copy-throttle-time", RemoteLogManager.class.getSimpleName(),
"The %s time in millis remote copies was throttled by a broker", INACTIVE_SENSOR_EXPIRATION_TIME_SECONDS).sensor();
indexCache = new RemoteIndexCache(rlmConfig.remoteLogIndexFileCacheTotalSizeBytes(), remoteLogStorageManager, logDir);
delayInMs = rlmConfig.remoteLogManagerTaskIntervalMs();
rlmCopyThreadPool = new RLMScheduledThreadPool(rlmConfig.remoteLogManagerCopierThreadPoolSize(),
"RLMCopyThreadPool", "kafka-rlm-copy-thread-pool-");
rlmExpirationThreadPool = new RLMScheduledThreadPool(rlmConfig.remoteLogManagerExpirationThreadPoolSize(),
"RLMExpirationThreadPool", "kafka-rlm-expiration-thread-pool-");
followerThreadPool = new RLMScheduledThreadPool(rlmConfig.remoteLogManagerThreadPoolSize(),
"RLMFollowerScheduledThreadPool", "kafka-rlm-follower-thread-pool-");
metricsGroup.newGauge(REMOTE_LOG_MANAGER_TASKS_AVG_IDLE_PERCENT_METRIC, rlmCopyThreadPool::getIdlePercent);
remoteReadTimer = metricsGroup.newTimer(REMOTE_LOG_READER_FETCH_RATE_AND_TIME_METRIC,
TimeUnit.MILLISECONDS, TimeUnit.SECONDS);
remoteStorageReaderThreadPool = new RemoteStorageThreadPool(
REMOTE_LOG_READER_THREAD_NAME_PREFIX,
rlmConfig.remoteLogReaderThreads(),
rlmConfig.remoteLogReaderMaxPendingTasks()
);
}
public void resizeCacheSize(long remoteLogIndexFileCacheSize) {
indexCache.resizeCacheSize(remoteLogIndexFileCacheSize);
}
public void updateCopyQuota(long quota) {
LOGGER.info("Updating remote copy quota to {} bytes per second", quota);
rlmCopyQuotaManager.updateQuota(new Quota(quota, true));
}
public void updateFetchQuota(long quota) {
LOGGER.info("Updating remote fetch quota to {} bytes per second", quota);
rlmFetchQuotaManager.updateQuota(new Quota(quota, true));
}
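/*
 * Illustrative usage sketch: these setters are intended to be driven by dynamic broker-config updates.
 * The `rlm` handle and the byte rates below are hypothetical, not part of this class.
 *
 *   rlm.updateCopyQuota(100 * 1024 * 1024);  // cap tiered-copy traffic at ~100 MiB/s
 *   rlm.updateFetchQuota(50 * 1024 * 1024);  // cap remote-fetch traffic at ~50 MiB/s
 */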
private void removeMetrics() {
metricsGroup.removeMetric(REMOTE_LOG_MANAGER_TASKS_AVG_IDLE_PERCENT_METRIC);
metricsGroup.removeMetric(REMOTE_LOG_READER_FETCH_RATE_AND_TIME_METRIC);
remoteStorageReaderThreadPool.removeMetrics();
}
/**
* Returns the timeout for the RLM Tasks to wait for the quota to be available
*/
Duration quotaTimeout() {
return Duration.ofSeconds(1);
}
RLMQuotaManager createRLMCopyQuotaManager() {
return new RLMQuotaManager(copyQuotaManagerConfig(rlmConfig), metrics, QuotaType.RLMCopy$.MODULE$,
"Tracking copy byte-rate for Remote Log Manager", time);
}
RLMQuotaManager createRLMFetchQuotaManager() {
return new RLMQuotaManager(fetchQuotaManagerConfig(rlmConfig), metrics, QuotaType.RLMFetch$.MODULE$,
"Tracking fetch byte-rate for Remote Log Manager", time);
}
public long getFetchThrottleTimeMs() {
return rlmFetchQuotaManager.getThrottleTimeMs();
}
public Sensor fetchThrottleTimeSensor() {
return fetchThrottleTimeSensor;
}
static RLMQuotaManagerConfig copyQuotaManagerConfig(RemoteLogManagerConfig rlmConfig) {
return new RLMQuotaManagerConfig(rlmConfig.remoteLogManagerCopyMaxBytesPerSecond(),
rlmConfig.remoteLogManagerCopyNumQuotaSamples(),
rlmConfig.remoteLogManagerCopyQuotaWindowSizeSeconds());
}
static RLMQuotaManagerConfig fetchQuotaManagerConfig(RemoteLogManagerConfig rlmConfig) {
return new RLMQuotaManagerConfig(rlmConfig.remoteLogManagerFetchMaxBytesPerSecond(),
rlmConfig.remoteLogManagerFetchNumQuotaSamples(),
rlmConfig.remoteLogManagerFetchQuotaWindowSizeSeconds());
}
@SuppressWarnings("unchecked")
private <T> T createDelegate(ClassLoader classLoader, String className) {
try {
return (T) classLoader.loadClass(className)
.getDeclaredConstructor().newInstance();
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException |
ClassNotFoundException e) {
throw new KafkaException(e);
}
}
RemoteStorageManager createRemoteStorageManager() {
return SecurityManagerCompatibility.get().doPrivileged(() -> {
final String classPath = rlmConfig.remoteStorageManagerClassPath();
if (classPath != null && !classPath.trim().isEmpty()) {
ChildFirstClassLoader classLoader = new ChildFirstClassLoader(classPath, this.getClass().getClassLoader());
RemoteStorageManager delegate = createDelegate(classLoader, rlmConfig.remoteStorageManagerClassName());
return (RemoteStorageManager) new ClassLoaderAwareRemoteStorageManager(delegate, classLoader);
} else {
return createDelegate(this.getClass().getClassLoader(), rlmConfig.remoteStorageManagerClassName());
}
});
}
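/*
 * Illustrative configuration sketch: the class-name/class-path pair consumed above typically comes from
 * broker properties such as the following (the plugin class and path are hypothetical examples):
 *
 *   remote.log.storage.system.enable=true
 *   remote.log.storage.manager.class.name=com.example.tiered.S3RemoteStorageManager
 *   remote.log.storage.manager.class.path=/opt/kafka/plugins/tiered-storage/*
 *
 * When the class path is set, the plugin is loaded through a ChildFirstClassLoader so that its own
 * dependencies take precedence over the broker's classpath.
 */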
private void configureRSM() {
final Map<String, Object> rsmProps = new HashMap<>(rlmConfig.remoteStorageManagerProps());
rsmProps.put(ServerConfigs.BROKER_ID_CONFIG, brokerId);
remoteLogStorageManager.configure(rsmProps);
}
RemoteLogMetadataManager createRemoteLogMetadataManager() {
return SecurityManagerCompatibility.get().doPrivileged(() -> {
final String classPath = rlmConfig.remoteLogMetadataManagerClassPath();
if (classPath != null && !classPath.trim().isEmpty()) {
ClassLoader classLoader = new ChildFirstClassLoader(classPath, this.getClass().getClassLoader());
RemoteLogMetadataManager delegate = createDelegate(classLoader, rlmConfig.remoteLogMetadataManagerClassName());
return (RemoteLogMetadataManager) new ClassLoaderAwareRemoteLogMetadataManager(delegate, classLoader);
} else {
return createDelegate(this.getClass().getClassLoader(), rlmConfig.remoteLogMetadataManagerClassName());
}
});
}
public void onEndPointCreated(EndPoint endpoint) {
this.endpoint = Optional.of(endpoint);
}
private void configureRLMM() {
final Map<String, Object> rlmmProps = new HashMap<>();
endpoint.ifPresent(e -> {
rlmmProps.put(REMOTE_LOG_METADATA_COMMON_CLIENT_PREFIX + "bootstrap.servers", e.host() + ":" + e.port());
rlmmProps.put(REMOTE_LOG_METADATA_COMMON_CLIENT_PREFIX + "security.protocol", e.securityProtocol().name);
});
// update the remoteLogMetadataProps here to override endpoint config if any
rlmmProps.putAll(rlmConfig.remoteLogMetadataManagerProps());
rlmmProps.put(ServerConfigs.BROKER_ID_CONFIG, brokerId);
rlmmProps.put(LOG_DIR_CONFIG, logDir);
rlmmProps.put("cluster.id", clusterId);
remoteLogMetadataManager.configure(rlmmProps);
}
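/*
 * Illustrative sketch of the assembled RLMM properties (values are hypothetical):
 *
 *   <REMOTE_LOG_METADATA_COMMON_CLIENT_PREFIX>bootstrap.servers = broker-0:9092   // derived from this broker's endpoint
 *   <REMOTE_LOG_METADATA_COMMON_CLIENT_PREFIX>security.protocol = PLAINTEXT
 *   broker.id = 0
 *   log.dir = /var/lib/kafka/data
 *   cluster.id = <clusterId>
 *
 * Entries from remoteLogMetadataManagerProps() are applied on top of the endpoint-derived entries, so
 * user-supplied client settings override those defaults.
 */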
public void startup() {
// Initialize and configure RSM and RLMM. This may start RSM and RLMM resources, which in turn may need
// to establish connections to the brokers or to remote storage.
configureRSM();
configureRLMM();
remoteLogManagerConfigured = true;
}
private boolean isRemoteLogManagerConfigured() {
return this.remoteLogManagerConfigured;
}
public RemoteStorageManager storageManager() {
return remoteLogStorageManager;
}
private Stream<Partition> filterPartitions(Set<Partition> partitions) {
// We are not specifically checking for internal topics etc here as `log.remoteLogEnabled()` already handles that.
return partitions.stream().filter(partition -> partition.log().exists(UnifiedLog::remoteLogEnabled));
}
private void cacheTopicPartitionIds(TopicIdPartition topicIdPartition) {
Uuid previousTopicId = topicIdByPartitionMap.put(topicIdPartition.topicPartition(), topicIdPartition.topicId());
if (previousTopicId != null && !previousTopicId.equals(topicIdPartition.topicId())) {
LOGGER.info("Previous cached topic id {} for {} does not match updated topic id {}",
previousTopicId, topicIdPartition.topicPartition(), topicIdPartition.topicId());
}
}
/**
* Callback to receive any leadership changes for the topic partitions assigned to this broker. If there is no
* existing task for a given topic partition, a new leader or follower task is assigned; otherwise, the existing
* task is converted to the respective target state (leader or follower).
*
* @param partitionsBecomeLeader partitions that have become leaders on this broker.
* @param partitionsBecomeFollower partitions that have become followers on this broker.
* @param topicIds topic name to topic id mappings.
*/
public void onLeadershipChange(Set<Partition> partitionsBecomeLeader,
Set<Partition> partitionsBecomeFollower,
Map<String, Uuid> topicIds) {
LOGGER.debug("Received leadership changes for leaders: {} and followers: {}", partitionsBecomeLeader, partitionsBecomeFollower);
if (rlmConfig.isRemoteStorageSystemEnabled() && !isRemoteLogManagerConfigured()) {
throw new KafkaException("RemoteLogManager is not configured when remote storage system is enabled");
}
Map<TopicIdPartition, Boolean> leaderPartitions = filterPartitions(partitionsBecomeLeader)
.collect(Collectors.toMap(p -> new TopicIdPartition(topicIds.get(p.topic()), p.topicPartition()),
p -> p.log().exists(log -> log.config().remoteLogCopyDisable())));
Map<TopicIdPartition, Boolean> followerPartitions = filterPartitions(partitionsBecomeFollower)
.collect(Collectors.toMap(p -> new TopicIdPartition(topicIds.get(p.topic()), p.topicPartition()),
p -> p.log().exists(log -> log.config().remoteLogCopyDisable())));
if (!leaderPartitions.isEmpty() || !followerPartitions.isEmpty()) {
LOGGER.debug("Effective topic partitions after filtering compact and internal topics, leaders: {} and followers: {}",
leaderPartitions, followerPartitions);
leaderPartitions.forEach((tp, __) -> cacheTopicPartitionIds(tp));
followerPartitions.forEach((tp, __) -> cacheTopicPartitionIds(tp));
remoteLogMetadataManager.onPartitionLeadershipChanges(leaderPartitions.keySet(), followerPartitions.keySet());
followerPartitions.forEach((tp, __) -> doHandleFollowerPartition(tp));
// If this node was the previous leader for the partition, then the RLMTask might be running in the
// background thread and might emit metrics. So, removing the metrics after marking this node as follower.
followerPartitions.forEach((tp, __) -> removeRemoteTopicPartitionMetrics(tp));
leaderPartitions.forEach(this::doHandleLeaderPartition);
}
}
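/*
 * Illustrative call sketch, assuming hypothetical `rlm`, `leaderPartitions` and `followerPartitions`
 * collections provided by the replica manager:
 *
 *   Map<String, Uuid> topicIds = Collections.singletonMap("orders", Uuid.randomUuid());
 *   rlm.onLeadershipChange(leaderPartitions, followerPartitions, topicIds);
 *
 * Partitions of topics without remote storage enabled are filtered out before any tasks are scheduled.
 */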
public void stopLeaderCopyRLMTasks(Set<Partition> partitions) {
for (Partition partition : partitions) {
TopicPartition tp = partition.topicPartition();
if (topicIdByPartitionMap.containsKey(tp)) {
TopicIdPartition tpId = new TopicIdPartition(topicIdByPartitionMap.get(tp), tp);
leaderCopyRLMTasks.computeIfPresent(tpId, (topicIdPartition, task) -> {
LOGGER.info("Cancelling the copy RLM task for tpId: {}", tpId);
task.cancel();
LOGGER.info("Resetting remote copy lag metrics for tpId: {}", tpId);
((RLMCopyTask) task.rlmTask).resetLagStats();
return null;
});
}
}
}
/**
* Stops the remote-log-manager tasks for the given partitions, calls
* {@link RemoteLogMetadataManager#onStopPartitions(Set)} when {@link StopPartition#deleteLocalLog()} is true,
* and deletes the partitions from the remote storage when {@link StopPartition#deleteRemoteLog()} is true.
*
* @param stopPartitions topic partitions that need to be stopped.
* @param errorHandler callback to handle any errors while stopping the partitions.
*/
public void stopPartitions(Set<StopPartition> stopPartitions,
BiConsumer<TopicPartition, Throwable> errorHandler) {
LOGGER.debug("Stop partitions: {}", stopPartitions);
for (StopPartition stopPartition: stopPartitions) {
TopicPartition tp = stopPartition.topicPartition();
try {
if (topicIdByPartitionMap.containsKey(tp)) {
TopicIdPartition tpId = new TopicIdPartition(topicIdByPartitionMap.get(tp), tp);
leaderCopyRLMTasks.computeIfPresent(tpId, (topicIdPartition, task) -> {
LOGGER.info("Cancelling the copy RLM task for tpId: {}", tpId);
task.cancel();
return null;
});
leaderExpirationRLMTasks.computeIfPresent(tpId, (topicIdPartition, task) -> {
LOGGER.info("Cancelling the expiration RLM task for tpId: {}", tpId);
task.cancel();
return null;
});
followerRLMTasks.computeIfPresent(tpId, (topicIdPartition, task) -> {
LOGGER.info("Cancelling the follower RLM task for tpId: {}", tpId);
task.cancel();
return null;
});
removeRemoteTopicPartitionMetrics(tpId);
if (stopPartition.deleteRemoteLog()) {
LOGGER.info("Deleting the remote log segments task for partition: {}", tpId);
deleteRemoteLogPartition(tpId);
}
} else {
LOGGER.warn("StopPartition call is not expected for partition: {}", tp);
}
} catch (Exception ex) {
errorHandler.accept(tp, ex);
LOGGER.error("Error while stopping the partition: {}", stopPartition, ex);
}
}
// We want to remove the topicId mapping and call stopPartitions on RLMM for deleteLocalLog or stopRLMM partitions
// because, in both cases, the topic will no longer be held on this broker.
// NOTE: In ZK mode, this#stopPartitions method is called when Replica state changes to Offline and ReplicaDeletionStarted
Set<TopicIdPartition> pendingActionsPartitions = stopPartitions.stream()
.filter(sp -> (sp.stopRemoteLogMetadataManager() || sp.deleteLocalLog()) && topicIdByPartitionMap.containsKey(sp.topicPartition()))
.map(sp -> new TopicIdPartition(topicIdByPartitionMap.get(sp.topicPartition()), sp.topicPartition()))
.collect(Collectors.toSet());
if (!pendingActionsPartitions.isEmpty()) {
pendingActionsPartitions.forEach(tpId -> topicIdByPartitionMap.remove(tpId.topicPartition()));
remoteLogMetadataManager.onStopPartitions(pendingActionsPartitions);
}
}
private void deleteRemoteLogPartition(TopicIdPartition partition) throws RemoteStorageException, ExecutionException, InterruptedException {
List<RemoteLogSegmentMetadata> metadataList = new ArrayList<>();
remoteLogMetadataManager.listRemoteLogSegments(partition).forEachRemaining(metadataList::add);
List<RemoteLogSegmentMetadataUpdate> deleteSegmentStartedEvents = metadataList.stream()
.map(metadata ->
new RemoteLogSegmentMetadataUpdate(metadata.remoteLogSegmentId(), time.milliseconds(),
metadata.customMetadata(), RemoteLogSegmentState.DELETE_SEGMENT_STARTED, brokerId))
.collect(Collectors.toList());
publishEvents(deleteSegmentStartedEvents).get();
// KAFKA-15313: Delete remote log segments partition asynchronously when a partition is deleted.
Collection<Uuid> deletedSegmentIds = new ArrayList<>();
for (RemoteLogSegmentMetadata metadata: metadataList) {
deletedSegmentIds.add(metadata.remoteLogSegmentId().id());
remoteLogStorageManager.deleteLogSegmentData(metadata);
}
indexCache.removeAll(deletedSegmentIds);
List<RemoteLogSegmentMetadataUpdate> deleteSegmentFinishedEvents = metadataList.stream()
.map(metadata ->
new RemoteLogSegmentMetadataUpdate(metadata.remoteLogSegmentId(), time.milliseconds(),
metadata.customMetadata(), RemoteLogSegmentState.DELETE_SEGMENT_FINISHED, brokerId))
.collect(Collectors.toList());
publishEvents(deleteSegmentFinishedEvents).get();
}
private CompletableFuture<Void> publishEvents(List<RemoteLogSegmentMetadataUpdate> events) throws RemoteStorageException {
List<CompletableFuture<Void>> result = new ArrayList<>();
for (RemoteLogSegmentMetadataUpdate event : events) {
result.add(remoteLogMetadataManager.updateRemoteLogSegmentMetadata(event));
}
return CompletableFuture.allOf(result.toArray(new CompletableFuture[0]));
}
public Optional<RemoteLogSegmentMetadata> fetchRemoteLogSegmentMetadata(TopicPartition topicPartition,
int epochForOffset,
long offset) throws RemoteStorageException {
Uuid topicId = topicIdByPartitionMap.get(topicPartition);
if (topicId == null) {
throw new KafkaException("No topic id registered for topic partition: " + topicPartition);
}
return remoteLogMetadataManager.remoteLogSegmentMetadata(new TopicIdPartition(topicId, topicPartition), epochForOffset, offset);
}
private Optional<FileRecords.TimestampAndOffset> lookupTimestamp(RemoteLogSegmentMetadata rlsMetadata, long timestamp, long startingOffset)
throws RemoteStorageException, IOException {
int startPos = indexCache.lookupTimestamp(rlsMetadata, timestamp, startingOffset);
InputStream remoteSegInputStream = null;
try {
// Stream forward from startPos and return the first record whose timestamp and offset are at or beyond the targets.
remoteSegInputStream = remoteLogStorageManager.fetchLogSegment(rlsMetadata, startPos);
RemoteLogInputStream remoteLogInputStream = new RemoteLogInputStream(remoteSegInputStream);
while (true) {
RecordBatch batch = remoteLogInputStream.nextBatch();
if (batch == null) break;
if (batch.maxTimestamp() >= timestamp && batch.lastOffset() >= startingOffset) {
try (CloseableIterator<Record> recordStreamingIterator = batch.streamingIterator(BufferSupplier.NO_CACHING)) {
while (recordStreamingIterator.hasNext()) {
Record record = recordStreamingIterator.next();
if (record.timestamp() >= timestamp && record.offset() >= startingOffset)
return Optional.of(new FileRecords.TimestampAndOffset(record.timestamp(), record.offset(), maybeLeaderEpoch(batch.partitionLeaderEpoch())));
}
}
}
}
return Optional.empty();
} finally {
Utils.closeQuietly(remoteSegInputStream, "RemoteLogSegmentInputStream");
}
}
private Optional<Integer> maybeLeaderEpoch(int leaderEpoch) {
return leaderEpoch == RecordBatch.NO_PARTITION_LEADER_EPOCH ? Optional.empty() : Optional.of(leaderEpoch);
}
/**
* Search the message offset in the remote storage based on timestamp and offset.
*
* This method returns an option of TimestampOffset. The returned value is determined using the following ordered list of rules:
*
* - If there are no messages in the remote storage, return Empty
* - If all the messages in the remote storage have smaller offsets, return Empty
* - If all the messages in the remote storage have smaller timestamps, return Empty
* - Otherwise, return an option of TimestampOffset. The offset is the offset of the first message whose timestamp
* is greater than or equal to the target timestamp and whose offset is greater than or equal to the startingOffset.
*
* @param tp topic partition in which the offset is to be found.
* @param timestamp The timestamp to search for.
* @param startingOffset The starting offset to search.
* @param leaderEpochCache LeaderEpochFileCache of the topic partition.
* @return the timestamp and offset of the first message that meets the requirements. Empty will be returned if there
* is no such message.
*/
public Optional<FileRecords.TimestampAndOffset> findOffsetByTimestamp(TopicPartition tp,
long timestamp,
long startingOffset,
LeaderEpochFileCache leaderEpochCache) throws RemoteStorageException, IOException {
Uuid topicId = topicIdByPartitionMap.get(tp);
if (topicId == null) {
throw new KafkaException("Topic id does not exist for topic partition: " + tp);
}
Optional<UnifiedLog> unifiedLogOptional = fetchLog.apply(tp);
if (!unifiedLogOptional.isPresent()) {
throw new KafkaException("UnifiedLog does not exist for topic partition: " + tp);
}
UnifiedLog unifiedLog = unifiedLogOptional.get();
// Get the respective epoch in which the starting-offset exists.
OptionalInt maybeEpoch = leaderEpochCache.epochForOffset(startingOffset);
TopicIdPartition topicIdPartition = new TopicIdPartition(topicId, tp);
NavigableMap<Integer, Long> epochWithOffsets = buildFilteredLeaderEpochMap(leaderEpochCache.epochWithOffsets());
while (maybeEpoch.isPresent()) {
int epoch = maybeEpoch.getAsInt();
// KAFKA-15802: Add a new API for RLMM to choose how to implement the predicate.
// currently, all segments are returned and then iterated, and filtered
Iterator<RemoteLogSegmentMetadata> iterator = remoteLogMetadataManager.listRemoteLogSegments(topicIdPartition, epoch);
while (iterator.hasNext()) {
RemoteLogSegmentMetadata rlsMetadata = iterator.next();
if (rlsMetadata.maxTimestampMs() >= timestamp
&& rlsMetadata.endOffset() >= startingOffset
&& isRemoteSegmentWithinLeaderEpochs(rlsMetadata, unifiedLog.logEndOffset(), epochWithOffsets)
&& rlsMetadata.state().equals(RemoteLogSegmentState.COPY_SEGMENT_FINISHED)) {
return lookupTimestamp(rlsMetadata, timestamp, startingOffset);
}
}
// Move to the next epoch if not found with the current epoch.
maybeEpoch = leaderEpochCache.nextEpoch(epoch);
}
return Optional.empty();
}
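/*
 * Illustrative lookup sketch, assuming a hypothetical `rlm` instance and a target timestamp:
 *
 *   Optional<FileRecords.TimestampAndOffset> found =
 *       rlm.findOffsetByTimestamp(tp, targetTimestampMs, log.logStartOffset(), leaderEpochCache);
 *   found.ifPresent(tso -> System.out.println("offset=" + tso.offset + " timestamp=" + tso.timestamp));
 *
 * The search walks the leader-epoch lineage starting at the epoch of the starting offset and only
 * considers segments in COPY_SEGMENT_FINISHED state.
 */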
private abstract static class CancellableRunnable implements Runnable {
private volatile boolean cancelled = false;
public void cancel() {
cancelled = true;
}
public boolean isCancelled() {
return cancelled;
}
}
/**
* Returns the leader epoch entries within the range of the given start (inclusive) and end (exclusive) offsets.
*
* Visible for testing.
*
* @param log The actual log from where to take the leader-epoch checkpoint
* @param startOffset The start offset of the epoch entries (inclusive).
* If start offset is 6, then it will retain an entry at offset 6.
* @param endOffset The end offset of the epoch entries (exclusive)
* If end offset is 100, then it will remove the entries greater than or equal to 100.
* @return the leader epoch entries
*/
List<EpochEntry> getLeaderEpochEntries(UnifiedLog log, long startOffset, long endOffset) {
if (log.leaderEpochCache().isDefined()) {
return log.leaderEpochCache().get().epochEntriesInRange(startOffset, endOffset);
} else {
return Collections.emptyList();
}
}
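/*
 * Boundary example for the method above: with entries [(epoch=1, startOffset=50), (epoch=2, startOffset=120)],
 * getLeaderEpochEntries(log, 50, 120) retains the entry starting at offset 50 (start is inclusive) and drops
 * the entry starting at offset 120 (end is exclusive), matching the @param examples in the javadoc.
 */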
// VisibleForTesting
RLMTask rlmCopyTask(TopicIdPartition topicIdPartition) {
RLMTaskWithFuture task = leaderCopyRLMTasks.get(topicIdPartition);
if (task != null) {
return task.rlmTask;
}
return null;
}
abstract class RLMTask extends CancellableRunnable {
protected final TopicIdPartition topicIdPartition;
private final Logger logger;
public RLMTask(TopicIdPartition topicIdPartition) {
this.topicIdPartition = topicIdPartition;
this.logger = getLogContext().logger(RLMTask.class);
}
protected LogContext getLogContext() {
return new LogContext("[RemoteLogManager=" + brokerId + " partition=" + topicIdPartition + "] ");
}
public void run() {
if (isCancelled())
return;
try {
Optional<UnifiedLog> unifiedLogOptional = fetchLog.apply(topicIdPartition.topicPartition());
if (!unifiedLogOptional.isPresent()) {
return;
}
execute(unifiedLogOptional.get());
} catch (InterruptedException ex) {
if (!isCancelled()) {
logger.warn("Current thread for topic-partition-id {} is interrupted", topicIdPartition, ex);
}
} catch (RetriableException ex) {
logger.debug("Encountered a retryable error while executing current task for topic-partition {}", topicIdPartition, ex);
} catch (Exception ex) {
if (!isCancelled()) {
logger.warn("Current task for topic-partition {} received error but it will be scheduled", topicIdPartition, ex);
}
}
}
protected abstract void execute(UnifiedLog log) throws InterruptedException, RemoteStorageException, ExecutionException;
public String toString() {
return this.getClass() + "[" + topicIdPartition + "]";
}
}
class RLMCopyTask extends RLMTask {
private final int customMetadataSizeLimit;
private final Logger logger;
// The copied offset and the log-start offset are empty initially for a new RLMCopyTask, and need to be
// fetched inside the task's run() method.
private volatile Optional<OffsetAndEpoch> copiedOffsetOption = Optional.empty();
private volatile boolean isLogStartOffsetUpdated = false;
private volatile Optional<String> logDirectory = Optional.empty();
public RLMCopyTask(TopicIdPartition topicIdPartition, int customMetadataSizeLimit) {
super(topicIdPartition);
this.customMetadataSizeLimit = customMetadataSizeLimit;
this.logger = getLogContext().logger(RLMCopyTask.class);
}
@Override
protected void execute(UnifiedLog log) throws InterruptedException {
// In the first run after an intra-broker log-directory move completes, make sure the state is reset. (KAFKA-16711)
if (!log.parentDir().equals(logDirectory.orElse(null))) {
copiedOffsetOption = Optional.empty();
isLogStartOffsetUpdated = false;
logDirectory = Optional.of(log.parentDir());
}
copyLogSegmentsToRemote(log);
}
private void maybeUpdateLogStartOffsetOnBecomingLeader(UnifiedLog log) throws RemoteStorageException {
if (!isLogStartOffsetUpdated) {
long logStartOffset = findLogStartOffset(topicIdPartition, log);
updateRemoteLogStartOffset.accept(topicIdPartition.topicPartition(), logStartOffset);
isLogStartOffsetUpdated = true;
logger.info("Found the logStartOffset: {} for partition: {} after becoming leader",
logStartOffset, topicIdPartition);
}
}
private void maybeUpdateCopiedOffset(UnifiedLog log) throws RemoteStorageException {
if (!copiedOffsetOption.isPresent()) {
// This is found by traversing backwards from the latest leader epoch in the leader-epoch history and finding
// the highest offset of a segment with that epoch copied into remote storage. If no entry is found for an
// epoch, the previous leader epoch is checked, and so on. If there are no entries up to the earliest leader
// epoch in the leader-epoch cache, copying starts from the earliest epoch entry's offset.
copiedOffsetOption = Optional.of(findHighestRemoteOffset(topicIdPartition, log));
logger.info("Found the highest copiedRemoteOffset: {} for partition: {} after becoming leader", copiedOffsetOption, topicIdPartition);
copiedOffsetOption.ifPresent(offsetAndEpoch -> log.updateHighestOffsetInRemoteStorage(offsetAndEpoch.offset()));
}
}
/**
* Segments which match the following criteria are eligible for copying to remote storage:
* 1) Segment is not the active segment and
* 2) Segment end-offset is less than the last-stable-offset as remote storage should contain only
* committed/acked messages
* @param log The log from which the segments are to be copied
* @param fromOffset The offset from which the segments are to be copied
* @param lastStableOffset The last stable offset of the log
* @return candidate log segments to be copied to remote storage
*/
List<EnrichedLogSegment> candidateLogSegments(UnifiedLog log, Long fromOffset, Long lastStableOffset) {
List<EnrichedLogSegment> candidateLogSegments = new ArrayList<>();
List<LogSegment> segments = JavaConverters.seqAsJavaList(log.logSegments(fromOffset, Long.MAX_VALUE).toSeq());
if (!segments.isEmpty()) {
for (int idx = 1; idx < segments.size(); idx++) {
LogSegment previousSeg = segments.get(idx - 1);
LogSegment currentSeg = segments.get(idx);
if (currentSeg.baseOffset() <= lastStableOffset) {
candidateLogSegments.add(new EnrichedLogSegment(previousSeg, currentSeg.baseOffset()));
}
}
// The last (active) segment is implicitly discarded: only previousSeg entries are ever added.
}
return candidateLogSegments;
}
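/*
 * Worked example for the selection above: with segments at base offsets [0, 100, 200] plus the active
 * segment at 300 and lastStableOffset = 250, the candidates are the segments starting at 0 and 100
 * (each enriched with the next segment's base offset as its upper bound). The segment at 200 is skipped
 * because its upper bound (300) is above the LSO, and the active segment itself is never copied.
 */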
public void copyLogSegmentsToRemote(UnifiedLog log) throws InterruptedException {
if (isCancelled())
return;
try {
maybeUpdateLogStartOffsetOnBecomingLeader(log);
maybeUpdateCopiedOffset(log);
long copiedOffset = copiedOffsetOption.get().offset();
// The LSO indicates that offsets below it are ready to be consumed (high-watermark or committed).
long lso = log.lastStableOffset();
if (lso < 0) {
logger.warn("lastStableOffset for partition {} is {}, which should not be negative.", topicIdPartition, lso);
} else if (lso > 0 && copiedOffset < lso) {
// log-start-offset can be ahead of the copied-offset, when:
// 1) log-start-offset gets incremented via delete-records API (or)
// 2) enabling the remote log for the first time
long fromOffset = Math.max(copiedOffset + 1, log.logStartOffset());
List<EnrichedLogSegment> candidateLogSegments = candidateLogSegments(log, fromOffset, lso);
logger.debug("Candidate log segments, logStartOffset: {}, copiedOffset: {}, fromOffset: {}, lso: {} " +
"and candidateLogSegments: {}", log.logStartOffset(), copiedOffset, fromOffset, lso, candidateLogSegments);
if (candidateLogSegments.isEmpty()) {
logger.debug("No segments found to be copied for partition {} with copiedOffset: {} and active segment's base-offset: {}",
topicIdPartition, copiedOffset, log.activeSegment().baseOffset());
} else {
for (EnrichedLogSegment candidateLogSegment : candidateLogSegments) {
if (isCancelled()) {
logger.info("Skipping copying log segments as the current task state is changed, cancelled: {}",
isCancelled());
return;
}
copyQuotaManagerLock.lock();
try {
long throttleTimeMs = rlmCopyQuotaManager.getThrottleTimeMs();
while (throttleTimeMs > 0) {
copyThrottleTimeSensor.record(throttleTimeMs, time.milliseconds());
logger.debug("Quota exceeded for copying log segments, waiting for the quota to be available.");
// If the thread gets interrupted while waiting, the InterruptedException is thrown
// back to the caller. It's important to note that the task being executed is already
// cancelled before the executing thread is interrupted. The caller is responsible
// for handling the exception gracefully by checking if the task is already cancelled.
boolean ignored = copyQuotaManagerLockCondition.await(quotaTimeout().toMillis(), TimeUnit.MILLISECONDS);
throttleTimeMs = rlmCopyQuotaManager.getThrottleTimeMs();
}
rlmCopyQuotaManager.record(candidateLogSegment.logSegment.log().sizeInBytes());
// Signal waiting threads to check the quota again
copyQuotaManagerLockCondition.signalAll();
} finally {
copyQuotaManagerLock.unlock();
}
RemoteLogSegmentId segmentId = RemoteLogSegmentId.generateNew(topicIdPartition);
segmentIdsBeingCopied.add(segmentId);
try {
copyLogSegment(log, candidateLogSegment.logSegment, segmentId, candidateLogSegment.nextSegmentOffset);
} finally {
segmentIdsBeingCopied.remove(segmentId);
}
}
}
} else {
logger.debug("Skipping copying segments, current read-offset:{}, and LSO:{}", copiedOffset, lso);
}
} catch (CustomMetadataSizeLimitExceededException e) {
// Only stop this task. Logging is done where the exception is thrown.
brokerTopicStats.topicStats(log.topicPartition().topic()).failedRemoteCopyRequestRate().mark();
brokerTopicStats.allTopicsStats().failedRemoteCopyRequestRate().mark();
this.cancel();
} catch (InterruptedException | RetriableException ex) {
throw ex;
} catch (Exception ex) {
if (!isCancelled()) {
brokerTopicStats.topicStats(log.topicPartition().topic()).failedRemoteCopyRequestRate().mark();
brokerTopicStats.allTopicsStats().failedRemoteCopyRequestRate().mark();
logger.error("Error occurred while copying log segments of partition: {}", topicIdPartition, ex);
}
}
}
private void copyLogSegment(UnifiedLog log, LogSegment segment, RemoteLogSegmentId segmentId, long nextSegmentBaseOffset)
throws InterruptedException, ExecutionException, RemoteStorageException, IOException,
CustomMetadataSizeLimitExceededException {
File logFile = segment.log().file();
String logFileName = logFile.getName();
logger.info("Copying {} to remote storage.", logFileName);
long endOffset = nextSegmentBaseOffset - 1;
File producerStateSnapshotFile = log.producerStateManager().fetchSnapshot(nextSegmentBaseOffset).orElse(null);
List<EpochEntry> epochEntries = getLeaderEpochEntries(log, segment.baseOffset(), nextSegmentBaseOffset);
Map<Integer, Long> segmentLeaderEpochs = new HashMap<>(epochEntries.size());
epochEntries.forEach(entry -> segmentLeaderEpochs.put(entry.epoch, entry.startOffset));
RemoteLogSegmentMetadata copySegmentStartedRlsm = new RemoteLogSegmentMetadata(segmentId, segment.baseOffset(), endOffset,
segment.largestTimestamp(), brokerId, time.milliseconds(), segment.log().sizeInBytes(),
segmentLeaderEpochs);
remoteLogMetadataManager.addRemoteLogSegmentMetadata(copySegmentStartedRlsm).get();
ByteBuffer leaderEpochsIndex = epochEntriesAsByteBuffer(getLeaderEpochEntries(log, -1, nextSegmentBaseOffset));
LogSegmentData segmentData = new LogSegmentData(logFile.toPath(), toPathIfExists(segment.offsetIndex().file()),
toPathIfExists(segment.timeIndex().file()), Optional.ofNullable(toPathIfExists(segment.txnIndex().file())),
producerStateSnapshotFile.toPath(), leaderEpochsIndex);
brokerTopicStats.topicStats(log.topicPartition().topic()).remoteCopyRequestRate().mark();
brokerTopicStats.allTopicsStats().remoteCopyRequestRate().mark();
Optional<CustomMetadata> customMetadata = Optional.empty();
try {
customMetadata = remoteLogStorageManager.copyLogSegmentData(copySegmentStartedRlsm, segmentData);
} catch (RemoteStorageException e) {
try {
remoteLogStorageManager.deleteLogSegmentData(copySegmentStartedRlsm);
logger.info("Successfully cleaned segment {} after failing to copy segment", segmentId);
} catch (RemoteStorageException e1) {
logger.error("Error while cleaning segment {}, consider cleaning manually", segmentId, e1);
}
throw e;
}
RemoteLogSegmentMetadataUpdate copySegmentFinishedRlsm = new RemoteLogSegmentMetadataUpdate(segmentId, time.milliseconds(),
customMetadata, RemoteLogSegmentState.COPY_SEGMENT_FINISHED, brokerId);
if (customMetadata.isPresent()) {
long customMetadataSize = customMetadata.get().value().length;
if (customMetadataSize > this.customMetadataSizeLimit) {
CustomMetadataSizeLimitExceededException e = new CustomMetadataSizeLimitExceededException();
logger.error("Custom metadata size {} exceeds configured limit {}." +
" Copying will be stopped and copied segment will be attempted to clean." +
" Original metadata: {}",
customMetadataSize, this.customMetadataSizeLimit, copySegmentStartedRlsm, e);
try {
// For deletion, we provide back the custom metadata by creating a new metadata object from the update.
// However, the update itself will not be stored in this case.
remoteLogStorageManager.deleteLogSegmentData(copySegmentStartedRlsm.createWithUpdates(copySegmentFinishedRlsm));
logger.info("Successfully cleaned segment after custom metadata size exceeded");
} catch (RemoteStorageException e1) {
logger.error("Error while cleaning segment after custom metadata size exceeded, consider cleaning manually", e1);
}
throw e;
}
}
remoteLogMetadataManager.updateRemoteLogSegmentMetadata(copySegmentFinishedRlsm).get();
brokerTopicStats.topicStats(log.topicPartition().topic())
.remoteCopyBytesRate().mark(copySegmentStartedRlsm.segmentSizeInBytes());
brokerTopicStats.allTopicsStats().remoteCopyBytesRate().mark(copySegmentStartedRlsm.segmentSizeInBytes());
// `epochEntries` cannot be empty, there is a pre-condition validation in RemoteLogSegmentMetadata
// constructor
int lastEpochInSegment = epochEntries.get(epochEntries.size() - 1).epoch;
copiedOffsetOption = Optional.of(new OffsetAndEpoch(endOffset, lastEpochInSegment));
// Update the highest offset in remote storage for this partition's log so that the local log segments
// are not deleted before they are copied to remote storage.
log.updateHighestOffsetInRemoteStorage(endOffset);
logger.info("Copied {} to remote storage with segment-id: {}", logFileName, copySegmentFinishedRlsm.remoteLogSegmentId());
long bytesLag = log.onlyLocalLogSegmentsSize() - log.activeSegment().size();
long segmentsLag = log.onlyLocalLogSegmentsCount() - 1;
recordLagStats(bytesLag, segmentsLag);
}
// VisibleForTesting
void recordLagStats(long bytesLag, long segmentsLag) {
if (!isCancelled()) {
String topic = topicIdPartition.topic();
int partition = topicIdPartition.partition();
brokerTopicStats.recordRemoteCopyLagBytes(topic, partition, bytesLag);
brokerTopicStats.recordRemoteCopyLagSegments(topic, partition, segmentsLag);
}
}
void resetLagStats() {
String topic = topicIdPartition.topic();
int partition = topicIdPartition.partition();
brokerTopicStats.recordRemoteCopyLagBytes(topic, partition, 0);
brokerTopicStats.recordRemoteCopyLagSegments(topic, partition, 0);
}
private Path toPathIfExists(File file) {
return file.exists() ? file.toPath() : null;
}
}
class RLMExpirationTask extends RLMTask {
private final Logger logger;
public RLMExpirationTask(TopicIdPartition topicIdPartition) {
super(topicIdPartition);
this.logger = getLogContext().logger(RLMExpirationTask.class);
}
@Override
protected void execute(UnifiedLog log) throws InterruptedException, RemoteStorageException, ExecutionException {
// Cleanup/delete expired remote log segments
cleanupExpiredRemoteLogSegments();
}
public void handleLogStartOffsetUpdate(TopicPartition topicPartition, long remoteLogStartOffset) {
logger.debug("Updating {} with remoteLogStartOffset: {}", topicPartition, remoteLogStartOffset);
updateRemoteLogStartOffset.accept(topicPartition, remoteLogStartOffset);
}
class RemoteLogRetentionHandler {
private final Optional<RetentionSizeData> retentionSizeData;
private final Optional<RetentionTimeData> retentionTimeData;
private long remainingBreachedSize;
private OptionalLong logStartOffset = OptionalLong.empty();
public RemoteLogRetentionHandler(Optional<RetentionSizeData> retentionSizeData, Optional<RetentionTimeData> retentionTimeData) {
this.retentionSizeData = retentionSizeData;
this.retentionTimeData = retentionTimeData;
remainingBreachedSize = retentionSizeData.map(sizeData -> sizeData.remainingBreachedSize).orElse(0L);
}
private boolean isSegmentBreachedByRetentionSize(RemoteLogSegmentMetadata metadata) {
boolean shouldDeleteSegment = false;
if (!retentionSizeData.isPresent()) {
return shouldDeleteSegment;
}
// Assumption that segments contain size >= 0
if (remainingBreachedSize > 0) {
long remainingBytes = remainingBreachedSize - metadata.segmentSizeInBytes();
if (remainingBytes >= 0) {
remainingBreachedSize = remainingBytes;
shouldDeleteSegment = true;
}
}
if (shouldDeleteSegment) {
if (!logStartOffset.isPresent() || logStartOffset.getAsLong() < metadata.endOffset() + 1) {
logStartOffset = OptionalLong.of(metadata.endOffset() + 1);
}
logger.info("About to delete remote log segment {} due to retention size {} breach. Log size after deletion will be {}.",
metadata.remoteLogSegmentId(), retentionSizeData.get().retentionSize, remainingBreachedSize + retentionSizeData.get().retentionSize);
}
return shouldDeleteSegment;
}
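/*
 * Worked example (assuming remainingBreachedSize is seeded with totalLogSize - retention.bytes when the
 * RetentionSizeData is built): with retention.bytes = 1 GiB and a combined remote-plus-local size of
 * 1.5 GiB, the budget starts at 0.5 GiB, and the oldest eligible segments are marked for deletion until
 * that budget is used up.
 */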
public boolean isSegmentBreachedByRetentionTime(RemoteLogSegmentMetadata metadata) {
boolean shouldDeleteSegment = false;
if (!retentionTimeData.isPresent()) {
return shouldDeleteSegment;
}
shouldDeleteSegment = metadata.maxTimestampMs() <= retentionTimeData.get().cleanupUntilMs;
if (shouldDeleteSegment) {
remainingBreachedSize = Math.max(0, remainingBreachedSize - metadata.segmentSizeInBytes());
// It is fine to have logStartOffset as `metadata.endOffset() + 1` as the segment offset intervals
// are ascending within an epoch.
if (!logStartOffset.isPresent() || logStartOffset.getAsLong() < metadata.endOffset() + 1) {
logStartOffset = OptionalLong.of(metadata.endOffset() + 1);
}
logger.info("About to delete remote log segment {} due to retention time {}ms breach based on the largest record timestamp in the segment",
metadata.remoteLogSegmentId(), retentionTimeData.get().retentionMs);
}
return shouldDeleteSegment;
}
private boolean isSegmentBreachByLogStartOffset(RemoteLogSegmentMetadata metadata,
long logStartOffset,
NavigableMap<Integer, Long> leaderEpochEntries) {
boolean shouldDeleteSegment = false;
if (!leaderEpochEntries.isEmpty()) {
// Note that `logStartOffset` and `leaderEpochEntries.firstEntry().getValue()` should be the same.
Integer firstEpoch = leaderEpochEntries.firstKey();
shouldDeleteSegment = metadata.segmentLeaderEpochs().keySet().stream().allMatch(epoch -> epoch <= firstEpoch)
&& metadata.endOffset() < logStartOffset;
}
if (shouldDeleteSegment) {
logger.info("About to delete remote log segment {} due to log-start-offset {} breach. " +
"Current earliest-epoch-entry: {}, segment-end-offset: {} and segment-epochs: {}",
metadata.remoteLogSegmentId(), logStartOffset, leaderEpochEntries.firstEntry(),
metadata.endOffset(), metadata.segmentLeaderEpochs());
}
return shouldDeleteSegment;
}
// Removes the segments beyond the current leader's earliest epoch. Those segments are considered
// unreferenced because they are not part of the current leader-epoch lineage.
private boolean deleteLogSegmentsDueToLeaderEpochCacheTruncation(EpochEntry earliestEpochEntry,
RemoteLogSegmentMetadata metadata)
throws RemoteStorageException, ExecutionException, InterruptedException {
boolean isSegmentDeleted = deleteRemoteLogSegment(metadata, ignored ->
metadata.segmentLeaderEpochs().keySet().stream().allMatch(epoch -> epoch < earliestEpochEntry.epoch));
if (isSegmentDeleted) {
logger.info("Deleted remote log segment {} due to leader-epoch-cache truncation. " +
"Current earliest-epoch-entry: {}, segment-end-offset: {} and segment-epochs: {}",
metadata.remoteLogSegmentId(), earliestEpochEntry, metadata.endOffset(), metadata.segmentLeaderEpochs().keySet());
}
// No need to update the log-start-offset as these epochs/offsets are earlier to that value.
return isSegmentDeleted;
}
private boolean deleteRemoteLogSegment(RemoteLogSegmentMetadata segmentMetadata, Predicate<RemoteLogSegmentMetadata> predicate)
throws RemoteStorageException, ExecutionException, InterruptedException {
if (predicate.test(segmentMetadata)) {
logger.debug("Deleting remote log segment {}", segmentMetadata.remoteLogSegmentId());
String topic = segmentMetadata.topicIdPartition().topic();
// Publish delete segment started event.
remoteLogMetadataManager.updateRemoteLogSegmentMetadata(
new RemoteLogSegmentMetadataUpdate(segmentMetadata.remoteLogSegmentId(), time.milliseconds(),
segmentMetadata.customMetadata(), RemoteLogSegmentState.DELETE_SEGMENT_STARTED, brokerId)).get();
brokerTopicStats.topicStats(topic).remoteDeleteRequestRate().mark();
brokerTopicStats.allTopicsStats().remoteDeleteRequestRate().mark();
// Delete the segment in remote storage.
try {
remoteLogStorageManager.deleteLogSegmentData(segmentMetadata);
} catch (RemoteStorageException e) {
brokerTopicStats.topicStats(topic).failedRemoteDeleteRequestRate().mark();
brokerTopicStats.allTopicsStats().failedRemoteDeleteRequestRate().mark();
throw e;
}
// Publish delete segment finished event.
remoteLogMetadataManager.updateRemoteLogSegmentMetadata(
new RemoteLogSegmentMetadataUpdate(segmentMetadata.remoteLogSegmentId(), time.milliseconds(),
segmentMetadata.customMetadata(), RemoteLogSegmentState.DELETE_SEGMENT_FINISHED, brokerId)).get();
logger.debug("Deleted remote log segment {}", segmentMetadata.remoteLogSegmentId());
return true;
}
return false;
}
}
private void updateMetadataCountAndLogSizeWith(int metadataCount, long remoteLogSizeBytes) {
int partition = topicIdPartition.partition();
String topic = topicIdPartition.topic();
brokerTopicStats.recordRemoteLogMetadataCount(topic, partition, metadataCount);
brokerTopicStats.recordRemoteLogSizeBytes(topic, partition, remoteLogSizeBytes);
}
private void updateRemoteDeleteLagWith(int segmentsLeftToDelete, long sizeOfDeletableSegmentsBytes) {
String topic = topicIdPartition.topic();
int partition = topicIdPartition.partition();
brokerTopicStats.recordRemoteDeleteLagSegments(topic, partition, segmentsLeftToDelete);
brokerTopicStats.recordRemoteDeleteLagBytes(topic, partition, sizeOfDeletableSegmentsBytes);
}
void cleanupExpiredRemoteLogSegments() throws RemoteStorageException, ExecutionException, InterruptedException {
if (isCancelled()) {
logger.info("Returning from remote log segments cleanup as the task state is changed");
return;
}
final Optional<UnifiedLog> logOptional = fetchLog.apply(topicIdPartition.topicPartition());
if (!logOptional.isPresent()) {
logger.debug("No UnifiedLog instance available for partition: {}", topicIdPartition);
return;
}
final UnifiedLog log = logOptional.get();
final Option<LeaderEpochFileCache> leaderEpochCacheOption = log.leaderEpochCache();
if (leaderEpochCacheOption.isEmpty()) {
logger.debug("No leader epoch cache available for partition: {}", topicIdPartition);
return;
}
// Cleanup remote log segments and update the log start offset if applicable.
final Iterator<RemoteLogSegmentMetadata> segmentMetadataIter = remoteLogMetadataManager.listRemoteLogSegments(topicIdPartition);
if (!segmentMetadataIter.hasNext()) {
updateMetadataCountAndLogSizeWith(0, 0);
logger.debug("No remote log segments available on remote storage for partition: {}", topicIdPartition);
return;
}
final Set<Integer> epochsSet = new HashSet<>();
int metadataCount = 0;
long remoteLogSizeBytes = 0;
// Good to have an API from RLMM to get all the remote leader epochs of all the segments of a partition
// instead of going through all the segments and building it here.
while (segmentMetadataIter.hasNext()) {
RemoteLogSegmentMetadata segmentMetadata = segmentMetadataIter.next();
epochsSet.addAll(segmentMetadata.segmentLeaderEpochs().keySet());
metadataCount++;
remoteLogSizeBytes += segmentMetadata.segmentSizeInBytes();
}
updateMetadataCountAndLogSizeWith(metadataCount, remoteLogSizeBytes);
// All the leader epochs, in sorted order, that exist in remote storage
final List<Integer> remoteLeaderEpochs = new ArrayList<>(epochsSet);
Collections.sort(remoteLeaderEpochs);
LeaderEpochFileCache leaderEpochCache = leaderEpochCacheOption.get();
// Build the leader epoch map by filtering the epochs that do not have any records.
NavigableMap<Integer, Long> epochWithOffsets = buildFilteredLeaderEpochMap(leaderEpochCache.epochWithOffsets());
long logStartOffset = log.logStartOffset();
long logEndOffset = log.logEndOffset();
Optional<RetentionSizeData> retentionSizeData = buildRetentionSizeData(log.config().retentionSize,
log.onlyLocalLogSegmentsSize(), logEndOffset, epochWithOffsets);
Optional<RetentionTimeData> retentionTimeData = buildRetentionTimeData(log.config().retentionMs);
RemoteLogRetentionHandler remoteLogRetentionHandler = new RemoteLogRetentionHandler(retentionSizeData, retentionTimeData);
Iterator<Integer> epochIterator = epochWithOffsets.navigableKeySet().iterator();
boolean canProcess = true;
List<RemoteLogSegmentMetadata> segmentsToDelete = new ArrayList<>();
long sizeOfDeletableSegmentsBytes = 0L;
while (canProcess && epochIterator.hasNext()) {
Integer epoch = epochIterator.next();
Iterator<RemoteLogSegmentMetadata> segmentsIterator = remoteLogMetadataManager.listRemoteLogSegments(topicIdPartition, epoch);
while (canProcess && segmentsIterator.hasNext()) {
if (isCancelled()) {
logger.info("Returning from remote log segments cleanup for the remaining segments as the task state is changed.");
return;
}
RemoteLogSegmentMetadata metadata = segmentsIterator.next();
if (segmentIdsBeingCopied.contains(metadata.remoteLogSegmentId())) {
logger.debug("Copy for the segment {} is currently in process. Skipping cleanup for it and the remaining segments",
metadata.remoteLogSegmentId());
canProcess = false;
continue;
}
if (RemoteLogSegmentState.DELETE_SEGMENT_FINISHED.equals(metadata.state())) {
continue;
}
if (segmentsToDelete.contains(metadata)) {
continue;
}
// When the log-start-offset is moved by the user, the leader-epoch-checkpoint file gets truncated
// as per the log-start-offset. Until the rlm-cleaner-thread runs in the next iteration, those
// remote log segments won't be removed. The `isRemoteSegmentWithinLeaderEpochs` check validates whether
// the epochs present in the segment lie within the checkpoint file. It will always return false here
// since the checkpoint file was already truncated.
boolean shouldDeleteSegment = remoteLogRetentionHandler.isSegmentBreachByLogStartOffset(
metadata, logStartOffset, epochWithOffsets);
boolean isValidSegment = false;
if (!shouldDeleteSegment) {
// Check whether the segment contains the required epoch range within the current leader-epoch lineage.
isValidSegment = isRemoteSegmentWithinLeaderEpochs(metadata, logEndOffset, epochWithOffsets);
if (isValidSegment) {
shouldDeleteSegment =
remoteLogRetentionHandler.isSegmentBreachedByRetentionTime(metadata) ||
remoteLogRetentionHandler.isSegmentBreachedByRetentionSize(metadata);
}
}
if (shouldDeleteSegment) {
segmentsToDelete.add(metadata);
sizeOfDeletableSegmentsBytes += metadata.segmentSizeInBytes();
}
canProcess = shouldDeleteSegment || !isValidSegment;
}
}
// Update log start offset with the computed value after retention cleanup is done
remoteLogRetentionHandler.logStartOffset.ifPresent(offset -> handleLogStartOffsetUpdate(topicIdPartition.topicPartition(), offset));
// At this point in time we have updated the log start offsets, but not initiated a deletion.
// Either a follower has picked up the changes to the log start offset, or they have not.
// If the follower HAS picked up the changes, and they become the leader this replica won't successfully complete
// the deletion.
// However, the new leader will correctly pick up all breaching segments as log start offset breaching ones
// and delete them accordingly.
// If the follower HAS NOT picked up the changes, and they become the leader then they will go through this process
// again and delete them with the original deletion reason i.e. size, time or log start offset breach.
int segmentsLeftToDelete = segmentsToDelete.size();
updateRemoteDeleteLagWith(segmentsLeftToDelete, sizeOfDeletableSegmentsBytes);
List<String> undeletedSegments = new ArrayList<>();
for (RemoteLogSegmentMetadata segmentMetadata : segmentsToDelete) {
if (!remoteLogRetentionHandler.deleteRemoteLogSegment(segmentMetadata, x -> !isCancelled())) {
undeletedSegments.add(segmentMetadata.remoteLogSegmentId().toString());
} else {
sizeOfDeletableSegmentsBytes -= segmentMetadata.segmentSizeInBytes();
segmentsLeftToDelete--;
updateRemoteDeleteLagWith(segmentsLeftToDelete, sizeOfDeletableSegmentsBytes);
}
}
if (!undeletedSegments.isEmpty()) {
logger.info("The following remote segments could not be deleted: {}", String.join(",", undeletedSegments));
}
// Remove the remote log segments whose segment-leader-epochs are less than the earliest-epoch known
// to the leader. This will remove the unreferenced segments in the remote storage. This is needed for
// unclean leader election scenarios as the remote storage can have epochs earlier than the current leader's
// earliest leader epoch.
Optional<EpochEntry> earliestEpochEntryOptional = leaderEpochCache.earliestEntry();
if (earliestEpochEntryOptional.isPresent()) {
EpochEntry earliestEpochEntry = earliestEpochEntryOptional.get();
Iterator<Integer> epochsToClean = remoteLeaderEpochs.stream()
.filter(remoteEpoch -> remoteEpoch < earliestEpochEntry.epoch)
.iterator();
List<RemoteLogSegmentMetadata> listOfSegmentsToBeCleaned = new ArrayList<>();
while (epochsToClean.hasNext()) {
int epoch = epochsToClean.next();
Iterator<RemoteLogSegmentMetadata> segmentsToBeCleaned = remoteLogMetadataManager.listRemoteLogSegments(topicIdPartition, epoch);
while (segmentsToBeCleaned.hasNext()) {
if (!isCancelled()) {
RemoteLogSegmentMetadata nextSegmentMetadata = segmentsToBeCleaned.next();
sizeOfDeletableSegmentsBytes += nextSegmentMetadata.segmentSizeInBytes();
listOfSegmentsToBeCleaned.add(nextSegmentMetadata);
}
}
}
segmentsLeftToDelete += listOfSegmentsToBeCleaned.size();
updateRemoteDeleteLagWith(segmentsLeftToDelete, sizeOfDeletableSegmentsBytes);
for (RemoteLogSegmentMetadata segmentMetadata : listOfSegmentsToBeCleaned) {
if (!isCancelled()) {
// No need to update the log-start-offset even though the segment is deleted as these epochs/offsets are earlier than that value.
if (remoteLogRetentionHandler.deleteLogSegmentsDueToLeaderEpochCacheTruncation(earliestEpochEntry, segmentMetadata)) {
sizeOfDeletableSegmentsBytes -= segmentMetadata.segmentSizeInBytes();
segmentsLeftToDelete--;
updateRemoteDeleteLagWith(segmentsLeftToDelete, sizeOfDeletableSegmentsBytes);
}
}
}
}
}
private Optional<RetentionTimeData> buildRetentionTimeData(long retentionMs) {
long cleanupUntilMs = time.milliseconds() - retentionMs;
return retentionMs > -1 && cleanupUntilMs >= 0
? Optional.of(new RetentionTimeData(retentionMs, cleanupUntilMs))
: Optional.empty();
}
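// For illustration, a rough worked example of the retention-time window computed above, assuming a
// hypothetical clock value of 1_700_000_000_000L and retention.ms of 7 days:
//
//   long retentionMs = TimeUnit.DAYS.toMillis(7);             // 604_800_000
//   long cleanupUntilMs = 1_700_000_000_000L - retentionMs;   // 1_699_395_200_000
//
// Remote segments whose max timestamp falls at or before cleanupUntilMs become candidates for
// time-based deletion. With retention.ms set to -1, the Optional above stays empty and no
// time-based cleanup is attempted.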
private Optional<RetentionSizeData> buildRetentionSizeData(long retentionSize,
long onlyLocalLogSegmentsSize,
long logEndOffset,
NavigableMap<Integer, Long> epochEntries) throws RemoteStorageException {
if (retentionSize > -1) {
long startTimeMs = time.milliseconds();
long remoteLogSizeBytes = 0L;
Set<RemoteLogSegmentId> visitedSegmentIds = new HashSet<>();
for (Integer epoch : epochEntries.navigableKeySet()) {
// remoteLogSize(topicIdPartition, epochEntry.epoch) may not be completely accurate as the remote
// log size may be computed for all the segments but not for segments within the current
// partition's leader epoch lineage. Better to revisit this API.
// remoteLogSizeBytes += remoteLogMetadataManager.remoteLogSize(topicIdPartition, epochEntry.epoch);
Iterator<RemoteLogSegmentMetadata> segmentsIterator = remoteLogMetadataManager.listRemoteLogSegments(topicIdPartition, epoch);
while (segmentsIterator.hasNext()) {
RemoteLogSegmentMetadata segmentMetadata = segmentsIterator.next();
// Only count the size of "COPY_SEGMENT_FINISHED" and "DELETE_SEGMENT_STARTED" state segments
// because "COPY_SEGMENT_STARTED" means copy didn't complete, and "DELETE_SEGMENT_FINISHED" means delete did complete.
// Note: there might be some "COPY_SEGMENT_STARTED" segments not counted here.
// Either they are being copied and will be counted next time or they are dangling and will be cleaned elsewhere,
// either way, this won't cause more segment deletion.
if (segmentMetadata.state().equals(RemoteLogSegmentState.COPY_SEGMENT_FINISHED) ||
segmentMetadata.state().equals(RemoteLogSegmentState.DELETE_SEGMENT_STARTED)) {
RemoteLogSegmentId segmentId = segmentMetadata.remoteLogSegmentId();
if (!visitedSegmentIds.contains(segmentId) && isRemoteSegmentWithinLeaderEpochs(segmentMetadata, logEndOffset, epochEntries)) {
remoteLogSizeBytes += segmentMetadata.segmentSizeInBytes();
visitedSegmentIds.add(segmentId);
}
}
}
}
brokerTopicStats.recordRemoteLogSizeComputationTime(topicIdPartition.topic(), topicIdPartition.partition(), time.milliseconds() - startTimeMs);
// This is the total size of segments in local log that have their base-offset > local-log-start-offset
// and size of the segments in remote storage which have their end-offset < local-log-start-offset.
long totalSize = onlyLocalLogSegmentsSize + remoteLogSizeBytes;
if (totalSize > retentionSize) {
long remainingBreachedSize = totalSize - retentionSize;
RetentionSizeData retentionSizeData = new RetentionSizeData(retentionSize, remainingBreachedSize);
return Optional.of(retentionSizeData);
}
}
return Optional.empty();
}
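// For illustration, a rough worked example of the size-breach computation above, assuming hypothetical
// values: retention.bytes = 1_000_000_000, onlyLocalLogSegmentsSize = 200_000_000 and the eligible
// remote segments summing to 900_000_000 bytes:
//
//   long totalSize = 200_000_000L + 900_000_000L;              // 1_100_000_000
//   long remainingBreachedSize = totalSize - 1_000_000_000L;   // 100_000_000
//
// The retention handler then deletes the oldest eligible remote segments until roughly 100 MB has
// been reclaimed. With retention.bytes set to -1, Optional.empty() is returned and no size-based
// cleanup is attempted.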
}
class RLMFollowerTask extends RLMTask {
public RLMFollowerTask(TopicIdPartition topicIdPartition) {
super(topicIdPartition);
}
@Override
protected void execute(UnifiedLog log) throws InterruptedException, RemoteStorageException, ExecutionException {
OffsetAndEpoch offsetAndEpoch = findHighestRemoteOffset(topicIdPartition, log);
// Update the highest offset in remote storage for this partition's log so that the local log segments
// are not deleted before they are copied to remote storage.
log.updateHighestOffsetInRemoteStorage(offsetAndEpoch.offset());
}
}
/**
* Returns true if the remote segment's epoch/offsets are within the leader epoch lineage of the partition.
* The constraints here are as follows:
* - The segment's first epoch's offset should be more than or equal to the respective leader epoch's offset in the partition leader epoch lineage.
* - The segment's end offset should be less than or equal to the respective leader epoch's offset in the partition leader epoch lineage.
* - The segment's epoch lineage (epoch and offset) should be the same as the leader epoch lineage (epoch and offset), except
* for the first and the last epochs in the segment.
*
* @param segmentMetadata The remote segment metadata to be validated.
* @param logEndOffset The log end offset of the partition.
* @param leaderEpochs The leader epoch lineage of the partition, with epochs containing no data filtered out.
* @return true if the remote segment's epoch/offsets are within the leader epoch lineage of the partition.
*/
// Visible for testing
static boolean isRemoteSegmentWithinLeaderEpochs(RemoteLogSegmentMetadata segmentMetadata,
long logEndOffset,
NavigableMap<Integer, Long> leaderEpochs) {
long segmentEndOffset = segmentMetadata.endOffset();
// Filter out epochs that do not have any messages/records associated with them.
NavigableMap<Integer, Long> segmentLeaderEpochs = buildFilteredLeaderEpochMap(segmentMetadata.segmentLeaderEpochs());
// Check for out of bound epochs between segment epochs and current leader epochs.
Integer segmentLastEpoch = segmentLeaderEpochs.lastKey();
if (segmentLastEpoch < leaderEpochs.firstKey() || segmentLastEpoch > leaderEpochs.lastKey()) {
LOGGER.debug("Segment {} is not within the partition leader epoch lineage. " +
"Remote segment epochs: {} and partition leader epochs: {}",
segmentMetadata.remoteLogSegmentId(), segmentLeaderEpochs, leaderEpochs);
return false;
}
// There can be overlapping remote log segments in the remote storage. (eg)
// leader-epoch-file-cache: {(5, 10), (7, 15), (9, 100)}
// segment1: offset-range = 5-50, Broker = 0, epochs = {(5, 10), (7, 15)}
// segment2: offset-range = 14-150, Broker = 1, epochs = {(5, 14), (7, 15), (9, 100)}, after leader-election.
// When the segment1 gets deleted, then the log-start-offset = 51 and leader-epoch-file-cache gets updated to: {(7, 51), (9, 100)}.
// While validating segment2, we should account for the case of overlapping remote log segments.
Integer segmentFirstEpoch = segmentLeaderEpochs.ceilingKey(leaderEpochs.firstKey());
if (segmentFirstEpoch == null) {
LOGGER.debug("Segment {} is not within the partition leader epoch lineage. " +
"Remote segment epochs: {} and partition leader epochs: {}",
segmentMetadata.remoteLogSegmentId(), segmentLeaderEpochs, leaderEpochs);
return false;
}
for (Map.Entry<Integer, Long> entry : segmentLeaderEpochs.entrySet()) {
int epoch = entry.getKey();
long offset = entry.getValue();
if (epoch < segmentFirstEpoch) {
continue;
}
// If segment's epoch does not exist in the leader epoch lineage then it is not a valid segment.
if (!leaderEpochs.containsKey(epoch)) {
LOGGER.debug("Segment {} epoch {} is not within the leader epoch lineage. " +
"Remote segment epochs: {} and partition leader epochs: {}",
segmentMetadata.remoteLogSegmentId(), epoch, segmentLeaderEpochs, leaderEpochs);
return false;
}
// Two cases:
// case-1: When the segment-first-epoch is equal to the first-epoch in the leader-epoch-lineage, any
// offset value between 0 and (next-epoch-start-offset - 1) is valid.
// case-2: When the segment-first-epoch is not equal to the first-epoch in the leader-epoch-lineage, the
// offset value should be between (current-epoch-start-offset) and (next-epoch-start-offset - 1).
if (epoch == segmentFirstEpoch && leaderEpochs.lowerKey(epoch) != null && offset < leaderEpochs.get(epoch)) {
LOGGER.debug("Segment {} first-valid epoch {} offset is less than first leader epoch offset {}." +
"Remote segment epochs: {} and partition leader epochs: {}",
segmentMetadata.remoteLogSegmentId(), epoch, leaderEpochs.get(epoch),
segmentLeaderEpochs, leaderEpochs);
return false;
}
// Segment's end offset should be less than or equal to the respective leader epoch's offset.
if (epoch == segmentLastEpoch) {
Map.Entry<Integer, Long> nextEntry = leaderEpochs.higherEntry(epoch);
if (nextEntry != null && segmentEndOffset > nextEntry.getValue() - 1) {
LOGGER.debug("Segment {} end offset {} is more than leader epoch offset {}." +
"Remote segment epochs: {} and partition leader epochs: {}",
segmentMetadata.remoteLogSegmentId(), segmentEndOffset, nextEntry.getValue() - 1,
segmentLeaderEpochs, leaderEpochs);
return false;
}
}
// The next segment epoch entry and the next leader epoch entry should be the same to ensure that the segment's epoch
// is within the leader epoch lineage.
if (epoch != segmentLastEpoch && !leaderEpochs.higherEntry(epoch).equals(segmentLeaderEpochs.higherEntry(epoch))) {
LOGGER.debug("Segment {} epoch {} is not within the leader epoch lineage. " +
"Remote segment epochs: {} and partition leader epochs: {}",
segmentMetadata.remoteLogSegmentId(), epoch, segmentLeaderEpochs, leaderEpochs);
return false;
}
}
// The segment end offset should be within the log end offset.
if (segmentEndOffset >= logEndOffset) {
LOGGER.debug("Segment {} end offset {} is more than log end offset {}.",
segmentMetadata.remoteLogSegmentId(), segmentEndOffset, logEndOffset);
return false;
}
return true;
}
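// For illustration, a hypothetical lineage check: with partition leader epochs {(5, 10), (7, 15), (9, 100)}
// and a log end offset of 200, a segment covering offsets 10-90 with epochs {(5, 10), (7, 15)} passes all
// the checks above. A segment carrying an epoch such as 6 that is absent from the leader lineage fails the
// containsKey check, and a segment whose end offset is 250 fails the final log-end-offset check.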
/**
* Returns a map containing the epoch vs start-offset for the given leader epoch map by filtering out the epochs that
* do not have any messages/records associated with them.
* For ex:
*
* {@code
*
* 0 - 0
* 1 - 10
* 2 - 20
* 3 - 30
* 4 - 40
* 5 - 60 // epoch 5 does not have records or messages associated with it
* 6 - 60
* 7 - 70
* }
*
* When the above leaderEpochMap is passed to this method, it returns the following map:
*
* {@code
*
* 0 - 0
* 1 - 10
* 2 - 20
* 3 - 30
* 4 - 40
* 6 - 60
* 7 - 70
* }
*
* @param leaderEpochs The leader epoch map to be refined.
*/
// Visible for testing
static NavigableMap<Integer, Long> buildFilteredLeaderEpochMap(NavigableMap<Integer, Long> leaderEpochs) {
List<Integer> epochsWithNoMessages = new ArrayList<>();
Map.Entry<Integer, Long> previousEpochAndOffset = null;
for (Map.Entry<Integer, Long> currentEpochAndOffset : leaderEpochs.entrySet()) {
if (previousEpochAndOffset != null && previousEpochAndOffset.getValue().equals(currentEpochAndOffset.getValue())) {
epochsWithNoMessages.add(previousEpochAndOffset.getKey());
}
previousEpochAndOffset = currentEpochAndOffset;
}
if (epochsWithNoMessages.isEmpty()) {
return leaderEpochs;
}
TreeMap<Integer, Long> filteredLeaderEpochs = new TreeMap<>(leaderEpochs);
for (Integer epochWithNoMessage : epochsWithNoMessages) {
filteredLeaderEpochs.remove(epochWithNoMessage);
}
return filteredLeaderEpochs;
}
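// For illustration, a hypothetical invocation of the filtering above:
//
//   NavigableMap<Integer, Long> epochs = new TreeMap<>();
//   epochs.put(4, 40L);
//   epochs.put(5, 60L);   // epoch 5 starts at the same offset as epoch 6, so it has no records
//   epochs.put(6, 60L);
//   buildFilteredLeaderEpochMap(epochs);   // => {4=40, 6=60}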
public FetchDataInfo read(RemoteStorageFetchInfo remoteStorageFetchInfo) throws RemoteStorageException, IOException {
int fetchMaxBytes = remoteStorageFetchInfo.fetchMaxBytes;
TopicPartition tp = remoteStorageFetchInfo.topicPartition;
FetchRequest.PartitionData fetchInfo = remoteStorageFetchInfo.fetchInfo;
boolean includeAbortedTxns = remoteStorageFetchInfo.fetchIsolation == FetchIsolation.TXN_COMMITTED;
long offset = fetchInfo.fetchOffset;
int maxBytes = Math.min(fetchMaxBytes, fetchInfo.maxBytes);
Optional<UnifiedLog> logOptional = fetchLog.apply(tp);
OptionalInt epoch = OptionalInt.empty();
if (logOptional.isPresent()) {
Option<LeaderEpochFileCache> leaderEpochCache = logOptional.get().leaderEpochCache();
if (leaderEpochCache != null && leaderEpochCache.isDefined()) {
epoch = leaderEpochCache.get().epochForOffset(offset);
}
}
Optional<RemoteLogSegmentMetadata> rlsMetadataOptional = epoch.isPresent()
? fetchRemoteLogSegmentMetadata(tp, epoch.getAsInt(), offset)
: Optional.empty();
if (!rlsMetadataOptional.isPresent()) {
String epochStr = (epoch.isPresent()) ? Integer.toString(epoch.getAsInt()) : "NOT AVAILABLE";
throw new OffsetOutOfRangeException("Received request for offset " + offset + " for leader epoch "
+ epochStr + " and partition " + tp + " which does not exist in remote tier.");
}
RemoteLogSegmentMetadata remoteLogSegmentMetadata = rlsMetadataOptional.get();
EnrichedRecordBatch enrichedRecordBatch = new EnrichedRecordBatch(null, 0);
InputStream remoteSegInputStream = null;
try {
int startPos = 0;
// Iteration over multiple RemoteLogSegmentMetadata is required in case of log compaction.
// It may be possible that the offset is log-compacted in the current RemoteLogSegmentMetadata,
// and we need to iterate over the next segment metadata to fetch messages higher than the given offset.
while (enrichedRecordBatch.batch == null && rlsMetadataOptional.isPresent()) {
remoteLogSegmentMetadata = rlsMetadataOptional.get();
// Search forward for the position of the last offset that is greater than or equal to the target offset
startPos = lookupPositionForOffset(remoteLogSegmentMetadata, offset);
remoteSegInputStream = remoteLogStorageManager.fetchLogSegment(remoteLogSegmentMetadata, startPos);
RemoteLogInputStream remoteLogInputStream = getRemoteLogInputStream(remoteSegInputStream);
enrichedRecordBatch = findFirstBatch(remoteLogInputStream, offset);
if (enrichedRecordBatch.batch == null) {
Utils.closeQuietly(remoteSegInputStream, "RemoteLogSegmentInputStream");
rlsMetadataOptional = findNextSegmentMetadata(rlsMetadataOptional.get(), logOptional.get().leaderEpochCache());
}
}
RecordBatch firstBatch = enrichedRecordBatch.batch;
if (firstBatch == null)
return new FetchDataInfo(new LogOffsetMetadata(offset), MemoryRecords.EMPTY, false,
includeAbortedTxns ? Optional.of(Collections.emptyList()) : Optional.empty());
int firstBatchSize = firstBatch.sizeInBytes();
// An empty record is sent instead of an incomplete batch when
// - there is no minimum-one-message constraint and
// - the first batch size is more than the maximum bytes that can be sent and
// - the fetch request is version 3 or above.
if (!remoteStorageFetchInfo.minOneMessage &&
!remoteStorageFetchInfo.hardMaxBytesLimit &&
firstBatchSize > maxBytes) {
return new FetchDataInfo(new LogOffsetMetadata(offset), MemoryRecords.EMPTY);
}
int updatedFetchSize =
remoteStorageFetchInfo.minOneMessage && firstBatchSize > maxBytes ? firstBatchSize : maxBytes;
ByteBuffer buffer = ByteBuffer.allocate(updatedFetchSize);
int remainingBytes = updatedFetchSize;
firstBatch.writeTo(buffer);
remainingBytes -= firstBatchSize;
if (remainingBytes > 0) {
// read from the input stream until EOF or until the buffer's remaining capacity is filled, whichever comes first.
Utils.readFully(remoteSegInputStream, buffer);
}
buffer.flip();
startPos = startPos + enrichedRecordBatch.skippedBytes;
FetchDataInfo fetchDataInfo = new FetchDataInfo(
new LogOffsetMetadata(firstBatch.baseOffset(), remoteLogSegmentMetadata.startOffset(), startPos),
MemoryRecords.readableRecords(buffer));
if (includeAbortedTxns) {
fetchDataInfo = addAbortedTransactions(firstBatch.baseOffset(), remoteLogSegmentMetadata, fetchDataInfo, logOptional.get());
}
return fetchDataInfo;
} finally {
if (enrichedRecordBatch.batch != null) {
Utils.closeQuietly(remoteSegInputStream, "RemoteLogSegmentInputStream");
}
}
}
// for testing
RemoteLogInputStream getRemoteLogInputStream(InputStream in) {
return new RemoteLogInputStream(in);
}
// Visible for testing
int lookupPositionForOffset(RemoteLogSegmentMetadata remoteLogSegmentMetadata, long offset) {
return indexCache.lookupOffset(remoteLogSegmentMetadata, offset);
}
private FetchDataInfo addAbortedTransactions(long startOffset,
RemoteLogSegmentMetadata segmentMetadata,
FetchDataInfo fetchInfo,
UnifiedLog log) throws RemoteStorageException {
int fetchSize = fetchInfo.records.sizeInBytes();
OffsetPosition startOffsetPosition = new OffsetPosition(fetchInfo.fetchOffsetMetadata.messageOffset,
fetchInfo.fetchOffsetMetadata.relativePositionInSegment);
OffsetIndex offsetIndex = indexCache.getIndexEntry(segmentMetadata).offsetIndex();
long upperBoundOffset = offsetIndex.fetchUpperBoundOffset(startOffsetPosition, fetchSize)
.map(position -> position.offset).orElse(segmentMetadata.endOffset() + 1);
final Set<FetchResponseData.AbortedTransaction> abortedTransactions = new HashSet<>();
Consumer<List<AbortedTxn>> accumulator =
abortedTxns -> abortedTransactions.addAll(abortedTxns.stream()
.map(AbortedTxn::asAbortedTransaction).collect(Collectors.toList()));
collectAbortedTransactions(startOffset, upperBoundOffset, segmentMetadata, accumulator, log);
return new FetchDataInfo(fetchInfo.fetchOffsetMetadata,
fetchInfo.records,
fetchInfo.firstEntryIncomplete,
Optional.of(abortedTransactions.isEmpty() ? Collections.emptyList() : new ArrayList<>(abortedTransactions)));
}
private void collectAbortedTransactions(long startOffset,
long upperBoundOffset,
RemoteLogSegmentMetadata segmentMetadata,
Consumer<List<AbortedTxn>> accumulator,
UnifiedLog log) throws RemoteStorageException {
// Search in remote segments first.
Optional<RemoteLogSegmentMetadata> nextSegmentMetadataOpt = Optional.of(segmentMetadata);
while (nextSegmentMetadataOpt.isPresent()) {
Optional<TransactionIndex> txnIndexOpt = nextSegmentMetadataOpt.map(metadata -> indexCache.getIndexEntry(metadata).txnIndex());
if (txnIndexOpt.isPresent()) {
TxnIndexSearchResult searchResult = txnIndexOpt.get().collectAbortedTxns(startOffset, upperBoundOffset);
accumulator.accept(searchResult.abortedTransactions);
if (searchResult.isComplete) {
// Return immediately when the search result is complete, it does not need to go through local log segments.
return;
}
}
nextSegmentMetadataOpt = findNextSegmentMetadata(nextSegmentMetadataOpt.get(), log.leaderEpochCache());
}
// Search in local segments
collectAbortedTransactionInLocalSegments(startOffset, upperBoundOffset, accumulator, log.logSegments().iterator());
}
private void collectAbortedTransactionInLocalSegments(long startOffset,
long upperBoundOffset,
Consumer<List<AbortedTxn>> accumulator,
Iterator<LogSegment> localLogSegments) {
while (localLogSegments.hasNext()) {
TransactionIndex txnIndex = localLogSegments.next().txnIndex();
if (txnIndex != null) {
TxnIndexSearchResult searchResult = txnIndex.collectAbortedTxns(startOffset, upperBoundOffset);
accumulator.accept(searchResult.abortedTransactions);
if (searchResult.isComplete) {
return;
}
}
}
}
// visible for testing.
Optional<RemoteLogSegmentMetadata> findNextSegmentMetadata(RemoteLogSegmentMetadata segmentMetadata,
Option<LeaderEpochFileCache> leaderEpochFileCacheOption) throws RemoteStorageException {
if (leaderEpochFileCacheOption.isEmpty()) {
return Optional.empty();
}
long nextSegmentBaseOffset = segmentMetadata.endOffset() + 1;
OptionalInt epoch = leaderEpochFileCacheOption.get().epochForOffset(nextSegmentBaseOffset);
return epoch.isPresent()
? fetchRemoteLogSegmentMetadata(segmentMetadata.topicIdPartition().topicPartition(), epoch.getAsInt(), nextSegmentBaseOffset)
: Optional.empty();
}
// Visible for testing
EnrichedRecordBatch findFirstBatch(RemoteLogInputStream remoteLogInputStream, long offset) throws IOException {
int skippedBytes = 0;
RecordBatch nextBatch = null;
// Look for the batch which has the desired offset
// We will always have a batch in that segment as it is a non-compacted topic.
do {
if (nextBatch != null) {
skippedBytes += nextBatch.sizeInBytes();
}
nextBatch = remoteLogInputStream.nextBatch();
} while (nextBatch != null && nextBatch.lastOffset() < offset);
return new EnrichedRecordBatch(nextBatch, skippedBytes);
}
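// For illustration, assume a hypothetical segment laid out as batches covering offsets [0-9], [10-19]
// and [20-29]: a lookup for offset 15 skips the first batch (its last offset 9 is below 15), returns
// the [10-19] batch, and reports the size of the skipped [0-9] batch as skippedBytes so that the
// caller can adjust the fetch position within the segment accordingly.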
OffsetAndEpoch findHighestRemoteOffset(TopicIdPartition topicIdPartition, UnifiedLog log) throws RemoteStorageException {
OffsetAndEpoch offsetAndEpoch = null;
Option<LeaderEpochFileCache> leaderEpochCacheOpt = log.leaderEpochCache();
if (leaderEpochCacheOpt.isDefined()) {
LeaderEpochFileCache cache = leaderEpochCacheOpt.get();
Optional<EpochEntry> maybeEpochEntry = cache.latestEntry();
while (offsetAndEpoch == null && maybeEpochEntry.isPresent()) {
int epoch = maybeEpochEntry.get().epoch;
Optional<Long> highestRemoteOffsetOpt =
remoteLogMetadataManager.highestOffsetForEpoch(topicIdPartition, epoch);
if (highestRemoteOffsetOpt.isPresent()) {
Map.Entry<Integer, Long> entry = cache.endOffsetFor(epoch, log.logEndOffset());
int requestedEpoch = entry.getKey();
long endOffset = entry.getValue();
long highestRemoteOffset = highestRemoteOffsetOpt.get();
if (endOffset <= highestRemoteOffset) {
LOGGER.info("The end-offset for epoch {}: ({}, {}) is less than or equal to the " +
"highest-remote-offset: {} for partition: {}", epoch, requestedEpoch, endOffset,
highestRemoteOffset, topicIdPartition);
offsetAndEpoch = new OffsetAndEpoch(endOffset - 1, requestedEpoch);
} else {
offsetAndEpoch = new OffsetAndEpoch(highestRemoteOffset, epoch);
}
}
maybeEpochEntry = cache.previousEntry(epoch);
}
}
if (offsetAndEpoch == null) {
offsetAndEpoch = new OffsetAndEpoch(-1L, RecordBatch.NO_PARTITION_LEADER_EPOCH);
}
return offsetAndEpoch;
}
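// For illustration, with a hypothetical leader epoch cache of {(3, 0), (4, 100)} and a log end offset
// of 200: if the metadata manager reports a highest remote offset of 150 for epoch 4, the epoch's end
// offset (200) is greater than 150, so (150, 4) is returned. If epoch 4 has no remote data yet, the
// walk falls back to epoch 3 and repeats the comparison there; if nothing is found at all,
// (-1, NO_PARTITION_LEADER_EPOCH) is returned.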
long findLogStartOffset(TopicIdPartition topicIdPartition, UnifiedLog log) throws RemoteStorageException {
Optional<Long> logStartOffset = Optional.empty();
Option<LeaderEpochFileCache> maybeLeaderEpochFileCache = log.leaderEpochCache();
if (maybeLeaderEpochFileCache.isDefined()) {
LeaderEpochFileCache cache = maybeLeaderEpochFileCache.get();
OptionalInt earliestEpochOpt = cache.earliestEntry()
.map(epochEntry -> OptionalInt.of(epochEntry.epoch))
.orElseGet(OptionalInt::empty);
while (!logStartOffset.isPresent() && earliestEpochOpt.isPresent()) {
Iterator<RemoteLogSegmentMetadata> iterator =
remoteLogMetadataManager.listRemoteLogSegments(topicIdPartition, earliestEpochOpt.getAsInt());
if (iterator.hasNext()) {
logStartOffset = Optional.of(iterator.next().startOffset());
}
earliestEpochOpt = cache.nextEpoch(earliestEpochOpt.getAsInt());
}
}
return logStartOffset.orElseGet(log::localLogStartOffset);
}
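// For illustration, with hypothetical leader epochs 0, 1 and 2 where epoch 0 has no remote segments
// but the first remote segment for epoch 1 starts at offset 500, the loop above skips epoch 0 and
// returns 500 as the remote log start offset; only when no epoch has remote segments does it fall
// back to the local log start offset.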
/**
* Submit a remote log read task.
* This method returns immediately. The read operation is executed in a thread pool.
* The callback will be called when the task is done.
*
* @throws java.util.concurrent.RejectedExecutionException if the task cannot be accepted for execution (task queue is full)
*/
public Future<Void> asyncRead(RemoteStorageFetchInfo fetchInfo, Consumer<RemoteLogReadResult> callback) {
return remoteStorageReaderThreadPool.submit(
new RemoteLogReader(fetchInfo, this, callback, brokerTopicStats, rlmFetchQuotaManager, remoteReadTimer));
}
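// For illustration, a hypothetical caller-side sketch of asyncRead (fetch-info construction elided):
//
//   Future<Void> readTask = remoteLogManager.asyncRead(fetchInfo, result -> {
//       // invoked on a remote-storage reader thread once the fetch completes or fails
//       handleRemoteReadResult(result);   // hypothetical handler
//   });
//   // readTask.cancel(true) may be used if the fetch is no longer needed; a full task queue
//   // surfaces as a RejectedExecutionException from the submit call.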
void doHandleLeaderPartition(TopicIdPartition topicPartition, Boolean remoteLogCopyDisable) {
RLMTaskWithFuture followerRLMTaskWithFuture = followerRLMTasks.remove(topicPartition);
if (followerRLMTaskWithFuture != null) {
LOGGER.info("Cancelling the follower task: {}", followerRLMTaskWithFuture.rlmTask);
followerRLMTaskWithFuture.cancel();
}
// Only create the copy task when remote log copy is not disabled (i.e. remoteLogCopyDisable is false)
if (!remoteLogCopyDisable) {
leaderCopyRLMTasks.computeIfAbsent(topicPartition, topicIdPartition -> {
RLMCopyTask task = new RLMCopyTask(topicIdPartition, this.rlmConfig.remoteLogMetadataCustomMetadataMaxBytes());
// set this upfront when it is getting initialized instead of doing it after scheduling.
LOGGER.info("Created a new copy task: {} and getting scheduled", task);
ScheduledFuture<?> future = rlmCopyThreadPool.scheduleWithFixedDelay(task, 0, delayInMs, TimeUnit.MILLISECONDS);
return new RLMTaskWithFuture(task, future);
});
}
leaderExpirationRLMTasks.computeIfAbsent(topicPartition, topicIdPartition -> {
RLMExpirationTask task = new RLMExpirationTask(topicIdPartition);
LOGGER.info("Created a new expiration task: {} and getting scheduled", task);
ScheduledFuture<?> future = rlmExpirationThreadPool.scheduleWithFixedDelay(task, 0, delayInMs, TimeUnit.MILLISECONDS);
return new RLMTaskWithFuture(task, future);
});
}
void doHandleFollowerPartition(TopicIdPartition topicPartition) {
RLMTaskWithFuture copyRLMTaskWithFuture = leaderCopyRLMTasks.remove(topicPartition);
if (copyRLMTaskWithFuture != null) {
LOGGER.info("Cancelling the copy task: {}", copyRLMTaskWithFuture.rlmTask);
copyRLMTaskWithFuture.cancel();
}
RLMTaskWithFuture expirationRLMTaskWithFuture = leaderExpirationRLMTasks.remove(topicPartition);
if (expirationRLMTaskWithFuture != null) {
LOGGER.info("Cancelling the expiration task: {}", expirationRLMTaskWithFuture.rlmTask);
expirationRLMTaskWithFuture.cancel();
}
followerRLMTasks.computeIfAbsent(topicPartition, topicIdPartition -> {
RLMFollowerTask task = new RLMFollowerTask(topicIdPartition);
LOGGER.info("Created a new follower task: {} and getting scheduled", task);
ScheduledFuture<?> future = followerThreadPool.scheduleWithFixedDelay(task, 0, delayInMs, TimeUnit.MILLISECONDS);
return new RLMTaskWithFuture(task, future);
});
}
static class RLMTaskWithFuture {
private final RLMTask rlmTask;
private final Future<?> future;
RLMTaskWithFuture(RLMTask rlmTask, Future<?> future) {
this.rlmTask = rlmTask;
this.future = future;
}
public void cancel() {
rlmTask.cancel();
try {
future.cancel(true);
} catch (Exception ex) {
LOGGER.error("Error occurred while canceling the task: {}", rlmTask, ex);
}
}
}
/**
* Closes and releases all the resources like RemoteStorageManager and RemoteLogMetadataManager.
*/
public void close() {
synchronized (this) {
if (!closed) {
leaderCopyRLMTasks.values().forEach(RLMTaskWithFuture::cancel);
leaderExpirationRLMTasks.values().forEach(RLMTaskWithFuture::cancel);
followerRLMTasks.values().forEach(RLMTaskWithFuture::cancel);
Utils.closeQuietly(remoteLogStorageManager, "RemoteLogStorageManager");
Utils.closeQuietly(remoteLogMetadataManager, "RemoteLogMetadataManager");
Utils.closeQuietly(indexCache, "RemoteIndexCache");
rlmCopyThreadPool.close();
rlmExpirationThreadPool.close();
followerThreadPool.close();
try {
shutdownAndAwaitTermination(remoteStorageReaderThreadPool, "RemoteStorageReaderThreadPool", 10, TimeUnit.SECONDS);
} finally {
removeMetrics();
}
leaderCopyRLMTasks.clear();
leaderExpirationRLMTasks.clear();
followerRLMTasks.clear();
closed = true;
}
}
}
private static void shutdownAndAwaitTermination(ExecutorService pool, String poolName, long timeout, TimeUnit timeUnit) {
// This pattern of shutting down thread pool is adopted from here: https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/util/concurrent/ExecutorService.html
LOGGER.info("Shutting down of thread pool {} is started", poolName);
pool.shutdown(); // Disable new tasks from being submitted
try {
// Wait a while for existing tasks to terminate
if (!pool.awaitTermination(timeout, timeUnit)) {
LOGGER.info("Shutting down of thread pool {} could not be completed. It will retry cancelling the tasks using shutdownNow.", poolName);
pool.shutdownNow(); // Cancel currently executing tasks
// Wait a while for tasks to respond to being cancelled
if (!pool.awaitTermination(timeout, timeUnit))
LOGGER.warn("Shutting down of thread pool {} could not be completed even after retrying cancellation of the tasks using shutdownNow.", poolName);
}
} catch (InterruptedException ex) {
// (Re-)Cancel if current thread also interrupted
LOGGER.warn("Encountered InterruptedException while shutting down thread pool {}. It will retry cancelling the tasks using shutdownNow.", poolName);
pool.shutdownNow();
// Preserve interrupt status
Thread.currentThread().interrupt();
}
LOGGER.info("Shutting down of thread pool {} is completed", poolName);
}
//Visible for testing
static ByteBuffer epochEntriesAsByteBuffer(List<EpochEntry> epochEntries) throws IOException {
ByteArrayOutputStream stream = new ByteArrayOutputStream();
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(stream, StandardCharsets.UTF_8))) {
CheckpointFile.CheckpointWriteBuffer<EpochEntry> writeBuffer =
new CheckpointFile.CheckpointWriteBuffer<>(writer, 0, LeaderEpochCheckpointFile.FORMATTER);
writeBuffer.write(epochEntries);
writer.flush();
}
return ByteBuffer.wrap(stream.toByteArray());
}
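// For illustration, serializing a hypothetical list of two entries, say EpochEntry(0, 0) and
// EpochEntry(1, 90), yields a small text buffer in the leader-epoch-checkpoint layout (roughly a
// version line, an entry count and one "epoch offset" line per entry) that is uploaded alongside
// the segment as part of its LogSegmentData.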
private void removeRemoteTopicPartitionMetrics(TopicIdPartition topicIdPartition) {
String topic = topicIdPartition.topic();
if (!brokerTopicStats.isTopicStatsExisted(topicIdPartition.topic())) {
// The topic metrics are already removed, so remove this topic's key from the broker-level metrics
brokerTopicStats.removeBrokerLevelRemoteCopyLagBytes(topic);
brokerTopicStats.removeBrokerLevelRemoteCopyLagSegments(topic);
brokerTopicStats.removeBrokerLevelRemoteDeleteLagBytes(topic);
brokerTopicStats.removeBrokerLevelRemoteDeleteLagSegments(topic);
brokerTopicStats.removeBrokerLevelRemoteLogMetadataCount(topic);
brokerTopicStats.removeBrokerLevelRemoteLogSizeComputationTime(topic);
brokerTopicStats.removeBrokerLevelRemoteLogSizeBytes(topic);
} else {
int partition = topicIdPartition.partition();
// remove the partition metric values and update the broker-level metrics
brokerTopicStats.removeRemoteCopyLagBytes(topic, partition);
brokerTopicStats.removeRemoteCopyLagSegments(topic, partition);
brokerTopicStats.removeRemoteDeleteLagBytes(topic, partition);
brokerTopicStats.removeRemoteDeleteLagSegments(topic, partition);
brokerTopicStats.removeRemoteLogMetadataCount(topic, partition);
brokerTopicStats.removeRemoteLogSizeComputationTime(topic, partition);
brokerTopicStats.removeRemoteLogSizeBytes(topic, partition);
}
}
//Visible for testing
RLMTaskWithFuture leaderCopyTask(TopicIdPartition partition) {
return leaderCopyRLMTasks.get(partition);
}
RLMTaskWithFuture leaderExpirationTask(TopicIdPartition partition) {
return leaderExpirationRLMTasks.get(partition);
}
RLMTaskWithFuture followerTask(TopicIdPartition partition) {
return followerRLMTasks.get(partition);
}
static class RLMScheduledThreadPool {
private static final Logger LOGGER = LoggerFactory.getLogger(RLMScheduledThreadPool.class);
private final int poolSize;
private final String threadPoolName;
private final String threadNamePrefix;
private final ScheduledThreadPoolExecutor scheduledThreadPool;
public RLMScheduledThreadPool(int poolSize, String threadPoolName, String threadNamePrefix) {
this.poolSize = poolSize;
this.threadPoolName = threadPoolName;
this.threadNamePrefix = threadNamePrefix;
scheduledThreadPool = createPool();
}
private ScheduledThreadPoolExecutor createPool() {
ScheduledThreadPoolExecutor threadPool = new ScheduledThreadPoolExecutor(poolSize);
threadPool.setRemoveOnCancelPolicy(true);
threadPool.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);
threadPool.setContinueExistingPeriodicTasksAfterShutdownPolicy(false);
threadPool.setThreadFactory(new ThreadFactory() {
private final AtomicInteger sequence = new AtomicInteger();
public Thread newThread(Runnable r) {
return KafkaThread.daemon(threadNamePrefix + sequence.incrementAndGet(), r);
}
});
return threadPool;
}
public Double getIdlePercent() {
return 1 - (double) scheduledThreadPool.getActiveCount() / (double) scheduledThreadPool.getCorePoolSize();
}
public ScheduledFuture<?> scheduleWithFixedDelay(Runnable runnable, long initialDelay, long delay, TimeUnit timeUnit) {
LOGGER.info("Scheduling runnable {} with initial delay: {}, fixed delay: {}", runnable, initialDelay, delay);
return scheduledThreadPool.scheduleWithFixedDelay(runnable, initialDelay, delay, timeUnit);
}
public void close() {
shutdownAndAwaitTermination(scheduledThreadPool, threadPoolName, 10, TimeUnit.SECONDS);
}
}
// Visible for testing
public static class RetentionSizeData {
private final long retentionSize;
private final long remainingBreachedSize;
public RetentionSizeData(long retentionSize, long remainingBreachedSize) {
if (retentionSize < 0)
throw new IllegalArgumentException("retentionSize should be non negative, but it is " + retentionSize);
if (remainingBreachedSize <= 0) {
throw new IllegalArgumentException("remainingBreachedSize should be more than zero, but it is " + remainingBreachedSize);
}
this.retentionSize = retentionSize;
this.remainingBreachedSize = remainingBreachedSize;
}
}
// Visible for testing
public static class RetentionTimeData {
private final long retentionMs;
private final long cleanupUntilMs;
public RetentionTimeData(long retentionMs, long cleanupUntilMs) {
if (retentionMs < 0)
throw new IllegalArgumentException("retentionMs should be non negative, but it is " + retentionMs);
if (cleanupUntilMs < 0)
throw new IllegalArgumentException("cleanupUntilMs should be non negative, but it is " + cleanupUntilMs);
this.retentionMs = retentionMs;
this.cleanupUntilMs = cleanupUntilMs;
}
}
// Visible for testing
static class EnrichedLogSegment {
private final LogSegment logSegment;
private final long nextSegmentOffset;
public EnrichedLogSegment(LogSegment logSegment,
long nextSegmentOffset) {
this.logSegment = logSegment;
this.nextSegmentOffset = nextSegmentOffset;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
EnrichedLogSegment that = (EnrichedLogSegment) o;
return nextSegmentOffset == that.nextSegmentOffset && Objects.equals(logSegment, that.logSegment);
}
@Override
public int hashCode() {
return Objects.hash(logSegment, nextSegmentOffset);
}
@Override
public String toString() {
return "EnrichedLogSegment{" +
"logSegment=" + logSegment +
", nextSegmentOffset=" + nextSegmentOffset +
'}';
}
}
static class EnrichedRecordBatch {
private final RecordBatch batch;
private final int skippedBytes;
public EnrichedRecordBatch(RecordBatch batch, int skippedBytes) {
this.batch = batch;
this.skippedBytes = skippedBytes;
}
}
}