org.apache.kafka.streams.processor.internals.StoreChangelogReader Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of kafka-streams Show documentation
There is a newer version: 3.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kafka.streams.processor.internals;

import org.apache.kafka.clients.admin.Admin;
import org.apache.kafka.clients.admin.ListOffsetsOptions;
import org.apache.kafka.clients.admin.ListOffsetsResult;
import org.apache.kafka.clients.admin.OffsetSpec;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.InvalidOffsetException;
import org.apache.kafka.common.IsolationLevel;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.TimeoutException;
import org.apache.kafka.common.utils.LogContext;
import org.apache.kafka.common.utils.Time;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.errors.StreamsException;
import org.apache.kafka.streams.errors.TaskCorruptedException;
import org.apache.kafka.streams.processor.StateRestoreListener;
import org.apache.kafka.streams.processor.TaskId;
import org.apache.kafka.streams.processor.internals.ProcessorStateManager.StateStoreMetadata;
import org.slf4j.Logger;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.apache.kafka.streams.processor.internals.ClientUtils.fetchCommittedOffsets;

/**
 * ChangelogReader is created and maintained by the stream thread and used for both updating standby tasks and
 * restoring active tasks. It manages the restore consumer, including its assigned partitions, when to pause / resume
 * these partitions, etc.
 * 
 * The reader also maintains the source of truth for restoration state: only active tasks restoring changelog could
 * be completed, while standby tasks updating changelog would always be in restoring state after being initialized.
 */
public class StoreChangelogReader implements ChangelogReader {
    // minimum gap in milliseconds between two "Restoration in progress" log lines (see maybeLogRestorationProgress)
    private static final long RESTORE_LOG_INTERVAL_MS = 10_000L;
    // time.milliseconds() reading taken when progress was last logged; set back to 0 whenever the reader
    // is not restoring active tasks or re-enters active restoration
    private long lastRestoreLogTime = 0L;

    /**
     * State of a single registered changelog. Each constant lists the ordinals of the states it may be
     * entered from; {@code ChangelogMetadata#transitTo} checks the current state's ordinal against that list.
     */
    enum ChangelogState {
        // registered but need to be initialized (i.e. set its starting, end, limit offsets)
        REGISTERED("REGISTERED"),

        // initialized and restoring; may only be entered from REGISTERED (ordinal 0)
        RESTORING("RESTORING", 0),

        // completed restoring (only for active restoring task, standby task should never be completed);
        // may only be entered from RESTORING (ordinal 1)
        COMPLETED("COMPLETED", 1);

        public final String name;

        // ordinals of the states from which this state may be entered
        private final List<Integer> prevStates;

        ChangelogState(final String name, final Integer... prevStates) {
            this.name = name;
            this.prevStates = Arrays.asList(prevStates);
        }
    }

    // NOTE we assume that the changelog reader is used only for either
    //   1) restoring active task or
    //   2) updating standby task at a given time,
    // but never doing both
    /**
     * Mode of the reader as a whole: either restoring active tasks or updating standby tasks.
     */
    enum ChangelogReaderState {
        ACTIVE_RESTORING("ACTIVE_RESTORING"),
        STANDBY_UPDATING("STANDBY_UPDATING");

        // human readable label of the mode
        public final String name;

        ChangelogReaderState(final String label) {
            name = label;
        }
    }

    static class ChangelogMetadata {

        private final StateStoreMetadata storeMetadata;

        private final ProcessorStateManager stateManager;

        private ChangelogState changelogState;

        private long totalRestored;

        // the end offset beyond which records should not be applied (yet) to restore the states
        //
        // for both active restoring tasks and standby updating tasks, it is defined as:
        //    * log-end-offset if the changelog is not piggy-backed with source topic
        //    * min(log-end-offset, committed-offset) if the changelog is piggy-backed with source topic
        //
        // the log-end-offset only needs to be updated once and only need to be for active tasks since for standby
        // tasks it would never "complete" based on the end-offset;
        //
        // the committed-offset needs to be updated periodically for those standby tasks
        //
        // NOTE we do not book keep the current offset since we leverage state manager as its source of truth
        private Long restoreEndOffset;

        // buffer records polled by the restore consumer;
        private final List> bufferedRecords;

        // the limit index (exclusive) inside the buffered records beyond which should not be used to restore
        // either due to limit offset (standby) or committed end offset (active)
        private int bufferedLimitIndex;

        private ChangelogMetadata(final StateStoreMetadata storeMetadata, final ProcessorStateManager stateManager) {
            this.changelogState = ChangelogState.REGISTERED;
            this.storeMetadata = storeMetadata;
            this.stateManager = stateManager;
            this.restoreEndOffset = null;
            this.totalRestored = 0L;

            this.bufferedRecords = new ArrayList<>();
            this.bufferedLimitIndex = 0;
        }

        private void clear() {
            this.bufferedRecords.clear();
        }

        private void transitTo(final ChangelogState newState) {
            if (newState.prevStates.contains(changelogState.ordinal())) {
                changelogState = newState;
            } else {
                throw new IllegalStateException("Invalid transition from " + changelogState + " to " + newState);
            }
        }

        @Override
        public String toString() {
            final Long currentOffset = storeMetadata.offset();
            return changelogState + " " + stateManager.taskType() +
                " (currentOffset " + currentOffset + ", endOffset " + restoreEndOffset + ")";
        }

        // for testing only below
        ChangelogState state() {
            return changelogState;
        }

        long totalRestored() {
            return totalRestored;
        }

        Long endOffset() {
            return restoreEndOffset;
        }

        List> bufferedRecords() {
            return bufferedRecords;
        }

        int bufferedLimitIndex() {
            return bufferedLimitIndex;
        }
    }

    // fallback interval for refreshing standby limit offsets, used when the commit interval is Long.MAX_VALUE
    private static final long DEFAULT_OFFSET_UPDATE_MS = Duration.ofMinutes(5L).toMillis();

    // current mode of the reader: restoring active tasks or updating standby tasks
    private ChangelogReaderState state;

    private final Time time;
    private final Logger log;
    // poll timeout handed to the restore consumer while restoring active tasks
    private final Duration pollTime;
    // minimum time between two refreshes of the standby limit offsets
    private final long updateOffsetIntervalMs;

    // 1) we keep adding partitions to restore consumer whenever new tasks are registered with the state manager;
    // 2) we do not unassign partitions when we switch between standbys and actives, we just pause / resume them;
    // 3) we only remove an assigned partition when the corresponding task is being removed from the thread.
    private final Consumer<byte[], byte[]> restoreConsumer;
    private final StateRestoreListener stateRestoreListener;

    // source of the truth of the current registered changelogs;
    // NOTE a changelog would only be removed when its corresponding task
    // is being removed from the thread; otherwise it would stay in this map even after completed
    private final Map<TopicPartition, ChangelogMetadata> changelogs;

    // the changelog reader only need the main consumer to get committed offsets for source changelog partitions
    // to update offset limit for standby tasks;
    private Consumer<byte[], byte[]> mainConsumer;

    // the changelog reader needs the admin client to list end offsets
    private final Admin adminClient;

    // time.milliseconds() reading taken when committed offsets were last fetched successfully
    private long lastUpdateOffsetTime;

    /**
     * Sets the main consumer, which the reader uses to fetch committed offsets for source changelog partitions.
     *
     * @param consumer the main consumer of the owning thread
     */
    void setMainConsumer(final Consumer<byte[], byte[]> consumer) {
        this.mainConsumer = consumer;
    }

    public StoreChangelogReader(final Time time,
                                final StreamsConfig config,
                                final LogContext logContext,
                                final Admin adminClient,
                                final Consumer restoreConsumer,
                                final StateRestoreListener stateRestoreListener) {
        this.time = time;
        this.log = logContext.logger(StoreChangelogReader.class);
        this.state = ChangelogReaderState.ACTIVE_RESTORING;
        this.adminClient = adminClient;
        this.restoreConsumer = restoreConsumer;
        this.stateRestoreListener = stateRestoreListener;

        this.pollTime = Duration.ofMillis(config.getLong(StreamsConfig.POLL_MS_CONFIG));
        this.updateOffsetIntervalMs = config.getLong(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG) == Long.MAX_VALUE ?
            DEFAULT_OFFSET_UPDATE_MS : config.getLong(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG);
        this.lastUpdateOffsetTime = 0L;

        this.changelogs = new HashMap<>();
    }

    /**
     * Renders an end offset for log output; a {@code null} offset is rendered as a fixed "UNKNOWN" text.
     */
    private static String recordEndOffset(final Long endOffset) {
        if (endOffset == null) {
            return "UNKNOWN (since it is for standby task)";
        }
        return endOffset.toString();
    }

    /**
     * Decides whether the given changelog has been restored up to its end offset.
     *
     * @throws IllegalStateException if the end offset has not been set
     * @throws StreamsException      if the restore consumer fails (other than by timeout) to report its position
     */
    private boolean hasRestoredToEnd(final ChangelogMetadata metadata) {
        final Long endOffset = metadata.restoreEndOffset;

        // an unset end offset means a standby task, for which this check must never be made
        if (endOffset == null) {
            throw new IllegalStateException("End offset for changelog " + metadata + " is unknown when deciding " +
                "if it has completed restoration, this should never happen.");
        }

        // special case: nothing to restore, either because the changelog has no data
        // or because it is a source topic without a committed offset
        if (endOffset == 0) {
            return true;
        }

        // NOTE corner cases to keep in mind:
        //  1) the end / committed offset returned from the consumer is the last offset + 1
        //  2) there could be txn markers as the last record if EOS is enabled at the producer
        // so it is possible that: last record's offset == last txn marker offset - 1 == end / committed offset - 2

        // some buffered records are still unapplied: we are being held back by the end offset and the
        // consumer's position is likely already past it, so look at the first remaining record instead
        if (!metadata.bufferedRecords.isEmpty()) {
            return metadata.bufferedRecords.get(0).offset() >= endOffset;
        }

        // all buffered records have been applied: compare against the consumer's position, i.e. the
        // "next" record to fetch, which already bypasses a trailing txn marker
        final TopicPartition partition = metadata.storeMetadata.changelogPartition();
        try {
            return restoreConsumer.position(partition) >= endOffset;
        } catch (final TimeoutException e) {
            // position not available within the timeout, report "not yet restored"
            return false;
        } catch (final KafkaException e) {
            // this also covers InvalidOffsetException, which should not happen under normal
            // execution, so wrapping it as a fatal StreamsException is fine
            throw new StreamsException("Restore consumer get unexpected error trying to get the position " +
                " of " + partition, e);
        }
    }

    // Once some new tasks are created, we transit to restore them and pause on the existing standby tasks. It is
    // possible that when newly created tasks are created the changelog reader are still restoring existing
    // active tasks, and hence this function is idempotent and can be called multiple times.
    //
    // NOTE: even if the newly created tasks do not need any restoring, we still first transit to this state and then
    // immediately transit back -- there's no overhead of transiting back and forth but simplifies the logic a lot.
    @Override
    public void enforceRestoreActive() {
        // already restoring active tasks: nothing to do
        if (state == ChangelogReaderState.ACTIVE_RESTORING) {
            return;
        }

        log.debug("Transiting to restore active tasks: {}", changelogs);
        // reset the progress-log timestamp for the new restoration phase
        lastRestoreLogTime = 0L;

        // standby changelogs stay assigned but are paused on the restore consumer
        pauseChangelogsFromRestoreConsumer(standbyRestoringChangelogs());

        state = ChangelogReaderState.ACTIVE_RESTORING;
    }

    // Only after we've completed restoring all active tasks we'll then move back to resume updating standby tasks.
    // This function is NOT idempotent: if it is already in updating standby tasks mode, we should not call it again.
    //
    // NOTE: we do not clear completed active restoring changelogs or remove partitions from restore consumer either
    // upon completing them but only pause the corresponding partitions; the changelog metadata / partitions would only
    // be cleared when the corresponding task is being removed from the thread. In other words, the restore consumer
    // should contain all changelogs that are RESTORING or COMPLETED
    @Override
    public void transitToUpdateStandby() {
        final boolean restoringActive = state == ChangelogReaderState.ACTIVE_RESTORING;
        if (!restoringActive) {
            // calling this while already updating standbys is a caller error
            final String message = "The changelog reader is not restoring active tasks (is " + state + ") while trying to " +
                "transit to update standby tasks: " + changelogs;
            throw new IllegalStateException(message);
        }

        log.debug("Transiting to update standby tasks: {}", changelogs);

        // the standby changelogs were only paused, so resuming them is enough
        resumeChangelogsFromRestoreConsumer(standbyRestoringChangelogs());

        state = ChangelogReaderState.STANDBY_UPDATING;
    }

    /**
     * Registers a changelog partition together with the state manager that owns its store; the reader is
     * shared by multiple tasks and therefore by multiple state managers.
     *
     * @throws IllegalStateException if the state manager has no store metadata for the partition, or if the
     *                               partition has already been registered
     */
    @Override
    public void register(final TopicPartition partition, final ProcessorStateManager stateManager) {
        final StateStoreMetadata storeMetadata = stateManager.storeMetadata(partition);
        if (storeMetadata == null) {
            throw new IllegalStateException("Cannot find the corresponding state store metadata for changelog " +
                partition);
        }

        final ChangelogMetadata newMetadata = new ChangelogMetadata(storeMetadata, stateManager);

        // a standby changelog that is also a source topic starts with limit offset 0L, which effectively
        // disables any restoration until the limit is updated
        final boolean standbyOnSourceTopic =
            stateManager.taskType() == Task.TaskType.STANDBY && stateManager.changelogAsSource(partition);
        if (standbyOnSourceTopic) {
            newMetadata.restoreEndOffset = 0L;
        }

        final Object previous = changelogs.putIfAbsent(partition, newMetadata);
        if (previous != null) {
            throw new IllegalStateException("There is already a changelog registered for " + partition +
                ", this should not happen: " + changelogs);
        }
    }

    /**
     * Looks up the metadata of a changelog that must currently be in RESTORING state.
     *
     * @throws IllegalStateException if the partition is not registered or is not in RESTORING state
     */
    private ChangelogMetadata restoringChangelogByPartition(final TopicPartition partition) {
        final ChangelogMetadata metadata = changelogs.get(partition);

        if (metadata == null) {
            throw new IllegalStateException("The corresponding changelog restorer for " + partition +
                " does not exist, this should not happen.");
        }

        final boolean restoring = metadata.changelogState == ChangelogState.RESTORING;
        if (!restoring) {
            throw new IllegalStateException("The corresponding changelog restorer for " + partition +
                " has already transited to completed state, this should not happen.");
        }

        return metadata;
    }

    /**
     * @return the metadata of all changelogs still in REGISTERED state
     */
    private Set<ChangelogMetadata> registeredChangelogs() {
        return changelogs.values().stream()
            .filter(metadata -> metadata.changelogState == ChangelogState.REGISTERED)
            .collect(Collectors.toSet());
    }

    /**
     * @return the partitions of all changelogs in RESTORING state, for active and standby tasks alike
     */
    private Set<TopicPartition> restoringChangelogs() {
        return changelogs.values().stream()
            .filter(metadata -> metadata.changelogState == ChangelogState.RESTORING)
            .map(metadata -> metadata.storeMetadata.changelogPartition())
            .collect(Collectors.toSet());
    }

    /**
     * @return the partitions of all RESTORING changelogs that belong to active tasks
     */
    private Set<TopicPartition> activeRestoringChangelogs() {
        return changelogs.values().stream()
            .filter(metadata -> metadata.changelogState == ChangelogState.RESTORING &&
                metadata.stateManager.taskType() == Task.TaskType.ACTIVE)
            .map(metadata -> metadata.storeMetadata.changelogPartition())
            .collect(Collectors.toSet());
    }

    /**
     * @return the partitions of all RESTORING changelogs that belong to standby tasks
     */
    private Set<TopicPartition> standbyRestoringChangelogs() {
        return changelogs.values().stream()
            .filter(metadata -> metadata.changelogState == ChangelogState.RESTORING &&
                metadata.stateManager.taskType() == Task.TaskType.STANDBY)
            .map(metadata -> metadata.storeMetadata.changelogPartition())
            .collect(Collectors.toSet());
    }

    /**
     * @return true if no registered changelog is in a state other than COMPLETED (also true when none are registered)
     */
    private boolean allChangelogsCompleted() {
        return changelogs.values().stream()
            .noneMatch(metadata -> metadata.changelogState != ChangelogState.COMPLETED);
    }

    /**
     * @return the partitions of all changelogs in COMPLETED state
     */
    @Override
    public Set<TopicPartition> completedChangelogs() {
        return changelogs.values().stream()
            .filter(metadata -> metadata.changelogState == ChangelogState.COMPLETED)
            .map(metadata -> metadata.storeMetadata.changelogPartition())
            .collect(Collectors.toSet());
    }

    // 1. if there are any registered changelogs that needs initialization, try to initialize them first;
    // 2. if all changelogs have finished, return early;
    // 3. if there are any restoring changelogs, try to read from the restore consumer and process them.
    public void restore() {
        initializeChangelogs(registeredChangelogs());

        if (!activeRestoringChangelogs().isEmpty() && state == ChangelogReaderState.STANDBY_UPDATING) {
            throw new IllegalStateException("Should not be in standby updating state if there are still un-completed active changelogs");
        }

        if (allChangelogsCompleted()) {
            log.debug("Finished restoring all changelogs {}", changelogs.keySet());
            return;
        }

        final Set restoringChangelogs = restoringChangelogs();
        if (!restoringChangelogs.isEmpty()) {
            final ConsumerRecords polledRecords;

            try {
                // for restoring active and updating standby we may prefer different poll time
                // in order to make sure we call the main consumer#poll in time.
                // TODO: once we move ChangelogReader to a separate thread this may no longer be a concern
                polledRecords = restoreConsumer.poll(state == ChangelogReaderState.STANDBY_UPDATING ? Duration.ZERO : pollTime);
            } catch (final InvalidOffsetException e) {
                log.warn("Encountered " + e.getClass().getName() +
                    " fetching records from restore consumer for partitions " + e.partitions() + ", it is likely that " +
                    "the consumer's position has fallen out of the topic partition offset range because the topic was " +
                    "truncated or compacted on the broker, marking the corresponding tasks as corrupted and re-initializing" +
                    " it later.", e);

                final Map> taskWithCorruptedChangelogs = new HashMap<>();
                for (final TopicPartition partition : e.partitions()) {
                    final TaskId taskId = changelogs.get(partition).stateManager.taskId();
                    taskWithCorruptedChangelogs.computeIfAbsent(taskId, k -> new HashSet<>()).add(partition);
                }
                throw new TaskCorruptedException(taskWithCorruptedChangelogs, e);
            } catch (final KafkaException e) {
                throw new StreamsException("Restore consumer get unexpected error polling records.", e);
            }

            for (final TopicPartition partition : polledRecords.partitions()) {
                bufferChangelogRecords(restoringChangelogByPartition(partition), polledRecords.records(partition));
            }

            for (final TopicPartition partition : restoringChangelogs) {
                // even if some partition do not have any accumulated data, we still trigger
                // restoring since some changelog may not need to restore any at all, and the
                // restore to end check needs to be executed still.
                // TODO: we always try to restore as a batch when some records are accumulated, which may result in
                //       small batches; this can be optimized in the future, e.g. wait longer for larger batches.
                restoreChangelog(changelogs.get(partition));
            }

            maybeUpdateLimitOffsetsForStandbyChangelogs();

            maybeLogRestorationProgress();
        }
    }

    /**
     * Logs a one-line summary of all actively restoring changelogs, at most once per
     * {@code RESTORE_LOG_INTERVAL_MS}. Outside of active restoration only the log timestamp is reset.
     */
    private void maybeLogRestorationProgress() {
        if (state != ChangelogReaderState.ACTIVE_RESTORING) {
            lastRestoreLogTime = 0L;
            return;
        }

        // throttle: not enough time has passed since the last progress line
        if (time.milliseconds() - lastRestoreLogTime <= RESTORE_LOG_INTERVAL_MS) {
            return;
        }

        final Set<TopicPartition> topicPartitions = activeRestoringChangelogs();
        if (topicPartitions.isEmpty()) {
            return;
        }

        final StringBuilder builder = new StringBuilder().append("Restoration in progress for ")
                                                         .append(topicPartitions.size())
                                                         .append(" partitions.");
        for (final TopicPartition partition : topicPartitions) {
            final ChangelogMetadata changelogMetadata = restoringChangelogByPartition(partition);
            builder.append(" {")
                   .append(partition)
                   .append(": ")
                   .append("position=")
                   .append(getPositionString(partition, changelogMetadata))
                   .append(", end=")
                   .append(changelogMetadata.restoreEndOffset)
                   .append(", totalRestored=")
                   .append(changelogMetadata.totalRestored)
                   .append("}");
        }
        log.info(builder.toString());
        lastRestoreLogTime = time.milliseconds();
    }

    /**
     * @return the offset the state manager currently reports for the partition, or "unknown" if it has none
     */
    private static String getPositionString(final TopicPartition partition,
                                            final ChangelogMetadata changelogMetadata) {
        final Long position = changelogMetadata.stateManager.changelogOffsets().get(partition);
        if (position == null) {
            return "unknown";
        }
        return String.valueOf(position);
    }

    /**
     * While updating standbys, refreshes the limit offsets of standby changelogs that read from a source
     * topic, once the update interval has elapsed and at least one of them has buffered records.
     */
    private void maybeUpdateLimitOffsetsForStandbyChangelogs() {
        // we only consider updating the limit offset for standbys if we are not restoring active tasks
        if (state == ChangelogReaderState.STANDBY_UPDATING &&
            updateOffsetIntervalMs < time.milliseconds() - lastUpdateOffsetTime) {

            // when the interval has elapsed we should try to update the limit offset for standbys reading from
            // a source changelog with the new committed offset, unless there are no buffered records since
            // we only need the limit when processing new records
            // for other changelog partitions we do not need to update limit offset at all since we never need to
            // check when it completes based on limit offset anyways: the end offset would keep increasing and the
            // standby never need to stop
            final Set<TopicPartition> changelogsWithLimitOffsets = changelogs.entrySet().stream()
                .filter(entry -> entry.getValue().stateManager.taskType() == Task.TaskType.STANDBY &&
                    entry.getValue().stateManager.changelogAsSource(entry.getKey()))
                .map(Map.Entry::getKey).collect(Collectors.toSet());

            // a single fetch covers all such partitions, so stop at the first one with buffered records
            for (final TopicPartition partition : changelogsWithLimitOffsets) {
                if (!changelogs.get(partition).bufferedRecords().isEmpty()) {
                    updateLimitOffsetsForStandbyChangelogs(committedOffsetForChangelogs(changelogsWithLimitOffsets));
                    break;
                }
            }
        }
    }

    private void bufferChangelogRecords(final ChangelogMetadata changelogMetadata, final List> records) {
        // update the buffered records and limit index with the fetched records
        for (final ConsumerRecord record : records) {
            // filter polled records for null-keys and also possibly update buffer limit index
            if (record.key() == null) {
                log.warn("Read changelog record with null key from changelog {} at offset {}, " +
                    "skipping it for restoration", changelogMetadata.storeMetadata.changelogPartition(), record.offset());
            } else {
                changelogMetadata.bufferedRecords.add(record);
                final long offset = record.offset();
                if (changelogMetadata.restoreEndOffset == null || offset < changelogMetadata.restoreEndOffset) {
                    changelogMetadata.bufferedLimitIndex = changelogMetadata.bufferedRecords.size();
                }
            }
        }
    }

    /**
     * restore a changelog with its buffered records if there's any; for active changelogs also check if
     * it has completed the restoration and can transit to COMPLETED state and trigger restore callbacks
     */
    private void restoreChangelog(final ChangelogMetadata changelogMetadata) {
        final ProcessorStateManager stateManager = changelogMetadata.stateManager;
        final StateStoreMetadata storeMetadata = changelogMetadata.storeMetadata;
        final TopicPartition partition = storeMetadata.changelogPartition();
        final String storeName = storeMetadata.store().name();
        final int numRecords = changelogMetadata.bufferedLimitIndex;

        if (numRecords != 0) {
            final List> records = changelogMetadata.bufferedRecords.subList(0, numRecords);
            stateManager.restore(storeMetadata, records);

            // NOTE here we use removeRange of ArrayList in order to achieve efficiency with range shifting,
            // otherwise one-at-a-time removal or addition would be very costly; if all records are restored
            // then we can further optimize to save the array-shift but just set array elements to null
            if (numRecords < changelogMetadata.bufferedRecords.size()) {
                records.clear();
            } else {
                changelogMetadata.bufferedRecords.clear();
            }

            final Long currentOffset = storeMetadata.offset();
            log.trace("Restored {} records from changelog {} to store {}, end offset is {}, current offset is {}",
                partition, storeName, numRecords, recordEndOffset(changelogMetadata.restoreEndOffset), currentOffset);

            changelogMetadata.bufferedLimitIndex = 0;
            changelogMetadata.totalRestored += numRecords;

            // do not trigger restore listener if we are processing standby tasks
            if (changelogMetadata.stateManager.taskType() == Task.TaskType.ACTIVE) {
                try {
                    stateRestoreListener.onBatchRestored(partition, storeName, currentOffset, numRecords);
                } catch (final Exception e) {
                    throw new StreamsException("State restore listener failed on batch restored", e);
                }
            }
        }

        // we should check even if there's nothing restored, but do not check completed if we are processing standby tasks
        if (changelogMetadata.stateManager.taskType() == Task.TaskType.ACTIVE && hasRestoredToEnd(changelogMetadata)) {
            log.info("Finished restoring changelog {} to store {} with a total number of {} records",
                partition, storeName, changelogMetadata.totalRestored);

            changelogMetadata.transitTo(ChangelogState.COMPLETED);
            pauseChangelogsFromRestoreConsumer(Collections.singleton(partition));

            try {
                stateRestoreListener.onRestoreEnd(partition, storeName, changelogMetadata.totalRestored);
            } catch (final Exception e) {
                throw new StreamsException("State restore listener failed on restore completed", e);
            }
        }
    }

    /**
     * Fetches the committed offsets of the given partitions through the main consumer and, on success,
     * records the fetch time.
     *
     * @param partitions the source changelog partitions to look up
     * @return the committed offsets, or an empty map if the fetch timed out
     */
    private Map<TopicPartition, Long> committedOffsetForChangelogs(final Set<TopicPartition> partitions) {
        final Map<TopicPartition, Long> committedOffsets;
        try {
            committedOffsets = fetchCommittedOffsets(partitions, mainConsumer);
        } catch (final TimeoutException e) {
            // if it timed out we just retry next time; the last-update time is deliberately left untouched
            return Collections.emptyMap();
        }
        lastUpdateOffsetTime = time.milliseconds();
        return committedOffsets;
    }

    /**
     * Lists the latest (read-uncommitted) end offsets of the given partitions through the admin client.
     *
     * @param partitions the changelog partitions to look up
     * @return the end offsets, or an empty map if there is nothing to look up or the request timed out,
     *         was interrupted or failed while executing
     * @throws StreamsException if the admin client throws any other Kafka error
     */
    private Map<TopicPartition, Long> endOffsetForChangelogs(final Set<TopicPartition> partitions) {
        if (partitions.isEmpty()) {
            return Collections.emptyMap();
        }

        try {
            final ListOffsetsResult result = adminClient.listOffsets(
                    partitions.stream().collect(Collectors.toMap(Function.identity(), tp -> OffsetSpec.latest())),
                    new ListOffsetsOptions(IsolationLevel.READ_UNCOMMITTED)
            );
            return result.all().get().entrySet().stream().collect(
                    Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().offset()));
        } catch (final TimeoutException | InterruptedException | ExecutionException e) {
            // if timeout exception gets thrown we just give up this time and retry in the next run loop;
            // the exception is passed to the logger so the underlying cause is not lost
            log.debug("Could not fetch all end offsets for {}, will retry in the next run loop", partitions, e);
            return Collections.emptyMap();
        } catch (final KafkaException e) {
            throw new StreamsException(String.format("Failed to retrieve end offsets for %s", partitions), e);
        }
    }

    /**
     * Applies freshly fetched committed offsets as the new limit offsets of standby changelogs that read
     * from a source topic, and extends each buffer's limit index to cover records below the new limit.
     *
     * @param committedOffsets committed offsets keyed by changelog partition; partitions without an entry are skipped
     * @throws IllegalStateException if a new limit is smaller than the previous one
     */
    private void updateLimitOffsetsForStandbyChangelogs(final Map<TopicPartition, Long> committedOffsets) {
        for (final ChangelogMetadata metadata : changelogs.values()) {
            final TopicPartition partition = metadata.storeMetadata.changelogPartition();
            if (metadata.stateManager.taskType() == Task.TaskType.STANDBY &&
                metadata.stateManager.changelogAsSource(partition) &&
                committedOffsets.containsKey(partition)) {

                final Long newLimit = committedOffsets.get(partition);
                final Long previousLimit = metadata.restoreEndOffset;

                if (previousLimit != null && previousLimit > newLimit) {
                    throw new IllegalStateException("Offset limit should monotonically increase, but was reduced for partition " +
                        partition + ". New limit: " + newLimit + ". Previous limit: " + previousLimit);
                }

                metadata.restoreEndOffset = newLimit;

                // update the limit index for buffered records
                while (metadata.bufferedLimitIndex < metadata.bufferedRecords.size() &&
                    metadata.bufferedRecords.get(metadata.bufferedLimitIndex).offset() < metadata.restoreEndOffset) {
                    metadata.bufferedLimitIndex++;
                }
            }
        }
    }

    /**
     * Initializes newly registered changelogs and transits them to the restoring state.
     *
     * The method determines the restore end offset of active changelogs, initializes the limit offsets of
     * standby changelogs from committed offsets, adds the partitions to the restore consumer, transits them to
     * {@code RESTORING}, pauses the standby changelogs while the reader is in {@code ACTIVE_RESTORING}, and
     * finally sets the starting positions of the newly added partitions.
     *
     * @param newPartitionsToRestore metadata of the changelogs to initialize; active changelogs whose end offset
     *                               (or required committed offset) could not be found are removed from this set
     *                               and are therefore not initialized by this call
     * @throws IllegalStateException if the end offset of a changelog has already been initialized, or if a
     *                               changelog to be skipped is not contained in {@code newPartitionsToRestore}
     */
    private void initializeChangelogs(final Set<ChangelogMetadata> newPartitionsToRestore) {
        if (newPartitionsToRestore.isEmpty()) {
            return;
        }

        // for active changelogs, we need to find their end offset before transit to restoring
        // if the changelog is on source topic, then its end offset should be the minimum of
        // its committed offset and its end offset; for standby tasks that use source topics
        // as changelogs, we want to initialize their limit offsets as committed offsets as well
        final Set<TopicPartition> newPartitionsToFindEndOffset = new HashSet<>();
        final Set<TopicPartition> newPartitionsToFindCommittedOffset = new HashSet<>();

        for (final ChangelogMetadata metadata : newPartitionsToRestore) {
            final TopicPartition partition = metadata.storeMetadata.changelogPartition();

            // TODO K9113: when TaskType.GLOBAL is added we need to modify this
            if (metadata.stateManager.taskType() == Task.TaskType.ACTIVE) {
                newPartitionsToFindEndOffset.add(partition);
            }

            if (metadata.stateManager.changelogAsSource(partition)) {
                newPartitionsToFindCommittedOffset.add(partition);
            }
        }

        // NOTE we assume that all requested partitions will be included in the returned map for both end/committed
        // offsets, i.e., it would not return partial result and would timeout if some of the results cannot be found
        final Map<TopicPartition, Long> endOffsets = endOffsetForChangelogs(newPartitionsToFindEndOffset);
        final Map<TopicPartition, Long> committedOffsets = committedOffsetForChangelogs(newPartitionsToFindCommittedOffset);

        for (final TopicPartition partition : newPartitionsToFindEndOffset) {
            final ChangelogMetadata changelogMetadata = changelogs.get(partition);
            final Long endOffset = endOffsets.get(partition);
            // both branches are boxed Longs on purpose: a missing committed offset must stay null
            // instead of being unboxed
            final Long committedOffset = newPartitionsToFindCommittedOffset.contains(partition) ?
                committedOffsets.get(partition) : Long.valueOf(Long.MAX_VALUE);

            if (endOffset != null && committedOffset != null) {
                if (changelogMetadata.restoreEndOffset != null) {
                    throw new IllegalStateException("End offset for " + partition +
                        " should only be initialized once. Existing value: " + changelogMetadata.restoreEndOffset +
                        ", new value: (" + endOffset + ", " + committedOffset + ")");
                }

                changelogMetadata.restoreEndOffset = Math.min(endOffset, committedOffset);

                log.debug("End offset for changelog {} initialized as {}.", partition, changelogMetadata.restoreEndOffset);
            } else {
                // the offsets are not known yet: drop the changelog from this round so that it is
                // neither added to the restore consumer nor transited to restoring below
                if (!newPartitionsToRestore.remove(changelogMetadata)) {
                    throw new IllegalStateException("New changelogs to restore " + newPartitionsToRestore +
                        " does not contain the one looking for end offset: " + partition + ", this should not happen.");
                }

                log.info("End offset for changelog {} cannot be found; will retry in the next time.", partition);
            }
        }

        // try initialize limit offsets for standby tasks for the first time
        if (!committedOffsets.isEmpty()) {
            updateLimitOffsetsForStandbyChangelogs(committedOffsets);
        }

        // add new partitions to the restore consumer and transit them to restoring state
        addChangelogsToRestoreConsumer(newPartitionsToRestore.stream().map(metadata -> metadata.storeMetadata.changelogPartition())
            .collect(Collectors.toSet()));

        newPartitionsToRestore.forEach(metadata -> metadata.transitTo(ChangelogState.RESTORING));

        // if it is in the active restoring mode, we immediately pause those standby changelogs
        // here we just blindly pause all (including the existing and newly added)
        if (state == ChangelogReaderState.ACTIVE_RESTORING) {
            pauseChangelogsFromRestoreConsumer(standbyRestoringChangelogs());
        }

        // prepare newly added partitions of the restore consumer by setting their starting position
        prepareChangelogs(newPartitionsToRestore);
    }

    /**
     * Extends the restore consumer's assignment by the given partitions.
     *
     * @param partitions the partitions to add; none of them may be part of the current assignment
     * @throws IllegalStateException if the current assignment already contains any of the given partitions
     */
    private void addChangelogsToRestoreConsumer(final Set partitions) {
        final Set updatedAssignment = new HashSet<>(restoreConsumer.assignment());

        // removeAll reports whether anything was removed from the copy, i.e. whether the
        // current assignment overlaps with the new partitions, which must never be the case
        final boolean overlapsWithCurrent = updatedAssignment.removeAll(partitions);
        if (overlapsWithCurrent) {
            throw new IllegalStateException("The current assignment " + restoreConsumer.assignment() + " " +
                "already contains some of the new partitions " + partitions);
        }

        updatedAssignment.addAll(partitions);
        restoreConsumer.assign(updatedAssignment);

        log.debug("Added partitions {} to the restore consumer, current assignment is {}", partitions, updatedAssignment);
    }

    /**
     * Pauses the given partitions on the restore consumer.
     *
     * @param partitions the partitions to pause; all of them must be part of the current assignment
     * @throws IllegalStateException if the current assignment does not contain all of the given partitions
     */
    private void pauseChangelogsFromRestoreConsumer(final Collection partitions) {
        final Set currentAssignment = new HashSet<>(restoreConsumer.assignment());

        // only partitions that are currently assigned can be paused
        final boolean allAssigned = currentAssignment.containsAll(partitions);
        if (!allAssigned) {
            throw new IllegalStateException("The current assignment " + currentAssignment + " " +
                "does not contain some of the partitions " + partitions + " for pausing.");
        }

        restoreConsumer.pause(partitions);
        log.debug("Paused partitions {} from the restore consumer", partitions);
    }

    /**
     * Removes the given partitions from the restore consumer's assignment.
     *
     * If there is nothing to remove, the restore consumer is left untouched.
     *
     * @param partitions the partitions to remove; all of them must be part of the current assignment
     * @throws IllegalStateException if the current assignment does not contain all of the given partitions
     */
    private void removeChangelogsFromRestoreConsumer(final Collection partitions) {
        if (partitions.isEmpty()) {
            // nothing to remove, so avoid re-assigning an unchanged set of partitions
            return;
        }

        final Set assignment = new HashSet<>(restoreConsumer.assignment());

        // the current assignment should contain all the partitions to remove
        if (!assignment.containsAll(partitions)) {
            throw new IllegalStateException("The current assignment " + assignment + " " +
                "does not contain some of the partitions " + partitions + " for removing.");
        }
        assignment.removeAll(partitions);
        restoreConsumer.assign(assignment);

        log.debug("Removed partitions {} from the restore consumer, current assignment is {}", partitions, assignment);
    }

    /**
     * Resumes the given partitions on the restore consumer.
     *
     * @param partitions the partitions to resume; all of them must be part of the current assignment
     * @throws IllegalStateException if the current assignment does not contain all of the given partitions
     */
    private void resumeChangelogsFromRestoreConsumer(final Collection partitions) {
        final Set currentAssignment = new HashSet<>(restoreConsumer.assignment());

        // only partitions that are currently assigned can be resumed
        final boolean allAssigned = currentAssignment.containsAll(partitions);
        if (!allAssigned) {
            throw new IllegalStateException("The current assignment " + currentAssignment + " " +
                "does not contain some of the partitions " + partitions + " for resuming.");
        }

        restoreConsumer.resume(partitions);
        log.debug("Resumed partitions {} from the restore consumer", partitions);
    }

    /**
     * Sets the starting position of newly added changelog partitions on the restore consumer and notifies
     * the restore listener that restoration of active changelogs starts.
     *
     * Partitions with a known current offset are positioned right after that offset; all others are
     * positioned at the beginning of the partition in one batched call. The restore listener is only
     * triggered for changelogs of active tasks.
     *
     * @param newPartitionsToRestore metadata of the changelogs that were just added to the restore consumer
     * @throws StreamsException if the restore consumer fails to report a position for a reason other than a
     *                          timeout, or if the restore listener throws from {@code onRestoreStart}
     */
    private void prepareChangelogs(final Set<ChangelogMetadata> newPartitionsToRestore) {
        // separate those who do not have the current offset loaded from checkpoint
        final Set<TopicPartition> newPartitionsWithoutStartOffset = new HashSet<>();

        for (final ChangelogMetadata changelogMetadata : newPartitionsToRestore) {
            final StateStoreMetadata storeMetadata = changelogMetadata.storeMetadata;
            final TopicPartition partition = storeMetadata.changelogPartition();
            final Long currentOffset = storeMetadata.offset();
            final Long endOffset = changelogs.get(partition).restoreEndOffset;

            if (currentOffset != null) {
                // the current offset is the offset of the last record, so we should set the position
                // as that offset + 1 as the "next" record to fetch; seek is not a blocking call so
                // there's nothing to capture
                restoreConsumer.seek(partition, currentOffset + 1);

                log.debug("Start restoring changelog partition {} from current offset {} to end offset {}.",
                    partition, currentOffset, recordEndOffset(endOffset));
            } else {
                log.debug("Start restoring changelog partition {} from the beginning offset to end offset {} " +
                    "since we cannot find current offset.", partition, recordEndOffset(endOffset));

                newPartitionsWithoutStartOffset.add(partition);
            }
        }

        // optimization: batch all seek-to-beginning offsets in a single request
        //               seek is not a blocking call so there's nothing to capture
        if (!newPartitionsWithoutStartOffset.isEmpty()) {
            restoreConsumer.seekToBeginning(newPartitionsWithoutStartOffset);
        }

        // do not trigger restore listener if we are processing standby tasks
        for (final ChangelogMetadata changelogMetadata : newPartitionsToRestore) {
            if (changelogMetadata.stateManager.taskType() == Task.TaskType.ACTIVE) {
                final StateStoreMetadata storeMetadata = changelogMetadata.storeMetadata;
                final TopicPartition partition = storeMetadata.changelogPartition();
                final String storeName = storeMetadata.store().name();

                long startOffset = 0L;
                try {
                    startOffset = restoreConsumer.position(partition);
                } catch (final TimeoutException e) {
                    // if we cannot find the starting position at the beginning, just use the default 0L
                } catch (final KafkaException e) {
                    // this also includes InvalidOffsetException, which should not happen under normal
                    // execution, hence it is also okay to wrap it as fatal StreamsException
                    throw new StreamsException("Restore consumer get unexpected error trying to get the position " +
                        " of " + partition, e);
                }

                try {
                    stateRestoreListener.onRestoreStart(partition, storeName, startOffset, changelogMetadata.restoreEndOffset);
                } catch (final Exception e) {
                    // name the callback that actually failed so the error is not mistaken for a batch-restored failure
                    throw new StreamsException("State restore listener failed on restore start", e);
                }
            }
        }
    }

    /**
     * Stops tracking the given changelog partitions.
     *
     * The metadata of every known partition is removed from {@code changelogs} and cleared. Partitions that
     * had progressed past the {@code REGISTERED} state are additionally removed from the restore consumer's
     * assignment. Unknown partitions are only logged.
     *
     * @param revokedChangelogs the changelog partitions to unregister
     */
    @Override
    public void unregister(final Collection<TopicPartition> revokedChangelogs) {
        // Only changelogs that are initialized have been added to the restore consumer's assignment
        final List<TopicPartition> revokedInitializedChangelogs = new ArrayList<>();

        for (final TopicPartition partition : revokedChangelogs) {
            final ChangelogMetadata changelogMetadata = changelogs.remove(partition);
            if (changelogMetadata != null) {
                if (!changelogMetadata.state().equals(ChangelogState.REGISTERED)) {
                    revokedInitializedChangelogs.add(partition);
                }

                changelogMetadata.clear();
            } else {
                log.debug("Changelog partition {} could not be found," +
                    " it could be already cleaned up during the handling" +
                    " of task corruption and never restore again", partition);
            }
        }

        removeChangelogsFromRestoreConsumer(revokedInitializedChangelogs);
    }

    /**
     * Clears the metadata of every tracked changelog, forgets all changelogs and unsubscribes the restore consumer.
     *
     * @throws StreamsException if the restore consumer fails to unsubscribe
     */
    @Override
    public void clear() {
        // clear each changelog's metadata before dropping the references to it
        changelogs.values().forEach(metadata -> metadata.clear());
        changelogs.clear();

        try {
            restoreConsumer.unsubscribe();
        } catch (final KafkaException unsubscribeFailure) {
            throw new StreamsException("Restore consumer get unexpected error unsubscribing", unsubscribeFailure);
        }
    }

    /**
     * @return {@code true} if no changelog is currently tracked by this reader
     */
    @Override
    public boolean isEmpty() {
        final boolean noChangelogsTracked = changelogs.isEmpty();
        return noChangelogsTracked;
    }

    /**
     * @return a one-line description consisting of a fixed prefix, the tracked changelogs and a trailing newline
     */
    @Override
    public String toString() {
        return new StringBuilder("StoreChangelogReader: ")
            .append(changelogs)
            .append("\n")
            .toString();
    }

    // for testing only: exposes the metadata tracked for the given changelog partition,
    // exactly as returned by the lookup in changelogs
    ChangelogMetadata changelogMetadata(final TopicPartition partition) {
        final ChangelogMetadata tracked = changelogs.get(partition);
        return tracked;
    }

    // returns the state this changelog reader is currently in
    ChangelogReaderState state() {
        return this.state;
    }
}