/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kafka.streams.processor.internals;

import java.util.Optional;
import org.apache.kafka.clients.admin.Admin;
import org.apache.kafka.clients.admin.ListOffsetsResult;
import org.apache.kafka.clients.admin.ListOffsetsResult.ListOffsetsResultInfo;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerGroupMetadata;
import org.apache.kafka.clients.consumer.ConsumerPartitionAssignor;
import org.apache.kafka.common.Cluster;
import org.apache.kafka.common.Configurable;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.Node;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.TimeoutException;
import org.apache.kafka.common.utils.LogContext;
import org.apache.kafka.common.utils.Time;
import org.apache.kafka.common.utils.Utils;
import org.apache.kafka.streams.errors.MissingSourceTopicException;
import org.apache.kafka.streams.errors.StreamsException;
import org.apache.kafka.streams.errors.TaskAssignmentException;
import org.apache.kafka.streams.processor.TaskId;
import org.apache.kafka.streams.processor.internals.InternalTopologyBuilder.TopicsInfo;
import org.apache.kafka.streams.processor.internals.TopologyMetadata.Subtopology;
import org.apache.kafka.streams.processor.internals.assignment.AssignmentInfo;
import org.apache.kafka.streams.processor.internals.assignment.AssignorConfiguration;
import org.apache.kafka.streams.processor.internals.assignment.AssignorConfiguration.AssignmentConfigs;
import org.apache.kafka.streams.processor.internals.assignment.AssignorConfiguration.AssignmentListener;
import org.apache.kafka.streams.processor.internals.assignment.AssignorError;
import org.apache.kafka.streams.processor.internals.assignment.ClientState;
import org.apache.kafka.streams.processor.internals.assignment.CopartitionedTopicsEnforcer;
import org.apache.kafka.streams.processor.internals.assignment.FallbackPriorTaskAssignor;
import org.apache.kafka.streams.processor.internals.assignment.RackAwareTaskAssignor;
import org.apache.kafka.streams.processor.internals.assignment.ReferenceContainer;
import org.apache.kafka.streams.processor.internals.assignment.StickyTaskAssignor;
import org.apache.kafka.streams.processor.internals.assignment.SubscriptionInfo;
import org.apache.kafka.streams.processor.internals.assignment.TaskAssignor;
import org.apache.kafka.streams.state.HostInfo;
import org.slf4j.Logger;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Supplier;
import java.util.stream.Collectors;

import static java.util.Map.Entry.comparingByKey;
import static java.util.UUID.randomUUID;
import static org.apache.kafka.common.utils.Utils.filterMap;
import static org.apache.kafka.streams.processor.internals.ClientUtils.fetchCommittedOffsets;
import static org.apache.kafka.streams.processor.internals.ClientUtils.fetchEndOffsetsResult;
import static org.apache.kafka.streams.processor.internals.assignment.StreamsAssignmentProtocolVersions.EARLIEST_PROBEABLE_VERSION;
import static org.apache.kafka.streams.processor.internals.assignment.StreamsAssignmentProtocolVersions.LATEST_SUPPORTED_VERSION;
import static org.apache.kafka.streams.processor.internals.assignment.StreamsAssignmentProtocolVersions.UNKNOWN;
import static org.apache.kafka.streams.processor.internals.assignment.SubscriptionInfo.UNKNOWN_OFFSET_SUM;

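/**
 * Streams' custom {@link ConsumerPartitionAssignor}: on the group leader it decodes each member's
 * {@link SubscriptionInfo}, prepares the repartition and changelog topics, assigns tasks to clients
 * and then to the threads within each client, and encodes the result back as {@link AssignmentInfo}.
 */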
public class StreamsPartitionAssignor implements ConsumerPartitionAssignor, Configurable {

    private Logger log;
    private String logPrefix;

    private static class AssignedPartition implements Comparable<AssignedPartition> {

        private final TaskId taskId;
        private final TopicPartition partition;

        AssignedPartition(final TaskId taskId, final TopicPartition partition) {
            this.taskId = taskId;
            this.partition = partition;
        }

        @Override
        public int compareTo(final AssignedPartition that) {
            return PARTITION_COMPARATOR.compare(partition, that.partition);
        }

        @Override
        public boolean equals(final Object o) {
            if (!(o instanceof AssignedPartition)) {
                return false;
            }
            final AssignedPartition other = (AssignedPartition) o;
            return compareTo(other) == 0;
        }

        @Override
        public int hashCode() {
            // Only partition is important for compareTo, equals and hashCode.
            return partition.hashCode();
        }
    }

    private static class ClientMetadata {

        private final HostInfo hostInfo;
        private final ClientState state;
        private final SortedSet<String> consumers;

        ClientMetadata(final UUID processId, final String endPoint, final Map<String, String> clientTags) {

            // get the host info, or null if no endpoint is configured (i.e. endPoint == null)
            hostInfo = HostInfo.buildFromEndpoint(endPoint);

            // initialize the consumer memberIds
            consumers = new TreeSet<>();

            // initialize the client state with client tags
            state = new ClientState(processId, clientTags);
        }

        void addConsumer(final String consumerMemberId, final List<TopicPartition> ownedPartitions) {
            consumers.add(consumerMemberId);
            state.incrementCapacity();
            state.addOwnedPartitions(ownedPartitions, consumerMemberId);
        }

        void addPreviousTasksAndOffsetSums(final String consumerId, final Map<TaskId, Long> taskOffsetSums) {
            state.addPreviousTasksAndOffsetSums(consumerId, taskOffsetSums);
        }

        @Override
        public String toString() {
            return "ClientMetadata{" +
                "hostInfo=" + hostInfo +
                ", consumers=" + consumers +
                ", state=" + state +
                '}';
        }
    }

    // keep track of any future consumers in a "dummy" Client since we can't decipher their subscription
    private static final UUID FUTURE_ID = randomUUID();

    // orders partitions by topic name first, then by partition number
    protected static final Comparator<TopicPartition> PARTITION_COMPARATOR =
        Comparator.comparing(TopicPartition::topic).thenComparingInt(TopicPartition::partition);

    private String userEndPoint;
    private AssignmentConfigs assignmentConfigs;

    // for the main consumer, we need to use a supplier to break a cyclic setup dependency
    private Supplier<Consumer<byte[], byte[]>> mainConsumerSupplier;
    private Admin adminClient;
    private TaskManager taskManager;
    private StreamsMetadataState streamsMetadataState;
    private PartitionGrouper partitionGrouper;
    private AtomicInteger assignmentErrorCode;
    private AtomicLong nextScheduledRebalanceMs;
    private Queue<StreamsException> nonFatalExceptionsToHandle;
    private Time time;

    protected int usedSubscriptionMetadataVersion = LATEST_SUPPORTED_VERSION;

    private InternalTopicManager internalTopicManager;
    private CopartitionedTopicsEnforcer copartitionedTopicsEnforcer;
    private RebalanceProtocol rebalanceProtocol;
    private AssignmentListener assignmentListener;

    private Supplier<TaskAssignor> taskAssignorSupplier;
    private byte uniqueField;
    private Map<String, String> clientTags;

    /**
     * The PartitionAssignor and its StreamThread must be mutually accessible, since the former needs
     * the latter's cached metadata while sending subscriptions, and the latter needs the former's
     * returned assignment when adding tasks.
     *
     * @throws KafkaException if the stream thread is not specified
     */
    @Override
    public void configure(final Map<String, ?> configs) {
        final AssignorConfiguration assignorConfiguration = new AssignorConfiguration(configs);

        logPrefix = assignorConfiguration.logPrefix();
        log = new LogContext(logPrefix).logger(getClass());
        usedSubscriptionMetadataVersion = assignorConfiguration.configuredMetadataVersion(usedSubscriptionMetadataVersion);

        final ReferenceContainer referenceContainer = assignorConfiguration.referenceContainer();
        mainConsumerSupplier = () -> Objects.requireNonNull(referenceContainer.mainConsumer, "Main consumer was not specified");
        adminClient = Objects.requireNonNull(referenceContainer.adminClient, "Admin client was not specified");
        taskManager = Objects.requireNonNull(referenceContainer.taskManager, "TaskManager was not specified");
        streamsMetadataState = Objects.requireNonNull(referenceContainer.streamsMetadataState, "StreamsMetadataState was not specified");
        assignmentErrorCode = referenceContainer.assignmentErrorCode;
        nextScheduledRebalanceMs = referenceContainer.nextScheduledRebalanceMs;
        nonFatalExceptionsToHandle = referenceContainer.nonFatalExceptionsToHandle;
        time = Objects.requireNonNull(referenceContainer.time, "Time was not specified");
        assignmentConfigs = assignorConfiguration.assignmentConfigs();
        partitionGrouper = new PartitionGrouper();
        userEndPoint = assignorConfiguration.userEndPoint();
        internalTopicManager = assignorConfiguration.internalTopicManager();
        copartitionedTopicsEnforcer = assignorConfiguration.copartitionedTopicsEnforcer();
        rebalanceProtocol = assignorConfiguration.rebalanceProtocol();
        taskAssignorSupplier = assignorConfiguration::taskAssignor;
        assignmentListener = assignorConfiguration.assignmentListener();
        uniqueField = 0;
        clientTags = referenceContainer.clientTags;
    }

    @Override
    public String name() {
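        // the protocol name that is reported to the group coordinator when joining the consumer group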
        return "stream";
    }

    @Override
    public List<RebalanceProtocol> supportedProtocols() {
        final List<RebalanceProtocol> supportedProtocols = new ArrayList<>();
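        // EAGER is always offered; COOPERATIVE is added on top of it only when the configured rebalance protocol is cooperative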
        supportedProtocols.add(RebalanceProtocol.EAGER);
        if (rebalanceProtocol == RebalanceProtocol.COOPERATIVE) {
            supportedProtocols.add(rebalanceProtocol);
        }
        return supportedProtocols;
    }

    @Override
    public ByteBuffer subscriptionUserData(final Set<String> topics) {
        // Adds the following information to subscription
        // 1. Client UUID (a unique id assigned to an instance of KafkaStreams)
        // 2. Map from task id to its overall lag
        // 3. Unique Field to ensure a rebalance when a thread rejoins by forcing the user data to be different

        handleRebalanceStart(topics);
        uniqueField++;

        final Set<String> currentNamedTopologies = taskManager.topologyMetadata().namedTopologiesView();

        // If using NamedTopologies, filter out any that are no longer recognized/have been removed
        final Map<TaskId, Long> taskOffsetSums = taskManager.topologyMetadata().hasNamedTopologies() ?
            filterMap(taskManager.getTaskOffsetSums(), t -> currentNamedTopologies.contains(t.getKey().topologyName())) :
            taskManager.getTaskOffsetSums();

        return new SubscriptionInfo(
            usedSubscriptionMetadataVersion,
            LATEST_SUPPORTED_VERSION,
            taskManager.processId(),
            userEndPoint,
            taskOffsetSums,
            uniqueField,
            assignmentErrorCode.get(),
            clientTags
        ).encode();
    }

    private Map<String, Assignment> errorAssignment(final Map<UUID, ClientMetadata> clientsMetadata,
                                                    final int errorCode) {
        final Map<String, Assignment> assignment = new HashMap<>();
        for (final ClientMetadata clientMetadata : clientsMetadata.values()) {
            for (final String consumerId : clientMetadata.consumers) {
                assignment.put(consumerId, new Assignment(
                    Collections.emptyList(),
                    new AssignmentInfo(LATEST_SUPPORTED_VERSION,
                        Collections.emptyList(),
                        Collections.emptyMap(),
                        Collections.emptyMap(),
                        Collections.emptyMap(),
                        errorCode).encode()
                ));
            }
        }
        return assignment;
    }

    /*
     * This assigns tasks to consumer clients in the following steps.
     *
     * 0. decode the subscriptions to assemble the metadata for each client and check for version probing
     *
     * 1. check all repartition source topics and use internal topic manager to make sure
     *    they have been created with the right number of partitions. Also verify and/or create
     *    any changelog topics with the correct number of partitions.
     *
     * 2. use the partition grouper to generate tasks along with their assigned partitions, then use
     *    the configured TaskAssignor to construct the mapping of tasks to clients.
     *
     * 3. construct the global mapping of host to partitions to enable query routing.
     *
     * 4. within each client, assign tasks to consumer clients.
     */
    @Override
    public GroupAssignment assign(final Cluster metadata, final GroupSubscription groupSubscription) {
        final Map<String, Subscription> subscriptions = groupSubscription.groupSubscription();

        // ---------------- Step Zero ---------------- //

        // construct the client metadata from the decoded subscription info

        final Map<UUID, ClientMetadata> clientMetadataMap = new HashMap<>();
        final Set<TopicPartition> allOwnedPartitions = new HashSet<>();
        final Map<UUID, Map<String, Optional<String>>> racksForProcessConsumer = new HashMap<>();

        int minReceivedMetadataVersion = LATEST_SUPPORTED_VERSION;
        int minSupportedMetadataVersion = LATEST_SUPPORTED_VERSION;

        boolean shutdownRequested = false;
        boolean assignmentErrorFound = false;
        int futureMetadataVersion = UNKNOWN;
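        // decode each member's subscription and aggregate the info into per-process (client) metadata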
        for (final Map.Entry<String, Subscription> entry : subscriptions.entrySet()) {
            final String consumerId = entry.getKey();
            final Subscription subscription = entry.getValue();
            final SubscriptionInfo info = SubscriptionInfo.decode(subscription.userData());
            final int usedVersion = info.version();
            if (info.errorCode() == AssignorError.SHUTDOWN_REQUESTED.code()) {
                shutdownRequested = true;
            }

            minReceivedMetadataVersion = updateMinReceivedVersion(usedVersion, minReceivedMetadataVersion);
            minSupportedMetadataVersion = updateMinSupportedVersion(info.latestSupportedVersion(), minSupportedMetadataVersion);

            final UUID processId;
            if (usedVersion > LATEST_SUPPORTED_VERSION) {
                futureMetadataVersion = usedVersion;
                processId = FUTURE_ID;
                if (!clientMetadataMap.containsKey(FUTURE_ID)) {
                    clientMetadataMap.put(FUTURE_ID, new ClientMetadata(FUTURE_ID, null, Collections.emptyMap()));
                }
            } else {
                processId = info.processId();
            }

            racksForProcessConsumer.computeIfAbsent(processId, kv -> new HashMap<>()).put(consumerId, subscription.rackId());

            ClientMetadata clientMetadata = clientMetadataMap.get(processId);

            // create the new client metadata if necessary
            if (clientMetadata == null) {
                clientMetadata = new ClientMetadata(info.processId(), info.userEndPoint(), info.clientTags());
                clientMetadataMap.put(info.processId(), clientMetadata);
            }

            // add the consumer and any info in its subscription to the client
            clientMetadata.addConsumer(consumerId, subscription.ownedPartitions());
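            // if the set grows by less than the number of owned partitions, some partition was already
            // claimed by another consumer, i.e. the previous assignment contained it more than once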
            final int prevSize = allOwnedPartitions.size();
            allOwnedPartitions.addAll(subscription.ownedPartitions());
            if (allOwnedPartitions.size() < prevSize + subscription.ownedPartitions().size()) {
                assignmentErrorFound = true;
            }
            clientMetadata.addPreviousTasksAndOffsetSums(consumerId, info.taskOffsetSums());
        }

        if (assignmentErrorFound) {
            log.warn("The previous assignment contains a partition more than once. " +
                "\t Mapping: {}", subscriptions);
        }

        try {
            final boolean versionProbing =
                checkMetadataVersions(minReceivedMetadataVersion, minSupportedMetadataVersion, futureMetadataVersion);

            log.debug("Constructed client metadata {} from the member subscriptions.", clientMetadataMap);

            // ---------------- Step One ---------------- //

            if (shutdownRequested) {
                return new GroupAssignment(errorAssignment(clientMetadataMap, AssignorError.SHUTDOWN_REQUESTED.code()));
            }

            // parse the topology to determine the repartition source topics,
            // making sure they are created with the correct number of partitions: the maximum
            // partition count among the source topics of the sub-topologies that depend on them
            final RepartitionTopics repartitionTopics = prepareRepartitionTopics(metadata);
            final Map<TopicPartition, PartitionInfo> allRepartitionTopicPartitions = repartitionTopics.topicPartitionsInfo();

            final Cluster fullMetadata = metadata.withPartitions(allRepartitionTopicPartitions);
            log.debug("Created repartition topics {} from the parsed topology.", allRepartitionTopicPartitions.values());

            // ---------------- Step Two ---------------- //

            // construct the assignment of tasks to clients

            final Map<Subtopology, TopicsInfo> topicGroups =
                taskManager.topologyMetadata().subtopologyTopicsInfoMapExcluding(repartitionTopics.topologiesWithMissingInputTopics());

            final Set<String> allSourceTopics = new HashSet<>();
            final Map<Subtopology, Set<String>> sourceTopicsByGroup = new HashMap<>();
            for (final Map.Entry<Subtopology, TopicsInfo> entry : topicGroups.entrySet()) {
                allSourceTopics.addAll(entry.getValue().sourceTopics);
                sourceTopicsByGroup.put(entry.getKey(), entry.getValue().sourceTopics);
            }

            // get the tasks as partition groups from the partition grouper
            final Map<TaskId, Set<TopicPartition>> partitionsForTask =
                partitionGrouper.partitionGroups(sourceTopicsByGroup, fullMetadata);

            final Set<TaskId> statefulTasks = new HashSet<>();

            final boolean probingRebalanceNeeded = assignTasksToClients(fullMetadata, allSourceTopics, topicGroups,
                clientMetadataMap, partitionsForTask, racksForProcessConsumer, statefulTasks);

            // ---------------- Step Three ---------------- //

            // construct the global partition assignment per host map

            final Map<HostInfo, Set<TopicPartition>> partitionsByHost = new HashMap<>();
            final Map<HostInfo, Set<TopicPartition>> standbyPartitionsByHost = new HashMap<>();
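            // only populate the host maps if all members are on metadata version 2 or higher,
            // since older members cannot decode the host info in the assignment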
            if (minReceivedMetadataVersion >= 2) {
                populatePartitionsByHostMaps(partitionsByHost, standbyPartitionsByHost, partitionsForTask, clientMetadataMap);
            }

            // ---------------- Step Four ---------------- //

            // compute the assignment of tasks to threads within each client and build the final group assignment

            final Map<String, Assignment> assignment = computeNewAssignment(
                statefulTasks,
                clientMetadataMap,
                partitionsForTask,
                partitionsByHost,
                standbyPartitionsByHost,
                allOwnedPartitions,
                minReceivedMetadataVersion,
                minSupportedMetadataVersion,
                versionProbing,
                probingRebalanceNeeded
            );

            return new GroupAssignment(assignment);
        } catch (final MissingSourceTopicException e) {
            log.error("Caught an error in the task assignment. Returning an error assignment.", e);
            return new GroupAssignment(
                errorAssignment(clientMetadataMap, AssignorError.INCOMPLETE_SOURCE_TOPIC_METADATA.code())
            );
        } catch (final TaskAssignmentException e) {
            log.error("Caught an error in the task assignment. Returning an error assignment.", e);
            return new GroupAssignment(
                errorAssignment(clientMetadataMap, AssignorError.ASSIGNMENT_ERROR.code())
            );
        }
    }

    /**
     * Verify the subscription versions are within the expected bounds and check for version probing.
     *
     * @return whether this was a version probing rebalance
     */
    private boolean checkMetadataVersions(final int minReceivedMetadataVersion,
                                          final int minSupportedMetadataVersion,
                                          final int futureMetadataVersion) {
        final boolean versionProbing;

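        // futureMetadataVersion is only set when some member sent a subscription with a newer version than we support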
        if (futureMetadataVersion == UNKNOWN) {
            versionProbing = false;
        } else if (minReceivedMetadataVersion >= EARLIEST_PROBEABLE_VERSION) {
            versionProbing = true;
            log.info("Received a future (version probing) subscription (version: {})."
                         + " Sending assignment back (with supported version {}).",
                futureMetadataVersion,
                minSupportedMetadataVersion);

        } else {
            throw new TaskAssignmentException(
                "Received a future (version probing) subscription (version: " + futureMetadataVersion
                    + ") and an incompatible pre Kafka 2.0 subscription (version: " + minReceivedMetadataVersion
                    + ") at the same time."
            );
        }

        if (minReceivedMetadataVersion < LATEST_SUPPORTED_VERSION) {
            log.info("Downgrade metadata to version {}. Latest supported version is {}.",
                minReceivedMetadataVersion,
                LATEST_SUPPORTED_VERSION);
        }
        if (minSupportedMetadataVersion < LATEST_SUPPORTED_VERSION) {
            log.info("Downgrade latest supported metadata to version {}. Latest supported version is {}.",
                minSupportedMetadataVersion,
                LATEST_SUPPORTED_VERSION);
        }
        return versionProbing;
    }

    /**
     * Computes and assembles all repartition topic metadata then creates the topics if necessary. Also verifies
     * that all user input topics of each topology have been created ahead of time. If any such source topics are
     * missing from a NamedTopology, the assignor will skip distributing its tasks until they have been created
     * and invoke the exception handler (without killing the thread) once for each topology to alert the user of
     * the missing topics.
     * 

* For regular applications without named topologies, the assignor will instead send a shutdown signal to * all clients so the user can identify and resolve the problem. * * @return application metadata such as partition info of repartition topics, missing external topics, etc */ private RepartitionTopics prepareRepartitionTopics(final Cluster metadata) { final RepartitionTopics repartitionTopics = new RepartitionTopics( taskManager.topologyMetadata(), internalTopicManager, copartitionedTopicsEnforcer, metadata, logPrefix ); repartitionTopics.setup(); final boolean isMissingInputTopics = !repartitionTopics.missingSourceTopicExceptions().isEmpty(); if (isMissingInputTopics) { if (!taskManager.topologyMetadata().hasNamedTopologies()) { throw new MissingSourceTopicException("Missing source topics."); } else { nonFatalExceptionsToHandle.addAll(repartitionTopics.missingSourceTopicExceptions()); } } return repartitionTopics; } /** * Populates the taskForPartition and tasksForTopicGroup maps, and checks that partitions are assigned to exactly * one task. * * @param taskForPartition a map from partition to the corresponding task. Populated here. * @param tasksForTopicGroup a map from the topicGroupId to the set of corresponding tasks. Populated here. * @param allSourceTopics a set of all source topics in the topology * @param partitionsForTask a map from task to the set of input partitions * @param fullMetadata the cluster metadata */ private void populateTasksForMaps(final Map taskForPartition, final Map> tasksForTopicGroup, final Set allSourceTopics, final Map> partitionsForTask, final Cluster fullMetadata) { // check if all partitions are assigned, and there are no duplicates of partitions in multiple tasks final Set allAssignedPartitions = new HashSet<>(); for (final Map.Entry> entry : partitionsForTask.entrySet()) { final TaskId id = entry.getKey(); final Set partitions = entry.getValue(); for (final TopicPartition partition : partitions) { taskForPartition.put(partition, id); if (allAssignedPartitions.contains(partition)) { log.warn("Partition {} is assigned to more than one tasks: {}", partition, partitionsForTask); } } allAssignedPartitions.addAll(partitions); tasksForTopicGroup.computeIfAbsent(new Subtopology(id.subtopology(), id.topologyName()), k -> new HashSet<>()).add(id); } checkAllPartitions(allSourceTopics, partitionsForTask, allAssignedPartitions, fullMetadata); } // Logs a warning if any partitions are not assigned to a task, or a task has no assigned partitions private void checkAllPartitions(final Set allSourceTopics, final Map> partitionsForTask, final Set allAssignedPartitions, final Cluster fullMetadata) { for (final String topic : allSourceTopics) { final List partitionInfoList = fullMetadata.partitionsForTopic(topic); if (partitionInfoList.isEmpty()) { log.warn("No partitions found for topic {}", topic); } else { for (final PartitionInfo partitionInfo : partitionInfoList) { final TopicPartition partition = new TopicPartition(partitionInfo.topic(), partitionInfo.partition()); if (!allAssignedPartitions.contains(partition)) { log.warn("Partition {} is not assigned to any tasks: {}" + " Possible causes of a partition not getting assigned" + " is that another topic defined in the topology has not been" + " created when starting your streams application," + " resulting in no tasks created for this topology at all.", partition, partitionsForTask); } } } } } /** * Assigns a set of tasks to each client (Streams instance) using the configured task assignor, and also * populate the 
stateful tasks that have been assigned to the clients * @return true if a probing rebalance should be triggered */ private boolean assignTasksToClients(final Cluster fullMetadata, final Set allSourceTopics, final Map topicGroups, final Map clientMetadataMap, final Map> partitionsForTask, final Map>> racksForProcessConsumer, final Set statefulTasks) { if (!statefulTasks.isEmpty()) { throw new TaskAssignmentException("The stateful tasks should not be populated before assigning tasks to clients"); } final Map taskForPartition = new HashMap<>(); final Map> tasksForTopicGroup = new HashMap<>(); populateTasksForMaps(taskForPartition, tasksForTopicGroup, allSourceTopics, partitionsForTask, fullMetadata); final ChangelogTopics changelogTopics = new ChangelogTopics( internalTopicManager, topicGroups, tasksForTopicGroup, logPrefix ); changelogTopics.setup(); final Map clientStates = new HashMap<>(); final boolean lagComputationSuccessful = populateClientStatesMap(clientStates, clientMetadataMap, taskForPartition, changelogTopics); log.info("{} client nodes and {} consumers participating in this rebalance: \n{}.", clientStates.size(), clientStates.values().stream().map(ClientState::capacity).reduce(Integer::sum), clientStates.entrySet().stream() .sorted(comparingByKey()) .map(entry -> entry.getKey() + ": " + entry.getValue().consumers()) .collect(Collectors.joining(Utils.NL))); final Set allTasks = partitionsForTask.keySet(); statefulTasks.addAll(changelogTopics.statefulTaskIds()); log.info("Assigning stateful tasks: {}\n" + "and stateless tasks: {}", statefulTasks, allTasks.stream().filter(t -> !statefulTasks.contains(t)).collect(Collectors.toSet())); log.debug("Assigning tasks and {} standby replicas to client nodes {}", numStandbyReplicas(), clientStates); final TaskAssignor taskAssignor = createTaskAssignor(lagComputationSuccessful); final RackAwareTaskAssignor rackAwareTaskAssignor = new RackAwareTaskAssignor( fullMetadata, partitionsForTask, changelogTopics.changelogPartionsForTask(), tasksForTopicGroup, racksForProcessConsumer, internalTopicManager, assignmentConfigs, time ); final boolean probingRebalanceNeeded = taskAssignor.assign(clientStates, allTasks, statefulTasks, rackAwareTaskAssignor, assignmentConfigs); // Break this up into multiple logs to make sure the summary info gets through, which helps avoid // info loss for example due to long line truncation with large apps log.info("Assigned {} total tasks including {} stateful tasks to {} client nodes.", allTasks.size(), statefulTasks.size(), clientStates.size()); log.info("Assignment of tasks to nodes: {}", clientStates.entrySet().stream() .sorted(comparingByKey()) .map(entry -> entry.getKey() + "=" + entry.getValue().currentAssignment()) .collect(Collectors.joining(Utils.NL))); return probingRebalanceNeeded; } private TaskAssignor createTaskAssignor(final boolean lagComputationSuccessful) { final TaskAssignor taskAssignor = taskAssignorSupplier.get(); if (taskAssignor instanceof StickyTaskAssignor) { // special case: to preserve pre-existing behavior, we invoke the StickyTaskAssignor // whether or not lag computation failed. 
return taskAssignor; } else if (lagComputationSuccessful) { return taskAssignor; } else { log.info("Failed to fetch end offsets for changelogs, will return previous assignment to clients and " + "trigger another rebalance to retry."); return new FallbackPriorTaskAssignor(); } } /** * Builds a map from client to state, and readies each ClientState for assignment by adding any missing prev tasks * and computing the per-task overall lag based on the fetched end offsets for each changelog. * * @param clientStates a map from each client to its state, including offset lags. Populated by this method. * @param clientMetadataMap a map from each client to its full metadata * @param taskForPartition map from topic partition to its corresponding task * @param changelogTopics object that manages changelog topics * * @return whether we were able to successfully fetch the changelog end offsets and compute each client's lag */ private boolean populateClientStatesMap(final Map clientStates, final Map clientMetadataMap, final Map taskForPartition, final ChangelogTopics changelogTopics) { boolean fetchEndOffsetsSuccessful; Map allTaskEndOffsetSums; try { // Make the listOffsets request first so it can fetch the offsets for non-source changelogs // asynchronously while we use the blocking Consumer#committed call to fetch source-changelog offsets; // note that we would need to wrap all exceptions as Streams exception with partition-level fine-grained // error messages final ListOffsetsResult endOffsetsResult = fetchEndOffsetsResult(changelogTopics.preExistingNonSourceTopicBasedPartitions(), adminClient); final Map sourceChangelogEndOffsets = fetchCommittedOffsets(changelogTopics.preExistingSourceTopicBasedPartitions(), mainConsumerSupplier.get()); final Map endOffsets = ClientUtils.getEndOffsets( endOffsetsResult, changelogTopics.preExistingNonSourceTopicBasedPartitions()); allTaskEndOffsetSums = computeEndOffsetSumsByTask( endOffsets, sourceChangelogEndOffsets, changelogTopics ); fetchEndOffsetsSuccessful = true; } catch (final StreamsException | TimeoutException e) { log.info("Failed to retrieve all end offsets for changelogs, and hence could not calculate the per-task lag; " + "this is not a fatal error but would cause the assignor to fallback to a naive algorithm", e); allTaskEndOffsetSums = changelogTopics.statefulTaskIds().stream().collect(Collectors.toMap(t -> t, t -> UNKNOWN_OFFSET_SUM)); fetchEndOffsetsSuccessful = false; } for (final Map.Entry entry : clientMetadataMap.entrySet()) { final UUID uuid = entry.getKey(); final ClientState state = entry.getValue().state; state.initializePrevTasks(taskForPartition, taskManager.topologyMetadata().hasNamedTopologies()); state.computeTaskLags(uuid, allTaskEndOffsetSums); clientStates.put(uuid, state); } return fetchEndOffsetsSuccessful; } /** * @param endOffsets the listOffsets result from the adminClient * @param sourceChangelogEndOffsets the end (committed) offsets of optimized source changelogs * @param changelogTopics object that manages changelog topics * * @return Map from stateful task to its total end offset summed across all changelog partitions */ private Map computeEndOffsetSumsByTask(final Map endOffsets, final Map sourceChangelogEndOffsets, final ChangelogTopics changelogTopics) { final Map taskEndOffsetSums = new HashMap<>(); for (final TaskId taskId : changelogTopics.statefulTaskIds()) { taskEndOffsetSums.put(taskId, 0L); for (final TopicPartition changelogPartition : changelogTopics.preExistingPartitionsFor(taskId)) { final long 
changelogPartitionEndOffset; if (sourceChangelogEndOffsets.containsKey(changelogPartition)) { changelogPartitionEndOffset = sourceChangelogEndOffsets.get(changelogPartition); } else if (endOffsets.containsKey(changelogPartition)) { changelogPartitionEndOffset = endOffsets.get(changelogPartition).offset(); } else { log.debug("Fetched offsets did not contain the changelog {} of task {}", changelogPartition, taskId); throw new IllegalStateException("Could not get end offset for " + changelogPartition); } final long newEndOffsetSum = taskEndOffsetSums.get(taskId) + changelogPartitionEndOffset; if (newEndOffsetSum < 0) { taskEndOffsetSums.put(taskId, Long.MAX_VALUE); break; } else { taskEndOffsetSums.put(taskId, newEndOffsetSum); } } } return taskEndOffsetSums; } /** * Populates the global partitionsByHost and standbyPartitionsByHost maps that are sent to each member * * @param partitionsByHost a map from host to the set of partitions hosted there. Populated here. * @param standbyPartitionsByHost a map from host to the set of standby partitions hosted there. Populated here. * @param partitionsForTask a map from task to its set of assigned partitions * @param clientMetadataMap a map from client to its metadata and state */ private void populatePartitionsByHostMaps(final Map> partitionsByHost, final Map> standbyPartitionsByHost, final Map> partitionsForTask, final Map clientMetadataMap) { for (final Map.Entry entry : clientMetadataMap.entrySet()) { final HostInfo hostInfo = entry.getValue().hostInfo; // if application server is configured, also include host state map if (hostInfo != null) { final Set topicPartitions = new HashSet<>(); final Set standbyPartitions = new HashSet<>(); final ClientState state = entry.getValue().state; for (final TaskId id : state.activeTasks()) { topicPartitions.addAll(partitionsForTask.get(id)); } for (final TaskId id : state.standbyTasks()) { standbyPartitions.addAll(partitionsForTask.get(id)); } partitionsByHost.put(hostInfo, topicPartitions); standbyPartitionsByHost.put(hostInfo, standbyPartitions); } } } /** * Computes the assignment of tasks to threads within each client and assembles the final assignment to send out. 
* * @return the final assignment for each StreamThread consumer */ private Map computeNewAssignment(final Set statefulTasks, final Map clientsMetadata, final Map> partitionsForTask, final Map> partitionsByHostState, final Map> standbyPartitionsByHost, final Set allOwnedPartitions, final int minUserMetadataVersion, final int minSupportedMetadataVersion, final boolean versionProbing, final boolean shouldTriggerProbingRebalance) { boolean rebalanceRequired = shouldTriggerProbingRebalance || versionProbing; final Map assignment = new HashMap<>(); // within the client, distribute tasks to its owned consumers for (final Map.Entry clientEntry : clientsMetadata.entrySet()) { final UUID clientId = clientEntry.getKey(); final ClientMetadata clientMetadata = clientEntry.getValue(); final ClientState state = clientMetadata.state; final SortedSet consumers = clientMetadata.consumers; final Map threadTaskCounts = new HashMap<>(); final Map> activeTaskStatefulAssignment = assignTasksToThreads( state.statefulActiveTasks(), true, consumers, state, threadTaskCounts ); final Map> standbyTaskAssignment = assignTasksToThreads( state.standbyTasks(), true, consumers, state, threadTaskCounts ); final Map> activeTaskStatelessAssignment = assignTasksToThreads( state.statelessActiveTasks(), false, consumers, state, threadTaskCounts ); // Combine activeTaskStatefulAssignment and activeTaskStatelessAssignment together into // activeTaskStatelessAssignment final Map> activeTaskAssignment = activeTaskStatefulAssignment; for (final Map.Entry> threadEntry : activeTaskStatelessAssignment.entrySet()) { activeTaskAssignment.get(threadEntry.getKey()).addAll(threadEntry.getValue()); } // Arbitrarily choose the leader's client to be responsible for triggering the probing rebalance, // note once we pick the first consumer within the process to trigger probing rebalance, other consumer // would not set to trigger any more. 
final boolean encodeNextProbingRebalanceTime = shouldTriggerProbingRebalance && clientId.equals(taskManager.processId()); final boolean tasksRevoked = addClientAssignments( statefulTasks, assignment, clientMetadata, partitionsForTask, partitionsByHostState, standbyPartitionsByHost, allOwnedPartitions, activeTaskAssignment, standbyTaskAssignment, minUserMetadataVersion, minSupportedMetadataVersion, encodeNextProbingRebalanceTime ); if (tasksRevoked || encodeNextProbingRebalanceTime) { rebalanceRequired = true; log.debug("Requested client {} to schedule a followup rebalance", clientId); } log.info("Client {} per-consumer assignment:\n" + "\tprev owned active {}\n" + "\tprev owned standby {}\n" + "\tassigned active {}\n" + "\trevoking active {}\n" + "\tassigned standby {}\n", clientId, clientMetadata.state.prevOwnedActiveTasksByConsumer(), clientMetadata.state.prevOwnedStandbyByConsumer(), clientMetadata.state.assignedActiveTasksByConsumer(), clientMetadata.state.revokingActiveTasksByConsumer(), clientMetadata.state.assignedStandbyTasksByConsumer()); } if (rebalanceRequired) { assignmentListener.onAssignmentComplete(false); log.info("Finished unstable assignment of tasks, a followup rebalance will be scheduled."); } else { assignmentListener.onAssignmentComplete(true); log.info("Finished stable assignment of tasks, no followup rebalances required."); } return assignment; } /** * Adds the encoded assignment for each StreamThread consumer in the client to the overall assignment map * @return true if a followup rebalance will be required due to revoked tasks */ private boolean addClientAssignments(final Set statefulTasks, final Map assignment, final ClientMetadata clientMetadata, final Map> partitionsForTask, final Map> partitionsByHostState, final Map> standbyPartitionsByHost, final Set allOwnedPartitions, final Map> activeTaskAssignments, final Map> standbyTaskAssignments, final int minUserMetadataVersion, final int minSupportedMetadataVersion, final boolean probingRebalanceNeeded) { boolean followupRebalanceRequiredForRevokedTasks = false; // We only want to encode a scheduled probing rebalance for a single member in this client boolean shouldEncodeProbingRebalance = probingRebalanceNeeded; // Loop through the consumers and build their assignment for (final String consumer : clientMetadata.consumers) { final List activeTasksForConsumer = activeTaskAssignments.get(consumer); // These will be filled in by populateActiveTaskAndPartitionsLists below final List activePartitionsList = new ArrayList<>(); final List assignedActiveList = new ArrayList<>(); final Set activeTasksRemovedPendingRevokation = populateActiveTaskAndPartitionsLists( activePartitionsList, assignedActiveList, consumer, clientMetadata.state, activeTasksForConsumer, partitionsForTask, allOwnedPartitions ); final Map> standbyTaskMap = buildStandbyTaskMap( consumer, standbyTaskAssignments.get(consumer), activeTasksRemovedPendingRevokation, statefulTasks, partitionsForTask, clientMetadata.state ); final AssignmentInfo info = new AssignmentInfo( minUserMetadataVersion, minSupportedMetadataVersion, assignedActiveList, standbyTaskMap, partitionsByHostState, standbyPartitionsByHost, AssignorError.NONE.code() ); if (!activeTasksRemovedPendingRevokation.isEmpty()) { // TODO: once KAFKA-10078 is resolved we can leave it to the client to trigger this rebalance log.info("Requesting followup rebalance be scheduled immediately by {} due to tasks changing ownership.", consumer); info.setNextRebalanceTime(0L); 
followupRebalanceRequiredForRevokedTasks = true; // Don't bother to schedule a probing rebalance if an immediate one is already scheduled shouldEncodeProbingRebalance = false; } else if (shouldEncodeProbingRebalance) { final long nextRebalanceTimeMs = time.milliseconds() + probingRebalanceIntervalMs(); log.info("Requesting followup rebalance be scheduled by {} for {} to probe for caught-up replica tasks.", consumer, Utils.toLogDateTimeFormat(nextRebalanceTimeMs)); info.setNextRebalanceTime(nextRebalanceTimeMs); shouldEncodeProbingRebalance = false; } assignment.put( consumer, new Assignment( activePartitionsList, info.encode() ) ); } return followupRebalanceRequiredForRevokedTasks; } /** * Populates the lists of active tasks and active task partitions for the consumer with a 1:1 mapping between them * such that the nth task corresponds to the nth partition in the list. This means tasks with multiple partitions * will be repeated in the list. */ private Set populateActiveTaskAndPartitionsLists(final List activePartitionsList, final List assignedActiveList, final String consumer, final ClientState clientState, final List activeTasksForConsumer, final Map> partitionsForTask, final Set allOwnedPartitions) { final List assignedPartitions = new ArrayList<>(); final Set removedActiveTasks = new TreeSet<>(); for (final TaskId taskId : activeTasksForConsumer) { // Populate the consumer for assigned tasks without considering revocation, // this is for debugging purposes only clientState.assignActiveToConsumer(taskId, consumer); final List assignedPartitionsForTask = new ArrayList<>(); for (final TopicPartition partition : partitionsForTask.get(taskId)) { final String oldOwner = clientState.previousOwnerForPartition(partition); final boolean newPartitionForConsumer = oldOwner == null || !oldOwner.equals(consumer); // If the partition is new to this consumer but is still owned by another, remove from the assignment // until it has been revoked and can safely be reassigned according to the COOPERATIVE protocol if (newPartitionForConsumer && allOwnedPartitions.contains(partition)) { log.info( "Removing task {} from {} active assignment until it is safely revoked in followup rebalance", taskId, consumer ); removedActiveTasks.add(taskId); clientState.revokeActiveFromConsumer(taskId, consumer); // Clear the assigned partitions list for this task if any partition can not safely be assigned, // so as not to encode a partial task assignedPartitionsForTask.clear(); // This has no effect on the assignment, as we'll never consult the ClientState again, but // it does perform a useful assertion that the task was actually assigned. 
clientState.unassignActive(taskId); break; } else { assignedPartitionsForTask.add(new AssignedPartition(taskId, partition)); } } // assignedPartitionsForTask will either contain all partitions for the task or be empty, so just add all assignedPartitions.addAll(assignedPartitionsForTask); } // Add one copy of a task for each corresponding partition, so the receiver can determine the task <-> tp mapping Collections.sort(assignedPartitions); for (final AssignedPartition partition : assignedPartitions) { assignedActiveList.add(partition.taskId); activePartitionsList.add(partition.partition); } return removedActiveTasks; } /** * @return map from task id to its assigned partitions for all standby tasks */ private Map> buildStandbyTaskMap(final String consumer, final Iterable standbyTasks, final Iterable revokedTasks, final Set allStatefulTasks, final Map> partitionsForTask, final ClientState clientState) { final Map> standbyTaskMap = new HashMap<>(); for (final TaskId task : standbyTasks) { clientState.assignStandbyToConsumer(task, consumer); standbyTaskMap.put(task, partitionsForTask.get(task)); } for (final TaskId task : revokedTasks) { // If this task is stateful and already owned by the consumer, but can't (yet) be assigned as an active // task during this rebalance as it must be revoked from another consumer first, place a temporary // standby task here until it can receive the active task to avoid closing the state store (and losing // all of the accumulated state in the case of in-memory stores) if (clientState.previouslyOwnedStandby(task) && allStatefulTasks.contains(task)) { log.info("Adding removed stateful active task {} as a standby for {} until it is revoked and can " + "be transitioned to active in a followup rebalance", task, consumer); // This has no effect on the assignment, as we'll never consult the ClientState again, but // it does perform a useful assertion that the it's legal to assign this task as a standby to this instance clientState.assignStandbyToConsumer(task, consumer); clientState.assignStandby(task); standbyTaskMap.put(task, partitionsForTask.get(task)); } } return standbyTaskMap; } /** * Generate an assignment that tries to preserve thread-level stickiness for stateful tasks without violating * balance. The tasks are balanced across threads. Stateful tasks without previous owners will be interleaved by * group id to spread subtopologies across threads and further balance the workload. * Stateless tasks are simply spread across threads without taking into account previous ownership. 
* threadLoad is a map that keeps track of task load per thread across multiple calls so active and standby * tasks are evenly distributed */ static Map> assignTasksToThreads(final Collection tasksToAssign, final boolean isStateful, final SortedSet consumers, final ClientState state, final Map threadLoad) { final Map> assignment = new HashMap<>(); for (final String consumer : consumers) { assignment.put(consumer, new ArrayList<>()); } final int totalTasks = threadLoad.values().stream().reduce(tasksToAssign.size(), Integer::sum); final int minTasksPerThread = (int) Math.floor(((double) totalTasks) / consumers.size()); final PriorityQueue unassignedTasks = new PriorityQueue<>(tasksToAssign); final Queue consumersToFill = new LinkedList<>(); // keep track of tasks that we have to skip during the first pass in case we can reassign them later // using tree-map to make sure the iteration ordering over keys are preserved final Map unassignedTaskToPreviousOwner = new TreeMap<>(); if (!unassignedTasks.isEmpty()) { // First assign tasks to previous owner, up to the min expected tasks/thread if these are stateful for (final String consumer : consumers) { final List threadAssignment = assignment.get(consumer); // The number of tasks we have to assign here to hit minTasksPerThread final int tasksTargetCount = minTasksPerThread - threadLoad.getOrDefault(consumer, 0); if (isStateful) { for (final TaskId task : state.prevTasksByLag(consumer)) { if (unassignedTasks.contains(task)) { if (threadAssignment.size() < tasksTargetCount) { threadAssignment.add(task); unassignedTasks.remove(task); } else { unassignedTaskToPreviousOwner.put(task, consumer); } } } } if (threadAssignment.size() < tasksTargetCount) { consumersToFill.offer(consumer); } } // Next interleave remaining unassigned tasks amongst unfilled consumers while (!consumersToFill.isEmpty()) { final TaskId task = unassignedTasks.poll(); if (task != null) { final String consumer = consumersToFill.poll(); final List threadAssignment = assignment.get(consumer); threadAssignment.add(task); final int threadTaskCount = threadAssignment.size() + threadLoad.getOrDefault(consumer, 0); if (threadTaskCount < minTasksPerThread) { consumersToFill.offer(consumer); } } else { throw new TaskAssignmentException("Ran out of unassigned stateful tasks but some members were not at capacity"); } } // At this point all consumers are at the min or min + 1 capacity. // The min + 1 case can occur for standbys where there's fewer standbys than consumers and after assigning // the active tasks some consumers already have min + 1 one tasks assigned. 
// The tasks still remaining should now be distributed over the consumers that are still at min capacity if (!unassignedTasks.isEmpty()) { for (final String consumer : consumers) { final int taskCount = assignment.get(consumer).size() + threadLoad.getOrDefault(consumer, 0); if (taskCount == minTasksPerThread) { consumersToFill.add(consumer); } } // Go over the tasks we skipped earlier and assign them to their previous owner when possible for (final Map.Entry taskEntry : unassignedTaskToPreviousOwner.entrySet()) { final TaskId task = taskEntry.getKey(); final String consumer = taskEntry.getValue(); if (consumersToFill.contains(consumer) && unassignedTasks.contains(task)) { assignment.get(consumer).add(task); unassignedTasks.remove(task); // Remove this consumer since we know it is now at minCapacity + 1 consumersToFill.remove(consumer); } } // Now just distribute the remaining unassigned stateful tasks over the consumers still at min capacity for (final TaskId task : unassignedTasks) { final String consumer = consumersToFill.poll(); final List threadAssignment = assignment.get(consumer); threadAssignment.add(task); } } } // Update threadLoad for (final Map.Entry> taskEntry : assignment.entrySet()) { final String consumer = taskEntry.getKey(); final int totalCount = threadLoad.getOrDefault(consumer, 0) + taskEntry.getValue().size(); threadLoad.put(consumer, totalCount); } return assignment; } private void validateMetadataVersions(final int receivedAssignmentMetadataVersion, final int latestCommonlySupportedVersion) { if (receivedAssignmentMetadataVersion > usedSubscriptionMetadataVersion) { log.error("Leader sent back an assignment with version {} which was greater than our used version {}", receivedAssignmentMetadataVersion, usedSubscriptionMetadataVersion); throw new TaskAssignmentException( "Sent a version " + usedSubscriptionMetadataVersion + " subscription but got an assignment with higher version " + receivedAssignmentMetadataVersion + "." ); } if (latestCommonlySupportedVersion > LATEST_SUPPORTED_VERSION) { log.error("Leader sent back assignment with commonly supported version {} that is greater than our " + "actual latest supported version {}", latestCommonlySupportedVersion, LATEST_SUPPORTED_VERSION); throw new TaskAssignmentException("Can't upgrade to metadata version greater than we support"); } } // Returns true if subscription version was changed, indicating version probing and need to rebalance again protected boolean maybeUpdateSubscriptionVersion(final int receivedAssignmentMetadataVersion, final int latestCommonlySupportedVersion) { if (receivedAssignmentMetadataVersion >= EARLIEST_PROBEABLE_VERSION) { // If the latest commonly supported version is now greater than our used version, this indicates we have just // completed the rolling upgrade and can now update our subscription version for the final rebalance if (latestCommonlySupportedVersion > usedSubscriptionMetadataVersion) { log.info( "Sent a version {} subscription and group's latest commonly supported version is {} (successful " + "version probing and end of rolling upgrade). Upgrading subscription metadata version to " + "{} for next rebalance.", usedSubscriptionMetadataVersion, latestCommonlySupportedVersion, latestCommonlySupportedVersion ); usedSubscriptionMetadataVersion = latestCommonlySupportedVersion; return true; } // If we received a lower version than we sent, someone else in the group still hasn't upgraded. 
We // should downgrade our subscription until everyone is on the latest version if (receivedAssignmentMetadataVersion < usedSubscriptionMetadataVersion) { log.info( "Sent a version {} subscription and got version {} assignment back (successful version probing). " + "Downgrade subscription metadata to commonly supported version {} and trigger new rebalance.", usedSubscriptionMetadataVersion, receivedAssignmentMetadataVersion, latestCommonlySupportedVersion ); usedSubscriptionMetadataVersion = latestCommonlySupportedVersion; return true; } } else { log.debug("Received an assignment version {} that is less than the earliest version that allows version " + "probing {}. If this is not during a rolling upgrade from version 2.0 or below, this is an error.", receivedAssignmentMetadataVersion, EARLIEST_PROBEABLE_VERSION); } return false; } @Override public void onAssignment(final Assignment assignment, final ConsumerGroupMetadata metadata) { final List partitions = new ArrayList<>(assignment.partitions()); partitions.sort(PARTITION_COMPARATOR); final AssignmentInfo info = AssignmentInfo.decode(assignment.userData()); if (info.errCode() != AssignorError.NONE.code()) { // set flag to shutdown streams app assignmentErrorCode.set(info.errCode()); return; } /* * latestCommonlySupportedVersion belongs to [usedSubscriptionMetadataVersion, LATEST_SUPPORTED_VERSION] * receivedAssignmentMetadataVersion belongs to [EARLIEST_PROBEABLE_VERSION, usedSubscriptionMetadataVersion] * * usedSubscriptionMetadataVersion will be downgraded to receivedAssignmentMetadataVersion during a rolling * bounce upgrade with version probing. * * usedSubscriptionMetadataVersion will be upgraded to latestCommonlySupportedVersion when all members have * been bounced and it is safe to use the latest version. 
*/ final int receivedAssignmentMetadataVersion = info.version(); final int latestCommonlySupportedVersion = info.commonlySupportedVersion(); validateMetadataVersions(receivedAssignmentMetadataVersion, latestCommonlySupportedVersion); // version 1 field final Map> activeTasks; // version 2 fields final Map topicToPartitionInfo; final Map> partitionsByHost; final Map> standbyPartitionsByHost; final long encodedNextScheduledRebalanceMs; switch (receivedAssignmentMetadataVersion) { case 1: validateActiveTaskEncoding(partitions, info, logPrefix); activeTasks = getActiveTasks(partitions, info); partitionsByHost = Collections.emptyMap(); standbyPartitionsByHost = Collections.emptyMap(); topicToPartitionInfo = Collections.emptyMap(); encodedNextScheduledRebalanceMs = Long.MAX_VALUE; break; case 2: case 3: case 4: case 5: validateActiveTaskEncoding(partitions, info, logPrefix); activeTasks = getActiveTasks(partitions, info); partitionsByHost = info.partitionsByHost(); standbyPartitionsByHost = Collections.emptyMap(); topicToPartitionInfo = getTopicPartitionInfo(partitionsByHost); encodedNextScheduledRebalanceMs = Long.MAX_VALUE; break; case 6: validateActiveTaskEncoding(partitions, info, logPrefix); activeTasks = getActiveTasks(partitions, info); partitionsByHost = info.partitionsByHost(); standbyPartitionsByHost = info.standbyPartitionByHost(); topicToPartitionInfo = getTopicPartitionInfo(partitionsByHost); encodedNextScheduledRebalanceMs = Long.MAX_VALUE; break; case 7: case 8: case 9: case 10: case 11: validateActiveTaskEncoding(partitions, info, logPrefix); activeTasks = getActiveTasks(partitions, info); partitionsByHost = info.partitionsByHost(); standbyPartitionsByHost = info.standbyPartitionByHost(); topicToPartitionInfo = getTopicPartitionInfo(partitionsByHost); encodedNextScheduledRebalanceMs = info.nextRebalanceMs(); break; default: throw new IllegalStateException( "This code should never be reached." 
+ " Please file a bug report at https://issues.apache.org/jira/projects/KAFKA/" ); } maybeScheduleFollowupRebalance( encodedNextScheduledRebalanceMs, receivedAssignmentMetadataVersion, latestCommonlySupportedVersion, partitionsByHost.keySet() ); streamsMetadataState.onChange(partitionsByHost, standbyPartitionsByHost, topicToPartitionInfo); // we do not capture any exceptions but just let the exception thrown from consumer.poll directly // since when stream thread captures it, either we close all tasks as dirty or we close thread taskManager.handleAssignment(activeTasks, info.standbyTasks()); } private void maybeScheduleFollowupRebalance(final long encodedNextScheduledRebalanceMs, final int receivedAssignmentMetadataVersion, final int latestCommonlySupportedVersion, final Set groupHostInfo) { if (maybeUpdateSubscriptionVersion(receivedAssignmentMetadataVersion, latestCommonlySupportedVersion)) { log.info("Requested to schedule immediate rebalance due to version probing."); nextScheduledRebalanceMs.set(0L); } else if (!verifyHostInfo(groupHostInfo)) { log.info("Requested to schedule immediate rebalance to update group with new host endpoint = {}.", userEndPoint); nextScheduledRebalanceMs.set(0L); } else if (encodedNextScheduledRebalanceMs == 0L) { log.info("Requested to schedule immediate rebalance for new tasks to be safely revoked from current owner."); nextScheduledRebalanceMs.set(0L); } else if (encodedNextScheduledRebalanceMs < Long.MAX_VALUE) { log.info( "Requested to schedule next probing rebalance at {} to try for a more balanced assignment.", Utils.toLogDateTimeFormat(encodedNextScheduledRebalanceMs) ); nextScheduledRebalanceMs.set(encodedNextScheduledRebalanceMs); } else { log.info("No followup rebalance was requested, resetting the rebalance schedule."); nextScheduledRebalanceMs.set(Long.MAX_VALUE); } } /** * Verify that this client's host info was included in the map returned in the assignment, and trigger a * rebalance if not. This may be necessary when using static membership, as a rejoining client will be handed * back its original assignment to avoid an unnecessary rebalance. If the client's endpoint has changed, we need * to force a rebalance for the other members in the group to get the updated host info for this client. 
* * @param groupHostInfo the HostInfo of all clients in the group * @return false if the current host info does not match that in the group assignment */ private boolean verifyHostInfo(final Set groupHostInfo) { if (userEndPoint != null && !groupHostInfo.isEmpty()) { final HostInfo myHostInfo = HostInfo.buildFromEndpoint(userEndPoint); return groupHostInfo.contains(myHostInfo); } else { return true; } } // protected for upgrade test protected static Map> getActiveTasks(final List partitions, final AssignmentInfo info) { final Map> activeTasks = new HashMap<>(); for (int i = 0; i < partitions.size(); i++) { final TopicPartition partition = partitions.get(i); final TaskId id = info.activeTasks().get(i); activeTasks.computeIfAbsent(id, k1 -> new HashSet<>()).add(partition); } return activeTasks; } static Map getTopicPartitionInfo(final Map> partitionsByHost) { final Map topicToPartitionInfo = new HashMap<>(); for (final Set value : partitionsByHost.values()) { for (final TopicPartition topicPartition : value) { topicToPartitionInfo.put( topicPartition, new PartitionInfo( topicPartition.topic(), topicPartition.partition(), null, new Node[0], new Node[0] ) ); } } return topicToPartitionInfo; } private static void validateActiveTaskEncoding(final List partitions, final AssignmentInfo info, final String logPrefix) { // the number of assigned partitions should be the same as number of active tasks, which // could be duplicated if one task has more than one assigned partitions if (partitions.size() != info.activeTasks().size()) { throw new TaskAssignmentException( String.format( "%sNumber of assigned partitions %d is not equal to " + "the number of active taskIds %d, assignmentInfo=%s", logPrefix, partitions.size(), info.activeTasks().size(), info ) ); } } private int updateMinReceivedVersion(final int usedVersion, final int minReceivedMetadataVersion) { return Math.min(usedVersion, minReceivedMetadataVersion); } private int updateMinSupportedVersion(final int supportedVersion, final int minSupportedMetadataVersion) { if (supportedVersion < minSupportedMetadataVersion) { log.debug("Downgrade the current minimum supported version {} to the smaller seen supported version {}", minSupportedMetadataVersion, supportedVersion); return supportedVersion; } else { log.debug("Current minimum supported version remains at {}, last seen supported version was {}", minSupportedMetadataVersion, supportedVersion); return minSupportedMetadataVersion; } } // following functions are for test only void setInternalTopicManager(final InternalTopicManager internalTopicManager) { this.internalTopicManager = internalTopicManager; } RebalanceProtocol rebalanceProtocol() { return rebalanceProtocol; } protected String userEndPoint() { return userEndPoint; } protected TaskManager taskManager() { return taskManager; } protected byte uniqueField() { return uniqueField; } protected Map clientTags() { return clientTags; } protected void handleRebalanceStart(final Set topics) { taskManager.handleRebalanceStart(topics); } long acceptableRecoveryLag() { return assignmentConfigs.acceptableRecoveryLag; } int maxWarmupReplicas() { return assignmentConfigs.maxWarmupReplicas; } int numStandbyReplicas() { return assignmentConfigs.numStandbyReplicas; } long probingRebalanceIntervalMs() { return assignmentConfigs.probingRebalanceIntervalMs; } }




