// All downloads are free. Search and download functionality uses the official Maven repository.

// org.apache.pulsar.functions.worker.MembershipManager (Maven / Gradle / Ivy)

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.pulsar.functions.worker;

import com.google.common.annotations.VisibleForTesting;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import lombok.extern.slf4j.Slf4j;

import org.apache.pulsar.client.admin.PulsarAdmin;
import org.apache.pulsar.client.admin.PulsarAdminException;
import org.apache.pulsar.client.api.PulsarClient;
import org.apache.pulsar.common.functions.WorkerInfo;
import org.apache.pulsar.common.policies.data.ConsumerStats;
import org.apache.pulsar.common.policies.data.TopicStats;
import org.apache.pulsar.functions.proto.Function;
import org.apache.pulsar.functions.utils.FunctionCommon;

import static org.apache.pulsar.functions.worker.SchedulerManager.checkHeartBeatFunction;

/**
 * A simple implementation of leader election using a pulsar topic.
 */
@Slf4j
public class MembershipManager implements AutoCloseable {

    private final WorkerConfig workerConfig;
    private PulsarAdmin pulsarAdmin;

    static final String COORDINATION_TOPIC_SUBSCRIPTION = "participants";

    private static String WORKER_IDENTIFIER = "id";

    // How long functions have remained assigned or scheduled on a failed node
    // FullyQualifiedFunctionName -> time in millis
    @VisibleForTesting
    Map unsignedFunctionDurations = new HashMap<>();

    MembershipManager(WorkerService workerService, PulsarClient pulsarClient, PulsarAdmin pulsarAdmin) {
        this.workerConfig = workerService.getWorkerConfig();
        this.pulsarAdmin = pulsarAdmin;
    }

    public List getCurrentMembership() {

        List workerIds = new LinkedList<>();
        TopicStats topicStats = null;
        try {
            topicStats = this.pulsarAdmin.topics().getStats(this.workerConfig.getClusterCoordinationTopic());
        } catch (PulsarAdminException e) {
            log.error("Failed to get status of coordinate topic {}",
                    this.workerConfig.getClusterCoordinationTopic(), e);
            throw new RuntimeException(e);
        }

        for (ConsumerStats consumerStats : topicStats.getSubscriptions()
                .get(COORDINATION_TOPIC_SUBSCRIPTION).getConsumers()) {
            WorkerInfo workerInfo = WorkerInfo.parseFrom(consumerStats.getMetadata().get(WORKER_IDENTIFIER));
            workerIds.add(workerInfo);
        }
        return workerIds;
    }

    public WorkerInfo getLeader() {
        TopicStats topicStats = null;
        try {
            topicStats = this.pulsarAdmin.topics().getStats(this.workerConfig.getClusterCoordinationTopic());
        } catch (PulsarAdminException e) {
            log.error("Failed to get status of coordinate topic {}",
                    this.workerConfig.getClusterCoordinationTopic(), e);
            throw new RuntimeException(e);
        }

        String activeConsumerName = topicStats.getSubscriptions().get(COORDINATION_TOPIC_SUBSCRIPTION).getActiveConsumerName();
        WorkerInfo leader = null;
        for (ConsumerStats consumerStats : topicStats.getSubscriptions()
                .get(COORDINATION_TOPIC_SUBSCRIPTION).getConsumers()) {
            if (consumerStats.getConsumerName().equals(activeConsumerName)) {
                leader = WorkerInfo.parseFrom(consumerStats.getMetadata().get(WORKER_IDENTIFIER));
            }
        }
        if (leader == null) {
            log.warn("Failed to determine leader in functions cluster");
        }
        return leader;
    }

    @Override
    public void close() {

    }

    public void checkFailures(FunctionMetaDataManager functionMetaDataManager,
                              FunctionRuntimeManager functionRuntimeManager,
                              SchedulerManager schedulerManager) {

        Set currentMembership = this.getCurrentMembership().stream()
                .map(entry -> entry.getWorkerId()).collect(Collectors.toSet());
        List functionMetaDataList = functionMetaDataManager.getAllFunctionMetaData();
        Map functionMetaDataMap = new HashMap<>();
        for (Function.FunctionMetaData entry : functionMetaDataList) {
            functionMetaDataMap.put(FunctionCommon.getFullyQualifiedName(entry.getFunctionDetails()), entry);
        }
        Map> currentAssignments = functionRuntimeManager.getCurrentAssignments();
        Map assignmentMap = new HashMap<>();
        for (Map entry : currentAssignments.values()) {
            assignmentMap.putAll(entry);
        }
        long currentTimeMs = System.currentTimeMillis();

        // remove functions
        Iterator> it = unsignedFunctionDurations.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry entry = it.next();
            String fullyQualifiedFunctionName = FunctionCommon.getFullyQualifiedName(
                    entry.getKey().getFunctionMetaData().getFunctionDetails());
            String fullyQualifiedInstanceId = FunctionCommon.getFullyQualifiedInstanceId(entry.getKey());
            //remove functions that don't exist anymore
            if (!functionMetaDataMap.containsKey(fullyQualifiedFunctionName)) {
                it.remove();
            } else {
                //remove functions that have been scheduled
                Function.Assignment assignment = assignmentMap.get(fullyQualifiedInstanceId);
                if (assignment != null) {
                    String assignedWorkerId = assignment.getWorkerId();
                    // check if assigned to worker that has failed
                    if (currentMembership.contains(assignedWorkerId)) {
                        it.remove();
                    }
                }
            }
        }

        // check for function instances that haven't been assigned
        for (Function.FunctionMetaData functionMetaData : functionMetaDataList) {
            Collection assignments
                    = FunctionRuntimeManager.findFunctionAssignments(functionMetaData.getFunctionDetails().getTenant(),
                    functionMetaData.getFunctionDetails().getNamespace(),
                    functionMetaData.getFunctionDetails().getName(),
                    currentAssignments);

            Set assignedInstances = assignments.stream()
                    .map(assignment -> assignment.getInstance())
                    .collect(Collectors.toSet());

            Set instances = new HashSet<>(SchedulerManager.computeInstances(functionMetaData, functionRuntimeManager.getRuntimeFactory().externallyManaged()));

            for (Function.Instance instance : instances) {
                if (!assignedInstances.contains(instance)) {
                    if (!this.unsignedFunctionDurations.containsKey(instance)) {
                        this.unsignedFunctionDurations.put(instance, currentTimeMs);
                    }
                }
            }
        }

        // check failed nodes
        for (Map.Entry> entry : currentAssignments.entrySet()) {
            String workerId = entry.getKey();
            Map assignmentEntries = entry.getValue();
            if (!currentMembership.contains(workerId)) {
                for (Function.Assignment assignmentEntry : assignmentEntries.values()) {
                    Function.Instance instance = assignmentEntry.getInstance();
                    // avoid scheduling-trigger for heartbeat-function if owner-worker is not up
                    if (checkHeartBeatFunction(instance) != null) {
                        continue;
                    }
                    if (!this.unsignedFunctionDurations.containsKey(instance)) {
                        this.unsignedFunctionDurations.put(instance, currentTimeMs);
                    }
                }
            }
        }

        boolean triggerScheduler = false;
        // check unassigned
        Collection needSchedule = new LinkedList<>();
        Collection needRemove = new LinkedList<>();
        Map numRemoved = new HashMap<>();
        for (Map.Entry entry : this.unsignedFunctionDurations.entrySet()) {
            Function.Instance instance = entry.getKey();
            long unassignedDurationMs = entry.getValue();
            if (currentTimeMs - unassignedDurationMs > this.workerConfig.getRescheduleTimeoutMs()) {
                needSchedule.add(instance);
                // remove assignment from failed node
                Function.Assignment assignment = assignmentMap.get(FunctionCommon.getFullyQualifiedInstanceId(instance));
                if (assignment != null) {
                    needRemove.add(assignment);

                    Integer count = numRemoved.get(assignment.getWorkerId());
                    if (count == null) {
                        count = 0;
                    }
                    numRemoved.put(assignment.getWorkerId(),  count + 1);
                }
                triggerScheduler = true;
            }
        }
        if (!needRemove.isEmpty()) {
            functionRuntimeManager.removeAssignments(needRemove);
        }
        if (triggerScheduler) {
            log.info("Failure check - Total number of instances that need to be scheduled/rescheduled: {} | Number of unassigned instances that need to be scheduled: {} | Number of instances on dead workers that need to be reassigned {}",
                    needSchedule.size(), needSchedule.size() - needRemove.size(), numRemoved);
            schedulerManager.schedule();
        }
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy