All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.kubernetes.KubernetesResourceManager Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.kubernetes;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.kubernetes.configuration.KubernetesConfigOptions;
import org.apache.flink.kubernetes.configuration.KubernetesResourceManagerConfiguration;
import org.apache.flink.kubernetes.kubeclient.FlinkKubeClient;
import org.apache.flink.kubernetes.kubeclient.factory.KubernetesTaskManagerFactory;
import org.apache.flink.kubernetes.kubeclient.parameters.KubernetesTaskManagerParameters;
import org.apache.flink.kubernetes.kubeclient.resources.KubernetesPod;
import org.apache.flink.kubernetes.utils.KubernetesUtils;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.BootstrapTools;
import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
import org.apache.flink.runtime.clusterframework.TaskExecutorProcessUtils;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.entrypoint.ClusterInformation;
import org.apache.flink.runtime.heartbeat.HeartbeatServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.metrics.groups.ResourceManagerMetricGroup;
import org.apache.flink.runtime.resourcemanager.ActiveResourceManager;
import org.apache.flink.runtime.resourcemanager.JobLeaderIdService;
import org.apache.flink.runtime.resourcemanager.ResourceManager;
import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
import org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.RpcService;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;

/**
 * Kubernetes specific implementation of the {@link ResourceManager}.
 */
public class KubernetesResourceManager extends ActiveResourceManager
	implements FlinkKubeClient.PodCallbackHandler {

	private static final Logger LOG = LoggerFactory.getLogger(KubernetesResourceManager.class);

	/** The taskmanager pod name pattern is {clusterId}-{taskmanager}-{attemptId}-{podIndex}. */
	private static final String TASK_MANAGER_POD_FORMAT = "%s-taskmanager-%d-%d";

	private final Map workerNodes = new HashMap<>();

	private final double defaultCpus;

	/** When ResourceManager failover, the max attempt should recover. */
	private long currentMaxAttemptId = 0;

	/** Current max pod index. When creating a new pod, it should increase one. */
	private long currentMaxPodId = 0;

	private final String clusterId;

	private final FlinkKubeClient kubeClient;

	private final ContaineredTaskManagerParameters taskManagerParameters;

	/** The number of pods requested, but not yet granted. */
	private int numPendingPodRequests = 0;

	private final KubernetesResourceManagerConfiguration configuration;

	public KubernetesResourceManager(
			RpcService rpcService,
			String resourceManagerEndpointId,
			ResourceID resourceId,
			Configuration flinkConfig,
			HighAvailabilityServices highAvailabilityServices,
			HeartbeatServices heartbeatServices,
			SlotManager slotManager,
			JobLeaderIdService jobLeaderIdService,
			ClusterInformation clusterInformation,
			FatalErrorHandler fatalErrorHandler,
			ResourceManagerMetricGroup resourceManagerMetricGroup,
			FlinkKubeClient kubeClient,
			KubernetesResourceManagerConfiguration configuration) {
		super(
			flinkConfig,
			System.getenv(),
			rpcService,
			resourceManagerEndpointId,
			resourceId,
			highAvailabilityServices,
			heartbeatServices,
			slotManager,
			jobLeaderIdService,
			clusterInformation,
			fatalErrorHandler,
			resourceManagerMetricGroup);
		this.clusterId = flinkConfig.getString(KubernetesConfigOptions.CLUSTER_ID);
		this.defaultCpus = taskExecutorProcessSpec.getCpuCores().getValue().doubleValue();

		this.kubeClient = kubeClient;

		this.taskManagerParameters =
			ContaineredTaskManagerParameters.create(flinkConfig, taskExecutorProcessSpec, numSlotsPerTaskManager);
		this.configuration = configuration;
	}

	@Override
	protected Configuration loadClientConfiguration() {
		return GlobalConfiguration.loadConfiguration();
	}

	@Override
	protected void initialize() throws ResourceManagerException {
		recoverWorkerNodesFromPreviousAttempts();

		kubeClient.watchPodsAndDoCallback(KubernetesUtils.getTaskManagerLabels(clusterId, flinkConfig), this);
	}

	@Override
	public CompletableFuture onStop() {
		// shut down all components
		Throwable exception = null;

		try {
			kubeClient.close();
		} catch (Throwable t) {
			exception = t;
		}

		return getStopTerminationFutureOrCompletedExceptionally(exception);
	}

	@Override
	protected void internalDeregisterApplication(ApplicationStatus finalStatus, @Nullable String diagnostics) {
		LOG.info(
			"Stopping kubernetes cluster, clusterId: {}, diagnostics: {}",
			clusterId,
			diagnostics == null ? "" : diagnostics);
		kubeClient.stopAndCleanupCluster(clusterId);
	}

	@Override
	public Collection startNewWorker(ResourceProfile resourceProfile) {
		LOG.info("Starting new worker with resource profile, {}", resourceProfile);
		if (!resourceProfilesPerWorker.iterator().next().isMatching(resourceProfile)) {
			return Collections.emptyList();
		}
		requestKubernetesPod();
		return resourceProfilesPerWorker;
	}

	@Override
	protected KubernetesWorkerNode workerStarted(ResourceID resourceID) {
		return workerNodes.get(resourceID);
	}

	@Override
	public boolean stopWorker(final KubernetesWorkerNode worker) {
		LOG.info("Stopping Worker {}.", worker.getResourceID());
		workerNodes.remove(worker.getResourceID());
		try {
			kubeClient.stopPod(worker.getResourceID().toString());
		} catch (Exception e) {
			kubeClient.handleException(e);
			return false;
		}
		return true;
	}

	@Override
	public void onAdded(List pods) {
		runAsync(() -> {
			for (KubernetesPod pod : pods) {
				if (numPendingPodRequests > 0) {
					numPendingPodRequests--;
					final KubernetesWorkerNode worker = new KubernetesWorkerNode(new ResourceID(pod.getName()));
					workerNodes.putIfAbsent(worker.getResourceID(), worker);
				}

				log.info("Received new TaskManager pod: {} - Remaining pending pod requests: {}",
					pod.getName(), numPendingPodRequests);
			}
		});
	}

	@Override
	public void onModified(List pods) {
		runAsync(() -> pods.forEach(this::removePodIfTerminated));
	}

	@Override
	public void onDeleted(List pods) {
		runAsync(() -> pods.forEach(this::removePodIfTerminated));
	}

	@Override
	public void onError(List pods) {
		runAsync(() -> pods.forEach(this::removePodIfTerminated));
	}

	@VisibleForTesting
	Map getWorkerNodes() {
		return workerNodes;
	}

	private void recoverWorkerNodesFromPreviousAttempts() throws ResourceManagerException {
		final List podList = kubeClient.getPodsWithLabels(KubernetesUtils.getTaskManagerLabels(clusterId, flinkConfig));
		for (KubernetesPod pod : podList) {
			final KubernetesWorkerNode worker = new KubernetesWorkerNode(new ResourceID(pod.getName()));
			workerNodes.put(worker.getResourceID(), worker);
			final long attempt = worker.getAttempt();
			if (attempt > currentMaxAttemptId) {
				currentMaxAttemptId = attempt;
			}
		}

		log.info("Recovered {} pods from previous attempts, current attempt id is {}.",
			workerNodes.size(),
			++currentMaxAttemptId);
	}

	private void requestKubernetesPod() {
		numPendingPodRequests++;

		log.info("Requesting new TaskManager pod with <{},{}>. Number pending requests {}.",
			defaultMemoryMB,
			defaultCpus,
			numPendingPodRequests);

		final String podName = String.format(
			TASK_MANAGER_POD_FORMAT,
			clusterId,
			currentMaxAttemptId,
			++currentMaxPodId);

		final String dynamicProperties =
			BootstrapTools.getDynamicPropertiesAsString(flinkClientConfig, flinkConfig);

		final KubernetesTaskManagerParameters kubernetesTaskManagerParameters = new KubernetesTaskManagerParameters(
			flinkConfig,
			podName,
			defaultMemoryMB,
			dynamicProperties,
			taskManagerParameters);

		final KubernetesPod taskManagerPod =
			KubernetesTaskManagerFactory.createTaskManagerComponent(kubernetesTaskManagerParameters);

		log.info("TaskManager {} will be started with {}.", podName, taskExecutorProcessSpec);
		kubeClient.createTaskManagerPod(taskManagerPod)
			.whenComplete(
				(ignore, throwable) -> {
					if (throwable != null) {
						log.error("Could not start TaskManager in pod {}.", podName, throwable);
						scheduleRunAsync(
							this::decreasePendingAndRequestKubernetesPodIfRequired,
							configuration.getPodCreationRetryInterval());
					}
				}
			);
	}

	private void decreasePendingAndRequestKubernetesPodIfRequired() {
		validateRunsInMainThread();
		numPendingPodRequests--;
		requestKubernetesPodIfRequired();
	}

	/**
	 * Request new pod if pending pods cannot satisfy pending slot requests.
	 */
	private void requestKubernetesPodIfRequired() {
		final int requiredTaskManagerSlots = getNumberRequiredTaskManagerSlots();
		final int pendingTaskManagerSlots = numPendingPodRequests * numSlotsPerTaskManager;

		if (requiredTaskManagerSlots > pendingTaskManagerSlots) {
			requestKubernetesPod();
		}
	}

	private void removePodIfTerminated(KubernetesPod pod) {
		if (pod.isTerminated()) {
			kubeClient.stopPod(pod.getName());
			final KubernetesWorkerNode kubernetesWorkerNode = workerNodes.remove(new ResourceID(pod.getName()));
			if (kubernetesWorkerNode != null) {
				requestKubernetesPodIfRequired();
			}
		}
	}

	@Override
	protected double getCpuCores(Configuration configuration) {
		return TaskExecutorProcessUtils.getCpuCoresWithFallbackConfigOption(configuration, KubernetesConfigOptions.TASK_MANAGER_CPU);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy