// All downloads are free. Search and download functionality uses the official Maven repository.

// org.apache.flink.kubernetes.KubernetesResourceManagerDriver Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.kubernetes;

import org.apache.flink.configuration.BlobServerOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.kubernetes.configuration.KubernetesConfigOptions;
import org.apache.flink.kubernetes.configuration.KubernetesResourceManagerDriverConfiguration;
import org.apache.flink.kubernetes.kubeclient.FlinkKubeClient;
import org.apache.flink.kubernetes.kubeclient.FlinkPod;
import org.apache.flink.kubernetes.kubeclient.decorators.ExternalServiceDecorator;
import org.apache.flink.kubernetes.kubeclient.decorators.InternalServiceDecorator;
import org.apache.flink.kubernetes.kubeclient.factory.KubernetesTaskManagerFactory;
import org.apache.flink.kubernetes.kubeclient.parameters.KubernetesTaskManagerParameters;
import org.apache.flink.kubernetes.kubeclient.resources.KubernetesPod;
import org.apache.flink.kubernetes.kubeclient.resources.KubernetesTooOldResourceVersionException;
import org.apache.flink.kubernetes.kubeclient.resources.KubernetesWatch;
import org.apache.flink.kubernetes.utils.Constants;
import org.apache.flink.kubernetes.utils.KubernetesUtils;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.BootstrapTools;
import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
import org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.externalresource.ExternalResourceUtils;
import org.apache.flink.runtime.jobmanager.HighAvailabilityMode;
import org.apache.flink.runtime.resourcemanager.active.AbstractResourceManagerDriver;
import org.apache.flink.runtime.resourcemanager.active.ResourceManagerDriver;
import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
import org.apache.flink.runtime.util.ResourceManagerUtils;
import org.apache.flink.runtime.util.config.memory.ProcessMemoryUtils;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.concurrent.FutureUtils;

import io.fabric8.kubernetes.client.KubernetesClientException;

import javax.annotation.Nullable;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;

/** Implementation of {@link ResourceManagerDriver} for Kubernetes deployment. */
public class KubernetesResourceManagerDriver
        extends AbstractResourceManagerDriver {

    /** The taskmanager pod name pattern is {clusterId}-{taskmanager}-{attemptId}-{podIndex}. */
    private static final String TASK_MANAGER_POD_FORMAT = "%s-taskmanager-%d-%d";

    private final String clusterId;

    private final String webInterfaceUrl;

    private final FlinkKubeClient flinkKubeClient;

    /** Request resource futures, keyed by pod names. */
    private final Map> requestResourceFutures;

    /** When ResourceManager failover, the max attempt should recover. */
    private long currentMaxAttemptId = 0;

    /** Current max pod index. When creating a new pod, it should increase one. */
    private long currentMaxPodId = 0;

    private Optional podsWatchOpt;

    private volatile boolean running;

    private FlinkPod taskManagerPodTemplate;

    public KubernetesResourceManagerDriver(
            Configuration flinkConfig,
            FlinkKubeClient flinkKubeClient,
            KubernetesResourceManagerDriverConfiguration configuration) {
        super(flinkConfig, GlobalConfiguration.loadConfiguration());
        this.clusterId = Preconditions.checkNotNull(configuration.getClusterId());
        this.webInterfaceUrl = configuration.getWebInterfaceUrl();
        this.flinkKubeClient = Preconditions.checkNotNull(flinkKubeClient);
        this.requestResourceFutures = new HashMap<>();
        this.running = false;
    }

    // ------------------------------------------------------------------------
    //  ResourceManagerDriver
    // ------------------------------------------------------------------------

    @Override
    protected void initializeInternal() throws Exception {
        podsWatchOpt = watchTaskManagerPods();
        final File podTemplateFile = KubernetesUtils.getTaskManagerPodTemplateFileInPod();
        if (podTemplateFile.exists()) {
            taskManagerPodTemplate =
                    KubernetesUtils.loadPodFromTemplateFile(
                            flinkKubeClient, podTemplateFile, Constants.MAIN_CONTAINER_NAME);
        } else {
            taskManagerPodTemplate = new FlinkPod.Builder().build();
        }
        updateKubernetesServiceTargetPortIfNecessary();
        recoverWorkerNodesFromPreviousAttempts();
        this.running = true;
    }

    @Override
    public void terminate() throws Exception {
        if (!running) {
            return;
        }
        running = false;

        // shut down all components
        Exception exception = null;

        try {
            podsWatchOpt.ifPresent(KubernetesWatch::close);
        } catch (Exception e) {
            exception = e;
        }

        try {
            flinkKubeClient.close();
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }

        if (exception != null) {
            throw exception;
        }
    }

    @Override
    public void deregisterApplication(
            ApplicationStatus finalStatus, @Nullable String optionalDiagnostics) {
        log.info(
                "Deregistering Flink Kubernetes cluster, clusterId: {}, diagnostics: {}",
                clusterId,
                optionalDiagnostics == null ? "" : optionalDiagnostics);
        flinkKubeClient.stopAndCleanupCluster(clusterId);
    }

    @Override
    public CompletableFuture requestResource(
            TaskExecutorProcessSpec taskExecutorProcessSpec) {
        final KubernetesTaskManagerParameters parameters =
                createKubernetesTaskManagerParameters(
                        taskExecutorProcessSpec, getBlockedNodeRetriever().getAllBlockedNodeIds());
        final KubernetesPod taskManagerPod =
                KubernetesTaskManagerFactory.buildTaskManagerKubernetesPod(
                        taskManagerPodTemplate, parameters);
        final String podName = taskManagerPod.getName();
        final CompletableFuture requestResourceFuture =
                new CompletableFuture<>();

        requestResourceFutures.put(podName, requestResourceFuture);

        log.info(
                "Creating new TaskManager pod with name {} and resource <{},{}>.",
                podName,
                parameters.getTaskManagerMemoryMB(),
                parameters.getTaskManagerCPU());

        final CompletableFuture createPodFuture =
                flinkKubeClient.createTaskManagerPod(taskManagerPod);

        FutureUtils.assertNoException(
                createPodFuture.handleAsync(
                        (ignore, exception) -> {
                            if (exception != null) {
                                log.warn(
                                        "Could not create pod {}, exception: {}",
                                        podName,
                                        exception);
                                CompletableFuture future =
                                        requestResourceFutures.remove(taskManagerPod.getName());
                                if (future != null) {
                                    future.completeExceptionally(exception);
                                }
                            } else {
                                if (requestResourceFuture.isCancelled()) {
                                    stopPod(podName);
                                    log.info(
                                            "pod {} is cancelled before create pod finish, stop it.",
                                            podName);
                                } else {
                                    log.info("Pod {} is created.", podName);
                                }
                            }
                            return null;
                        },
                        getMainThreadExecutor()));

        FutureUtils.assertNoException(
                requestResourceFuture.handle(
                        (ignore, t) -> {
                            if (t == null) {
                                return null;
                            }
                            // Unwrap CompletionException cause if any
                            if (t instanceof CompletionException && t.getCause() != null) {
                                t = t.getCause();
                            }
                            if (t instanceof CancellationException) {

                                requestResourceFutures.remove(taskManagerPod.getName());
                                if (createPodFuture.isDone()) {
                                    log.info(
                                            "pod {} is cancelled before scheduled, stop it.",
                                            podName);
                                    stopPod(taskManagerPod.getName());
                                }
                            } else if (t instanceof RetryableException
                                    || t instanceof KubernetesClientException) {
                                // ignore transient / retriable errors
                            } else {
                                log.error("Error completing resource request.", t);
                                ExceptionUtils.rethrow(t);
                            }
                            return null;
                        }));

        return requestResourceFuture;
    }

    @Override
    public void releaseResource(KubernetesWorkerNode worker) {
        final String podName = worker.getResourceID().toString();

        log.info("Stopping TaskManager pod {}.", podName);

        stopPod(podName);
    }

    // ------------------------------------------------------------------------
    //  Internal
    // ------------------------------------------------------------------------

    private void recoverWorkerNodesFromPreviousAttempts() throws ResourceManagerException {
        List podList =
                flinkKubeClient.getPodsWithLabels(
                        KubernetesUtils.getTaskManagerSelectors(clusterId));
        final List recoveredWorkers = new ArrayList<>();

        for (KubernetesPod pod : podList) {
            final KubernetesWorkerNode worker =
                    new KubernetesWorkerNode(new ResourceID(pod.getName()));
            final long attempt = worker.getAttempt();
            if (attempt > currentMaxAttemptId) {
                currentMaxAttemptId = attempt;
            }

            if (pod.isTerminated() || !pod.isScheduled()) {
                stopPod(pod.getName());
            } else {
                recoveredWorkers.add(worker);
            }
        }

        log.info(
                "Recovered {} pods from previous attempts, current attempt id is {}.",
                recoveredWorkers.size(),
                ++currentMaxAttemptId);

        getResourceEventHandler().onPreviousAttemptWorkersRecovered(recoveredWorkers);
    }

    private void updateKubernetesServiceTargetPortIfNecessary() throws Exception {
        if (!KubernetesUtils.isHostNetwork(flinkConfig)) {
            return;
        }
        final int restPort =
                ResourceManagerUtils.parseRestBindPortFromWebInterfaceUrl(webInterfaceUrl);
        Preconditions.checkArgument(
                restPort > 0, "Failed to parse rest port from " + webInterfaceUrl);
        final String restServiceName = ExternalServiceDecorator.getExternalServiceName(clusterId);
        flinkKubeClient
                .updateServiceTargetPort(restServiceName, Constants.REST_PORT_NAME, restPort)
                .get();
        if (!HighAvailabilityMode.isHighAvailabilityModeActivated(flinkConfig)) {
            final String internalServiceName =
                    InternalServiceDecorator.getInternalServiceName(clusterId);
            flinkKubeClient
                    .updateServiceTargetPort(
                            internalServiceName,
                            Constants.BLOB_SERVER_PORT_NAME,
                            Integer.parseInt(flinkConfig.getString(BlobServerOptions.PORT)))
                    .get();
            flinkKubeClient
                    .updateServiceTargetPort(
                            internalServiceName,
                            Constants.JOB_MANAGER_RPC_PORT_NAME,
                            flinkConfig.getInteger(JobManagerOptions.PORT))
                    .get();
        }
    }

    private KubernetesTaskManagerParameters createKubernetesTaskManagerParameters(
            TaskExecutorProcessSpec taskExecutorProcessSpec, Set blockedNodes) {
        final String podName =
                String.format(
                        TASK_MANAGER_POD_FORMAT, clusterId, currentMaxAttemptId, ++currentMaxPodId);

        final ContaineredTaskManagerParameters taskManagerParameters =
                ContaineredTaskManagerParameters.create(flinkConfig, taskExecutorProcessSpec);

        final Configuration taskManagerConfig = new Configuration(flinkConfig);
        taskManagerConfig.set(TaskManagerOptions.TASK_MANAGER_RESOURCE_ID, podName);

        final String dynamicProperties =
                BootstrapTools.getDynamicPropertiesAsString(flinkClientConfig, taskManagerConfig);
        final String jvmMemOpts =
                ProcessMemoryUtils.generateJvmParametersStr(taskExecutorProcessSpec);
        return new KubernetesTaskManagerParameters(
                flinkConfig,
                podName,
                dynamicProperties,
                jvmMemOpts,
                taskManagerParameters,
                ExternalResourceUtils.getExternalResourceConfigurationKeys(
                        flinkConfig,
                        KubernetesConfigOptions.EXTERNAL_RESOURCE_KUBERNETES_CONFIG_KEY_SUFFIX),
                blockedNodes);
    }

    private void handlePodEventsInMainThread(List pods, PodEvent podEvent) {
        getMainThreadExecutor()
                .execute(
                        () -> {
                            for (KubernetesPod pod : pods) {
                                // we should also handle the deleted event to avoid situations where
                                // the pod itself doesn't reflect the status correctly (i.e. pod
                                // removed during the pending phase).
                                if (podEvent == PodEvent.DELETED || pod.isTerminated()) {
                                    onPodTerminated(pod);
                                } else if (pod.isScheduled()) {
                                    onPodScheduled(pod);
                                }
                            }
                        });
    }

    private void onPodScheduled(KubernetesPod pod) {
        final String podName = pod.getName();
        final CompletableFuture requestResourceFuture =
                requestResourceFutures.remove(podName);

        if (requestResourceFuture == null) {
            log.debug("Ignore TaskManager pod that is already added: {}", podName);
            return;
        }

        log.info("Received new TaskManager pod: {}", podName);
        requestResourceFuture.complete(new KubernetesWorkerNode(new ResourceID(podName)));
    }

    private void onPodTerminated(KubernetesPod pod) {
        final String podName = pod.getName();
        log.debug("TaskManager pod {} is terminated.", podName);

        // this is a safe net, in case onModified/onDeleted/onError is
        // received before onAdded
        final CompletableFuture requestResourceFuture =
                requestResourceFutures.remove(podName);
        if (requestResourceFuture != null) {
            log.warn("Pod {} is terminated before being scheduled.", podName);
            requestResourceFuture.completeExceptionally(
                    new RetryableException("Pod is terminated."));
        }

        getResourceEventHandler()
                .onWorkerTerminated(new ResourceID(podName), pod.getTerminatedDiagnostics());
        stopPod(podName);
    }

    private void stopPod(String podName) {
        flinkKubeClient
                .stopPod(podName)
                .whenComplete(
                        (ignore, throwable) -> {
                            if (throwable != null) {
                                log.warn(
                                        "Could not remove TaskManager pod {}, exception: {}",
                                        podName,
                                        throwable);
                            }
                        });
    }

    private Optional watchTaskManagerPods() throws Exception {
        return Optional.of(
                flinkKubeClient.watchPodsAndDoCallback(
                        KubernetesUtils.getTaskManagerSelectors(clusterId),
                        new PodCallbackHandlerImpl()));
    }

    // ------------------------------------------------------------------------
    //  FlinkKubeClient.WatchCallbackHandler
    // ------------------------------------------------------------------------

    private class PodCallbackHandlerImpl
            implements FlinkKubeClient.WatchCallbackHandler {
        @Override
        public void onAdded(List pods) {
            handlePodEventsInMainThread(pods, PodEvent.ADDED);
        }

        @Override
        public void onModified(List pods) {
            handlePodEventsInMainThread(pods, PodEvent.MODIFIED);
        }

        @Override
        public void onDeleted(List pods) {
            handlePodEventsInMainThread(pods, PodEvent.DELETED);
        }

        @Override
        public void onError(List pods) {
            handlePodEventsInMainThread(pods, PodEvent.ERROR);
        }

        @Override
        public void handleError(Throwable throwable) {
            if (throwable instanceof KubernetesTooOldResourceVersionException) {
                getMainThreadExecutor()
                        .execute(
                                () -> {
                                    if (running) {
                                        podsWatchOpt.ifPresent(KubernetesWatch::close);
                                        log.info("Creating a new watch on TaskManager pods.");
                                        try {
                                            podsWatchOpt = watchTaskManagerPods();
                                        } catch (Exception e) {
                                            getResourceEventHandler().onError(e);
                                        }
                                    }
                                });
            } else {
                getResourceEventHandler().onError(throwable);
            }
        }
    }

    private static class RetryableException extends FlinkException {
        private static final long serialVersionUID = 1L;

        RetryableException(String message) {
            super(message);
        }
    }

    /** Internal type of the pod event. */
    private enum PodEvent {
        ADDED,
        MODIFIED,
        DELETED,
        ERROR
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy