org.apache.flink.runtime.minicluster.MiniCluster Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of flink-runtime Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.minicluster;

import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.api.common.io.FileOutputFormat;
import org.apache.flink.configuration.ClusterOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ConfigurationUtils;
import org.apache.flink.configuration.HighAvailabilityOptions;
import org.apache.flink.configuration.IllegalConfigurationException;
import org.apache.flink.configuration.StateRecoveryOptions;
import org.apache.flink.core.execution.CheckpointType;
import org.apache.flink.core.execution.RecoveryClaimMode;
import org.apache.flink.core.execution.SavepointFormatType;
import org.apache.flink.runtime.blob.BlobCacheService;
import org.apache.flink.runtime.blob.BlobClient;
import org.apache.flink.runtime.blob.BlobServer;
import org.apache.flink.runtime.blob.BlobUtils;
import org.apache.flink.runtime.client.ClientUtils;
import org.apache.flink.runtime.client.JobExecutionException;
import org.apache.flink.runtime.client.JobStatusMessage;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.dispatcher.DispatcherGateway;
import org.apache.flink.runtime.dispatcher.DispatcherId;
import org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore;
import org.apache.flink.runtime.dispatcher.TriggerSavepointMode;
import org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils;
import org.apache.flink.runtime.entrypoint.ClusterInformation;
import org.apache.flink.runtime.entrypoint.WorkingDirectory;
import org.apache.flink.runtime.entrypoint.component.DefaultDispatcherResourceManagerComponentFactory;
import org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent;
import org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory;
import org.apache.flink.runtime.executiongraph.AccessExecutionGraph;
import org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph;
import org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider;
import org.apache.flink.runtime.heartbeat.HeartbeatServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesFactory;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils;
import org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServices;
import org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServicesWithLeadershipControl;
import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl;
import org.apache.flink.runtime.io.network.partition.ClusterPartitionManager;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings;
import org.apache.flink.runtime.jobmanager.HighAvailabilityMode;
import org.apache.flink.runtime.jobmaster.JobResult;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.webmonitor.ClusterOverview;
import org.apache.flink.runtime.metrics.MetricRegistry;
import org.apache.flink.runtime.metrics.MetricRegistryConfiguration;
import org.apache.flink.runtime.metrics.MetricRegistryImpl;
import org.apache.flink.runtime.metrics.ReporterSetup;
import org.apache.flink.runtime.metrics.TraceReporterSetup;
import org.apache.flink.runtime.metrics.groups.ProcessMetricGroup;
import org.apache.flink.runtime.metrics.util.MetricUtils;
import org.apache.flink.runtime.operators.coordination.CoordinationRequest;
import org.apache.flink.runtime.operators.coordination.CoordinationResponse;
import org.apache.flink.runtime.resourcemanager.ResourceManagerGateway;
import org.apache.flink.runtime.resourcemanager.ResourceManagerId;
import org.apache.flink.runtime.resourcemanager.ResourceOverview;
import org.apache.flink.runtime.resourcemanager.StandaloneResourceManagerFactory;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.RpcService;
import org.apache.flink.runtime.rpc.RpcSystem;
import org.apache.flink.runtime.rpc.RpcUtils;
import org.apache.flink.runtime.scheduler.ExecutionGraphInfo;
import org.apache.flink.runtime.security.token.DefaultDelegationTokenManagerFactory;
import org.apache.flink.runtime.security.token.DelegationTokenManager;
import org.apache.flink.runtime.security.token.DelegationTokenReceiverRepository;
import org.apache.flink.runtime.taskexecutor.TaskExecutor;
import org.apache.flink.runtime.taskexecutor.TaskManagerRunner;
import org.apache.flink.runtime.webmonitor.retriever.LeaderRetriever;
import org.apache.flink.runtime.webmonitor.retriever.MetricQueryServiceRetriever;
import org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever;
import org.apache.flink.runtime.webmonitor.retriever.impl.RpcMetricQueryServiceRetriever;
import org.apache.flink.util.AbstractID;
import org.apache.flink.util.AutoCloseableAsync;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.ExecutorUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.InstantiationUtil;
import org.apache.flink.util.Reference;
import org.apache.flink.util.SerializedValue;
import org.apache.flink.util.concurrent.ExecutorThreadFactory;
import org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy;
import org.apache.flink.util.concurrent.FutureUtils;
import org.apache.flink.util.function.BiFunctionWithException;
import org.apache.flink.util.function.FunctionUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;

import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URI;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;

import static org.apache.flink.configuration.ClusterOptions.PROCESS_WORKING_DIR_BASE;
import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;

/** MiniCluster to execute Flink jobs locally. */
public class MiniCluster implements AutoCloseableAsync {

    private static final Logger LOG = LoggerFactory.getLogger(MiniCluster.class);

    /** The lock to guard startup / shutdown / manipulation methods. */
    private final Object lock = new Object();

    /** The configuration for this mini cluster. */
    private final MiniClusterConfiguration miniClusterConfiguration;

    private final Duration rpcTimeout;

    @GuardedBy("lock")
    private final List taskManagers;

    private final TerminatingFatalErrorHandlerFactory
            taskManagerTerminatingFatalErrorHandlerFactory =
                    new TerminatingFatalErrorHandlerFactory();
    private final Supplier> rpcSystemSupplier;

    private CompletableFuture terminationFuture;

    @GuardedBy("lock")
    private MetricRegistryImpl metricRegistry;

    @GuardedBy("lock")
    private ProcessMetricGroup processMetricGroup;

    @GuardedBy("lock")
    private RpcService commonRpcService;

    @GuardedBy("lock")
    private ExecutorService ioExecutor;

    @GuardedBy("lock")
    private final Collection rpcServices;

    @GuardedBy("lock")
    private HighAvailabilityServicesFactory haServicesFactory;

    @GuardedBy("lock")
    private HighAvailabilityServices haServices;

    @GuardedBy("lock")
    private BlobServer blobServer;

    @GuardedBy("lock")
    private HeartbeatServices heartbeatServices;

    @GuardedBy("lock")
    private DelegationTokenManager delegationTokenManager;

    @GuardedBy("lock")
    private DelegationTokenReceiverRepository delegationTokenReceiverRepository;

    @GuardedBy("lock")
    private BlobCacheService blobCacheService;

    @GuardedBy("lock")
    private LeaderRetrievalService resourceManagerLeaderRetriever;

    @GuardedBy("lock")
    private LeaderRetrievalService dispatcherLeaderRetriever;

    @GuardedBy("lock")
    private LeaderRetrievalService clusterRestEndpointLeaderRetrievalService;

    @GuardedBy("lock")
    private Collection dispatcherResourceManagerComponents;

    @GuardedBy("lock")
    private RpcGatewayRetriever dispatcherGatewayRetriever;

    @GuardedBy("lock")
    private RpcGatewayRetriever
            resourceManagerGatewayRetriever;

    @GuardedBy("lock")
    private LeaderRetriever webMonitorLeaderRetriever;

    @GuardedBy("lock")
    private RpcServiceFactory taskManagerRpcServiceFactory;

    @GuardedBy("lock")
    private WorkingDirectory workingDirectory;

    /** Flag marking the mini cluster as started/running. */
    private volatile boolean running;

    @GuardedBy("lock")
    private Reference rpcSystem;

    // ------------------------------------------------------------------------

    /**
     * Creates a new Flink mini cluster based on the given configuration.
     *
     * @param miniClusterConfiguration The configuration for the mini cluster
     */
    public MiniCluster(MiniClusterConfiguration miniClusterConfiguration) {
        this(
                miniClusterConfiguration,
                () -> Reference.owned(RpcSystem.load(miniClusterConfiguration.getConfiguration())));
    }

    /**
     * Creates a new Flink mini cluster based on the given configuration, using the given supplier
     * to obtain the {@link RpcSystem}.
     *
     * <p>The constructor only initialises bookkeeping state; no services are started until {@link
     * #start()} is called.
     *
     * @param miniClusterConfiguration The configuration for the mini cluster; must not be null
     * @param rpcSystemSupplier Supplier that is queried by {@link #start()} for the {@link
     *     RpcSystem} reference to use; must not be null
     */
    public MiniCluster(
            MiniClusterConfiguration miniClusterConfiguration,
            Supplier<Reference<RpcSystem>> rpcSystemSupplier) {

        this.miniClusterConfiguration =
                checkNotNull(miniClusterConfiguration, "config may not be null");
        this.rpcServices =
                new ArrayList<>(
                        1
                                + 2
                                + miniClusterConfiguration
                                        .getNumTaskManagers()); // common + JM + RM + TMs
        this.dispatcherResourceManagerComponents = new ArrayList<>(1);

        // There shouldn't be any lost messages between the MiniCluster and the Flink components
        // since they all run in the same process.
        this.rpcTimeout = RpcUtils.INF_TIMEOUT;
        // a cluster that was never started counts as already terminated
        this.terminationFuture = CompletableFuture.completedFuture(null);
        running = false;

        this.taskManagers = new ArrayList<>(miniClusterConfiguration.getNumTaskManagers());

        // fail fast here instead of with an anonymous NullPointerException inside start()
        this.rpcSystemSupplier =
                checkNotNull(rpcSystemSupplier, "rpcSystemSupplier may not be null");
    }

    /**
     * Returns a future for the REST address of this mini cluster.
     *
     * <p>The address is taken from the leader future of the {@code webMonitorLeaderRetriever}; its
     * first tuple field is parsed as a {@link URI}. The parsing is wrapped with {@link
     * FunctionUtils#uncheckedFunction}, so a malformed address surfaces as a failure of the
     * returned future.
     *
     * @return future that completes with the REST address as a {@link URI}
     * @throws IllegalStateException if the mini cluster is not running
     */
    public CompletableFuture<URI> getRestAddress() {
        synchronized (lock) {
            checkState(running, "MiniCluster is not yet running or has already been shut down.");
            return webMonitorLeaderRetriever
                    .getLeaderFuture()
                    .thenApply(
                            FunctionUtils.uncheckedFunction(
                                    addressLeaderIdTuple -> new URI(addressLeaderIdTuple.f0)));
        }
    }

    /**
     * Returns the {@link ClusterInformation} of the running mini cluster: the host name {@code
     * "localhost"} together with the port of the blob server.
     *
     * @return cluster information pointing at the local blob server
     * @throws IllegalStateException if the mini cluster is not running
     */
    public ClusterInformation getClusterInformation() {
        synchronized (lock) {
            checkState(running, "MiniCluster is not yet running or has already been shut down.");

            final int blobServerPort = blobServer.getPort();
            return new ClusterInformation("localhost", blobServerPort);
        }
    }

    /**
     * Returns the IO executor of this mini cluster.
     *
     * <p>The backing field is {@code null} until {@link #start()} has created the executor. Note
     * that the field is annotated as guarded by {@code lock}, but it is read here without holding
     * that lock.
     *
     * @return the current IO executor, possibly {@code null}
     */
    protected Executor getIOExecutor() {
        return ioExecutor;
    }

    // ------------------------------------------------------------------------
    //  life cycle
    // ------------------------------------------------------------------------

    /**
     * Checks if the mini cluster was started and is running.
     *
     * <p>Reads the volatile {@code running} flag without taking the lock.
     *
     * @return {@code true} if the mini cluster is currently marked as running
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Starts the mini cluster, based on the configured properties.
     *
     * <p>While holding the lock, the method brings up, in this order: the working directory, the
     * RPC system, the metric registry, the RPC service(s), the process metric group, the IO
     * executor, the delegation token manager and receiver repository, the high-availability
     * services, the blob server, the heartbeat services, the blob cache, the task managers, the
     * dispatcher/resource-manager components and finally the leader retrievers. Only after all of
     * that succeeded is a new termination future created and the cluster marked as running.
     *
     * @throws IllegalStateException if the mini cluster is already running
     * @throws Exception This method passes on any exception that occurs during the startup of the
     *     mini cluster.
     */
    public void start() throws Exception {
        synchronized (lock) {
            checkState(!running, "MiniCluster is already running");

            LOG.info("Starting Flink Mini Cluster");
            LOG.debug("Using configuration {}", miniClusterConfiguration);

            final Configuration configuration = miniClusterConfiguration.getConfiguration();
            // SHARED: all components use the common RPC service; otherwise each component gets
            // its own dedicated RPC service
            final boolean useSingleRpcService =
                    miniClusterConfiguration.getRpcServiceSharing() == RpcServiceSharing.SHARED;

            try {
                // every start gets its own working directory, named after a freshly generated
                // resource id
                workingDirectory =
                        WorkingDirectory.create(
                                ClusterEntrypointUtils.generateWorkingDirectoryFile(
                                        configuration,
                                        Optional.of(PROCESS_WORKING_DIR_BASE),
                                        "minicluster_" + ResourceID.generate()));

                initializeIOFormatClasses(configuration);

                rpcSystem = rpcSystemSupplier.get();

                LOG.info("Starting Metrics Registry");
                metricRegistry =
                        createMetricRegistry(
                                configuration,
                                rpcSystem.deref().getMaximumMessageSizeInBytes(configuration));

                // bring up all the RPC services
                LOG.info("Starting RPC Service(s)");

                final RpcServiceFactory dispatcherResourceManagerComponentRpcServiceFactory;
                final RpcService metricQueryServiceRpcService;

                if (useSingleRpcService) {
                    // we always need the 'commonRpcService' for auxiliary calls
                    commonRpcService = createLocalRpcService(configuration, rpcSystem.deref());
                    // task managers and dispatcher/resource manager share the common service
                    final CommonRpcServiceFactory commonRpcServiceFactory =
                            new CommonRpcServiceFactory(commonRpcService);
                    taskManagerRpcServiceFactory = commonRpcServiceFactory;
                    dispatcherResourceManagerComponentRpcServiceFactory = commonRpcServiceFactory;
                    metricQueryServiceRpcService =
                            MetricUtils.startLocalMetricsRpcService(
                                    configuration, rpcSystem.deref());
                } else {

                    // start a new service per component, possibly with custom bind addresses
                    final String jobManagerExternalAddress =
                            miniClusterConfiguration.getJobManagerExternalAddress();
                    final String taskManagerExternalAddress =
                            miniClusterConfiguration.getTaskManagerExternalAddress();
                    final String jobManagerExternalPortRange =
                            miniClusterConfiguration.getJobManagerExternalPortRange();
                    final String taskManagerExternalPortRange =
                            miniClusterConfiguration.getTaskManagerExternalPortRange();
                    final String jobManagerBindAddress =
                            miniClusterConfiguration.getJobManagerBindAddress();
                    final String taskManagerBindAddress =
                            miniClusterConfiguration.getTaskManagerBindAddress();

                    dispatcherResourceManagerComponentRpcServiceFactory =
                            new DedicatedRpcServiceFactory(
                                    configuration,
                                    jobManagerExternalAddress,
                                    jobManagerExternalPortRange,
                                    jobManagerBindAddress,
                                    rpcSystem.deref());
                    taskManagerRpcServiceFactory =
                            new DedicatedRpcServiceFactory(
                                    configuration,
                                    taskManagerExternalAddress,
                                    taskManagerExternalPortRange,
                                    taskManagerBindAddress,
                                    rpcSystem.deref());

                    // we always need the 'commonRpcService' for auxiliary calls
                    // bind to the JobManager address with port 0
                    commonRpcService =
                            createRemoteRpcService(
                                    configuration, jobManagerBindAddress, 0, rpcSystem.deref());
                    metricQueryServiceRpcService =
                            MetricUtils.startRemoteMetricsRpcService(
                                    configuration,
                                    commonRpcService.getAddress(),
                                    null,
                                    rpcSystem.deref());
                }

                metricRegistry.startQueryService(metricQueryServiceRpcService, null);

                processMetricGroup =
                        MetricUtils.instantiateProcessMetricGroup(
                                metricRegistry,
                                RpcUtils.getHostname(commonRpcService),
                                ConfigurationUtils.getSystemResourceMetricsProbingInterval(
                                        configuration));

                ioExecutor =
                        Executors.newFixedThreadPool(
                                ClusterEntrypointUtils.getPoolSize(configuration),
                                new ExecutorThreadFactory("mini-cluster-io"));

                delegationTokenManager =
                        DefaultDelegationTokenManagerFactory.create(
                                configuration,
                                miniClusterConfiguration.getPluginManager(),
                                commonRpcService.getScheduledExecutor(),
                                ioExecutor);
                // Obtaining delegation tokens and propagating them to the local JVM receivers in a
                // one-time fashion is required because BlobServer may connect to external file
                // systems
                delegationTokenManager.obtainDelegationTokens();

                delegationTokenReceiverRepository =
                        new DelegationTokenReceiverRepository(
                                configuration, miniClusterConfiguration.getPluginManager());

                // the factory must be in place before createHighAvailabilityServices() is
                // called, because that method reads the haServicesFactory field
                haServicesFactory = createHighAvailabilityServicesFactory(configuration);

                haServices = createHighAvailabilityServices(configuration, ioExecutor);

                blobServer =
                        BlobUtils.createBlobServer(
                                configuration,
                                Reference.borrowed(workingDirectory.getBlobStorageDirectory()),
                                haServices.createBlobStore());
                blobServer.start();

                heartbeatServices = HeartbeatServices.fromConfiguration(configuration);

                // the blob cache talks to the blob server started above via the local host
                // address and the server's port
                blobCacheService =
                        BlobUtils.createBlobCacheService(
                                configuration,
                                Reference.borrowed(workingDirectory.getBlobStorageDirectory()),
                                haServices.createBlobStore(),
                                new InetSocketAddress(
                                        InetAddress.getLocalHost(), blobServer.getPort()));

                startTaskManagers();

                MetricQueryServiceRetriever metricQueryServiceRetriever =
                        new RpcMetricQueryServiceRetriever(
                                metricRegistry.getMetricQueryServiceRpcService());

                setupDispatcherResourceManagerComponents(
                        configuration,
                        dispatcherResourceManagerComponentRpcServiceFactory,
                        metricQueryServiceRetriever);

                resourceManagerLeaderRetriever = haServices.getResourceManagerLeaderRetriever();
                dispatcherLeaderRetriever = haServices.getDispatcherLeaderRetriever();
                clusterRestEndpointLeaderRetrievalService =
                        haServices.getClusterRestEndpointLeaderRetriever();

                dispatcherGatewayRetriever =
                        new RpcGatewayRetriever<>(
                                commonRpcService,
                                DispatcherGateway.class,
                                DispatcherId::fromUuid,
                                new ExponentialBackoffRetryStrategy(
                                        21, Duration.ofMillis(5L), Duration.ofMillis(20L)));
                resourceManagerGatewayRetriever =
                        new RpcGatewayRetriever<>(
                                commonRpcService,
                                ResourceManagerGateway.class,
                                ResourceManagerId::fromUuid,
                                new ExponentialBackoffRetryStrategy(
                                        21, Duration.ofMillis(5L), Duration.ofMillis(20L)));
                webMonitorLeaderRetriever = new LeaderRetriever();

                // hook the gateway retrievers up as listeners of the leader retrieval services
                resourceManagerLeaderRetriever.start(resourceManagerGatewayRetriever);
                dispatcherLeaderRetriever.start(dispatcherGatewayRetriever);
                clusterRestEndpointLeaderRetrievalService.start(webMonitorLeaderRetriever);
            } catch (Exception e) {
                // cleanup everything
                // NOTE(review): 'running' is still false at this point (it is only set after this
                // try block) and closeInternal() skips its teardown unless 'running' is true. If
                // close() routes to closeAsync(), the services started above would not be shut
                // down here -- confirm.
                try {
                    close();
                } catch (Exception ee) {
                    e.addSuppressed(ee);
                }
                throw e;
            }

            // create a new termination future
            terminationFuture = new CompletableFuture<>();

            // now officially mark this as running
            running = true;

            LOG.info("Flink Mini Cluster started successfully");
        }
    }

    /**
     * Creates the dispatcher/resource-manager components, adds them to {@code
     * dispatcherResourceManagerComponents} and triggers {@link #closeAsync()} once the shut-down
     * futures of all registered components have completed.
     *
     * @param configuration configuration handed to the component factory
     * @param dispatcherResourceManagerComponentRpcServiceFactory factory for the components' RPC
     *     service
     * @param metricQueryServiceRetriever retriever handed to the components
     * @throws Exception if creating the components fails
     */
    @GuardedBy("lock")
    private void setupDispatcherResourceManagerComponents(
            Configuration configuration,
            RpcServiceFactory dispatcherResourceManagerComponentRpcServiceFactory,
            MetricQueryServiceRetriever metricQueryServiceRetriever)
            throws Exception {
        final Collection<? extends DispatcherResourceManagerComponent> createdComponents =
                createDispatcherResourceManagerComponents(
                        configuration,
                        dispatcherResourceManagerComponentRpcServiceFactory,
                        blobServer,
                        heartbeatServices,
                        delegationTokenManager,
                        metricRegistry,
                        metricQueryServiceRetriever,
                        new ShutDownFatalErrorHandler());
        dispatcherResourceManagerComponents.addAll(createdComponents);

        // gather the shut-down future of every registered component
        final List<CompletableFuture<?>> shutDownFutures =
                new ArrayList<>(dispatcherResourceManagerComponents.size());
        for (DispatcherResourceManagerComponent component : dispatcherResourceManagerComponents) {
            shutDownFutures.add(component.getShutDownFuture());
        }

        // close the mini cluster as soon as all components have shut down, successfully or not
        FutureUtils.completeAll(shutDownFutures)
                .whenComplete((ignored, exception) -> closeAsync());
    }

    /**
     * Creates the dispatcher/resource-manager components of this mini cluster.
     *
     * <p>This implementation creates exactly one component through the factory returned by {@link
     * #createDispatcherResourceManagerComponentFactory()}, giving it a freshly generated {@link
     * ResourceID}, a new RPC service from {@code rpcServiceFactory} and an in-memory execution
     * graph store. Once the component's shut-down future completes, {@code stopApplication} is
     * invoked on it with the resulting application status.
     *
     * @param configuration configuration for the component
     * @param rpcServiceFactory factory used to create the component's RPC service
     * @param blobServer blob server handed to the component
     * @param heartbeatServices heartbeat services handed to the component
     * @param delegationTokenManager delegation token manager handed to the component
     * @param metricRegistry metric registry handed to the component
     * @param metricQueryServiceRetriever metric query service retriever handed to the component
     * @param fatalErrorHandler handler for fatal errors of the component
     * @return singleton collection containing the created component
     * @throws Exception if the component cannot be created
     */
    @VisibleForTesting
    protected Collection<? extends DispatcherResourceManagerComponent>
            createDispatcherResourceManagerComponents(
                    Configuration configuration,
                    RpcServiceFactory rpcServiceFactory,
                    BlobServer blobServer,
                    HeartbeatServices heartbeatServices,
                    DelegationTokenManager delegationTokenManager,
                    MetricRegistry metricRegistry,
                    MetricQueryServiceRetriever metricQueryServiceRetriever,
                    FatalErrorHandler fatalErrorHandler)
                    throws Exception {
        DispatcherResourceManagerComponentFactory dispatcherResourceManagerComponentFactory =
                createDispatcherResourceManagerComponentFactory();

        final DispatcherResourceManagerComponent dispatcherResourceManagerComponent =
                dispatcherResourceManagerComponentFactory.create(
                        configuration,
                        ResourceID.generate(),
                        ioExecutor,
                        rpcServiceFactory.createRpcService(),
                        haServices,
                        blobServer,
                        heartbeatServices,
                        delegationTokenManager,
                        metricRegistry,
                        new MemoryExecutionGraphInfoStore(),
                        metricQueryServiceRetriever,
                        Collections.emptySet(),
                        fatalErrorHandler);
        // stop the application once the component reports its shut down; assertNoException
        // guards the resulting future against unexpected failures
        FutureUtils.assertNoException(
                dispatcherResourceManagerComponent
                        .getShutDownFuture()
                        .thenCompose(
                                applicationStatus ->
                                        dispatcherResourceManagerComponent.stopApplication(
                                                applicationStatus, null)));
        return Collections.singleton(dispatcherResourceManagerComponent);
    }

    /**
     * Creates the factory used to build the dispatcher/resource-manager component.
     *
     * <p>This implementation returns a session component factory backed by the {@link
     * StandaloneResourceManagerFactory} instance. The method is {@code protected}, so subclasses
     * can supply a different factory.
     *
     * @return the component factory to use
     */
    protected DispatcherResourceManagerComponentFactory
            createDispatcherResourceManagerComponentFactory() {
        return DefaultDispatcherResourceManagerComponentFactory.createSessionComponentFactory(
                StandaloneResourceManagerFactory.getInstance());
    }

    /**
     * Selects the factory for the high-availability services of this mini cluster.
     *
     * <p>The HA services mode of the {@link MiniClusterConfiguration} is consulted first: {@code
     * WITH_LEADERSHIP_CONTROL} yields embedded HA services with leadership control, any mode other
     * than {@code CONFIGURED} is rejected. For {@code CONFIGURED}, the {@link
     * HighAvailabilityMode} from the given configuration decides between embedded HA services
     * ({@code NONE}) and the regular HA services factory.
     *
     * @param configuration configuration from which the high-availability mode is read
     * @return the factory to create the HA services with
     * @throws IllegalConfigurationException if the configured HA services mode is unknown
     */
    private HighAvailabilityServicesFactory createHighAvailabilityServicesFactory(
            Configuration configuration) {
        final HaServices haServicesMode = miniClusterConfiguration.getHaServices();

        if (haServicesMode == HaServices.WITH_LEADERSHIP_CONTROL) {
            // MiniCluster-only feature that lets callers control leadership. Since FLINK-24038
            // the EmbeddedLeaderElection needs one shared instance for all JobManager instances
            // in this JVM, hence the SingletonHighAvailabilityServicesFactory.
            return new SingletonHighAvailabilityServicesFactory(
                    (config, embeddedLeaderElectionExecutor) ->
                            new EmbeddedHaServicesWithLeadershipControl(
                                    embeddedLeaderElectionExecutor));
        }

        if (haServicesMode != HaServices.CONFIGURED) {
            throw new IllegalConfigurationException(
                    "Unknown HA Services Mode configured in MiniCluster configuration: "
                            + haServicesMode);
        }

        // from here on the mode is CONFIGURED: defer to the HA mode of the Flink configuration
        if (HighAvailabilityMode.fromConfig(configuration) != HighAvailabilityMode.NONE) {
            return new RegularHighAvailabilityServicesFactory();
        }

        // Plain embedded HA services. As above, the EmbeddedLeaderElection needs one shared
        // instance per JVM since FLINK-24038, hence the singleton factory.
        return new SingletonHighAvailabilityServicesFactory(
                (config, embeddedLeaderElectionExecutor) ->
                        new EmbeddedHaServices(embeddedLeaderElectionExecutor));
    }

    /**
     * Creates the high-availability services by delegating to the {@code haServicesFactory}
     * field, which therefore has to be set before this method is called.
     *
     * @param configuration configuration passed on to the factory
     * @param executor executor passed on to the factory
     * @return the created high-availability services
     * @throws Exception if the factory fails to create the services
     */
    @VisibleForTesting
    protected HighAvailabilityServices createHighAvailabilityServices(
            Configuration configuration, Executor executor) throws Exception {
        return haServicesFactory.createHAServices(configuration, executor);
    }

    /**
     * Returns {@link HaLeadershipControl} if enabled.
     *
     * <p>{@link HaLeadershipControl} allows granting and revoking leadership of HA components, e.g.
     * JobManager. The method returns {@link Optional#empty()} if the control is not enabled in
     * {@link MiniClusterConfiguration}.
     *
     * <p>Enabling this feature disables {@link HighAvailabilityOptions#HA_MODE} option.
     *
     * @return the current HA services viewed as {@link HaLeadershipControl} if they implement that
     *     interface, otherwise {@link Optional#empty()}
     */
    public Optional<HaLeadershipControl> getHaLeadershipControl() {
        synchronized (lock) {
            return haServices instanceof HaLeadershipControl
                    ? Optional.of((HaLeadershipControl) haServices)
                    : Optional.empty();
        }
    }

    /**
     * Returns the high-availability services of this mini cluster.
     *
     * <p>The backing field is {@code null} until {@link #start()} has created the services. Note
     * that the field is annotated as guarded by {@code lock}, but it is read here without holding
     * that lock.
     *
     * @return the current high-availability services, possibly {@code null}
     */
    protected HighAvailabilityServices getHaServices() {
        return haServices;
    }

    /**
     * Shuts down the mini cluster, failing all currently executing jobs. The mini cluster can be
     * started again by calling the {@link #start()} method again.
     *
     * <p>This method shuts down all started services and components, even if an exception occurs in
     * the process of shutting down some component.
     *
     * @return Future which is completed once the MiniCluster has been completely shut down
     */
    @Override
    public CompletableFuture<Void> closeAsync() {
        return closeInternal(true);
    }

    /**
     * Shuts down the mini cluster like {@link #closeAsync()}, but requests that the
     * high-availability data is not cleaned up as part of the shutdown.
     *
     * @return Future which is completed once the MiniCluster has been completely shut down
     */
    public CompletableFuture<Void> closeAsyncWithoutCleaningHighAvailabilityData() {
        return closeInternal(false);
    }

    private CompletableFuture closeInternal(boolean cleanupHaData) {
        synchronized (lock) {
            if (running) {
                LOG.info("Shutting down Flink Mini Cluster");
                try {
                    final long shutdownTimeoutMillis =
                            miniClusterConfiguration
                                    .getConfiguration()
                                    .get(ClusterOptions.CLUSTER_SERVICES_SHUTDOWN_TIMEOUT)
                                    .toMillis();
                    final int numComponents = 2 + miniClusterConfiguration.getNumTaskManagers();
                    final Collection> componentTerminationFutures =
                            new ArrayList<>(numComponents);

                    componentTerminationFutures.addAll(terminateTaskManagers());

                    componentTerminationFutures.add(shutDownResourceManagerComponents());

                    final FutureUtils.ConjunctFuture componentsTerminationFuture =
                            FutureUtils.completeAll(componentTerminationFutures);

                    final CompletableFuture metricSystemTerminationFuture =
                            FutureUtils.composeAfterwards(
                                    componentsTerminationFuture, this::closeMetricSystem);

                    final CompletableFuture rpcServicesTerminationFuture =
                            FutureUtils.composeAfterwards(
                                    metricSystemTerminationFuture, this::terminateRpcServices);

                    final CompletableFuture remainingServicesTerminationFuture =
                            FutureUtils.runAfterwards(
                                    rpcServicesTerminationFuture,
                                    () -> terminateMiniClusterServices(cleanupHaData));

                    final CompletableFuture executorsTerminationFuture =
                            FutureUtils.composeAfterwards(
                                    remainingServicesTerminationFuture,
                                    () -> terminateExecutors(shutdownTimeoutMillis));

                    final CompletableFuture deleteDirectoriesFuture =
                            FutureUtils.runAfterwards(
                                    executorsTerminationFuture, this::deleteDirectories);

                    deleteDirectoriesFuture.whenComplete(
                            (Void ignored, Throwable throwable) -> {
                                if (throwable != null) {
                                    terminationFuture.completeExceptionally(
                                            ExceptionUtils.stripCompletionException(throwable));
                                } else {
                                    terminationFuture.complete(null);
                                }
                            });
                } finally {
                    running = false;
                }
            }

            return terminationFuture;
        }
    }

    private CompletableFuture closeMetricSystem() {
        synchronized (lock) {
            final ArrayList> terminationFutures = new ArrayList<>(2);

            if (processMetricGroup != null) {
                processMetricGroup.close();
                processMetricGroup = null;
            }

            // metrics shutdown
            if (metricRegistry != null) {
                terminationFutures.add(metricRegistry.closeAsync());
                metricRegistry = null;
            }

            return FutureUtils.completeAll(terminationFutures);
        }
    }

    /**
     * Starts as many TaskManagers as the mini cluster configuration asks for.
     *
     * @throws Exception if starting one of the TaskManagers fails
     */
    @GuardedBy("lock")
    private void startTaskManagers() throws Exception {
        final int configuredTaskManagers = miniClusterConfiguration.getNumTaskManagers();

        LOG.info("Starting {} TaskManager(s)", configuredTaskManagers);

        int remaining = configuredTaskManagers;
        while (remaining > 0) {
            startTaskManager();
            remaining--;
        }
    }

    /**
     * Starts additional TaskManager process.
     *
     * <p>When the MiniCluster starts up, it always starts {@link
     * MiniClusterConfiguration#getNumTaskManagers} TaskManagers. All TaskManagers are indexed from
     * 0 to the number of TaskManagers, started so far, minus one. This method starts a TaskManager
     * with the next index which is the number of TaskManagers, started so far. The index always
     * increases with each new started TaskManager. The indices of terminated TaskManagers are not
     * reused after {@link #terminateTaskManager(int)}.
     *
     * @throws Exception if the TaskManager could not be created or started
     */
    public void startTaskManager() throws Exception {
        synchronized (lock) {
            // the new TaskManager takes the next free index
            final int nextIndex = taskManagers.size();
            final Configuration configuration = miniClusterConfiguration.getConfiguration();
            final ResourceID resourceId = new ResourceID(UUID.randomUUID().toString());

            final TaskExecutor newTaskManager =
                    TaskManagerRunner.startTaskManager(
                            configuration,
                            resourceId,
                            taskManagerRpcServiceFactory.createRpcService(),
                            haServices,
                            heartbeatServices,
                            metricRegistry,
                            blobCacheService,
                            useLocalCommunication(),
                            ExternalResourceInfoProvider.NO_EXTERNAL_RESOURCES,
                            workingDirectory.createSubWorkingDirectory("tm_" + nextIndex),
                            taskManagerTerminatingFatalErrorHandlerFactory.create(nextIndex),
                            delegationTokenReceiverRepository);

            newTaskManager.start();
            taskManagers.add(newTaskManager);
        }
    }

    /**
     * Tells whether the TaskManager should use local communication, which is the case exactly when
     * the mini cluster is configured with a single TaskManager.
     */
    @VisibleForTesting
    protected boolean useLocalCommunication() {
        final int configuredTaskManagers = miniClusterConfiguration.getNumTaskManagers();
        return configuredTaskManagers == 1;
    }

    /** Returns the Flink configuration held by the mini cluster configuration. */
    @VisibleForTesting
    public Configuration getConfiguration() {
        final Configuration clusterConfiguration = miniClusterConfiguration.getConfiguration();
        return clusterConfiguration;
    }

    // HACK: temporary hack to make the changelog state backend tests work with forced
    // full snapshots. This option should be removed once changelog state backend supports forced
    // full snapshots
    @Internal
    private boolean overrideRestoreModeForChangelogStateBackend;

    /**
     * Switches on the restore mode override that is applied to submitted job graphs. There is no
     * way to switch it off again.
     */
    @Internal
    public void overrideRestoreModeForChangelogStateBackend() {
        overrideRestoreModeForChangelogStateBackend = true;
    }

    @GuardedBy("lock")
    private Collection> terminateTaskManagers() {
        final Collection> terminationFutures =
                new ArrayList<>(taskManagers.size());
        for (int i = 0; i < taskManagers.size(); i++) {
            terminationFutures.add(terminateTaskManager(i));
        }

        return terminationFutures;
    }

    /**
     * Terminates a TaskManager with the given index.
     *
     * <p>See {@link #startTaskManager()} to understand how TaskManagers are indexed. This method
     * terminates a TaskManager with a given index but it does not clear the index. The index stays
     * occupied for the lifetime of the MiniCluster and its TaskManager stays terminated. The index
     * is not reused if more TaskManagers are started with {@link #startTaskManager()}.
     *
     * @param index index of the TaskManager to terminate
     * @return {@link CompletableFuture} of the given TaskManager termination
     */
    public CompletableFuture<Void> terminateTaskManager(int index) {
        synchronized (lock) {
            final TaskExecutor taskExecutor = taskManagers.get(index);
            return taskExecutor.closeAsync();
        }
    }

    // ------------------------------------------------------------------------
    //  Accessing jobs
    // ------------------------------------------------------------------------

    /**
     * Requests the execution graph information of the given job from the dispatcher and extracts
     * the archived execution graph from it.
     *
     * @param jobId id of the job
     * @return future with the archived execution graph of the job
     */
    public CompletableFuture<ArchivedExecutionGraph> getArchivedExecutionGraph(JobID jobId) {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway
                                .requestExecutionGraphInfo(jobId, rpcTimeout)
                                .thenApply(ExecutionGraphInfo::getArchivedExecutionGraph));
    }

    public CompletableFuture> listJobs() {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway
                                .requestMultipleJobDetails(rpcTimeout)
                                .thenApply(
                                        jobs ->
                                                jobs.getJobs().stream()
                                                        .map(
                                                                details ->
                                                                        new JobStatusMessage(
                                                                                details.getJobId(),
                                                                                details
                                                                                        .getJobName(),
                                                                                details.getStatus(),
                                                                                details
                                                                                        .getStartTime()))
                                                        .collect(Collectors.toList())));
    }

    /**
     * Requests the current status of the given job from the dispatcher.
     *
     * @param jobId id of the job
     * @return future with the job status
     */
    public CompletableFuture<JobStatus> getJobStatus(JobID jobId) {
        return runDispatcherCommand(
                dispatcherGateway -> dispatcherGateway.requestJobStatus(jobId, rpcTimeout));
    }

    /**
     * Asks the dispatcher to cancel the given job.
     *
     * @param jobId id of the job to cancel
     * @return future with the dispatcher's acknowledgement
     */
    public CompletableFuture<Acknowledge> cancelJob(JobID jobId) {
        return runDispatcherCommand(
                dispatcherGateway -> dispatcherGateway.cancelJob(jobId, rpcTimeout));
    }

    /**
     * Triggers a savepoint for the given job and waits for its location.
     *
     * @param jobId id of the job
     * @param targetDirectory directory passed to the dispatcher as savepoint target
     * @param cancelJob if {@code true} the savepoint is triggered in cancel-with-savepoint mode
     * @param formatType format of the savepoint
     * @return future with the savepoint location
     */
    public CompletableFuture<String> triggerSavepoint(
            JobID jobId,
            String targetDirectory,
            boolean cancelJob,
            SavepointFormatType formatType) {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway.triggerSavepointAndGetLocation(
                                jobId,
                                targetDirectory,
                                formatType,
                                cancelJob
                                        ? TriggerSavepointMode.CANCEL_WITH_SAVEPOINT
                                        : TriggerSavepointMode.SAVEPOINT,
                                rpcTimeout));
    }

    /**
     * Triggers a savepoint for the given job without waiting for it to finish.
     *
     * <p>The future returned by the dispatcher is intentionally dropped, so the outcome of the
     * savepoint itself is not reported through the returned future.
     *
     * @param jobId id of the job
     * @param targetDirectory directory passed to the dispatcher as savepoint target
     * @param cancelJob if {@code true} the savepoint is triggered in cancel-with-savepoint mode
     * @param formatType format of the savepoint
     * @return future completed with an empty string as soon as the request has been issued
     */
    public CompletableFuture<String> triggerDetachedSavepoint(
            JobID jobId,
            String targetDirectory,
            boolean cancelJob,
            SavepointFormatType formatType) {
        return runDispatcherCommand(
                dispatcherGateway -> {
                    dispatcherGateway.triggerSavepointAndGetLocation(
                            jobId,
                            targetDirectory,
                            formatType,
                            cancelJob
                                    ? TriggerSavepointMode.CANCEL_WITH_SAVEPOINT
                                    : TriggerSavepointMode.SAVEPOINT,
                            rpcTimeout);
                    // return immediately, no need to wait for the future savepoint path
                    return CompletableFuture.completedFuture("");
                });
    }

    /**
     * Asks the dispatcher to trigger a checkpoint for the given job.
     *
     * @param jobID id of the job
     * @return future with the dispatcher's response to the checkpoint request
     */
    public CompletableFuture<String> triggerCheckpoint(JobID jobID) {
        return runDispatcherCommand(
                dispatcherGateway -> dispatcherGateway.triggerCheckpoint(jobID, rpcTimeout));
    }

    /**
     * Asks the dispatcher to trigger a checkpoint of the given type for the given job.
     *
     * @param jobID id of the job
     * @param checkpointType type of the checkpoint to trigger
     * @return future with the id of the triggered checkpoint
     */
    public CompletableFuture<Long> triggerCheckpoint(JobID jobID, CheckpointType checkpointType) {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway.triggerCheckpointAndGetCheckpointID(
                                jobID, checkpointType, rpcTimeout));
    }

    /**
     * Stops the given job with a savepoint and waits for the savepoint location.
     *
     * @param jobId id of the job
     * @param targetDirectory directory passed to the dispatcher as savepoint target
     * @param terminate if {@code true} the terminate-with-savepoint mode is used, otherwise the
     *     suspend-with-savepoint mode
     * @param formatType format of the savepoint
     * @return future with the savepoint location
     */
    public CompletableFuture<String> stopWithSavepoint(
            JobID jobId,
            String targetDirectory,
            boolean terminate,
            SavepointFormatType formatType) {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway.stopWithSavepointAndGetLocation(
                                jobId,
                                targetDirectory,
                                formatType,
                                terminate
                                        ? TriggerSavepointMode.TERMINATE_WITH_SAVEPOINT
                                        : TriggerSavepointMode.SUSPEND_WITH_SAVEPOINT,
                                rpcTimeout));
    }

    /**
     * Stops the given job with a savepoint without waiting for the savepoint to finish.
     *
     * <p>The future returned by the dispatcher is intentionally dropped, so the outcome of the
     * operation itself is not reported through the returned future.
     *
     * @param jobId id of the job
     * @param targetDirectory directory passed to the dispatcher as savepoint target
     * @param terminate if {@code true} the terminate-with-savepoint mode is used, otherwise the
     *     suspend-with-savepoint mode
     * @param formatType format of the savepoint
     * @return future completed with an empty string as soon as the request has been issued
     */
    public CompletableFuture<String> stopWithDetachedSavepoint(
            JobID jobId,
            String targetDirectory,
            boolean terminate,
            SavepointFormatType formatType) {
        return runDispatcherCommand(
                dispatcherGateway -> {
                    dispatcherGateway.stopWithSavepointAndGetLocation(
                            jobId,
                            targetDirectory,
                            formatType,
                            terminate
                                    ? TriggerSavepointMode.TERMINATE_WITH_SAVEPOINT
                                    : TriggerSavepointMode.SUSPEND_WITH_SAVEPOINT,
                            rpcTimeout);
                    // return immediately, no need to wait for the future savepoint path
                    return CompletableFuture.completedFuture("");
                });
    }

    /**
     * Asks the dispatcher to dispose the savepoint stored at the given path.
     *
     * @param savepointPath path of the savepoint to dispose
     * @return future with the dispatcher's acknowledgement
     */
    public CompletableFuture<Acknowledge> disposeSavepoint(String savepointPath) {
        return runDispatcherCommand(
                dispatcherGateway -> dispatcherGateway.disposeSavepoint(savepointPath, rpcTimeout));
    }

    /**
     * Requests the execution graph of the given job from the dispatcher.
     *
     * @param jobId id of the job
     * @return future with the execution graph of the job
     */
    public CompletableFuture<? extends AccessExecutionGraph> getExecutionGraph(JobID jobId) {
        return runDispatcherCommand(
                dispatcherGateway -> dispatcherGateway.requestJob(jobId, rpcTimeout));
    }

    /**
     * Forwards a serialized coordination request to the dispatcher, addressed to the coordinator
     * of the operator with the given uid.
     *
     * @param jobId id of the job the operator belongs to
     * @param operatorUid uid of the operator whose coordinator should receive the request
     * @param serializedRequest the serialized request
     * @return future with the coordinator's response
     */
    public CompletableFuture<CoordinationResponse> deliverCoordinationRequestToCoordinator(
            JobID jobId,
            String operatorUid,
            SerializedValue<CoordinationRequest> serializedRequest) {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway.deliverCoordinationRequestToCoordinator(
                                jobId, operatorUid, serializedRequest, rpcTimeout));
    }

    /**
     * Requests the resource overview from the resource manager.
     *
     * @return future with the resource overview
     */
    public CompletableFuture<ResourceOverview> getResourceOverview() {
        return runResourceManagerCommand(
                resourceManagerGateway ->
                        resourceManagerGateway.requestResourceOverview(rpcTimeout));
    }

    private  CompletableFuture runDispatcherCommand(
            Function> dispatcherCommand) {
        return getDispatcherGatewayFuture()
                .thenApply(dispatcherCommand)
                .thenCompose(Function.identity());
    }

    private  CompletableFuture runResourceManagerCommand(
            Function> resourceManagerCommand) {
        return getResourceManagerGatewayFuture()
                .thenApply(resourceManagerCommand)
                .thenCompose(Function.identity());
    }

    // ------------------------------------------------------------------------
    //  running jobs
    // ------------------------------------------------------------------------

    /**
     * This method executes a job in detached mode. The method returns as soon as the job
     * submission has completed; it does not wait for the job result.
     *
     * @param job The Flink job to execute
     * @throws JobExecutionException Thrown if the job submission failed.
     * @throws InterruptedException Thrown if the calling thread is interrupted while waiting for
     *     the submission to complete.
     */
    public void runDetached(JobGraph job) throws JobExecutionException, InterruptedException {
        checkNotNull(job, "job is null");

        final CompletableFuture<JobSubmissionResult> submissionFuture = submitJob(job);

        try {
            submissionFuture.get();
        } catch (ExecutionException e) {
            throw new JobExecutionException(
                    job.getJobID(), ExceptionUtils.stripExecutionException(e));
        }
    }

    /**
     * This method runs a job in blocking mode. The method returns only after the job completed
     * successfully, or after it failed terminally.
     *
     * @param job The Flink job to execute
     * @return The result of the job execution
     * @throws JobExecutionException Thrown if anything went amiss during initial job launch, or if
     *     the job terminally failed.
     * @throws InterruptedException Thrown if the calling thread is interrupted while waiting for
     *     the job result.
     */
    public JobExecutionResult executeJobBlocking(JobGraph job)
            throws JobExecutionException, InterruptedException {
        checkNotNull(job, "job is null");

        final CompletableFuture<JobSubmissionResult> submissionFuture = submitJob(job);

        // only ask for the result once the submission has gone through
        final CompletableFuture<JobResult> jobResultFuture =
                submissionFuture.thenCompose(
                        (JobSubmissionResult ignored) -> requestJobResult(job.getJobID()));

        final JobResult jobResult;

        try {
            jobResult = jobResultFuture.get();
        } catch (ExecutionException e) {
            throw new JobExecutionException(
                    job.getJobID(),
                    "Could not retrieve JobResult.",
                    ExceptionUtils.stripExecutionException(e));
        }

        try {
            return jobResult.toJobExecutionResult(Thread.currentThread().getContextClassLoader());
        } catch (IOException | ClassNotFoundException e) {
            throw new JobExecutionException(job.getJobID(), e);
        }
    }

    /**
     * Submits a copy of the given job graph to the dispatcher after uploading the job's files to
     * the blob server.
     *
     * @param jobGraph the job to submit; it is cloned and not modified
     * @return future with the submission result carrying the job id
     */
    public CompletableFuture<JobSubmissionResult> submitJob(JobGraph jobGraph) {
        // When MiniCluster uses the local RPC, the provided JobGraph is passed directly to the
        // Dispatcher. This means that any mutations to the JG can affect the Dispatcher behaviour,
        // so we rather clone it to guard against this.
        final JobGraph clonedJobGraph = InstantiationUtil.cloneUnchecked(jobGraph);
        checkRestoreModeForChangelogStateBackend(clonedJobGraph);
        final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture =
                getDispatcherGatewayFuture();
        final CompletableFuture<InetSocketAddress> blobServerAddressFuture =
                createBlobServerAddress(dispatcherGatewayFuture);
        final CompletableFuture<Void> jarUploadFuture =
                uploadAndSetJobFiles(blobServerAddressFuture, clonedJobGraph);
        // submit only after the upload has finished and the gateway is available
        final CompletableFuture<Acknowledge> acknowledgeCompletableFuture =
                jarUploadFuture
                        .thenCombine(
                                dispatcherGatewayFuture,
                                (Void ack, DispatcherGateway dispatcherGateway) ->
                                        dispatcherGateway.submitJob(clonedJobGraph, rpcTimeout))
                        .thenCompose(Function.identity());
        return acknowledgeCompletableFuture.thenApply(
                (Acknowledge ignored) -> new JobSubmissionResult(clonedJobGraph.getJobID()));
    }

    // HACK: temporary hack to make the randomized changelog state backend tests work with forced
    // full snapshots. This option should be removed once changelog state backend supports forced
    // full snapshots
    /**
     * Rewrites the savepoint restore settings of the given job graph from {@code NO_CLAIM} to
     * {@code LEGACY} when the override has been enabled; otherwise leaves the job graph untouched.
     *
     * @param jobGraph job graph whose restore settings may be replaced
     */
    private void checkRestoreModeForChangelogStateBackend(JobGraph jobGraph) {
        final SavepointRestoreSettings originalSettings = jobGraph.getSavepointRestoreSettings();

        if (!overrideRestoreModeForChangelogStateBackend) {
            return;
        }
        if (originalSettings.getRecoveryClaimMode() != RecoveryClaimMode.NO_CLAIM) {
            return;
        }

        // round-trip through a configuration so that only the restore mode changes
        final Configuration settingsAsConfig = new Configuration();
        SavepointRestoreSettings.toConfiguration(originalSettings, settingsAsConfig);
        settingsAsConfig.set(StateRecoveryOptions.RESTORE_MODE, RecoveryClaimMode.LEGACY);

        final SavepointRestoreSettings overriddenSettings =
                SavepointRestoreSettings.fromConfiguration(settingsAsConfig);
        jobGraph.setSavepointRestoreSettings(overriddenSettings);
    }

    /**
     * Requests the result of the given job from the dispatcher, using an infinite RPC timeout.
     *
     * @param jobId id of the job
     * @return future with the job result
     */
    public CompletableFuture<JobResult> requestJobResult(JobID jobId) {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway.requestJobResult(jobId, RpcUtils.INF_TIMEOUT));
    }

    /**
     * Requests the cluster overview from the dispatcher, using an infinite RPC timeout.
     *
     * @return future with the cluster overview
     */
    public CompletableFuture<ClusterOverview> requestClusterOverview() {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway.requestClusterOverview(RpcUtils.INF_TIMEOUT));
    }

    /**
     * Returns the future of the dispatcher gateway.
     *
     * @return future provided by the dispatcher gateway retriever
     * @throws IllegalStateException if the mini cluster is not running
     */
    @VisibleForTesting
    protected CompletableFuture<DispatcherGateway> getDispatcherGatewayFuture() {
        synchronized (lock) {
            checkState(running, "MiniCluster is not yet running or has already been shut down.");
            return dispatcherGatewayRetriever.getFuture();
        }
    }

    /**
     * Returns the future of the resource manager gateway.
     *
     * @return future provided by the resource manager gateway retriever
     * @throws IllegalStateException if the mini cluster is not running
     */
    private CompletableFuture<ResourceManagerGateway> getResourceManagerGatewayFuture() {
        synchronized (lock) {
            checkState(running, "MiniCluster is not yet running or has already been shut down.");
            return resourceManagerGatewayRetriever.getFuture();
        }
    }

    /**
     * Extracts and uploads the files of the given job graph once the blob server address is known.
     *
     * @param blobServerAddressFuture future with the address of the blob server
     * @param job job graph whose files are uploaded
     * @return future which completes when the upload has finished; a {@link FlinkException} from
     *     the upload is rethrown wrapped in a {@link CompletionException}
     */
    private CompletableFuture<Void> uploadAndSetJobFiles(
            final CompletableFuture<InetSocketAddress> blobServerAddressFuture,
            final JobGraph job) {
        return blobServerAddressFuture.thenAccept(
                blobServerAddress -> {
                    try {
                        ClientUtils.extractAndUploadJobGraphFiles(
                                job,
                                () ->
                                        new BlobClient(
                                                blobServerAddress,
                                                miniClusterConfiguration.getConfiguration()));
                    } catch (FlinkException e) {
                        throw new CompletionException(e);
                    }
                });
    }

    /**
     * Builds the blob server address from the dispatcher gateway's hostname and the blob server
     * port reported by the dispatcher.
     *
     * @param dispatcherGatewayFuture future with the dispatcher gateway
     * @return future with the blob server address
     */
    private CompletableFuture<InetSocketAddress> createBlobServerAddress(
            final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture) {
        // thenCompose flattens the nested future directly
        return dispatcherGatewayFuture.thenCompose(
                dispatcherGateway ->
                        dispatcherGateway
                                .getBlobServerPort(rpcTimeout)
                                .thenApply(
                                        blobServerPort ->
                                                new InetSocketAddress(
                                                        dispatcherGateway.getHostname(),
                                                        blobServerPort)));
    }

    // ------------------------------------------------------------------------
    //  factories - can be overridden by subclasses to alter behavior
    // ------------------------------------------------------------------------

    /**
     * Factory method to create the metric registry for the mini cluster.
     *
     * <p>The reporter and trace reporter setups are derived from the given configuration together
     * with the plugin manager of the mini cluster configuration.
     *
     * @param config The configuration of the mini cluster
     * @param maximumMessageSizeInBytes the maximum message size
     * @return the newly created metric registry
     */
    protected MetricRegistryImpl createMetricRegistry(
            Configuration config, long maximumMessageSizeInBytes) {
        final MetricRegistryConfiguration registryConfiguration =
                MetricRegistryConfiguration.fromConfiguration(config, maximumMessageSizeInBytes);

        return new MetricRegistryImpl(
                registryConfiguration,
                ReporterSetup.fromConfiguration(
                        config, miniClusterConfiguration.getPluginManager()),
                TraceReporterSetup.fromConfiguration(
                        config, miniClusterConfiguration.getPluginManager()));
    }

    /**
     * Factory method to instantiate the remote RPC service.
     *
     * <p>The bind address and the bind port are also passed to the service builder in the
     * positions that the other overload of this method uses for the external address and the
     * external port range.
     *
     * @param configuration Flink configuration.
     * @param bindAddress The address to bind the RPC service to.
     * @param bindPort The port to bind the RPC service to.
     * @param rpcSystem The RPC system whose builder creates the service.
     * @return The instantiated RPC service
     * @throws Exception if the service could not be created and started
     */
    protected RpcService createRemoteRpcService(
            Configuration configuration, String bindAddress, int bindPort, RpcSystem rpcSystem)
            throws Exception {
        return rpcSystem
                .remoteServiceBuilder(configuration, bindAddress, String.valueOf(bindPort))
                .withBindAddress(bindAddress)
                .withBindPort(bindPort)
                .withExecutorConfiguration(RpcUtils.getTestForkJoinExecutorConfiguration())
                .createAndStart();
    }

    /**
     * Factory method to instantiate the remote RPC service.
     *
     * <p>Unlike the overload taking a bind port, this variant sets no explicit bind port on the
     * builder.
     *
     * @param configuration Flink configuration.
     * @param externalAddress The external address to access the RPC service.
     * @param externalPortRange The external port range to access the RPC service.
     * @param bindAddress The address to bind the RPC service to.
     * @param rpcSystem The RPC system whose builder creates the service.
     * @return The instantiated RPC service
     * @throws Exception if the service could not be created and started
     */
    protected RpcService createRemoteRpcService(
            Configuration configuration,
            String externalAddress,
            String externalPortRange,
            String bindAddress,
            RpcSystem rpcSystem)
            throws Exception {
        return rpcSystem
                .remoteServiceBuilder(configuration, externalAddress, externalPortRange)
                .withBindAddress(bindAddress)
                .withExecutorConfiguration(RpcUtils.getTestForkJoinExecutorConfiguration())
                .createAndStart();
    }

    /**
     * Factory method to instantiate the local RPC service.
     *
     * @param configuration Flink configuration.
     * @param rpcSystem The RPC system whose builder creates the service.
     * @return The instantiated RPC service
     * @throws Exception if the service could not be created and started
     */
    protected RpcService createLocalRpcService(Configuration configuration, RpcSystem rpcSystem)
            throws Exception {
        return rpcSystem
                .localServiceBuilder(configuration)
                .withExecutorConfiguration(RpcUtils.getTestForkJoinExecutorConfiguration())
                .createAndStart();
    }

    // ------------------------------------------------------------------------
    //  Internal methods
    // ------------------------------------------------------------------------

    @GuardedBy("lock")
    private CompletableFuture shutDownResourceManagerComponents() {

        final Collection> terminationFutures =
                new ArrayList<>(dispatcherResourceManagerComponents.size());

        for (DispatcherResourceManagerComponent dispatcherResourceManagerComponent :
                dispatcherResourceManagerComponents) {
            terminationFutures.add(dispatcherResourceManagerComponent.closeAsync());
        }

        final FutureUtils.ConjunctFuture dispatcherTerminationFuture =
                FutureUtils.completeAll(terminationFutures);

        return FutureUtils.runAfterwards(
                dispatcherTerminationFuture,
                () -> {
                    Exception exception = null;

                    synchronized (lock) {
                        if (resourceManagerLeaderRetriever != null) {
                            try {
                                resourceManagerLeaderRetriever.stop();
                            } catch (Exception e) {
                                exception = ExceptionUtils.firstOrSuppressed(e, exception);
                            }

                            resourceManagerLeaderRetriever = null;
                        }

                        if (dispatcherLeaderRetriever != null) {
                            try {
                                dispatcherLeaderRetriever.stop();
                            } catch (Exception e) {
                                exception = ExceptionUtils.firstOrSuppressed(e, exception);
                            }

                            dispatcherLeaderRetriever = null;
                        }

                        if (clusterRestEndpointLeaderRetrievalService != null) {
                            try {
                                clusterRestEndpointLeaderRetrievalService.stop();
                            } catch (Exception e) {
                                exception = ExceptionUtils.firstOrSuppressed(e, exception);
                            }

                            clusterRestEndpointLeaderRetrievalService = null;
                        }
                    }

                    if (exception != null) {
                        throw exception;
                    }
                });
    }

    /**
     * Shuts down the blob cache service, the blob server, the high-availability services and, if
     * owned, the RPC system.
     *
     * <p>Every service is shut down even if an earlier one fails. The first failure is rethrown
     * at the end with all later failures attached as suppressed exceptions.
     *
     * @param cleanupHaData whether the high-availability services should clean up their data
     * @throws Exception the first exception raised while shutting down the services
     */
    private void terminateMiniClusterServices(boolean cleanupHaData) throws Exception {
        // collect the first exception, but continue and add all successive
        // exceptions as suppressed
        Exception exception = null;

        synchronized (lock) {
            if (blobCacheService != null) {
                try {
                    blobCacheService.close();
                } catch (Exception e) {
                    exception = ExceptionUtils.firstOrSuppressed(e, exception);
                }
                blobCacheService = null;
            }

            // shut down the blob server
            if (blobServer != null) {
                try {
                    blobServer.close();
                } catch (Exception e) {
                    exception = ExceptionUtils.firstOrSuppressed(e, exception);
                }
                blobServer = null;
            }

            // shut down high-availability services; a failure here must not prevent the RPC
            // system from being closed nor hide the exceptions collected so far
            if (haServices != null) {
                try {
                    haServices.closeWithOptionalClean(cleanupHaData);
                } catch (Exception e) {
                    exception = ExceptionUtils.firstOrSuppressed(e, exception);
                }
                haServices = null;
            }

            try {
                if (rpcSystem.isOwned()) {
                    rpcSystem.deref().close();
                }
            } catch (Exception e) {
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }

            if (exception != null) {
                throw exception;
            }
        }
    }

    @Nonnull
    private CompletableFuture terminateRpcServices() {
        synchronized (lock) {
            final int numRpcServices = 1 + rpcServices.size();

            final Collection> rpcTerminationFutures =
                    new ArrayList<>(numRpcServices);

            rpcTerminationFutures.add(commonRpcService.closeAsync());

            for (RpcService rpcService : rpcServices) {
                rpcTerminationFutures.add(rpcService.closeAsync());
            }

            commonRpcService = null;
            rpcServices.clear();

            return FutureUtils.completeAll(rpcTerminationFutures);
        }
    }

    /**
     * Shuts down the I/O executor without blocking the caller, if the executor exists.
     *
     * @param executorShutdownTimeoutMillis shutdown timeout in milliseconds
     * @return future of the executor shutdown, or an already completed future if there is no
     *     executor
     */
    private CompletableFuture<Void> terminateExecutors(long executorShutdownTimeoutMillis) {
        synchronized (lock) {
            if (ioExecutor != null) {
                return ExecutorUtils.nonBlockingShutdown(
                        executorShutdownTimeoutMillis, TimeUnit.MILLISECONDS, ioExecutor);
            } else {
                return CompletableFuture.completedFuture(null);
            }
        }
    }

    /**
     * Deletes the working directory of the mini cluster, if one has been set.
     *
     * @throws IOException if deleting the working directory fails
     */
    private void deleteDirectories() throws IOException {
        synchronized (lock) {
            if (workingDirectory == null) {
                // nothing was created, nothing to clean up
                return;
            }
            workingDirectory.delete();
        }
    }

    /**
     * Asks the resource manager to release the cluster partitions of the given cluster dataset.
     *
     * @param clusterDatasetId id of the cluster dataset to invalidate
     * @return future which completes once the resource manager has processed the release request
     */
    public CompletableFuture<Void> invalidateClusterDataset(AbstractID clusterDatasetId) {
        // thenCompose flattens the nested future directly
        return resourceManagerGatewayRetriever
                .getFuture()
                .thenCompose(
                        resourceManagerGateway ->
                                resourceManagerGateway.releaseClusterPartitions(
                                        new IntermediateDataSetID(clusterDatasetId)));
    }

    public CompletableFuture> listCompletedClusterDatasetIds() {
        return resourceManagerGatewayRetriever
                .getFuture()
                .thenApply(ClusterPartitionManager::listDataSets)
                .thenCompose(
                        metaInfoMapFuture ->
                                metaInfoMapFuture.thenApply(
                                        metaInfoMap -> new HashSet<>(metaInfoMap.keySet())));
    }

    /**
     * Reports a job client heartbeat for the given job to the dispatcher.
     *
     * @param jobId id of the job
     * @param expiredTimestamp expiration timestamp forwarded with the heartbeat
     * @return future which completes once the dispatcher has processed the heartbeat
     */
    public CompletableFuture<Void> reportHeartbeat(JobID jobId, long expiredTimestamp) {
        return runDispatcherCommand(
                dispatcherGateway ->
                        dispatcherGateway.reportJobClientHeartbeat(
                                jobId, expiredTimestamp, rpcTimeout));
    }

    /** Internal factory for {@link RpcService}. */
    protected interface RpcServiceFactory {
        /**
         * Provides an {@link RpcService}; whether a new or a shared instance is returned depends on
         * the implementation.
         *
         * @return the RPC service
         * @throws Exception if the RPC service could not be provided
         */
        RpcService createRpcService() throws Exception;
    }

    /** Factory which hands out the same, common {@link RpcService} on every call. */
    protected static class CommonRpcServiceFactory implements RpcServiceFactory {

        /** The shared service returned by every call to {@link #createRpcService()}. */
        private final RpcService sharedRpcService;

        CommonRpcServiceFactory(RpcService commonRpcService) {
            sharedRpcService = commonRpcService;
        }

        /** Returns the shared service; no new service is created. */
        @Override
        public RpcService createRpcService() {
            return sharedRpcService;
        }
    }

    /**
     * Factory which creates a new remote {@link RpcService} on every call and registers it with
     * the enclosing mini cluster.
     */
    protected class DedicatedRpcServiceFactory implements RpcServiceFactory {

        private final Configuration configuration;
        private final String externalAddress;
        private final String externalPortRange;
        private final String bindAddress;
        private final RpcSystem rpcSystem;

        DedicatedRpcServiceFactory(
                Configuration configuration,
                String externalAddress,
                String externalPortRange,
                String bindAddress,
                RpcSystem rpcSystem) {
            this.rpcSystem = rpcSystem;
            this.bindAddress = bindAddress;
            this.externalPortRange = externalPortRange;
            this.externalAddress = externalAddress;
            this.configuration = configuration;
        }

        /**
         * Creates a remote RPC service from the stored settings and adds it to the mini cluster's
         * list of RPC services.
         */
        @Override
        public RpcService createRpcService() throws Exception {
            final RpcService dedicatedRpcService =
                    MiniCluster.this.createRemoteRpcService(
                            configuration,
                            externalAddress,
                            externalPortRange,
                            bindAddress,
                            rpcSystem);

            // the list of RPC services is guarded by the mini cluster lock
            synchronized (lock) {
                rpcServices.add(dedicatedRpcService);
            }

            return dedicatedRpcService;
        }
    }

    // ------------------------------------------------------------------------
    //  miscellaneous utilities
    // ------------------------------------------------------------------------

    /**
     * Passes the given configuration to {@link
     * FileOutputFormat#initDefaultsFromConfiguration(Configuration)}.
     *
     * @param configuration configuration handed to the file output format initialization
     */
    private void initializeIOFormatClasses(Configuration configuration) {
        // TODO: remove the need for this explicit static initialization call
        FileOutputFormat.initDefaultsFromConfiguration(configuration);
    }

    /**
     * {@link FatalErrorHandler} tied to a single task manager. On a fatal error it logs the
     * failure and calls {@code closeAsync()} on the entry at {@code index} in {@code taskManagers}.
     */
    private class TerminatingFatalErrorHandler implements FatalErrorHandler {

        /** Position of the handled task manager within {@code taskManagers}. */
        private final int index;

        private TerminatingFatalErrorHandler(int index) {
            this.index = index;
        }

        @Override
        public void onFatalError(Throwable exception) {
            // errors reported while the cluster is not running are ignored
            if (!running) {
                return;
            }

            LOG.error("TaskManager #{} failed.", index, exception);

            // the task manager list is accessed under the cluster lock
            synchronized (lock) {
                taskManagers.get(index).closeAsync();
            }
        }
    }

    /**
     * {@link FatalErrorHandler} that reacts to a fatal error by logging it and triggering an
     * asynchronous shutdown of the enclosing {@link MiniCluster}.
     */
    private class ShutDownFatalErrorHandler implements FatalErrorHandler {

        @Override
        public void onFatalError(Throwable exception) {
            LOG.warn("Error in MiniCluster. Shutting the MiniCluster down.", exception);
            // the future returned by the shutdown is intentionally not awaited here
            MiniCluster.this.closeAsync();
        }
    }

    /** Creates {@link TerminatingFatalErrorHandler} instances for individual task managers. */
    private class TerminatingFatalErrorHandlerFactory {

        /**
         * Create a new {@link TerminatingFatalErrorHandler} for the {@link TaskExecutor} with the
         * given index.
         *
         * <p>The method is annotated as guarded by {@code lock}.
         *
         * @param index into the {@link #taskManagers} collection to identify the correct {@link
         *     TaskExecutor}.
         * @return {@link TerminatingFatalErrorHandler} for the given index
         */
        @GuardedBy("lock")
        private TerminatingFatalErrorHandler create(int index) {
            return new TerminatingFatalErrorHandler(index);
        }
    }

    /** HA Services to use. */
    public enum HaServices {
        /** Uses the configured HA Services in {@link HighAvailabilityOptions#HA_MODE} option. */
        CONFIGURED,

        /**
         * Enables or disables {@link HaLeadershipControl} in {@link
         * MiniCluster#getHaLeadershipControl}.
         *
         * <p>{@link HaLeadershipControl} allows granting and revoking leadership of HA components.
         * Enabling this feature disables {@link HighAvailabilityOptions#HA_MODE} option.
         */
        WITH_LEADERSHIP_CONTROL
    }

    /**
     * {@link HighAvailabilityServicesFactory} for setups that are not truly highly available and
     * depend on sharing one single {@link HighAvailabilityServices} object. The object is created
     * by the supplied callback on the first request and returned on every later request.
     */
    private static class SingletonHighAvailabilityServicesFactory
            implements HighAvailabilityServicesFactory {

        /** Callback that builds the HA services when they are first requested. */
        private final BiFunctionWithException<
                        Configuration, Executor, HighAvailabilityServices, Exception>
                creationCallback;

        /** The cached HA services; {@code null} until the first successful creation. */
        @Nullable private HighAvailabilityServices haServices;

        public SingletonHighAvailabilityServicesFactory(
                BiFunctionWithException<
                                Configuration, Executor, HighAvailabilityServices, Exception>
                        creationCallback) {
            this.creationCallback = creationCallback;
        }

        /**
         * Returns the cached HA services, invoking the creation callback if none are cached yet.
         *
         * @param configuration configuration passed to the creation callback
         * @param executor executor passed to the creation callback
         * @return the single HA services instance
         * @throws Exception if the creation callback fails
         */
        @Override
        public HighAvailabilityServices createHAServices(
                Configuration configuration, Executor executor) throws Exception {
            if (haServices != null) {
                return haServices;
            }

            haServices = creationCallback.apply(configuration, executor);
            return haServices;
        }
    }

    /**
     * {@link HighAvailabilityServicesFactory} that delegates to {@link
     * HighAvailabilityServicesUtils#createAvailableOrEmbeddedServices}, passing a new {@link
     * ShutDownFatalErrorHandler} as the fatal error handler.
     */
    private class RegularHighAvailabilityServicesFactory
            implements HighAvailabilityServicesFactory {

        @Override
        public HighAvailabilityServices createHAServices(
                Configuration configuration, Executor executor) throws Exception {
            // each created HA services instance gets its own shutdown handler
            final ShutDownFatalErrorHandler shutDownHandler = new ShutDownFatalErrorHandler();

            return HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(
                    configuration, executor, shutDownHandler);
        }
    }
}