// org.apache.flink.runtime.minicluster.MiniCluster Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.minicluster;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.api.common.io.FileOutputFormat;
import org.apache.flink.configuration.ClusterOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ConfigurationUtils;
import org.apache.flink.configuration.HighAvailabilityOptions;
import org.apache.flink.configuration.IllegalConfigurationException;
import org.apache.flink.configuration.StateRecoveryOptions;
import org.apache.flink.core.execution.CheckpointType;
import org.apache.flink.core.execution.RecoveryClaimMode;
import org.apache.flink.core.execution.SavepointFormatType;
import org.apache.flink.runtime.blob.BlobCacheService;
import org.apache.flink.runtime.blob.BlobClient;
import org.apache.flink.runtime.blob.BlobServer;
import org.apache.flink.runtime.blob.BlobUtils;
import org.apache.flink.runtime.client.ClientUtils;
import org.apache.flink.runtime.client.JobExecutionException;
import org.apache.flink.runtime.client.JobStatusMessage;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.dispatcher.DispatcherGateway;
import org.apache.flink.runtime.dispatcher.DispatcherId;
import org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore;
import org.apache.flink.runtime.dispatcher.TriggerSavepointMode;
import org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils;
import org.apache.flink.runtime.entrypoint.ClusterInformation;
import org.apache.flink.runtime.entrypoint.WorkingDirectory;
import org.apache.flink.runtime.entrypoint.component.DefaultDispatcherResourceManagerComponentFactory;
import org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent;
import org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory;
import org.apache.flink.runtime.executiongraph.AccessExecutionGraph;
import org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph;
import org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider;
import org.apache.flink.runtime.heartbeat.HeartbeatServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesFactory;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils;
import org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServices;
import org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServicesWithLeadershipControl;
import org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl;
import org.apache.flink.runtime.io.network.partition.ClusterPartitionManager;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings;
import org.apache.flink.runtime.jobmanager.HighAvailabilityMode;
import org.apache.flink.runtime.jobmaster.JobResult;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.webmonitor.ClusterOverview;
import org.apache.flink.runtime.metrics.MetricRegistry;
import org.apache.flink.runtime.metrics.MetricRegistryConfiguration;
import org.apache.flink.runtime.metrics.MetricRegistryImpl;
import org.apache.flink.runtime.metrics.ReporterSetup;
import org.apache.flink.runtime.metrics.TraceReporterSetup;
import org.apache.flink.runtime.metrics.groups.ProcessMetricGroup;
import org.apache.flink.runtime.metrics.util.MetricUtils;
import org.apache.flink.runtime.operators.coordination.CoordinationRequest;
import org.apache.flink.runtime.operators.coordination.CoordinationResponse;
import org.apache.flink.runtime.resourcemanager.ResourceManagerGateway;
import org.apache.flink.runtime.resourcemanager.ResourceManagerId;
import org.apache.flink.runtime.resourcemanager.ResourceOverview;
import org.apache.flink.runtime.resourcemanager.StandaloneResourceManagerFactory;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.RpcService;
import org.apache.flink.runtime.rpc.RpcSystem;
import org.apache.flink.runtime.rpc.RpcUtils;
import org.apache.flink.runtime.scheduler.ExecutionGraphInfo;
import org.apache.flink.runtime.security.token.DefaultDelegationTokenManagerFactory;
import org.apache.flink.runtime.security.token.DelegationTokenManager;
import org.apache.flink.runtime.security.token.DelegationTokenReceiverRepository;
import org.apache.flink.runtime.taskexecutor.TaskExecutor;
import org.apache.flink.runtime.taskexecutor.TaskManagerRunner;
import org.apache.flink.runtime.webmonitor.retriever.LeaderRetriever;
import org.apache.flink.runtime.webmonitor.retriever.MetricQueryServiceRetriever;
import org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever;
import org.apache.flink.runtime.webmonitor.retriever.impl.RpcMetricQueryServiceRetriever;
import org.apache.flink.util.AbstractID;
import org.apache.flink.util.AutoCloseableAsync;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.ExecutorUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.InstantiationUtil;
import org.apache.flink.util.Reference;
import org.apache.flink.util.SerializedValue;
import org.apache.flink.util.concurrent.ExecutorThreadFactory;
import org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy;
import org.apache.flink.util.concurrent.FutureUtils;
import org.apache.flink.util.function.BiFunctionWithException;
import org.apache.flink.util.function.FunctionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URI;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import static org.apache.flink.configuration.ClusterOptions.PROCESS_WORKING_DIR_BASE;
import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;
/** MiniCluster to execute Flink jobs locally. */
public class MiniCluster implements AutoCloseableAsync {
private static final Logger LOG = LoggerFactory.getLogger(MiniCluster.class);
/** The lock to guard startup / shutdown / manipulation methods. */
private final Object lock = new Object();
/** The configuration for this mini cluster. */
private final MiniClusterConfiguration miniClusterConfiguration;
private final Duration rpcTimeout;
@GuardedBy("lock")
private final List taskManagers;
private final TerminatingFatalErrorHandlerFactory
taskManagerTerminatingFatalErrorHandlerFactory =
new TerminatingFatalErrorHandlerFactory();
private final Supplier> rpcSystemSupplier;
private CompletableFuture terminationFuture;
@GuardedBy("lock")
private MetricRegistryImpl metricRegistry;
@GuardedBy("lock")
private ProcessMetricGroup processMetricGroup;
@GuardedBy("lock")
private RpcService commonRpcService;
@GuardedBy("lock")
private ExecutorService ioExecutor;
@GuardedBy("lock")
private final Collection rpcServices;
@GuardedBy("lock")
private HighAvailabilityServicesFactory haServicesFactory;
@GuardedBy("lock")
private HighAvailabilityServices haServices;
@GuardedBy("lock")
private BlobServer blobServer;
@GuardedBy("lock")
private HeartbeatServices heartbeatServices;
@GuardedBy("lock")
private DelegationTokenManager delegationTokenManager;
@GuardedBy("lock")
private DelegationTokenReceiverRepository delegationTokenReceiverRepository;
@GuardedBy("lock")
private BlobCacheService blobCacheService;
@GuardedBy("lock")
private LeaderRetrievalService resourceManagerLeaderRetriever;
@GuardedBy("lock")
private LeaderRetrievalService dispatcherLeaderRetriever;
@GuardedBy("lock")
private LeaderRetrievalService clusterRestEndpointLeaderRetrievalService;
@GuardedBy("lock")
private Collection dispatcherResourceManagerComponents;
@GuardedBy("lock")
private RpcGatewayRetriever dispatcherGatewayRetriever;
@GuardedBy("lock")
private RpcGatewayRetriever
resourceManagerGatewayRetriever;
@GuardedBy("lock")
private LeaderRetriever webMonitorLeaderRetriever;
@GuardedBy("lock")
private RpcServiceFactory taskManagerRpcServiceFactory;
@GuardedBy("lock")
private WorkingDirectory workingDirectory;
/** Flag marking the mini cluster as started/running. */
private volatile boolean running;
@GuardedBy("lock")
private Reference rpcSystem;
// ------------------------------------------------------------------------
/**
* Creates a new Flink mini cluster based on the given configuration.
*
* @param miniClusterConfiguration The configuration for the mini cluster
*/
public MiniCluster(MiniClusterConfiguration miniClusterConfiguration) {
this(
miniClusterConfiguration,
() -> Reference.owned(RpcSystem.load(miniClusterConfiguration.getConfiguration())));
}
/**
 * Creates a new Flink mini cluster based on the given configuration and RPC system supplier.
 *
 * <p>The constructor only sets up bookkeeping state; no service is started here.
 *
 * @param miniClusterConfiguration The configuration for the mini cluster
 * @param rpcSystemSupplier supplier that provides the RPC system when the cluster is started
 */
public MiniCluster(
        MiniClusterConfiguration miniClusterConfiguration,
        Supplier<Reference<RpcSystem>> rpcSystemSupplier) {
    this.miniClusterConfiguration =
            checkNotNull(miniClusterConfiguration, "config may not be null");
    // fail fast instead of surfacing a NullPointerException only when the cluster is started
    this.rpcSystemSupplier =
            checkNotNull(rpcSystemSupplier, "rpcSystemSupplier may not be null");

    this.rpcServices =
            new ArrayList<>(
                    1
                            + 2
                            + miniClusterConfiguration
                                    .getNumTaskManagers()); // common + JM + RM + TMs
    this.dispatcherResourceManagerComponents = new ArrayList<>(1);

    // There shouldn't be any lost messages between the MiniCluster and the Flink components
    // since they all run in the same process.
    this.rpcTimeout = RpcUtils.INF_TIMEOUT;

    // a not-yet-started cluster counts as already terminated
    this.terminationFuture = CompletableFuture.completedFuture(null);
    running = false;

    this.taskManagers = new ArrayList<>(miniClusterConfiguration.getNumTaskManagers());
}
/**
 * Returns the address of the REST endpoint as reported by the web monitor leader retriever.
 *
 * @return future completing with the URI built from the leader's address
 */
public CompletableFuture<URI> getRestAddress() {
    synchronized (lock) {
        checkState(running, "MiniCluster is not yet running or has already been shut down.");
        return webMonitorLeaderRetriever
                .getLeaderFuture()
                .thenApply(
                        // new URI(...) throws a checked exception, hence the unchecked wrapper
                        FunctionUtils.uncheckedFunction(
                                addressLeaderIdTuple -> new URI(addressLeaderIdTuple.f0)));
    }
}
/**
 * Describes how to reach the blob server of this mini cluster.
 *
 * @return cluster information pointing at "localhost" and the blob server's port
 */
public ClusterInformation getClusterInformation() {
    synchronized (lock) {
        checkState(running, "MiniCluster is not yet running or has already been shut down.");
        final int blobServerPort = blobServer.getPort();
        return new ClusterInformation("localhost", blobServerPort);
    }
}
/**
 * Exposes the I/O executor of this mini cluster. The field is read without acquiring the
 * lock.
 *
 * @return the current I/O executor
 */
protected Executor getIOExecutor() {
    final Executor currentIoExecutor = ioExecutor;
    return currentIoExecutor;
}
// ------------------------------------------------------------------------
// life cycle
// ------------------------------------------------------------------------
/**
 * Checks if the mini cluster was started and is running.
 *
 * @return the current value of the volatile running flag
 */
public boolean isRunning() {
    final boolean currentlyRunning = running;
    return currentlyRunning;
}
/**
 * Starts the mini cluster, based on the configured properties.
 *
 * The whole startup runs while holding the lock. Services are brought up in this order:
 * working directory, RPC system, metric registry, RPC services, process metrics, I/O
 * executor, delegation tokens, high-availability services, blob server and blob cache,
 * TaskManagers, dispatcher/resource manager components and finally the leader retrievers.
 * Only after all of that succeeded is the cluster marked as running.
 *
 * @throws Exception This method passes on any exception that occurs during the startup of the
 * mini cluster.
 */
public void start() throws Exception {
synchronized (lock) {
checkState(!running, "MiniCluster is already running");
LOG.info("Starting Flink Mini Cluster");
LOG.debug("Using configuration {}", miniClusterConfiguration);
final Configuration configuration = miniClusterConfiguration.getConfiguration();
// SHARED: one RPC service for all components; otherwise one dedicated service per component
final boolean useSingleRpcService =
miniClusterConfiguration.getRpcServiceSharing() == RpcServiceSharing.SHARED;
try {
// every start gets its own working directory, named after a freshly generated resource id
workingDirectory =
WorkingDirectory.create(
ClusterEntrypointUtils.generateWorkingDirectoryFile(
configuration,
Optional.of(PROCESS_WORKING_DIR_BASE),
"minicluster_" + ResourceID.generate()));
initializeIOFormatClasses(configuration);
rpcSystem = rpcSystemSupplier.get();
LOG.info("Starting Metrics Registry");
metricRegistry =
createMetricRegistry(
configuration,
rpcSystem.deref().getMaximumMessageSizeInBytes(configuration));
// bring up all the RPC services
LOG.info("Starting RPC Service(s)");
final RpcServiceFactory dispatcherResourceManagerComponentRpcServiceFactory;
final RpcService metricQueryServiceRpcService;
if (useSingleRpcService) {
// we always need the 'commonRpcService' for auxiliary calls
commonRpcService = createLocalRpcService(configuration, rpcSystem.deref());
// TaskManagers and the dispatcher/resource manager all reuse the common RPC service
final CommonRpcServiceFactory commonRpcServiceFactory =
new CommonRpcServiceFactory(commonRpcService);
taskManagerRpcServiceFactory = commonRpcServiceFactory;
dispatcherResourceManagerComponentRpcServiceFactory = commonRpcServiceFactory;
metricQueryServiceRpcService =
MetricUtils.startLocalMetricsRpcService(
configuration, rpcSystem.deref());
} else {
// start a new service per component, possibly with custom bind addresses
final String jobManagerExternalAddress =
miniClusterConfiguration.getJobManagerExternalAddress();
final String taskManagerExternalAddress =
miniClusterConfiguration.getTaskManagerExternalAddress();
final String jobManagerExternalPortRange =
miniClusterConfiguration.getJobManagerExternalPortRange();
final String taskManagerExternalPortRange =
miniClusterConfiguration.getTaskManagerExternalPortRange();
final String jobManagerBindAddress =
miniClusterConfiguration.getJobManagerBindAddress();
final String taskManagerBindAddress =
miniClusterConfiguration.getTaskManagerBindAddress();
dispatcherResourceManagerComponentRpcServiceFactory =
new DedicatedRpcServiceFactory(
configuration,
jobManagerExternalAddress,
jobManagerExternalPortRange,
jobManagerBindAddress,
rpcSystem.deref());
taskManagerRpcServiceFactory =
new DedicatedRpcServiceFactory(
configuration,
taskManagerExternalAddress,
taskManagerExternalPortRange,
taskManagerBindAddress,
rpcSystem.deref());
// we always need the 'commonRpcService' for auxiliary calls
// bind to the JobManager address with port 0
commonRpcService =
createRemoteRpcService(
configuration, jobManagerBindAddress, 0, rpcSystem.deref());
metricQueryServiceRpcService =
MetricUtils.startRemoteMetricsRpcService(
configuration,
commonRpcService.getAddress(),
null,
rpcSystem.deref());
}
metricRegistry.startQueryService(metricQueryServiceRpcService, null);
processMetricGroup =
MetricUtils.instantiateProcessMetricGroup(
metricRegistry,
RpcUtils.getHostname(commonRpcService),
ConfigurationUtils.getSystemResourceMetricsProbingInterval(
configuration));
// the I/O executor is needed by the delegation token manager and the HA services below
ioExecutor =
Executors.newFixedThreadPool(
ClusterEntrypointUtils.getPoolSize(configuration),
new ExecutorThreadFactory("mini-cluster-io"));
delegationTokenManager =
DefaultDelegationTokenManagerFactory.create(
configuration,
miniClusterConfiguration.getPluginManager(),
commonRpcService.getScheduledExecutor(),
ioExecutor);
// Obtaining delegation tokens and propagating them to the local JVM receivers in a
// one-time fashion is required because BlobServer may connect to external file
// systems
delegationTokenManager.obtainDelegationTokens();
delegationTokenReceiverRepository =
new DelegationTokenReceiverRepository(
configuration, miniClusterConfiguration.getPluginManager());
// the factory must be assigned first: createHighAvailabilityServices reads haServicesFactory
haServicesFactory = createHighAvailabilityServicesFactory(configuration);
haServices = createHighAvailabilityServices(configuration, ioExecutor);
blobServer =
BlobUtils.createBlobServer(
configuration,
Reference.borrowed(workingDirectory.getBlobStorageDirectory()),
haServices.createBlobStore());
blobServer.start();
heartbeatServices = HeartbeatServices.fromConfiguration(configuration);
// the blob cache talks to the blob server started above via the local host address
blobCacheService =
BlobUtils.createBlobCacheService(
configuration,
Reference.borrowed(workingDirectory.getBlobStorageDirectory()),
haServices.createBlobStore(),
new InetSocketAddress(
InetAddress.getLocalHost(), blobServer.getPort()));
// TaskManagers are started before the dispatcher/resource manager components are created
startTaskManagers();
MetricQueryServiceRetriever metricQueryServiceRetriever =
new RpcMetricQueryServiceRetriever(
metricRegistry.getMetricQueryServiceRpcService());
setupDispatcherResourceManagerComponents(
configuration,
dispatcherResourceManagerComponentRpcServiceFactory,
metricQueryServiceRetriever);
resourceManagerLeaderRetriever = haServices.getResourceManagerLeaderRetriever();
dispatcherLeaderRetriever = haServices.getDispatcherLeaderRetriever();
clusterRestEndpointLeaderRetrievalService =
haServices.getClusterRestEndpointLeaderRetriever();
// NOTE(review): presumably 21 retries with a delay starting at 5 ms and capped at 20 ms;
// confirm against ExponentialBackoffRetryStrategy's constructor
dispatcherGatewayRetriever =
new RpcGatewayRetriever<>(
commonRpcService,
DispatcherGateway.class,
DispatcherId::fromUuid,
new ExponentialBackoffRetryStrategy(
21, Duration.ofMillis(5L), Duration.ofMillis(20L)));
resourceManagerGatewayRetriever =
new RpcGatewayRetriever<>(
commonRpcService,
ResourceManagerGateway.class,
ResourceManagerId::fromUuid,
new ExponentialBackoffRetryStrategy(
21, Duration.ofMillis(5L), Duration.ofMillis(20L)));
webMonitorLeaderRetriever = new LeaderRetriever();
// the retrievers act as the listeners of the respective leader retrieval services
resourceManagerLeaderRetriever.start(resourceManagerGatewayRetriever);
dispatcherLeaderRetriever.start(dispatcherGatewayRetriever);
clusterRestEndpointLeaderRetrievalService.start(webMonitorLeaderRetriever);
} catch (Exception e) {
// cleanup everything
// NOTE(review): close() is not defined in the visible part of this class. If it delegates
// to closeAsync(), closeInternal skips the teardown because 'running' is still false at
// this point, so partially started services may not be released - confirm.
try {
close();
} catch (Exception ee) {
e.addSuppressed(ee);
}
throw e;
}
// create a new termination future
terminationFuture = new CompletableFuture<>();
// now officially mark this as running
running = true;
LOG.info("Flink Mini Cluster started successfully");
}
}
/**
 * Creates the dispatcher/resource manager components, registers them with this mini cluster
 * and arranges for the mini cluster to be closed once all of their shut-down futures have
 * completed, whether normally or exceptionally.
 *
 * @param configuration configuration handed to the component factory
 * @param dispatcherResourceManagerComponentRpcServiceFactory factory for the components' RPC
 *     service
 * @param metricQueryServiceRetriever retriever handed to the components
 * @throws Exception if the components cannot be created
 */
@GuardedBy("lock")
private void setupDispatcherResourceManagerComponents(
        Configuration configuration,
        RpcServiceFactory dispatcherResourceManagerComponentRpcServiceFactory,
        MetricQueryServiceRetriever metricQueryServiceRetriever)
        throws Exception {

    dispatcherResourceManagerComponents.addAll(
            createDispatcherResourceManagerComponents(
                    configuration,
                    dispatcherResourceManagerComponentRpcServiceFactory,
                    blobServer,
                    heartbeatServices,
                    delegationTokenManager,
                    metricRegistry,
                    metricQueryServiceRetriever,
                    new ShutDownFatalErrorHandler()));

    // wait for every component to shut down, then shut down the mini cluster itself
    FutureUtils.completeAll(
                    dispatcherResourceManagerComponents.stream()
                            .map(component -> component.getShutDownFuture())
                            .collect(Collectors.toList()))
            .whenComplete(
                    (ignored, exception) -> {
                        closeAsync();
                    });
}
@VisibleForTesting
protected Collection extends DispatcherResourceManagerComponent>
createDispatcherResourceManagerComponents(
Configuration configuration,
RpcServiceFactory rpcServiceFactory,
BlobServer blobServer,
HeartbeatServices heartbeatServices,
DelegationTokenManager delegationTokenManager,
MetricRegistry metricRegistry,
MetricQueryServiceRetriever metricQueryServiceRetriever,
FatalErrorHandler fatalErrorHandler)
throws Exception {
DispatcherResourceManagerComponentFactory dispatcherResourceManagerComponentFactory =
createDispatcherResourceManagerComponentFactory();
final DispatcherResourceManagerComponent dispatcherResourceManagerComponent =
dispatcherResourceManagerComponentFactory.create(
configuration,
ResourceID.generate(),
ioExecutor,
rpcServiceFactory.createRpcService(),
haServices,
blobServer,
heartbeatServices,
delegationTokenManager,
metricRegistry,
new MemoryExecutionGraphInfoStore(),
metricQueryServiceRetriever,
Collections.emptySet(),
fatalErrorHandler);
FutureUtils.assertNoException(
dispatcherResourceManagerComponent
.getShutDownFuture()
.thenCompose(
applicationStatus ->
dispatcherResourceManagerComponent.stopApplication(
applicationStatus, null)));
return Collections.singleton(dispatcherResourceManagerComponent);
}
/**
 * Creates the factory used to build the dispatcher/resource manager component.
 *
 * @return a session component factory backed by the standalone resource manager factory
 */
protected DispatcherResourceManagerComponentFactory
        createDispatcherResourceManagerComponentFactory() {
    final StandaloneResourceManagerFactory resourceManagerFactory =
            StandaloneResourceManagerFactory.getInstance();
    return DefaultDispatcherResourceManagerComponentFactory.createSessionComponentFactory(
            resourceManagerFactory);
}
/**
 * Selects the factory for the high-availability services of this mini cluster.
 *
 * <p>The HA services mode of the mini cluster configuration takes precedence: with leadership
 * control enabled, embedded HA services with leadership control are used. In configured mode
 * the high-availability mode of the Flink configuration decides between embedded and regular
 * HA services.
 *
 * @param configuration Flink configuration from which the high-availability mode is read
 * @return the factory creating the HA services
 * @throws IllegalConfigurationException if the mini cluster's HA services mode is unknown
 */
private HighAvailabilityServicesFactory createHighAvailabilityServicesFactory(
        Configuration configuration) {
    final HaServices haServicesMode = miniClusterConfiguration.getHaServices();

    if (haServicesMode == HaServices.WITH_LEADERSHIP_CONTROL) {
        // special feature of MiniClusters to allow the control of leadership
        // EmbeddedLeaderElection requires a single instance for leader election across multiple
        // JobManager instances on the same JVM (after FLINK-24038 was introduced); therefore,
        // SingletonHighAvailabilityServicesFactory is utilized here
        return new SingletonHighAvailabilityServicesFactory(
                (ignoredConfig, leaderElectionExecutor) ->
                        new EmbeddedHaServicesWithLeadershipControl(leaderElectionExecutor));
    }

    if (haServicesMode != HaServices.CONFIGURED) {
        throw new IllegalConfigurationException(
                "Unknown HA Services Mode configured in MiniCluster configuration: "
                        + haServicesMode);
    }

    if (HighAvailabilityMode.fromConfig(configuration) != HighAvailabilityMode.NONE) {
        return new RegularHighAvailabilityServicesFactory();
    }

    // basic EmbeddedLeaderElection requires a single instance for leader election across
    // multiple JobManager instances on the same JVM (after FLINK-24038 was introduced);
    // therefore, SingletonHighAvailabilityServicesFactory is utilized here
    return new SingletonHighAvailabilityServicesFactory(
            (ignoredConfig, leaderElectionExecutor) ->
                    new EmbeddedHaServices(leaderElectionExecutor));
}
/**
 * Creates the high-availability services through the previously selected factory.
 *
 * @param configuration configuration handed to the factory
 * @param executor executor handed to the factory
 * @return the created HA services
 * @throws Exception if the factory fails to create the services
 */
@VisibleForTesting
protected HighAvailabilityServices createHighAvailabilityServices(
        Configuration configuration, Executor executor) throws Exception {
    final HighAvailabilityServices createdHaServices =
            haServicesFactory.createHAServices(configuration, executor);
    return createdHaServices;
}
/**
 * Returns {@link HaLeadershipControl} if enabled.
 *
 * <p>{@link HaLeadershipControl} allows granting and revoking leadership of HA components, e.g.
 * JobManager. The method returns {@link Optional#empty()} if the control is not enabled in
 * {@link MiniClusterConfiguration}.
 *
 * <p>Enabling this feature disables {@link HighAvailabilityOptions#HA_MODE} option.
 *
 * @return the leadership control if the current HA services provide it, empty otherwise
 */
public Optional<HaLeadershipControl> getHaLeadershipControl() {
    synchronized (lock) {
        return haServices instanceof HaLeadershipControl
                ? Optional.of((HaLeadershipControl) haServices)
                : Optional.empty();
    }
}
/**
 * Exposes the high-availability services of this mini cluster. The field is read without
 * acquiring the lock.
 *
 * @return the current HA services
 */
protected HighAvailabilityServices getHaServices() {
    final HighAvailabilityServices currentHaServices = haServices;
    return currentHaServices;
}
/**
 * Shuts down the mini cluster, failing all currently executing jobs. The mini cluster can be
 * started again by calling the {@link #start()} method again.
 *
 * <p>This method shuts down all started services and components, even if an exception occurs in
 * the process of shutting down some component.
 *
 * <p>High-availability data is cleaned up as part of this shutdown.
 *
 * @return Future which is completed once the MiniCluster has been completely shut down
 */
@Override
public CompletableFuture<Void> closeAsync() {
    return closeInternal(true);
}
/**
 * Shuts down the mini cluster like {@link #closeAsync()}, but requests that the
 * high-availability data is not cleaned up.
 *
 * @return Future which is completed once the MiniCluster has been completely shut down
 */
public CompletableFuture<Void> closeAsyncWithoutCleaningHighAvailabilityData() {
    return closeInternal(false);
}
private CompletableFuture closeInternal(boolean cleanupHaData) {
synchronized (lock) {
if (running) {
LOG.info("Shutting down Flink Mini Cluster");
try {
final long shutdownTimeoutMillis =
miniClusterConfiguration
.getConfiguration()
.get(ClusterOptions.CLUSTER_SERVICES_SHUTDOWN_TIMEOUT)
.toMillis();
final int numComponents = 2 + miniClusterConfiguration.getNumTaskManagers();
final Collection> componentTerminationFutures =
new ArrayList<>(numComponents);
componentTerminationFutures.addAll(terminateTaskManagers());
componentTerminationFutures.add(shutDownResourceManagerComponents());
final FutureUtils.ConjunctFuture componentsTerminationFuture =
FutureUtils.completeAll(componentTerminationFutures);
final CompletableFuture metricSystemTerminationFuture =
FutureUtils.composeAfterwards(
componentsTerminationFuture, this::closeMetricSystem);
final CompletableFuture rpcServicesTerminationFuture =
FutureUtils.composeAfterwards(
metricSystemTerminationFuture, this::terminateRpcServices);
final CompletableFuture remainingServicesTerminationFuture =
FutureUtils.runAfterwards(
rpcServicesTerminationFuture,
() -> terminateMiniClusterServices(cleanupHaData));
final CompletableFuture executorsTerminationFuture =
FutureUtils.composeAfterwards(
remainingServicesTerminationFuture,
() -> terminateExecutors(shutdownTimeoutMillis));
final CompletableFuture deleteDirectoriesFuture =
FutureUtils.runAfterwards(
executorsTerminationFuture, this::deleteDirectories);
deleteDirectoriesFuture.whenComplete(
(Void ignored, Throwable throwable) -> {
if (throwable != null) {
terminationFuture.completeExceptionally(
ExceptionUtils.stripCompletionException(throwable));
} else {
terminationFuture.complete(null);
}
});
} finally {
running = false;
}
}
return terminationFuture;
}
}
private CompletableFuture closeMetricSystem() {
synchronized (lock) {
final ArrayList> terminationFutures = new ArrayList<>(2);
if (processMetricGroup != null) {
processMetricGroup.close();
processMetricGroup = null;
}
// metrics shutdown
if (metricRegistry != null) {
terminationFutures.add(metricRegistry.closeAsync());
metricRegistry = null;
}
return FutureUtils.completeAll(terminationFutures);
}
}
/**
 * Starts as many TaskManagers as the mini cluster configuration asks for.
 *
 * @throws Exception if starting one of the TaskManagers fails
 */
@GuardedBy("lock")
private void startTaskManagers() throws Exception {
    final int numTaskManagers = miniClusterConfiguration.getNumTaskManagers();
    LOG.info("Starting {} TaskManager(s)", numTaskManagers);

    int remaining = numTaskManagers;
    while (remaining > 0) {
        startTaskManager();
        remaining--;
    }
}
/**
 * Starts an additional TaskManager.
 *
 * <p>When the MiniCluster starts up, it always starts {@link
 * MiniClusterConfiguration#getNumTaskManagers} TaskManagers. TaskManagers are indexed from 0 in
 * the order in which they were started, so the TaskManager started here receives the number of
 * TaskManagers started so far as its index. Indices only ever grow: the index of a TaskManager
 * terminated with {@link #terminateTaskManager(int)} is not handed out again.
 *
 * @throws Exception if the TaskManager cannot be created or started
 */
public void startTaskManager() throws Exception {
    synchronized (lock) {
        final Configuration configuration = miniClusterConfiguration.getConfiguration();
        // the index of the new TaskManager names its working directory and error handler
        final int taskManagerIndex = taskManagers.size();
        final ResourceID resourceId = new ResourceID(UUID.randomUUID().toString());

        final TaskExecutor taskExecutor =
                TaskManagerRunner.startTaskManager(
                        configuration,
                        resourceId,
                        taskManagerRpcServiceFactory.createRpcService(),
                        haServices,
                        heartbeatServices,
                        metricRegistry,
                        blobCacheService,
                        useLocalCommunication(),
                        ExternalResourceInfoProvider.NO_EXTERNAL_RESOURCES,
                        workingDirectory.createSubWorkingDirectory("tm_" + taskManagerIndex),
                        taskManagerTerminatingFatalErrorHandlerFactory.create(taskManagerIndex),
                        delegationTokenReceiverRepository);

        taskExecutor.start();
        taskManagers.add(taskExecutor);
    }
}
/**
 * Decides the local-communication flag passed to newly started TaskManagers.
 *
 * @return {@code true} if exactly one TaskManager is configured
 */
@VisibleForTesting
protected boolean useLocalCommunication() {
    final int configuredTaskManagers = miniClusterConfiguration.getNumTaskManagers();
    return configuredTaskManagers == 1;
}
/**
 * Returns the Flink configuration held by the mini cluster configuration.
 *
 * @return the configuration of this mini cluster
 */
@VisibleForTesting
public Configuration getConfiguration() {
    final Configuration flinkConfiguration = miniClusterConfiguration.getConfiguration();
    return flinkConfiguration;
}
// HACK: temporary hack to make the changelog state backend tests work with forced
// full snapshots. This option should be removed once changelog state backend supports forced
// full snapshots
@Internal
private boolean overrideRestoreModeForChangelogStateBackend;

/** Switches the restore-mode override for the changelog state backend on; it cannot be undone. */
@Internal
public void overrideRestoreModeForChangelogStateBackend() {
    overrideRestoreModeForChangelogStateBackend = true;
}
@GuardedBy("lock")
private Collection extends CompletableFuture> terminateTaskManagers() {
final Collection> terminationFutures =
new ArrayList<>(taskManagers.size());
for (int i = 0; i < taskManagers.size(); i++) {
terminationFutures.add(terminateTaskManager(i));
}
return terminationFutures;
}
/**
 * Terminates a TaskManager with the given index.
 *
 * <p>See {@link #startTaskManager()} to understand how TaskManagers are indexed. This method
 * terminates a TaskManager with a given index but it does not clear the index. The index stays
 * occupied for the lifetime of the MiniCluster and its TaskManager stays terminated. The index
 * is not reused if more TaskManagers are started with {@link #startTaskManager()}.
 *
 * @param index index of the TaskManager to terminate
 * @return {@link CompletableFuture} of the given TaskManager termination
 * @throws IndexOutOfBoundsException if no TaskManager was started with the given index
 */
public CompletableFuture<Void> terminateTaskManager(int index) {
    synchronized (lock) {
        final TaskExecutor taskExecutor = taskManagers.get(index);
        return taskExecutor.closeAsync();
    }
}
// ------------------------------------------------------------------------
// Accessing jobs
// ------------------------------------------------------------------------
/**
 * Requests the execution graph information of a job from the dispatcher and extracts the
 * archived execution graph from it.
 *
 * @param jobId id of the job
 * @return future completing with the archived execution graph of the job
 */
public CompletableFuture<ArchivedExecutionGraph> getArchivedExecutionGraph(JobID jobId) {
    return runDispatcherCommand(
            dispatcherGateway ->
                    dispatcherGateway
                            .requestExecutionGraphInfo(jobId, rpcTimeout)
                            .thenApply(ExecutionGraphInfo::getArchivedExecutionGraph));
}
public CompletableFuture> listJobs() {
return runDispatcherCommand(
dispatcherGateway ->
dispatcherGateway
.requestMultipleJobDetails(rpcTimeout)
.thenApply(
jobs ->
jobs.getJobs().stream()
.map(
details ->
new JobStatusMessage(
details.getJobId(),
details
.getJobName(),
details.getStatus(),
details
.getStartTime()))
.collect(Collectors.toList())));
}
/**
 * Requests the status of a job from the dispatcher.
 *
 * @param jobId id of the job
 * @return future completing with the job's status
 */
public CompletableFuture<JobStatus> getJobStatus(JobID jobId) {
    return runDispatcherCommand(
            dispatcherGateway -> dispatcherGateway.requestJobStatus(jobId, rpcTimeout));
}
/**
 * Asks the dispatcher to cancel a job.
 *
 * @param jobId id of the job to cancel
 * @return future completing with the dispatcher's acknowledgement
 */
public CompletableFuture<Acknowledge> cancelJob(JobID jobId) {
    return runDispatcherCommand(
            dispatcherGateway -> dispatcherGateway.cancelJob(jobId, rpcTimeout));
}
/**
 * Triggers a savepoint for a job and waits for its location.
 *
 * @param jobId id of the job
 * @param targetDirectory directory passed to the dispatcher as savepoint target
 * @param cancelJob if {@code true} the savepoint is triggered in cancel-with-savepoint mode,
 *     otherwise in plain savepoint mode
 * @param formatType format of the savepoint
 * @return future completing with the savepoint location reported by the dispatcher
 */
public CompletableFuture<String> triggerSavepoint(
        JobID jobId,
        String targetDirectory,
        boolean cancelJob,
        SavepointFormatType formatType) {
    return runDispatcherCommand(
            dispatcherGateway ->
                    dispatcherGateway.triggerSavepointAndGetLocation(
                            jobId,
                            targetDirectory,
                            formatType,
                            cancelJob
                                    ? TriggerSavepointMode.CANCEL_WITH_SAVEPOINT
                                    : TriggerSavepointMode.SAVEPOINT,
                            rpcTimeout));
}
/**
 * Triggers a savepoint for a job without waiting for it to finish.
 *
 * <p>The future returned by the dispatcher is deliberately dropped, so a failure of the
 * savepoint itself is not reported through the returned future.
 *
 * @param jobId id of the job
 * @param targetDirectory directory passed to the dispatcher as savepoint target
 * @param cancelJob if {@code true} the savepoint is triggered in cancel-with-savepoint mode,
 *     otherwise in plain savepoint mode
 * @param formatType format of the savepoint
 * @return future completing with an empty string as soon as the request has been issued
 */
public CompletableFuture<String> triggerDetachedSavepoint(
        JobID jobId,
        String targetDirectory,
        boolean cancelJob,
        SavepointFormatType formatType) {
    return runDispatcherCommand(
            dispatcherGateway -> {
                dispatcherGateway.triggerSavepointAndGetLocation(
                        jobId,
                        targetDirectory,
                        formatType,
                        cancelJob
                                ? TriggerSavepointMode.CANCEL_WITH_SAVEPOINT
                                : TriggerSavepointMode.SAVEPOINT,
                        rpcTimeout);
                // return immediately, no need to wait for the future savepoint path
                return CompletableFuture.completedFuture("");
            });
}
/**
 * Asks the dispatcher to trigger a checkpoint for a job.
 *
 * @param jobID id of the job
 * @return future completing with the result reported by the dispatcher
 */
public CompletableFuture<String> triggerCheckpoint(JobID jobID) {
    return runDispatcherCommand(
            dispatcherGateway -> dispatcherGateway.triggerCheckpoint(jobID, rpcTimeout));
}
/**
 * Asks the dispatcher to trigger a checkpoint of the given type for a job.
 *
 * @param jobID id of the job
 * @param checkpointType type of the checkpoint to trigger
 * @return future completing with the id of the triggered checkpoint
 */
public CompletableFuture<Long> triggerCheckpoint(JobID jobID, CheckpointType checkpointType) {
    return runDispatcherCommand(
            dispatcherGateway ->
                    dispatcherGateway.triggerCheckpointAndGetCheckpointID(
                            jobID, checkpointType, rpcTimeout));
}
/**
 * Stops a job with a savepoint and waits for the savepoint location.
 *
 * @param jobId id of the job
 * @param targetDirectory directory passed to the dispatcher as savepoint target
 * @param terminate if {@code true} the job is stopped in terminate-with-savepoint mode,
 *     otherwise in suspend-with-savepoint mode
 * @param formatType format of the savepoint
 * @return future completing with the savepoint location reported by the dispatcher
 */
public CompletableFuture<String> stopWithSavepoint(
        JobID jobId,
        String targetDirectory,
        boolean terminate,
        SavepointFormatType formatType) {
    return runDispatcherCommand(
            dispatcherGateway ->
                    dispatcherGateway.stopWithSavepointAndGetLocation(
                            jobId,
                            targetDirectory,
                            formatType,
                            terminate
                                    ? TriggerSavepointMode.TERMINATE_WITH_SAVEPOINT
                                    : TriggerSavepointMode.SUSPEND_WITH_SAVEPOINT,
                            rpcTimeout));
}
/**
 * Stops a job with a savepoint without waiting for the outcome.
 *
 * <p>The future returned by the dispatcher is discarded; the future returned here completes with
 * an empty string as soon as the stop call has been issued.
 *
 * @param jobId ID of the job to stop
 * @param targetDirectory target directory passed on to the dispatcher
 * @param terminate {@code true} to use {@code TERMINATE_WITH_SAVEPOINT}, {@code false} to use
 *     {@code SUSPEND_WITH_SAVEPOINT} as the trigger mode
 * @param formatType format type passed on to the dispatcher
 * @return future completed with an empty string once the request has been sent
 */
public CompletableFuture stopWithDetachedSavepoint(
        JobID jobId,
        String targetDirectory,
        boolean terminate,
        SavepointFormatType formatType) {
    final TriggerSavepointMode stopMode =
            terminate
                    ? TriggerSavepointMode.TERMINATE_WITH_SAVEPOINT
                    : TriggerSavepointMode.SUSPEND_WITH_SAVEPOINT;
    return runDispatcherCommand(
            gateway -> {
                gateway.stopWithSavepointAndGetLocation(
                        jobId, targetDirectory, formatType, stopMode, rpcTimeout);
                // detached: do not wait for the savepoint path, answer right away
                return CompletableFuture.completedFuture("");
            });
}
/**
 * Asks the dispatcher to dispose the savepoint stored at the given path.
 *
 * @param savepointPath path of the savepoint to dispose
 * @return future produced by the dispatcher's {@code disposeSavepoint} call
 */
public CompletableFuture disposeSavepoint(String savepointPath) {
    return runDispatcherCommand(gateway -> gateway.disposeSavepoint(savepointPath, rpcTimeout));
}
public CompletableFuture extends AccessExecutionGraph> getExecutionGraph(JobID jobId) {
return runDispatcherCommand(
dispatcherGateway -> dispatcherGateway.requestJob(jobId, rpcTimeout));
}
/**
 * Forwards a serialized coordination request to the dispatcher.
 *
 * @param jobId ID of the job the request belongs to
 * @param operatorUid UID of the operator passed on to the dispatcher
 * @param serializedRequest the serialized request to deliver
 * @return future produced by the dispatcher's {@code deliverCoordinationRequestToCoordinator}
 *     call
 */
public CompletableFuture deliverCoordinationRequestToCoordinator(
        JobID jobId,
        String operatorUid,
        SerializedValue serializedRequest) {
    return runDispatcherCommand(
            gateway ->
                    gateway.deliverCoordinationRequestToCoordinator(
                            jobId, operatorUid, serializedRequest, rpcTimeout));
}
/**
 * Requests the resource overview from the resource manager.
 *
 * @return future produced by the resource manager's {@code requestResourceOverview} call
 */
public CompletableFuture getResourceOverview() {
    return runResourceManagerCommand(gateway -> gateway.requestResourceOverview(rpcTimeout));
}
private CompletableFuture runDispatcherCommand(
Function> dispatcherCommand) {
return getDispatcherGatewayFuture()
.thenApply(dispatcherCommand)
.thenCompose(Function.identity());
}
private CompletableFuture runResourceManagerCommand(
Function> resourceManagerCommand) {
return getResourceManagerGatewayFuture()
.thenApply(resourceManagerCommand)
.thenCompose(Function.identity());
}
// ------------------------------------------------------------------------
// running jobs
// ------------------------------------------------------------------------
/**
 * This method executes a job in detached mode. The method returns as soon as the job submission
 * has completed; it does not wait for the job itself to finish.
 *
 * @param job The Flink job to execute
 * @throws JobExecutionException Thrown if the job submission failed.
 * @throws InterruptedException Thrown if the calling thread is interrupted while waiting for the
 *     submission to complete.
 */
public void runDetached(JobGraph job) throws JobExecutionException, InterruptedException {
    checkNotNull(job, "job is null");

    final CompletableFuture<JobSubmissionResult> submissionFuture = submitJob(job);

    try {
        submissionFuture.get();
    } catch (ExecutionException e) {
        // report the actual cause rather than the ExecutionException wrapper
        throw new JobExecutionException(
                job.getJobID(), ExceptionUtils.stripExecutionException(e));
    }
}
/**
 * This method runs a job in blocking mode. The job is submitted, then the method waits for the
 * job result and converts it into a {@link JobExecutionResult} using the context class loader
 * of the calling thread.
 *
 * @param job The Flink job to execute
 * @return The result of the job execution
 * @throws JobExecutionException Thrown if anything went amiss during initial job launch, if the
 *     job result could not be retrieved or converted, or if the job terminally failed.
 * @throws InterruptedException Thrown if the calling thread is interrupted while waiting for the
 *     job result.
 */
public JobExecutionResult executeJobBlocking(JobGraph job)
        throws JobExecutionException, InterruptedException {
    checkNotNull(job, "job is null");

    final CompletableFuture<JobSubmissionResult> submissionFuture = submitJob(job);

    // only ask for the result once the submission has gone through
    final CompletableFuture<JobResult> jobResultFuture =
            submissionFuture.thenCompose(
                    (JobSubmissionResult ignored) -> requestJobResult(job.getJobID()));

    final JobResult jobResult;

    try {
        jobResult = jobResultFuture.get();
    } catch (ExecutionException e) {
        throw new JobExecutionException(
                job.getJobID(),
                "Could not retrieve JobResult.",
                ExceptionUtils.stripExecutionException(e));
    }

    try {
        return jobResult.toJobExecutionResult(Thread.currentThread().getContextClassLoader());
    } catch (IOException | ClassNotFoundException e) {
        throw new JobExecutionException(job.getJobID(), e);
    }
}
/**
 * Submits a job to the dispatcher.
 *
 * <p>The job graph is cloned first, its files are uploaded via the blob server whose address is
 * obtained from the dispatcher, and only then is the clone submitted.
 *
 * @param jobGraph the job to submit; the caller's instance is not handed to the dispatcher
 * @return future completed with a {@link JobSubmissionResult} carrying the job ID once the
 *     dispatcher has acknowledged the submission
 */
public CompletableFuture<JobSubmissionResult> submitJob(JobGraph jobGraph) {
    // When MiniCluster uses the local RPC, the provided JobGraph is passed directly to the
    // Dispatcher. This means that any mutations to the JG can affect the Dispatcher behaviour,
    // so we rather clone it to guard against this.
    final JobGraph clonedJobGraph = InstantiationUtil.cloneUnchecked(jobGraph);
    checkRestoreModeForChangelogStateBackend(clonedJobGraph);

    final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture =
            getDispatcherGatewayFuture();
    final CompletableFuture<InetSocketAddress> blobServerAddressFuture =
            createBlobServerAddress(dispatcherGatewayFuture);
    final CompletableFuture<Void> jarUploadFuture =
            uploadAndSetJobFiles(blobServerAddressFuture, clonedJobGraph);

    // submit only after the upload finished; the combine yields a nested future that is
    // flattened by the subsequent thenCompose
    final CompletableFuture<Acknowledge> acknowledgeCompletableFuture =
            jarUploadFuture
                    .thenCombine(
                            dispatcherGatewayFuture,
                            (Void ack, DispatcherGateway dispatcherGateway) ->
                                    dispatcherGateway.submitJob(clonedJobGraph, rpcTimeout))
                    .thenCompose(Function.identity());

    return acknowledgeCompletableFuture.thenApply(
            (Acknowledge ignored) -> new JobSubmissionResult(clonedJobGraph.getJobID()));
}
// HACK: temporary workaround so that the randomized changelog state backend tests work with
// forced full snapshots. Remove this option once the changelog state backend supports forced
// full snapshots.
//
// When the override flag is set and the job restores in NO_CLAIM mode, the job's savepoint
// restore settings are rewritten to use the LEGACY mode instead.
private void checkRestoreModeForChangelogStateBackend(JobGraph jobGraph) {
    final SavepointRestoreSettings currentSettings = jobGraph.getSavepointRestoreSettings();
    if (!overrideRestoreModeForChangelogStateBackend
            || currentSettings.getRecoveryClaimMode() != RecoveryClaimMode.NO_CLAIM) {
        return;
    }

    // round-trip through a Configuration to swap only the restore mode
    final Configuration rewritten = new Configuration();
    SavepointRestoreSettings.toConfiguration(currentSettings, rewritten);
    rewritten.set(StateRecoveryOptions.RESTORE_MODE, RecoveryClaimMode.LEGACY);
    jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.fromConfiguration(rewritten));
}
/**
 * Requests the result of the given job from the dispatcher, using {@code RpcUtils.INF_TIMEOUT}
 * rather than the regular RPC timeout.
 *
 * @param jobId ID of the job whose result is requested
 * @return future produced by the dispatcher's {@code requestJobResult} call
 */
public CompletableFuture requestJobResult(JobID jobId) {
    return runDispatcherCommand(
            gateway -> gateway.requestJobResult(jobId, RpcUtils.INF_TIMEOUT));
}
/**
 * Requests the cluster overview from the dispatcher, using {@code RpcUtils.INF_TIMEOUT} rather
 * than the regular RPC timeout.
 *
 * @return future produced by the dispatcher's {@code requestClusterOverview} call
 */
public CompletableFuture requestClusterOverview() {
    return runDispatcherCommand(
            gateway -> gateway.requestClusterOverview(RpcUtils.INF_TIMEOUT));
}
/**
 * Returns the future of the dispatcher gateway from the dispatcher gateway retriever.
 *
 * @return future of the dispatcher gateway
 * @throws IllegalStateException if the mini cluster is not running
 */
@VisibleForTesting
protected CompletableFuture<DispatcherGateway> getDispatcherGatewayFuture() {
    synchronized (lock) {
        checkState(running, "MiniCluster is not yet running or has already been shut down.");
        return dispatcherGatewayRetriever.getFuture();
    }
}
/**
 * Returns the future of the resource manager gateway from the resource manager gateway
 * retriever.
 *
 * @return future of the resource manager gateway
 * @throws IllegalStateException if the mini cluster is not running
 */
private CompletableFuture<ResourceManagerGateway> getResourceManagerGatewayFuture() {
    synchronized (lock) {
        checkState(running, "MiniCluster is not yet running or has already been shut down.");
        return resourceManagerGatewayRetriever.getFuture();
    }
}
/**
 * Uploads the files of the given job graph once the blob server address is known.
 *
 * @param blobServerAddressFuture future of the blob server address to connect to
 * @param job job graph whose files are extracted and uploaded
 * @return future that completes when the upload has finished; a {@link FlinkException} raised
 *     by the upload is rethrown wrapped in a {@link CompletionException}
 */
private CompletableFuture<Void> uploadAndSetJobFiles(
        final CompletableFuture<InetSocketAddress> blobServerAddressFuture,
        final JobGraph job) {
    return blobServerAddressFuture.thenAccept(
            blobServerAddress -> {
                try {
                    ClientUtils.extractAndUploadJobGraphFiles(
                            job,
                            () ->
                                    new BlobClient(
                                            blobServerAddress,
                                            miniClusterConfiguration.getConfiguration()));
                } catch (FlinkException e) {
                    // checked exceptions cannot escape the lambda, so wrap it
                    throw new CompletionException(e);
                }
            });
}
/**
 * Resolves the address of the blob server from the dispatcher gateway.
 *
 * @param dispatcherGatewayFuture future of the dispatcher gateway to query
 * @return future of the socket address built from the gateway's hostname and the blob server
 *     port the gateway reports
 */
private CompletableFuture<InetSocketAddress> createBlobServerAddress(
        final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture) {
    // thenCompose flattens the nested port future directly; equivalent to thenApply followed
    // by thenCompose(Function.identity())
    return dispatcherGatewayFuture.thenCompose(
            dispatcherGateway ->
                    dispatcherGateway
                            .getBlobServerPort(rpcTimeout)
                            .thenApply(
                                    blobServerPort ->
                                            new InetSocketAddress(
                                                    dispatcherGateway.getHostname(),
                                                    blobServerPort)));
}
// ------------------------------------------------------------------------
// factories - can be overridden by subclasses to alter behavior
// ------------------------------------------------------------------------
/**
 * Factory method to create the metric registry for the mini cluster.
 *
 * <p>The registry is built from the metric registry configuration plus the reporter and trace
 * reporter setups, both of which are derived from {@code config} and the plugin manager of the
 * mini cluster configuration.
 *
 * @param config The configuration of the mini cluster
 * @param maximumMessageSizeInBytes the maximum message size
 * @return the newly created metric registry
 */
protected MetricRegistryImpl createMetricRegistry(
Configuration config, long maximumMessageSizeInBytes) {
return new MetricRegistryImpl(
MetricRegistryConfiguration.fromConfiguration(config, maximumMessageSizeInBytes),
ReporterSetup.fromConfiguration(
config, miniClusterConfiguration.getPluginManager()),
TraceReporterSetup.fromConfiguration(
config, miniClusterConfiguration.getPluginManager()));
}
/**
 * Factory method to instantiate the remote RPC service, bound to a single port.
 *
 * <p>{@code bindAddress} and {@code bindPort} are used twice: they are passed to the service
 * builder (the port as its string form) and they are set as the bind address and bind port.
 *
 * @param configuration Flink configuration.
 * @param bindAddress The address to bind the RPC service to.
 * @param bindPort The port to bind the RPC service to.
 * @param rpcSystem The RPC system whose remote service builder is used to create the service.
 * @return The instantiated RPC service
 * @throws Exception if the service builder fails to create and start the service
 */
protected RpcService createRemoteRpcService(
Configuration configuration, String bindAddress, int bindPort, RpcSystem rpcSystem)
throws Exception {
return rpcSystem
.remoteServiceBuilder(configuration, bindAddress, String.valueOf(bindPort))
.withBindAddress(bindAddress)
.withBindPort(bindPort)
.withExecutorConfiguration(RpcUtils.getTestForkJoinExecutorConfiguration())
.createAndStart();
}
/**
 * Factory method to instantiate the remote RPC service with separate external and bind
 * addresses.
 *
 * <p>Unlike the overload taking an {@code int} port, no bind port is set on the builder here.
 *
 * @param configuration Flink configuration.
 * @param externalAddress The external address to access the RPC service.
 * @param externalPortRange The external port range to access the RPC service.
 * @param bindAddress The address to bind the RPC service to.
 * @param rpcSystem The RPC system whose remote service builder is used to create the service.
 * @return The instantiated RPC service
 * @throws Exception if the service builder fails to create and start the service
 */
protected RpcService createRemoteRpcService(
Configuration configuration,
String externalAddress,
String externalPortRange,
String bindAddress,
RpcSystem rpcSystem)
throws Exception {
return rpcSystem
.remoteServiceBuilder(configuration, externalAddress, externalPortRange)
.withBindAddress(bindAddress)
.withExecutorConfiguration(RpcUtils.getTestForkJoinExecutorConfiguration())
.createAndStart();
}
/**
 * Factory method to instantiate the local RPC service.
 *
 * @param configuration Flink configuration.
 * @param rpcSystem The RPC system whose local service builder is used to create the service.
 * @return The instantiated RPC service
 * @throws Exception if the service builder fails to create and start the service
 */
protected RpcService createLocalRpcService(Configuration configuration, RpcSystem rpcSystem)
throws Exception {
return rpcSystem
.localServiceBuilder(configuration)
.withExecutorConfiguration(RpcUtils.getTestForkJoinExecutorConfiguration())
.createAndStart();
}
// ------------------------------------------------------------------------
// Internal methods
// ------------------------------------------------------------------------
@GuardedBy("lock")
private CompletableFuture shutDownResourceManagerComponents() {
final Collection> terminationFutures =
new ArrayList<>(dispatcherResourceManagerComponents.size());
for (DispatcherResourceManagerComponent dispatcherResourceManagerComponent :
dispatcherResourceManagerComponents) {
terminationFutures.add(dispatcherResourceManagerComponent.closeAsync());
}
final FutureUtils.ConjunctFuture dispatcherTerminationFuture =
FutureUtils.completeAll(terminationFutures);
return FutureUtils.runAfterwards(
dispatcherTerminationFuture,
() -> {
Exception exception = null;
synchronized (lock) {
if (resourceManagerLeaderRetriever != null) {
try {
resourceManagerLeaderRetriever.stop();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
resourceManagerLeaderRetriever = null;
}
if (dispatcherLeaderRetriever != null) {
try {
dispatcherLeaderRetriever.stop();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
dispatcherLeaderRetriever = null;
}
if (clusterRestEndpointLeaderRetrievalService != null) {
try {
clusterRestEndpointLeaderRetrievalService.stop();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
clusterRestEndpointLeaderRetrievalService = null;
}
}
if (exception != null) {
throw exception;
}
});
}
/**
 * Shuts down the blob cache service, the blob server, the high-availability services and, if
 * it is owned, the RPC system.
 *
 * <p>Every step is attempted even if an earlier one failed. The first exception is rethrown at
 * the end and all later ones are attached to it as suppressed exceptions.
 *
 * @param cleanupHaData passed on to the high-availability services when closing them
 * @throws Exception the first exception raised by any of the shutdown steps
 */
private void terminateMiniClusterServices(boolean cleanupHaData) throws Exception {
    // collect the first exception, but continue and add all successive
    // exceptions as suppressed
    Exception exception = null;

    synchronized (lock) {
        if (blobCacheService != null) {
            try {
                blobCacheService.close();
            } catch (Exception e) {
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }
            blobCacheService = null;
        }

        // shut down the blob server
        if (blobServer != null) {
            try {
                blobServer.close();
            } catch (Exception e) {
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }
            blobServer = null;
        }

        // shut down high-availability services
        if (haServices != null) {
            try {
                haServices.closeWithOptionalClean(cleanupHaData);
            } catch (Exception e) {
                // a failing HA shutdown must not skip the RPC system cleanup below, nor
                // discard exceptions collected by the earlier steps
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }
            haServices = null;
        }

        try {
            if (rpcSystem.isOwned()) {
                rpcSystem.deref().close();
            }
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }

        if (exception != null) {
            throw exception;
        }
    }
}
@Nonnull
private CompletableFuture terminateRpcServices() {
synchronized (lock) {
final int numRpcServices = 1 + rpcServices.size();
final Collection> rpcTerminationFutures =
new ArrayList<>(numRpcServices);
rpcTerminationFutures.add(commonRpcService.closeAsync());
for (RpcService rpcService : rpcServices) {
rpcTerminationFutures.add(rpcService.closeAsync());
}
commonRpcService = null;
rpcServices.clear();
return FutureUtils.completeAll(rpcTerminationFutures);
}
}
/**
 * Shuts down the I/O executor without blocking, if one exists.
 *
 * @param executorShutdownTimeoutMillis shutdown timeout in milliseconds handed to the executor
 *     shutdown utility
 * @return future of the executor shutdown, or an already completed future when there is no I/O
 *     executor
 */
private CompletableFuture terminateExecutors(long executorShutdownTimeoutMillis) {
    synchronized (lock) {
        if (ioExecutor == null) {
            return CompletableFuture.completedFuture(null);
        }
        return ExecutorUtils.nonBlockingShutdown(
                executorShutdownTimeoutMillis, TimeUnit.MILLISECONDS, ioExecutor);
    }
}
/**
 * Deletes the working directory if one has been set.
 *
 * @throws IOException if deleting the working directory fails
 */
private void deleteDirectories() throws IOException {
    synchronized (lock) {
        if (workingDirectory == null) {
            return;
        }
        workingDirectory.delete();
    }
}
/**
 * Asks the resource manager to release the cluster partitions of the given cluster dataset.
 *
 * @param clusterDatasetId ID of the cluster dataset; converted to an {@code
 *     IntermediateDataSetID} for the call
 * @return future produced by the resource manager's {@code releaseClusterPartitions} call
 */
public CompletableFuture invalidateClusterDataset(AbstractID clusterDatasetId) {
    return resourceManagerGatewayRetriever
            .getFuture()
            .thenCompose(
                    gateway ->
                            gateway.releaseClusterPartitions(
                                    new IntermediateDataSetID(clusterDatasetId)));
}
public CompletableFuture> listCompletedClusterDatasetIds() {
return resourceManagerGatewayRetriever
.getFuture()
.thenApply(ClusterPartitionManager::listDataSets)
.thenCompose(
metaInfoMapFuture ->
metaInfoMapFuture.thenApply(
metaInfoMap -> new HashSet<>(metaInfoMap.keySet())));
}
/**
 * Reports a job client heartbeat for the given job to the dispatcher.
 *
 * @param jobId ID of the job the heartbeat is for
 * @param expiredTimestamp expiry timestamp passed on to the dispatcher
 * @return future produced by the dispatcher's {@code reportJobClientHeartbeat} call
 */
public CompletableFuture reportHeartbeat(JobID jobId, long expiredTimestamp) {
    return runDispatcherCommand(
            gateway -> gateway.reportJobClientHeartbeat(jobId, expiredTimestamp, rpcTimeout));
}
/** Internal factory for {@link RpcService}. */
@FunctionalInterface
protected interface RpcServiceFactory {
    /**
     * Provides an {@link RpcService}.
     *
     * @return the RPC service to use
     * @throws Exception if the service cannot be provided
     */
    RpcService createRpcService() throws Exception;
}
/** {@link RpcServiceFactory} that hands out the same, shared {@link RpcService} on every call. */
protected static class CommonRpcServiceFactory implements RpcServiceFactory {

    /** The shared service returned by every {@link #createRpcService()} call. */
    private final RpcService commonRpcService;

    CommonRpcServiceFactory(RpcService commonRpcService) {
        this.commonRpcService = commonRpcService;
    }

    /** Returns the shared service; nothing new is created. */
    @Override
    public RpcService createRpcService() {
        return this.commonRpcService;
    }
}
/**
 * {@link RpcServiceFactory} that creates a new remote {@link RpcService} per call and registers
 * it with the enclosing mini cluster.
 */
protected class DedicatedRpcServiceFactory implements RpcServiceFactory {

    private final Configuration configuration;
    private final String externalAddress;
    private final String externalPortRange;
    private final String bindAddress;
    private final RpcSystem rpcSystem;

    DedicatedRpcServiceFactory(
            Configuration configuration,
            String externalAddress,
            String externalPortRange,
            String bindAddress,
            RpcSystem rpcSystem) {
        this.configuration = configuration;
        this.externalAddress = externalAddress;
        this.externalPortRange = externalPortRange;
        this.bindAddress = bindAddress;
        this.rpcSystem = rpcSystem;
    }

    /**
     * Creates a remote RPC service from the stored settings and adds it to the mini cluster's
     * list of RPC services.
     */
    @Override
    public RpcService createRpcService() throws Exception {
        final RpcService dedicatedService =
                createRemoteRpcService(
                        configuration,
                        externalAddress,
                        externalPortRange,
                        bindAddress,
                        rpcSystem);

        // the list of services is shared state of the mini cluster
        synchronized (lock) {
            rpcServices.add(dedicatedService);
        }

        return dedicatedService;
    }
}
// ------------------------------------------------------------------------
// miscellaneous utilities
// ------------------------------------------------------------------------
/**
 * Initializes the defaults of {@link FileOutputFormat} from the given configuration.
 *
 * @param configuration configuration to read the defaults from
 */
private void initializeIOFormatClasses(Configuration configuration) {
    // TODO: having to trigger this static initialization by hand should go away
    FileOutputFormat.initDefaultsFromConfiguration(configuration);
}
/**
 * {@link FatalErrorHandler} that closes the task manager with a given index when a fatal error
 * is reported while the mini cluster is running.
 */
private class TerminatingFatalErrorHandler implements FatalErrorHandler {

    /** Position of the affected task manager in {@code taskManagers}. */
    private final int index;

    private TerminatingFatalErrorHandler(int index) {
        this.index = index;
    }

    @Override
    public void onFatalError(Throwable exception) {
        // nothing to do once the mini cluster is no longer running
        if (!running) {
            return;
        }

        LOG.error("TaskManager #{} failed.", index, exception);

        synchronized (lock) {
            taskManagers.get(index).closeAsync();
        }
    }
}
/** {@link FatalErrorHandler} that logs the error and starts closing the whole mini cluster. */
private class ShutDownFatalErrorHandler implements FatalErrorHandler {

    @Override
    public void onFatalError(Throwable exception) {
        LOG.warn("Error in MiniCluster. Shutting the MiniCluster down.", exception);
        // asynchronous shutdown; the returned future is intentionally not awaited
        MiniCluster.this.closeAsync();
    }
}
/** Factory for {@link TerminatingFatalErrorHandler} instances. */
private class TerminatingFatalErrorHandlerFactory {

    /**
     * Builds the {@link TerminatingFatalErrorHandler} responsible for one {@link TaskExecutor}.
     *
     * @param index position of that {@link TaskExecutor} within the {@link #taskManagers}
     *     collection
     * @return a new {@link TerminatingFatalErrorHandler} bound to the given index
     */
    @GuardedBy("lock")
    private TerminatingFatalErrorHandler create(int index) {
        final TerminatingFatalErrorHandler handler = new TerminatingFatalErrorHandler(index);
        return handler;
    }
}
/** Selects which HA services to use. */
public enum HaServices {
/** Uses the HA services configured via the {@link HighAvailabilityOptions#HA_MODE} option. */
CONFIGURED,
/**
 * Enables {@link HaLeadershipControl} in {@link MiniCluster#getHaLeadershipControl}.
 *
 * <p>{@link HaLeadershipControl} allows granting and revoking leadership of HA components.
 * Enabling this feature disables the {@link HighAvailabilityOptions#HA_MODE} option.
 */
WITH_LEADERSHIP_CONTROL
}
/**
 * {@link HighAvailabilityServicesFactory} for setups that are not truly highly available and
 * depend on one shared {@link HighAvailabilityServices} object: the instance is created on the
 * first request and returned again on every later one.
 */
private static class SingletonHighAvailabilityServicesFactory
        implements HighAvailabilityServicesFactory {

    /** Creates the HA services the first time they are requested. */
    private final BiFunctionWithException<
                    Configuration, Executor, HighAvailabilityServices, Exception>
            creationCallback;

    /** The instance created so far, or {@code null} if none has been created yet. */
    @Nullable private HighAvailabilityServices haServices;

    public SingletonHighAvailabilityServicesFactory(
            BiFunctionWithException<
                            Configuration, Executor, HighAvailabilityServices, Exception>
                    creationCallback) {
        this.creationCallback = creationCallback;
    }

    @Override
    public HighAvailabilityServices createHAServices(
            Configuration configuration, Executor executor) throws Exception {
        if (haServices != null) {
            return haServices;
        }

        haServices = creationCallback.apply(configuration, executor);
        return haServices;
    }
}
/**
 * {@link HighAvailabilityServicesFactory} that delegates every call to {@code
 * HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices}, passing a new {@link
 * ShutDownFatalErrorHandler} as the fatal error handler.
 */
private class RegularHighAvailabilityServicesFactory
        implements HighAvailabilityServicesFactory {

    @Override
    public HighAvailabilityServices createHAServices(
            Configuration configuration, Executor executor) throws Exception {
        final FatalErrorHandler shutDownOnFatalError = new ShutDownFatalErrorHandler();
        return HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(
                configuration, executor, shutDownOnFatalError);
    }
}
}