All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.minicluster.MiniCluster Maven / Gradle / Ivy

There is a newer version: 1.13.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.minicluster;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.api.common.io.FileOutputFormat;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.configuration.ClusterOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ConfigurationUtils;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.blob.BlobCacheService;
import org.apache.flink.runtime.blob.BlobClient;
import org.apache.flink.runtime.blob.BlobServer;
import org.apache.flink.runtime.client.ClientUtils;
import org.apache.flink.runtime.client.JobExecutionException;
import org.apache.flink.runtime.client.JobStatusMessage;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.dispatcher.DispatcherGateway;
import org.apache.flink.runtime.dispatcher.DispatcherId;
import org.apache.flink.runtime.dispatcher.MemoryArchivedExecutionGraphStore;
import org.apache.flink.runtime.entrypoint.ClusterInformation;
import org.apache.flink.runtime.entrypoint.component.DefaultDispatcherResourceManagerComponentFactory;
import org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent;
import org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory;
import org.apache.flink.runtime.executiongraph.AccessExecutionGraph;
import org.apache.flink.runtime.heartbeat.HeartbeatServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobmaster.JobResult;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.webmonitor.ClusterOverview;
import org.apache.flink.runtime.metrics.MetricRegistry;
import org.apache.flink.runtime.metrics.MetricRegistryConfiguration;
import org.apache.flink.runtime.metrics.MetricRegistryImpl;
import org.apache.flink.runtime.metrics.ReporterSetup;
import org.apache.flink.runtime.metrics.groups.ProcessMetricGroup;
import org.apache.flink.runtime.metrics.util.MetricUtils;
import org.apache.flink.runtime.resourcemanager.ResourceManagerGateway;
import org.apache.flink.runtime.resourcemanager.ResourceManagerId;
import org.apache.flink.runtime.resourcemanager.StandaloneResourceManagerFactory;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.RpcService;
import org.apache.flink.runtime.rpc.RpcUtils;
import org.apache.flink.runtime.rpc.akka.AkkaRpcService;
import org.apache.flink.runtime.rpc.akka.AkkaRpcServiceConfiguration;
import org.apache.flink.runtime.taskexecutor.TaskExecutor;
import org.apache.flink.runtime.taskexecutor.TaskManagerRunner;
import org.apache.flink.runtime.util.ClusterEntrypointUtils;
import org.apache.flink.runtime.util.ExecutorThreadFactory;
import org.apache.flink.runtime.webmonitor.retriever.LeaderRetriever;
import org.apache.flink.runtime.webmonitor.retriever.MetricQueryServiceRetriever;
import org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever;
import org.apache.flink.runtime.webmonitor.retriever.impl.RpcMetricQueryServiceRetriever;
import org.apache.flink.util.AutoCloseableAsync;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.ExecutorUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.function.FunctionUtils;

import akka.actor.ActorSystem;
import com.typesafe.config.Config;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import javax.annotation.concurrent.GuardedBy;

import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;

/**
 * MiniCluster to execute Flink jobs locally.
 */
public class MiniCluster implements JobExecutorService, AutoCloseableAsync {

	private static final Logger LOG = LoggerFactory.getLogger(MiniCluster.class);

	/** The lock to guard startup / shutdown / manipulation methods. */
	private final Object lock = new Object();

	/** The configuration for this mini cluster. */
	private final MiniClusterConfiguration miniClusterConfiguration;

	private final Time rpcTimeout;

	@GuardedBy("lock")
	private final List taskManagers;

	private final TerminatingFatalErrorHandlerFactory taskManagerTerminatingFatalErrorHandlerFactory = new TerminatingFatalErrorHandlerFactory();

	private CompletableFuture terminationFuture;

	@GuardedBy("lock")
	private MetricRegistryImpl metricRegistry;

	@GuardedBy("lock")
	private ProcessMetricGroup processMetricGroup;

	@GuardedBy("lock")
	private RpcService commonRpcService;

	@GuardedBy("lock")
	private ExecutorService ioExecutor;

	@GuardedBy("lock")
	private final Collection rpcServices;

	@GuardedBy("lock")
	private HighAvailabilityServices haServices;

	@GuardedBy("lock")
	private BlobServer blobServer;

	@GuardedBy("lock")
	private HeartbeatServices heartbeatServices;

	@GuardedBy("lock")
	private BlobCacheService blobCacheService;

	@GuardedBy("lock")
	private LeaderRetrievalService resourceManagerLeaderRetriever;

	@GuardedBy("lock")
	private LeaderRetrievalService dispatcherLeaderRetriever;

	@GuardedBy("lock")
	private LeaderRetrievalService clusterRestEndpointLeaderRetrievalService;

	@GuardedBy("lock")
	private Collection dispatcherResourceManagerComponents;

	@GuardedBy("lock")
	private RpcGatewayRetriever dispatcherGatewayRetriever;

	@GuardedBy("lock")
	private RpcGatewayRetriever resourceManagerGatewayRetriever;

	@GuardedBy("lock")
	private LeaderRetriever webMonitorLeaderRetriever;

	@GuardedBy("lock")
	private RpcServiceFactory taskManagerRpcServiceFactory;

	/** Flag marking the mini cluster as started/running. */
	private volatile boolean running;

	// ------------------------------------------------------------------------

	/**
	 * Creates a new Flink mini cluster based on the given configuration.
	 *
	 * @param miniClusterConfiguration The configuration for the mini cluster
	 */
	public MiniCluster(MiniClusterConfiguration miniClusterConfiguration) {
		this.miniClusterConfiguration = checkNotNull(miniClusterConfiguration, "config may not be null");
		this.rpcServices = new ArrayList<>(1 + 2 + miniClusterConfiguration.getNumTaskManagers()); // common + JM + RM + TMs
		this.dispatcherResourceManagerComponents = new ArrayList<>(1);

		this.rpcTimeout = miniClusterConfiguration.getRpcTimeout();
		this.terminationFuture = CompletableFuture.completedFuture(null);
		running = false;

		this.taskManagers = new ArrayList<>(miniClusterConfiguration.getNumTaskManagers());
	}

	public CompletableFuture getRestAddress() {
		synchronized (lock) {
			checkState(running, "MiniCluster is not yet running or has already been shut down.");
			return webMonitorLeaderRetriever.getLeaderFuture().thenApply(FunctionUtils.uncheckedFunction(addressLeaderIdTuple -> new URI(addressLeaderIdTuple.f0)));
		}
	}

	public ClusterInformation getClusterInformation() {
		synchronized (lock) {
			checkState(running, "MiniCluster is not yet running or has already been shut down.");
			return new ClusterInformation("localhost", blobServer.getPort());
		}
	}

	protected Executor getIOExecutor() {
		return ioExecutor;
	}

	// ------------------------------------------------------------------------
	//  life cycle
	// ------------------------------------------------------------------------

	/**
	 * Checks if the mini cluster was started and is running.
	 */
	public boolean isRunning() {
		return running;
	}

	/**
	 * Starts the mini cluster, based on the configured properties.
	 *
	 * @throws Exception This method passes on any exception that occurs during the startup of
	 *                   the mini cluster.
	 */
	public void start() throws Exception {
		synchronized (lock) {
			checkState(!running, "MiniCluster is already running");

			LOG.info("Starting Flink Mini Cluster");
			LOG.debug("Using configuration {}", miniClusterConfiguration);

			final Configuration configuration = miniClusterConfiguration.getConfiguration();
			final boolean useSingleRpcService = miniClusterConfiguration.getRpcServiceSharing() == RpcServiceSharing.SHARED;

			try {
				initializeIOFormatClasses(configuration);

				LOG.info("Starting Metrics Registry");
				metricRegistry = createMetricRegistry(configuration);

				// bring up all the RPC services
				LOG.info("Starting RPC Service(s)");

				AkkaRpcServiceConfiguration akkaRpcServiceConfig = AkkaRpcServiceConfiguration.fromConfiguration(configuration);

				final RpcServiceFactory dispatcherResourceManagreComponentRpcServiceFactory;

				if (useSingleRpcService) {
					// we always need the 'commonRpcService' for auxiliary calls
					commonRpcService = createRpcService(akkaRpcServiceConfig, false, null);
					final CommonRpcServiceFactory commonRpcServiceFactory = new CommonRpcServiceFactory(commonRpcService);
					taskManagerRpcServiceFactory = commonRpcServiceFactory;
					dispatcherResourceManagreComponentRpcServiceFactory = commonRpcServiceFactory;
				} else {
					// we always need the 'commonRpcService' for auxiliary calls
					commonRpcService = createRpcService(akkaRpcServiceConfig, true, null);

					// start a new service per component, possibly with custom bind addresses
					final String jobManagerBindAddress = miniClusterConfiguration.getJobManagerBindAddress();
					final String taskManagerBindAddress = miniClusterConfiguration.getTaskManagerBindAddress();

					dispatcherResourceManagreComponentRpcServiceFactory = new DedicatedRpcServiceFactory(akkaRpcServiceConfig, jobManagerBindAddress);
					taskManagerRpcServiceFactory = new DedicatedRpcServiceFactory(akkaRpcServiceConfig, taskManagerBindAddress);
				}

				RpcService metricQueryServiceRpcService = MetricUtils.startMetricsRpcService(
					configuration,
					commonRpcService.getAddress());
				metricRegistry.startQueryService(metricQueryServiceRpcService, null);

				processMetricGroup = MetricUtils.instantiateProcessMetricGroup(
					metricRegistry,
					RpcUtils.getHostname(commonRpcService),
					ConfigurationUtils.getSystemResourceMetricsProbingInterval(configuration));

				ioExecutor = Executors.newFixedThreadPool(
					ClusterEntrypointUtils.getPoolSize(configuration),
					new ExecutorThreadFactory("mini-cluster-io"));
				haServices = createHighAvailabilityServices(configuration, ioExecutor);

				blobServer = new BlobServer(configuration, haServices.createBlobStore());
				blobServer.start();

				heartbeatServices = HeartbeatServices.fromConfiguration(configuration);

				blobCacheService = new BlobCacheService(
					configuration, haServices.createBlobStore(), new InetSocketAddress(InetAddress.getLocalHost(), blobServer.getPort())
				);

				startTaskManagers();

				MetricQueryServiceRetriever metricQueryServiceRetriever = new RpcMetricQueryServiceRetriever(metricRegistry.getMetricQueryServiceRpcService());

				setupDispatcherResourceManagerComponents(configuration, dispatcherResourceManagreComponentRpcServiceFactory, metricQueryServiceRetriever);

				resourceManagerLeaderRetriever = haServices.getResourceManagerLeaderRetriever();
				dispatcherLeaderRetriever = haServices.getDispatcherLeaderRetriever();
				clusterRestEndpointLeaderRetrievalService = haServices.getClusterRestEndpointLeaderRetriever();

				dispatcherGatewayRetriever = new RpcGatewayRetriever<>(
					commonRpcService,
					DispatcherGateway.class,
					DispatcherId::fromUuid,
					20,
					Time.milliseconds(20L));
				resourceManagerGatewayRetriever = new RpcGatewayRetriever<>(
					commonRpcService,
					ResourceManagerGateway.class,
					ResourceManagerId::fromUuid,
					20,
					Time.milliseconds(20L));
				webMonitorLeaderRetriever = new LeaderRetriever();

				resourceManagerLeaderRetriever.start(resourceManagerGatewayRetriever);
				dispatcherLeaderRetriever.start(dispatcherGatewayRetriever);
				clusterRestEndpointLeaderRetrievalService.start(webMonitorLeaderRetriever);
			}
			catch (Exception e) {
				// cleanup everything
				try {
					close();
				} catch (Exception ee) {
					e.addSuppressed(ee);
				}
				throw e;
			}

			// create a new termination future
			terminationFuture = new CompletableFuture<>();

			// now officially mark this as running
			running = true;

			LOG.info("Flink Mini Cluster started successfully");
		}
	}

	@GuardedBy("lock")
	private void setupDispatcherResourceManagerComponents(Configuration configuration, RpcServiceFactory dispatcherResourceManagreComponentRpcServiceFactory, MetricQueryServiceRetriever metricQueryServiceRetriever) throws Exception {
		dispatcherResourceManagerComponents.addAll(createDispatcherResourceManagerComponents(
			configuration,
			dispatcherResourceManagreComponentRpcServiceFactory,
			haServices,
			blobServer,
			heartbeatServices,
			metricRegistry,
			metricQueryServiceRetriever,
			new ShutDownFatalErrorHandler()
		));

		final Collection> shutDownFutures = new ArrayList<>(dispatcherResourceManagerComponents.size());

		for (DispatcherResourceManagerComponent dispatcherResourceManagerComponent : dispatcherResourceManagerComponents) {
			final CompletableFuture shutDownFuture = dispatcherResourceManagerComponent.getShutDownFuture();
			FutureUtils.assertNoException(shutDownFuture.thenRun(dispatcherResourceManagerComponent::closeAsync));
			shutDownFutures.add(shutDownFuture);
		}

		FutureUtils.assertNoException(FutureUtils.completeAll(shutDownFutures).thenRun(this::closeAsync));
	}

	@VisibleForTesting
	protected Collection createDispatcherResourceManagerComponents(
			Configuration configuration,
			RpcServiceFactory rpcServiceFactory,
			HighAvailabilityServices haServices,
			BlobServer blobServer,
			HeartbeatServices heartbeatServices,
			MetricRegistry metricRegistry,
			MetricQueryServiceRetriever metricQueryServiceRetriever,
			FatalErrorHandler fatalErrorHandler) throws Exception {
		DispatcherResourceManagerComponentFactory dispatcherResourceManagerComponentFactory = createDispatcherResourceManagerComponentFactory();
		return Collections.singleton(
			dispatcherResourceManagerComponentFactory.create(
				configuration,
				ioExecutor,
				rpcServiceFactory.createRpcService(),
				haServices,
				blobServer,
				heartbeatServices,
				metricRegistry,
				new MemoryArchivedExecutionGraphStore(),
				metricQueryServiceRetriever,
				fatalErrorHandler));
	}

	@Nonnull
	private DispatcherResourceManagerComponentFactory createDispatcherResourceManagerComponentFactory() {
		return DefaultDispatcherResourceManagerComponentFactory.createSessionComponentFactory(StandaloneResourceManagerFactory.INSTANCE);
	}

	@VisibleForTesting
	protected HighAvailabilityServices createHighAvailabilityServices(Configuration configuration, Executor executor) throws Exception {
		LOG.info("Starting high-availability services");
		return HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(
			configuration,
			executor);
	}

	/**
	 * Shuts down the mini cluster, failing all currently executing jobs.
	 * The mini cluster can be started again by calling the {@link #start()} method again.
	 *
	 * 

This method shuts down all started services and components, * even if an exception occurs in the process of shutting down some component. * * @return Future which is completed once the MiniCluster has been completely shut down */ @Override public CompletableFuture closeAsync() { synchronized (lock) { if (running) { LOG.info("Shutting down Flink Mini Cluster"); try { final long shutdownTimeoutMillis = miniClusterConfiguration.getConfiguration().getLong(ClusterOptions.CLUSTER_SERVICES_SHUTDOWN_TIMEOUT); final int numComponents = 2 + miniClusterConfiguration.getNumTaskManagers(); final Collection> componentTerminationFutures = new ArrayList<>(numComponents); componentTerminationFutures.addAll(terminateTaskExecutors()); componentTerminationFutures.add(shutDownResourceManagerComponents()); final FutureUtils.ConjunctFuture componentsTerminationFuture = FutureUtils.completeAll(componentTerminationFutures); final CompletableFuture metricSystemTerminationFuture = FutureUtils.composeAfterwards( componentsTerminationFuture, this::closeMetricSystem); final CompletableFuture rpcServicesTerminationFuture = FutureUtils.composeAfterwards( metricSystemTerminationFuture, this::terminateRpcServices); final CompletableFuture remainingServicesTerminationFuture = FutureUtils.runAfterwards( rpcServicesTerminationFuture, this::terminateMiniClusterServices); final CompletableFuture executorsTerminationFuture = FutureUtils.composeAfterwards( remainingServicesTerminationFuture, () -> terminateExecutors(shutdownTimeoutMillis)); executorsTerminationFuture.whenComplete( (Void ignored, Throwable throwable) -> { if (throwable != null) { terminationFuture.completeExceptionally(ExceptionUtils.stripCompletionException(throwable)); } else { terminationFuture.complete(null); } }); } finally { running = false; } } return terminationFuture; } } private CompletableFuture closeMetricSystem() { synchronized (lock) { final ArrayList> terminationFutures = new ArrayList<>(2); if (processMetricGroup != null) { processMetricGroup.close(); processMetricGroup = null; } // metrics shutdown if (metricRegistry != null) { terminationFutures.add(metricRegistry.shutdown()); metricRegistry = null; } return FutureUtils.completeAll(terminationFutures); } } @GuardedBy("lock") private void startTaskManagers() throws Exception { final int numTaskManagers = miniClusterConfiguration.getNumTaskManagers(); LOG.info("Starting {} TaskManger(s)", numTaskManagers); for (int i = 0; i < numTaskManagers; i++) { startTaskExecutor(); } } @VisibleForTesting void startTaskExecutor() throws Exception { synchronized (lock) { final Configuration configuration = miniClusterConfiguration.getConfiguration(); final TaskExecutor taskExecutor = TaskManagerRunner.startTaskManager( configuration, new ResourceID(UUID.randomUUID().toString()), taskManagerRpcServiceFactory.createRpcService(), haServices, heartbeatServices, metricRegistry, blobCacheService, useLocalCommunication(), taskManagerTerminatingFatalErrorHandlerFactory.create(taskManagers.size())); taskExecutor.start(); taskManagers.add(taskExecutor); } } @VisibleForTesting protected boolean useLocalCommunication() { return miniClusterConfiguration.getNumTaskManagers() == 1; } @GuardedBy("lock") private Collection> terminateTaskExecutors() { final Collection> terminationFutures = new ArrayList<>(taskManagers.size()); for (int i = 0; i < taskManagers.size(); i++) { terminationFutures.add(terminateTaskExecutor(i)); } return terminationFutures; } @VisibleForTesting @Nonnull protected CompletableFuture terminateTaskExecutor(int index) { synchronized (lock) { final TaskExecutor taskExecutor = taskManagers.get(index); return taskExecutor.closeAsync(); } } // ------------------------------------------------------------------------ // Accessing jobs // ------------------------------------------------------------------------ public CompletableFuture> listJobs() { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway .requestMultipleJobDetails(rpcTimeout) .thenApply(jobs -> jobs.getJobs().stream() .map(details -> new JobStatusMessage(details.getJobId(), details.getJobName(), details.getStatus(), details.getStartTime())) .collect(Collectors.toList()))); } public CompletableFuture getJobStatus(JobID jobId) { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.requestJobStatus(jobId, rpcTimeout)); } public CompletableFuture cancelJob(JobID jobId) { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.cancelJob(jobId, rpcTimeout)); } public CompletableFuture triggerSavepoint(JobID jobId, String targetDirectory, boolean cancelJob) { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.triggerSavepoint(jobId, targetDirectory, cancelJob, rpcTimeout)); } public CompletableFuture stopWithSavepoint(JobID jobId, String targetDirectory, boolean advanceToEndOfEventTime) { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.stopWithSavepoint(jobId, targetDirectory, advanceToEndOfEventTime, rpcTimeout)); } public CompletableFuture disposeSavepoint(String savepointPath) { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.disposeSavepoint(savepointPath, rpcTimeout)); } public CompletableFuture getExecutionGraph(JobID jobId) { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.requestJob(jobId, rpcTimeout)); } private CompletableFuture runDispatcherCommand(Function> dispatcherCommand) { return getDispatcherGatewayFuture().thenApply(dispatcherCommand).thenCompose(Function.identity()); } // ------------------------------------------------------------------------ // running jobs // ------------------------------------------------------------------------ /** * This method executes a job in detached mode. The method returns immediately after the job * has been added to the * * @param job The Flink job to execute * * @throws JobExecutionException Thrown if anything went amiss during initial job launch, * or if the job terminally failed. */ public void runDetached(JobGraph job) throws JobExecutionException, InterruptedException { checkNotNull(job, "job is null"); final CompletableFuture submissionFuture = submitJob(job); try { submissionFuture.get(); } catch (ExecutionException e) { throw new JobExecutionException(job.getJobID(), ExceptionUtils.stripExecutionException(e)); } } /** * This method runs a job in blocking mode. The method returns only after the job * completed successfully, or after it failed terminally. * * @param job The Flink job to execute * @return The result of the job execution * * @throws JobExecutionException Thrown if anything went amiss during initial job launch, * or if the job terminally failed. */ @Override public JobExecutionResult executeJobBlocking(JobGraph job) throws JobExecutionException, InterruptedException { checkNotNull(job, "job is null"); final CompletableFuture submissionFuture = submitJob(job); final CompletableFuture jobResultFuture = submissionFuture.thenCompose( (JobSubmissionResult ignored) -> requestJobResult(job.getJobID())); final JobResult jobResult; try { jobResult = jobResultFuture.get(); } catch (ExecutionException e) { throw new JobExecutionException(job.getJobID(), "Could not retrieve JobResult.", ExceptionUtils.stripExecutionException(e)); } try { return jobResult.toJobExecutionResult(Thread.currentThread().getContextClassLoader()); } catch (IOException | ClassNotFoundException e) { throw new JobExecutionException(job.getJobID(), e); } } public CompletableFuture submitJob(JobGraph jobGraph) { final CompletableFuture dispatcherGatewayFuture = getDispatcherGatewayFuture(); final CompletableFuture blobServerAddressFuture = createBlobServerAddress(dispatcherGatewayFuture); final CompletableFuture jarUploadFuture = uploadAndSetJobFiles(blobServerAddressFuture, jobGraph); final CompletableFuture acknowledgeCompletableFuture = jarUploadFuture .thenCombine( dispatcherGatewayFuture, (Void ack, DispatcherGateway dispatcherGateway) -> dispatcherGateway.submitJob(jobGraph, rpcTimeout)) .thenCompose(Function.identity()); return acknowledgeCompletableFuture.thenApply( (Acknowledge ignored) -> new JobSubmissionResult(jobGraph.getJobID())); } public CompletableFuture requestJobResult(JobID jobId) { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.requestJobResult(jobId, RpcUtils.INF_TIMEOUT)); } public CompletableFuture requestClusterOverview() { return runDispatcherCommand(dispatcherGateway -> dispatcherGateway.requestClusterOverview(RpcUtils.INF_TIMEOUT)); } @VisibleForTesting protected CompletableFuture getDispatcherGatewayFuture() { synchronized (lock) { checkState(running, "MiniCluster is not yet running or has already been shut down."); return dispatcherGatewayRetriever.getFuture(); } } private CompletableFuture uploadAndSetJobFiles(final CompletableFuture blobServerAddressFuture, final JobGraph job) { return blobServerAddressFuture.thenAccept(blobServerAddress -> { try { ClientUtils.extractAndUploadJobGraphFiles(job, () -> new BlobClient(blobServerAddress, miniClusterConfiguration.getConfiguration())); } catch (FlinkException e) { throw new CompletionException(e); } }); } private CompletableFuture createBlobServerAddress(final CompletableFuture dispatcherGatewayFuture) { return dispatcherGatewayFuture.thenApply(dispatcherGateway -> dispatcherGateway .getBlobServerPort(rpcTimeout) .thenApply(blobServerPort -> new InetSocketAddress(dispatcherGateway.getHostname(), blobServerPort))) .thenCompose(Function.identity()); } // ------------------------------------------------------------------------ // factories - can be overridden by subclasses to alter behavior // ------------------------------------------------------------------------ /** * Factory method to create the metric registry for the mini cluster. * * @param config The configuration of the mini cluster */ protected MetricRegistryImpl createMetricRegistry(Configuration config) { return new MetricRegistryImpl( MetricRegistryConfiguration.fromConfiguration(config), ReporterSetup.fromConfiguration(config)); } /** * Factory method to instantiate the RPC service. * * @param akkaRpcServiceConfig * The default RPC timeout for asynchronous "ask" requests. * @param remoteEnabled * True, if the RPC service should be reachable from other (remote) RPC services. * @param bindAddress * The address to bind the RPC service to. Only relevant when "remoteEnabled" is true. * * @return The instantiated RPC service */ protected RpcService createRpcService( AkkaRpcServiceConfiguration akkaRpcServiceConfig, boolean remoteEnabled, String bindAddress) { final Config akkaConfig; if (remoteEnabled) { akkaConfig = AkkaUtils.getAkkaConfig(akkaRpcServiceConfig.getConfiguration(), bindAddress, 0); } else { akkaConfig = AkkaUtils.getAkkaConfig(akkaRpcServiceConfig.getConfiguration()); } final Config effectiveAkkaConfig = AkkaUtils.testDispatcherConfig().withFallback(akkaConfig); final ActorSystem actorSystem = AkkaUtils.createActorSystem(effectiveAkkaConfig); return new AkkaRpcService(actorSystem, akkaRpcServiceConfig); } // ------------------------------------------------------------------------ // Internal methods // ------------------------------------------------------------------------ @GuardedBy("lock") private CompletableFuture shutDownResourceManagerComponents() { final Collection> terminationFutures = new ArrayList<>(dispatcherResourceManagerComponents.size()); for (DispatcherResourceManagerComponent dispatcherResourceManagerComponent : dispatcherResourceManagerComponents) { terminationFutures.add(dispatcherResourceManagerComponent.closeAsync()); } final FutureUtils.ConjunctFuture dispatcherTerminationFuture = FutureUtils.completeAll(terminationFutures); return FutureUtils.runAfterwards( dispatcherTerminationFuture, () -> { Exception exception = null; synchronized (lock) { if (resourceManagerLeaderRetriever != null) { try { resourceManagerLeaderRetriever.stop(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } resourceManagerLeaderRetriever = null; } if (dispatcherLeaderRetriever != null) { try { dispatcherLeaderRetriever.stop(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } dispatcherLeaderRetriever = null; } if (clusterRestEndpointLeaderRetrievalService != null) { try { clusterRestEndpointLeaderRetrievalService.stop(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } clusterRestEndpointLeaderRetrievalService = null; } } if (exception != null) { throw exception; } }); } private void terminateMiniClusterServices() throws Exception { // collect the first exception, but continue and add all successive // exceptions as suppressed Exception exception = null; synchronized (lock) { if (blobCacheService != null) { try { blobCacheService.close(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } blobCacheService = null; } // shut down the blob server if (blobServer != null) { try { blobServer.close(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } blobServer = null; } // shut down high-availability services if (haServices != null) { try { haServices.closeAndCleanupAllData(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } haServices = null; } if (exception != null) { throw exception; } } } @Nonnull private CompletableFuture terminateRpcServices() { synchronized (lock) { final int numRpcServices = 1 + rpcServices.size(); final Collection> rpcTerminationFutures = new ArrayList<>(numRpcServices); rpcTerminationFutures.add(commonRpcService.stopService()); for (RpcService rpcService : rpcServices) { rpcTerminationFutures.add(rpcService.stopService()); } commonRpcService = null; rpcServices.clear(); return FutureUtils.completeAll(rpcTerminationFutures); } } private CompletableFuture terminateExecutors(long executorShutdownTimeoutMillis) { synchronized (lock) { if (ioExecutor != null) { return ExecutorUtils.nonBlockingShutdown(executorShutdownTimeoutMillis, TimeUnit.MILLISECONDS, ioExecutor); } else { return CompletableFuture.completedFuture(null); } } } /** * Internal factory for {@link RpcService}. */ protected interface RpcServiceFactory { RpcService createRpcService(); } /** * Factory which returns always the common {@link RpcService}. */ protected static class CommonRpcServiceFactory implements RpcServiceFactory { private final RpcService commonRpcService; CommonRpcServiceFactory(RpcService commonRpcService) { this.commonRpcService = commonRpcService; } @Override public RpcService createRpcService() { return commonRpcService; } } /** * Factory which creates and registers new {@link RpcService}. */ protected class DedicatedRpcServiceFactory implements RpcServiceFactory { private final AkkaRpcServiceConfiguration akkaRpcServiceConfig; private final String jobManagerBindAddress; DedicatedRpcServiceFactory(AkkaRpcServiceConfiguration akkaRpcServiceConfig, String jobManagerBindAddress) { this.akkaRpcServiceConfig = akkaRpcServiceConfig; this.jobManagerBindAddress = jobManagerBindAddress; } @Override public RpcService createRpcService() { final RpcService rpcService = MiniCluster.this.createRpcService(akkaRpcServiceConfig, true, jobManagerBindAddress); synchronized (lock) { rpcServices.add(rpcService); } return rpcService; } } // ------------------------------------------------------------------------ // miscellaneous utilities // ------------------------------------------------------------------------ private void initializeIOFormatClasses(Configuration configuration) { // TODO: That we still have to call something like this is a crime against humanity FileOutputFormat.initDefaultsFromConfiguration(configuration); } private class TerminatingFatalErrorHandler implements FatalErrorHandler { private final int index; private TerminatingFatalErrorHandler(int index) { this.index = index; } @Override public void onFatalError(Throwable exception) { // first check if we are still running if (running) { LOG.error("TaskManager #{} failed.", index, exception); synchronized (lock) { taskManagers.get(index).closeAsync(); } } } } private class ShutDownFatalErrorHandler implements FatalErrorHandler { @Override public void onFatalError(Throwable exception) { LOG.warn("Error in MiniCluster. Shutting the MiniCluster down.", exception); closeAsync(); } } private class TerminatingFatalErrorHandlerFactory { /** * Create a new {@link TerminatingFatalErrorHandler} for the {@link TaskExecutor} with * the given index. * * @param index into the {@link #taskManagers} collection to identify the correct {@link TaskExecutor}. * @return {@link TerminatingFatalErrorHandler} for the given index */ @GuardedBy("lock") private TerminatingFatalErrorHandler create(int index) { return new TerminatingFatalErrorHandler(index); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy