/*
 * All Downloads are FREE. Search and download functionalities are using the official Maven repository.
 *
 * org.apache.flink.runtime.client.JobClient Maven / Gradle / Ivy
 *
 * There is a newer version: 1.13.6
 * Show newest version
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.client;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Address;
import akka.actor.Identify;
import akka.actor.PoisonPill;
import akka.actor.Props;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.akka.ListeningBehaviour;
import org.apache.flink.runtime.blob.BlobCache;
import org.apache.flink.runtime.blob.BlobKey;
import org.apache.flink.runtime.execution.librarycache.FlinkUserCodeClassLoader;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.messages.JobClientMessages;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.util.SerializedThrowable;
import org.apache.flink.util.NetUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.Some;
import scala.Tuple2;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;

import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URL;
import java.util.Collection;
import java.util.concurrent.TimeoutException;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * The JobClient bridges between the JobManager's asynchronous actor messages and
 * the synchronous method calls to trigger.
 */
public class JobClient {

	private static final Logger LOG = LoggerFactory.getLogger(JobClient.class);

	/**
	 * Starts an actor system for the JobClient, bound to the local host on an
	 * arbitrary free port.
	 *
	 * @param config the Flink configuration used to create the actor system
	 * @return the started actor system
	 * @throws IOException if the actor system could not be started
	 */
	public static ActorSystem startJobClientActorSystem(Configuration config)
			throws IOException {
		LOG.info("Starting JobClient actor system");

		// an empty host name and port 0 make the actor system pick the local
		// address and an arbitrary free port
		Option<Tuple2<String, Object>> remoting = new Some<>(new Tuple2<String, Object>("", 0));

		// start a remote actor system to listen on an arbitrary port
		ActorSystem system = AkkaUtils.createActorSystem(config, remoting);
		Address address = system.provider().getDefaultAddress();

		String hostAddress = address.host().isDefined() ?
				NetUtils.ipAddressToUrlString(InetAddress.getByName(address.host().get())) :
				"(unknown)";
		int port = address.port().isDefined() ? ((Integer) address.port().get()) : -1;
		LOG.info("Started JobClient actor system at " + hostAddress + ':' + port);

		return system;
	}

	/**
	 * Submits a job to a Flink cluster (non-blocking) and returns a JobListeningContext which can be
	 * passed to {@code awaitJobResult} to get the result of the submission.
	 *
	 * @param actorSystem the actor system performing the communication
	 * @param config the cluster wide configuration
	 * @param highAvailabilityServices service factory for high availability services
	 * @param jobGraph the job to submit
	 * @param timeout ask timeout used when communicating with the JobManager
	 * @param sysoutLogUpdates prints log updates to system out if true
	 * @param classLoader the class loader for deserializing the results
	 * @return JobListeningContext which may be used to retrieve the JobExecutionResult via
	 * 			{@code awaitJobResult(JobListeningContext context)}.
	 */
	public static JobListeningContext submitJob(
			ActorSystem actorSystem,
			Configuration config,
			HighAvailabilityServices highAvailabilityServices,
			JobGraph jobGraph,
			FiniteDuration timeout,
			boolean sysoutLogUpdates,
			ClassLoader classLoader) {

		checkNotNull(actorSystem, "The actorSystem must not be null.");
		checkNotNull(highAvailabilityServices, "The high availability services must not be null.");
		checkNotNull(jobGraph, "The jobGraph must not be null.");
		checkNotNull(timeout, "The timeout must not be null.");

		// for this job, we create a proxy JobClientActor that deals with all communication with
		// the JobManager. It forwards the job submission, checks the success/failure responses, logs
		// update messages, watches for disconnect between client and JobManager, ...

		Props jobClientActorProps = JobSubmissionClientActor.createActorProps(
			highAvailabilityServices.getJobManagerLeaderRetriever(HighAvailabilityServices.DEFAULT_JOB_ID),
			timeout,
			sysoutLogUpdates,
			config);

		ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);

		// the actor answers only once the job has finished, hence the infinite ask timeout
		Future<Object> submissionFuture = Patterns.ask(
				jobClientActor,
				new JobClientMessages.SubmitJobAndWait(jobGraph),
				new Timeout(AkkaUtils.INF_TIMEOUT()));

		return new JobListeningContext(
			jobGraph.getJobID(),
			submissionFuture,
			jobClientActor,
			timeout,
			classLoader,
			highAvailabilityServices);
	}


	/**
	 * Attaches to a running Job using the JobID.
	 * Reconstructs the user class loader by downloading the jars from the JobManager.
	 *
	 * @param jobID id of the job to attach to
	 * @param jobManagerGateWay gateway to the JobManager
	 * @param configuration the cluster wide configuration
	 * @param actorSystem the actor system performing the communication
	 * @param highAvailabilityServices service factory for high availability services
	 * @param timeout ask timeout used when communicating with the JobManager
	 * @param sysoutLogUpdates prints log updates to system out if true
	 * @return JobListeningContext bound to the running job
	 */
	public static JobListeningContext attachToRunningJob(
			JobID jobID,
			ActorGateway jobManagerGateWay,
			Configuration configuration,
			ActorSystem actorSystem,
			HighAvailabilityServices highAvailabilityServices,
			FiniteDuration timeout,
			boolean sysoutLogUpdates) {

		checkNotNull(jobID, "The jobID must not be null.");
		checkNotNull(jobManagerGateWay, "The jobManagerGateWay must not be null.");
		checkNotNull(configuration, "The configuration must not be null.");
		checkNotNull(actorSystem, "The actorSystem must not be null.");
		checkNotNull(highAvailabilityServices, "The high availability services must not be null.");
		checkNotNull(timeout, "The timeout must not be null.");

		// we create a proxy JobClientActor that deals with all communication with
		// the JobManager. It forwards the job attachments, checks the success/failure responses, logs
		// update messages, watches for disconnect between client and JobManager, ...
		Props jobClientActorProps = JobAttachmentClientActor.createActorProps(
			highAvailabilityServices.getJobManagerLeaderRetriever(HighAvailabilityServices.DEFAULT_JOB_ID),
			timeout,
			sysoutLogUpdates);

		ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);

		// the actor answers only once the job has finished, hence the infinite ask timeout
		Future<Object> attachmentFuture = Patterns.ask(
				jobClientActor,
				new JobClientMessages.AttachToJobAndWait(jobID),
				new Timeout(AkkaUtils.INF_TIMEOUT()));

		return new JobListeningContext(
			jobID,
			attachmentFuture,
			jobClientActor,
			timeout,
			actorSystem,
			configuration,
			highAvailabilityServices);
	}

	/**
	 * Reconstructs the class loader by first requesting information about it at the JobManager
	 * and then downloading missing jar files.
	 * @param jobID id of job
	 * @param jobManager gateway to the JobManager
	 * @param config the flink configuration
	 * @param highAvailabilityServices service factory for high availability services
	 * @return A classloader that should behave like the original classloader
	 * @throws JobRetrievalException if anything goes wrong
	 */
	public static ClassLoader retrieveClassLoader(
		JobID jobID,
		ActorGateway jobManager,
		Configuration config,
		HighAvailabilityServices highAvailabilityServices)
		throws JobRetrievalException {

		final Object jmAnswer;
		try {
			jmAnswer = Await.result(
				jobManager.ask(
					new JobManagerMessages.RequestClassloadingProps(jobID),
					AkkaUtils.getDefaultTimeoutAsFiniteDuration()),
				AkkaUtils.getDefaultTimeoutAsFiniteDuration());
		} catch (Exception e) {
			throw new JobRetrievalException(jobID, "Couldn't retrieve class loading properties from JobManager.", e);
		}

		if (jmAnswer instanceof JobManagerMessages.ClassloadingProps) {
			JobManagerMessages.ClassloadingProps props = ((JobManagerMessages.ClassloadingProps) jmAnswer);

			// fall back to "localhost" if the JobManager address carries no host part
			Option<String> jmHost = jobManager.actor().path().address().host();
			String jmHostname = jmHost.isDefined() ? jmHost.get() : "localhost";
			InetSocketAddress serverAddress = new InetSocketAddress(jmHostname, props.blobManagerPort());
			final BlobCache blobClient;
			try {
				// TODO: Fix lifecycle of BlobCache to properly close it upon usage
				blobClient = new BlobCache(serverAddress, config, highAvailabilityServices.createBlobStore());
			} catch (IOException e) {
				throw new JobRetrievalException(jobID,
					"Failed to setup blob cache", e);
			}

			final Collection<BlobKey> requiredJarFiles = props.requiredJarFiles();
			final Collection<URL> requiredClasspaths = props.requiredClasspaths();

			final URL[] allURLs = new URL[requiredJarFiles.size() + requiredClasspaths.size()];

			// download each required jar from the blob server; on failure close the
			// cache before propagating so no connection is leaked
			int pos = 0;
			for (BlobKey blobKey : props.requiredJarFiles()) {
				try {
					allURLs[pos++] = blobClient.getURL(blobKey);
				} catch (Exception e) {
					try {
						blobClient.close();
					} catch (IOException ioe) {
						LOG.warn("Could not properly close the BlobClient.", ioe);
					}

					throw new JobRetrievalException(jobID, "Failed to download BlobKey " + blobKey, e);
				}
			}

			for (URL url : requiredClasspaths) {
				allURLs[pos++] = url;
			}

			return new FlinkUserCodeClassLoader(allURLs, JobClient.class.getClassLoader());
		} else if (jmAnswer instanceof JobManagerMessages.JobNotFound) {
			throw new JobRetrievalException(jobID, "Couldn't retrieve class loader. Job " + jobID + " not found");
		} else {
			throw new JobRetrievalException(jobID, "Unknown response from JobManager: " + jmAnswer);
		}
	}

	/**
	 * Given a JobListeningContext, awaits the result of the job execution that this context is bound to
	 * @param listeningContext The listening context of the job execution
	 * @return The result of the execution
	 * @throws JobExecutionException if anything goes wrong while monitoring the job
	 */
	public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {

		final JobID jobID = listeningContext.getJobID();
		final ActorRef jobClientActor = listeningContext.getJobClientActor();
		final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
		final FiniteDuration askTimeout = listeningContext.getTimeout();
		// retrieves class loader if necessary
		final ClassLoader classLoader = listeningContext.getClassLoader();

		// wait for the future which holds the result to be ready
		// ping the JobClientActor from time to time to check if it is still running
		while (!jobSubmissionFuture.isCompleted()) {
			try {
				Await.ready(jobSubmissionFuture, askTimeout);
			} catch (InterruptedException e) {
				// restore the interrupt flag so callers further up can react to it
				Thread.currentThread().interrupt();
				throw new JobExecutionException(
					jobID,
					"Interrupted while waiting for job completion.",
					e);
			} catch (TimeoutException e) {
				try {
					Await.result(
						Patterns.ask(
							jobClientActor,
							// Ping the Actor to see if it is alive
							new Identify(true),
							Timeout.durationToTimeout(askTimeout)),
						askTimeout);
					// we got a reply, continue waiting for the job result
				} catch (Exception eInner) {
					// we could have a result but the JobClientActor might have been killed and
					// thus the health check failed
					if (!jobSubmissionFuture.isCompleted()) {
						throw new JobExecutionException(
							jobID,
							"JobClientActor seems to have died before the JobExecutionResult could be retrieved.",
							eInner);
					}
				}
			}
		}

		final Object answer;
		try {
			// we have already awaited the result, zero time to wait here
			answer = Await.result(jobSubmissionFuture, Duration.Zero());
		}
		catch (Throwable throwable) {
			throw new JobExecutionException(jobID,
				"Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
		}
		finally {
			// failsafe shutdown of the client actor
			jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
		}

		// second block handles the actual response
		if (answer instanceof JobManagerMessages.JobResultSuccess) {
			LOG.info("Job execution complete");

			SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
			if (result != null) {
				try {
					return result.toJobExecutionResult(classLoader);
				} catch (Throwable t) {
					// keep the deserialization failure as the cause for diagnosis
					throw new JobExecutionException(jobID,
						"Job was successfully executed but JobExecutionResult could not be deserialized.",
						t);
				}
			} else {
				throw new JobExecutionException(jobID,
					"Job was successfully executed but result contained a null JobExecutionResult.");
			}
		}
		else if (answer instanceof JobManagerMessages.JobResultFailure) {
			LOG.info("Job execution failed");

			SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
			if (serThrowable != null) {
				Throwable cause = serThrowable.deserializeError(classLoader);
				if (cause instanceof JobExecutionException) {
					throw (JobExecutionException) cause;
				} else {
					throw new JobExecutionException(jobID, "Job execution failed", cause);
				}
			} else {
				throw new JobExecutionException(jobID,
					"Job execution failed with null as failure cause.");
			}
		}
		else if (answer instanceof JobManagerMessages.JobNotFound) {
			throw new JobRetrievalException(
				((JobManagerMessages.JobNotFound) answer).jobID(),
				"Couldn't retrieve Job " + jobID + " because it was not running.");
		}
		else {
			throw new JobExecutionException(jobID,
				"Unknown answer from JobManager after submitting the job: " + answer);
		}
	}

	/**
	 * Sends a [[JobGraph]] to the JobClient actor specified by jobClient which submits it then to
	 * the JobManager. The method blocks until the job has finished or the JobManager is no longer
	 * alive. In the former case, the [[SerializedJobExecutionResult]] is returned and in the latter
	 * case a [[JobExecutionException]] is thrown.
	 *
	 * @param actorSystem The actor system that performs the communication.
	 * @param config The cluster wide configuration.
	 * @param highAvailabilityServices Service factory for high availability services
	 * @param jobGraph    JobGraph describing the Flink job
	 * @param timeout     Timeout for futures
	 * @param sysoutLogUpdates prints log updates to system out if true
	 * @param classLoader The class loader for deserializing the results
	 * @return The job execution result
	 * @throws JobExecutionException Thrown if the job execution fails.
	 */
	public static JobExecutionResult submitJobAndWait(
			ActorSystem actorSystem,
			Configuration config,
			HighAvailabilityServices highAvailabilityServices,
			JobGraph jobGraph,
			FiniteDuration timeout,
			boolean sysoutLogUpdates,
			ClassLoader classLoader) throws JobExecutionException {

		JobListeningContext jobListeningContext = submitJob(
				actorSystem,
				config,
				highAvailabilityServices,
				jobGraph,
				timeout,
				sysoutLogUpdates,
				classLoader);

		return awaitJobResult(jobListeningContext);
	}

	/**
	 * Submits a job in detached mode. The method sends the JobGraph to the
	 * JobManager and waits for the answer whether the job could be started or not.
	 *
	 * @param jobManagerGateway Gateway to the JobManager which will execute the jobs
	 * @param config The cluster wide configuration.
	 * @param jobGraph The job
	 * @param timeout  Timeout in which the JobManager must have responded.
	 * @param classLoader the class loader for deserializing a failure cause
	 * @throws JobExecutionException if the submission fails, times out, or is rejected
	 */
	public static void submitJobDetached(
			ActorGateway jobManagerGateway,
			Configuration config,
			JobGraph jobGraph,
			FiniteDuration timeout,
			ClassLoader classLoader) throws JobExecutionException {

		checkNotNull(jobManagerGateway, "The jobManagerGateway must not be null.");
		checkNotNull(jobGraph, "The jobGraph must not be null.");
		checkNotNull(timeout, "The timeout must not be null.");

		LOG.info("Checking and uploading JAR files");
		try {
			jobGraph.uploadUserJars(jobManagerGateway, timeout, config);
		}
		catch (IOException e) {
			throw new JobSubmissionException(jobGraph.getJobID(),
				"Could not upload the program's JAR files to the JobManager.", e);
		}

		Object result;
		try {
			Future<Object> future = jobManagerGateway.ask(
				new JobManagerMessages.SubmitJob(
					jobGraph,
					ListeningBehaviour.DETACHED // only receive the Acknowledge for the job submission message
				),
				timeout);

			result = Await.result(future, timeout);
		}
		catch (TimeoutException e) {
			throw new JobTimeoutException(jobGraph.getJobID(),
					"JobManager did not respond within " + timeout.toString(), e);
		}
		catch (Throwable t) {
			throw new JobSubmissionException(jobGraph.getJobID(),
					"Failed to send job to JobManager: " + t.getMessage(), t.getCause());
		}

		if (result instanceof JobManagerMessages.JobSubmitSuccess) {
			JobID respondedID = ((JobManagerMessages.JobSubmitSuccess) result).jobId();

			// validate response
			if (!respondedID.equals(jobGraph.getJobID())) {
				throw new JobExecutionException(jobGraph.getJobID(),
						"JobManager responded for wrong Job. This Job: " +
						jobGraph.getJobID() + ", response: " + respondedID);
			}
		}
		else if (result instanceof JobManagerMessages.JobResultFailure) {
			try {
				SerializedThrowable t = ((JobManagerMessages.JobResultFailure) result).cause();
				throw t.deserializeError(classLoader);
			}
			catch (JobExecutionException e) {
				throw e;
			}
			catch (Throwable t) {
				throw new JobExecutionException(jobGraph.getJobID(),
						"JobSubmission failed: " + t.getMessage(), t);
			}
		}
		else {
			throw new JobExecutionException(jobGraph.getJobID(), "Unexpected response from JobManager: " + result);
		}
	}

}