/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.jobmaster;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.queryablestate.KvStateID;
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot;
import org.apache.flink.runtime.blob.BlobWriter;
import org.apache.flink.runtime.checkpoint.CheckpointMetrics;
import org.apache.flink.runtime.checkpoint.TaskStateSnapshot;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.executiongraph.ExecutionGraphException;
import org.apache.flink.runtime.executiongraph.JobStatusListener;
import org.apache.flink.runtime.heartbeat.HeartbeatListener;
import org.apache.flink.runtime.heartbeat.HeartbeatManager;
import org.apache.flink.runtime.heartbeat.HeartbeatServices;
import org.apache.flink.runtime.heartbeat.HeartbeatTarget;
import org.apache.flink.runtime.heartbeat.NoOpHeartbeatManager;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.io.network.partition.JobMasterPartitionTracker;
import org.apache.flink.runtime.io.network.partition.PartitionTrackerFactory;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.jobmanager.OnCompletionActions;
import org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException;
import org.apache.flink.runtime.jobmaster.factories.JobManagerJobMetricGroupFactory;
import org.apache.flink.runtime.jobmaster.slotpool.SlotPoolService;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.FlinkJobNotFoundException;
import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint;
import org.apache.flink.runtime.messages.webmonitor.JobDetails;
import org.apache.flink.runtime.metrics.groups.JobManagerJobMetricGroup;
import org.apache.flink.runtime.operators.coordination.CoordinationRequest;
import org.apache.flink.runtime.operators.coordination.CoordinationResponse;
import org.apache.flink.runtime.operators.coordination.OperatorEvent;
import org.apache.flink.runtime.query.KvStateLocation;
import org.apache.flink.runtime.query.UnknownKvStateLocation;
import org.apache.flink.runtime.registration.RegisteredRpcConnection;
import org.apache.flink.runtime.registration.RegistrationResponse;
import org.apache.flink.runtime.registration.RetryingRegistration;
import org.apache.flink.runtime.resourcemanager.ResourceManagerGateway;
import org.apache.flink.runtime.resourcemanager.ResourceManagerId;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.PermanentlyFencedRpcEndpoint;
import org.apache.flink.runtime.rpc.RpcService;
import org.apache.flink.runtime.rpc.akka.AkkaRpcServiceUtils;
import org.apache.flink.runtime.scheduler.ExecutionGraphInfo;
import org.apache.flink.runtime.scheduler.SchedulerNG;
import org.apache.flink.runtime.shuffle.ShuffleMaster;
import org.apache.flink.runtime.slots.ResourceRequirement;
import org.apache.flink.runtime.state.KeyGroupRange;
import org.apache.flink.runtime.taskexecutor.TaskExecutorGateway;
import org.apache.flink.runtime.taskexecutor.TaskExecutorToJobManagerHeartbeatPayload;
import org.apache.flink.runtime.taskexecutor.slot.SlotOffer;
import org.apache.flink.runtime.taskmanager.TaskExecutionState;
import org.apache.flink.runtime.taskmanager.TaskManagerLocation;
import org.apache.flink.runtime.taskmanager.TaskManagerLocation.ResolutionMode;
import org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.InstantiationUtil;
import org.apache.flink.util.SerializedValue;
import org.slf4j.Logger;
import javax.annotation.Nullable;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeoutException;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
* JobMaster implementation. The job master is responsible for the execution of a single {@link
* JobGraph}.
*
* <p>It offers the following methods as part of its rpc interface to interact with the JobMaster
* remotely:
*
* <ul>
*   <li>{@link #updateTaskExecutionState} updates the task execution state for given task
* </ul>
*/
public class JobMaster extends PermanentlyFencedRpcEndpoint<JobMasterId>
implements JobMasterGateway, JobMasterService {
/** Default names for Flink's distributed components. */
public static final String JOB_MANAGER_NAME = "jobmanager";
// ------------------------------------------------------------------------
private final JobMasterConfiguration jobMasterConfiguration;
private final ResourceID resourceId;
private final JobGraph jobGraph;
private final Time rpcTimeout;
private final HighAvailabilityServices highAvailabilityServices;
private final BlobWriter blobWriter;
private final HeartbeatServices heartbeatServices;
private final ScheduledExecutorService scheduledExecutorService;
private final OnCompletionActions jobCompletionActions;
private final FatalErrorHandler fatalErrorHandler;
private final ClassLoader userCodeLoader;
private final SlotPoolService slotPoolService;
private final long initializationTimestamp;
private final boolean retrieveTaskManagerHostName;
// --------- ResourceManager --------
private final LeaderRetrievalService resourceManagerLeaderRetriever;
// --------- TaskManagers --------
private final Map<ResourceID, Tuple2<TaskManagerLocation, TaskExecutorGateway>>
registeredTaskManagers;
private final ShuffleMaster<?> shuffleMaster;
// --------- Scheduler --------
private final SchedulerNG schedulerNG;
private final JobManagerJobStatusListener jobStatusListener;
private final JobManagerJobMetricGroup jobManagerJobMetricGroup;
// -------- Misc ---------
private final Map<String, Object> accumulators;
private final JobMasterPartitionTracker partitionTracker;
private final ExecutionDeploymentTracker executionDeploymentTracker;
private final ExecutionDeploymentReconciler executionDeploymentReconciler;
// -------- Mutable fields ---------
@Nullable private ResourceManagerAddress resourceManagerAddress;
@Nullable private ResourceManagerConnection resourceManagerConnection;
@Nullable private EstablishedResourceManagerConnection establishedResourceManagerConnection;
private HeartbeatManager<TaskExecutorToJobManagerHeartbeatPayload, AllocatedSlotReport>
taskManagerHeartbeatManager;
private HeartbeatManager<Void, Void> resourceManagerHeartbeatManager;
// ------------------------------------------------------------------------
public JobMaster(
RpcService rpcService,
JobMasterId jobMasterId,
JobMasterConfiguration jobMasterConfiguration,
ResourceID resourceId,
JobGraph jobGraph,
HighAvailabilityServices highAvailabilityService,
SlotPoolServiceSchedulerFactory slotPoolServiceSchedulerFactory,
JobManagerSharedServices jobManagerSharedServices,
HeartbeatServices heartbeatServices,
JobManagerJobMetricGroupFactory jobMetricGroupFactory,
OnCompletionActions jobCompletionActions,
FatalErrorHandler fatalErrorHandler,
ClassLoader userCodeLoader,
ShuffleMaster<?> shuffleMaster,
PartitionTrackerFactory partitionTrackerFactory,
ExecutionDeploymentTracker executionDeploymentTracker,
ExecutionDeploymentReconciler.Factory executionDeploymentReconcilerFactory,
long initializationTimestamp)
throws Exception {
super(rpcService, AkkaRpcServiceUtils.createRandomName(JOB_MANAGER_NAME), jobMasterId);
final ExecutionDeploymentReconciliationHandler executionStateReconciliationHandler =
new ExecutionDeploymentReconciliationHandler() {
@Override
public void onMissingDeploymentsOf(
Collection<ExecutionAttemptID> executionAttemptIds, ResourceID host) {
log.debug(
"Failing deployments {} due to no longer being deployed.",
executionAttemptIds);
for (ExecutionAttemptID executionAttemptId : executionAttemptIds) {
schedulerNG.updateTaskExecutionState(
new TaskExecutionState(
executionAttemptId,
ExecutionState.FAILED,
new FlinkException(
String.format(
"Execution %s is unexpectedly no longer running on task executor %s.",
executionAttemptId, host))));
}
}
@Override
public void onUnknownDeploymentsOf(
Collection<ExecutionAttemptID> executionAttemptIds, ResourceID host) {
log.debug(
"Canceling left-over deployments {} on task executor {}.",
executionAttemptIds,
host);
for (ExecutionAttemptID executionAttemptId : executionAttemptIds) {
Tuple2<TaskManagerLocation, TaskExecutorGateway> taskManagerInfo =
registeredTaskManagers.get(host);
if (taskManagerInfo != null) {
taskManagerInfo.f1.cancelTask(executionAttemptId, rpcTimeout);
}
}
}
};
this.executionDeploymentTracker = executionDeploymentTracker;
this.executionDeploymentReconciler =
executionDeploymentReconcilerFactory.create(executionStateReconciliationHandler);
this.jobMasterConfiguration = checkNotNull(jobMasterConfiguration);
this.resourceId = checkNotNull(resourceId);
this.jobGraph = checkNotNull(jobGraph);
this.rpcTimeout = jobMasterConfiguration.getRpcTimeout();
this.highAvailabilityServices = checkNotNull(highAvailabilityService);
this.blobWriter = jobManagerSharedServices.getBlobWriter();
this.scheduledExecutorService = jobManagerSharedServices.getScheduledExecutorService();
this.jobCompletionActions = checkNotNull(jobCompletionActions);
this.fatalErrorHandler = checkNotNull(fatalErrorHandler);
this.userCodeLoader = checkNotNull(userCodeLoader);
this.initializationTimestamp = initializationTimestamp;
this.retrieveTaskManagerHostName =
jobMasterConfiguration
.getConfiguration()
.getBoolean(JobManagerOptions.RETRIEVE_TASK_MANAGER_HOSTNAME);
final String jobName = jobGraph.getName();
final JobID jid = jobGraph.getJobID();
log.info("Initializing job {} ({}).", jobName, jid);
resourceManagerLeaderRetriever =
highAvailabilityServices.getResourceManagerLeaderRetriever();
this.slotPoolService =
checkNotNull(slotPoolServiceSchedulerFactory).createSlotPoolService(jid);
this.registeredTaskManagers = new HashMap<>(4);
this.partitionTracker =
checkNotNull(partitionTrackerFactory)
.create(
resourceID -> {
Tuple2<TaskManagerLocation, TaskExecutorGateway>
taskManagerInfo =
registeredTaskManagers.get(resourceID);
if (taskManagerInfo == null) {
return Optional.empty();
}
return Optional.of(taskManagerInfo.f1);
});
this.shuffleMaster = checkNotNull(shuffleMaster);
this.jobManagerJobMetricGroup = jobMetricGroupFactory.create(jobGraph);
this.jobStatusListener = new JobManagerJobStatusListener();
this.schedulerNG =
createScheduler(
slotPoolServiceSchedulerFactory,
executionDeploymentTracker,
jobManagerJobMetricGroup,
jobStatusListener);
this.heartbeatServices = checkNotNull(heartbeatServices);
this.taskManagerHeartbeatManager = NoOpHeartbeatManager.getInstance();
this.resourceManagerHeartbeatManager = NoOpHeartbeatManager.getInstance();
this.resourceManagerConnection = null;
this.establishedResourceManagerConnection = null;
this.accumulators = new HashMap<>();
}
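/**
* Creates the {@link SchedulerNG} for this job through the given factory, wiring in the slot
* pool service, shuffle master, partition tracker and the main thread executor of this endpoint.
*/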
private SchedulerNG createScheduler(
SlotPoolServiceSchedulerFactory slotPoolServiceSchedulerFactory,
ExecutionDeploymentTracker executionDeploymentTracker,
JobManagerJobMetricGroup jobManagerJobMetricGroup,
JobStatusListener jobStatusListener)
throws Exception {
final SchedulerNG scheduler =
slotPoolServiceSchedulerFactory.createScheduler(
log,
jobGraph,
scheduledExecutorService,
jobMasterConfiguration.getConfiguration(),
slotPoolService,
scheduledExecutorService,
userCodeLoader,
highAvailabilityServices.getCheckpointRecoveryFactory(),
rpcTimeout,
blobWriter,
jobManagerJobMetricGroup,
jobMasterConfiguration.getSlotRequestTimeout(),
shuffleMaster,
partitionTracker,
executionDeploymentTracker,
initializationTimestamp,
getMainThreadExecutor(),
fatalErrorHandler,
jobStatusListener);
return scheduler;
}
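/** Creates the heartbeat manager that monitors the connected ResourceManager. */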
private HeartbeatManager<Void, Void> createResourceManagerHeartbeatManager(
HeartbeatServices heartbeatServices) {
return heartbeatServices.createHeartbeatManager(
resourceId, new ResourceManagerHeartbeatListener(), getMainThreadExecutor(), log);
}
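/** Creates the heartbeat manager (sender side) that monitors all registered TaskManagers. */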
private HeartbeatManager<TaskExecutorToJobManagerHeartbeatPayload, AllocatedSlotReport>
createTaskManagerHeartbeatManager(HeartbeatServices heartbeatServices) {
return heartbeatServices.createHeartbeatManagerSender(
resourceId, new TaskManagerHeartbeatListener(), getMainThreadExecutor(), log);
}
// ----------------------------------------------------------------------------------------------
// Lifecycle management
// ----------------------------------------------------------------------------------------------
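/**
* Starts the execution of the job. If the job cannot be started, the failure is forwarded to
* {@code handleJobMasterError} and rethrown as a {@link JobMasterException}.
*/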
@Override
protected void onStart() throws JobMasterException {
try {
startJobExecution();
} catch (Exception e) {
final JobMasterException jobMasterException =
new JobMasterException("Could not start the JobMaster.", e);
handleJobMasterError(jobMasterException);
throw jobMasterException;
}
}
/** Suspend the job and shut down all other services including rpc. */
@Override
public CompletableFuture<Void> onStop() {
log.info("Stopping the JobMaster for job {}({}).", jobGraph.getName(), jobGraph.getJobID());
// make sure there is a graceful exit
return stopJobExecution(
new FlinkException(
String.format(
"Stopping JobMaster for job %s(%s).",
jobGraph.getName(), jobGraph.getJobID())))
.exceptionally(
exception -> {
throw new CompletionException(
new JobMasterException(
"Could not properly stop the JobMaster.", exception));
});
}
// ----------------------------------------------------------------------------------------------
// RPC methods
// ----------------------------------------------------------------------------------------------
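/** Cancels the currently executing job via the scheduler. */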
@Override
public CompletableFuture<Acknowledge> cancel(Time timeout) {
schedulerNG.cancel();
return CompletableFuture.completedFuture(Acknowledge.get());
}
/**
* Updates the task execution state for a given task.
*
* @param taskExecutionState New task execution state for a given task
* @return Acknowledge the task execution state update
*/
@Override
public CompletableFuture<Acknowledge> updateTaskExecutionState(
final TaskExecutionState taskExecutionState) {
FlinkException taskExecutionException;
try {
checkNotNull(taskExecutionState, "taskExecutionState");
if (schedulerNG.updateTaskExecutionState(taskExecutionState)) {
return CompletableFuture.completedFuture(Acknowledge.get());
} else {
taskExecutionException =
new ExecutionGraphException(
"The execution attempt "
+ taskExecutionState.getID()
+ " was not found.");
}
} catch (Exception e) {
taskExecutionException =
new JobMasterException(
"Could not update the state of task execution for JobMaster.", e);
handleJobMasterError(taskExecutionException);
}
return FutureUtils.completedExceptionally(taskExecutionException);
}
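// Example (sketch): a TaskExecutor-side caller would report a state transition through the
// JobMasterGateway, e.g.
//   jobMasterGateway.updateTaskExecutionState(
//           new TaskExecutionState(executionAttemptId, ExecutionState.FINISHED));
// where jobMasterGateway and executionAttemptId are assumed to be available at the call site.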
@Override
public CompletableFuture<SerializedInputSplit> requestNextInputSplit(
final JobVertexID vertexID, final ExecutionAttemptID executionAttempt) {
try {
return CompletableFuture.completedFuture(
schedulerNG.requestNextInputSplit(vertexID, executionAttempt));
} catch (IOException e) {
log.warn("Error while requesting next input split", e);
return FutureUtils.completedExceptionally(e);
}
}
@Override
public CompletableFuture<ExecutionState> requestPartitionState(
final IntermediateDataSetID intermediateResultId,
final ResultPartitionID resultPartitionId) {
try {
return CompletableFuture.completedFuture(
schedulerNG.requestPartitionState(intermediateResultId, resultPartitionId));
} catch (PartitionProducerDisposedException e) {
log.info("Error while requesting partition state", e);
return FutureUtils.completedExceptionally(e);
}
}
@Override
public CompletableFuture<Acknowledge> notifyPartitionDataAvailable(
final ResultPartitionID partitionID, final Time timeout) {
schedulerNG.notifyPartitionDataAvailable(partitionID);
return CompletableFuture.completedFuture(Acknowledge.get());
}
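/**
* Disconnects the given TaskExecutor from this JobMaster: stops heartbeat monitoring, releases
* its slots and tracked partitions, and asks the TaskExecutor to disconnect from this job.
*/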
@Override
public CompletableFuture<Acknowledge> disconnectTaskManager(
final ResourceID resourceID, final Exception cause) {
log.debug(
"Disconnect TaskExecutor {} because: {}",
resourceID.getStringWithMetadata(),
cause.getMessage());
taskManagerHeartbeatManager.unmonitorTarget(resourceID);
slotPoolService.releaseTaskManager(resourceID, cause);
partitionTracker.stopTrackingPartitionsFor(resourceID);
Tuple2<TaskManagerLocation, TaskExecutorGateway> taskManagerConnection =
registeredTaskManagers.remove(resourceID);
if (taskManagerConnection != null) {
taskManagerConnection.f1.disconnectJobManager(jobGraph.getJobID(), cause);
}
return CompletableFuture.completedFuture(Acknowledge.get());
}
// TODO: This method needs a leader session ID
@Override
public void acknowledgeCheckpoint(
final JobID jobID,
final ExecutionAttemptID executionAttemptID,
final long checkpointId,
final CheckpointMetrics checkpointMetrics,
final TaskStateSnapshot checkpointState) {
schedulerNG.acknowledgeCheckpoint(
jobID, executionAttemptID, checkpointId, checkpointMetrics, checkpointState);
}
@Override
public void reportCheckpointMetrics(
JobID jobID,
ExecutionAttemptID executionAttemptID,
long checkpointId,
CheckpointMetrics checkpointMetrics) {
schedulerNG.reportCheckpointMetrics(
jobID, executionAttemptID, checkpointId, checkpointMetrics);
}
// TODO: This method needs a leader session ID
@Override
public void declineCheckpoint(DeclineCheckpoint decline) {
schedulerNG.declineCheckpoint(decline);
}
@Override
public CompletableFuture<Acknowledge> sendOperatorEventToCoordinator(
final ExecutionAttemptID task,
final OperatorID operatorID,
final SerializedValue<OperatorEvent> serializedEvent) {
try {
final OperatorEvent evt = serializedEvent.deserializeValue(userCodeLoader);
schedulerNG.deliverOperatorEventToCoordinator(task, operatorID, evt);
return CompletableFuture.completedFuture(Acknowledge.get());
} catch (Exception e) {
return FutureUtils.completedExceptionally(e);
}
}
@Override
public CompletableFuture<KvStateLocation> requestKvStateLocation(
final JobID jobId, final String registrationName) {
try {
return CompletableFuture.completedFuture(
schedulerNG.requestKvStateLocation(jobId, registrationName));
} catch (UnknownKvStateLocation | FlinkJobNotFoundException e) {
log.info("Error while request key-value state location", e);
return FutureUtils.completedExceptionally(e);
}
}
@Override
public CompletableFuture<Acknowledge> notifyKvStateRegistered(
final JobID jobId,
final JobVertexID jobVertexId,
final KeyGroupRange keyGroupRange,
final String registrationName,
final KvStateID kvStateId,
final InetSocketAddress kvStateServerAddress) {
try {
schedulerNG.notifyKvStateRegistered(
jobId,
jobVertexId,
keyGroupRange,
registrationName,
kvStateId,
kvStateServerAddress);
return CompletableFuture.completedFuture(Acknowledge.get());
} catch (FlinkJobNotFoundException e) {
log.info("Error while receiving notification about key-value state registration", e);
return FutureUtils.completedExceptionally(e);
}
}
@Override
public CompletableFuture<Acknowledge> notifyKvStateUnregistered(
JobID jobId,
JobVertexID jobVertexId,
KeyGroupRange keyGroupRange,
String registrationName) {
try {
schedulerNG.notifyKvStateUnregistered(
jobId, jobVertexId, keyGroupRange, registrationName);
return CompletableFuture.completedFuture(Acknowledge.get());
} catch (FlinkJobNotFoundException e) {
log.info("Error while receiving notification about key-value state de-registration", e);
return FutureUtils.completedExceptionally(e);
}
}
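/**
* Offers the given slots of a registered TaskManager to the slot pool. Offers from unknown
* TaskManagers are rejected with an exceptionally completed future.
*/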
@Override
public CompletableFuture<Collection<SlotOffer>> offerSlots(
final ResourceID taskManagerId, final Collection<SlotOffer> slots, final Time timeout) {
Tuple2<TaskManagerLocation, TaskExecutorGateway> taskManager =
registeredTaskManagers.get(taskManagerId);
if (taskManager == null) {
return FutureUtils.completedExceptionally(
new Exception("Unknown TaskManager " + taskManagerId));
}
final TaskManagerLocation taskManagerLocation = taskManager.f0;
final TaskExecutorGateway taskExecutorGateway = taskManager.f1;
final RpcTaskManagerGateway rpcTaskManagerGateway =
new RpcTaskManagerGateway(taskExecutorGateway, getFencingToken());
return CompletableFuture.completedFuture(
slotPoolService.offerSlots(taskManagerLocation, rpcTaskManagerGateway, slots));
}
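// Example (sketch): on the TaskExecutor side, slots are offered through the JobMasterGateway,
// e.g.
//   CompletableFuture<Collection<SlotOffer>> accepted =
//           jobMasterGateway.offerSlots(taskManagerResourceId, slotOffers, timeout);
// where taskManagerResourceId and slotOffers are assumed to be known at the call site; only
// offers from TaskManagers already registered at this JobMaster are accepted.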
@Override
public void failSlot(
final ResourceID taskManagerId,
final AllocationID allocationId,
final Exception cause) {
if (registeredTaskManagers.containsKey(taskManagerId)) {
internalFailAllocation(taskManagerId, allocationId, cause);
} else {
log.warn(
"Cannot fail slot "
+ allocationId
+ " because the TaskManager "
+ taskManagerId
+ " is unknown.");
}
}
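// Fails the given allocation in the slot pool. If the slot pool reports back the owning
// TaskManager and no partitions are tracked for it, the now empty TaskManager is disconnected.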
private void internalFailAllocation(
@Nullable ResourceID resourceId, AllocationID allocationId, Exception cause) {
final Optional<ResourceID> resourceIdOptional =
slotPoolService.failAllocation(resourceId, allocationId, cause);
resourceIdOptional.ifPresent(
taskManagerId -> {
if (!partitionTracker.isTrackingPartitionsFor(taskManagerId)) {
releaseEmptyTaskManager(taskManagerId);
}
});
}
private void releaseEmptyTaskManager(ResourceID resourceId) {
disconnectTaskManager(
resourceId,
new FlinkException(
String.format(
"No more slots registered at JobMaster %s.",
resourceId.getStringWithMetadata())));
}
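/**
* Registers a TaskManager at this JobMaster. Registrations for a different job id are rejected;
* otherwise the TaskManager location is resolved, a {@link TaskExecutorGateway} connection is
* established, and the TaskManager is added to the slot pool and monitored via heartbeats.
*/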
@Override
public CompletableFuture<RegistrationResponse> registerTaskManager(
final String taskManagerRpcAddress,
final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation,
final JobID jobId,
final Time timeout) {
if (!jobGraph.getJobID().equals(jobId)) {
log.debug(
"Rejecting TaskManager registration attempt because of wrong job id {}.",
jobId);
return CompletableFuture.completedFuture(
new JMTMRegistrationRejection(
String.format(
"The JobManager is not responsible for job %s. Maybe the TaskManager used outdated connection information.",
jobId)));
}
final TaskManagerLocation taskManagerLocation;
try {
if (retrieveTaskManagerHostName) {
taskManagerLocation =
TaskManagerLocation.fromUnresolvedLocation(
unresolvedTaskManagerLocation, ResolutionMode.RETRIEVE_HOST_NAME);
} else {
taskManagerLocation =
TaskManagerLocation.fromUnresolvedLocation(
unresolvedTaskManagerLocation, ResolutionMode.USE_IP_ONLY);
}
} catch (Throwable throwable) {
final String errMsg =
String.format(
"Could not accept TaskManager registration. TaskManager address %s cannot be resolved. %s",
unresolvedTaskManagerLocation.getExternalAddress(),
throwable.getMessage());
log.error(errMsg);
return CompletableFuture.completedFuture(
new RegistrationResponse.Failure(new FlinkException(errMsg, throwable)));
}
final ResourceID taskManagerId = taskManagerLocation.getResourceID();
if (registeredTaskManagers.containsKey(taskManagerId)) {
final RegistrationResponse response = new JMTMRegistrationSuccess(resourceId);
return CompletableFuture.completedFuture(response);
} else {
return getRpcService()
.connect(taskManagerRpcAddress, TaskExecutorGateway.class)
.handleAsync(
(TaskExecutorGateway taskExecutorGateway, Throwable throwable) -> {
if (throwable != null) {
return new RegistrationResponse.Failure(throwable);
}
slotPoolService.registerTaskManager(taskManagerId);
registeredTaskManagers.put(
taskManagerId,
Tuple2.of(taskManagerLocation, taskExecutorGateway));
// monitor the task manager as heartbeat target
taskManagerHeartbeatManager.monitorTarget(
taskManagerId,
new HeartbeatTarget<AllocatedSlotReport>() {
@Override
public void receiveHeartbeat(
ResourceID resourceID,
AllocatedSlotReport payload) {
// the task manager will not request heartbeat, so
// this method will never be called currently
}
@Override
public void requestHeartbeat(
ResourceID resourceID,
AllocatedSlotReport allocatedSlotReport) {
taskExecutorGateway.heartbeatFromJobManager(
resourceID, allocatedSlotReport);
}
});
return new JMTMRegistrationSuccess(resourceId);
},
getMainThreadExecutor());
}
}
@Override
public void disconnectResourceManager(
final ResourceManagerId resourceManagerId, final Exception cause) {
if (isConnectingToResourceManager(resourceManagerId)) {
reconnectToResourceManager(cause);
}
}
private boolean isConnectingToResourceManager(ResourceManagerId resourceManagerId) {
return resourceManagerAddress != null
&& resourceManagerAddress.getResourceManagerId().equals(resourceManagerId);
}
@Override
public void heartbeatFromTaskManager(
final ResourceID resourceID, TaskExecutorToJobManagerHeartbeatPayload payload) {
taskManagerHeartbeatManager.receiveHeartbeat(resourceID, payload);
}
@Override
public void heartbeatFromResourceManager(final ResourceID resourceID) {
resourceManagerHeartbeatManager.requestHeartbeat(resourceID, null);
}
@Override
public CompletableFuture<JobDetails> requestJobDetails(Time timeout) {
return CompletableFuture.completedFuture(schedulerNG.requestJobDetails());
}
@Override
public CompletableFuture<JobStatus> requestJobStatus(Time timeout) {
return CompletableFuture.completedFuture(schedulerNG.requestJobStatus());
}
@Override
public CompletableFuture<ExecutionGraphInfo> requestJob(Time timeout) {
return CompletableFuture.completedFuture(schedulerNG.requestJob());
}
@Override
public CompletableFuture<String> triggerSavepoint(
@Nullable final String targetDirectory, final boolean cancelJob, final Time timeout) {
return schedulerNG.triggerSavepoint(targetDirectory, cancelJob);
}
@Override
public CompletableFuture<String> stopWithSavepoint(
@Nullable final String targetDirectory, final boolean terminate, final Time timeout) {
return schedulerNG.stopWithSavepoint(targetDirectory, terminate);
}
@Override
public void notifyAllocationFailure(AllocationID allocationID, Exception cause) {
internalFailAllocation(null, allocationID, cause);
}
@Override
public void notifyNotEnoughResourcesAvailable(
Collection<ResourceRequirement> acquiredResources) {
slotPoolService.notifyNotEnoughResourcesAvailable(acquiredResources);
}
@Override
public CompletableFuture