
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.impl;
import com.hazelcast.cluster.Address;
import com.hazelcast.cluster.Member;
import com.hazelcast.core.HazelcastInstanceNotActiveException;
import com.hazelcast.core.MemberLeftException;
import com.hazelcast.internal.cluster.MemberInfo;
import com.hazelcast.internal.cluster.impl.ClusterServiceImpl;
import com.hazelcast.internal.cluster.impl.MembershipManager;
import com.hazelcast.internal.cluster.impl.operations.TriggerMemberListPublishOp;
import com.hazelcast.internal.metrics.DynamicMetricsProvider;
import com.hazelcast.internal.metrics.MetricDescriptor;
import com.hazelcast.internal.metrics.MetricsCollectionContext;
import com.hazelcast.internal.metrics.MetricsRegistry;
import com.hazelcast.internal.metrics.Probe;
import com.hazelcast.internal.metrics.collectors.MetricsCollector;
import com.hazelcast.internal.metrics.impl.MetricsCompressor;
import com.hazelcast.internal.util.counters.Counter;
import com.hazelcast.internal.util.counters.MwCounter;
import com.hazelcast.jet.Util;
import com.hazelcast.jet.core.TopologyChangedException;
import com.hazelcast.jet.core.metrics.MetricNames;
import com.hazelcast.jet.core.metrics.MetricTags;
import com.hazelcast.jet.impl.deployment.JetDelegatingClassLoader;
import com.hazelcast.jet.impl.exception.ExecutionNotFoundException;
import com.hazelcast.jet.impl.exception.JobTerminateRequestedException;
import com.hazelcast.jet.impl.execution.ExecutionContext;
import com.hazelcast.jet.impl.execution.ExecutionContext.SenderReceiverKey;
import com.hazelcast.jet.impl.execution.SenderTasklet;
import com.hazelcast.jet.impl.execution.TaskletExecutionService;
import com.hazelcast.jet.impl.execution.init.ExecutionPlan;
import com.hazelcast.jet.impl.metrics.RawJobMetrics;
import com.hazelcast.jet.impl.operation.CheckLightJobsOperation;
import com.hazelcast.jet.impl.util.ExceptionUtil;
import com.hazelcast.logging.ILogger;
import com.hazelcast.spi.exception.RetryableHazelcastException;
import com.hazelcast.spi.exception.TargetNotMemberException;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.operationservice.Operation;
import com.hazelcast.spi.impl.operationservice.impl.InvocationFuture;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ScheduledFuture;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import static com.hazelcast.internal.util.ExceptionUtil.sneakyThrow;
import static com.hazelcast.internal.util.ExceptionUtil.withTryCatch;
import static com.hazelcast.jet.Util.idToString;
import static com.hazelcast.jet.impl.JetServiceBackend.SERVICE_NAME;
import static com.hazelcast.jet.impl.JobClassLoaderService.JobPhase.EXECUTION;
import static com.hazelcast.jet.impl.TerminationMode.CANCEL_FORCEFUL;
import static com.hazelcast.jet.impl.util.ExceptionUtil.isOrHasCause;
import static com.hazelcast.jet.impl.util.ExceptionUtil.peel;
import static com.hazelcast.jet.impl.util.Util.doWithClassLoader;
import static com.hazelcast.jet.impl.util.Util.jobIdAndExecutionId;
import static java.util.Collections.newSetFromMap;
import static java.util.Collections.singleton;
import static java.util.concurrent.CompletableFuture.completedFuture;
import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static java.util.stream.Collectors.toSet;
/**
* Service to handle ExecutionContexts on all cluster members. Job-control
* operations from the coordinator are handled here.
*/
public class JobExecutionService implements DynamicMetricsProvider {
/**
* A timeout after which we cancel a light job that doesn't receive InitOp
* from the coordinator. {@link ExecutionContext} can be created in
* response to data packet received for that execution, but it doesn't know
* the coordinator. Therefore, the checker cannot confirm with the
* coordinator if it still exists. We terminate these jobs after a timeout.
* However, the timeout has to be long enough because if the job happens to
* be initialized later, we'll lose data, and we won't even detect it. It can
* also happen that we lose a DONE_ITEM and the job will get stuck, though
* that's better than incorrect results.
*/
private static final long UNINITIALIZED_CONTEXT_MAX_AGE_NS = MINUTES.toNanos(5);
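// How long the ID of a failed/cancelled execution is remembered in failedJobs
// before being evicted by checkExecutions()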
private static final long FAILED_EXECUTION_EXPIRY_NS = SECONDS.toNanos(5);
private static final CompletableFuture<?>[] EMPTY_COMPLETABLE_FUTURE_ARRAY = new CompletableFuture[0];
private final Object mutex = new Object();
private final NodeEngineImpl nodeEngine;
private final ILogger logger;
private final TaskletExecutionService taskletExecutionService;
private final JobClassLoaderService jobClassloaderService;
private final Set<Long> executionContextJobIds = newSetFromMap(new ConcurrentHashMap<>());
// key: executionId
private final ConcurrentMap<Long, ExecutionContext> executionContexts = new ConcurrentHashMap<>();
/**
* Key: executionId
* Value: expiry time (as per System.nanoTime())
*
* This map contains executions that failed or were cancelled. These
* executions are very likely to receive further data packets
* from other members whose executions are concurrently cancelled
* too. If we kept no track of these executions, in failure-heavy or
* cancellation-heavy scenarios a significant amount of memory could
* be held for the time defined in {@link
* #UNINITIALIZED_CONTEXT_MAX_AGE_NS}, see
* issue #19897.
*/
private final ConcurrentMap<Long, Long> failedJobs = new ConcurrentHashMap<>();
@Probe(name = MetricNames.JOB_EXECUTIONS_STARTED)
private final Counter executionStarted = MwCounter.newMwCounter();
@Probe(name = MetricNames.JOB_EXECUTIONS_COMPLETED)
private final Counter executionCompleted = MwCounter.newMwCounter();
private final Function<? super Long, ? extends ExecutionContext> newLightJobExecutionContextFunction;
private final ScheduledFuture<?> lightExecutionsCheckerFuture;
JobExecutionService(NodeEngineImpl nodeEngine, TaskletExecutionService taskletExecutionService,
JobClassLoaderService jobClassloaderService) {
this.nodeEngine = nodeEngine;
this.logger = nodeEngine.getLogger(getClass());
this.taskletExecutionService = taskletExecutionService;
this.jobClassloaderService = jobClassloaderService;
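// Creation function for light-job execution contexts, used by
// getOrCreateExecutionContext(). Returns null (i.e. creates nothing) if
// the execution is known to have recently failed, see failedJobs.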
newLightJobExecutionContextFunction = execId ->
failedJobs.containsKey(execId)
? null
: new ExecutionContext(nodeEngine, execId, execId, true);
// register metrics
MetricsRegistry registry = nodeEngine.getMetricsRegistry();
MetricDescriptor descriptor = registry.newMetricDescriptor()
.withTag(MetricTags.MODULE, "jet");
registry.registerStaticMetrics(descriptor, this);
this.lightExecutionsCheckerFuture = nodeEngine.getExecutionService().scheduleWithRepetition(
this::checkExecutions, 0, 1, SECONDS);
}
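/**
* Returns the execution ID of some locally-running execution of the given
* job, or null if there is none.
*/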
public Long getExecutionIdForJobId(long jobId) {
return executionContexts.values().stream()
.filter(ec -> ec.jobId() == jobId)
.findAny()
.map(ExecutionContext::executionId)
.orElse(null);
}
public ExecutionContext getExecutionContext(long executionId) {
return executionContexts.get(executionId);
}
/**
* Gets the execution context or creates it, if it doesn't exist. If
* we're creating it, we assume it's for a light job and that the
* jobId == executionId. Might return null if the job with the given
* ID recently failed.
*
* We can also end up here for a non-light job in this scenario:<ul>
*     <li>job runs on 2 members. The master requests termination.
*     <li>execution on member A terminates and is removed from
*         executionContexts
*     <li>member A receives a packet from member B (because it was in transit
*         or simply because the execution on member B might terminate a little
*         later)
*     <li>ExecutionContext is recreated.
* </ul>
*
* We ignore this as we assume that we'll never receive the
* StartExecutionOperation. The improperly-created ExecutionContext will be
* removed after a timeout in {@link #checkExecutions()} because it
* will never be initialized.
*
* We mitigate the number of execution contexts created after a job
* has failed by checking the {@link #failedJobs} map before re-creating
* the execution context in this method.
*/
@Nullable
public ExecutionContext getOrCreateExecutionContext(long executionId) {
return executionContexts.computeIfAbsent(executionId, newLightJobExecutionContextFunction);
}
public Collection<ExecutionContext> getExecutionContexts() {
return executionContexts.values();
}
public ConcurrentMap<Long, Long> getFailedJobs() {
return failedJobs;
}
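/**
* Returns the sender tasklets of the given execution, keyed by {@link
* SenderReceiverKey}, or null if the execution doesn't exist locally.
*/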
Map<SenderReceiverKey, SenderTasklet> getSenderMap(long executionId) {
ExecutionContext ctx = executionContexts.get(executionId);
return ctx != null ? ctx.senderMap() : null;
}
public void shutdown() {
lightExecutionsCheckerFuture.cancel(false);
synchronized (mutex) {
cancelAllExecutions("Node is shutting down");
}
}
public void reset() {
cancelAllExecutions("reset");
}
/**
* Cancels all ongoing executions, logging the given reason.
*/
@SuppressWarnings("rawtypes")
public void cancelAllExecutions(String reason) {
// ConcurrentHashMap.values() is a live view of the underlying map: if another thread mutates the map, the
// collection returned by values() changes as well. That's why we use an ArrayList here instead of an array,
// the number of items may change.
Collection<ExecutionContext> contexts = executionContexts.values();
List<CompletableFuture<?>> futures = new ArrayList<>(contexts.size());
for (ExecutionContext exeCtx : contexts) {
logger.fine("Completing %s locally. Reason: %s", exeCtx.jobNameAndExecutionId(), reason);
futures.add(terminateExecution0(exeCtx, null, new CancellationException()));
}
CompletableFuture.allOf(futures.toArray(EMPTY_COMPLETABLE_FUTURE_ARRAY)).join();
}
/**
* Cancels executions that have the leaving member as their coordinator or
* as a job participant.
*/
@SuppressWarnings("rawtypes")
void onMemberRemoved(Member member) {
Address address = member.getAddress();
CompletableFuture[] terminationFutures =
executionContexts.values().stream()
// note that coordinator might not be a participant
// (in case it is a lite member)
.filter(exeCtx -> exeCtx.coordinator() != null
&& (exeCtx.coordinator().equals(address) || exeCtx.hasParticipant(address)))
.map(exeCtx -> {
logger.fine("Completing %s locally. Reason: Member %s left the cluster",
exeCtx.jobNameAndExecutionId(), address);
return terminateExecution0(exeCtx, null, new MemberLeftException(member));
})
.toArray(CompletableFuture[]::new);
CompletableFuture.allOf(terminationFutures).join();
}
public CompletableFuture<RawJobMetrics> runLightJob(
long jobId,
long executionId,
Address coordinator,
int coordinatorMemberListVersion,
Set<MemberInfo> participants,
ExecutionPlan plan
) {
assert executionId == jobId : "executionId(" + idToString(executionId) + ") != jobId(" + idToString(jobId) + ")";
verifyClusterInformation(jobId, executionId, coordinator, coordinatorMemberListVersion, participants);
failIfNotRunning();
ExecutionContext execCtx;
synchronized (mutex) {
addExecutionContextJobId(jobId, executionId, coordinator);
execCtx = executionContexts.computeIfAbsent(executionId,
x -> new ExecutionContext(nodeEngine, jobId, executionId, true));
}
Set<Address> addresses = participants.stream().map(MemberInfo::getAddress).collect(toSet());
return execCtx.initialize(coordinator, addresses, plan)
.whenComplete((r, e) -> {
if (e != null) {
completeExecution(execCtx, new CancellationException()).join();
}
})
.thenAccept(r -> {
// initial log entry with all of jobId, jobName, executionId
if (logger.isFineEnabled()) {
logger.fine("Execution plan for light job ID=" + idToString(jobId)
+ ", jobName=" + (execCtx.jobName() != null ? '\'' + execCtx.jobName() + '\'' : "null")
+ ", executionId=" + idToString(executionId) + " initialized, will start the execution");
}
})
.thenCompose(r -> beginExecution0(execCtx, false));
}
/**
* Initiates the given execution if the local node accepts the coordinator
* as its master, and has up-to-date member list information.<ul>
*     <li>If the local node has a stale member list, it retries the init
*         operation until it receives the new member list from the master.
*     <li>If the local node detects that the member list changed after the
*         init operation was sent but before it was executed, then it sends
*         a graceful failure so that the job init will be retried properly.
*     <li>If there is an already ongoing execution for the given job, then
*         the init execution is retried.
* </ul>
*/
public CompletableFuture<Void> initExecution(
long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion,
Set<MemberInfo> participants, ExecutionPlan plan
) {
ExecutionContext execCtx = addExecutionContext(
jobId, executionId, coordinator, coordinatorMemberListVersion, participants);
Set<Address> addresses = participants.stream().map(MemberInfo::getAddress).collect(toSet());
ClassLoader jobCl = jobClassloaderService.getClassLoader(jobId);
return doWithClassLoader(jobCl,
() -> execCtx.initialize(coordinator, addresses, plan))
.thenAccept(r -> {
// initial log entry with all of jobId, jobName, executionId
logger.info("Execution plan for jobId=" + idToString(jobId)
+ ", jobName=" + (execCtx.jobName() != null ? '\'' + execCtx.jobName() + '\'' : "null")
+ ", executionId=" + idToString(executionId) + " initialized");
});
}
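/**
* Registers the jobId so that at most one execution of the job exists on
* this member at a time. If the jobId is already registered, throws an
* IllegalStateException (a context already exists for the same executionId)
* or a RetryableHazelcastException (a different, possibly still completing,
* execution of the same job exists), so that the coordinator retries.
*/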
private void addExecutionContextJobId(long jobId, long executionId, Address coordinator) {
if (!executionContextJobIds.add(jobId)) {
ExecutionContext current = executionContexts.get(executionId);
if (current != null) {
throw new IllegalStateException(String.format(
"Execution context for %s for coordinator %s already exists for coordinator %s",
current.jobNameAndExecutionId(), coordinator, current.coordinator()));
}
// search contexts for one with different executionId, but same jobId
if (logger.isFineEnabled()) {
executionContexts.values().stream()
.filter(e -> e.jobId() == jobId)
.forEach(e -> logger.fine(String.format(
"Execution context for job %s for coordinator %s already exists"
+ " with local execution %s for coordinator %s",
idToString(jobId), coordinator, idToString(e.executionId()),
e.coordinator())));
}
throw new RetryableHazelcastException();
}
}
private ExecutionContext addExecutionContext(
long jobId,
long executionId,
Address coordinator,
int coordinatorMemberListVersion,
Set<MemberInfo> participants
) {
ExecutionContext execCtx;
ExecutionContext oldContext;
try {
assertIsMaster(jobId, executionId, coordinator);
verifyClusterInformation(jobId, executionId, coordinator, coordinatorMemberListVersion, participants);
failIfNotRunning();
synchronized (mutex) {
addExecutionContextJobId(jobId, executionId, coordinator);
execCtx = new ExecutionContext(nodeEngine, jobId, executionId, false);
oldContext = executionContexts.put(executionId, execCtx);
}
} catch (Throwable t) {
// The classloader was created in InitExecutionOperation#deserializePlan().
// If the InitExecutionOperation#doRun() fails before ExecutionContext is added
// to executionContexts, then classloader must be removed in order to not have leaks.
jobClassloaderService.tryRemoveClassloadersForJob(jobId, EXECUTION);
throw t;
}
if (oldContext != null) {
throw new RuntimeException("Duplicate ExecutionContext for execution " + Util.idToString(executionId));
}
return execCtx;
}
private void assertIsMaster(long jobId, long executionId, Address coordinator) {
Address masterAddress = nodeEngine.getMasterAddress();
if (!coordinator.equals(masterAddress)) {
failIfNotRunning();
throw new IllegalStateException(String.format(
"Coordinator %s cannot initialize %s. Reason: it is not the master, the master is %s",
coordinator, jobIdAndExecutionId(jobId, executionId), masterAddress));
}
}
private void verifyClusterInformation(long jobId, long executionId, Address coordinator,
int coordinatorMemberListVersion, Set<MemberInfo> participants) {
Address masterAddress = nodeEngine.getMasterAddress();
ClusterServiceImpl clusterService = (ClusterServiceImpl) nodeEngine.getClusterService();
MembershipManager membershipManager = clusterService.getMembershipManager();
int localMemberListVersion = membershipManager.getMemberListVersion();
Address thisAddress = nodeEngine.getThisAddress();
if (coordinatorMemberListVersion > localMemberListVersion) {
if (masterAddress == null) {
// we expect that master will eventually be known to this member (a new master will be
// elected or split brain merge will happen).
throw new RetryableHazelcastException(String.format(
"Cannot initialize %s for coordinator %s, local member list version %s," +
" coordinator member list version %s. And also, since the master address" +
" is not known to this member, cannot request a new member list from master.",
jobIdAndExecutionId(jobId, executionId), coordinator, localMemberListVersion,
coordinatorMemberListVersion));
}
assert !masterAddress.equals(thisAddress) : String.format(
"Local node: %s is master but InitOperation has coordinator member list version: %s larger than"
+ " local member list version: %s", thisAddress, coordinatorMemberListVersion,
localMemberListVersion);
nodeEngine.getOperationService().send(new TriggerMemberListPublishOp(), masterAddress);
throw new RetryableHazelcastException(String.format(
"Cannot initialize %s for coordinator %s, local member list version %s," +
" coordinator member list version %s",
jobIdAndExecutionId(jobId, executionId), coordinator, localMemberListVersion,
coordinatorMemberListVersion));
}
// The participant members may receive the new member list before the
// coordinator does, so we can also get into the
// "coordinatorMemberListVersion < localMemberListVersion" case. If this
// situation occurs when a job participant leaves, then the job start will
// fail. Since the unknown-participant situation can't be resolved by
// retrying the InitExecutionOperation in this case, we do nothing here
// and let it fail below if some participant isn't found.
// The job start won't fail if this situation occurs when a new member
// is added to the cluster, because all job participants are known to the
// other participating members. The only disadvantage is that the
// newly added member will not be a job participant and the partition
// mapping may not be completely proper in this case.
boolean isLocalMemberParticipant = false;
for (MemberInfo participant : participants) {
if (participant.getAddress().equals(thisAddress)) {
isLocalMemberParticipant = true;
}
if (membershipManager.getMember(participant.getAddress(), participant.getUuid()) == null) {
throw new TopologyChangedException(String.format(
"Cannot initialize %s for coordinator %s: participant %s not found in local member list." +
" Local member list version: %s, coordinator member list version: %s",
jobIdAndExecutionId(jobId, executionId), coordinator, participant,
localMemberListVersion, coordinatorMemberListVersion));
}
}
if (!isLocalMemberParticipant) {
throw new IllegalArgumentException(String.format(
"Cannot initialize %s since member %s is not in participants: %s",
jobIdAndExecutionId(jobId, executionId), thisAddress, participants));
}
}
private void failIfNotRunning() {
if (!nodeEngine.isRunning()) {
throw new HazelcastInstanceNotActiveException();
}
}
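/**
* Returns the ExecutionContext for the given executionId after verifying
* that the caller is the current master and the context's original
* coordinator; throws otherwise.
*/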
@Nonnull
public ExecutionContext assertExecutionContext(Address callerAddress, long jobId, long executionId,
String callerOpName) {
Address masterAddress = nodeEngine.getMasterAddress();
if (!callerAddress.equals(masterAddress)) {
failIfNotRunning();
throw new IllegalStateException(String.format(
"Caller %s cannot do '%s' for %s: it is not the master, the master is %s",
callerAddress, callerOpName, jobIdAndExecutionId(jobId, executionId), masterAddress));
}
failIfNotRunning();
ExecutionContext executionContext = executionContexts.get(executionId);
if (executionContext == null) {
throw new ExecutionNotFoundException(String.format(
"%s not found for coordinator %s for '%s'",
jobIdAndExecutionId(jobId, executionId), callerAddress, callerOpName));
} else if (!(executionContext.coordinator().equals(callerAddress) && executionContext.jobId() == jobId)) {
throw new IllegalStateException(String.format(
"%s, originally from coordinator %s, cannot do '%s' by coordinator %s and execution %s",
executionContext.jobNameAndExecutionId(), executionContext.coordinator(),
callerOpName, callerAddress, idToString(executionId)));
}
return executionContext;
}
/**
* Completes and cleans up execution of the given job
*/
public CompletableFuture<Void> completeExecution(@Nonnull ExecutionContext executionContext, Throwable error) {
ExecutionContext removed = executionContexts.remove(executionContext.executionId());
if (removed != null) {
if (error != null) {
failedJobs.put(executionContext.executionId(), System.nanoTime() + FAILED_EXECUTION_EXPIRY_NS);
}
JetDelegatingClassLoader jobClassLoader = jobClassloaderService.getClassLoader(executionContext.jobId());
return doWithClassLoader(jobClassLoader, () -> executionContext.completeExecution(error))
.whenComplete(withTryCatch(logger, (ignored, t) -> {
if (!executionContext.isLightJob()) {
jobClassloaderService.tryRemoveClassloadersForJob(executionContext.jobId(), EXECUTION);
}
executionCompleted.inc();
executionContextJobIds.remove(executionContext.jobId());
logger.fine("Completed execution of " + executionContext.jobNameAndExecutionId());
}));
} else {
return completedFuture(null);
}
}
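/**
* Stores the latest metrics snapshot for the given execution, if it still
* exists locally.
*/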
public void updateMetrics(@Nonnull Long executionId, RawJobMetrics metrics) {
ExecutionContext executionContext = executionContexts.get(executionId);
if (executionContext != null) {
executionContext.setMetrics(metrics);
}
}
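/**
* Starts the execution of an initialized non-light job on behalf of the
* given coordinator. The returned future completes when the local
* execution completes, with the job's terminal metrics if requested.
*/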
public CompletableFuture<RawJobMetrics> beginExecution(
Address coordinator,
long jobId,
long executionId,
boolean collectMetrics
) {
ExecutionContext execCtx = assertExecutionContext(coordinator, jobId, executionId, "StartExecutionOperation");
assert !execCtx.isLightJob() : "StartExecutionOperation received for a light job " + idToString(jobId);
logger.info("Start execution of " + execCtx.jobNameAndExecutionId() + " from coordinator " + coordinator);
return beginExecution0(execCtx, collectMetrics);
}
public CompletableFuture<RawJobMetrics> beginExecution0(ExecutionContext execCtx, boolean collectMetrics) {
executionStarted.inc();
return execCtx.beginExecution(taskletExecutionService)
.thenApply(r -> {
RawJobMetrics terminalMetrics;
if (collectMetrics) {
try (
var metricsRenderer = new JobMetricsCollector(nodeEngine.getLocalMember(), logger)
) {
nodeEngine.getMetricsRegistry().collectDynamicMetrics(metricsRenderer, singleton(execCtx));
terminalMetrics = metricsRenderer.getMetrics();
}
} else {
terminalMetrics = null;
}
return terminalMetrics;
})
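// Complete (and clean up) the execution regardless of the outcome; if the
// execution failed, rethrow the original error after the cleanup completes.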
.handleAsync((metrics, e) -> completeExecution(execCtx, peel(e))
.thenApply(ignored -> {
if (e == null) {
return metrics;
}
throw sneakyThrow(e);
})
)
.thenCompose(stage -> stage)
.whenComplete((metrics, e) -> {
if (ExceptionUtil.isOrHasCause(e, CancellationException.class)) {
logger.fine("Execution of " + execCtx.jobNameAndExecutionId() + " was cancelled");
} else if (e != null) {
logger.fine("Execution of " + execCtx.jobNameAndExecutionId()
+ " completed with failure", e);
} else {
logger.fine("Execution of " + execCtx.jobNameAndExecutionId() + " completed");
}
});
}
@Override
public void provideDynamicMetrics(MetricDescriptor descriptor, MetricsCollectionContext context) {
try {
descriptor.withTag(MetricTags.MODULE, "jet");
executionContexts.forEach((id, ctx) ->
ctx.provideDynamicMetrics(descriptor.copy(), context));
} catch (Throwable t) {
logger.warning("Dynamic metric collection failed", t);
throw t;
}
}
/**
* See also javadoc at {@link CheckLightJobsOperation}.
*/
@SuppressWarnings("rawtypes")
private void checkExecutions() {
try {
long now = System.nanoTime();
long uninitializedContextThreshold = now - UNINITIALIZED_CONTEXT_MAX_AGE_NS;
Map<Address, List<Long>> executionsPerMember = new HashMap<>();
List<CompletableFuture<?>> terminateFutures = new ArrayList<>();
for (ExecutionContext ctx : executionContexts.values()) {
if (!ctx.isLightJob()) {
continue;
}
Address coordinator = ctx.coordinator();
if (coordinator != null) {
// if coordinator is known, add execution to the list to check
executionsPerMember
.computeIfAbsent(coordinator, k -> new ArrayList<>())
.add(ctx.executionId());
} else {
// if coordinator is not known, remove execution if it's not known for too long
if (ctx.getCreatedOn() <= uninitializedContextThreshold) {
logger.fine("Terminating light job %s because it wasn't initialized during %d seconds",
idToString(ctx.executionId()), NANOSECONDS.toSeconds(UNINITIALIZED_CONTEXT_MAX_AGE_NS));
terminateFutures.add(terminateExecution0(ctx, CANCEL_FORCEFUL, new CancellationException()));
}
}
}
if (!terminateFutures.isEmpty()) {
CompletableFuture.allOf(terminateFutures.toArray(EMPTY_COMPLETABLE_FUTURE_ARRAY)).join();
}
// submit the query to the coordinator
for (Entry<Address, List<Long>> en : executionsPerMember.entrySet()) {
long[] executionIds = en.getValue().stream().mapToLong(Long::longValue).toArray();
Operation op = new CheckLightJobsOperation(executionIds);
InvocationFuture<long[]> future = nodeEngine.getOperationService()
.createInvocationBuilder(SERVICE_NAME, op, en.getKey())
.invoke();
future.whenComplete((r, t) -> {
if (isOrHasCause(t, TargetNotMemberException.class)) {
// if the target isn't a member, then all executions are unknown
r = executionIds;
} else if (t != null) {
logger.warning("Failed to check light job state with coordinator " + en.getKey() + ": " + t, t);
return;
}
assert r != null;
for (long executionId : r) {
ExecutionContext execCtx = executionContexts.get(executionId);
if (execCtx != null) {
logger.fine("Terminating light job " + idToString(executionId)
+ " because the coordinator doesn't know it");
terminateExecution0(execCtx, CANCEL_FORCEFUL, new CancellationException());
}
}
});
}
// clean up failedJobs
failedJobs.values().removeIf(expiryTime -> expiryTime < now);
} catch (Throwable e) {
logger.severe("Failed to query live light executions: " + e, e);
}
}
public CompletableFuture<Void> terminateExecution(long jobId, long executionId, Address callerAddress, TerminationMode mode) {
failIfNotRunning();
ExecutionContext executionContext = executionContexts.get(executionId);
if (executionContext == null) {
// If this happens after the execution terminated locally, ignore.
// If this happens before the execution was initialized locally, that means it's a light
// job. We ignore too and rely on the CheckLightJobsOperation.
return completedFuture(null);
}
if (!executionContext.isLightJob()) {
Address masterAddress = nodeEngine.getMasterAddress();
if (!callerAddress.equals(masterAddress)) {
failIfNotRunning();
throw new IllegalStateException(String.format(
"Caller %s cannot do 'terminateExecution' for %s: it is not the master, the master is %s",
callerAddress, jobIdAndExecutionId(jobId, executionId), masterAddress));
}
}
Address coordinator = executionContext.coordinator();
if (coordinator == null) {
// This can happen if ExecutionContext was created after a received data packet,
// either before the initialization or after a completion.
// The TerminateOp is always sent after InitOp on coordinator, but it can happen that it's handled
// first on the target member.
// We ignore this and rely on the CheckLightJobsOperation to clean up.
// It can't happen for normal jobs
assert executionContext.isLightJob() : "null coordinator for non-light job";
} else if (!coordinator.equals(callerAddress)) {
throw new IllegalStateException(String.format(
"%s, originally from coordinator %s, cannot do 'terminateExecution' by coordinator %s and execution %s",
executionContext.jobNameAndExecutionId(), coordinator, callerAddress, idToString(executionId)));
}
Exception cause = mode == null ? new CancellationException() : new JobTerminateRequestedException(mode);
return terminateExecution0(executionContext, mode, cause);
}
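/**
* Requests termination of the given execution. If the execution hasn't
* started yet, completes and cleans it up immediately; otherwise the
* cleanup happens when the tasklets complete.
*/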
public CompletableFuture<Void> terminateExecution0(ExecutionContext executionContext, TerminationMode mode, Throwable cause) {
if (!executionContext.terminateExecution(mode, cause)) {
// If the execution was terminated before it began, call completeExecution now.
// Otherwise, if the execution was already begun, this method will be called when the tasklets complete.
logger.fine(executionContext.jobNameAndExecutionId()
+ " calling completeExecution because execution terminated before it started");
return completeExecution(executionContext, cause);
}
return completedFuture(null);
}
// for test
public void waitAllExecutionsTerminated() {
for (ExecutionContext ctx : executionContexts.values()) {
try {
ctx.getExecutionFuture().join();
} catch (Throwable ignored) {
}
}
}
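/**
* A MetricsCollector that prefixes each descriptor with the local member
* tag and compresses the collected metrics into a single blob.
*/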
private static class JobMetricsCollector implements MetricsCollector, AutoCloseable {
private final MetricsCompressor compressor;
private final ILogger logger;
private final UnaryOperator<MetricDescriptor> addPrefixFn;
JobMetricsCollector(@Nonnull Member member, @Nonnull ILogger logger) {
Objects.requireNonNull(member, "member");
this.logger = Objects.requireNonNull(logger, "logger");
this.addPrefixFn = JobMetricsUtil.addMemberPrefixFn(member);
this.compressor = new MetricsCompressor();
}
@Override
public void collectLong(MetricDescriptor descriptor, long value) {
compressor.addLong(addPrefixFn.apply(descriptor), value);
}
@Override
public void collectDouble(MetricDescriptor descriptor, double value) {
compressor.addDouble(addPrefixFn.apply(descriptor), value);
}
@Override
public void collectException(MetricDescriptor descriptor, Exception e) {
logger.warning("Exception when rendering job metrics: " + e, e);
}
@Override
public void collectNoValue(MetricDescriptor descriptor) { }
@Nonnull
public RawJobMetrics getMetrics() {
return RawJobMetrics.of(compressor.getBlobAndClose());
}
@Override
public void close() {
compressor.close();
}
}
}