All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hazelcast.jet.impl.JobExecutionService Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.impl;

import com.hazelcast.cluster.Address;
import com.hazelcast.cluster.Member;
import com.hazelcast.core.HazelcastInstanceNotActiveException;
import com.hazelcast.core.MemberLeftException;
import com.hazelcast.internal.cluster.MemberInfo;
import com.hazelcast.internal.cluster.impl.ClusterServiceImpl;
import com.hazelcast.internal.cluster.impl.MembershipManager;
import com.hazelcast.internal.cluster.impl.operations.TriggerMemberListPublishOp;
import com.hazelcast.internal.metrics.DynamicMetricsProvider;
import com.hazelcast.internal.metrics.MetricDescriptor;
import com.hazelcast.internal.metrics.MetricsCollectionContext;
import com.hazelcast.internal.metrics.MetricsRegistry;
import com.hazelcast.internal.metrics.Probe;
import com.hazelcast.internal.metrics.collectors.MetricsCollector;
import com.hazelcast.internal.metrics.impl.MetricsCompressor;
import com.hazelcast.internal.util.counters.Counter;
import com.hazelcast.internal.util.counters.MwCounter;
import com.hazelcast.jet.Util;
import com.hazelcast.jet.core.TopologyChangedException;
import com.hazelcast.jet.core.metrics.MetricNames;
import com.hazelcast.jet.core.metrics.MetricTags;
import com.hazelcast.jet.impl.deployment.JetDelegatingClassLoader;
import com.hazelcast.jet.impl.exception.ExecutionNotFoundException;
import com.hazelcast.jet.impl.exception.JobTerminateRequestedException;
import com.hazelcast.jet.impl.execution.ExecutionContext;
import com.hazelcast.jet.impl.execution.ExecutionContext.SenderReceiverKey;
import com.hazelcast.jet.impl.execution.SenderTasklet;
import com.hazelcast.jet.impl.execution.TaskletExecutionService;
import com.hazelcast.jet.impl.execution.init.ExecutionPlan;
import com.hazelcast.jet.impl.metrics.RawJobMetrics;
import com.hazelcast.jet.impl.operation.CheckLightJobsOperation;
import com.hazelcast.jet.impl.util.ExceptionUtil;
import com.hazelcast.logging.ILogger;
import com.hazelcast.spi.exception.RetryableHazelcastException;
import com.hazelcast.spi.exception.TargetNotMemberException;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.operationservice.Operation;
import com.hazelcast.spi.impl.operationservice.impl.InvocationFuture;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ScheduledFuture;
import java.util.function.Function;
import java.util.function.UnaryOperator;

import static com.hazelcast.internal.util.ExceptionUtil.sneakyThrow;
import static com.hazelcast.internal.util.ExceptionUtil.withTryCatch;
import static com.hazelcast.jet.Util.idToString;
import static com.hazelcast.jet.impl.JetServiceBackend.SERVICE_NAME;
import static com.hazelcast.jet.impl.JobClassLoaderService.JobPhase.EXECUTION;
import static com.hazelcast.jet.impl.TerminationMode.CANCEL_FORCEFUL;
import static com.hazelcast.jet.impl.util.ExceptionUtil.isOrHasCause;
import static com.hazelcast.jet.impl.util.ExceptionUtil.peel;
import static com.hazelcast.jet.impl.util.Util.doWithClassLoader;
import static com.hazelcast.jet.impl.util.Util.jobIdAndExecutionId;
import static java.util.Collections.newSetFromMap;
import static java.util.Collections.singleton;
import static java.util.concurrent.CompletableFuture.completedFuture;
import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static java.util.stream.Collectors.toSet;

/**
 * Service to handle ExecutionContexts on all cluster members. Job-control
 * operations from coordinator are handled here.
 */
public class JobExecutionService implements DynamicMetricsProvider {

    /**
     * A timeout after which we cancel a light job that doesn't receive InitOp
     * from the coordinator. {@link ExecutionContext} can be created in
     * response to data packet received for that execution, but it doesn't know
     * the coordinator. Therefore, the checker cannot confirm with the
     * coordinator if it still exists. We terminate these jobs after a timeout.
     * However, the timeout has to be long enough because if the job happens to
     * be initialized later, we'll lose data, and we won't even detect it. It can
     * also happen that we lose a DONE_ITEM and the job will get stuck, though
     * that's better than incorrect results.
     */
    private static final long UNINITIALIZED_CONTEXT_MAX_AGE_NS = MINUTES.toNanos(5);

    private static final long FAILED_EXECUTION_EXPIRY_NS = SECONDS.toNanos(5);
    private static final CompletableFuture[] EMPTY_COMPLETABLE_FUTURE_ARRAY = new CompletableFuture[0];

    private final Object mutex = new Object();

    private final NodeEngineImpl nodeEngine;
    private final ILogger logger;
    private final TaskletExecutionService taskletExecutionService;
    private final JobClassLoaderService jobClassloaderService;

    private final Set executionContextJobIds = newSetFromMap(new ConcurrentHashMap<>());

    // key: executionId
    private final ConcurrentMap executionContexts = new ConcurrentHashMap<>();

    /**
     * Key: executionId
     * Value: expiry time (as per System.nanoTime())
     * 

* This map contains executions, that failed or were cancelled. * These executions are very likely to receive further data packets * from other members whose executions are concurrently cancelled * too. If we keep no track of these exceptions, in failure-heavy or * cancellation-heavy scenarios a significant amount of memory could * be held for time defined in {@link * #UNINITIALIZED_CONTEXT_MAX_AGE_NS}, see * issue #19897. */ private final ConcurrentMap failedJobs = new ConcurrentHashMap<>(); @Probe(name = MetricNames.JOB_EXECUTIONS_STARTED) private final Counter executionStarted = MwCounter.newMwCounter(); @Probe(name = MetricNames.JOB_EXECUTIONS_COMPLETED) private final Counter executionCompleted = MwCounter.newMwCounter(); private final Function newLightJobExecutionContextFunction; private final ScheduledFuture lightExecutionsCheckerFuture; JobExecutionService(NodeEngineImpl nodeEngine, TaskletExecutionService taskletExecutionService, JobClassLoaderService jobClassloaderService) { this.nodeEngine = nodeEngine; this.logger = nodeEngine.getLogger(getClass()); this.taskletExecutionService = taskletExecutionService; this.jobClassloaderService = jobClassloaderService; newLightJobExecutionContextFunction = execId -> failedJobs.containsKey(execId) ? null : new ExecutionContext(nodeEngine, execId, execId, true); // register metrics MetricsRegistry registry = nodeEngine.getMetricsRegistry(); MetricDescriptor descriptor = registry.newMetricDescriptor() .withTag(MetricTags.MODULE, "jet"); registry.registerStaticMetrics(descriptor, this); this.lightExecutionsCheckerFuture = nodeEngine.getExecutionService().scheduleWithRepetition( this::checkExecutions, 0, 1, SECONDS); } public Long getExecutionIdForJobId(long jobId) { return executionContexts.values().stream() .filter(ec -> ec.jobId() == jobId) .findAny() .map(ExecutionContext::executionId) .orElse(null); } public ExecutionContext getExecutionContext(long executionId) { return executionContexts.get(executionId); } /** * Gets the execution context or creates it, if it doesn't exist. If * we're creating it, we assume it's for a light job and that the * jobId == executionId. Might return null if the job with the given * ID recently failed. *

* We can also end up here for a non-light job in this scenario:

    *
  • job runs on 2 members. The master requests termination. *
  • execution on member A terminates and is removed from * executionContexts *
  • member A receives a packet from member B (because it was in transit * or simply because the execution on member B might terminate a little * later) *
  • ExecutionContext is recreated. *
* * We ignore this as we assume that we'll never receive the * StartExecutionOperation. The improperly-created ExecutionContext will be * removed after a timeout in {@link #checkExecutions()} because it * will never be initialized. *

* We mitigate the number of execution context created after a job * failed by checking the {@link #failedJobs} map before re-creating * the execution context in this method. */ @Nullable public ExecutionContext getOrCreateExecutionContext(long executionId) { return executionContexts.computeIfAbsent(executionId, newLightJobExecutionContextFunction); } public Collection getExecutionContexts() { return executionContexts.values(); } public ConcurrentMap getFailedJobs() { return failedJobs; } Map getSenderMap(long executionId) { ExecutionContext ctx = executionContexts.get(executionId); return ctx != null ? ctx.senderMap() : null; } public void shutdown() { lightExecutionsCheckerFuture.cancel(false); synchronized (mutex) { cancelAllExecutions("Node is shutting down"); } } public void reset() { cancelAllExecutions("reset"); } /** * Cancels all ongoing executions using the given failure supplier. */ @SuppressWarnings("rawtypes") public void cancelAllExecutions(String reason) { // The ConcurrentHashMap.values() is a projection of underlying data in the map. If other thread mutates the map the // collection returned by values() mutates as well. That's the reason why we use ArrayList here instead of an array, the // count of items may change. Collection contexts = executionContexts.values(); List futures = new ArrayList<>(contexts.size()); for (ExecutionContext exeCtx : contexts) { logger.fine("Completing %s locally. Reason: %s", exeCtx.jobNameAndExecutionId(), reason); futures.add(terminateExecution0(exeCtx, null, new CancellationException())); } CompletableFuture.allOf(futures.toArray(EMPTY_COMPLETABLE_FUTURE_ARRAY)).join(); } /** * Cancels executions that contain the leaving address as the coordinator or a * job participant */ @SuppressWarnings("rawtypes") void onMemberRemoved(Member member) { Address address = member.getAddress(); CompletableFuture[] terminationFutures = executionContexts.values().stream() // note that coordinator might not be a participant // (in case it is a lite member) .filter(exeCtx -> exeCtx.coordinator() != null && (exeCtx.coordinator().equals(address) || exeCtx.hasParticipant(address))) .map(exeCtx -> { logger.fine("Completing %s locally. Reason: Member %s left the cluster", exeCtx.jobNameAndExecutionId(), address); return terminateExecution0(exeCtx, null, new MemberLeftException(member)); }) .toArray(CompletableFuture[]::new); CompletableFuture.allOf(terminationFutures).join(); } public CompletableFuture runLightJob( long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion, Set participants, ExecutionPlan plan ) { assert executionId == jobId : "executionId(" + idToString(executionId) + ") != jobId(" + idToString(jobId) + ")"; verifyClusterInformation(jobId, executionId, coordinator, coordinatorMemberListVersion, participants); failIfNotRunning(); ExecutionContext execCtx; synchronized (mutex) { addExecutionContextJobId(jobId, executionId, coordinator); execCtx = executionContexts.computeIfAbsent(executionId, x -> new ExecutionContext(nodeEngine, jobId, executionId, true)); } Set

addresses = participants.stream().map(MemberInfo::getAddress).collect(toSet()); return execCtx.initialize(coordinator, addresses, plan) .whenComplete((r, e) -> { if (e != null) { completeExecution(execCtx, new CancellationException()).join(); } }) .thenAccept(r -> { // initial log entry with all of jobId, jobName, executionId if (logger.isFineEnabled()) { logger.fine("Execution plan for light job ID=" + idToString(jobId) + ", jobName=" + (execCtx.jobName() != null ? '\'' + execCtx.jobName() + '\'' : "null") + ", executionId=" + idToString(executionId) + " initialized, will start the execution"); } }) .thenCompose(r -> beginExecution0(execCtx, false)); } /** * Initiates the given execution if the local node accepts the coordinator * as its master, and has an up-to-date member list information. *
  • * If the local node has a stale member list, it retries the init operation * until it receives the new member list from the master. *
  • * If the local node detects that the member list changed after the init * operation is sent but before executed, then it sends a graceful failure * so that the job init will be retried properly. *
  • * If there is an already ongoing execution for the given job, then the * init execution is retried. *
*/ public CompletableFuture initExecution( long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion, Set participants, ExecutionPlan plan ) { ExecutionContext execCtx = addExecutionContext( jobId, executionId, coordinator, coordinatorMemberListVersion, participants); Set
addresses = participants.stream().map(MemberInfo::getAddress).collect(toSet()); ClassLoader jobCl = jobClassloaderService.getClassLoader(jobId); return doWithClassLoader(jobCl, () -> execCtx.initialize(coordinator, addresses, plan)) .thenAccept(r -> { // initial log entry with all of jobId, jobName, executionId logger.info("Execution plan for jobId=" + idToString(jobId) + ", jobName=" + (execCtx.jobName() != null ? '\'' + execCtx.jobName() + '\'' : "null") + ", executionId=" + idToString(executionId) + " initialized"); }); } private void addExecutionContextJobId(long jobId, long executionId, Address coordinator) { if (!executionContextJobIds.add(jobId)) { ExecutionContext current = executionContexts.get(executionId); if (current != null) { throw new IllegalStateException(String.format( "Execution context for %s for coordinator %s already exists for coordinator %s", current.jobNameAndExecutionId(), coordinator, current.coordinator())); } // search contexts for one with different executionId, but same jobId if (logger.isFineEnabled()) { executionContexts.values().stream() .filter(e -> e.jobId() == jobId) .forEach(e -> logger.fine(String.format( "Execution context for job %s for coordinator %s already exists" + " with local execution %s for coordinator %s", idToString(jobId), coordinator, idToString(e.executionId()), e.coordinator()))); } throw new RetryableHazelcastException(); } } private ExecutionContext addExecutionContext( long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion, Set participants ) { ExecutionContext execCtx; ExecutionContext oldContext; try { assertIsMaster(jobId, executionId, coordinator); verifyClusterInformation(jobId, executionId, coordinator, coordinatorMemberListVersion, participants); failIfNotRunning(); synchronized (mutex) { addExecutionContextJobId(jobId, executionId, coordinator); execCtx = new ExecutionContext(nodeEngine, jobId, executionId, false); oldContext = executionContexts.put(executionId, execCtx); } } catch (Throwable t) { // The classloader was created in InitExecutionOperation#deserializePlan(). // If the InitExecutionOperation#doRun() fails before ExecutionContext is added // to executionContexts, then classloader must be removed in order to not have leaks. jobClassloaderService.tryRemoveClassloadersForJob(jobId, EXECUTION); throw t; } if (oldContext != null) { throw new RuntimeException("Duplicate ExecutionContext for execution " + Util.idToString(executionId)); } return execCtx; } private void assertIsMaster(long jobId, long executionId, Address coordinator) { Address masterAddress = nodeEngine.getMasterAddress(); if (!coordinator.equals(masterAddress)) { failIfNotRunning(); throw new IllegalStateException(String.format( "Coordinator %s cannot initialize %s. Reason: it is not the master, the master is %s", coordinator, jobIdAndExecutionId(jobId, executionId), masterAddress)); } } private void verifyClusterInformation(long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion, Set participants) { Address masterAddress = nodeEngine.getMasterAddress(); ClusterServiceImpl clusterService = (ClusterServiceImpl) nodeEngine.getClusterService(); MembershipManager membershipManager = clusterService.getMembershipManager(); int localMemberListVersion = membershipManager.getMemberListVersion(); Address thisAddress = nodeEngine.getThisAddress(); if (coordinatorMemberListVersion > localMemberListVersion) { if (masterAddress == null) { // we expect that master will eventually be known to this member (a new master will be // elected or split brain merge will happen). throw new RetryableHazelcastException(String.format( "Cannot initialize %s for coordinator %s, local member list version %s," + " coordinator member list version %s. And also, since the master address" + " is not known to this member, cannot request a new member list from master.", jobIdAndExecutionId(jobId, executionId), coordinator, localMemberListVersion, coordinatorMemberListVersion)); } assert !masterAddress.equals(thisAddress) : String.format( "Local node: %s is master but InitOperation has coordinator member list version: %s larger than " + " local member list version: %s", thisAddress, coordinatorMemberListVersion, localMemberListVersion); nodeEngine.getOperationService().send(new TriggerMemberListPublishOp(), masterAddress); throw new RetryableHazelcastException(String.format( "Cannot initialize %s for coordinator %s, local member list version %s," + " coordinator member list version %s", jobIdAndExecutionId(jobId, executionId), coordinator, localMemberListVersion, coordinatorMemberListVersion)); } // If the participant members can receive the new member list before the // coordinator, and we can also get into the // "coordinatorMemberListVersion < localMemberListVersion" case. If this // situation occurs when a job participant leaves, then the job start will // fail. Since the unknown participating member situation couldn't // be resolved with retrying the InitExecutionOperation for this // case, we do nothing here and let it fail below if some participant // isn't found. // The job start won't fail if this situation occurs when a new member // is added to the cluster, because all job participants are known to the // other participating members. The only disadvantage of this is that a // newly added member will not be a job participant and partition mapping // may not be completely proper in this case. boolean isLocalMemberParticipant = false; for (MemberInfo participant : participants) { if (participant.getAddress().equals(thisAddress)) { isLocalMemberParticipant = true; } if (membershipManager.getMember(participant.getAddress(), participant.getUuid()) == null) { throw new TopologyChangedException(String.format( "Cannot initialize %s for coordinator %s: participant %s not found in local member list." + " Local member list version: %s, coordinator member list version: %s", jobIdAndExecutionId(jobId, executionId), coordinator, participant, localMemberListVersion, coordinatorMemberListVersion)); } } if (!isLocalMemberParticipant) { throw new IllegalArgumentException(String.format( "Cannot initialize %s since member %s is not in participants: %s", jobIdAndExecutionId(jobId, executionId), thisAddress, participants)); } } private void failIfNotRunning() { if (!nodeEngine.isRunning()) { throw new HazelcastInstanceNotActiveException(); } } @Nonnull public ExecutionContext assertExecutionContext(Address callerAddress, long jobId, long executionId, String callerOpName) { Address masterAddress = nodeEngine.getMasterAddress(); if (!callerAddress.equals(masterAddress)) { failIfNotRunning(); throw new IllegalStateException(String.format( "Caller %s cannot do '%s' for %s: it is not the master, the master is %s", callerAddress, callerOpName, jobIdAndExecutionId(jobId, executionId), masterAddress)); } failIfNotRunning(); ExecutionContext executionContext = executionContexts.get(executionId); if (executionContext == null) { throw new ExecutionNotFoundException(String.format( "%s not found for coordinator %s for '%s'", jobIdAndExecutionId(jobId, executionId), callerAddress, callerOpName)); } else if (!(executionContext.coordinator().equals(callerAddress) && executionContext.jobId() == jobId)) { throw new IllegalStateException(String.format( "%s, originally from coordinator %s, cannot do '%s' by coordinator %s and execution %s", executionContext.jobNameAndExecutionId(), executionContext.coordinator(), callerOpName, callerAddress, idToString(executionId))); } return executionContext; } /** * Completes and cleans up execution of the given job */ public CompletableFuture completeExecution(@Nonnull ExecutionContext executionContext, Throwable error) { ExecutionContext removed = executionContexts.remove(executionContext.executionId()); if (removed != null) { if (error != null) { failedJobs.put(executionContext.executionId(), System.nanoTime() + FAILED_EXECUTION_EXPIRY_NS); } JetDelegatingClassLoader jobClassLoader = jobClassloaderService.getClassLoader(executionContext.jobId()); return doWithClassLoader(jobClassLoader, () -> executionContext.completeExecution(error)) .whenComplete(withTryCatch(logger, (ignored, t) -> { if (!executionContext.isLightJob()) { jobClassloaderService.tryRemoveClassloadersForJob(executionContext.jobId(), EXECUTION); } executionCompleted.inc(); executionContextJobIds.remove(executionContext.jobId()); logger.fine("Completed execution of " + executionContext.jobNameAndExecutionId()); })); } else { return completedFuture(null); } } public void updateMetrics(@Nonnull Long executionId, RawJobMetrics metrics) { ExecutionContext executionContext = executionContexts.get(executionId); if (executionContext != null) { executionContext.setMetrics(metrics); } } public CompletableFuture beginExecution( Address coordinator, long jobId, long executionId, boolean collectMetrics ) { ExecutionContext execCtx = assertExecutionContext(coordinator, jobId, executionId, "StartExecutionOperation"); assert !execCtx.isLightJob() : "StartExecutionOperation received for a light job " + idToString(jobId); logger.info("Start execution of " + execCtx.jobNameAndExecutionId() + " from coordinator " + coordinator); return beginExecution0(execCtx, collectMetrics); } public CompletableFuture beginExecution0(ExecutionContext execCtx, boolean collectMetrics) { executionStarted.inc(); return execCtx.beginExecution(taskletExecutionService) .thenApply(r -> { RawJobMetrics terminalMetrics; if (collectMetrics) { try ( var metricsRenderer = new JobMetricsCollector(nodeEngine.getLocalMember(), logger) ) { nodeEngine.getMetricsRegistry().collectDynamicMetrics(metricsRenderer, singleton(execCtx)); terminalMetrics = metricsRenderer.getMetrics(); } } else { terminalMetrics = null; } return terminalMetrics; }) .handleAsync((metrics, e) -> completeExecution(execCtx, peel(e)) .thenApply(ignored -> { if (e == null) { return metrics; } throw sneakyThrow(e); }) ) .thenCompose(stage -> stage) .whenComplete((metrics, e) -> { if (ExceptionUtil.isOrHasCause(e, CancellationException.class)) { logger.fine("Execution of " + execCtx.jobNameAndExecutionId() + " was cancelled"); } else if (e != null) { logger.fine("Execution of " + execCtx.jobNameAndExecutionId() + " completed with failure", e); } else { logger.fine("Execution of " + execCtx.jobNameAndExecutionId() + " completed"); } }); } @Override public void provideDynamicMetrics(MetricDescriptor descriptor, MetricsCollectionContext context) { try { descriptor.withTag(MetricTags.MODULE, "jet"); executionContexts.forEach((id, ctx) -> ctx.provideDynamicMetrics(descriptor.copy(), context)); } catch (Throwable t) { logger.warning("Dynamic metric collection failed", t); throw t; } } /** * See also javadoc at {@link CheckLightJobsOperation}. */ @SuppressWarnings("rawtypes") private void checkExecutions() { try { long now = System.nanoTime(); long uninitializedContextThreshold = now - UNINITIALIZED_CONTEXT_MAX_AGE_NS; Map> executionsPerMember = new HashMap<>(); List terminateFutures = new ArrayList<>(); for (ExecutionContext ctx : executionContexts.values()) { if (!ctx.isLightJob()) { continue; } Address coordinator = ctx.coordinator(); if (coordinator != null) { // if coordinator is known, add execution to the list to check executionsPerMember .computeIfAbsent(coordinator, k -> new ArrayList<>()) .add(ctx.executionId()); } else { // if coordinator is not known, remove execution if it's not known for too long if (ctx.getCreatedOn() <= uninitializedContextThreshold) { logger.fine("Terminating light job %s because it wasn't initialized during %d seconds", idToString(ctx.executionId()), NANOSECONDS.toSeconds(UNINITIALIZED_CONTEXT_MAX_AGE_NS)); terminateFutures.add(terminateExecution0(ctx, CANCEL_FORCEFUL, new CancellationException())); } } } if (!terminateFutures.isEmpty()) { CompletableFuture.allOf(terminateFutures.toArray(EMPTY_COMPLETABLE_FUTURE_ARRAY)).join(); } // submit the query to the coordinator for (Entry> en : executionsPerMember.entrySet()) { long[] executionIds = en.getValue().stream().mapToLong(Long::longValue).toArray(); Operation op = new CheckLightJobsOperation(executionIds); InvocationFuture future = nodeEngine.getOperationService() .createInvocationBuilder(SERVICE_NAME, op, en.getKey()) .invoke(); future.whenComplete((r, t) -> { if (isOrHasCause(t, TargetNotMemberException.class)) { // if the target isn't a member, then all executions are unknown r = executionIds; } else if (t != null) { logger.warning("Failed to check light job state with coordinator " + en.getKey() + ": " + t, t); return; } assert r != null; for (long executionId : r) { ExecutionContext execCtx = executionContexts.get(executionId); if (execCtx != null) { logger.fine("Terminating light job " + idToString(executionId) + " because the coordinator doesn't know it"); terminateExecution0(execCtx, CANCEL_FORCEFUL, new CancellationException()); } } }); } // clean up failedJobs failedJobs.values().removeIf(expiryTime -> expiryTime < now); } catch (Throwable e) { logger.severe("Failed to query live light executions: " + e, e); } } public CompletableFuture terminateExecution(long jobId, long executionId, Address callerAddress, TerminationMode mode) { failIfNotRunning(); ExecutionContext executionContext = executionContexts.get(executionId); if (executionContext == null) { // If this happens after the execution terminated locally, ignore. // If this happens before the execution was initialized locally, that means it's a light // job. We ignore too and rely on the CheckLightJobsOperation. return completedFuture(null); } if (!executionContext.isLightJob()) { Address masterAddress = nodeEngine.getMasterAddress(); if (!callerAddress.equals(masterAddress)) { failIfNotRunning(); throw new IllegalStateException(String.format( "Caller %s cannot do '%s' for terminateExecution: it is not the master, the master is %s", callerAddress, jobIdAndExecutionId(jobId, executionId), masterAddress)); } } Address coordinator = executionContext.coordinator(); if (coordinator == null) { // This can happen if ExecutionContext was created after a received data packet, // either before the initialization or after a completion. // The TerminateOp is always sent after InitOp on coordinator, but it can happen that it's handled // first on the target member. // We ignore this and rely on the CheckLightJobsOperation to clean up. // It can't happen for normal jobs assert executionContext.isLightJob() : "null coordinator for non-light job"; } else if (!coordinator.equals(callerAddress)) { throw new IllegalStateException(String.format( "%s, originally from coordinator %s, cannot do 'terminateExecution' by coordinator %s and execution %s", executionContext.jobNameAndExecutionId(), coordinator, callerAddress, idToString(executionId))); } Exception cause = mode == null ? new CancellationException() : new JobTerminateRequestedException(mode); return terminateExecution0(executionContext, mode, cause); } public CompletableFuture terminateExecution0(ExecutionContext executionContext, TerminationMode mode, Throwable cause) { if (!executionContext.terminateExecution(mode, cause)) { // If the execution was terminated before it began, call completeExecution now. // Otherwise, if the execution was already begun, this method will be called when the tasklets complete. logger.fine(executionContext.jobNameAndExecutionId() + " calling completeExecution because execution terminated before it started"); return completeExecution(executionContext, cause); } return completedFuture(null); } // for test public void waitAllExecutionsTerminated() { for (ExecutionContext ctx : executionContexts.values()) { try { ctx.getExecutionFuture().join(); } catch (Throwable ignored) { } } } private static class JobMetricsCollector implements MetricsCollector, AutoCloseable { private final MetricsCompressor compressor; private final ILogger logger; private final UnaryOperator addPrefixFn; JobMetricsCollector(@Nonnull Member member, @Nonnull ILogger logger) { Objects.requireNonNull(member, "member"); this.logger = Objects.requireNonNull(logger, "logger"); this.addPrefixFn = JobMetricsUtil.addMemberPrefixFn(member); this.compressor = new MetricsCompressor(); } @Override public void collectLong(MetricDescriptor descriptor, long value) { compressor.addLong(addPrefixFn.apply(descriptor), value); } @Override public void collectDouble(MetricDescriptor descriptor, double value) { compressor.addDouble(addPrefixFn.apply(descriptor), value); } @Override public void collectException(MetricDescriptor descriptor, Exception e) { logger.warning("Exception when rendering job metrics: " + e, e); } @Override public void collectNoValue(MetricDescriptor descriptor) { } @Nonnull public RawJobMetrics getMetrics() { return RawJobMetrics.of(compressor.getBlobAndClose()); } @Override public void close() { compressor.close(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy