// Source: com.hazelcast.jet.impl.execution.ExecutionContext
// (file recovered from a Maven/Gradle/Ivy artifact listing; generic type
// parameters below were restored after being stripped by the HTML extraction)
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.impl.execution;
import com.hazelcast.cluster.Address;
import com.hazelcast.internal.metrics.DynamicMetricsProvider;
import com.hazelcast.internal.metrics.MetricDescriptor;
import com.hazelcast.internal.metrics.MetricsCollectionContext;
import com.hazelcast.internal.metrics.ProbeLevel;
import com.hazelcast.internal.metrics.ProbeUnit;
import com.hazelcast.internal.nio.IOUtil;
import com.hazelcast.internal.serialization.InternalSerializationService;
import com.hazelcast.internal.util.concurrent.MPSCQueue;
import com.hazelcast.internal.util.counters.Counter;
import com.hazelcast.internal.util.counters.MwCounter;
import com.hazelcast.jet.config.JobConfig;
import com.hazelcast.jet.core.ProcessorSupplier;
import com.hazelcast.jet.core.metrics.MetricTags;
import com.hazelcast.jet.function.RunnableEx;
import com.hazelcast.jet.impl.JetServiceBackend;
import com.hazelcast.jet.impl.JobClassLoaderService;
import com.hazelcast.jet.impl.TerminationMode;
import com.hazelcast.jet.impl.exception.JobTerminateRequestedException;
import com.hazelcast.jet.impl.exception.TerminatedWithSnapshotException;
import com.hazelcast.jet.impl.execution.init.ExecutionPlan;
import com.hazelcast.jet.impl.execution.init.VertexDef;
import com.hazelcast.jet.impl.metrics.RawJobMetrics;
import com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result;
import com.hazelcast.jet.impl.util.Util;
import com.hazelcast.logging.ILogger;
import com.hazelcast.spi.impl.NodeEngineImpl;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Function;
import static com.hazelcast.internal.util.ConcurrencyUtil.CALLER_RUNS;
import static com.hazelcast.internal.util.ExceptionUtil.withTryCatch;
import static com.hazelcast.jet.Util.idToString;
import static com.hazelcast.jet.core.metrics.MetricNames.EXECUTION_COMPLETION_TIME;
import static com.hazelcast.jet.core.metrics.MetricNames.EXECUTION_START_TIME;
import static com.hazelcast.jet.impl.util.Util.doWithClassLoader;
import static com.hazelcast.spi.impl.executionservice.ExecutionService.JOB_OFFLOADABLE_EXECUTOR;
import static java.util.Collections.emptyList;
import static java.util.Collections.unmodifiableMap;
import static java.util.concurrent.CompletableFuture.completedFuture;
import static java.util.concurrent.CompletableFuture.runAsync;
/**
* Data pertaining to single job execution on all cluster members. There's one
* instance per job execution; if the job is restarted, another instance will
* be used.
*/
public class ExecutionContext implements DynamicMetricsProvider {
private static final Function super SenderReceiverKey, ? extends Queue> CREATE_RECEIVER_QUEUE_FN =
key -> new MPSCQueue<>(null);
private final long jobId;
private final long executionId;
private final boolean isLightJob;
private volatile Address coordinator;
private volatile Set participants;
private final long createdOn = System.nanoTime();
private final Object executionLock = new Object();
private final ILogger logger;
private final Counter startTime = MwCounter.newMwCounter(-1);
private final Counter completionTime = MwCounter.newMwCounter(-1);
// key: resource identifier
// we use ConcurrentHashMap because ConcurrentMap doesn't guarantee that computeIfAbsent
// executes the supplier strictly only if it's needed.
private final ConcurrentHashMap tempDirectories = new ConcurrentHashMap<>();
private String jobName;
private volatile Map receiverMap;
private volatile Map senderMap;
private final Map> receiverQueuesMap;
private List vertices = emptyList();
private List tasklets = emptyList();
// future which is completed only after all tasklets are completed and contains execution result
private volatile CompletableFuture executionFuture;
// future which can only be used to cancel the local execution.
private final CompletableFuture cancellationFuture = new CompletableFuture<>();
private final NodeEngineImpl nodeEngine;
private final JetServiceBackend jetServiceBackend;
private volatile SnapshotContext snapshotContext;
private boolean metricsEnabled;
private volatile RawJobMetrics metrics = RawJobMetrics.empty();
private InternalSerializationService serializationService;
private final AtomicBoolean executionCompleted = new AtomicBoolean();
public ExecutionContext(NodeEngineImpl nodeEngine, long jobId, long executionId, boolean isLightJob) {
this.jobId = jobId;
this.executionId = executionId;
this.isLightJob = isLightJob;
this.nodeEngine = nodeEngine;
this.jetServiceBackend = nodeEngine.getService(JetServiceBackend.SERVICE_NAME);
this.jobName = idToString(jobId);
this.logger = nodeEngine.getLogger(getClass());
// The map is only concurrent for light jobs because they can receive packets before they are
// initialized. For regular jobs we use non-concurrent map for performance
// We considered replacing the CHM with a HM after initialization for performance, but there's
// not much benefit in that according to this: https://stackoverflow.com/a/32141829/952135
receiverQueuesMap = isLightJob ? new ConcurrentHashMap<>() : new HashMap<>();
}
public CompletableFuture initialize(
@Nonnull Address coordinator,
@Nonnull Set participants,
@Nonnull ExecutionPlan plan) {
this.coordinator = coordinator;
this.participants = participants;
JobConfig jobConfig = plan.getJobConfig();
jobName = jobConfig.getName() == null ? jobName : jobConfig.getName();
// Must be populated early, so all processor suppliers are
// available to be completed in the case of init failure
vertices = plan.getVertices();
snapshotContext = new SnapshotContext(nodeEngine.getLogger(SnapshotContext.class), jobNameAndExecutionId(),
plan.lastSnapshotId(), jobConfig.getProcessingGuarantee());
serializationService = isLightJob
? (InternalSerializationService) nodeEngine.getSerializationService()
: ((JetServiceBackend) nodeEngine.getService(JetServiceBackend.SERVICE_NAME))
.createSerializationService(jobConfig.getSerializerConfigs());
metricsEnabled = jobConfig.isMetricsEnabled() && nodeEngine.getConfig().getMetricsConfig().isEnabled();
return plan.initialize(nodeEngine, jobId, executionId, snapshotContext, tempDirectories, serializationService)
.thenAccept(ignored -> initWithPlan(plan));
}
private void initWithPlan(@Nonnull ExecutionPlan plan) {
int numPrioritySsTasklets = plan.getStoreSnapshotTaskletCount() != 0
? plan.getHigherPriorityVertexCount()
: 0;
snapshotContext.initTaskletCount(plan.getProcessorTaskletCount(), plan.getStoreSnapshotTaskletCount(),
numPrioritySsTasklets);
Map receiverMapTmp = new HashMap<>();
for (Entry>> vertexIdEntry : plan.getReceiverMap().entrySet()) {
for (Entry> ordinalEntry : vertexIdEntry.getValue().entrySet()) {
for (Entry addressEntry : ordinalEntry.getValue().entrySet()) {
SenderReceiverKey key =
new SenderReceiverKey(vertexIdEntry.getKey(), ordinalEntry.getKey(), addressEntry.getKey());
// the queue might already exist, if some data were received for it, or it will be created now
Queue queue = receiverQueuesMap.computeIfAbsent(key, CREATE_RECEIVER_QUEUE_FN);
ReceiverTasklet receiverTasklet = addressEntry.getValue();
receiverTasklet.initIncomingQueue(queue);
receiverMapTmp.put(
new SenderReceiverKey(vertexIdEntry.getKey(), ordinalEntry.getKey(), addressEntry.getKey()),
receiverTasklet);
}
}
}
this.receiverMap = unmodifiableMap(receiverMapTmp);
Map senderMapTmp = new HashMap<>();
for (Entry>> e1 : plan.getSenderMap().entrySet()) {
for (Entry> e2 : e1.getValue().entrySet()) {
for (Entry e3 : e2.getValue().entrySet()) {
senderMapTmp.put(new SenderReceiverKey(e1.getKey(), e2.getKey(), e3.getKey()), e3.getValue());
}
}
}
this.senderMap = unmodifiableMap(senderMapTmp);
tasklets = plan.getTasklets();
}
/**
* Starts local execution of job by submitting tasklets to execution service. If
* execution was cancelled earlier then execution will not be started.
*
* Returns a future which is completed only when all tasklets are completed. If
* execution was already cancelled before this method is called then the returned
* future is completed immediately. The future returned can't be cancelled,
* instead {@link #terminateExecution} should be used.
*/
public CompletableFuture beginExecution(TaskletExecutionService taskletExecService) {
synchronized (executionLock) {
if (executionFuture != null) {
// beginExecution was already called or execution was cancelled before it started.
logger.fine("%s: execution started after cancelled", jobNameAndExecutionId());
return executionFuture;
} else {
// begin job execution
ClassLoader cl = jetServiceBackend.getJobClassLoaderService().getClassLoader(jobId);
if (cl == null) {
cl = nodeEngine.getConfigClassLoader();
}
startTime.set(System.currentTimeMillis());
executionFuture = taskletExecService
.beginExecute(tasklets, cancellationFuture, cl)
.whenComplete(withTryCatch(logger, (r, t) -> setCompletionTime()))
.thenApply(res -> {
// There's a race here: a snapshot could be requested after the job just completed
// normally, in that case we'll report that it terminated with snapshot.
// We ignore this for now.
if (snapshotContext.isTerminalSnapshot()) {
throw new TerminatedWithSnapshotException();
}
return res;
});
}
return executionFuture;
}
}
/**
* Complete local execution. If local execution was started, it should be
* called after execution has completed.
*/
public CompletableFuture completeExecution(Throwable error) {
assert executionFuture == null || executionFuture.isDone()
: "If execution was begun, then completeExecution() should not be called before execution is done.";
if (!executionCompleted.compareAndSet(false, true)) {
return completedFuture(null);
}
for (Tasklet tasklet : tasklets) {
try {
tasklet.close();
} catch (Throwable e) {
logger.severe(jobNameAndExecutionId()
+ " encountered an exception in Processor.close(), ignoring it", e);
}
}
JobClassLoaderService jobClassloaderService = jetServiceBackend.getJobClassLoaderService();
List> futures = new ArrayList<>(vertices.size());
ExecutorService offloadExecutor = nodeEngine.getExecutionService().getExecutor(JOB_OFFLOADABLE_EXECUTOR);
for (VertexDef vertex : vertices) {
ProcessorSupplier processorSupplier = vertex.processorSupplier();
RunnableEx closeAction = () -> {
try {
ClassLoader processorCl = isLightJob ?
null : jobClassloaderService.getProcessorClassLoader(jobId, vertex.name());
doWithClassLoader(processorCl, () -> processorSupplier.close(error));
} catch (Throwable e) {
logger.severe(jobNameAndExecutionId()
+ " encountered an exception in ProcessorSupplier.close(), ignoring it", e);
}
};
Executor executor = processorSupplier.closeIsCooperative() ? CALLER_RUNS : offloadExecutor;
futures.add(runAsync(closeAction, executor));
}
return CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))
.whenComplete(withTryCatch(logger, (ignored, e) -> {
tempDirectories.forEach((k, dir) -> {
try {
IOUtil.delete(dir);
} catch (Exception ex) {
logger.warning("Failed to delete temporary directory " + dir);
}
});
if (!isLightJob && serializationService != null) {
serializationService.dispose();
}
}));
}
/**
* Terminates the local execution of tasklets. Returns false, if the
* execution wasn't yet begun.
*/
public boolean terminateExecution(@Nullable TerminationMode mode, Throwable cause) {
assert mode == null || !mode.isWithTerminalSnapshot()
: "terminating with a mode that should do a terminal snapshot";
synchronized (executionLock) {
if (mode == null) {
cancellationFuture.completeExceptionally(new ExecutionCancellationException(cause));
} else {
cancellationFuture.completeExceptionally(new JobTerminateRequestedException(mode));
}
if (executionFuture == null) {
// if cancelled before execution started, then assign the already completed future.
executionFuture = cancellationFuture;
return false;
}
// Very rarely it can happen that snapshotContext=null here.
// Basic scenario is when job initializes slowly (under load) and execution context is already created
// but not yet fully initialized (did not reach SnapshotContext creation in ExecutionContext.initialize()).
// If such job is terminated twice for any reason (eg. manual termination, cancelAllExecutions, member left etc.)
// and the completeExecution invocation is slow, then terminateExecution may be invoked twice from
// JobExecutionService.terminateExecution0.
// If that happens, first invocation will set `executionFuture = cancellationFuture` and the second invocation
// will reach here. This should not be very harmful, as completeExecution is safe to be invoked multiple times.
//
// Due to concurrent nature of initialization and cancellation it is hard to precisely know if the job
// was cancelled _before_ execution started or cancelled _when_ the execution was starting.
// However, in any case, snapshotContext=null means that the job has not yet started,
// so there is no cleanup to do.
if (snapshotContext != null) {
snapshotContext.cancel();
}
return true;
}
}
/**
* Starts the phase 1 of a new snapshot.
*/
public CompletableFuture beginSnapshotPhase1(long snapshotId, String mapName, int flags) {
logger.fine("Starting snapshot %d phase 1 for %s on member", snapshotId, jobNameAndExecutionId());
synchronized (executionLock) {
if (cancellationFuture.isDone()) {
throw new CancellationException();
} else if (executionFuture != null && executionFuture.isDone()) {
// if execution is done, there are 0 processors to take snapshot of. Therefore we're done now.
logger.fine("Ignoring snapshot %d phase 1 for %s: execution completed",
snapshotId, jobNameAndExecutionId());
return completedFuture(new SnapshotPhase1Result(0, 0, 0, null));
}
return snapshotContext.startNewSnapshotPhase1(snapshotId, mapName, flags);
}
}
/**
* Starts the phase 2 of the current snapshot.
*/
public CompletableFuture beginSnapshotPhase2(long snapshotId, boolean success) {
logger.fine("Starting snapshot %d phase 2 for %s on member", snapshotId, jobNameAndExecutionId());
synchronized (executionLock) {
if (cancellationFuture.isDone()) {
throw new CancellationException();
} else if (executionFuture != null && executionFuture.isDone()) {
// if execution is done, there are 0 processors to take snapshot of. Therefore we're done now.
logger.fine("Ignoring snapshot %d phase 2 for %s: execution completed",
snapshotId, jobNameAndExecutionId());
return completedFuture(null);
}
return snapshotContext.startNewSnapshotPhase2(snapshotId, success);
}
}
public void handlePacket(int vertexId, int ordinal, Address sender, byte[] payload) {
receiverQueuesMap.computeIfAbsent(new SenderReceiverKey(vertexId, ordinal, sender), CREATE_RECEIVER_QUEUE_FN)
.add(payload);
}
public boolean hasParticipant(Address member) {
// once participants is not null, it's always not null
return participants != null && participants.contains(member);
}
public long jobId() {
return jobId;
}
public long executionId() {
return executionId;
}
public String jobNameAndExecutionId() {
return Util.jobNameAndExecutionId(jobName, executionId);
}
public boolean isLightJob() {
return isLightJob;
}
public Address coordinator() {
return coordinator;
}
public Map senderMap() {
return senderMap;
}
public Map receiverMap() {
return receiverMap;
}
public Set participants() {
return participants;
}
@Nullable
public String jobName() {
return jobName;
}
public RawJobMetrics getMetrics() {
return metrics;
}
public void setMetrics(RawJobMetrics metrics) {
this.metrics = metrics;
}
@Override
public void provideDynamicMetrics(MetricDescriptor descriptor, MetricsCollectionContext context) {
if (!metricsEnabled) {
return;
}
descriptor.withTag(MetricTags.JOB, idToString(jobId))
.withTag(MetricTags.JOB_NAME, jobName)
.withTag(MetricTags.EXECUTION, idToString(executionId));
context.collect(descriptor, EXECUTION_START_TIME, ProbeLevel.INFO, ProbeUnit.MS, startTime.get());
context.collect(descriptor, EXECUTION_COMPLETION_TIME, ProbeLevel.INFO, ProbeUnit.MS, completionTime.get());
for (Tasklet tasklet : tasklets) {
tasklet.provideDynamicMetrics(descriptor.copy(), context);
}
}
public void setCompletionTime() {
completionTime.set(System.currentTimeMillis());
}
public CompletableFuture getExecutionFuture() {
return executionFuture;
}
public long getCreatedOn() {
return createdOn;
}
public static final class SenderReceiverKey {
public final int vertexId;
public final int ordinal;
public final Address address;
public SenderReceiverKey(int vertexId, int ordinal, @Nonnull Address address) {
this.vertexId = vertexId;
this.ordinal = ordinal;
this.address = address;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
SenderReceiverKey that = (SenderReceiverKey) o;
return vertexId == that.vertexId && ordinal == that.ordinal && address.equals(that.address);
}
@Override
public int hashCode() {
return Objects.hash(vertexId, ordinal, address);
}
}
}