
// com.hazelcast.jet.impl.MasterSnapshotContext (Maven / Gradle / Ivy)
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.impl;
import com.hazelcast.core.IndeterminateOperationStateException;
import com.hazelcast.internal.cluster.MemberInfo;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats;
import com.hazelcast.jet.impl.exception.ExecutionNotFoundException;
import com.hazelcast.jet.impl.execution.SnapshotFlags;
import com.hazelcast.jet.impl.execution.init.ExecutionPlan;
import com.hazelcast.jet.impl.operation.SnapshotPhase1Operation;
import com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result;
import com.hazelcast.jet.impl.operation.SnapshotPhase2Operation;
import com.hazelcast.logging.ILogger;
import com.hazelcast.map.IMap;
import com.hazelcast.spi.impl.operationservice.Operation;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import java.util.logging.Level;
import static com.hazelcast.jet.Util.idToString;
import static com.hazelcast.jet.core.JobStatus.RUNNING;
import static com.hazelcast.jet.impl.JobRepository.exportedSnapshotMapName;
import static com.hazelcast.jet.impl.JobRepository.safeImap;
import static com.hazelcast.jet.impl.JobRepository.snapshotDataMapName;
import static com.hazelcast.internal.util.ExceptionUtil.withTryCatch;
import static com.hazelcast.jet.impl.util.Util.jobNameAndExecutionId;
import static java.util.concurrent.CompletableFuture.completedFuture;
/**
* Part of {@link MasterContext} that deals with snapshot creation.
*/
class MasterSnapshotContext {
// The master context this snapshot context belongs to; the visible methods use it
// for locking, job/execution state and for invoking operations on participants.
@SuppressWarnings("WeakerAccess") // accessed from subclass in jet-enterprise
final MasterContext mc;
// Logger for snapshot progress and failure messages.
private final ILogger logger;
/**
* It's true while a snapshot is in progress. It's used to prevent
* concurrent snapshots. In the visible code it is read and set only while
* holding {@code mc.lock()} (see {@code tryBeginSnapshot()}).
*/
private boolean snapshotInProgress;
/**
* A future (re)created when the job is started and completed when terminal
* snapshot is completed (successfully or not). It is initialized to an
* already-completed future, and {@code tryBeginSnapshot()} refuses to begin
* a snapshot while this future is done.
* <p>
* NOTE(review): declared with the raw {@code CompletableFuture} type; the
* type argument was probably lost - confirm and restore it.
*/
@Nonnull
private volatile CompletableFuture terminalSnapshotFuture = completedFuture(null);
/**
 * A single queued request to take a snapshot. It describes what kind of
 * snapshot is wanted (regular vs. exported, terminal vs. non-terminal) and
 * optionally carries a future to be completed with the outcome.
 * <p>
 * This is an inner (non-static) class on purpose: {@link #mapName()} reads
 * the job ID and the job execution record from the enclosing instance's
 * {@code mc}.
 */
private class SnapshotRequest {
    /**
     * User-specified name of the snapshot or null, if no name is specified
     * (regular snapshot). This name is without
     * {@link JobRepository#EXPORTED_SNAPSHOTS_PREFIX} prefix.
     */
    @Nullable
    final String snapshotName;

    /**
     * If true, execution will be terminated after the snapshot.
     */
    final boolean isTerminal;

    /**
     * Future, that will be completed when the snapshot is validated. Can
     * be null. It carries no value: it is only ever completed with
     * {@code null} or exceptionally, see {@link #completeFuture(Throwable)}.
     */
    @Nullable
    final CompletableFuture<Void> future;

    /**
     * @param snapshotName name of the exported snapshot, or null for a regular snapshot
     * @param isTerminal whether the execution should terminate after the snapshot
     * @param future future to complete with the outcome, or null if nobody waits for it
     */
    SnapshotRequest(@Nullable String snapshotName, boolean isTerminal, @Nullable CompletableFuture<Void> future) {
        this.snapshotName = snapshotName;
        this.isTerminal = isTerminal;
        this.future = future;
    }

    /**
     * Returns true if this request has a snapshot name, i.e. it is an
     * export rather than a regular snapshot.
     */
    public boolean isExport() {
        return snapshotName != null;
    }

    /**
     * Returns true if this is an export that does not terminate the execution.
     *
     * @see SnapshotFlags#isExportOnly(int)
     */
    public boolean isExportOnly() {
        return isExport() && !isTerminal;
    }

    /**
     * Returns the flags for this request, created by
     * {@link SnapshotFlags#create} from the terminal and export properties.
     */
    public int snapshotFlags() {
        return SnapshotFlags.create(isTerminal, isExport());
    }

    /**
     * Returns the name of the map the snapshot is written to: the exported
     * snapshot map for the given name if this is an export, otherwise the
     * snapshot data map for the job ID and the ongoing data map index from
     * the job execution record.
     */
    public String mapName() {
        if (isExport()) {
            return exportedSnapshotMapName(snapshotName);
        }
        return snapshotDataMapName(mc.jobId(), mc.jobExecutionRecord().ongoingDataMapIndex());
    }

    /**
     * Complete snapshot future, if any. Does nothing if this request has
     * no future.
     *
     * @param error Error, or null for successful completion.
     */
    public void completeFuture(@Nullable Throwable error) {
        if (future == null) {
            return;
        }
        if (error == null) {
            future.complete(null);
        } else {
            future.completeExceptionally(error);
        }
    }
}
/**
 * The queue with snapshots to run. An item is added to it regularly (to do
 * a regular snapshot) or when a snapshot export is requested by the user.
 * <p>
 * Queue is accessed only in synchronized code. In this class the queue is
 * filled in {@code startScheduledSnapshot()} and polled in
 * {@code tryBeginSnapshot()}, both while holding {@code mc.lock()}.
 * <p>
 * The element type must be {@link SnapshotRequest}: the result of
 * {@code poll()} is assigned directly to a {@code SnapshotRequest} variable,
 * which does not compile against a raw {@code Queue}.
 */
private final Queue<SnapshotRequest> snapshotQueue = new LinkedList<>();
/**
 * Creates the snapshot-handling part of the given master context.
 *
 * @param masterContext the owning master context, kept in {@code mc}
 * @param logger the logger used for snapshot-related messages
 */
MasterSnapshotContext(MasterContext masterContext, ILogger logger) {
    this.logger = logger;
    this.mc = masterContext;
}
/**
 * Appends a new snapshot request to the snapshot queue. It only enqueues:
 * it neither takes the lock nor starts the snapshot. The caller visible in
 * this class, {@code startScheduledSnapshot()}, enqueues while holding
 * {@code mc.lock()} and calls {@code tryBeginSnapshot()} afterwards.
 *
 * @param snapshotName name of the exported snapshot, or null for a regular snapshot
 * @param isTerminal whether the execution should terminate after the snapshot
 * @param future future to complete with the snapshot outcome, or null
 */
@SuppressWarnings("SameParameterValue") // used by jet-enterprise
void enqueueSnapshot(String snapshotName, boolean isTerminal, CompletableFuture future) {
    SnapshotRequest request = new SnapshotRequest(snapshotName, isTerminal, future);
    snapshotQueue.add(request);
}
/**
* Enqueues a regular snapshot request: no snapshot name (so it is not an
* export), not terminal, and no future to complete.
*/
private void enqueueRegularSnapshot() {
enqueueSnapshot(null, false, null);
}
/**
 * Entry point for a scheduled (regular) snapshot. While holding
 * {@code mc.lock()} it checks that the job is RUNNING and that the given
 * execution ID is still the current one; if so, it enqueues a regular
 * snapshot. After releasing the lock it calls {@link #tryBeginSnapshot()},
 * but only if a snapshot was actually enqueued.
 *
 * @param executionId the execution ID for which the snapshot was scheduled
 */
void startScheduledSnapshot(long executionId) {
    boolean enqueued = false;
    mc.lock();
    try {
        if (mc.jobStatus() != RUNNING) {
            logger.fine("Not beginning snapshot, " + mc.jobIdString() + " is not RUNNING, but " + mc.jobStatus());
        } else if (mc.executionId() != executionId) {
            // The scheduled snapshot of a previous execution isn't cancelled when that execution
            // completes; a newer execution has probably started meanwhile, so this one is ignored.
            logger.fine("Not beginning snapshot since unexpected execution ID received for " + mc.jobIdString()
                    + ". Received execution ID: " + idToString(executionId));
        } else {
            enqueueRegularSnapshot();
            enqueued = true;
        }
    } finally {
        mc.unlock();
    }
    if (enqueued) {
        tryBeginSnapshot();
    }
}
/**
* Submits a task to the coordinator thread that begins the next queued
* snapshot, if there is one and it is allowed to start.
* <p>
* Under {@code mc.lock()} the task gives up (after logging) if the job is not
* RUNNING, if a snapshot is already in progress or if the terminal snapshot
* future is done; it also gives up silently if the queue is empty. Otherwise
* it marks a snapshot as in progress and registers a new snapshot in the job
* execution record. After releasing the lock it writes the job execution
* record, clears the target map and invokes {@link SnapshotPhase1Operation}
* on the participants; the responses are handled in
* {@code onSnapshotPhase1Complete()}.
*/
void tryBeginSnapshot() {
mc.coordinationService().submitToCoordinatorThread(() -> {
final SnapshotRequest requestedSnapshot;
mc.lock();
long localExecutionId;
try {
if (mc.jobStatus() != RUNNING) {
logger.fine("Not beginning snapshot, " + mc.jobIdString() + " is not RUNNING, but " + mc.jobStatus());
return;
}
if (snapshotInProgress) {
logger.fine("Not beginning snapshot since one is already in progress " + mc.jobIdString());
return;
}
if (terminalSnapshotFuture.isDone()) {
logger.fine("Not beginning snapshot since terminal snapshot is already completed " + mc.jobIdString());
return;
}
requestedSnapshot = snapshotQueue.poll();
if (requestedSnapshot == null) {
// nothing is queued
return;
}
snapshotInProgress = true;
mc.jobExecutionRecord().startNewSnapshot(requestedSnapshot.snapshotName);
// captured under the lock so that the rest of the task uses a consistent execution ID
localExecutionId = mc.executionId();
} finally {
mc.unlock();
}
// From here on the lock is not held: the record write and the map clear below run outside of it.
long newSnapshotId = mc.jobExecutionRecord().ongoingSnapshotId();
int snapshotFlags = requestedSnapshot.snapshotFlags();
String mapName = requestedSnapshot.mapName();
try {
mc.writeJobExecutionRecordSafe(false);
// the target map is emptied before the new snapshot is written to it
mc.nodeEngine().getHazelcastInstance().getMap(mapName).clear();
} catch (Exception e) {
logger.warning(String.format("Failed to start snapshot %d for %s",
newSnapshotId, jobNameAndExecutionId(mc.jobName(), localExecutionId)),
e);
requestedSnapshot.completeFuture(e);
// NOTE(review): snapshotInProgress was set to true above and is not reset on this path in the
// visible code - confirm it is reset elsewhere, otherwise later snapshots would be refused.
return;
}
// NOTE(review): the "writing to" argument is the snapshot name (null for a regular snapshot),
// not mapName - confirm this is intended.
logger.fine("Starting snapshot %d for %s, flags: %s, writing to: %s",
newSnapshotId, jobNameAndExecutionId(mc.jobName(), localExecutionId),
SnapshotFlags.toString(snapshotFlags), requestedSnapshot.snapshotName);
// The same operation parameters are used for every participant; the plan argument is not used.
Function factory = plan ->
new SnapshotPhase1Operation(mc.jobId(), localExecutionId, newSnapshotId, mapName, snapshotFlags);
// Need to take a copy of executionId: we don't cancel the scheduled task when the execution
// finalizes. If a new execution is started in the meantime, we'll use the execution ID to detect it.
mc.invokeOnParticipants(
factory,
responses -> onSnapshotPhase1Complete(responses, localExecutionId, newSnapshotId, requestedSnapshot),
null, true);
});
}
/**
* @param responses collected responses from the members
*/
private void onSnapshotPhase1Complete(
Collection> responses,
long executionId,
long snapshotId,
SnapshotRequest requestedSnapshot
) {
mc.coordinationService().submitToCoordinatorThread(() -> {
SnapshotPhase1Result mergedResult = new SnapshotPhase1Result();
List> missingResponses = new ArrayList<>();
for (Map.Entry entry : responses) {
// the response is either SnapshotOperationResult or an exception, see #invokeOnParticipants() method
Object response = entry.getValue();
if (response instanceof Throwable throwable) {
// If the member doesn't know the execution, it might have completed normally or exceptionally.
// If normally, we ignore it, if exceptionally, we'll also fail the snapshot. To know, we have
// to look at the result of the StartExecutionOperation, which might not have arrived yet. We'll collect
// all the responses to an array, and we'll wait for them later.
if (response instanceof ExecutionNotFoundException) {
missingResponses.add(mc.startOperationResponses().get(entry.getKey().getAddress()));
continue;
}
response = new SnapshotPhase1Result(0, 0, 0, throwable);
}
mergedResult.merge((SnapshotPhase1Result) response);
}
if (!missingResponses.isEmpty()) {
logger.fine("%s will wait for %d responses to StartExecutionOperation in " +
"onSnapshotPhase1Complete()", mc.jobIdString(), missingResponses.size());
}
// In a typical case `missingResponses` will be empty. It will be non-empty if some member completed
// its execution and some other did not, or near the completion of a job, e.g. after a failure.
// `allOf` for an empty array returns a completed future immediately.
// Another edge case is that we'll be waiting for a response to start operation from a next execution,
// which can happen much later - we could handle it, but we ignore it: when it arrives, we'll find a
// changed executionId and ignore the response. It also doesn't occupy a thread - we're using a future.
CompletableFuture.allOf(missingResponses.toArray(new CompletableFuture[0]))
.whenComplete(withTryCatch(logger, (r, t) ->
onSnapshotPhase1CompleteWithStartResponses(responses, executionId, snapshotId, requestedSnapshot,
mergedResult, missingResponses)));
});
}
private void onSnapshotPhase1CompleteWithStartResponses(
Collection> responses,
long executionId,
long snapshotId,
SnapshotRequest requestedSnapshot,
SnapshotPhase1Result mergedResult,
List> missingResponses
) {
mc.coordinationService().submitToCoordinatorThread(() -> {
final boolean isSuccess;
boolean skipPhase2 = false;
SnapshotStats stats;
mc.lock();
try {
if (!missingResponses.isEmpty()) {
logger.fine("%s all awaited responses to StartExecutionOperation received or " +
"were already received", mc.jobIdString());
}
// Note: this method can be called after finalizeJob() is called or even after new execution started.
// Check the execution ID to check if a new execution didn't start yet.
if (executionId != mc.executionId()) {
logger.fine("%s: ignoring responses for snapshot %s phase 1: " +
"the responses are from a different execution: %s. Responses: %s",
mc.jobIdString(), snapshotId, idToString(executionId), responses);
// a new execution started, ignore this response.
return;
}
for (CompletableFuture