/*
* Copyright (c) 2008-2018, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.impl;
import com.hazelcast.core.ExecutionCallback;
import com.hazelcast.internal.cluster.MemberInfo;
import com.hazelcast.internal.cluster.impl.ClusterServiceImpl;
import com.hazelcast.internal.cluster.impl.MembersView;
import com.hazelcast.jet.config.JobConfig;
import com.hazelcast.jet.config.ProcessingGuarantee;
import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.Edge;
import com.hazelcast.jet.core.JobStatus;
import com.hazelcast.jet.core.TopologyChangedException;
import com.hazelcast.jet.core.Vertex;
import com.hazelcast.jet.impl.exception.JobRestartRequestedException;
import com.hazelcast.jet.impl.execution.init.ExecutionPlan;
import com.hazelcast.jet.impl.operation.CancelExecutionOperation;
import com.hazelcast.jet.impl.operation.CompleteExecutionOperation;
import com.hazelcast.jet.impl.operation.InitExecutionOperation;
import com.hazelcast.jet.impl.operation.SnapshotOperation;
import com.hazelcast.jet.impl.operation.StartExecutionOperation;
import com.hazelcast.jet.impl.util.CompletionToken;
import com.hazelcast.jet.impl.util.ExceptionUtil;
import com.hazelcast.jet.impl.util.NonCompletableFuture;
import com.hazelcast.logging.ILogger;
import com.hazelcast.nio.Address;
import com.hazelcast.spi.ExecutionService;
import com.hazelcast.spi.InternalCompletableFuture;
import com.hazelcast.spi.Operation;
import com.hazelcast.spi.impl.NodeEngineImpl;
import javax.annotation.Nullable;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import static com.hazelcast.jet.core.Edge.between;
import static com.hazelcast.jet.core.JobStatus.COMPLETED;
import static com.hazelcast.jet.core.JobStatus.FAILED;
import static com.hazelcast.jet.core.JobStatus.NOT_STARTED;
import static com.hazelcast.jet.core.JobStatus.RESTARTING;
import static com.hazelcast.jet.core.JobStatus.RUNNING;
import static com.hazelcast.jet.core.JobStatus.STARTING;
import static com.hazelcast.jet.core.processor.SourceProcessors.readMapP;
import static com.hazelcast.jet.function.DistributedFunctions.entryKey;
import static com.hazelcast.jet.impl.SnapshotRepository.snapshotDataMapName;
import static com.hazelcast.jet.impl.execution.SnapshotContext.NO_SNAPSHOT;
import static com.hazelcast.jet.impl.execution.init.CustomClassLoadedObject.deserializeWithCustomClassLoader;
import static com.hazelcast.jet.impl.execution.init.ExecutionPlanBuilder.createExecutionPlans;
import static com.hazelcast.jet.impl.util.ExceptionUtil.isTopologicalFailure;
import static com.hazelcast.jet.impl.util.ExceptionUtil.peel;
import static com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch;
import static com.hazelcast.jet.impl.util.Util.getJetInstance;
import static com.hazelcast.jet.impl.util.Util.idToString;
import static com.hazelcast.jet.impl.util.Util.jobAndExecutionId;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.partitioningBy;
import static java.util.stream.Collectors.toList;
/**
* Data pertaining to single job on master member. There's one instance per job,
* shared between multiple executions.
*/
public class MasterContext {
// Priority assigned to snapshot-restore edges so they are drained before all regular edges.
public static final int SNAPSHOT_RESTORE_EDGE_PRIORITY = Integer.MIN_VALUE;

private final NodeEngineImpl nodeEngine;
private final JobCoordinationService coordinationService;
private final ILogger logger;
// Persistent record describing the job (DAG blob, config, quorum size).
private final JobRecord jobRecord;
private final long jobId;
// Completed exactly once, when the job reaches a terminal state; exposed to clients.
private final NonCompletableFuture completionFuture = new NonCompletableFuture();
// Completed when the user cancels the job; observed at several points of the start procedure.
private final CompletionToken cancellationToken;
private final AtomicReference<JobStatus> jobStatus = new AtomicReference<>(NOT_STARTED);
private final SnapshotRepository snapshotRepository;

// Vertices of the current execution's DAG (snapshot taken before restore-rewriting mutates the DAG).
private volatile Set<Vertex> vertices;
private volatile long executionId;
private volatile long jobStartTime;
// Execution plan per participating member for the current execution attempt.
private volatile Map<MemberInfo, ExecutionPlan> executionPlanMap;
// Completed to abort the current execution when a restart is requested.
private volatile CompletionToken executionRestartToken;
/**
 * Creates the master-side context for one job.
 *
 * @param nodeEngine          node services of the master member
 * @param coordinationService owning service; also supplies the snapshot repository and job class loader
 * @param jobRecord           persistent job metadata (id, serialized DAG, config)
 */
MasterContext(NodeEngineImpl nodeEngine, JobCoordinationService coordinationService, JobRecord jobRecord) {
this.nodeEngine = nodeEngine;
this.coordinationService = coordinationService;
this.snapshotRepository = coordinationService.snapshotRepository();
// logger must be assigned before it is handed to the cancellation token below
this.logger = nodeEngine.getLogger(getClass());
this.jobRecord = jobRecord;
this.jobId = jobRecord.getJobId();
this.cancellationToken = new CompletionToken(logger);
}
/** Returns the id of the job this context manages. */
public long getJobId() {
    return jobId;
}

/** Returns the id of the current (or most recent) execution attempt. */
public long getExecutionId() {
    return executionId;
}

/** Returns the current job status. */
public JobStatus jobStatus() {
    return jobStatus.get();
}

/** Returns the job's configuration, taken from the job record. */
public JobConfig getJobConfig() {
    return jobRecord.getConfig();
}

/** Returns the persistent record describing this job. */
public JobRecord getJobRecord() {
    return jobRecord;
}

/**
 * Returns the future that completes when the job reaches a terminal state
 * (completed, failed or cancelled).
 */
public CompletableFuture<Void> completionFuture() {
    return completionFuture;
}

/**
 * Requests cancellation of the job.
 *
 * @return {@code true} if this call completed the cancellation token,
 *         {@code false} if it was already completed
 */
boolean cancelJob() {
    return cancellationToken.complete();
}

/** Returns whether the job has been cancelled. */
boolean isCancelled() {
    return cancellationToken.isCompleted();
}
/**
 * Starts execution of the job if it is not already completed, cancelled or failed.
 * If the job is already cancelled, the job completion procedure is triggered.
 * If the job quorum is not satisfied, job restart is rescheduled.
 * If there was a membership change and the partition table is not completely
 * fixed yet, job restart is rescheduled.
 *
 * @param executionIdSupplier maps the job id to a fresh execution id for this attempt
 */
void tryStartJob(Function<Long, Long> executionIdSupplier) {
    if (!setJobStatusToStarting()) {
        return;
    }
    if (scheduleRestartIfQuorumAbsent() || scheduleRestartIfClusterIsNotSafe()) {
        return;
    }
    DAG dag;
    try {
        dag = deserializeDAG();
    } catch (Exception e) {
        logger.warning("DAG deserialization failed", e);
        finalizeJob(e);
        return;
    }
    // save a copy of the vertex list, because the DAG is mutated below when a snapshot is restored
    vertices = new HashSet<>();
    dag.iterator().forEachRemaining(vertices::add);
    executionId = executionIdSupplier.apply(jobId);

    // id of the last started snapshot, complete or not. The next started snapshot must be greater than it.
    long lastSnapshotId = NO_SNAPSHOT;
    if (isSnapshottingEnabled()) {
        Long snapshotIdToRestore = snapshotRepository.latestCompleteSnapshot(jobId);
        snapshotRepository.deleteAllSnapshotsExceptOne(jobId, snapshotIdToRestore);
        Long lastStartedSnapshot = snapshotRepository.latestStartedSnapshot(jobId);
        if (snapshotIdToRestore != null) {
            logger.info("State of " + jobIdString() + " will be restored from snapshot "
                    + snapshotIdToRestore);
            rewriteDagWithSnapshotRestore(dag, snapshotIdToRestore);
        } else {
            logger.info("No previous snapshot for " + jobIdString() + " found.");
        }
        if (lastStartedSnapshot != null) {
            lastSnapshotId = lastStartedSnapshot;
        }
    }

    MembersView membersView = getMembersView();
    // plan creation deserializes user code, so it must run under the job's class loader
    ClassLoader previousCL = swapContextClassLoader(coordinationService.getClassLoader(jobId));
    try {
        int defaultLocalParallelism = getJetInstance(nodeEngine).getConfig().getInstanceConfig()
                .getCooperativeThreadCount();
        logger.info("Start executing " + jobIdString() + ", status " + jobStatus()
                + "\n" + dag.toString(defaultLocalParallelism));
        logger.fine("Building execution plan for " + jobIdString());
        executionPlanMap = createExecutionPlans(nodeEngine, membersView, dag, getJobConfig(), lastSnapshotId);
    } catch (Exception e) {
        logger.severe("Exception creating execution plan for " + jobIdString(), e);
        finalizeJob(e);
        return;
    } finally {
        Thread.currentThread().setContextClassLoader(previousCL);
    }
    logger.fine("Built execution plans for " + jobIdString());
    Set<MemberInfo> participants = executionPlanMap.keySet();
    Function<ExecutionPlan, Operation> operationCtor = plan ->
            new InitExecutionOperation(jobId, executionId, membersView.getVersion(), participants,
                    nodeEngine.getSerializationService().toData(plan));
    invoke(operationCtor, this::onInitStepCompleted, null);
}
/**
 * Mutates {@code dag} so that each vertex is additionally fed the contents of
 * the snapshot data map for {@code snapshotId}, via a read vertex and an
 * explode vertex wired in with a high-priority restore edge.
 */
private void rewriteDagWithSnapshotRestore(DAG dag, long snapshotId) {
logger.info(jobIdString() + ": restoring state from snapshotId=" + snapshotId);
for (Vertex vertex : dag) {
// We add the snapshot-restore vertices even when the snapshot map is empty: this
// ensures that Processor.finishSnapshotRestore() is always called on all vertices
// of a job which is restored from a snapshot.
String mapName = snapshotDataMapName(jobId, snapshotId, vertex.getName());
Vertex readSnapshotVertex = dag.newVertex("__snapshot_read." + vertex.getName(), readMapP(mapName));
Vertex explodeVertex = dag.newVertex("__snapshot_explode." + vertex.getName(), ExplodeSnapshotP::new);
// match the target vertex's parallelism so the isolated edge below is valid
readSnapshotVertex.localParallelism(vertex.getLocalParallelism());
explodeVertex.localParallelism(vertex.getLocalParallelism());
// attach the restore input on a new ordinal, after all existing inbound edges
int destOrdinal = dag.getInboundEdges(vertex.getName()).size();
dag.edge(between(readSnapshotVertex, explodeVertex).isolated())
.edge(new SnapshotRestoreEdge(explodeVertex, vertex, destOrdinal));
}
}
/**
 * Moves the job status to {@link JobStatus#STARTING} (when not started yet) and
 * verifies the job may proceed with the start procedure.
 *
 * @return {@code false} if the job start process cannot proceed: the job is in a
 *         terminal state, already cancelled, being started concurrently by
 *         another caller, or in an unexpected status
 */
private boolean setJobStatusToStarting() {
JobStatus status = jobStatus();
// terminal states: nothing to start
if (status == COMPLETED || status == FAILED) {
logger.severe("Cannot init job " + idToString(jobId) + ": it is already " + status);
return false;
}
// cancelled before it ever ran: go straight to the completion procedure
if (cancellationToken.isCompleted()) {
logger.fine("Skipping init job " + idToString(jobId) + ": is already cancelled.");
finalizeJob(new CancellationException());
return false;
}
if (status == NOT_STARTED) {
// CAS guards against a concurrent starter; the loser backs off
if (!jobStatus.compareAndSet(NOT_STARTED, STARTING)) {
logger.fine("Cannot init job " + idToString(jobId) + ": someone else is just starting it");
return false;
}
jobStartTime = System.currentTimeMillis();
}
// re-read: the status may have moved since the checks above
status = jobStatus();
if (!(status == STARTING || status == RESTARTING)) {
logger.severe("Cannot init job " + idToString(jobId) + ": status is " + status);
return false;
}
return true;
}
/**
 * Reschedules the job restart when the configured quorum is not met.
 *
 * @return {@code true} if a restart was scheduled, {@code false} if the quorum is present
 */
private boolean scheduleRestartIfQuorumAbsent() {
    int quorumSize = jobRecord.getQuorumSize();
    if (!coordinationService.isQuorumPresent(quorumSize)) {
        logger.fine("Rescheduling restart of job " + idToString(jobId) + ": quorum size " + quorumSize + " is not met");
        scheduleRestart();
        return true;
    }
    return false;
}
/**
 * Reschedules the job restart when the coordination service reports the
 * cluster is not yet safe to start jobs on.
 *
 * @return {@code true} if a restart was scheduled, {@code false} if jobs may start
 */
private boolean scheduleRestartIfClusterIsNotSafe() {
    if (!coordinationService.shouldStartJobs()) {
        logger.fine("Rescheduling restart of job " + idToString(jobId) + ": cluster is not safe");
        scheduleRestart();
        return true;
    }
    return false;
}
// Marks a RUNNING job as RESTARTING (no-op for any other status) and asks the
// coordination service to retry the start later.
private void scheduleRestart() {
jobStatus.compareAndSet(RUNNING, RESTARTING);
coordinationService.scheduleRestart(jobId);
}
/** Returns the cluster's current members view, used to pick execution participants. */
private MembersView getMembersView() {
    return ((ClusterServiceImpl) nodeEngine.getClusterService())
            .getMembershipManager()
            .getMembersView();
}
/** Deserializes the job's DAG from the job record, using the job-specific class loader. */
private DAG deserializeDAG() {
    ClassLoader jobClassLoader = coordinationService.getClassLoader(jobId);
    return deserializeWithCustomClassLoader(
            nodeEngine.getSerializationService(), jobClassLoader, jobRecord.getDag());
}
/**
 * Callback invoked when all {@link InitExecutionOperation} invocations are done.
 * On success (and valid status) proceeds to start the execution, otherwise
 * triggers the complete-execution step with the failure.
 *
 * @param responses per-member init responses; a {@link Throwable} value marks a failure
 */
private void onInitStepCompleted(Map<MemberInfo, Object> responses) {
    Throwable error = getInitResult(responses);
    if (error == null) {
        // init succeeded, but the job may have moved out of a startable status meanwhile
        JobStatus status = jobStatus();
        if (!(status == STARTING || status == RESTARTING)) {
            error = new IllegalStateException("Cannot execute " + jobIdString()
                    + ": status is " + status);
        }
    }
    if (error == null) {
        invokeStartExecution();
    } else {
        invokeCompleteExecution(error);
    }
}
/**
 * If there is no failure, then returns null. If the job is cancelled, then returns CancellationException.
 * If there is at least one non-restartable failure, such as an exception in user code, then returns that failure.
 * Otherwise, the failure is because a job participant has left the cluster.
 * In that case, TopologyChangedException is returned so that the job will be restarted.
 */
private Throwable getInitResult(Map<MemberInfo, Object> responses) {
    if (cancellationToken.isCompleted()) {
        logger.fine(jobIdString() + " to be cancelled after init");
        return new CancellationException();
    }
    Map<Boolean, List<Entry<MemberInfo, Object>>> grouped = groupResponses(responses);
    Collection<MemberInfo> successfulMembers = grouped.get(false).stream().map(Entry::getKey).collect(toList());
    if (successfulMembers.size() == executionPlanMap.size()) {
        logger.fine("Init of " + jobIdString() + " is successful.");
        return null;
    }
    List<Entry<MemberInfo, Object>> failures = grouped.get(true);
    logger.fine("Init of " + jobIdString() + " failed with: " + failures);
    // if there is at least one non-restartable failure, such as a user code failure, then fail the job
    // otherwise, return TopologyChangedException so that the job will be restarted
    return failures
            .stream()
            .map(e -> (Throwable) e.getValue())
            .filter(t -> !isTopologicalFailure(t))
            .findFirst()
            .map(ExceptionUtil::peel)
            .orElse(new TopologyChangedException());
}
// true -> failures, false -> success responses
private Map>> groupResponses(Map responses) {
Map>> grouped = responses
.entrySet()
.stream()
.collect(partitioningBy(e -> e.getValue() instanceof Throwable));
grouped.putIfAbsent(true, emptyList());
grouped.putIfAbsent(false, emptyList());
return grouped;
}
// If a participant leaves or the execution fails in a participant locally, executions are cancelled
// on the remaining participants and the callback is completed after all invocations return.
private void invokeStartExecution() {
logger.fine("Executing " + jobIdString());
long executionId = this.executionId;
ExecutionInvocationCallback callback = new ExecutionInvocationCallback(executionId);
cancellationToken.whenCompleted(callback::cancelInvocations);
CompletionToken executionRestartToken = new CompletionToken(logger);
executionRestartToken.whenCompleted(callback::cancelInvocations);
Function operationCtor = plan -> new StartExecutionOperation(jobId, executionId);
Consumer