// org.apache.flink.runtime.checkpoint.CheckpointCoordinator Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of flink-runtime_2.10 Show documentation
// There is a newer version: 1.3.3
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.checkpoint;

import akka.actor.ActorSystem;
import akka.actor.PoisonPill;
import akka.actor.Props;
import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.checkpoint.stats.CheckpointStatsTracker;
import org.apache.flink.runtime.checkpoint.stats.DisabledCheckpointStatsTracker;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.Execution;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.instance.AkkaActorGateway;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobmanager.RecoveryMode;
import org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint;
import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint;
import org.apache.flink.runtime.messages.checkpoint.NotifyCheckpointComplete;
import org.apache.flink.runtime.messages.checkpoint.TriggerCheckpoint;
import org.apache.flink.runtime.state.StateHandle;
import org.apache.flink.util.SerializedValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.UUID;

import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * The checkpoint coordinator coordinates the distributed snapshots of operators and state.
 * It triggers the checkpoint by sending the messages to the relevant tasks and collects the
 * checkpoint acknowledgements. It also collects and maintains the overview of the state handles
 * reported by the tasks that acknowledge the checkpoint.
 *
 * <p>Depending on the configured {@link RecoveryMode}, the behaviour of the {@link
 * CompletedCheckpointStore} and the {@link CheckpointIDCounter} changes. The default standalone
 * implementations don't support any recovery.
 */
public class CheckpointCoordinator {

	static final Logger LOG = LoggerFactory.getLogger(CheckpointCoordinator.class);

	/** The number of recent checkpoints whose IDs are remembered */
	private static final int NUM_GHOST_CHECKPOINT_IDS = 16;

	/** Coordinator-wide lock to safeguard the checkpoint updates */
	protected final Object lock = new Object();

	/** The job whose checkpoint this coordinator coordinates */
	private final JobID job;

	/** Tasks who need to be sent a message when a checkpoint is started */
	private final ExecutionVertex[] tasksToTrigger;

	/** Tasks who need to acknowledge a checkpoint before it succeeds */
	private final ExecutionVertex[] tasksToWaitFor;

	/** Tasks who need to be sent a message when a checkpoint is confirmed */
	private final ExecutionVertex[] tasksToCommitTo;

	/** Map from checkpoint ID to the pending checkpoint */
	private final Map<Long, PendingCheckpoint> pendingCheckpoints;

	/** Completed checkpoints. Implementations can be blocking. Make sure calls to methods
	 * accessing this don't block the job manager actor and run asynchronously. */
	private final CompletedCheckpointStore completedCheckpointStore;

	/** A list of recent checkpoint IDs, to identify late messages (vs invalid ones) */
	private final ArrayDeque<Long> recentPendingCheckpoints;

	/** Checkpoint ID counter to ensure ascending IDs. In case of job manager failures, these
	 * need to be ascending across job managers. */
	protected final CheckpointIDCounter checkpointIdCounter;

	/** Class loader used to deserialize the state handles (as they may be user-defined) */
	private final ClassLoader userClassLoader;

	/** The base checkpoint interval. Actual trigger time may be affected by the
	 * max concurrent checkpoints and minimum-pause values */
	private final long baseInterval;

	/** The max time (in ms) that a checkpoint may take */
	private final long checkpointTimeout;

	/** The min time (in ms) to delay after a checkpoint could be triggered. Allows to
	 * enforce minimum processing time between checkpoint attempts */
	private final long minPauseBetweenCheckpoints;

	/** The maximum number of checkpoints that may be in progress at the same time */
	private final int maxConcurrentCheckpointAttempts;

	/** The timer that handles the checkpoint timeouts and triggers periodic checkpoints */
	private final Timer timer;

	/** Actor that receives status updates from the execution graph this coordinator works for */
	private ActorGateway jobStatusListener;

	/** The number of consecutive failed trigger attempts */
	private int numUnsuccessfulCheckpointsTriggers;

	/** The currently scheduled periodic trigger task, or null if none is scheduled */
	private ScheduledTrigger currentPeriodicTrigger;

	/** Timestamp of the last trigger attempt that passed the task-state pre-checks;
	 * compared against the minimum pause when the next checkpoint is triggered */
	private long lastTriggeredCheckpoint;

	/** Flag whether a triggered checkpoint should immediately schedule the next checkpoint.
	 * Non-volatile, because only accessed in synchronized scope */
	private boolean periodicScheduling;

	/** Flag whether a trigger request could not be handled immediately. Non-volatile, because only
	 * accessed in synchronized scope */
	private boolean triggerRequestQueued;

	/** Flag marking the coordinator as shut down (not accepting any messages any more) */
	private volatile boolean shutdown;

	/** Shutdown hook thread to clean up state handles. */
	private final Thread shutdownHook;

	/** Helper for tracking checkpoint statistics  */
	private final CheckpointStatsTracker statsTracker;

	/** Number of key groups handed to {@code createKeyGroupPartitions} when state is restored */
	protected final int numberKeyGroups;

	// --------------------------------------------------------------------------------------------

	public CheckpointCoordinator(
			JobID job,
			long baseInterval,
			long checkpointTimeout,
			int numberKeyGroups,
			ExecutionVertex[] tasksToTrigger,
			ExecutionVertex[] tasksToWaitFor,
			ExecutionVertex[] tasksToCommitTo,
			ClassLoader userClassLoader,
			CheckpointIDCounter checkpointIDCounter,
			CompletedCheckpointStore completedCheckpointStore,
			RecoveryMode recoveryMode) throws Exception {

		// Convenience constructor: forwards to the full constructor with a minimum pause of
		// zero, Integer.MAX_VALUE concurrent checkpoint attempts and a
		// DisabledCheckpointStatsTracker.
		this(
				job,
				baseInterval,
				checkpointTimeout,
				0L,                 // minPauseBetweenCheckpoints
				Integer.MAX_VALUE,  // maxConcurrentCheckpointAttempts
				numberKeyGroups,
				tasksToTrigger,
				tasksToWaitFor,
				tasksToCommitTo,
				userClassLoader,
				checkpointIDCounter,
				completedCheckpointStore,
				recoveryMode,
				new DisabledCheckpointStatsTracker());
	}

	public CheckpointCoordinator(
			JobID job,
			long baseInterval,
			long checkpointTimeout,
			long minPauseBetweenCheckpoints,
			int maxConcurrentCheckpointAttempts,
			int numberKeyGroups,
			ExecutionVertex[] tasksToTrigger,
			ExecutionVertex[] tasksToWaitFor,
			ExecutionVertex[] tasksToCommitTo,
			ClassLoader userClassLoader,
			CheckpointIDCounter checkpointIDCounter,
			CompletedCheckpointStore completedCheckpointStore,
			RecoveryMode recoveryMode,
			CheckpointStatsTracker statsTracker) throws Exception {

		// Sanity checks. Each message names the argument it actually validates.
		checkArgument(baseInterval > 0, "Checkpoint base interval must be larger than zero");
		checkArgument(checkpointTimeout >= 1, "Checkpoint timeout must be larger than zero");
		checkArgument(minPauseBetweenCheckpoints >= 0, "minPauseBetweenCheckpoints must be >= 0");
		checkArgument(maxConcurrentCheckpointAttempts >= 1, "maxConcurrentCheckpointAttempts must be >= 1");

		this.job = checkNotNull(job);
		this.baseInterval = baseInterval;
		this.checkpointTimeout = checkpointTimeout;
		this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints;
		this.maxConcurrentCheckpointAttempts = maxConcurrentCheckpointAttempts;
		this.tasksToTrigger = checkNotNull(tasksToTrigger);
		this.tasksToWaitFor = checkNotNull(tasksToWaitFor);
		this.tasksToCommitTo = checkNotNull(tasksToCommitTo);
		// insertion-ordered map, so pending checkpoints are iterated in trigger order
		this.pendingCheckpoints = new LinkedHashMap<>();
		this.completedCheckpointStore = checkNotNull(completedCheckpointStore);
		this.recentPendingCheckpoints = new ArrayDeque<>(NUM_GHOST_CHECKPOINT_IDS);
		// the user class loader is not null-checked here
		this.userClassLoader = userClassLoader;

		// Started with the periodic scheduler
		this.checkpointIdCounter = checkNotNull(checkpointIDCounter);

		// daemon timer thread, so it does not keep the JVM alive
		this.timer = new Timer("Checkpoint Timer", true);

		this.statsTracker = checkNotNull(statsTracker);

		if (recoveryMode == RecoveryMode.STANDALONE) {
			// Add shutdown hook to clean up state handles when no checkpoint recovery is
			// possible. In case of another configured recovery mode, the checkpoints need to be
			// available for the standby job managers.
			this.shutdownHook = new Thread(new Runnable() {
				@Override
				public void run() {
					try {
						CheckpointCoordinator.this.shutdown();
					}
					catch (Throwable t) {
						LOG.error("Error during shutdown of checkpoint coordinator via " +
								"JVM shutdown hook: " + t.getMessage(), t);
					}
				}
			});

			try {
				// Add JVM shutdown hook to call shutdown of service
				Runtime.getRuntime().addShutdownHook(shutdownHook);
			}
			catch (IllegalStateException ignored) {
				// JVM is already shutting down. No need to do anything.
			}
			catch (Throwable t) {
				LOG.error("Cannot register checkpoint coordinator shutdown hook.", t);
			}
		}
		else {
			this.shutdownHook = null;
		}

		this.numberKeyGroups = numberKeyGroups;
	}

	// --------------------------------------------------------------------------------------------
	// Callbacks
	// --------------------------------------------------------------------------------------------

	/**
	 * Callback on shutdown of the coordinator. Called in lock scope.
	 */
	protected void onShutdown() {
		// no-op by default; subclasses may override to react to the shutdown
	}

	/**
	 * Callback on cancellation of a checkpoint. Called in lock scope.
	 */
	protected void onCancelCheckpoint(long canceledCheckpointId) {
		// no-op by default; subclasses may override to react to the cancellation
	}

	/**
	 * Callback on full acknowledgement of a checkpoint. Called in lock scope.
	 */
	protected void onFullyAcknowledgedCheckpoint(CompletedCheckpoint checkpoint) {
		// no-op by default; subclasses may override to react to the completed checkpoint
	}

	// --------------------------------------------------------------------------------------------
	//  Clean shutdown
	// --------------------------------------------------------------------------------------------

	/**
	 * Shuts down the checkpoint coordinator.
	 *
	 * 
	 * <p>After this method has been called, the coordinator does not accept
	 * any further messages and cannot trigger any further checkpoints. All
	 * checkpoint state is discarded.
	 */
	public void shutdown() throws Exception {
		// full shutdown: the checkpoint store and the ID counter are shut down as well
		final boolean shutdownStoreAndCounter = true;
		shutdown(shutdownStoreAndCounter);
	}

	/**
	 * Suspends the checkpoint coordinator.
	 *
	 * 
	 * <p>After this method has been called, the coordinator does not accept
	 * any further messages and cannot trigger any further checkpoints.
	 *
	 * 
	 * <p>The difference to shutdown is that checkpoint state in the store
	 * and counter is kept around if possible to recover later.
	 */
	public void suspend() throws Exception {
		// suspension: the checkpoint store and the ID counter are suspended, not shut down
		final boolean shutdownStoreAndCounter = false;
		shutdown(shutdownStoreAndCounter);
	}

	/**
	 * Shuts down the checkpoint coordinator.
	 *
	 * @param shutdownStoreAndCounter Depending on this flag the checkpoint
	 * state services are shut down or suspended.
	 */
	private void shutdown(boolean shutdownStoreAndCounter) throws Exception {
		synchronized (lock) {
			try {
				// repeated calls only run the hook removal in the finally block
				if (shutdown) {
					return;
				}

				shutdown = true;
				LOG.info("Stopping checkpoint coordinator for job " + job);

				// no more scheduling and no queued requests from here on
				triggerRequestQueued = false;
				periodicScheduling = false;

				// stops the thread that handles the timeouts and pending triggers
				timer.cancel();

				// terminate the status listener actor so that it does not linger
				final ActorGateway listener = jobStatusListener;
				if (listener != null) {
					listener.tell(PoisonPill.getInstance());
					jobStatusListener = null;
				}

				// discard everything that is still in flight
				for (PendingCheckpoint inFlight : pendingCheckpoints.values()) {
					inFlight.discard(userClassLoader);
				}
				pendingCheckpoints.clear();

				if (!shutdownStoreAndCounter) {
					completedCheckpointStore.suspend();
					checkpointIdCounter.suspend();
				}
				else {
					completedCheckpointStore.shutdown();
					checkpointIdCounter.shutdown();
				}

				onShutdown();
			}
			finally {
				// Unregister the JVM shutdown hook to prevent resource leaks. This is skipped
				// when the hook thread itself is the caller.
				final boolean calledFromHook = shutdownHook == Thread.currentThread();
				if (shutdownHook != null && !calledFromHook) {
					try {
						Runtime.getRuntime().removeShutdownHook(shutdownHook);
					}
					catch (IllegalStateException ignored) {
						// race, JVM is in shutdown already, we can safely ignore this
					}
					catch (Throwable t) {
						LOG.warn("Error unregistering checkpoint coordinator shutdown hook.", t);
					}
				}
			}
		}
	}

	public boolean isShutdown() {
		// volatile read, no lock needed
		final boolean isShutDown = this.shutdown;
		return isShutDown;
	}

	// --------------------------------------------------------------------------------------------
	//  Handling checkpoints and messages
	// --------------------------------------------------------------------------------------------

	/**
	 * Triggers a new checkpoint and uses the given timestamp as the checkpoint
	 * timestamp.
	 *
	 * @param timestamp The timestamp for the checkpoint.
	 */
	public boolean triggerCheckpoint(long timestamp) throws Exception {
		// -1 makes the two-argument variant query the checkpoint ID counter for the ID
		final long queryCounterForId = -1;
		return triggerCheckpoint(timestamp, queryCounterForId);
	}

	/**
	 * Triggers a new checkpoint and uses the given timestamp as the checkpoint
	 * timestamp.
	 *
	 * @param timestamp The timestamp for the checkpoint.
	 * @param nextCheckpointId The checkpoint ID to use for this checkpoint or -1 if
	 *                         the checkpoint ID counter should be queried.
	 */
	public boolean triggerCheckpoint(long timestamp, long nextCheckpointId) throws Exception {
		// make some eager pre-checks
		synchronized (lock) {
			// abort if the coordinator has been shutdown in the meantime
			if (shutdown) {
				return false;
			}

			// sanity check: there should never be more than one trigger request queued
			if (triggerRequestQueued) {
				LOG.warn("Trying to trigger another checkpoint while one was queued already");
				return false;
			}

			// if too many checkpoints are currently in progress, we need to mark that a request is queued
			if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
				triggerRequestQueued = true;
				if (currentPeriodicTrigger != null) {
					currentPeriodicTrigger.cancel();
					currentPeriodicTrigger = null;
				}
				return false;
			}

			// make sure the minimum interval between checkpoints has passed
			if (lastTriggeredCheckpoint + minPauseBetweenCheckpoints > timestamp) {
				if (currentPeriodicTrigger != null) {
					currentPeriodicTrigger.cancel();
					currentPeriodicTrigger = null;
				}
				ScheduledTrigger trigger = new ScheduledTrigger();
				timer.scheduleAtFixedRate(trigger, minPauseBetweenCheckpoints, baseInterval);
				// Remember the re-scheduled trigger. Without this reference, later calls
				// (e.g. stopCheckpointScheduler()) could not cancel it and it would keep
				// firing until the timer itself is cancelled.
				currentPeriodicTrigger = trigger;
				return false;
			}
		}

		// first check if all tasks that we need to trigger are running.
		// if not, abort the checkpoint
		ExecutionAttemptID[] triggerIDs = new ExecutionAttemptID[tasksToTrigger.length];
		for (int i = 0; i < tasksToTrigger.length; i++) {
			Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt();
			if (ee != null && ee.getState() == ExecutionState.RUNNING) {
				triggerIDs[i] = ee.getAttemptId();
			} else {
				LOG.info("Checkpoint triggering task {} is not being executed at the moment. Aborting checkpoint.",
						tasksToTrigger[i].getSimpleName());
				return false;
			}
		}

		// next, check if all tasks that need to acknowledge the checkpoint have a current
		// execution attempt (their state is not checked here). if not, abort the checkpoint
		Map<ExecutionAttemptID, ExecutionVertex> ackTasks = new HashMap<>(tasksToWaitFor.length);

		for (ExecutionVertex ev : tasksToWaitFor) {
			Execution ee = ev.getCurrentExecutionAttempt();
			if (ee != null) {
				ackTasks.put(ee.getAttemptId(), ev);
			} else {
				LOG.info("Checkpoint acknowledging task {} is not being executed at the moment. Aborting checkpoint.",
						ev.getSimpleName());
				return false;
			}
		}

		// we will actually trigger this checkpoint!

		// NOTE(review): this field is read under the lock above but written here without it
		lastTriggeredCheckpoint = timestamp;
		final long checkpointID;
		if (nextCheckpointId < 0) {
			try {
				// this must happen outside the locked scope, because it communicates
				// with external services (in HA mode) and may block for a while.
				checkpointID = checkpointIdCounter.getAndIncrement();
			}
			catch (Throwable t) {
				int numUnsuccessful = ++numUnsuccessfulCheckpointsTriggers;
				LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
				return false;
			}
		}
		else {
			checkpointID = nextCheckpointId;
		}

		LOG.info("Triggering checkpoint " + checkpointID + " @ " + timestamp);

		final PendingCheckpoint checkpoint = new PendingCheckpoint(job, checkpointID, timestamp, ackTasks);

		// schedule the timer that will clean up the expired checkpoints
		TimerTask canceller = new TimerTask() {
			@Override
			public void run() {
				try {
					synchronized (lock) {
						// only do the work if the checkpoint is not discarded anyways
						// note that checkpoint completion discards the pending checkpoint object
						if (!checkpoint.isDiscarded()) {
							LOG.info("Checkpoint " + checkpointID + " expired before completing.");

							checkpoint.discard(userClassLoader);
							pendingCheckpoints.remove(checkpointID);
							rememberRecentCheckpointId(checkpointID);

							onCancelCheckpoint(checkpointID);

							triggerQueuedRequests();
						}
					}
				}
				catch (Throwable t) {
					LOG.error("Exception while handling checkpoint timeout", t);
				}
			}
		};

		try {
			// re-acquire the lock
			synchronized (lock) {
				// since we released the lock in the meantime, we need to re-check
				// that the conditions still hold. this is clumsy, but it allows us to
				// release the lock in the meantime while calls to external services are
				// blocking progress, and still gives us early checks that skip work
				// if no checkpoint can happen anyways
				if (shutdown) {
					return false;
				}
				else if (triggerRequestQueued) {
					LOG.warn("Trying to trigger another checkpoint while one was queued already");
					return false;
				}
				else if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
					triggerRequestQueued = true;
					if (currentPeriodicTrigger != null) {
						currentPeriodicTrigger.cancel();
						currentPeriodicTrigger = null;
					}
					return false;
				}

				pendingCheckpoints.put(checkpointID, checkpoint);
				timer.schedule(canceller, checkpointTimeout);
			}
			// end of lock scope

			// send the messages to the tasks that trigger their checkpoint
			for (int i = 0; i < tasksToTrigger.length; i++) {
				ExecutionAttemptID id = triggerIDs[i];
				TriggerCheckpoint message = new TriggerCheckpoint(job, id, checkpointID, timestamp);
				tasksToTrigger[i].sendMessageToCurrentExecution(message, id);
			}

			numUnsuccessfulCheckpointsTriggers = 0;
			return true;
		}
		catch (Throwable t) {
			// guard the map against concurrent modifications
			synchronized (lock) {
				pendingCheckpoints.remove(checkpointID);
			}

			int numUnsuccessful = ++numUnsuccessfulCheckpointsTriggers;
			LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
			if (!checkpoint.isDiscarded()) {
				checkpoint.discard(userClassLoader);
			}
			return false;
		}
	}

	/**
	 * Receives a {@link DeclineCheckpoint} message and returns whether the
	 * message was associated with a pending checkpoint.
	 *
	 * @param message Checkpoint decline from the task manager
	 *
	 * @return Flag indicating whether the declined checkpoint was associated
	 * with a pending checkpoint.
	 */
	public boolean receiveDeclineMessage(DeclineCheckpoint message) throws Exception {
		// cheap checks before the lock is taken
		if (shutdown || message == null) {
			return false;
		}
		if (!job.equals(message.getJob())) {
			LOG.error("Received DeclineCheckpoint message for wrong job: {}", message);
			return false;
		}

		final long checkpointId = message.getCheckpointId();

		PendingCheckpoint checkpoint;

		// Flag indicating whether the decline message was for a known pending
		// checkpoint.
		boolean isPendingCheckpoint;

		synchronized (lock) {
			// we need to check inside the lock for being shutdown as well, otherwise we
			// get races and invalid error log messages
			if (shutdown) {
				return false;
			}

			checkpoint = pendingCheckpoints.get(checkpointId);

			if (checkpoint != null && !checkpoint.isDiscarded()) {
				isPendingCheckpoint = true;

				LOG.info("Discarding checkpoint " + checkpointId
					+ " because of checkpoint decline from task " + message.getTaskExecutionId());

				pendingCheckpoints.remove(checkpointId);
				checkpoint.discard(userClassLoader);
				rememberRecentCheckpointId(checkpointId);

				onCancelCheckpoint(checkpointId);

				// look for a still-pending checkpoint that is at least as recent as the
				// declined one; if there is one, no replacement needs to be triggered
				boolean haveMoreRecentPending = false;
				for (PendingCheckpoint p : pendingCheckpoints.values()) {
					if (!p.isDiscarded() && p.getCheckpointTimestamp() >= checkpoint.getCheckpointTimestamp()) {
						haveMoreRecentPending = true;
						break;
					}
				}
				if (!haveMoreRecentPending && !triggerRequestQueued) {
					LOG.info("Triggering new checkpoint because of discarded checkpoint " + checkpointId);
					// NOTE(review): this runs while holding the lock (the monitor is reentrant)
					triggerCheckpoint(System.currentTimeMillis());
				} else if (!haveMoreRecentPending) {
					LOG.info("Promoting queued checkpoint request because of discarded checkpoint " + checkpointId);
					triggerQueuedRequests();
				}
			} else if (checkpoint != null) {
				// this should not happen
				throw new IllegalStateException(
					"Received message for discarded but non-removed checkpoint " + checkpointId);
			} else {
				// message is for an unknown checkpoint, or comes too late (checkpoint disposed)
				if (recentPendingCheckpoints.contains(checkpointId)) {
					isPendingCheckpoint = true;
					LOG.info("Received another decline checkpoint message for now expired checkpoint attempt " + checkpointId);
				} else {
					isPendingCheckpoint = false;
				}
			}
		}

		return isPendingCheckpoint;
	}

	/**
	 * Receives an AcknowledgeCheckpoint message and returns whether the
	 * message was associated with a pending checkpoint.
	 *
	 * @param message Checkpoint ack from the task manager
	 *
	 * @return Flag indicating whether the ack'd checkpoint was associated
	 * with a pending checkpoint.
	 *
	 * @throws Exception If the checkpoint cannot be added to the completed checkpoint store.
	 */
	public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws Exception {
		// cheap checks before the lock is taken
		if (shutdown || message == null) {
			return false;
		}
		if (!job.equals(message.getJob())) {
			LOG.error("Received AcknowledgeCheckpoint message for wrong job: {}", message);
			return false;
		}

		final long checkpointId = message.getCheckpointId();

		// stays null unless this message is the one that completes the checkpoint
		CompletedCheckpoint completed = null;

		// whether the ack belongs to a pending or a recently disposed checkpoint
		final boolean isPendingCheckpoint;

		synchronized (lock) {
			// the shutdown flag has to be re-checked under the lock, otherwise we
			// get races and invalid error log messages
			if (shutdown) {
				return false;
			}

			final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId);

			if (checkpoint == null) {
				// message is for an unknown checkpoint, or comes too late (checkpoint disposed)
				isPendingCheckpoint = recentPendingCheckpoints.contains(checkpointId);
				if (isPendingCheckpoint) {
					LOG.warn("Received late message for now expired checkpoint attempt " + checkpointId);
				}
			}
			else if (checkpoint.isDiscarded()) {
				// this should not happen
				throw new IllegalStateException(
						"Received message for discarded but non-removed checkpoint " + checkpointId);
			}
			else {
				isPendingCheckpoint = true;

				final boolean accepted = checkpoint.acknowledgeTask(
					message.getTaskExecutionId(),
					message.getState(),
					message.getStateSize(),
					null); // TODO: Give KV-state to the acknowledgeTask method

				if (!accepted) {
					// checkpoint did not accept message
					LOG.error("Received duplicate or invalid acknowledge message for checkpoint " + checkpointId
							+ " , task " + message.getTaskExecutionId());
				}
				else if (checkpoint.isFullyAcknowledged()) {
					// the last outstanding ack arrived: finalize and store the checkpoint
					completed = checkpoint.finalizeCheckpoint();

					completedCheckpointStore.addCheckpoint(completed);

					LOG.info("Completed checkpoint " + checkpointId + " (in " +
							completed.getDuration() + " ms)");

					if (LOG.isDebugEnabled()) {
						StringBuilder summary = new StringBuilder();
						for (Map.Entry<?, ?> taskStateEntry : completed.getTaskStates().entrySet()) {
							summary.append("JobVertexID: ")
								.append(taskStateEntry.getKey())
								.append(" {")
								.append(taskStateEntry.getValue())
								.append("}");
						}

						LOG.debug(summary.toString());
					}

					pendingCheckpoints.remove(checkpointId);
					rememberRecentCheckpointId(checkpointId);

					// older pending checkpoints are subsumed by the completed one
					dropSubsumedCheckpoints(completed.getTimestamp());

					onFullyAcknowledgedCheckpoint(completed);

					triggerQueuedRequests();
				}
			}
		}

		// the confirmation messages are sent here, outside the lock scope
		if (completed != null) {
			final long timestamp = completed.getTimestamp();

			for (ExecutionVertex vertex : tasksToCommitTo) {
				Execution attempt = vertex.getCurrentExecutionAttempt();
				if (attempt == null) {
					continue;
				}
				ExecutionAttemptID attemptId = attempt.getAttemptId();
				NotifyCheckpointComplete notifyMessage = new NotifyCheckpointComplete(job, attemptId, checkpointId, timestamp);
				vertex.sendMessageToCurrentExecution(notifyMessage, attempt.getAttemptId());
			}

			statsTracker.onCompletedCheckpoint(completed);
		}

		return isPendingCheckpoint;
	}

	private void rememberRecentCheckpointId(long id) {
		// bounded history: evict the oldest remembered ID once the capacity is reached
		final boolean atCapacity = recentPendingCheckpoints.size() >= NUM_GHOST_CHECKPOINT_IDS;
		if (atCapacity) {
			recentPendingCheckpoints.removeFirst();
		}

		recentPendingCheckpoints.addLast(id);
	}

	private void dropSubsumedCheckpoints(long timestamp) throws Exception {
		// Discards and removes every pending checkpoint whose timestamp is strictly older
		// than the given timestamp. Removal goes through the iterator so that the map can
		// be modified while it is being traversed.
		Iterator<Map.Entry<Long, PendingCheckpoint>> entries = pendingCheckpoints.entrySet().iterator();
		while (entries.hasNext()) {
			PendingCheckpoint p = entries.next().getValue();
			if (p.getCheckpointTimestamp() < timestamp) {
				// remember the ID so that late messages for it are recognized
				rememberRecentCheckpointId(p.getCheckpointId());

				p.discard(userClassLoader);

				onCancelCheckpoint(p.getCheckpointId());

				entries.remove();
			}
		}
	}

	/**
	 * Triggers the queued request, if there is one.
	 *
	 * NOTE: The caller of this method must hold the lock when invoking the method!
	 */
	private void triggerQueuedRequests() throws Exception {
		if (!triggerRequestQueued) {
			// nothing was queued
			return;
		}
		triggerRequestQueued = false;

		// the checkpoint is triggered from the timer thread, so that the calling thread
		// finishes its current work before the next checkpoint starts
		final ScheduledTrigger queuedTrigger = new ScheduledTrigger();

		if (!periodicScheduling) {
			// one-shot trigger, fired immediately
			timer.schedule(queuedTrigger, 0L);
			return;
		}

		// periodic mode: the new trigger replaces the current periodic one
		if (currentPeriodicTrigger != null) {
			currentPeriodicTrigger.cancel();
		}
		currentPeriodicTrigger = queuedTrigger;
		timer.scheduleAtFixedRate(queuedTrigger, 0L, baseInterval);
	}

	// --------------------------------------------------------------------------------------------
	//  Checkpoint State Restoring
	// --------------------------------------------------------------------------------------------

	/**
	 * Recovers the completed checkpoint store and sets the state of the latest completed
	 * checkpoint as initial state on the current execution attempts of the given tasks.
	 *
	 * @param tasks Map from job vertex ID to the execution job vertex to restore state for.
	 * @param errorIfNoCheckpoint If true, a missing completed checkpoint is an error;
	 *                            otherwise the method returns false in that case.
	 * @param allOrNothingState If true, a vertex for which only some (but not all) subtasks
	 *                          have state in the checkpoint is an error.
	 *
	 * @return True if state from a checkpoint was assigned, false if there was no completed
	 * checkpoint and <code>errorIfNoCheckpoint</code> is false.
	 *
	 * @throws IllegalStateException If the coordinator is shut down, if no checkpoint exists
	 * although one is required, if the checkpoint references a job vertex that is not in
	 * <code>tasks</code>, or if the all-or-nothing condition is violated.
	 * @throws RuntimeException If the parallelism of a vertex differs from the parallelism
	 * recorded in the checkpoint.
	 */
	public boolean restoreLatestCheckpointedState(
			Map<JobVertexID, ExecutionJobVertex> tasks,
			boolean errorIfNoCheckpoint,
			boolean allOrNothingState) throws Exception {

		synchronized (lock) {
			if (shutdown) {
				throw new IllegalStateException("CheckpointCoordinator is shut down");
			}

			// Recover the checkpoints
			completedCheckpointStore.recover();

			// restore from the latest checkpoint
			CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint();

			if (latest == null) {
				if (errorIfNoCheckpoint) {
					throw new IllegalStateException("No completed checkpoint available");
				} else {
					return false;
				}
			}

			for (Map.Entry<JobVertexID, TaskState> taskGroupStateEntry : latest.getTaskStates().entrySet()) {
				TaskState taskState = taskGroupStateEntry.getValue();
				ExecutionJobVertex executionJobVertex = tasks.get(taskGroupStateEntry.getKey());

				if (executionJobVertex == null) {
					throw new IllegalStateException("There is no execution job vertex for the job" +
						" vertex ID " + taskGroupStateEntry.getKey());
				}

				final int parallelism = executionJobVertex.getParallelism();

				// check that we only restore the state if the parallelism has not been changed
				if (taskState.getParallelism() != parallelism) {
					throw new RuntimeException("Cannot restore the latest checkpoint because " +
						"the parallelism changed. The operator " + executionJobVertex.getJobVertexId() +
						" has parallelism " + parallelism + " whereas the corresponding " +
						"state object has a parallelism of " + taskState.getParallelism());
				}

				// number of subtasks of this vertex for which the checkpoint holds state
				int counter = 0;

				List<Set<Integer>> keyGroupPartitions = createKeyGroupPartitions(numberKeyGroups, parallelism);

				for (int i = 0; i < parallelism; i++) {
					SubtaskState subtaskState = taskState.getState(i);
					SerializedValue<StateHandle<?>> state = null;

					if (subtaskState != null) {
						// count the number of executions for which we set a state
						counter++;
						state = subtaskState.getState();
					}

					// key-value state of the key groups assigned to subtask i
					Map<Integer, SerializedValue<StateHandle<?>>> kvStateForTaskMap =
						taskState.getUnwrappedKvStates(keyGroupPartitions.get(i));

					Execution currentExecutionAttempt = executionJobVertex.getTaskVertices()[i].getCurrentExecutionAttempt();
					currentExecutionAttempt.setInitialState(state, kvStateForTaskMap);
				}

				if (allOrNothingState && counter > 0 && counter < parallelism) {
					throw new IllegalStateException("The checkpoint contained state only for " +
						"a subset of tasks for vertex " + executionJobVertex);
				}
			}

			return true;
		}
	}

	/**
	 * Groups the available set of key groups into key group partitions. A key group partition is
	 * the set of key groups which is assigned to the same task. Each set of the returned list
	 * constitutes a key group partition.
	 *
	 * @param numberKeyGroups Number of available key groups (indexed from 0 to numberKeyGroups - 1)
	 * @param parallelism Parallelism to generate the key group partitioning for
	 * @return List of key group partitions
	 */
	protected List> createKeyGroupPartitions(int numberKeyGroups, int parallelism) {
		ArrayList> result = new ArrayList<>(parallelism);

		for (int p = 0; p < parallelism; p++) {
			HashSet keyGroupPartition = new HashSet<>();

			for (int k = p; k < numberKeyGroups; k += parallelism) {
				keyGroupPartition.add(k);
			}

			result.add(keyGroupPartition);
		}

		return result;
	}

	// --------------------------------------------------------------------------------------------
	//  Accessors
	// --------------------------------------------------------------------------------------------

	public int getNumberOfPendingCheckpoints() {
		// The pending checkpoints map is a plain (non-thread-safe) map that is mutated under
		// the lock, so it is read under the lock as well, like in the other accessors.
		synchronized (lock) {
			return this.pendingCheckpoints.size();
		}
	}

	public int getNumberOfRetainedSuccessfulCheckpoints() {
		final int retained;
		// the completed checkpoint store is queried under the coordinator lock
		synchronized (lock) {
			retained = completedCheckpointStore.getNumberOfRetainedCheckpoints();
		}
		return retained;
	}

	public Map<Long, PendingCheckpoint> getPendingCheckpoints() {
		synchronized (lock) {
			// snapshot copy (checkpoint ID -> pending checkpoint), so that callers never
			// touch the internal map
			return new HashMap<Long, PendingCheckpoint>(this.pendingCheckpoints);
		}
	}

	public List<CompletedCheckpoint> getSuccessfulCheckpoints() throws Exception {
		// returns whatever the completed checkpoint store reports, queried under the lock
		synchronized (lock) {
			return completedCheckpointStore.getAllCheckpoints();
		}
	}

	protected long getAndIncrementCheckpointId() {
		final long nextId;
		try {
			// must not be called inside the locked scope: the counter communicates
			// with external services (in HA mode) and may block for a while.
			nextId = checkpointIdCounter.getAndIncrement();
		}
		catch (Throwable t) {
			// count the failure and signal it to the caller with -1
			final int failedAttempts = ++numUnsuccessfulCheckpointsTriggers;
			LOG.warn("Failed to trigger checkpoint (" + failedAttempts + " consecutive failed attempts so far)", t);
			return -1;
		}
		return nextId;
	}

	protected ActorGateway getJobStatusListener() {
		// plain read of the field, no locking here; may be null
		return this.jobStatusListener;
	}

	protected void setJobStatusListener(ActorGateway jobStatusListener) {
		// plain write of the field, no locking and no null check here
		final ActorGateway newListener = jobStatusListener;
		this.jobStatusListener = newListener;
	}

	// --------------------------------------------------------------------------------------------
	//  Periodic scheduling of checkpoints
	// --------------------------------------------------------------------------------------------

	/**
	 * Starts the periodic triggering of checkpoints. Any previous scheduling state is reset
	 * via {@link #stopCheckpointScheduler()} first.
	 *
	 * @throws IllegalArgumentException If the coordinator is already shut down.
	 * @throws RuntimeException If the checkpoint ID counter cannot be started.
	 */
	public void startCheckpointScheduler() throws Exception {
		synchronized (lock) {
			if (shutdown) {
				throw new IllegalArgumentException("Checkpoint coordinator is shut down");
			}

			// cancels prior timers and discards the pending checkpoints
			stopCheckpointScheduler();

			try {
				// Multiple start calls are OK
				checkpointIdCounter.start();
			}
			catch (Exception e) {
				throw new RuntimeException("Failed to start checkpoint ID counter: " + e.getMessage(), e);
			}

			// the first checkpoint fires after one base interval, then at a fixed rate
			final ScheduledTrigger periodicTrigger = new ScheduledTrigger();
			periodicScheduling = true;
			currentPeriodicTrigger = periodicTrigger;
			timer.scheduleAtFixedRate(periodicTrigger, baseInterval, baseInterval);
		}
	}

	/**
	 * Stops the periodic triggering of checkpoints: cancels the current periodic trigger,
	 * drops a queued trigger request, discards all pending checkpoints and resets the
	 * counter of failed trigger attempts.
	 */
	public void stopCheckpointScheduler() throws Exception {
		synchronized (lock) {
			periodicScheduling = false;
			triggerRequestQueued = false;

			final ScheduledTrigger activeTrigger = currentPeriodicTrigger;
			if (activeTrigger != null) {
				activeTrigger.cancel();
				currentPeriodicTrigger = null;
			}

			// checkpoints that are still in flight are given up
			for (PendingCheckpoint inFlight : pendingCheckpoints.values()) {
				inFlight.discard(userClassLoader);
			}
			pendingCheckpoints.clear();

			numUnsuccessfulCheckpointsTriggers = 0;
		}
	}

	// ------------------------------------------------------------------------
	//  job status listener that schedules / cancels periodic checkpoints
	// ------------------------------------------------------------------------

	/**
	 * Returns the job status listener of this coordinator, creating it on first use as a
	 * {@code CheckpointCoordinatorDeActivator} actor in the given actor system.
	 *
	 * @throws IllegalArgumentException If the coordinator is already shut down.
	 */
	public ActorGateway createActivatorDeactivator(ActorSystem actorSystem, UUID leaderSessionID) {
		synchronized (lock) {
			if (shutdown) {
				throw new IllegalArgumentException("Checkpoint coordinator is shut down");
			}

			// reuse the listener if it was created before
			if (jobStatusListener != null) {
				return jobStatusListener;
			}

			Props listenerProps = Props.create(CheckpointCoordinatorDeActivator.class, this, leaderSessionID);

			// the ActorRef is wrapped in an AkkaActorGateway to support message decoration
			jobStatusListener = new AkkaActorGateway(actorSystem.actorOf(listenerProps), leaderSessionID);
			return jobStatusListener;
		}
	}

	// ------------------------------------------------------------------------

	/**
	 * Timer task that triggers a checkpoint with the current wall-clock time as timestamp.
	 * Exceptions from the trigger call are logged and not propagated.
	 */
	private class ScheduledTrigger extends TimerTask {

		@Override
		public void run() {
			final long now = System.currentTimeMillis();
			try {
				triggerCheckpoint(now);
			}
			catch (Exception e) {
				LOG.error("Exception while triggering checkpoint", e);
			}
		}
	}
}