All downloads are free. Search and download functionality uses the official Maven repository.

eu.stratosphere.nephele.jobmanager.scheduler.RecoveryLogic Maven / Gradle / Ivy

There is a newer version: 0.5.2-hadoop2
Show newest version
/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/
package eu.stratosphere.nephele.jobmanager.scheduler;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.stratosphere.nephele.execution.ExecutionState;
import eu.stratosphere.nephele.executiongraph.ExecutionEdge;
import eu.stratosphere.nephele.executiongraph.ExecutionGate;
import eu.stratosphere.nephele.executiongraph.ExecutionGraph;
import eu.stratosphere.nephele.executiongraph.ExecutionVertex;
import eu.stratosphere.nephele.executiongraph.ExecutionVertexID;
import eu.stratosphere.nephele.instance.AbstractInstance;
import eu.stratosphere.nephele.instance.DummyInstance;
import eu.stratosphere.nephele.io.channels.ChannelID;
import eu.stratosphere.nephele.taskmanager.AbstractTaskResult.ReturnCode;
import eu.stratosphere.nephele.taskmanager.TaskCancelResult;
import eu.stratosphere.nephele.util.SerializableHashSet;
import eu.stratosphere.util.StringUtils;

public final class RecoveryLogic {

	/**
	 * The logger to report information and problems.
	 */
	private static final Log LOG = LogFactory.getLog(RecoveryLogic.class);

	/**
	 * Private constructor so class cannot be instantiated.
	 */
	private RecoveryLogic() {
	}

	public static boolean recover(final ExecutionVertex failedVertex,
			final Map verticesToBeRestarted,
			final Set assignedVertices) {

		// Perform initial sanity check
		if (failedVertex.getExecutionState() != ExecutionState.FAILED) {
			LOG.error("Vertex " + failedVertex + " is requested to be recovered, but is not failed");
			return false;
		}

		final ExecutionGraph eg = failedVertex.getExecutionGraph();
		synchronized (eg) {

			LOG.info("Starting recovery for failed vertex " + failedVertex);

			final Set verticesToBeCanceled = new HashSet();

			findVerticesToRestart(failedVertex, verticesToBeCanceled);

			// Restart all predecessors without checkpoint
			final Iterator cancelIterator = verticesToBeCanceled.iterator();
			while (cancelIterator.hasNext()) {

				final ExecutionVertex vertex = cancelIterator.next();

				if (vertex.compareAndUpdateExecutionState(ExecutionState.FINISHED, getStateToUpdate(vertex))) {
					LOG.info("Vertex " + vertex + " has already finished and will not be canceled");
					if (vertex.getExecutionState() == ExecutionState.ASSIGNED) {
						assignedVertices.add(vertex);
					}
					continue;
				}

				LOG.info(vertex + " is canceled by recovery logic");
				verticesToBeRestarted.put(vertex.getID(), vertex);
				final TaskCancelResult cancelResult = vertex.cancelTask();

				if (cancelResult.getReturnCode() != ReturnCode.SUCCESS
						&& cancelResult.getReturnCode() != ReturnCode.TASK_NOT_FOUND) {

					verticesToBeRestarted.remove(vertex.getID());
					LOG.error("Unable to cancel vertex" + cancelResult.getDescription());
					return false;
				}
			}

			LOG.info("Starting cache invalidation");

			// Invalidate the lookup caches
			if (!invalidateReceiverLookupCaches(failedVertex, verticesToBeCanceled)) {
				return false;
			}

			LOG.info("Cache invalidation complete");

			// Restart failed vertex
			failedVertex.updateExecutionState(getStateToUpdate(failedVertex));
			if (failedVertex.getExecutionState() == ExecutionState.ASSIGNED) {
				assignedVertices.add(failedVertex);
			}
		}

		return true;
	}

	static boolean hasInstanceAssigned(final ExecutionVertex vertex) {

		return !(vertex.getAllocatedResource().getInstance() instanceof DummyInstance);
	}

	private static ExecutionState getStateToUpdate(final ExecutionVertex vertex) {

		if (hasInstanceAssigned(vertex)) {
			return ExecutionState.ASSIGNED;
		}

		return ExecutionState.CREATED;
	}

	private static void findVerticesToRestart(final ExecutionVertex failedVertex,
			final Set verticesToBeCanceled) {

		final Queue verticesToTest = new ArrayDeque();
		final Set visited = new HashSet();
		verticesToTest.add(failedVertex);

		while (!verticesToTest.isEmpty()) {

			final ExecutionVertex vertex = verticesToTest.poll();

			// Predecessors must be either checkpoints or need to be restarted, too
			for (int j = 0; j < vertex.getNumberOfPredecessors(); j++) {
				final ExecutionVertex predecessor = vertex.getPredecessor(j);

				if (hasInstanceAssigned(predecessor)) {
					verticesToBeCanceled.add(predecessor);
				}

				if (!visited.contains(predecessor)) {
					verticesToTest.add(predecessor);
				}
			}
			visited.add(vertex);
		}
	}

	private static final boolean invalidateReceiverLookupCaches(final ExecutionVertex failedVertex,
			final Set verticesToBeCanceled) {

		final Map> entriesToInvalidate = new HashMap>();

		collectCacheEntriesToInvalidate(failedVertex, entriesToInvalidate);
		for (final Iterator it = verticesToBeCanceled.iterator(); it.hasNext();) {
			collectCacheEntriesToInvalidate(it.next(), entriesToInvalidate);
		}

		final Iterator>> it = entriesToInvalidate.entrySet().iterator();

		while (it.hasNext()) {

			final Map.Entry> entry = it.next();
			final AbstractInstance instance = entry.getKey();

			try {
				instance.invalidateLookupCacheEntries(entry.getValue());
			} catch (IOException ioe) {
				LOG.error(StringUtils.stringifyException(ioe));
				return false;
			}
		}

		return true;
	}

	private static void collectCacheEntriesToInvalidate(final ExecutionVertex vertex,
			final Map> entriesToInvalidate) {

		final int numberOfOutputGates = vertex.getNumberOfOutputGates();
		for (int i = 0; i < numberOfOutputGates; ++i) {

			final ExecutionGate outputGate = vertex.getOutputGate(i);
			for (int j = 0; j < outputGate.getNumberOfEdges(); ++j) {

				final ExecutionEdge outputChannel = outputGate.getEdge(j);

				final ExecutionVertex connectedVertex = outputChannel.getInputGate().getVertex();
				if (connectedVertex == null) {
					LOG.error("Connected vertex is null");
					continue;
				}

				final AbstractInstance instance = connectedVertex.getAllocatedResource().getInstance();
				if (instance instanceof DummyInstance) {
					continue;
				}

				Set channelIDs = entriesToInvalidate.get(instance);
				if (channelIDs == null) {
					channelIDs = new SerializableHashSet();
					entriesToInvalidate.put(instance, channelIDs);
				}

				channelIDs.add(outputChannel.getInputChannelID());
			}
		}

		for (int i = 0; i < vertex.getNumberOfInputGates(); ++i) {

			final ExecutionGate inputGate = vertex.getInputGate(i);
			for (int j = 0; j < inputGate.getNumberOfEdges(); ++j) {

				final ExecutionEdge inputChannel = inputGate.getEdge(j);

				final ExecutionVertex connectedVertex = inputChannel.getOutputGate().getVertex();
				if (connectedVertex == null) {
					LOG.error("Connected vertex is null");
					continue;
				}

				final AbstractInstance instance = connectedVertex.getAllocatedResource().getInstance();
				if (instance instanceof DummyInstance) {
					continue;
				}

				Set channelIDs = entriesToInvalidate.get(instance);
				if (channelIDs == null) {
					channelIDs = new SerializableHashSet();
					entriesToInvalidate.put(instance, channelIDs);
				}

				channelIDs.add(inputChannel.getOutputChannelID());
			}
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy