All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerImpl Maven / Gradle / Ivy

There is a newer version: 1.13.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.resourcemanager.slotmanager;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.clusterframework.types.SlotID;
import org.apache.flink.runtime.concurrent.ScheduledExecutor;
import org.apache.flink.runtime.instance.InstanceID;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.resourcemanager.ResourceManagerId;
import org.apache.flink.runtime.resourcemanager.SlotRequest;
import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
import org.apache.flink.runtime.resourcemanager.exceptions.UnfulfillableSlotRequestException;
import org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection;
import org.apache.flink.runtime.taskexecutor.SlotReport;
import org.apache.flink.runtime.taskexecutor.SlotStatus;
import org.apache.flink.runtime.taskexecutor.TaskExecutorGateway;
import org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException;
import org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.OptionalConsumer;
import org.apache.flink.util.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

/**
 * Implementation of {@link SlotManager}.
 */
public class SlotManagerImpl implements SlotManager {
	private static final Logger LOG = LoggerFactory.getLogger(SlotManagerImpl.class);

	/** Scheduled executor for timeouts. */
	private final ScheduledExecutor scheduledExecutor;

	/** Timeout for slot requests to the task manager. */
	private final Time taskManagerRequestTimeout;

	/** Timeout after which an allocation is discarded. */
	private final Time slotRequestTimeout;

	/** Timeout after which an unused TaskManager is released. */
	private final Time taskManagerTimeout;

	/** Map for all registered slots. */
	private final HashMap slots;

	/** Index of all currently free slots. */
	private final LinkedHashMap freeSlots;

	/** All currently registered task managers. */
	private final HashMap taskManagerRegistrations;

	/** Map of fulfilled and active allocations for request deduplication purposes. */
	private final HashMap fulfilledSlotRequests;

	/** Map of pending/unfulfilled slot allocation requests. */
	private final HashMap pendingSlotRequests;

	private final HashMap pendingSlots;

	private final SlotMatchingStrategy slotMatchingStrategy;

	/** ResourceManager's id. */
	private ResourceManagerId resourceManagerId;

	/** Executor for future callbacks which have to be "synchronized". */
	private Executor mainThreadExecutor;

	/** Callbacks for resource (de-)allocations. */
	private ResourceActions resourceActions;

	private ScheduledFuture taskManagerTimeoutCheck;

	private ScheduledFuture slotRequestTimeoutCheck;

	/** True iff the component has been started. */
	private boolean started;

	/** Release task executor only when each produced result partition is either consumed or failed. */
	private final boolean waitResultConsumedBeforeRelease;

	/**
	 * If true, fail unfulfillable slot requests immediately. Otherwise, allow unfulfillable request to pend.
	 * A slot request is considered unfulfillable if it cannot be fulfilled by neither a slot that is already registered
	 * (including allocated ones) nor a pending slot that the {@link ResourceActions} can allocate.
	 * */
	private boolean failUnfulfillableRequest = true;

	public SlotManagerImpl(
			SlotMatchingStrategy slotMatchingStrategy,
			ScheduledExecutor scheduledExecutor,
			Time taskManagerRequestTimeout,
			Time slotRequestTimeout,
			Time taskManagerTimeout,
			boolean waitResultConsumedBeforeRelease) {

		this.slotMatchingStrategy = Preconditions.checkNotNull(slotMatchingStrategy);
		this.scheduledExecutor = Preconditions.checkNotNull(scheduledExecutor);
		this.taskManagerRequestTimeout = Preconditions.checkNotNull(taskManagerRequestTimeout);
		this.slotRequestTimeout = Preconditions.checkNotNull(slotRequestTimeout);
		this.taskManagerTimeout = Preconditions.checkNotNull(taskManagerTimeout);
		this.waitResultConsumedBeforeRelease = waitResultConsumedBeforeRelease;

		slots = new HashMap<>(16);
		freeSlots = new LinkedHashMap<>(16);
		taskManagerRegistrations = new HashMap<>(4);
		fulfilledSlotRequests = new HashMap<>(16);
		pendingSlotRequests = new HashMap<>(16);
		pendingSlots = new HashMap<>(16);

		resourceManagerId = null;
		resourceActions = null;
		mainThreadExecutor = null;
		taskManagerTimeoutCheck = null;
		slotRequestTimeoutCheck = null;

		started = false;
	}

	@Override
	public int getNumberRegisteredSlots() {
		return slots.size();
	}

	@Override
	public int getNumberRegisteredSlotsOf(InstanceID instanceId) {
		TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(instanceId);

		if (taskManagerRegistration != null) {
			return taskManagerRegistration.getNumberRegisteredSlots();
		} else {
			return 0;
		}
	}

	@Override
	public int getNumberFreeSlots() {
		return freeSlots.size();
	}

	@Override
	public int getNumberFreeSlotsOf(InstanceID instanceId) {
		TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(instanceId);

		if (taskManagerRegistration != null) {
			return taskManagerRegistration.getNumberFreeSlots();
		} else {
			return 0;
		}
	}

	@Override
	public int getNumberPendingTaskManagerSlots() {
		return pendingSlots.size();
	}

	@Override
	public int getNumberPendingSlotRequests() {
		return pendingSlotRequests.size();
	}

	@VisibleForTesting
	public int getNumberAssignedPendingTaskManagerSlots() {
		return (int) pendingSlots.values().stream().filter(slot -> slot.getAssignedPendingSlotRequest() != null).count();
	}

	// ---------------------------------------------------------------------------------------------
	// Component lifecycle methods
	// ---------------------------------------------------------------------------------------------

	/**
	 * Starts the slot manager with the given leader id and resource manager actions.
	 *
	 * @param newResourceManagerId to use for communication with the task managers
	 * @param newMainThreadExecutor to use to run code in the ResourceManager's main thread
	 * @param newResourceActions to use for resource (de-)allocations
	 */
	@Override
	public void start(ResourceManagerId newResourceManagerId, Executor newMainThreadExecutor, ResourceActions newResourceActions) {
		LOG.info("Starting the SlotManager.");

		this.resourceManagerId = Preconditions.checkNotNull(newResourceManagerId);
		mainThreadExecutor = Preconditions.checkNotNull(newMainThreadExecutor);
		resourceActions = Preconditions.checkNotNull(newResourceActions);

		started = true;

		taskManagerTimeoutCheck = scheduledExecutor.scheduleWithFixedDelay(
			() -> mainThreadExecutor.execute(
				() -> checkTaskManagerTimeouts()),
			0L,
			taskManagerTimeout.toMilliseconds(),
			TimeUnit.MILLISECONDS);

		slotRequestTimeoutCheck = scheduledExecutor.scheduleWithFixedDelay(
			() -> mainThreadExecutor.execute(
				() -> checkSlotRequestTimeouts()),
			0L,
			slotRequestTimeout.toMilliseconds(),
			TimeUnit.MILLISECONDS);
	}

	/**
	 * Suspends the component. This clears the internal state of the slot manager.
	 */
	@Override
	public void suspend() {
		LOG.info("Suspending the SlotManager.");

		// stop the timeout checks for the TaskManagers and the SlotRequests
		if (taskManagerTimeoutCheck != null) {
			taskManagerTimeoutCheck.cancel(false);
			taskManagerTimeoutCheck = null;
		}

		if (slotRequestTimeoutCheck != null) {
			slotRequestTimeoutCheck.cancel(false);
			slotRequestTimeoutCheck = null;
		}

		for (PendingSlotRequest pendingSlotRequest : pendingSlotRequests.values()) {
			cancelPendingSlotRequest(pendingSlotRequest);
		}

		pendingSlotRequests.clear();

		ArrayList registeredTaskManagers = new ArrayList<>(taskManagerRegistrations.keySet());

		for (InstanceID registeredTaskManager : registeredTaskManagers) {
			unregisterTaskManager(registeredTaskManager, new SlotManagerException("The slot manager is being suspended."));
		}

		resourceManagerId = null;
		resourceActions = null;
		started = false;
	}

	/**
	 * Closes the slot manager.
	 *
	 * @throws Exception if the close operation fails
	 */
	@Override
	public void close() throws Exception {
		LOG.info("Closing the SlotManager.");

		suspend();
	}

	// ---------------------------------------------------------------------------------------------
	// Public API
	// ---------------------------------------------------------------------------------------------

	/**
	 * Requests a slot with the respective resource profile.
	 *
	 * @param slotRequest specifying the requested slot specs
	 * @return true if the slot request was registered; false if the request is a duplicate
	 * @throws ResourceManagerException if the slot request failed (e.g. not enough resources left)
	 */
	@Override
	public boolean registerSlotRequest(SlotRequest slotRequest) throws ResourceManagerException {
		checkInit();

		if (checkDuplicateRequest(slotRequest.getAllocationId())) {
			LOG.debug("Ignoring a duplicate slot request with allocation id {}.", slotRequest.getAllocationId());

			return false;
		} else {
			PendingSlotRequest pendingSlotRequest = new PendingSlotRequest(slotRequest);

			pendingSlotRequests.put(slotRequest.getAllocationId(), pendingSlotRequest);

			try {
				internalRequestSlot(pendingSlotRequest);
			} catch (ResourceManagerException e) {
				// requesting the slot failed --> remove pending slot request
				pendingSlotRequests.remove(slotRequest.getAllocationId());

				throw new ResourceManagerException("Could not fulfill slot request " + slotRequest.getAllocationId() + '.', e);
			}

			return true;
		}
	}

	/**
	 * Cancels and removes a pending slot request with the given allocation id. If there is no such
	 * pending request, then nothing is done.
	 *
	 * @param allocationId identifying the pending slot request
	 * @return True if a pending slot request was found; otherwise false
	 */
	@Override
	public boolean unregisterSlotRequest(AllocationID allocationId) {
		checkInit();

		PendingSlotRequest pendingSlotRequest = pendingSlotRequests.remove(allocationId);

		if (null != pendingSlotRequest) {
			LOG.debug("Cancel slot request {}.", allocationId);

			cancelPendingSlotRequest(pendingSlotRequest);

			return true;
		} else {
			LOG.debug("No pending slot request with allocation id {} found. Ignoring unregistration request.", allocationId);

			return false;
		}
	}

	/**
	 * Registers a new task manager at the slot manager. This will make the task managers slots
	 * known and, thus, available for allocation.
	 *
	 * @param taskExecutorConnection for the new task manager
	 * @param initialSlotReport for the new task manager
	 */
	@Override
	public void registerTaskManager(final TaskExecutorConnection taskExecutorConnection, SlotReport initialSlotReport) {
		checkInit();

		LOG.debug("Registering TaskManager {} under {} at the SlotManager.", taskExecutorConnection.getResourceID(), taskExecutorConnection.getInstanceID());

		// we identify task managers by their instance id
		if (taskManagerRegistrations.containsKey(taskExecutorConnection.getInstanceID())) {
			reportSlotStatus(taskExecutorConnection.getInstanceID(), initialSlotReport);
		} else {
			// first register the TaskManager
			ArrayList reportedSlots = new ArrayList<>();

			for (SlotStatus slotStatus : initialSlotReport) {
				reportedSlots.add(slotStatus.getSlotID());
			}

			TaskManagerRegistration taskManagerRegistration = new TaskManagerRegistration(
				taskExecutorConnection,
				reportedSlots);

			taskManagerRegistrations.put(taskExecutorConnection.getInstanceID(), taskManagerRegistration);

			// next register the new slots
			for (SlotStatus slotStatus : initialSlotReport) {
				registerSlot(
					slotStatus.getSlotID(),
					slotStatus.getAllocationID(),
					slotStatus.getJobID(),
					slotStatus.getResourceProfile(),
					taskExecutorConnection);
			}
		}

	}

	@Override
	public boolean unregisterTaskManager(InstanceID instanceId, Exception cause) {
		checkInit();

		LOG.debug("Unregister TaskManager {} from the SlotManager.", instanceId);

		TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.remove(instanceId);

		if (null != taskManagerRegistration) {
			internalUnregisterTaskManager(taskManagerRegistration, cause);

			return true;
		} else {
			LOG.debug("There is no task manager registered with instance ID {}. Ignoring this message.", instanceId);

			return false;
		}
	}

	/**
	 * Reports the current slot allocations for a task manager identified by the given instance id.
	 *
	 * @param instanceId identifying the task manager for which to report the slot status
	 * @param slotReport containing the status for all of its slots
	 * @return true if the slot status has been updated successfully, otherwise false
	 */
	@Override
	public boolean reportSlotStatus(InstanceID instanceId, SlotReport slotReport) {
		checkInit();

		LOG.debug("Received slot report from instance {}: {}.", instanceId, slotReport);

		TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(instanceId);

		if (null != taskManagerRegistration) {

			for (SlotStatus slotStatus : slotReport) {
				updateSlot(slotStatus.getSlotID(), slotStatus.getAllocationID(), slotStatus.getJobID());
			}

			return true;
		} else {
			LOG.debug("Received slot report for unknown task manager with instance id {}. Ignoring this report.", instanceId);

			return false;
		}
	}

	/**
	 * Free the given slot from the given allocation. If the slot is still allocated by the given
	 * allocation id, then the slot will be marked as free and will be subject to new slot requests.
	 *
	 * @param slotId identifying the slot to free
	 * @param allocationId with which the slot is presumably allocated
	 */
	@Override
	public void freeSlot(SlotID slotId, AllocationID allocationId) {
		checkInit();

		TaskManagerSlot slot = slots.get(slotId);

		if (null != slot) {
			if (slot.getState() == TaskManagerSlot.State.ALLOCATED) {
				if (Objects.equals(allocationId, slot.getAllocationId())) {

					TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(slot.getInstanceId());

					if (taskManagerRegistration == null) {
						throw new IllegalStateException("Trying to free a slot from a TaskManager " +
							slot.getInstanceId() + " which has not been registered.");
					}

					updateSlotState(slot, taskManagerRegistration, null, null);
				} else {
					LOG.debug("Received request to free slot {} with expected allocation id {}, " +
						"but actual allocation id {} differs. Ignoring the request.", slotId, allocationId, slot.getAllocationId());
				}
			} else {
				LOG.debug("Slot {} has not been allocated.", allocationId);
			}
		} else {
			LOG.debug("Trying to free a slot {} which has not been registered. Ignoring this message.", slotId);
		}
	}

	@Override
	public void setFailUnfulfillableRequest(boolean failUnfulfillableRequest) {
		if (!this.failUnfulfillableRequest && failUnfulfillableRequest) {
			// fail unfulfillable pending requests
			Iterator> slotRequestIterator = pendingSlotRequests.entrySet().iterator();
			while (slotRequestIterator.hasNext()) {
				PendingSlotRequest pendingSlotRequest = slotRequestIterator.next().getValue();
				if (pendingSlotRequest.getAssignedPendingTaskManagerSlot() != null) {
					continue;
				}
				if (!isFulfillableByRegisteredSlots(pendingSlotRequest.getResourceProfile())) {
					slotRequestIterator.remove();
					resourceActions.notifyAllocationFailure(
						pendingSlotRequest.getJobId(),
						pendingSlotRequest.getAllocationId(),
						new UnfulfillableSlotRequestException(pendingSlotRequest.getAllocationId(), pendingSlotRequest.getResourceProfile())
					);
				}
			}
		}
		this.failUnfulfillableRequest = failUnfulfillableRequest;
	}

	// ---------------------------------------------------------------------------------------------
	// Behaviour methods
	// ---------------------------------------------------------------------------------------------

	/**
	 * Finds a matching slot request for a given resource profile. If there is no such request,
	 * the method returns null.
	 *
	 * 

Note: If you want to change the behaviour of the slot manager wrt slot allocation and * request fulfillment, then you should override this method. * * @param slotResourceProfile defining the resources of an available slot * @return A matching slot request which can be deployed in a slot with the given resource * profile. Null if there is no such slot request pending. */ private PendingSlotRequest findMatchingRequest(ResourceProfile slotResourceProfile) { for (PendingSlotRequest pendingSlotRequest : pendingSlotRequests.values()) { if (!pendingSlotRequest.isAssigned() && slotResourceProfile.isMatching(pendingSlotRequest.getResourceProfile())) { return pendingSlotRequest; } } return null; } /** * Finds a matching slot for a given resource profile. A matching slot has at least as many * resources available as the given resource profile. If there is no such slot available, then * the method returns null. * *

Note: If you want to change the behaviour of the slot manager wrt slot allocation and * request fulfillment, then you should override this method. * * @param requestResourceProfile specifying the resource requirements for the a slot request * @return A matching slot which fulfills the given resource profile. {@link Optional#empty()} * if there is no such slot available. */ private Optional findMatchingSlot(ResourceProfile requestResourceProfile) { final Optional optionalMatchingSlot = slotMatchingStrategy.findMatchingSlot( requestResourceProfile, freeSlots.values(), this::getNumberRegisteredSlotsOf); optionalMatchingSlot.ifPresent(taskManagerSlot -> { // sanity check Preconditions.checkState( taskManagerSlot.getState() == TaskManagerSlot.State.FREE, "TaskManagerSlot %s is not in state FREE but %s.", taskManagerSlot.getSlotId(), taskManagerSlot.getState()); freeSlots.remove(taskManagerSlot.getSlotId()); }); return optionalMatchingSlot; } // --------------------------------------------------------------------------------------------- // Internal slot operations // --------------------------------------------------------------------------------------------- /** * Registers a slot for the given task manager at the slot manager. The slot is identified by * the given slot id. The given resource profile defines the available resources for the slot. * The task manager connection can be used to communicate with the task manager. * * @param slotId identifying the slot on the task manager * @param allocationId which is currently deployed in the slot * @param resourceProfile of the slot * @param taskManagerConnection to communicate with the remote task manager */ private void registerSlot( SlotID slotId, AllocationID allocationId, JobID jobId, ResourceProfile resourceProfile, TaskExecutorConnection taskManagerConnection) { if (slots.containsKey(slotId)) { // remove the old slot first removeSlot( slotId, new SlotManagerException( String.format( "Re-registration of slot %s. This indicates that the TaskExecutor has re-connected.", slotId))); } final TaskManagerSlot slot = createAndRegisterTaskManagerSlot(slotId, resourceProfile, taskManagerConnection); final PendingTaskManagerSlot pendingTaskManagerSlot; if (allocationId == null) { pendingTaskManagerSlot = findExactlyMatchingPendingTaskManagerSlot(resourceProfile); } else { pendingTaskManagerSlot = null; } if (pendingTaskManagerSlot == null) { updateSlot(slotId, allocationId, jobId); } else { pendingSlots.remove(pendingTaskManagerSlot.getTaskManagerSlotId()); final PendingSlotRequest assignedPendingSlotRequest = pendingTaskManagerSlot.getAssignedPendingSlotRequest(); if (assignedPendingSlotRequest == null) { handleFreeSlot(slot); } else { assignedPendingSlotRequest.unassignPendingTaskManagerSlot(); allocateSlot(slot, assignedPendingSlotRequest); } } } @Nonnull private TaskManagerSlot createAndRegisterTaskManagerSlot(SlotID slotId, ResourceProfile resourceProfile, TaskExecutorConnection taskManagerConnection) { final TaskManagerSlot slot = new TaskManagerSlot( slotId, resourceProfile, taskManagerConnection); slots.put(slotId, slot); return slot; } @Nullable private PendingTaskManagerSlot findExactlyMatchingPendingTaskManagerSlot(ResourceProfile resourceProfile) { for (PendingTaskManagerSlot pendingTaskManagerSlot : pendingSlots.values()) { if (pendingTaskManagerSlot.getResourceProfile().equals(resourceProfile)) { return pendingTaskManagerSlot; } } return null; } /** * Updates a slot with the given allocation id. * * @param slotId to update * @param allocationId specifying the current allocation of the slot * @param jobId specifying the job to which the slot is allocated * @return True if the slot could be updated; otherwise false */ private boolean updateSlot(SlotID slotId, AllocationID allocationId, JobID jobId) { final TaskManagerSlot slot = slots.get(slotId); if (slot != null) { final TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(slot.getInstanceId()); if (taskManagerRegistration != null) { updateSlotState(slot, taskManagerRegistration, allocationId, jobId); return true; } else { throw new IllegalStateException("Trying to update a slot from a TaskManager " + slot.getInstanceId() + " which has not been registered."); } } else { LOG.debug("Trying to update unknown slot with slot id {}.", slotId); return false; } } private void updateSlotState( TaskManagerSlot slot, TaskManagerRegistration taskManagerRegistration, @Nullable AllocationID allocationId, @Nullable JobID jobId) { if (null != allocationId) { switch (slot.getState()) { case PENDING: // we have a pending slot request --> check whether we have to reject it PendingSlotRequest pendingSlotRequest = slot.getAssignedSlotRequest(); if (Objects.equals(pendingSlotRequest.getAllocationId(), allocationId)) { // we can cancel the slot request because it has been fulfilled cancelPendingSlotRequest(pendingSlotRequest); // remove the pending slot request, since it has been completed pendingSlotRequests.remove(pendingSlotRequest.getAllocationId()); slot.completeAllocation(allocationId, jobId); } else { // we first have to free the slot in order to set a new allocationId slot.clearPendingSlotRequest(); // set the allocation id such that the slot won't be considered for the pending slot request slot.updateAllocation(allocationId, jobId); // remove the pending request if any as it has been assigned final PendingSlotRequest actualPendingSlotRequest = pendingSlotRequests.remove(allocationId); if (actualPendingSlotRequest != null) { cancelPendingSlotRequest(actualPendingSlotRequest); } // this will try to find a new slot for the request rejectPendingSlotRequest( pendingSlotRequest, new Exception("Task manager reported slot " + slot.getSlotId() + " being already allocated.")); } taskManagerRegistration.occupySlot(); break; case ALLOCATED: if (!Objects.equals(allocationId, slot.getAllocationId())) { slot.freeSlot(); slot.updateAllocation(allocationId, jobId); } break; case FREE: // the slot is currently free --> it is stored in freeSlots freeSlots.remove(slot.getSlotId()); slot.updateAllocation(allocationId, jobId); taskManagerRegistration.occupySlot(); break; } fulfilledSlotRequests.put(allocationId, slot.getSlotId()); } else { // no allocation reported switch (slot.getState()) { case FREE: handleFreeSlot(slot); break; case PENDING: // don't do anything because we still have a pending slot request break; case ALLOCATED: AllocationID oldAllocation = slot.getAllocationId(); slot.freeSlot(); fulfilledSlotRequests.remove(oldAllocation); taskManagerRegistration.freeSlot(); handleFreeSlot(slot); break; } } } /** * Tries to allocate a slot for the given slot request. If there is no slot available, the * resource manager is informed to allocate more resources and a timeout for the request is * registered. * * @param pendingSlotRequest to allocate a slot for * @throws ResourceManagerException if the slot request failed or is unfulfillable */ private void internalRequestSlot(PendingSlotRequest pendingSlotRequest) throws ResourceManagerException { final ResourceProfile resourceProfile = pendingSlotRequest.getResourceProfile(); OptionalConsumer.of(findMatchingSlot(resourceProfile)) .ifPresent(taskManagerSlot -> allocateSlot(taskManagerSlot, pendingSlotRequest)) .ifNotPresent(() -> fulfillPendingSlotRequestWithPendingTaskManagerSlot(pendingSlotRequest)); } private void fulfillPendingSlotRequestWithPendingTaskManagerSlot(PendingSlotRequest pendingSlotRequest) throws ResourceManagerException { ResourceProfile resourceProfile = pendingSlotRequest.getResourceProfile(); Optional pendingTaskManagerSlotOptional = findFreeMatchingPendingTaskManagerSlot(resourceProfile); if (!pendingTaskManagerSlotOptional.isPresent()) { pendingTaskManagerSlotOptional = allocateResource(resourceProfile); } OptionalConsumer.of(pendingTaskManagerSlotOptional) .ifPresent(pendingTaskManagerSlot -> assignPendingTaskManagerSlot(pendingSlotRequest, pendingTaskManagerSlot)) .ifNotPresent(() -> { // request can not be fulfilled by any free slot or pending slot that can be allocated, // check whether it can be fulfilled by allocated slots if (failUnfulfillableRequest && !isFulfillableByRegisteredSlots(pendingSlotRequest.getResourceProfile())) { throw new UnfulfillableSlotRequestException(pendingSlotRequest.getAllocationId(), pendingSlotRequest.getResourceProfile()); } }); } private Optional findFreeMatchingPendingTaskManagerSlot(ResourceProfile requiredResourceProfile) { for (PendingTaskManagerSlot pendingTaskManagerSlot : pendingSlots.values()) { if (pendingTaskManagerSlot.getAssignedPendingSlotRequest() == null && pendingTaskManagerSlot.getResourceProfile().isMatching(requiredResourceProfile)) { return Optional.of(pendingTaskManagerSlot); } } return Optional.empty(); } private boolean isFulfillableByRegisteredSlots(ResourceProfile resourceProfile) { for (TaskManagerSlot slot : slots.values()) { if (slot.getResourceProfile().isMatching(resourceProfile)) { return true; } } return false; } private Optional allocateResource(ResourceProfile resourceProfile) throws ResourceManagerException { final Collection requestedSlots = resourceActions.allocateResource(resourceProfile); if (requestedSlots.isEmpty()) { return Optional.empty(); } else { final Iterator slotIterator = requestedSlots.iterator(); final PendingTaskManagerSlot pendingTaskManagerSlot = new PendingTaskManagerSlot(slotIterator.next()); pendingSlots.put(pendingTaskManagerSlot.getTaskManagerSlotId(), pendingTaskManagerSlot); while (slotIterator.hasNext()) { final PendingTaskManagerSlot additionalPendingTaskManagerSlot = new PendingTaskManagerSlot(slotIterator.next()); pendingSlots.put(additionalPendingTaskManagerSlot.getTaskManagerSlotId(), additionalPendingTaskManagerSlot); } return Optional.of(pendingTaskManagerSlot); } } private void assignPendingTaskManagerSlot(PendingSlotRequest pendingSlotRequest, PendingTaskManagerSlot pendingTaskManagerSlot) { pendingTaskManagerSlot.assignPendingSlotRequest(pendingSlotRequest); pendingSlotRequest.assignPendingTaskManagerSlot(pendingTaskManagerSlot); } /** * Allocates the given slot for the given slot request. This entails sending a registration * message to the task manager and treating failures. * * @param taskManagerSlot to allocate for the given slot request * @param pendingSlotRequest to allocate the given slot for */ private void allocateSlot(TaskManagerSlot taskManagerSlot, PendingSlotRequest pendingSlotRequest) { Preconditions.checkState(taskManagerSlot.getState() == TaskManagerSlot.State.FREE); TaskExecutorConnection taskExecutorConnection = taskManagerSlot.getTaskManagerConnection(); TaskExecutorGateway gateway = taskExecutorConnection.getTaskExecutorGateway(); final CompletableFuture completableFuture = new CompletableFuture<>(); final AllocationID allocationId = pendingSlotRequest.getAllocationId(); final SlotID slotId = taskManagerSlot.getSlotId(); final InstanceID instanceID = taskManagerSlot.getInstanceId(); taskManagerSlot.assignPendingSlotRequest(pendingSlotRequest); pendingSlotRequest.setRequestFuture(completableFuture); returnPendingTaskManagerSlotIfAssigned(pendingSlotRequest); TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(instanceID); if (taskManagerRegistration == null) { throw new IllegalStateException("Could not find a registered task manager for instance id " + instanceID + '.'); } taskManagerRegistration.markUsed(); // RPC call to the task manager CompletableFuture requestFuture = gateway.requestSlot( slotId, pendingSlotRequest.getJobId(), allocationId, pendingSlotRequest.getTargetAddress(), resourceManagerId, taskManagerRequestTimeout); requestFuture.whenComplete( (Acknowledge acknowledge, Throwable throwable) -> { if (acknowledge != null) { completableFuture.complete(acknowledge); } else { completableFuture.completeExceptionally(throwable); } }); completableFuture.whenCompleteAsync( (Acknowledge acknowledge, Throwable throwable) -> { try { if (acknowledge != null) { updateSlot(slotId, allocationId, pendingSlotRequest.getJobId()); } else { if (throwable instanceof SlotOccupiedException) { SlotOccupiedException exception = (SlotOccupiedException) throwable; updateSlot(slotId, exception.getAllocationId(), exception.getJobId()); } else { removeSlotRequestFromSlot(slotId, allocationId); } if (!(throwable instanceof CancellationException)) { handleFailedSlotRequest(slotId, allocationId, throwable); } else { LOG.debug("Slot allocation request {} has been cancelled.", allocationId, throwable); } } } catch (Exception e) { LOG.error("Error while completing the slot allocation.", e); } }, mainThreadExecutor); } private void returnPendingTaskManagerSlotIfAssigned(PendingSlotRequest pendingSlotRequest) { final PendingTaskManagerSlot pendingTaskManagerSlot = pendingSlotRequest.getAssignedPendingTaskManagerSlot(); if (pendingTaskManagerSlot != null) { pendingTaskManagerSlot.unassignPendingSlotRequest(); pendingSlotRequest.unassignPendingTaskManagerSlot(); } } /** * Handles a free slot. It first tries to find a pending slot request which can be fulfilled. * If there is no such request, then it will add the slot to the set of free slots. * * @param freeSlot to find a new slot request for */ private void handleFreeSlot(TaskManagerSlot freeSlot) { Preconditions.checkState(freeSlot.getState() == TaskManagerSlot.State.FREE); PendingSlotRequest pendingSlotRequest = findMatchingRequest(freeSlot.getResourceProfile()); if (null != pendingSlotRequest) { allocateSlot(freeSlot, pendingSlotRequest); } else { freeSlots.put(freeSlot.getSlotId(), freeSlot); } } /** * Removes the given set of slots from the slot manager. * * @param slotsToRemove identifying the slots to remove from the slot manager * @param cause for removing the slots */ private void removeSlots(Iterable slotsToRemove, Exception cause) { for (SlotID slotId : slotsToRemove) { removeSlot(slotId, cause); } } /** * Removes the given slot from the slot manager. * * @param slotId identifying the slot to remove * @param cause for removing the slot */ private void removeSlot(SlotID slotId, Exception cause) { TaskManagerSlot slot = slots.remove(slotId); if (null != slot) { freeSlots.remove(slotId); if (slot.getState() == TaskManagerSlot.State.PENDING) { // reject the pending slot request --> triggering a new allocation attempt rejectPendingSlotRequest( slot.getAssignedSlotRequest(), cause); } AllocationID oldAllocationId = slot.getAllocationId(); if (oldAllocationId != null) { fulfilledSlotRequests.remove(oldAllocationId); resourceActions.notifyAllocationFailure( slot.getJobId(), oldAllocationId, cause); } } else { LOG.debug("There was no slot registered with slot id {}.", slotId); } } // --------------------------------------------------------------------------------------------- // Internal request handling methods // --------------------------------------------------------------------------------------------- /** * Removes a pending slot request identified by the given allocation id from a slot identified * by the given slot id. * * @param slotId identifying the slot * @param allocationId identifying the presumable assigned pending slot request */ private void removeSlotRequestFromSlot(SlotID slotId, AllocationID allocationId) { TaskManagerSlot taskManagerSlot = slots.get(slotId); if (null != taskManagerSlot) { if (taskManagerSlot.getState() == TaskManagerSlot.State.PENDING && Objects.equals(allocationId, taskManagerSlot.getAssignedSlotRequest().getAllocationId())) { TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(taskManagerSlot.getInstanceId()); if (taskManagerRegistration == null) { throw new IllegalStateException("Trying to remove slot request from slot for which there is no TaskManager " + taskManagerSlot.getInstanceId() + " is registered."); } // clear the pending slot request taskManagerSlot.clearPendingSlotRequest(); updateSlotState(taskManagerSlot, taskManagerRegistration, null, null); } else { LOG.debug("Ignore slot request removal for slot {}.", slotId); } } else { LOG.debug("There was no slot with {} registered. Probably this slot has been already freed.", slotId); } } /** * Handles a failed slot request. The slot manager tries to find a new slot fulfilling * the resource requirements for the failed slot request. * * @param slotId identifying the slot which was assigned to the slot request before * @param allocationId identifying the failed slot request * @param cause of the failure */ private void handleFailedSlotRequest(SlotID slotId, AllocationID allocationId, Throwable cause) { PendingSlotRequest pendingSlotRequest = pendingSlotRequests.get(allocationId); LOG.debug("Slot request with allocation id {} failed for slot {}.", allocationId, slotId, cause); if (null != pendingSlotRequest) { pendingSlotRequest.setRequestFuture(null); try { internalRequestSlot(pendingSlotRequest); } catch (ResourceManagerException e) { pendingSlotRequests.remove(allocationId); resourceActions.notifyAllocationFailure( pendingSlotRequest.getJobId(), allocationId, e); } } else { LOG.debug("There was not pending slot request with allocation id {}. Probably the request has been fulfilled or cancelled.", allocationId); } } /** * Rejects the pending slot request by failing the request future with a * {@link SlotAllocationException}. * * @param pendingSlotRequest to reject * @param cause of the rejection */ private void rejectPendingSlotRequest(PendingSlotRequest pendingSlotRequest, Exception cause) { CompletableFuture request = pendingSlotRequest.getRequestFuture(); if (null != request) { request.completeExceptionally(new SlotAllocationException(cause)); } else { LOG.debug("Cannot reject pending slot request {}, since no request has been sent.", pendingSlotRequest.getAllocationId()); } } /** * Cancels the given slot request. * * @param pendingSlotRequest to cancel */ private void cancelPendingSlotRequest(PendingSlotRequest pendingSlotRequest) { CompletableFuture request = pendingSlotRequest.getRequestFuture(); returnPendingTaskManagerSlotIfAssigned(pendingSlotRequest); if (null != request) { request.cancel(false); } } // --------------------------------------------------------------------------------------------- // Internal timeout methods // --------------------------------------------------------------------------------------------- @VisibleForTesting void checkTaskManagerTimeouts() { if (!taskManagerRegistrations.isEmpty()) { long currentTime = System.currentTimeMillis(); ArrayList timedOutTaskManagers = new ArrayList<>(taskManagerRegistrations.size()); // first retrieve the timed out TaskManagers for (TaskManagerRegistration taskManagerRegistration : taskManagerRegistrations.values()) { if (currentTime - taskManagerRegistration.getIdleSince() >= taskManagerTimeout.toMilliseconds()) { // we collect the instance ids first in order to avoid concurrent modifications by the // ResourceActions.releaseResource call timedOutTaskManagers.add(taskManagerRegistration); } } // second we trigger the release resource callback which can decide upon the resource release for (TaskManagerRegistration taskManagerRegistration : timedOutTaskManagers) { if (waitResultConsumedBeforeRelease) { releaseTaskExecutorIfPossible(taskManagerRegistration); } else { releaseTaskExecutor(taskManagerRegistration.getInstanceId()); } } } } private void releaseTaskExecutorIfPossible(TaskManagerRegistration taskManagerRegistration) { long idleSince = taskManagerRegistration.getIdleSince(); taskManagerRegistration .getTaskManagerConnection() .getTaskExecutorGateway() .canBeReleased() .thenAcceptAsync( canBeReleased -> { InstanceID timedOutTaskManagerId = taskManagerRegistration.getInstanceId(); boolean stillIdle = idleSince == taskManagerRegistration.getIdleSince(); if (stillIdle && canBeReleased) { releaseTaskExecutor(timedOutTaskManagerId); } }, mainThreadExecutor); } private void releaseTaskExecutor(InstanceID timedOutTaskManagerId) { final FlinkException cause = new FlinkException("TaskExecutor exceeded the idle timeout."); LOG.debug("Release TaskExecutor {} because it exceeded the idle timeout.", timedOutTaskManagerId); resourceActions.releaseResource(timedOutTaskManagerId, cause); } private void checkSlotRequestTimeouts() { if (!pendingSlotRequests.isEmpty()) { long currentTime = System.currentTimeMillis(); Iterator> slotRequestIterator = pendingSlotRequests.entrySet().iterator(); while (slotRequestIterator.hasNext()) { PendingSlotRequest slotRequest = slotRequestIterator.next().getValue(); if (currentTime - slotRequest.getCreationTimestamp() >= slotRequestTimeout.toMilliseconds()) { slotRequestIterator.remove(); if (slotRequest.isAssigned()) { cancelPendingSlotRequest(slotRequest); } resourceActions.notifyAllocationFailure( slotRequest.getJobId(), slotRequest.getAllocationId(), new TimeoutException("The allocation could not be fulfilled in time.")); } } } } // --------------------------------------------------------------------------------------------- // Internal utility methods // --------------------------------------------------------------------------------------------- private void internalUnregisterTaskManager(TaskManagerRegistration taskManagerRegistration, Exception cause) { Preconditions.checkNotNull(taskManagerRegistration); removeSlots(taskManagerRegistration.getSlots(), cause); } private boolean checkDuplicateRequest(AllocationID allocationId) { return pendingSlotRequests.containsKey(allocationId) || fulfilledSlotRequests.containsKey(allocationId); } private void checkInit() { Preconditions.checkState(started, "The slot manager has not been started."); } // --------------------------------------------------------------------------------------------- // Testing methods // --------------------------------------------------------------------------------------------- @VisibleForTesting TaskManagerSlot getSlot(SlotID slotId) { return slots.get(slotId); } @VisibleForTesting PendingSlotRequest getSlotRequest(AllocationID allocationId) { return pendingSlotRequests.get(allocationId); } @VisibleForTesting boolean isTaskManagerIdle(InstanceID instanceId) { TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(instanceId); if (null != taskManagerRegistration) { return taskManagerRegistration.isIdle(); } else { return false; } } @Override @VisibleForTesting public void unregisterTaskManagersAndReleaseResources() { Iterator> taskManagerRegistrationIterator = taskManagerRegistrations.entrySet().iterator(); while (taskManagerRegistrationIterator.hasNext()) { TaskManagerRegistration taskManagerRegistration = taskManagerRegistrationIterator.next().getValue(); taskManagerRegistrationIterator.remove(); final FlinkException cause = new FlinkException("Triggering of SlotManager#unregisterTaskManagersAndReleaseResources."); internalUnregisterTaskManager(taskManagerRegistration, cause); resourceActions.releaseResource(taskManagerRegistration.getInstanceId(), cause); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy