All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.io.network.partition.external.ExternalBlockResultPartitionManager Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.io.network.partition.external;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.core.memory.MemoryType;
import org.apache.flink.runtime.io.network.partition.BufferAvailabilityListener;
import org.apache.flink.runtime.io.network.partition.FixedLengthBufferPool;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.io.network.partition.ResultPartitionProvider;
import org.apache.flink.runtime.io.network.partition.ResultSubpartitionView;
import org.apache.flink.runtime.taskmanager.DispatcherThreadFactory;

import org.apache.hadoop.io.ReadaheadPool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.apache.flink.runtime.io.network.partition.external.OsCachePolicy.READ_AHEAD;

/**
 * Implementation of {@link ResultPartitionProvider} for external shuffle service.
 */
public class ExternalBlockResultPartitionManager implements ResultPartitionProvider {
	private static final Logger LOG = LoggerFactory.getLogger(ExternalBlockResultPartitionManager.class);

	private final ExternalBlockShuffleServiceConfiguration shuffleServiceConfiguration;

	/** Resolves a result partition id to the local directory holding its spilled files. */
	private final LocalResultPartitionResolver resultPartitionResolver;

	/** Each directory has its group of threads to do disk IO operations. */
	@VisibleForTesting
	final Map<String, ThreadPoolExecutor> dirToThreadPool = new HashMap<>();

	/** Cache file meta for result partitions, keyed by partition id. */
	@VisibleForTesting
	final ConcurrentHashMap<ResultPartitionID, ExternalBlockResultPartitionMeta>
		resultPartitionMetaMap = new ConcurrentHashMap<>();

	/** The buffer pool to read data into (fixed-length off-heap buffers). */
	@VisibleForTesting
	final FixedLengthBufferPool bufferPool;

	/** Periodically recycles consumed or abandoned result partitions. */
	private final ScheduledExecutorService resultPartitionRecyclerExecutorService;

	/** Set exactly once in {@link #stop()}; afterwards all view requests are rejected. */
	private final AtomicBoolean isStopped = new AtomicBoolean(false);

	/** Shared OS read-ahead pool; {@code null} unless the READ_AHEAD cache policy is configured. */
	private final ReadaheadPool readaheadPool;

	/** Runs the periodic {@link SelfCheckTask}. */
	private final ScheduledExecutorService selfCheckExecutorService;

	/**
	 * Creates the partition manager, builds the per-directory IO thread pools and
	 * starts the recycler and self-check background services.
	 *
	 * @param shuffleServiceConfiguration configuration of the external shuffle service
	 * @throws Exception if the resolver or buffer pool cannot be created
	 */
	public ExternalBlockResultPartitionManager(
		ExternalBlockShuffleServiceConfiguration shuffleServiceConfiguration) throws Exception {

		this.shuffleServiceConfiguration = shuffleServiceConfiguration;
		this.resultPartitionResolver = LocalResultPartitionResolverFactory.create(shuffleServiceConfiguration);

		// Init the buffer pool used to read shuffle data from disk.
		this.bufferPool = new FixedLengthBufferPool(
			shuffleServiceConfiguration.getBufferNumber(),
			shuffleServiceConfiguration.getMemorySizePerBufferInBytes(),
			MemoryType.OFF_HEAP);

		constructThreadPools();

		this.readaheadPool = READ_AHEAD.equals(shuffleServiceConfiguration.getOsCachePolicy())
			? ReadaheadPool.getInstance() : null;

		ThreadFactory recyclerThreadFactory = new DispatcherThreadFactory(
			new ThreadGroup("FlinkShuffleService"), "ResultPartitionRecycler");
		this.resultPartitionRecyclerExecutorService = Executors.newSingleThreadScheduledExecutor(recyclerThreadFactory);
		this.resultPartitionRecyclerExecutorService.scheduleWithFixedDelay(
			this::recycleResultPartitions,
			0,
			shuffleServiceConfiguration.getDiskScanIntervalInMS(),
			TimeUnit.MILLISECONDS);

		// create a thread for self-check (buffer-usage report and meta memory shrinkage)
		ThreadFactory selfCheckThreadFactory = new DispatcherThreadFactory(
			new ThreadGroup("FlinkShuffleService"), "SelfCheckThread");
		this.selfCheckExecutorService = Executors.newSingleThreadScheduledExecutor(selfCheckThreadFactory);
		this.selfCheckExecutorService.scheduleAtFixedRate(
			new SelfCheckTask(),
			shuffleServiceConfiguration.getSelfCheckIntervalInMS(),
			shuffleServiceConfiguration.getSelfCheckIntervalInMS(),
			TimeUnit.MILLISECONDS);

		LOG.info("Final configurations: " + shuffleServiceConfiguration);
	}

	/**
	 * Creates a view for reading one subpartition of an externally spilled result partition.
	 *
	 * <p>The partition meta is lazily created and cached in {@link #resultPartitionMetaMap};
	 * concurrent creators race via {@code putIfAbsent} and the loser adopts the winner's meta.
	 *
	 * @param resultPartitionId id of the result partition to read
	 * @param index subpartition index
	 * @param availabilityListener listener notified when data becomes available
	 * @return the subpartition view
	 * @throws IOException if the manager has been stopped or the partition cannot be resolved
	 */
	@Override
	public ResultSubpartitionView createSubpartitionView(
		ResultPartitionID resultPartitionId,
		int index,
		BufferAvailabilityListener availabilityListener) throws IOException {

		// Reject all the requests if shuffle service is stopping.
		if (isStopped.get()) {
			throw new IOException("ExternalBlockResultPartitionManager has already been stopped.");
		}

		ExternalBlockResultPartitionMeta resultPartitionMeta = resultPartitionMetaMap.get(resultPartitionId);
		if (resultPartitionMeta == null) {
			LocalResultPartitionResolver.ResultPartitionFileInfo fileInfo = resultPartitionResolver.getResultPartitionDir(
				resultPartitionId);
			resultPartitionMeta = new ExternalBlockResultPartitionMeta(
				resultPartitionId,
				shuffleServiceConfiguration.getFileSystem(),
				fileInfo,
				shuffleServiceConfiguration.getOsCachePolicy(),
				shuffleServiceConfiguration.getMaxReadAheadLengthInBytes());
			ExternalBlockResultPartitionMeta prevResultPartitionMeta =
				resultPartitionMetaMap.putIfAbsent(resultPartitionId, resultPartitionMeta);
			if (prevResultPartitionMeta != null) {
				// Another thread registered the meta first; use its instance.
				resultPartitionMeta = prevResultPartitionMeta;
			}
		}

		ExternalBlockSubpartitionView subpartitionView = new ExternalBlockSubpartitionView(
			resultPartitionMeta,
			index,
			dirToThreadPool.get(resultPartitionMeta.getRootDir()),
			resultPartitionId,
			bufferPool,
			shuffleServiceConfiguration.getWaitCreditDelay(),
			availabilityListener,
			readaheadPool);

		resultPartitionMeta.notifySubpartitionStartConsuming(index);

		return subpartitionView;
	}

	/**
	 * This method is used to create a mapping between yarnAppId and user.
	 */
	public void initializeApplication(String user, String appId) {
		resultPartitionResolver.initializeApplication(user, appId);
	}

	/**
	 * This method is used to remove both in-memory meta info and local files when
	 * this application is stopped in Yarn.
	 */
	public void stopApplication(String appId) {
		Set<ResultPartitionID> resultPartitionIds = resultPartitionResolver.stopApplication(appId);
		resultPartitionIds.forEach(resultPartitionMetaMap::remove);
	}

	/**
	 * Stops the manager: rejects further view requests, shuts down all background
	 * executors and IO thread pools, and releases buffers and cached metas.
	 * Idempotent — subsequent calls are no-ops.
	 */
	public void stop() {
		LOG.warn("Stop ExternalBlockResultPartitionManager, probably ShuffleService is stopped");
		try {
			if (!isStopped.compareAndSet(false, true)) {
				LOG.info("ExternalBlockResultPartitionManager has already been stopped.");
				return;
			}

			// Stop disk IO threads immediately.
			for (ThreadPoolExecutor threadPool : dirToThreadPool.values()) {
				threadPool.shutdownNow();
			}

			resultPartitionRecyclerExecutorService.shutdownNow();
			selfCheckExecutorService.shutdownNow();

			resultPartitionResolver.stop();

			bufferPool.lazyDestroy();

			resultPartitionMetaMap.clear();
		} catch (Throwable e) {
			// Best-effort shutdown: never propagate out of stop().
			LOG.error("Exception occurs when stopping ExternalBlockResultPartitionManager", e);
		}
	}

	// ------------------------------------ Internal Utilities ------------------------------------

	/**
	 * This method is called only in constructor to construct thread pools for disk IO threads.
	 * Each configured directory gets a fixed-size pool whose work queue is a scheduler
	 * delegate deciding which subpartition view is served next.
	 */
	private void constructThreadPools() {
		ThreadGroup threadGroup = new ThreadGroup("Disk IO Thread Group");
		shuffleServiceConfiguration.getDirToDiskType().forEach((dir, diskType) -> {
			Integer threadNum = shuffleServiceConfiguration.getDiskTypeToIOThreadNum().get(diskType);
			BlockingQueue<Runnable> blockingQueue = new ExternalBlockSubpartitionViewSchedulerDelegate(
				shuffleServiceConfiguration.newSubpartitionViewScheduler());
			ThreadPoolExecutor threadPool = new ThreadPoolExecutor(
				threadNum, threadNum, 0L, TimeUnit.MILLISECONDS, blockingQueue,
				new DispatcherThreadFactory(threadGroup, "IO thread [" + diskType + "] [" + dir + "]"));
			dirToThreadPool.put(dir, threadPool);
		});
	}

	/**
	 * Scans all cached partition metas and removes those whose TTL has expired:
	 * fully-consumed partitions after the "consumed" TTL, partially-consumed ones
	 * after the (typically longer) "partial-consumed" TTL.
	 */
	@VisibleForTesting
	void recycleResultPartitions() {
		long currTime = System.currentTimeMillis();
		if (LOG.isDebugEnabled()) {
			LOG.debug("Start to recycle result partitions, currTime: " + currTime);
		}

		HashMap<ResultPartitionID, ExternalBlockResultPartitionMeta> consumedPartitionsToRemove = new HashMap<>();
		HashMap<ResultPartitionID, ExternalBlockResultPartitionMeta> partialConsumedPartitionsToRemove = new HashMap<>();
		for (Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionEntry :
				resultPartitionMetaMap.entrySet()) {
			ResultPartitionID resultPartitionID = partitionEntry.getKey();
			ExternalBlockResultPartitionMeta resultPartitionMeta = partitionEntry.getValue();
			if (!resultPartitionMeta.hasInitialized()) {
				continue;
			}
			int refCnt = resultPartitionMeta.getReferenceCount();
			if (refCnt > 0) {
				// Skip because some subpartition views are consuming subpartitions.
				continue;
			}
			int unconsumedSubpartitionCount = resultPartitionMeta.getUnconsumedSubpartitionCount();
			if (unconsumedSubpartitionCount <= 0) {
				// It seems all the down streams have fetched their data from this result partition.
				long lastActiveTimeInMs = resultPartitionMeta.getLastActiveTimeInMs();
				// we may get -1L in a rare condition due to decreasing count and setting timestamp without lock
				if ((currTime - lastActiveTimeInMs) > resultPartitionMeta.getConsumedPartitionTTL()) {
					consumedPartitionsToRemove.put(resultPartitionID, resultPartitionMeta);
				}
			} else {
				// There are subpartitions left to be consumed. If this job fails, such partition
				// will never be fully consumed.
				long lastActiveTimeInMs = resultPartitionMeta.getLastActiveTimeInMs();
				if ((currTime - lastActiveTimeInMs) > resultPartitionMeta.getPartialConsumedPartitionTTL()) {
					partialConsumedPartitionsToRemove.put(resultPartitionID, resultPartitionMeta);
				}
			}
		}

		removeResultPartitionAndMeta(consumedPartitionsToRemove,
			"CONSUMED_PARTITION_TTL_TIMEOUT",
			LOG.isDebugEnabled());
		removeResultPartitionAndMeta(partialConsumedPartitionsToRemove,
			"PARTIAL_CONSUMED_PARTITION_TTL_TIMEOUT",
			true);

		if (LOG.isDebugEnabled()) {
			LOG.debug("Finish recycling result partitions, cost " + (System.currentTimeMillis() - currTime) + " ms.");
		}
	}

	/**
	 * Removes the given partitions from the meta cache and recycles their files.
	 * A partition whose reference count became positive again since the scan is
	 * skipped (double-check) to avoid deleting data that is being read.
	 *
	 * @param partitionsToRemove candidate partitions collected by the scan
	 * @param recycleReason tag written to the log for each removed partition
	 * @param printLog whether to log each removal
	 */
	private void removeResultPartitionAndMeta(
		Map<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionsToRemove,
		String recycleReason,
		boolean printLog) {

		if (partitionsToRemove.isEmpty()) {
			return;
		}

		Iterator<Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta>> iterator =
			partitionsToRemove.entrySet().iterator();
		while (iterator.hasNext()) {
			Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> entry = iterator.next();
			ResultPartitionID resultPartitionID = entry.getKey();
			ExternalBlockResultPartitionMeta meta = entry.getValue();
			// double check reference count
			if (meta.getReferenceCount() > 0) {
				iterator.remove();
			} else {
				resultPartitionMetaMap.remove(resultPartitionID);
				resultPartitionResolver.recycleResultPartition(resultPartitionID);
				if (printLog) {
					LOG.info("Delete partition's directory: {}, reason: {}, lastActiveTime: {}",
						meta.getResultPartitionDir(), recycleReason, meta.getLastActiveTimeInMs());
				}
			}
		}
	}

	/** Log severity selector used by {@link SelfCheckTask#printReport}. */
	private enum LogLevel {
		DEBUG,
		INFO,
		WARN,
		ERROR
	}

	/**
	 * Periodic health-check task: reports buffer-pool usage and, at a coarser
	 * interval, shrinks the heap footprint of idle partition metas.
	 */
	private final class SelfCheckTask implements Runnable {
		private static final String LOG_PREFIX = "FlinkShuffleService-SelfCheck-";

		private final int totalBufferNum;

		/** Buffer usage observed at the previous check, for change reporting. */
		private int usedBufferNum = 0;

		private long lastCheckMemoryFootprintTimestamp;

		SelfCheckTask() {
			// 1. prepare to check buffers' usage
			totalBufferNum = bufferPool.getNumBuffers();
			usedBufferNum = bufferPool.bestEffortGetNumOfUsedBuffers();

			// 2. prepare to shrink ResultPartitionMeta
			lastCheckMemoryFootprintTimestamp = System.currentTimeMillis();
		}

		@Override
		public void run() {
			if (LOG.isDebugEnabled()) {
				LOG.debug("Start to do self check, startTimeInMS: " + System.currentTimeMillis());
			}

			checkBufferUsage();
			shrinkResultPartitionMetaIfNecessary();

			if (LOG.isDebugEnabled()) {
				LOG.debug("Finish self check, endTimeInMS: " + System.currentTimeMillis());
			}
		}

		/** Logs current buffer-pool usage, escalating to WARN when buffers are exhausted. */
		private void checkBufferUsage() {
			StringBuilder stringBuilder = new StringBuilder(LOG_PREFIX + "-BufferUsage");
			LogLevel logLevel = LogLevel.INFO;

			int tmpUsedBufferNum = bufferPool.bestEffortGetNumOfUsedBuffers();
			stringBuilder.append("TotalBufferNum: ").append(totalBufferNum);
			if (tmpUsedBufferNum > 0) {
				stringBuilder.append(", UsedBufferNum: ").append(tmpUsedBufferNum);
				// NOTE(review): this compares the PREVIOUS sample (usedBufferNum) against the
				// total; comparing tmpUsedBufferNum looks intended — confirm before changing.
				if (usedBufferNum == totalBufferNum) {
					logLevel = LogLevel.WARN;
					stringBuilder.append(", BuffersHasBeenUsedUp");
				} else if (tmpUsedBufferNum == usedBufferNum) {
					stringBuilder.append(", UsedBufferNumUnchanged");
				} else {
					stringBuilder.append(", PreviousUsedBufferNum: ").append(usedBufferNum);
				}
			} else {
				stringBuilder.append(", TotalBuffersUnused");
				logLevel = LogLevel.DEBUG;
			}
			usedBufferNum = tmpUsedBufferNum;

			printReport(stringBuilder.toString(), logLevel);
		}

		/**
		 * If the estimated heap footprint of cached partition metas exceeds the configured
		 * threshold, shrinks metas starting with the longest-idle ones (halving the idle
		 * interval each round) until the footprint is back under the threshold.
		 */
		private void shrinkResultPartitionMetaIfNecessary() {
			// Overall memory GC is actually not a light-weight operation, so use another interval
			// instead of the one used by self check.
			long currentTimestamp = System.currentTimeMillis();
			if ((currentTimestamp - lastCheckMemoryFootprintTimestamp) <
					shuffleServiceConfiguration.getMemoryShrinkageIntervalInMS()) {
				return;
			} else {
				lastCheckMemoryFootprintTimestamp = currentTimestamp;
			}

			StringBuilder logBuilder = new StringBuilder(LOG_PREFIX).append("-MemoryGC ");

			// 1. Estimates the memory footprint used by result partition meta.
			long shrinkableMemoryFootprint = 0L;
			for (Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionEntry :
					resultPartitionMetaMap.entrySet()) {
				shrinkableMemoryFootprint += partitionEntry.getValue().getShrinkableMemoryFootprint();
			}
			if (shrinkableMemoryFootprint <= shuffleServiceConfiguration.getHeapMemoryThresholdInBytes()) {
				logBuilder.append("SkipMemoryGC, detail: memory is sufficient")
					.append(", heapMemoryThresholdInBytes: ")
					.append(shuffleServiceConfiguration.getHeapMemoryThresholdInBytes())
					.append(", shrinkableHeapMemoryFootprint : ").append(shrinkableMemoryFootprint);
				printReport(logBuilder.toString(), LogLevel.DEBUG);
				return;
			}

			// 2. Recycles enough shrinkable memory to achieve the target.
			final long expectedMemoryToShrink = shrinkableMemoryFootprint
				- shuffleServiceConfiguration.getHeapMemoryThresholdInBytes();
			long actualMemoryToShrink = 0L;

			long expirationInterval = 24L * 60 * 60 * 1000; // 24 hours in milliseconds
			boolean partitionLeftToBeShrunk = true;
			while (partitionLeftToBeShrunk && expirationInterval >=
					shuffleServiceConfiguration.getObjectMinIdleIntervalToShrinkInMS()) {
				partitionLeftToBeShrunk = false;
				for (Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionEntry :
					resultPartitionMetaMap.entrySet()) {
					ExternalBlockResultPartitionMeta meta = partitionEntry.getValue();
					// NOTE(review): this skips metas with refCount < 1 and shrinks the in-use
					// ones; skipping refCount > 0 (in-use) looks intended — confirm upstream.
					if (meta.getReferenceCount() < 1) {
						continue;
					}
					if (currentTimestamp - meta.getLastActiveTimeInMs() > expirationInterval) {
						actualMemoryToShrink += meta.shrinkMemoryFootprint();
						if (actualMemoryToShrink >= expectedMemoryToShrink) {
							break;
						}
					} else {
						partitionLeftToBeShrunk = true;
					}
				}
				if (actualMemoryToShrink >= expectedMemoryToShrink) {
					break;
				} else {
					// Target is not fulfilled, try to shrink metas with nearer LastActiveTime.
					expirationInterval /= 2;
				}
			}

			if (actualMemoryToShrink >= expectedMemoryToShrink) {
				logBuilder.append("MemoryGCSuccess");
			} else {
				logBuilder.append("MemoryGCFailure");
			}

			logBuilder.append(", shrinkableMemoryFootprint : ").append(shrinkableMemoryFootprint)
				.append(" -> ").append(shrinkableMemoryFootprint - actualMemoryToShrink)
				.append(", actualMemoryToShrink: ").append(actualMemoryToShrink)
				.append(", expectedMemoryToShrink: ").append(expectedMemoryToShrink);
			printReport(logBuilder.toString(), LogLevel.WARN);
		}

		/** Emits the report at the requested severity. */
		private void printReport(String log, LogLevel logLevel) {
			if (logLevel == LogLevel.WARN) {
				LOG.warn(log);
			} else if (logLevel == LogLevel.ERROR) {
				LOG.error(log);
			} else if (logLevel == LogLevel.INFO) {
				LOG.info(log);
			} else {
				LOG.debug(log);
			}
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy