/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.io.network.partition.external;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.core.memory.MemoryType;
import org.apache.flink.runtime.io.network.partition.BufferAvailabilityListener;
import org.apache.flink.runtime.io.network.partition.FixedLengthBufferPool;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.io.network.partition.ResultPartitionProvider;
import org.apache.flink.runtime.io.network.partition.ResultSubpartitionView;
import org.apache.flink.runtime.taskmanager.DispatcherThreadFactory;
import org.apache.hadoop.io.ReadaheadPool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.apache.flink.runtime.io.network.partition.external.OsCachePolicy.READ_AHEAD;

/**
* Implementation of {@link ResultPartitionProvider} for external shuffle service.
*/
public class ExternalBlockResultPartitionManager implements ResultPartitionProvider {
private static final Logger LOG = LoggerFactory.getLogger(ExternalBlockResultPartitionManager.class);
private final ExternalBlockShuffleServiceConfiguration shuffleServiceConfiguration;
private final LocalResultPartitionResolver resultPartitionResolver;
	/** Each directory has its own group of threads for disk IO operations. */
	@VisibleForTesting
	final Map<String, ThreadPoolExecutor> dirToThreadPool = new HashMap<>();
	/** Caches file meta for result partitions. */
	@VisibleForTesting
	final ConcurrentHashMap<ResultPartitionID, ExternalBlockResultPartitionMeta>
		resultPartitionMetaMap = new ConcurrentHashMap<>();
/** The buffer pool to read data into. */
@VisibleForTesting
final FixedLengthBufferPool bufferPool;
/** Periodically recycle result partitions. */
private final ScheduledExecutorService resultPartitionRecyclerExecutorService;
private final AtomicBoolean isStopped = new AtomicBoolean(false);
private final ReadaheadPool readaheadPool;
private final ScheduledExecutorService selfCheckExecutorService;
public ExternalBlockResultPartitionManager(
ExternalBlockShuffleServiceConfiguration shuffleServiceConfiguration) throws Exception {
this.shuffleServiceConfiguration = shuffleServiceConfiguration;
this.resultPartitionResolver = LocalResultPartitionResolverFactory.create(shuffleServiceConfiguration);
// Init the buffer pool
this.bufferPool = new FixedLengthBufferPool(
shuffleServiceConfiguration.getBufferNumber(),
shuffleServiceConfiguration.getMemorySizePerBufferInBytes(),
MemoryType.OFF_HEAP);
constructThreadPools();
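		// Hadoop's shared ReadaheadPool is needed only when the READ_AHEAD OS cache policy is configured.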
this.readaheadPool = READ_AHEAD.equals(shuffleServiceConfiguration.getOsCachePolicy())
? ReadaheadPool.getInstance() : null;
ThreadFactory recyclerThreadFactory = new DispatcherThreadFactory(
new ThreadGroup("FlinkShuffleService"), "ResultPartitionRecycler");
this.resultPartitionRecyclerExecutorService = Executors.newSingleThreadScheduledExecutor(recyclerThreadFactory);
this.resultPartitionRecyclerExecutorService.scheduleWithFixedDelay(
			this::recycleResultPartitions,
0,
shuffleServiceConfiguration.getDiskScanIntervalInMS(),
TimeUnit.MILLISECONDS);
		// Create a dedicated thread for the periodic self check.
ThreadFactory selfCheckThreadFactory = new DispatcherThreadFactory(
new ThreadGroup("FlinkShuffleService"), "SelfCheckThread");
this.selfCheckExecutorService = Executors.newSingleThreadScheduledExecutor(selfCheckThreadFactory);
this.selfCheckExecutorService.scheduleAtFixedRate(
new SelfCheckTask(),
shuffleServiceConfiguration.getSelfCheckIntervalInMS(),
shuffleServiceConfiguration.getSelfCheckIntervalInMS(),
TimeUnit.MILLISECONDS);
LOG.info("Final configurations: " + shuffleServiceConfiguration);
}
@Override
public ResultSubpartitionView createSubpartitionView(
ResultPartitionID resultPartitionId,
int index,
BufferAvailabilityListener availabilityListener) throws IOException {
		// Reject all requests if the shuffle service is stopping.
if (isStopped.get()) {
throw new IOException("ExternalBlockResultPartitionManager has already been stopped.");
}
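		// Look up the cached meta first; on a miss, build a new one and publish it with
		// putIfAbsent so that concurrent requests for the same partition share a single instance.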
ExternalBlockResultPartitionMeta resultPartitionMeta = resultPartitionMetaMap.get(resultPartitionId);
if (resultPartitionMeta == null) {
LocalResultPartitionResolver.ResultPartitionFileInfo fileInfo = resultPartitionResolver.getResultPartitionDir(
resultPartitionId);
resultPartitionMeta = new ExternalBlockResultPartitionMeta(
resultPartitionId,
shuffleServiceConfiguration.getFileSystem(),
fileInfo,
shuffleServiceConfiguration.getOsCachePolicy(),
shuffleServiceConfiguration.getMaxReadAheadLengthInBytes());
ExternalBlockResultPartitionMeta prevResultPartitionMeta =
resultPartitionMetaMap.putIfAbsent(resultPartitionId, resultPartitionMeta);
if (prevResultPartitionMeta != null) {
resultPartitionMeta = prevResultPartitionMeta;
}
}
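		// The view is executed by the IO thread pool that owns this partition's root directory.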
ExternalBlockSubpartitionView subpartitionView = new ExternalBlockSubpartitionView(
resultPartitionMeta,
index,
dirToThreadPool.get(resultPartitionMeta.getRootDir()),
resultPartitionId,
bufferPool,
shuffleServiceConfiguration.getWaitCreditDelay(),
availabilityListener,
readaheadPool);
resultPartitionMeta.notifySubpartitionStartConsuming(index);
return subpartitionView;
}
	/**
	 * Creates a mapping between the YARN application id and its user.
	 */
public void initializeApplication(String user, String appId) {
resultPartitionResolver.initializeApplication(user, appId);
}
	/**
	 * Removes both the in-memory meta info and the local files when
	 * the application is stopped in YARN.
	 */
public void stopApplication(String appId) {
		Set<ResultPartitionID> resultPartitionIDs = resultPartitionResolver.stopApplication(appId);
		resultPartitionIDs.forEach(resultPartitionMetaMap::remove);
}
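	/**
	 * Stops the manager: shuts down the disk IO, recycler and self-check threads,
	 * stops the resolver and destroys the buffer pool. Subsequent view requests are rejected.
	 */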
public void stop() {
LOG.warn("Stop ExternalBlockResultPartitionManager, probably ShuffleService is stopped");
try {
boolean succ = isStopped.compareAndSet(false, true);
if (!succ) {
LOG.info("ExternalBlockResultPartitionManager has already been stopped.");
return;
}
			// Stop disk IO threads immediately.
			for (ThreadPoolExecutor threadPool : dirToThreadPool.values()) {
				threadPool.shutdownNow();
			}
			resultPartitionRecyclerExecutorService.shutdownNow();
			selfCheckExecutorService.shutdownNow();
resultPartitionResolver.stop();
bufferPool.lazyDestroy();
resultPartitionMetaMap.clear();
} catch (Throwable e) {
LOG.error("Exception occurs when stopping ExternalBlockResultPartitionManager", e);
}
}
// ------------------------------------ Internal Utilities ------------------------------------
	/**
	 * Called only from the constructor to build the per-directory thread pools for disk IO.
	 */
private void constructThreadPools() {
ThreadGroup threadGroup = new ThreadGroup("Disk IO Thread Group");
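		// One thread pool per data directory: the pool size is determined by the directory's
		// disk type, and the queue delegate lets the configured scheduler decide the order in
		// which queued subpartition views are served.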
		shuffleServiceConfiguration.getDirToDiskType().forEach((dir, diskType) -> {
			int threadNum = shuffleServiceConfiguration.getDiskTypeToIOThreadNum().get(diskType);
			BlockingQueue<Runnable> blockingQueue = new ExternalBlockSubpartitionViewSchedulerDelegate(
				shuffleServiceConfiguration.newSubpartitionViewScheduler());
			ThreadPoolExecutor threadPool = new ThreadPoolExecutor(
				threadNum, threadNum, 0L, TimeUnit.MILLISECONDS, blockingQueue,
				new DispatcherThreadFactory(threadGroup, "IO thread [" + diskType + "] [" + dir + "]"));
			dirToThreadPool.put(dir, threadPool);
		});
}
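	/**
	 * Scans all cached result partition metas and removes those whose TTL has expired.
	 * Scheduled periodically by the recycler executor.
	 */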
@VisibleForTesting
void recycleResultPartitions() {
long currTime = System.currentTimeMillis();
if (LOG.isDebugEnabled()) {
LOG.debug("Start to recycle result partitions, currTime: " + currTime);
}
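		// Collect expired partitions in two groups, since fully consumed and partially
		// consumed partitions are recycled with different TTLs.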
		HashMap<ResultPartitionID, ExternalBlockResultPartitionMeta> consumedPartitionsToRemove = new HashMap<>();
		HashMap<ResultPartitionID, ExternalBlockResultPartitionMeta> partialConsumedPartitionsToRemove = new HashMap<>();
		for (Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionEntry :
				resultPartitionMetaMap.entrySet()) {
ResultPartitionID resultPartitionID = partitionEntry.getKey();
ExternalBlockResultPartitionMeta resultPartitionMeta = partitionEntry.getValue();
if (!resultPartitionMeta.hasInitialized()) {
continue;
}
int refCnt = resultPartitionMeta.getReferenceCount();
if (refCnt > 0) {
// Skip because some subpartition views are consuming subpartitions.
continue;
}
int unconsumedSubpartitionCount = resultPartitionMeta.getUnconsumedSubpartitionCount();
if (unconsumedSubpartitionCount <= 0) {
				// It seems all the downstream tasks have fetched their data from this result partition.
long lastActiveTimeInMs = resultPartitionMeta.getLastActiveTimeInMs();
				// We may get -1L in rare cases, because the reference count is decreased and the timestamp is set without a lock.
if ((currTime - lastActiveTimeInMs) > resultPartitionMeta.getConsumedPartitionTTL()) {
consumedPartitionsToRemove.put(resultPartitionID, resultPartitionMeta);
}
} else {
				// There are subpartitions left to be consumed. If this job has failed, such a
				// partition will never be fully consumed.
long lastActiveTimeInMs = resultPartitionMeta.getLastActiveTimeInMs();
if ((currTime - lastActiveTimeInMs) > resultPartitionMeta.getPartialConsumedPartitionTTL()) {
partialConsumedPartitionsToRemove.put(resultPartitionID, resultPartitionMeta);
}
}
}
removeResultPartitionAndMeta(consumedPartitionsToRemove,
"CONSUMED_PARTITION_TTL_TIMEOUT",
LOG.isDebugEnabled());
removeResultPartitionAndMeta(partialConsumedPartitionsToRemove,
"PARTIAL_CONSUMED_PARTITION_TTL_TIMEOUT",
true);
if (LOG.isDebugEnabled()) {
LOG.debug("Finish recycling result partitions, cost " + (System.currentTimeMillis() - currTime) + " ms.");
}
}
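	/**
	 * Removes the given partitions from the meta map and recycles their directories,
	 * double-checking that no subpartition view still references them.
	 */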
private void removeResultPartitionAndMeta(
			HashMap<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionsToRemove,
String recycleReason,
boolean printLog) {
if (partitionsToRemove.isEmpty()) {
return;
}
		Iterator<Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta>> iterator =
			partitionsToRemove.entrySet().iterator();
while (iterator.hasNext()) {
			Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> entry = iterator.next();
ResultPartitionID resultPartitionID = entry.getKey();
ExternalBlockResultPartitionMeta meta = entry.getValue();
			// Double-check the reference count: if a new consumer showed up in the meantime,
			// skip recycling this partition.
			if (meta.getReferenceCount() > 0) {
				iterator.remove();
} else {
resultPartitionMetaMap.remove(resultPartitionID);
resultPartitionResolver.recycleResultPartition(resultPartitionID);
if (printLog) {
LOG.info("Delete partition's directory: {}, reason: {}, lastActiveTime: {}",
meta.getResultPartitionDir(), recycleReason, meta.getLastActiveTimeInMs());
}
}
}
}
private enum LogLevel {
DEBUG,
INFO,
WARN,
ERROR
}
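	/**
	 * Periodic task that reports buffer pool usage and shrinks cached result partition
	 * metas when their heap footprint exceeds the configured threshold.
	 */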
private final class SelfCheckTask implements Runnable {
		private static final String LOG_PREFIX = "FlinkShuffleService-SelfCheck";
private final int totalBufferNum;
private int usedBufferNum = 0;
private long lastCheckMemoryFootprintTimestamp;
SelfCheckTask() {
// 1. prepare to check buffers' usage
totalBufferNum = bufferPool.getNumBuffers();
usedBufferNum = bufferPool.bestEffortGetNumOfUsedBuffers();
// 2. prepare to shrink ResultPartitionMeta
lastCheckMemoryFootprintTimestamp = System.currentTimeMillis();
}
@Override
public void run() {
if (LOG.isDebugEnabled()) {
LOG.debug("Start to do self check, startTimeInMS: " + System.currentTimeMillis());
}
checkBufferUsage();
shrinkResultPartitionMetaIfNecessary();
if (LOG.isDebugEnabled()) {
LOG.debug("Finish self check, endTimeInMS: " + System.currentTimeMillis());
}
}
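		/** Logs the current buffer pool usage, escalating to WARN when all buffers are in use. */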
private void checkBufferUsage() {
			StringBuilder stringBuilder = new StringBuilder(LOG_PREFIX + "-BufferUsage ");
LogLevel logLevel = LogLevel.INFO;
int tmpUsedBufferNum = bufferPool.bestEffortGetNumOfUsedBuffers();
stringBuilder.append("TotalBufferNum: ").append(totalBufferNum);
if (tmpUsedBufferNum > 0) {
stringBuilder.append(", UsedBufferNum: ").append(tmpUsedBufferNum);
				// All buffers are currently in use: escalate to WARN.
				if (tmpUsedBufferNum == totalBufferNum) {
					logLevel = LogLevel.WARN;
					stringBuilder.append(", BuffersHasBeenUsedUp");
} else if (tmpUsedBufferNum == usedBufferNum) {
stringBuilder.append(", UsedBufferNumUnchanged");
} else {
stringBuilder.append(", PreviousUsedBufferNum: ").append(usedBufferNum);
}
} else {
stringBuilder.append(", TotalBuffersUnused");
logLevel = LogLevel.DEBUG;
}
usedBufferNum = tmpUsedBufferNum;
printReport(stringBuilder.toString(), logLevel);
}
private void shrinkResultPartitionMetaIfNecessary() {
			// Overall memory GC is not a light-weight operation, so it runs on its own
			// interval instead of the self-check interval.
long currentTimestamp = System.currentTimeMillis();
if ((currentTimestamp - lastCheckMemoryFootprintTimestamp) <
shuffleServiceConfiguration.getMemoryShrinkageIntervalInMS()) {
return;
} else {
lastCheckMemoryFootprintTimestamp = currentTimestamp;
}
StringBuilder logBuilder = new StringBuilder(LOG_PREFIX).append("-MemoryGC ");
// 1. Estimates the memory footprint used by result partition meta.
long shrinkableMemoryFootprint = 0L;
			for (Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionEntry :
					resultPartitionMetaMap.entrySet()) {
shrinkableMemoryFootprint += partitionEntry.getValue().getShrinkableMemoryFootprint();
}
if (shrinkableMemoryFootprint <= shuffleServiceConfiguration.getHeapMemoryThresholdInBytes()) {
logBuilder.append("SkipMemoryGC, detail: memory is sufficient")
.append(", heapMemoryThresholdInBytes: ")
.append(shuffleServiceConfiguration.getHeapMemoryThresholdInBytes())
.append(", shrinkableHeapMemoryFootprint : ").append(shrinkableMemoryFootprint);
printReport(logBuilder.toString(), LogLevel.DEBUG);
return;
}
// 2. Recycles enough shrinkable memory to achieve the target.
final long expectedMemoryToShrink = shrinkableMemoryFootprint
- shuffleServiceConfiguration.getHeapMemoryThresholdInBytes();
long actualMemoryToShrink = 0L;
long expirationInterval = 24L * 60 * 60 * 1000; // 24 hours in milliseconds
boolean partitionLeftToBeShrunk = true;
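			// Shrink the longest-idle metas first; if the target is not met, halve the idle
			// threshold and retry, down to the configured minimum idle interval.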
while (partitionLeftToBeShrunk && expirationInterval >=
shuffleServiceConfiguration.getObjectMinIdleIntervalToShrinkInMS()) {
partitionLeftToBeShrunk = false;
				for (Map.Entry<ResultPartitionID, ExternalBlockResultPartitionMeta> partitionEntry :
						resultPartitionMetaMap.entrySet()) {
					ExternalBlockResultPartitionMeta meta = partitionEntry.getValue();
					// Skip metas that are still referenced by active subpartition views.
					if (meta.getReferenceCount() > 0) {
						continue;
					}
if (currentTimestamp - meta.getLastActiveTimeInMs() > expirationInterval) {
actualMemoryToShrink += meta.shrinkMemoryFootprint();
if (actualMemoryToShrink >= expectedMemoryToShrink) {
break;
}
} else {
partitionLeftToBeShrunk = true;
}
}
if (actualMemoryToShrink >= expectedMemoryToShrink) {
break;
} else {
					// The target is not met yet, so halve the threshold to also shrink metas
					// with a more recent lastActiveTime.
expirationInterval /= 2;
}
}
if (actualMemoryToShrink >= expectedMemoryToShrink) {
logBuilder.append("MemoryGCSuccess");
} else {
logBuilder.append("MemoryGCFailure");
}
logBuilder.append(", shrinkableMemoryFootprint : ").append(shrinkableMemoryFootprint)
.append(" -> ").append(shrinkableMemoryFootprint - actualMemoryToShrink)
.append(", actualMemoryToShrink: ").append(actualMemoryToShrink)
.append(", expectedMemoryToShrink: ").append(expectedMemoryToShrink);
printReport(logBuilder.toString(), LogLevel.WARN);
}
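		/** Emits the given report through the class logger at the requested level. */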
		private void printReport(String log, LogLevel logLevel) {
			switch (logLevel) {
				case WARN:
					LOG.warn(log);
					break;
				case ERROR:
					LOG.error(log);
					break;
				case INFO:
					LOG.info(log);
					break;
				default:
					LOG.debug(log);
			}
		}
}
}