org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import static org.apache.hadoop.util.ExitUtil.terminate;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.Random;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.CacheDirective;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.namenode.CacheManager;
import org.apache.hadoop.hdfs.server.namenode.CachePool;
import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
import org.apache.hadoop.hdfs.server.namenode.FSDirectory.DirOp;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.util.ReadOnlyList;
import org.apache.hadoop.util.GSet;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
/**
* Scans the namesystem, scheduling blocks to be cached as appropriate.
*
* The CacheReplicationMonitor does a full scan when the NameNode first
* starts up, and at configurable intervals afterwards.
*/
@InterfaceAudience.LimitedPrivate({"HDFS"})
public class CacheReplicationMonitor extends Thread implements Closeable {
private static final Logger LOG =
LoggerFactory.getLogger(CacheReplicationMonitor.class);
private final FSNamesystem namesystem;
private final BlockManager blockManager;
private final CacheManager cacheManager;
private final GSet cachedBlocks;
/**
* Pseudorandom number source
*/
private static final Random random = new Random();
/**
* The interval at which we scan the namesystem for caching changes.
*/
private final long intervalMs;
/**
* The CacheReplicationMonitor (CRM) lock. Used to synchronize starting and
* waiting for rescan operations.
*/
private final ReentrantLock lock;
/**
* Notifies the scan thread that an immediate rescan is needed.
*/
private final Condition doRescan;
/**
* Notifies waiting threads that a rescan has finished.
*/
private final Condition scanFinished;
/**
* The number of rescans completed. Used to wait for scans to finish.
* Protected by the CacheReplicationMonitor lock.
*/
private long completedScanCount = 0;
/**
* The scan we're currently performing, or -1 if no scan is in progress.
* Protected by the CacheReplicationMonitor lock.
*/
private long curScanCount = -1;
/**
* The number of rescans we need to complete. Protected by the CRM lock.
*/
private long neededScanCount = 0;
/**
* True if this monitor should terminate. Protected by the CRM lock.
*/
private boolean shutdown = false;
/**
* Mark status of the current scan.
*/
private boolean mark = false;
/**
* Cache directives found in the previous scan.
*/
private int scannedDirectives;
/**
* Blocks found in the previous scan.
*/
private long scannedBlocks;
public CacheReplicationMonitor(FSNamesystem namesystem,
CacheManager cacheManager, long intervalMs, ReentrantLock lock) {
this.namesystem = namesystem;
this.blockManager = namesystem.getBlockManager();
this.cacheManager = cacheManager;
this.cachedBlocks = cacheManager.getCachedBlocks();
this.intervalMs = intervalMs;
this.lock = lock;
this.doRescan = this.lock.newCondition();
this.scanFinished = this.lock.newCondition();
}
@Override
public void run() {
long startTimeMs = 0;
Thread.currentThread().setName("CacheReplicationMonitor(" +
System.identityHashCode(this) + ")");
LOG.info("Starting CacheReplicationMonitor with interval " +
intervalMs + " milliseconds");
try {
long curTimeMs = Time.monotonicNow();
while (true) {
lock.lock();
try {
while (true) {
if (shutdown) {
LOG.info("Shutting down CacheReplicationMonitor");
return;
}
if (completedScanCount < neededScanCount) {
LOG.debug("Rescanning because of pending operations");
break;
}
long delta = (startTimeMs + intervalMs) - curTimeMs;
if (delta <= 0) {
LOG.debug("Rescanning after {} milliseconds", (curTimeMs - startTimeMs));
break;
}
doRescan.await(delta, TimeUnit.MILLISECONDS);
curTimeMs = Time.monotonicNow();
}
} finally {
lock.unlock();
}
startTimeMs = curTimeMs;
mark = !mark;
rescan();
curTimeMs = Time.monotonicNow();
// Update synchronization-related variables.
lock.lock();
try {
completedScanCount = curScanCount;
curScanCount = -1;
scanFinished.signalAll();
} finally {
lock.unlock();
}
LOG.debug("Scanned {} directive(s) and {} block(s) in {} millisecond(s).",
scannedDirectives, scannedBlocks, (curTimeMs - startTimeMs));
}
} catch (InterruptedException e) {
LOG.info("Shutting down CacheReplicationMonitor.");
return;
} catch (Throwable t) {
LOG.error("Thread exiting", t);
terminate(1, t);
}
}
/**
* Waits for a rescan to complete. This doesn't guarantee consistency with
* pending operations, only relative recency, since it will not force a new
* rescan if a rescan is already underway.
*
* Note that this call will release the FSN lock, so operations before and
* after are not atomic.
*/
public void waitForRescanIfNeeded() {
Preconditions.checkArgument(!namesystem.hasWriteLock(),
"Must not hold the FSN write lock when waiting for a rescan.");
Preconditions.checkArgument(lock.isHeldByCurrentThread(),
"Must hold the CRM lock when waiting for a rescan.");
if (neededScanCount <= completedScanCount) {
return;
}
// If no scan is already ongoing, mark the CRM as dirty and kick
if (curScanCount < 0) {
doRescan.signal();
}
// Wait until the scan finishes and the count advances
while ((!shutdown) && (completedScanCount < neededScanCount)) {
try {
scanFinished.await();
} catch (InterruptedException e) {
LOG.warn("Interrupted while waiting for CacheReplicationMonitor"
+ " rescan", e);
break;
}
}
}
/**
* Indicates to the CacheReplicationMonitor that there have been CacheManager
* changes that require a rescan.
*/
public void setNeedsRescan() {
Preconditions.checkArgument(lock.isHeldByCurrentThread(),
"Must hold the CRM lock when setting the needsRescan bit.");
if (curScanCount >= 0) {
// If there is a scan in progress, we need to wait for the scan after
// that.
neededScanCount = curScanCount + 1;
} else {
// If there is no scan in progress, we need to wait for the next scan.
neededScanCount = completedScanCount + 1;
}
}
/**
* Shut down the monitor thread.
*/
@Override
public void close() throws IOException {
Preconditions.checkArgument(namesystem.hasWriteLock());
lock.lock();
try {
if (shutdown) return;
// Since we hold both the FSN write lock and the CRM lock here,
// we know that the CRM thread cannot be currently modifying
// the cache manager state while we're closing it.
// Since the CRM thread checks the value of 'shutdown' after waiting
// for a lock, we know that the thread will not modify the cache
// manager state after this point.
shutdown = true;
doRescan.signalAll();
scanFinished.signalAll();
} finally {
lock.unlock();
}
}
private void rescan() throws InterruptedException {
scannedDirectives = 0;
scannedBlocks = 0;
try {
namesystem.writeLock();
try {
lock.lock();
if (shutdown) {
throw new InterruptedException("CacheReplicationMonitor was " +
"shut down.");
}
curScanCount = completedScanCount + 1;
} finally {
lock.unlock();
}
resetStatistics();
rescanCacheDirectives();
rescanCachedBlockMap();
blockManager.getDatanodeManager().resetLastCachingDirectiveSentTime();
} finally {
namesystem.writeUnlock();
}
}
private void resetStatistics() {
for (CachePool pool: cacheManager.getCachePools()) {
pool.resetStatistics();
}
for (CacheDirective directive: cacheManager.getCacheDirectives()) {
directive.resetStatistics();
}
}
/**
* Scan all CacheDirectives. Use the information to figure out
* what cache replication factor each block should have.
*/
private void rescanCacheDirectives() {
FSDirectory fsDir = namesystem.getFSDirectory();
final long now = new Date().getTime();
for (CacheDirective directive : cacheManager.getCacheDirectives()) {
scannedDirectives++;
// Skip processing this entry if it has expired
if (directive.getExpiryTime() > 0 && directive.getExpiryTime() <= now) {
LOG.debug("Directive {}: the directive expired at {} (now = {})",
directive.getId(), directive.getExpiryTime(), now);
continue;
}
String path = directive.getPath();
INode node;
try {
node = fsDir.getINode(path, DirOp.READ);
} catch (IOException e) {
// We don't cache through symlinks or invalid paths
LOG.debug("Directive {}: Failed to resolve path {} ({})",
directive.getId(), path, e.getMessage());
continue;
}
if (node == null) {
LOG.debug("Directive {}: No inode found at {}", directive.getId(),
path);
} else if (node.isDirectory()) {
INodeDirectory dir = node.asDirectory();
ReadOnlyList children = dir
.getChildrenList(Snapshot.CURRENT_STATE_ID);
for (INode child : children) {
if (child.isFile()) {
rescanFile(directive, child.asFile());
}
}
} else if (node.isFile()) {
rescanFile(directive, node.asFile());
} else {
LOG.debug("Directive {}: ignoring non-directive, non-file inode {} ",
directive.getId(), node);
}
}
}
/**
* Apply a CacheDirective to a file.
*
* @param directive The CacheDirective to apply.
* @param file The file.
*/
private void rescanFile(CacheDirective directive, INodeFile file) {
BlockInfo[] blockInfos = file.getBlocks();
// Increment the "needed" statistics
directive.addFilesNeeded(1);
// We don't cache UC blocks, don't add them to the total here
long neededTotal = file.computeFileSizeNotIncludingLastUcBlock() *
directive.getReplication();
directive.addBytesNeeded(neededTotal);
// The pool's bytesNeeded is incremented as we scan. If the demand
// thus far plus the demand of this file would exceed the pool's limit,
// do not cache this file.
CachePool pool = directive.getPool();
if (pool.getBytesNeeded() > pool.getLimit()) {
LOG.debug("Directive {}: not scanning file {} because " +
"bytesNeeded for pool {} is {}, but the pool's limit is {}",
directive.getId(),
file.getFullPathName(),
pool.getPoolName(),
pool.getBytesNeeded(),
pool.getLimit());
return;
}
long cachedTotal = 0;
for (BlockInfo blockInfo : blockInfos) {
if (!blockInfo.getBlockUCState().equals(BlockUCState.COMPLETE)) {
// We don't try to cache blocks that are under construction.
LOG.trace("Directive {}: can't cache block {} because it is in state "
+ "{}, not COMPLETE.", directive.getId(), blockInfo,
blockInfo.getBlockUCState()
);
continue;
}
Block block = new Block(blockInfo.getBlockId());
CachedBlock ncblock = new CachedBlock(block.getBlockId(),
directive.getReplication(), mark);
CachedBlock ocblock = cachedBlocks.get(ncblock);
if (ocblock == null) {
cachedBlocks.put(ncblock);
ocblock = ncblock;
} else {
// Update bytesUsed using the current replication levels.
// Assumptions: we assume that all the blocks are the same length
// on each datanode. We can assume this because we're only caching
// blocks in state COMPLETE.
// Note that if two directives are caching the same block(s), they will
// both get them added to their bytesCached.
List cachedOn =
ocblock.getDatanodes(Type.CACHED);
long cachedByBlock = Math.min(cachedOn.size(),
directive.getReplication()) * blockInfo.getNumBytes();
cachedTotal += cachedByBlock;
if ((mark != ocblock.getMark()) ||
(ocblock.getReplication() < directive.getReplication())) {
//
// Overwrite the block's replication and mark in two cases:
//
// 1. If the mark on the CachedBlock is different from the mark for
// this scan, that means the block hasn't been updated during this
// scan, and we should overwrite whatever is there, since it is no
// longer valid.
//
// 2. If the replication in the CachedBlock is less than what the
// directive asks for, we want to increase the block's replication
// field to what the directive asks for.
//
ocblock.setReplicationAndMark(directive.getReplication(), mark);
}
}
LOG.trace("Directive {}: setting replication for block {} to {}",
directive.getId(), blockInfo, ocblock.getReplication());
}
// Increment the "cached" statistics
directive.addBytesCached(cachedTotal);
if (cachedTotal == neededTotal) {
directive.addFilesCached(1);
}
LOG.debug("Directive {}: caching {}: {}/{} bytes", directive.getId(),
file.getFullPathName(), cachedTotal, neededTotal);
}
private String findReasonForNotCaching(CachedBlock cblock,
BlockInfo blockInfo) {
if (blockInfo == null) {
// Somehow, a cache report with the block arrived, but the block
// reports from the DataNode haven't (yet?) described such a block.
// Alternately, the NameNode might have invalidated the block, but the
// DataNode hasn't caught up. In any case, we want to tell the DN
// to uncache this.
return "not tracked by the BlockManager";
} else if (!blockInfo.isComplete()) {
// When a cached block changes state from complete to some other state
// on the DataNode (perhaps because of append), it will begin the
// uncaching process. However, the uncaching process is not
// instantaneous, especially if clients have pinned the block. So
// there may be a period of time when incomplete blocks remain cached
// on the DataNodes.
return "not complete";
} else if (cblock.getReplication() == 0) {
// Since 0 is not a valid value for a cache directive's replication
// field, seeing a replication of 0 on a CacheBlock means that it
// has never been reached by any sweep.
return "not needed by any directives";
} else if (cblock.getMark() != mark) {
// Although the block was needed in the past, we didn't reach it during
// the current sweep. Therefore, it doesn't need to be cached any more.
// Need to set the replication to 0 so it doesn't flip back to cached
// when the mark flips on the next scan
cblock.setReplicationAndMark((short)0, mark);
return "no longer needed by any directives";
}
return null;
}
/**
* Scan through the cached block map.
* Any blocks which are under-replicated should be assigned new Datanodes.
* Blocks that are over-replicated should be removed from Datanodes.
*/
private void rescanCachedBlockMap() {
// Remove pendingCached blocks that will make DN out-of-capacity.
Set datanodes =
blockManager.getDatanodeManager().getDatanodes();
for (DatanodeDescriptor dn : datanodes) {
long remaining = dn.getCacheRemaining();
for (Iterator it = dn.getPendingCached().iterator();
it.hasNext();) {
CachedBlock cblock = it.next();
BlockInfo blockInfo = blockManager.
getStoredBlock(new Block(cblock.getBlockId()));
if (blockInfo == null) {
// Cannot find this block on the NameNode, skip this block from
// capacity calculation. Later logic will handle this block.
LOG.debug("Block {}: cannot be found in block manager and hence"
+ " skipped from calculation for node {}.", cblock.getBlockId(),
dn.getDatanodeUuid());
continue;
}
if (blockInfo.getNumBytes() > remaining) {
LOG.debug("Block {}: removing from PENDING_CACHED for node {} "
+ "because it cannot fit in remaining cache size {}.",
cblock.getBlockId(), dn.getDatanodeUuid(), remaining);
it.remove();
} else {
remaining -= blockInfo.getNumBytes();
}
}
}
for (Iterator cbIter = cachedBlocks.iterator();
cbIter.hasNext(); ) {
scannedBlocks++;
CachedBlock cblock = cbIter.next();
List pendingCached =
cblock.getDatanodes(Type.PENDING_CACHED);
List cached =
cblock.getDatanodes(Type.CACHED);
List pendingUncached =
cblock.getDatanodes(Type.PENDING_UNCACHED);
// Remove nodes from PENDING_UNCACHED if they were actually uncached.
for (Iterator iter = pendingUncached.iterator();
iter.hasNext(); ) {
DatanodeDescriptor datanode = iter.next();
if (!cblock.isInList(datanode.getCached())) {
LOG.trace("Block {}: removing from PENDING_UNCACHED for node {} "
+ "because the DataNode uncached it.", cblock.getBlockId(),
datanode.getDatanodeUuid());
datanode.getPendingUncached().remove(cblock);
iter.remove();
}
}
BlockInfo blockInfo = blockManager.
getStoredBlock(new Block(cblock.getBlockId()));
String reason = findReasonForNotCaching(cblock, blockInfo);
int neededCached = 0;
if (reason != null) {
LOG.trace("Block {}: can't cache block because it is {}",
cblock.getBlockId(), reason);
} else {
neededCached = cblock.getReplication();
}
int numCached = cached.size();
if (numCached >= neededCached) {
// If we have enough replicas, drop all pending cached.
for (Iterator iter = pendingCached.iterator();
iter.hasNext(); ) {
DatanodeDescriptor datanode = iter.next();
datanode.getPendingCached().remove(cblock);
iter.remove();
LOG.trace("Block {}: removing from PENDING_CACHED for node {} "
+ "because we already have {} cached replicas and we only" +
" need {}",
cblock.getBlockId(), datanode.getDatanodeUuid(), numCached,
neededCached
);
}
}
if (numCached < neededCached) {
// If we don't have enough replicas, drop all pending uncached.
for (Iterator iter = pendingUncached.iterator();
iter.hasNext(); ) {
DatanodeDescriptor datanode = iter.next();
datanode.getPendingUncached().remove(cblock);
iter.remove();
LOG.trace("Block {}: removing from PENDING_UNCACHED for node {} "
+ "because we only have {} cached replicas and we need " +
"{}", cblock.getBlockId(), datanode.getDatanodeUuid(),
numCached, neededCached
);
}
}
int neededUncached = numCached -
(pendingUncached.size() + neededCached);
if (neededUncached > 0) {
addNewPendingUncached(neededUncached, cblock, cached,
pendingUncached);
} else {
int additionalCachedNeeded = neededCached -
(numCached + pendingCached.size());
if (additionalCachedNeeded > 0) {
addNewPendingCached(additionalCachedNeeded, cblock, cached,
pendingCached);
}
}
if ((neededCached == 0) &&
pendingUncached.isEmpty() &&
pendingCached.isEmpty()) {
// we have nothing more to do with this block.
LOG.trace("Block {}: removing from cachedBlocks, since neededCached "
+ "== 0, and pendingUncached and pendingCached are empty.",
cblock.getBlockId()
);
cbIter.remove();
}
}
}
/**
* Add new entries to the PendingUncached list.
*
* @param neededUncached The number of replicas that need to be uncached.
* @param cachedBlock The block which needs to be uncached.
* @param cached A list of DataNodes currently caching the block.
* @param pendingUncached A list of DataNodes that will soon uncache the
* block.
*/
private void addNewPendingUncached(int neededUncached,
CachedBlock cachedBlock, List cached,
List pendingUncached) {
// Figure out which replicas can be uncached.
LinkedList possibilities =
new LinkedList();
for (DatanodeDescriptor datanode : cached) {
if (!pendingUncached.contains(datanode)) {
possibilities.add(datanode);
}
}
while (neededUncached > 0) {
if (possibilities.isEmpty()) {
LOG.warn("Logic error: we're trying to uncache more replicas than " +
"actually exist for " + cachedBlock);
return;
}
DatanodeDescriptor datanode =
possibilities.remove(random.nextInt(possibilities.size()));
pendingUncached.add(datanode);
boolean added = datanode.getPendingUncached().add(cachedBlock);
assert added;
neededUncached--;
}
}
/**
* Add new entries to the PendingCached list.
*
* @param neededCached The number of replicas that need to be cached.
* @param cachedBlock The block which needs to be cached.
* @param cached A list of DataNodes currently caching the block.
* @param pendingCached A list of DataNodes that will soon cache the
* block.
*/
private void addNewPendingCached(final int neededCached,
CachedBlock cachedBlock, List cached,
List pendingCached) {
// To figure out which replicas can be cached, we consult the
// blocksMap. We don't want to try to cache a corrupt replica, though.
BlockInfo blockInfo = blockManager.
getStoredBlock(new Block(cachedBlock.getBlockId()));
if (blockInfo == null) {
LOG.debug("Block {}: can't add new cached replicas," +
" because there is no record of this block " +
"on the NameNode.", cachedBlock.getBlockId());
return;
}
if (!blockInfo.isComplete()) {
LOG.debug("Block {}: can't cache this block, because it is not yet"
+ " complete.", cachedBlock.getBlockId());
return;
}
// Filter the list of replicas to only the valid targets
List possibilities =
new LinkedList();
int numReplicas = blockInfo.getCapacity();
Collection corrupt =
blockManager.getCorruptReplicas(blockInfo);
int outOfCapacity = 0;
for (int i = 0; i < numReplicas; i++) {
DatanodeDescriptor datanode = blockInfo.getDatanode(i);
if (datanode == null) {
continue;
}
if (!datanode.isInService()) {
continue;
}
if (corrupt != null && corrupt.contains(datanode)) {
continue;
}
if (pendingCached.contains(datanode) || cached.contains(datanode)) {
continue;
}
long pendingBytes = 0;
// Subtract pending cached blocks from effective capacity
Iterator it = datanode.getPendingCached().iterator();
while (it.hasNext()) {
CachedBlock cBlock = it.next();
BlockInfo info =
blockManager.getStoredBlock(new Block(cBlock.getBlockId()));
if (info != null) {
pendingBytes -= info.getNumBytes();
}
}
it = datanode.getPendingUncached().iterator();
// Add pending uncached blocks from effective capacity
while (it.hasNext()) {
CachedBlock cBlock = it.next();
BlockInfo info =
blockManager.getStoredBlock(new Block(cBlock.getBlockId()));
if (info != null) {
pendingBytes += info.getNumBytes();
}
}
long pendingCapacity = pendingBytes + datanode.getCacheRemaining();
if (pendingCapacity < blockInfo.getNumBytes()) {
LOG.trace("Block {}: DataNode {} is not a valid possibility " +
"because the block has size {}, but the DataNode only has {} " +
"bytes of cache remaining ({} pending bytes, {} already cached.)",
blockInfo.getBlockId(), datanode.getDatanodeUuid(),
blockInfo.getNumBytes(), pendingCapacity, pendingBytes,
datanode.getCacheRemaining());
outOfCapacity++;
continue;
}
possibilities.add(datanode);
}
List chosen = chooseDatanodesForCaching(possibilities,
neededCached, blockManager.getDatanodeManager().getStaleInterval());
for (DatanodeDescriptor datanode : chosen) {
LOG.trace("Block {}: added to PENDING_CACHED on DataNode {}",
blockInfo.getBlockId(), datanode.getDatanodeUuid());
pendingCached.add(datanode);
boolean added = datanode.getPendingCached().add(cachedBlock);
assert added;
}
// We were unable to satisfy the requested replication factor
if (neededCached > chosen.size()) {
LOG.debug("Block {}: we only have {} of {} cached replicas."
+ " {} DataNodes have insufficient cache capacity.",
blockInfo.getBlockId(),
(cachedBlock.getReplication() - neededCached + chosen.size()),
cachedBlock.getReplication(), outOfCapacity
);
}
}
/**
* Chooses datanode locations for caching from a list of valid possibilities.
* Non-stale nodes are chosen before stale nodes.
*
* @param possibilities List of candidate datanodes
* @param neededCached Number of replicas needed
* @param staleInterval Age of a stale datanode
* @return A list of chosen datanodes
*/
private static List chooseDatanodesForCaching(
final List possibilities, final int neededCached,
final long staleInterval) {
// Make a copy that we can modify
List targets =
new ArrayList(possibilities);
// Selected targets
List chosen = new LinkedList();
// Filter out stale datanodes
List stale = new LinkedList();
Iterator it = targets.iterator();
while (it.hasNext()) {
DatanodeDescriptor d = it.next();
if (d.isStale(staleInterval)) {
it.remove();
stale.add(d);
}
}
// Select targets
while (chosen.size() < neededCached) {
// Try to use stale nodes if we're out of non-stale nodes, else we're done
if (targets.isEmpty()) {
if (!stale.isEmpty()) {
targets = stale;
} else {
break;
}
}
// Select a random target
DatanodeDescriptor target =
chooseRandomDatanodeByRemainingCapacity(targets);
chosen.add(target);
targets.remove(target);
}
return chosen;
}
/**
* Choose a single datanode from the provided list of possible
* targets, weighted by the percentage of free space remaining on the node.
*
* @return The chosen datanode
*/
private static DatanodeDescriptor chooseRandomDatanodeByRemainingCapacity(
final List targets) {
// Use a weighted probability to choose the target datanode
float total = 0;
for (DatanodeDescriptor d : targets) {
total += d.getCacheRemainingPercent();
}
// Give each datanode a portion of keyspace equal to its relative weight
// [0, w1) selects d1, [w1, w2) selects d2, etc.
TreeMap lottery =
new TreeMap();
int offset = 0;
for (DatanodeDescriptor d : targets) {
// Since we're using floats, be paranoid about negative values
int weight =
Math.max(1, (int)((d.getCacheRemainingPercent() / total) * 1000000));
offset += weight;
lottery.put(offset, d);
}
// Choose a number from [0, offset), which is the total amount of weight,
// to select the winner
DatanodeDescriptor winner =
lottery.higherEntry(random.nextInt(offset)).getValue();
return winner;
}
}