All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.blockmanagement;

import static org.apache.hadoop.util.ExitUtil.terminate;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.Random;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.CacheDirective;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.namenode.CacheManager;
import org.apache.hadoop.hdfs.server.namenode.CachePool;
import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
import org.apache.hadoop.hdfs.server.namenode.FSDirectory.DirOp;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.util.ReadOnlyList;
import org.apache.hadoop.util.GSet;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;

/**
 * Scans the namesystem, scheduling blocks to be cached as appropriate.
 *
 * The CacheReplicationMonitor does a full scan when the NameNode first
 * starts up, and at configurable intervals afterwards.
 */
@InterfaceAudience.LimitedPrivate({"HDFS"})
public class CacheReplicationMonitor extends Thread implements Closeable {

  private static final Logger LOG =
      LoggerFactory.getLogger(CacheReplicationMonitor.class);

  private final FSNamesystem namesystem;

  private final BlockManager blockManager;

  private final CacheManager cacheManager;

  private final GSet cachedBlocks;

  /**
   * Pseudorandom number source
   */
  private static final Random random = new Random();

  /**
   * The interval at which we scan the namesystem for caching changes.
   */
  private final long intervalMs;

  /**
   * The CacheReplicationMonitor (CRM) lock. Used to synchronize starting and
   * waiting for rescan operations.
   */
  private final ReentrantLock lock;

  /**
   * Notifies the scan thread that an immediate rescan is needed.
   */
  private final Condition doRescan;

  /**
   * Notifies waiting threads that a rescan has finished.
   */
  private final Condition scanFinished;

  /**
   * The number of rescans completed. Used to wait for scans to finish.
   * Protected by the CacheReplicationMonitor lock.
   */
  private long completedScanCount = 0;

  /**
   * The scan we're currently performing, or -1 if no scan is in progress.
   * Protected by the CacheReplicationMonitor lock.
   */
  private long curScanCount = -1;

  /**
   * The number of rescans we need to complete.  Protected by the CRM lock.
   */
  private long neededScanCount = 0;

  /**
   * True if this monitor should terminate. Protected by the CRM lock.
   */
  private boolean shutdown = false;

  /**
   * Mark status of the current scan.
   */
  private boolean mark = false;

  /**
   * Cache directives found in the previous scan.
   */
  private int scannedDirectives;

  /**
   * Blocks found in the previous scan.
   */
  private long scannedBlocks;

  public CacheReplicationMonitor(FSNamesystem namesystem,
      CacheManager cacheManager, long intervalMs, ReentrantLock lock) {
    this.namesystem = namesystem;
    this.blockManager = namesystem.getBlockManager();
    this.cacheManager = cacheManager;
    this.cachedBlocks = cacheManager.getCachedBlocks();
    this.intervalMs = intervalMs;
    this.lock = lock;
    this.doRescan = this.lock.newCondition();
    this.scanFinished = this.lock.newCondition();
  }

  @Override
  public void run() {
    long startTimeMs = 0;
    Thread.currentThread().setName("CacheReplicationMonitor(" +
        System.identityHashCode(this) + ")");
    LOG.info("Starting CacheReplicationMonitor with interval " +
             intervalMs + " milliseconds");
    try {
      long curTimeMs = Time.monotonicNow();
      while (true) {
        lock.lock();
        try {
          while (true) {
            if (shutdown) {
              LOG.info("Shutting down CacheReplicationMonitor");
              return;
            }
            if (completedScanCount < neededScanCount) {
              LOG.debug("Rescanning because of pending operations");
              break;
            }
            long delta = (startTimeMs + intervalMs) - curTimeMs;
            if (delta <= 0) {
              LOG.debug("Rescanning after {} milliseconds", (curTimeMs - startTimeMs));
              break;
            }
            doRescan.await(delta, TimeUnit.MILLISECONDS);
            curTimeMs = Time.monotonicNow();
          }
        } finally {
          lock.unlock();
        }
        startTimeMs = curTimeMs;
        mark = !mark;
        rescan();
        curTimeMs = Time.monotonicNow();
        // Update synchronization-related variables.
        lock.lock();
        try {
          completedScanCount = curScanCount;
          curScanCount = -1;
          scanFinished.signalAll();
        } finally {
          lock.unlock();
        }
        LOG.debug("Scanned {} directive(s) and {} block(s) in {} millisecond(s).",
            scannedDirectives, scannedBlocks, (curTimeMs - startTimeMs));
      }
    } catch (InterruptedException e) {
      LOG.info("Shutting down CacheReplicationMonitor.");
      return;
    } catch (Throwable t) {
      LOG.error("Thread exiting", t);
      terminate(1, t);
    }
  }

  /**
   * Waits for a rescan to complete. This doesn't guarantee consistency with
   * pending operations, only relative recency, since it will not force a new
   * rescan if a rescan is already underway.
   * 

* Note that this call will release the FSN lock, so operations before and * after are not atomic. */ public void waitForRescanIfNeeded() { Preconditions.checkArgument(!namesystem.hasWriteLock(), "Must not hold the FSN write lock when waiting for a rescan."); Preconditions.checkArgument(lock.isHeldByCurrentThread(), "Must hold the CRM lock when waiting for a rescan."); if (neededScanCount <= completedScanCount) { return; } // If no scan is already ongoing, mark the CRM as dirty and kick if (curScanCount < 0) { doRescan.signal(); } // Wait until the scan finishes and the count advances while ((!shutdown) && (completedScanCount < neededScanCount)) { try { scanFinished.await(); } catch (InterruptedException e) { LOG.warn("Interrupted while waiting for CacheReplicationMonitor" + " rescan", e); break; } } } /** * Indicates to the CacheReplicationMonitor that there have been CacheManager * changes that require a rescan. */ public void setNeedsRescan() { Preconditions.checkArgument(lock.isHeldByCurrentThread(), "Must hold the CRM lock when setting the needsRescan bit."); if (curScanCount >= 0) { // If there is a scan in progress, we need to wait for the scan after // that. neededScanCount = curScanCount + 1; } else { // If there is no scan in progress, we need to wait for the next scan. neededScanCount = completedScanCount + 1; } } /** * Shut down the monitor thread. */ @Override public void close() throws IOException { Preconditions.checkArgument(namesystem.hasWriteLock()); lock.lock(); try { if (shutdown) return; // Since we hold both the FSN write lock and the CRM lock here, // we know that the CRM thread cannot be currently modifying // the cache manager state while we're closing it. // Since the CRM thread checks the value of 'shutdown' after waiting // for a lock, we know that the thread will not modify the cache // manager state after this point. shutdown = true; doRescan.signalAll(); scanFinished.signalAll(); } finally { lock.unlock(); } } private void rescan() throws InterruptedException { scannedDirectives = 0; scannedBlocks = 0; try { namesystem.writeLock(); try { lock.lock(); if (shutdown) { throw new InterruptedException("CacheReplicationMonitor was " + "shut down."); } curScanCount = completedScanCount + 1; } finally { lock.unlock(); } resetStatistics(); rescanCacheDirectives(); rescanCachedBlockMap(); blockManager.getDatanodeManager().resetLastCachingDirectiveSentTime(); } finally { namesystem.writeUnlock(); } } private void resetStatistics() { for (CachePool pool: cacheManager.getCachePools()) { pool.resetStatistics(); } for (CacheDirective directive: cacheManager.getCacheDirectives()) { directive.resetStatistics(); } } /** * Scan all CacheDirectives. Use the information to figure out * what cache replication factor each block should have. */ private void rescanCacheDirectives() { FSDirectory fsDir = namesystem.getFSDirectory(); final long now = new Date().getTime(); for (CacheDirective directive : cacheManager.getCacheDirectives()) { scannedDirectives++; // Skip processing this entry if it has expired if (directive.getExpiryTime() > 0 && directive.getExpiryTime() <= now) { LOG.debug("Directive {}: the directive expired at {} (now = {})", directive.getId(), directive.getExpiryTime(), now); continue; } String path = directive.getPath(); INode node; try { node = fsDir.getINode(path, DirOp.READ); } catch (IOException e) { // We don't cache through symlinks or invalid paths LOG.debug("Directive {}: Failed to resolve path {} ({})", directive.getId(), path, e.getMessage()); continue; } if (node == null) { LOG.debug("Directive {}: No inode found at {}", directive.getId(), path); } else if (node.isDirectory()) { INodeDirectory dir = node.asDirectory(); ReadOnlyList children = dir .getChildrenList(Snapshot.CURRENT_STATE_ID); for (INode child : children) { if (child.isFile()) { rescanFile(directive, child.asFile()); } } } else if (node.isFile()) { rescanFile(directive, node.asFile()); } else { LOG.debug("Directive {}: ignoring non-directive, non-file inode {} ", directive.getId(), node); } } } /** * Apply a CacheDirective to a file. * * @param directive The CacheDirective to apply. * @param file The file. */ private void rescanFile(CacheDirective directive, INodeFile file) { BlockInfo[] blockInfos = file.getBlocks(); // Increment the "needed" statistics directive.addFilesNeeded(1); // We don't cache UC blocks, don't add them to the total here long neededTotal = file.computeFileSizeNotIncludingLastUcBlock() * directive.getReplication(); directive.addBytesNeeded(neededTotal); // The pool's bytesNeeded is incremented as we scan. If the demand // thus far plus the demand of this file would exceed the pool's limit, // do not cache this file. CachePool pool = directive.getPool(); if (pool.getBytesNeeded() > pool.getLimit()) { LOG.debug("Directive {}: not scanning file {} because " + "bytesNeeded for pool {} is {}, but the pool's limit is {}", directive.getId(), file.getFullPathName(), pool.getPoolName(), pool.getBytesNeeded(), pool.getLimit()); return; } long cachedTotal = 0; for (BlockInfo blockInfo : blockInfos) { if (!blockInfo.getBlockUCState().equals(BlockUCState.COMPLETE)) { // We don't try to cache blocks that are under construction. LOG.trace("Directive {}: can't cache block {} because it is in state " + "{}, not COMPLETE.", directive.getId(), blockInfo, blockInfo.getBlockUCState() ); continue; } Block block = new Block(blockInfo.getBlockId()); CachedBlock ncblock = new CachedBlock(block.getBlockId(), directive.getReplication(), mark); CachedBlock ocblock = cachedBlocks.get(ncblock); if (ocblock == null) { cachedBlocks.put(ncblock); ocblock = ncblock; } else { // Update bytesUsed using the current replication levels. // Assumptions: we assume that all the blocks are the same length // on each datanode. We can assume this because we're only caching // blocks in state COMPLETE. // Note that if two directives are caching the same block(s), they will // both get them added to their bytesCached. List cachedOn = ocblock.getDatanodes(Type.CACHED); long cachedByBlock = Math.min(cachedOn.size(), directive.getReplication()) * blockInfo.getNumBytes(); cachedTotal += cachedByBlock; if ((mark != ocblock.getMark()) || (ocblock.getReplication() < directive.getReplication())) { // // Overwrite the block's replication and mark in two cases: // // 1. If the mark on the CachedBlock is different from the mark for // this scan, that means the block hasn't been updated during this // scan, and we should overwrite whatever is there, since it is no // longer valid. // // 2. If the replication in the CachedBlock is less than what the // directive asks for, we want to increase the block's replication // field to what the directive asks for. // ocblock.setReplicationAndMark(directive.getReplication(), mark); } } LOG.trace("Directive {}: setting replication for block {} to {}", directive.getId(), blockInfo, ocblock.getReplication()); } // Increment the "cached" statistics directive.addBytesCached(cachedTotal); if (cachedTotal == neededTotal) { directive.addFilesCached(1); } LOG.debug("Directive {}: caching {}: {}/{} bytes", directive.getId(), file.getFullPathName(), cachedTotal, neededTotal); } private String findReasonForNotCaching(CachedBlock cblock, BlockInfo blockInfo) { if (blockInfo == null) { // Somehow, a cache report with the block arrived, but the block // reports from the DataNode haven't (yet?) described such a block. // Alternately, the NameNode might have invalidated the block, but the // DataNode hasn't caught up. In any case, we want to tell the DN // to uncache this. return "not tracked by the BlockManager"; } else if (!blockInfo.isComplete()) { // When a cached block changes state from complete to some other state // on the DataNode (perhaps because of append), it will begin the // uncaching process. However, the uncaching process is not // instantaneous, especially if clients have pinned the block. So // there may be a period of time when incomplete blocks remain cached // on the DataNodes. return "not complete"; } else if (cblock.getReplication() == 0) { // Since 0 is not a valid value for a cache directive's replication // field, seeing a replication of 0 on a CacheBlock means that it // has never been reached by any sweep. return "not needed by any directives"; } else if (cblock.getMark() != mark) { // Although the block was needed in the past, we didn't reach it during // the current sweep. Therefore, it doesn't need to be cached any more. // Need to set the replication to 0 so it doesn't flip back to cached // when the mark flips on the next scan cblock.setReplicationAndMark((short)0, mark); return "no longer needed by any directives"; } return null; } /** * Scan through the cached block map. * Any blocks which are under-replicated should be assigned new Datanodes. * Blocks that are over-replicated should be removed from Datanodes. */ private void rescanCachedBlockMap() { // Remove pendingCached blocks that will make DN out-of-capacity. Set datanodes = blockManager.getDatanodeManager().getDatanodes(); for (DatanodeDescriptor dn : datanodes) { long remaining = dn.getCacheRemaining(); for (Iterator it = dn.getPendingCached().iterator(); it.hasNext();) { CachedBlock cblock = it.next(); BlockInfo blockInfo = blockManager. getStoredBlock(new Block(cblock.getBlockId())); if (blockInfo == null) { // Cannot find this block on the NameNode, skip this block from // capacity calculation. Later logic will handle this block. LOG.debug("Block {}: cannot be found in block manager and hence" + " skipped from calculation for node {}.", cblock.getBlockId(), dn.getDatanodeUuid()); continue; } if (blockInfo.getNumBytes() > remaining) { LOG.debug("Block {}: removing from PENDING_CACHED for node {} " + "because it cannot fit in remaining cache size {}.", cblock.getBlockId(), dn.getDatanodeUuid(), remaining); it.remove(); } else { remaining -= blockInfo.getNumBytes(); } } } for (Iterator cbIter = cachedBlocks.iterator(); cbIter.hasNext(); ) { scannedBlocks++; CachedBlock cblock = cbIter.next(); List pendingCached = cblock.getDatanodes(Type.PENDING_CACHED); List cached = cblock.getDatanodes(Type.CACHED); List pendingUncached = cblock.getDatanodes(Type.PENDING_UNCACHED); // Remove nodes from PENDING_UNCACHED if they were actually uncached. for (Iterator iter = pendingUncached.iterator(); iter.hasNext(); ) { DatanodeDescriptor datanode = iter.next(); if (!cblock.isInList(datanode.getCached())) { LOG.trace("Block {}: removing from PENDING_UNCACHED for node {} " + "because the DataNode uncached it.", cblock.getBlockId(), datanode.getDatanodeUuid()); datanode.getPendingUncached().remove(cblock); iter.remove(); } } BlockInfo blockInfo = blockManager. getStoredBlock(new Block(cblock.getBlockId())); String reason = findReasonForNotCaching(cblock, blockInfo); int neededCached = 0; if (reason != null) { LOG.trace("Block {}: can't cache block because it is {}", cblock.getBlockId(), reason); } else { neededCached = cblock.getReplication(); } int numCached = cached.size(); if (numCached >= neededCached) { // If we have enough replicas, drop all pending cached. for (Iterator iter = pendingCached.iterator(); iter.hasNext(); ) { DatanodeDescriptor datanode = iter.next(); datanode.getPendingCached().remove(cblock); iter.remove(); LOG.trace("Block {}: removing from PENDING_CACHED for node {} " + "because we already have {} cached replicas and we only" + " need {}", cblock.getBlockId(), datanode.getDatanodeUuid(), numCached, neededCached ); } } if (numCached < neededCached) { // If we don't have enough replicas, drop all pending uncached. for (Iterator iter = pendingUncached.iterator(); iter.hasNext(); ) { DatanodeDescriptor datanode = iter.next(); datanode.getPendingUncached().remove(cblock); iter.remove(); LOG.trace("Block {}: removing from PENDING_UNCACHED for node {} " + "because we only have {} cached replicas and we need " + "{}", cblock.getBlockId(), datanode.getDatanodeUuid(), numCached, neededCached ); } } int neededUncached = numCached - (pendingUncached.size() + neededCached); if (neededUncached > 0) { addNewPendingUncached(neededUncached, cblock, cached, pendingUncached); } else { int additionalCachedNeeded = neededCached - (numCached + pendingCached.size()); if (additionalCachedNeeded > 0) { addNewPendingCached(additionalCachedNeeded, cblock, cached, pendingCached); } } if ((neededCached == 0) && pendingUncached.isEmpty() && pendingCached.isEmpty()) { // we have nothing more to do with this block. LOG.trace("Block {}: removing from cachedBlocks, since neededCached " + "== 0, and pendingUncached and pendingCached are empty.", cblock.getBlockId() ); cbIter.remove(); } } } /** * Add new entries to the PendingUncached list. * * @param neededUncached The number of replicas that need to be uncached. * @param cachedBlock The block which needs to be uncached. * @param cached A list of DataNodes currently caching the block. * @param pendingUncached A list of DataNodes that will soon uncache the * block. */ private void addNewPendingUncached(int neededUncached, CachedBlock cachedBlock, List cached, List pendingUncached) { // Figure out which replicas can be uncached. LinkedList possibilities = new LinkedList(); for (DatanodeDescriptor datanode : cached) { if (!pendingUncached.contains(datanode)) { possibilities.add(datanode); } } while (neededUncached > 0) { if (possibilities.isEmpty()) { LOG.warn("Logic error: we're trying to uncache more replicas than " + "actually exist for " + cachedBlock); return; } DatanodeDescriptor datanode = possibilities.remove(random.nextInt(possibilities.size())); pendingUncached.add(datanode); boolean added = datanode.getPendingUncached().add(cachedBlock); assert added; neededUncached--; } } /** * Add new entries to the PendingCached list. * * @param neededCached The number of replicas that need to be cached. * @param cachedBlock The block which needs to be cached. * @param cached A list of DataNodes currently caching the block. * @param pendingCached A list of DataNodes that will soon cache the * block. */ private void addNewPendingCached(final int neededCached, CachedBlock cachedBlock, List cached, List pendingCached) { // To figure out which replicas can be cached, we consult the // blocksMap. We don't want to try to cache a corrupt replica, though. BlockInfo blockInfo = blockManager. getStoredBlock(new Block(cachedBlock.getBlockId())); if (blockInfo == null) { LOG.debug("Block {}: can't add new cached replicas," + " because there is no record of this block " + "on the NameNode.", cachedBlock.getBlockId()); return; } if (!blockInfo.isComplete()) { LOG.debug("Block {}: can't cache this block, because it is not yet" + " complete.", cachedBlock.getBlockId()); return; } // Filter the list of replicas to only the valid targets List possibilities = new LinkedList(); int numReplicas = blockInfo.getCapacity(); Collection corrupt = blockManager.getCorruptReplicas(blockInfo); int outOfCapacity = 0; for (int i = 0; i < numReplicas; i++) { DatanodeDescriptor datanode = blockInfo.getDatanode(i); if (datanode == null) { continue; } if (!datanode.isInService()) { continue; } if (corrupt != null && corrupt.contains(datanode)) { continue; } if (pendingCached.contains(datanode) || cached.contains(datanode)) { continue; } long pendingBytes = 0; // Subtract pending cached blocks from effective capacity Iterator it = datanode.getPendingCached().iterator(); while (it.hasNext()) { CachedBlock cBlock = it.next(); BlockInfo info = blockManager.getStoredBlock(new Block(cBlock.getBlockId())); if (info != null) { pendingBytes -= info.getNumBytes(); } } it = datanode.getPendingUncached().iterator(); // Add pending uncached blocks from effective capacity while (it.hasNext()) { CachedBlock cBlock = it.next(); BlockInfo info = blockManager.getStoredBlock(new Block(cBlock.getBlockId())); if (info != null) { pendingBytes += info.getNumBytes(); } } long pendingCapacity = pendingBytes + datanode.getCacheRemaining(); if (pendingCapacity < blockInfo.getNumBytes()) { LOG.trace("Block {}: DataNode {} is not a valid possibility " + "because the block has size {}, but the DataNode only has {} " + "bytes of cache remaining ({} pending bytes, {} already cached.)", blockInfo.getBlockId(), datanode.getDatanodeUuid(), blockInfo.getNumBytes(), pendingCapacity, pendingBytes, datanode.getCacheRemaining()); outOfCapacity++; continue; } possibilities.add(datanode); } List chosen = chooseDatanodesForCaching(possibilities, neededCached, blockManager.getDatanodeManager().getStaleInterval()); for (DatanodeDescriptor datanode : chosen) { LOG.trace("Block {}: added to PENDING_CACHED on DataNode {}", blockInfo.getBlockId(), datanode.getDatanodeUuid()); pendingCached.add(datanode); boolean added = datanode.getPendingCached().add(cachedBlock); assert added; } // We were unable to satisfy the requested replication factor if (neededCached > chosen.size()) { LOG.debug("Block {}: we only have {} of {} cached replicas." + " {} DataNodes have insufficient cache capacity.", blockInfo.getBlockId(), (cachedBlock.getReplication() - neededCached + chosen.size()), cachedBlock.getReplication(), outOfCapacity ); } } /** * Chooses datanode locations for caching from a list of valid possibilities. * Non-stale nodes are chosen before stale nodes. * * @param possibilities List of candidate datanodes * @param neededCached Number of replicas needed * @param staleInterval Age of a stale datanode * @return A list of chosen datanodes */ private static List chooseDatanodesForCaching( final List possibilities, final int neededCached, final long staleInterval) { // Make a copy that we can modify List targets = new ArrayList(possibilities); // Selected targets List chosen = new LinkedList(); // Filter out stale datanodes List stale = new LinkedList(); Iterator it = targets.iterator(); while (it.hasNext()) { DatanodeDescriptor d = it.next(); if (d.isStale(staleInterval)) { it.remove(); stale.add(d); } } // Select targets while (chosen.size() < neededCached) { // Try to use stale nodes if we're out of non-stale nodes, else we're done if (targets.isEmpty()) { if (!stale.isEmpty()) { targets = stale; } else { break; } } // Select a random target DatanodeDescriptor target = chooseRandomDatanodeByRemainingCapacity(targets); chosen.add(target); targets.remove(target); } return chosen; } /** * Choose a single datanode from the provided list of possible * targets, weighted by the percentage of free space remaining on the node. * * @return The chosen datanode */ private static DatanodeDescriptor chooseRandomDatanodeByRemainingCapacity( final List targets) { // Use a weighted probability to choose the target datanode float total = 0; for (DatanodeDescriptor d : targets) { total += d.getCacheRemainingPercent(); } // Give each datanode a portion of keyspace equal to its relative weight // [0, w1) selects d1, [w1, w2) selects d2, etc. TreeMap lottery = new TreeMap(); int offset = 0; for (DatanodeDescriptor d : targets) { // Since we're using floats, be paranoid about negative values int weight = Math.max(1, (int)((d.getCacheRemainingPercent() / total) * 1000000)); offset += weight; lottery.put(offset, d); } // Choose a number from [0, offset), which is the total amount of weight, // to select the winner DatanodeDescriptor winner = lottery.higherEntry(random.nextInt(offset)).getValue(); return winner; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy