/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.NumberReplicas;
import org.apache.hadoop.hdfs.server.namenode.UnderReplicatedBlocks.BlockIterator;
import org.apache.hadoop.hdfs.DFSConfigKeys;

/**
 * Keeps information related to the blocks stored in the Hadoop cluster.
 * This class is a helper class for {@link FSNamesystem} and requires several
 * of its methods to be called with the {@link FSNamesystem} lock held.
 */
@InterfaceAudience.Private
public class BlockManager {
  // Default initial capacity and load factor of map
  public static final int DEFAULT_INITIAL_MAP_CAPACITY = 16;
  public static final float DEFAULT_MAP_LOAD_FACTOR = 0.75f;
  public static final int DEFAULT_MAX_CORRUPT_FILES_RETURNED = 500;

  private final FSNamesystem namesystem;

  volatile long pendingReplicationBlocksCount = 0L;
  volatile long corruptReplicaBlocksCount = 0L;
  volatile long underReplicatedBlocksCount = 0L;
  volatile long scheduledReplicationBlocksCount = 0L;
  volatile long excessBlocksCount = 0L;
  volatile long pendingDeletionBlocksCount = 0L;

  //
  // Mapping: Block -> { INode, datanodes, self ref }
  // Updated only in response to client-sent information.
  //
  final BlocksMap blocksMap;

  //
  // Store blocks-->datanodedescriptor(s) map of corrupt replicas
  //
  CorruptReplicasMap corruptReplicas = new CorruptReplicasMap();

  //
  // Keeps a Collection for every named machine containing
  // blocks that have recently been invalidated and are thought to live
  // on the machine in question.
  // Mapping: StorageID -> Collection<Block>
  //
  Map<String, Collection<Block>> recentInvalidateSets =
    new TreeMap<String, Collection<Block>>();

  //
  // Keeps a TreeSet for every named node. Each treeset contains
  // a list of the blocks that are "extra" at that location. We'll
  // eventually remove these extras.
  // Mapping: StorageID -> TreeSet<Block>
  //
  Map<String, Collection<Block>> excessReplicateMap =
    new TreeMap<String, Collection<Block>>();

  //
  // Store set of Blocks that need to be replicated 1 or more times.
  // We also store pending replication-orders.
  //
  UnderReplicatedBlocks neededReplications = new UnderReplicatedBlocks();
  private PendingReplicationBlocks pendingReplications;

  //  The maximum number of replicas allowed for a block
  int maxReplication;
  //  How many outgoing replication streams a given node should have at one time
  int maxReplicationStreams;
  // Minimum copies needed or else write is disallowed
  int minReplication;
  // Default number of replicas
  int defaultReplication;
  // How many entries are returned by getCorruptInodes()
  int maxCorruptFilesReturned;
  
  // variable to enable check for enough racks 
  boolean shouldCheckForEnoughRacks = true;

  /**
   * Last block index used for replication work.
   */
  private int replIndex = 0;
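  // Number of missing blocks found during the current and the previous
  // scans of the neededReplications queue (see getMissingBlocksCount()).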
  private long missingBlocksInCurIter = 0;
  private long missingBlocksInPrevIter = 0;
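  // Used to randomize the choice of replication source nodes and the order
  // in which datanodes are picked for invalidation work.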
  Random r = new Random();

  // for block replicas placement
  BlockPlacementPolicy replicator;

  BlockManager(FSNamesystem fsn, Configuration conf) throws IOException {
    this(fsn, conf, DEFAULT_INITIAL_MAP_CAPACITY);
  }
  
  BlockManager(FSNamesystem fsn, Configuration conf, int capacity)
      throws IOException {
    namesystem = fsn;
    pendingReplications = new PendingReplicationBlocks(
        conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
            DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_DEFAULT) * 1000L);
    setConfigurationParameters(conf);
    blocksMap = new BlocksMap(capacity, DEFAULT_MAP_LOAD_FACTOR);
  }
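
  /*
   * Typical lifecycle, sketched here for orientation only (the actual call
   * sites live in FSNamesystem and are not shown in this file):
   *
   *   BlockManager bm = new BlockManager(fsNamesystem, conf); // reads replication config
   *   bm.activate();   // starts the pending-replication timeout monitor
   *   // ... processReport()/addBlock() calls arrive as datanodes report blocks ...
   *   bm.close();      // stops the monitor and closes the blocks map
   */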

  void setConfigurationParameters(Configuration conf) throws IOException {
    this.replicator = BlockPlacementPolicy.getInstance(
                         conf,
                         namesystem,
                         namesystem.clusterMap);

    this.maxCorruptFilesReturned = conf.getInt("dfs.corruptfilesreturned.max",
        DEFAULT_MAX_CORRUPT_FILES_RETURNED);
    this.defaultReplication = conf.getInt("dfs.replication", 3);
    this.maxReplication = conf.getInt("dfs.replication.max", 512);
    this.minReplication = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY,
                                      DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
    if (minReplication <= 0)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.namenode.replication.min = "
                            + minReplication
                            + " must be greater than 0");
    if (maxReplication >= (int)Short.MAX_VALUE)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.replication.max = "
                            + maxReplication + " must be less than " + (Short.MAX_VALUE));
    if (maxReplication < minReplication)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.namenode.replication.min = "
                            + minReplication
                            + " must be less than dfs.replication.max = "
                            + maxReplication);
    this.maxReplicationStreams = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY,
                                             DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_DEFAULT);
    this.shouldCheckForEnoughRacks =
        conf.get(DFSConfigKeys.NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY) != null;
    FSNamesystem.LOG.info("defaultReplication = " + defaultReplication);
    FSNamesystem.LOG.info("maxReplication = " + maxReplication);
    FSNamesystem.LOG.info("minReplication = " + minReplication);
    FSNamesystem.LOG.info("maxReplicationStreams = " + maxReplicationStreams);
    FSNamesystem.LOG.info("shouldCheckForEnoughRacks = " + shouldCheckForEnoughRacks);
  }
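
  /*
   * For orientation, a hedged sketch of the hdfs-site.xml keys read above.
   * Only "dfs.replication", "dfs.replication.max", "dfs.corruptfilesreturned.max"
   * and "dfs.namenode.replication.min" are spelled out in this class; the
   * values below are illustrative defaults, not recommendations:
   *
   *   <property><name>dfs.replication</name><value>3</value></property>
   *   <property><name>dfs.replication.max</name><value>512</value></property>
   *   <property><name>dfs.namenode.replication.min</name><value>1</value></property>
   */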

  void activate() {
    pendingReplications.start();
  }

  void close() {
    if (pendingReplications != null) pendingReplications.stop();
    blocksMap.close();
  }

  void metaSave(PrintWriter out) {
    //
    // Dump contents of neededReplication
    //
    synchronized (neededReplications) {
      out.println("Metasave: Blocks waiting for replication: " + 
                  neededReplications.size());
      for (Block block : neededReplications) {
        List<DatanodeDescriptor> containingNodes =
            new ArrayList<DatanodeDescriptor>();
        NumberReplicas numReplicas = new NumberReplicas();
        // source node returned is not used
        chooseSourceDatanode(block, containingNodes, numReplicas);
        int usableReplicas = numReplicas.liveReplicas() +
                             numReplicas.decommissionedReplicas();
       
        if (block instanceof BlockInfo) {
          String fileName = ((BlockInfo)block).getINode().getFullPathName();
          out.print(fileName + ": ");
        }
        // l: == live:, d: == decommissioned c: == corrupt e: == excess
        out.print(block + ((usableReplicas > 0)? "" : " MISSING") + 
                  " (replicas:" +
                  " l: " + numReplicas.liveReplicas() +
                  " d: " + numReplicas.decommissionedReplicas() +
                  " c: " + numReplicas.corruptReplicas() +
                  " e: " + numReplicas.excessReplicas() + ") "); 

        Collection<DatanodeDescriptor> corruptNodes =
            corruptReplicas.getNodes(block);
        
        for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block);
             jt.hasNext();) {
          DatanodeDescriptor node = jt.next();
          String state = "";
          if (corruptNodes != null && corruptNodes.contains(node)) {
            state = "(corrupt)";
          } else if (node.isDecommissioned() || 
              node.isDecommissionInProgress()) {
            state = "(decommissioned)";
          }          
          out.print(" " + node + state + " : ");
        }
        out.println("");
      }
    }

    //
    // Dump blocks from pendingReplication
    //
    pendingReplications.metaSave(out);

    //
    // Dump blocks that are waiting to be deleted
    //
    dumpRecentInvalidateSets(out);
  }
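
  /*
   * Shape of one neededReplications line emitted by metaSave() above, with
   * purely illustrative path, block and datanode values:
   *
   *   /user/foo/part-0: blk_1234_1001 (replicas: l: 1 d: 0 c: 0 e: 0)  10.0.0.1:50010 :
   */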

  /**
   * @param block
   * @return true if the block has minimum replicas
   */
  boolean checkMinReplication(Block block) {
    return (countNodes(block).liveReplicas() >= minReplication);
  }

  /**
   * Commit a block of a file
   * 
   * @param fileINode file inode
   * @param block block to be committed
   * @param commitBlock - contains client reported block length and generation
   * @throws IOException if the block does not have at least a minimal number
   * of replicas reported from data-nodes.
   */
  private void commitBlock(INodeFileUnderConstruction fileINode,
                       BlockInfoUnderConstruction block,
                       Block commitBlock) throws IOException {
    if (block.getBlockUCState() == BlockUCState.COMMITTED)
      return;
    assert block.getNumBytes() <= commitBlock.getNumBytes() :
      "commitBlock length is less than the stored one "
      + commitBlock.getNumBytes() + " vs. " + block.getNumBytes();
    block.commitBlock(commitBlock);
    
    // Adjust disk space consumption if required
    long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();    
    if (diff > 0) {
      try {
        String path = /* For finding parents */
        namesystem.leaseManager.findPath(fileINode);
        namesystem.dir.updateSpaceConsumed(path, 0, -diff
            * fileINode.getReplication());
      } catch (IOException e) {
        FSNamesystem.LOG
            .warn("Unexpected exception while updating disk space : "
                + e.getMessage());
      }
    }
  }
  
  /**
   * Commit the last block of the file and mark it as complete if it
   * meets the minimum replication requirement.
   * 
   * @param fileINode file inode
   * @param commitBlock - contains client reported block length and generation
   * @throws IOException if the block does not have at least a minimal number
   * of replicas reported from data-nodes.
   */
  void commitOrCompleteLastBlock(INodeFileUnderConstruction fileINode, 
      Block commitBlock) throws IOException {
    
    if(commitBlock == null)
      return; // not committing, this is a block allocation retry
    BlockInfo lastBlock = fileINode.getLastBlock();
    if(lastBlock == null)
      return; // no blocks in file yet
    if(lastBlock.isComplete())
      return; // already completed (e.g. by syncBlock)
    
    commitBlock(fileINode, (BlockInfoUnderConstruction)lastBlock, commitBlock);

    if(countNodes(lastBlock).liveReplicas() >= minReplication)
      completeBlock(fileINode,fileINode.numBlocks()-1);
  }

  /**
   * Convert a specified block of the file to a complete block.
   * @param fileINode file
   * @param blkIndex  block index in the file
   * @throws IOException if the block does not have at least a minimal number
   * of replicas reported from data-nodes.
   */
  BlockInfo completeBlock(INodeFile fileINode, int blkIndex)
  throws IOException {
    if(blkIndex < 0)
      return null;
    BlockInfo curBlock = fileINode.getBlocks()[blkIndex];
    if(curBlock.isComplete())
      return curBlock;
    BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)curBlock;
    if(ucBlock.numNodes() < minReplication)
      throw new IOException("Cannot complete block: " +
          "block does not satisfy minimal replication requirement.");
    BlockInfo completeBlock = ucBlock.convertToCompleteBlock();
    // replace the block in the file's block list
    fileINode.setBlock(blkIndex, completeBlock);
    // replace block in the blocksMap
    return blocksMap.replaceBlock(completeBlock);
  }

  BlockInfo completeBlock(INodeFile fileINode, BlockInfo block)
  throws IOException {
    BlockInfo[] fileBlocks = fileINode.getBlocks();
    for(int idx = 0; idx < fileBlocks.length; idx++)
      if(fileBlocks[idx] == block) {
        return completeBlock(fileINode, idx);
      }
    return block;
  }
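
  /*
   * Rough block-state progression handled by commitBlock(),
   * commitOrCompleteLastBlock() and completeBlock() above (see BlockUCState):
   *
   *   UNDER_CONSTRUCTION --commit--> COMMITTED
   *     --(at least minReplication live replicas reported)--> COMPLETE
   */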

  /**
   * Convert the last block of the file to an under construction block.
   * The block is converted only if the file has blocks and the last one
   * is a partial block (its size is less than the preferred block size).
   * The converted block is returned to the client.
   * The client uses the returned block locations to form the data pipeline
   * for this block.
   * The method returns null if there is no partial block at the end.
   * The client is supposed to allocate a new block with the next call.
   *
   * @param fileINode file
   * @return the last block locations if the block is partial or null otherwise
   */
  LocatedBlock convertLastBlockToUnderConstruction(
      INodeFileUnderConstruction fileINode) throws IOException {
    BlockInfo oldBlock = fileINode.getLastBlock();
    if(oldBlock == null ||
        fileINode.getPreferredBlockSize() == oldBlock.getNumBytes())
      return null;
    assert oldBlock == getStoredBlock(oldBlock) :
      "last block of the file is not in blocksMap";

    DatanodeDescriptor[] targets = getNodes(oldBlock);

    BlockInfoUnderConstruction ucBlock =
      fileINode.setLastBlock(oldBlock, targets);
    blocksMap.replaceBlock(ucBlock);

    // Remove block from replication queue.
    updateNeededReplications(oldBlock, 0, 0);

    // remove this block from the list of pending blocks to be deleted.
    for (DatanodeDescriptor dd : targets) {
      String datanodeId = dd.getStorageID();
      removeFromInvalidates(datanodeId, oldBlock);
    }

    long fileLength = fileINode.computeContentSummary().getLength();
    return getBlockLocation(ucBlock, fileLength - ucBlock.getNumBytes());
  }

  /**
   * Get all valid locations of the block
   */
  ArrayList<String> getValidLocations(Block block) {
    ArrayList<String> machineSet =
      new ArrayList<String>(blocksMap.numNodes(block));
    for(Iterator<DatanodeDescriptor> it =
          blocksMap.nodeIterator(block); it.hasNext();) {
      String storageID = it.next().getStorageID();
      // filter invalidate replicas
      if( ! belongsToInvalidates(storageID, block)) {
        machineSet.add(storageID);
      }
    }
    return machineSet;
  }

  List<LocatedBlock> getBlockLocations(BlockInfo[] blocks, long offset,
      long length, int nrBlocksToReturn) throws IOException {
    int curBlk = 0;
    long curPos = 0, blkSize = 0;
    int nrBlocks = (blocks[0].getNumBytes() == 0) ? 0 : blocks.length;
    for (curBlk = 0; curBlk < nrBlocks; curBlk++) {
      blkSize = blocks[curBlk].getNumBytes();
      assert blkSize > 0 : "Block of size 0";
      if (curPos + blkSize > offset) {
        break;
      }
      curPos += blkSize;
    }

    if (nrBlocks > 0 && curBlk == nrBlocks)   // offset >= end of file
      return Collections.<LocatedBlock>emptyList();

    long endOff = offset + length;
    List<LocatedBlock> results = new ArrayList<LocatedBlock>(blocks.length);
    do {
      results.add(getBlockLocation(blocks[curBlk], curPos));
      curPos += blocks[curBlk].getNumBytes();
      curBlk++;
    } while (curPos < endOff
          && curBlk < blocks.length
          && results.size() < nrBlocksToReturn);
    return results;
  }

  /** @return a LocatedBlock for the given block */
  LocatedBlock getBlockLocation(final BlockInfo blk, final long pos
      ) throws IOException {
    if (!blk.isComplete()) {
      final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)blk;
      final DatanodeDescriptor[] locations = uc.getExpectedLocations();
      return new LocatedBlock(uc, locations, pos, false);
    }

    // get block locations
    final int numCorruptNodes = countNodes(blk).corruptReplicas();
    final int numCorruptReplicas = corruptReplicas.numCorruptReplicas(blk);
    if (numCorruptNodes != numCorruptReplicas) {
      FSNamesystem.LOG.warn("Inconsistent number of corrupt replicas for "
          + blk + " blockMap has " + numCorruptNodes
          + " but corrupt replicas map has " + numCorruptReplicas);
    }

    final int numNodes = blocksMap.numNodes(blk);
    final boolean isCorrupt = numCorruptNodes == numNodes;
    final int numMachines = isCorrupt ? numNodes: numNodes - numCorruptNodes;
    final DatanodeDescriptor[] machines = new DatanodeDescriptor[numMachines];
    if (numMachines > 0) {
      int j = 0;
      for(Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(blk);
          it.hasNext();) {
        final DatanodeDescriptor d = it.next();
        final boolean replicaCorrupt = corruptReplicas.isReplicaCorrupt(blk, d);
        if (isCorrupt || (!isCorrupt && !replicaCorrupt))
          machines[j++] = d;
      }
    }
    return new LocatedBlock(blk, machines, pos, isCorrupt);
  }

  /**
   * Check whether the replication parameter is within the range
   * determined by system configuration.
   */
  void verifyReplication(String src,
                         short replication,
                         String clientName) throws IOException {

    if (replication >= minReplication && replication <= maxReplication) {
      //common case. avoid building 'text'
      return;
    }

    String text = "file " + src
      + ((clientName != null) ? " on client " + clientName : "")
      + ".\n"
      + "Requested replication " + replication;

    if (replication > maxReplication)
      throw new IOException(text + " exceeds maximum " + maxReplication);

    if (replication < minReplication)
      throw new IOException(text + " is less than the required minimum "
          + minReplication);
  }

  void removeFromInvalidates(String storageID, Block block) {
    Collection<Block> v = recentInvalidateSets.get(storageID);
    if (v != null && v.remove(block)) {
      pendingDeletionBlocksCount--;
      if (v.isEmpty()) {
        recentInvalidateSets.remove(storageID);
      }
    }
  }

  boolean belongsToInvalidates(String storageID, Block block) {
    Collection<Block> invalidateSet = recentInvalidateSets.get(storageID);
    return invalidateSet != null && invalidateSet.contains(block);
  }

  /**
   * Adds block to list of blocks which will be invalidated on specified
   * datanode
   *
   * @param b block
   * @param dn datanode
   * @param log true to create an entry in the log
   */
  void addToInvalidates(Block b, DatanodeInfo dn, boolean log) {
    Collection<Block> invalidateSet = recentInvalidateSets
        .get(dn.getStorageID());
    if (invalidateSet == null) {
      invalidateSet = new HashSet<Block>();
      recentInvalidateSets.put(dn.getStorageID(), invalidateSet);
    }
    if (invalidateSet.add(b)) {
      pendingDeletionBlocksCount++;
      if (log) {
        NameNode.stateChangeLog.info("BLOCK* NameSystem.addToInvalidates: "
            + b + " to " + dn.getName());
      }
    }
  }

  /**
   * Adds block to list of blocks which will be invalidated on specified
   * datanode and log the operation
   *
   * @param b block
   * @param dn datanode
   */
  void addToInvalidates(Block b, DatanodeInfo dn) {
    addToInvalidates(b, dn, true);
  }

  /**
   * Adds block to list of blocks which will be invalidated on all its
   * datanodes.
   */
  private void addToInvalidates(Block b) {
    StringBuilder datanodes = new StringBuilder();
    for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); it
        .hasNext();) {
      DatanodeDescriptor node = it.next();
      addToInvalidates(b, node, false);
      datanodes.append(node.getName()).append(" ");
    }
    if (datanodes.length() != 0) {
      NameNode.stateChangeLog.info("BLOCK* NameSystem.addToInvalidates: "
          + b + " to " + datanodes.toString());
    }
  }

  /**
   * dumps the contents of recentInvalidateSets
   */
  private void dumpRecentInvalidateSets(PrintWriter out) {
    int size = recentInvalidateSets.values().size();
    out.println("Metasave: Blocks " + pendingDeletionBlocksCount
        + " waiting deletion from " + size + " datanodes.");
    if (size == 0) {
      return;
    }
    for(Map.Entry<String, Collection<Block>> entry :
          recentInvalidateSets.entrySet()) {
      Collection<Block> blocks = entry.getValue();
      if (blocks.size() > 0) {
        out.println(namesystem.getDatanode(entry.getKey()).getName() + blocks);
      }
    }
  }

  void findAndMarkBlockAsCorrupt(Block blk,
                                 DatanodeInfo dn) throws IOException {
    BlockInfo storedBlock = getStoredBlock(blk);
    if (storedBlock == null) {
      // Check if the replica is in the blockMap, if not
      // ignore the request for now. This could happen when BlockScanner
      // thread of Datanode reports bad block before Block reports are sent
      // by the Datanode on startup
      NameNode.stateChangeLog.info("BLOCK* NameSystem.markBlockAsCorrupt: "
          + "block " + blk + " could not be marked as "
          + "corrupt as it does not exist in blocksMap");
      return;
    }
    markBlockAsCorrupt(storedBlock, dn);
  }

  private void markBlockAsCorrupt(BlockInfo storedBlock,
                                  DatanodeInfo dn) throws IOException {
    assert storedBlock != null : "storedBlock should not be null";
    DatanodeDescriptor node = namesystem.getDatanode(dn);
    if (node == null) {
      throw new IOException("Cannot mark block " + storedBlock.getBlockName()
          + " as corrupt because datanode " + dn.getName()
          + " does not exist. ");
    }

    INodeFile inode = storedBlock.getINode();
    if (inode == null) {
      NameNode.stateChangeLog.info("BLOCK NameSystem.markBlockAsCorrupt: "
          + "block " + storedBlock
          + " could not be marked as corrupt as it"
          + " does not belong to any file");
      addToInvalidates(storedBlock, node);
      return;
    }

    // Add replica to the data-node if it is not already there
    node.addBlock(storedBlock);

    // Add this replica to corruptReplicas Map
    corruptReplicas.addToCorruptReplicasMap(storedBlock, node);
    if (countNodes(storedBlock).liveReplicas() >= inode.getReplication()) {
      // the block is over-replicated so invalidate the replicas immediately
      invalidateBlock(storedBlock, node);
    } else {
      // add the block to neededReplication
      updateNeededReplications(storedBlock, -1, 0);
    }
  }

  /**
   * Invalidates the given block on the given datanode.
   */
  private void invalidateBlock(Block blk, DatanodeInfo dn)
      throws IOException {
    NameNode.stateChangeLog.info("DIR* NameSystem.invalidateBlock: "
        + blk + " on " + dn.getName());
    DatanodeDescriptor node = namesystem.getDatanode(dn);
    if (node == null) {
      throw new IOException("Cannot invalidate block " + blk
          + " because datanode " + dn.getName() + " does not exist.");
    }

    // Check how many copies we have of the block. If we have at least one
    // copy on a live node, then we can delete it.
int count = countNodes(blk).liveReplicas(); if (count >= 1) { addToInvalidates(blk, dn); removeStoredBlock(blk, node); if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("BLOCK* NameSystem.invalidateBlocks: " + blk + " on " + dn.getName() + " listed for deletion."); } } else { NameNode.stateChangeLog.info("BLOCK* NameSystem.invalidateBlocks: " + blk + " on " + dn.getName() + " is the only copy and was not deleted."); } } void updateState() { pendingReplicationBlocksCount = pendingReplications.size(); underReplicatedBlocksCount = neededReplications.size(); corruptReplicaBlocksCount = corruptReplicas.size(); } /** * Schedule blocks for deletion at datanodes * @param nodesToProcess number of datanodes to schedule deletion work * @return total number of block for deletion */ int computeInvalidateWork(int nodesToProcess) { int numOfNodes = recentInvalidateSets.size(); nodesToProcess = Math.min(numOfNodes, nodesToProcess); // TODO should using recentInvalidateSets be synchronized? // get an array of the keys ArrayList keyArray = new ArrayList(recentInvalidateSets.keySet()); // randomly pick up nodesToProcess nodes // and put them at [0, nodesToProcess) int remainingNodes = numOfNodes - nodesToProcess; if (nodesToProcess < remainingNodes) { for(int i=0; i> blocksToReplicate = chooseUnderReplicatedBlocks(blocksToProcess); // replicate blocks int scheduledReplicationCount = 0; for (int i=0; i> chooseUnderReplicatedBlocks(int blocksToProcess) { // initialize data structure for the return value List> blocksToReplicate = new ArrayList>( UnderReplicatedBlocks.LEVEL); for (int i = 0; i < UnderReplicatedBlocks.LEVEL; i++) { blocksToReplicate.add(new ArrayList()); } namesystem.writeLock(); try { synchronized (neededReplications) { if (neededReplications.size() == 0) { missingBlocksInCurIter = 0; missingBlocksInPrevIter = 0; return blocksToReplicate; } // Go through all blocks that need replications. 
BlockIterator neededReplicationsIterator = neededReplications .iterator(); // skip to the first unprocessed block, which is at replIndex for (int i = 0; i < replIndex && neededReplicationsIterator.hasNext(); i++) { neededReplicationsIterator.next(); } // # of blocks to process equals either twice the number of live // data-nodes or the number of under-replicated blocks whichever is less blocksToProcess = Math.min(blocksToProcess, neededReplications.size()); for (int blkCnt = 0; blkCnt < blocksToProcess; blkCnt++, replIndex++) { if (!neededReplicationsIterator.hasNext()) { // start from the beginning replIndex = 0; missingBlocksInPrevIter = missingBlocksInCurIter; missingBlocksInCurIter = 0; blocksToProcess = Math.min(blocksToProcess, neededReplications .size()); if (blkCnt >= blocksToProcess) break; neededReplicationsIterator = neededReplications.iterator(); assert neededReplicationsIterator.hasNext() : "neededReplications should not be empty."; } Block block = neededReplicationsIterator.next(); int priority = neededReplicationsIterator.getPriority(); if (priority < 0 || priority >= blocksToReplicate.size()) { FSNamesystem.LOG.warn("Unexpected replication priority: " + priority + " " + block); } else { blocksToReplicate.get(priority).add(block); } } // end for } // end synchronized neededReplication } finally { namesystem.writeUnlock(); } return blocksToReplicate; } /** Replicate a block * * @param block block to be replicated * @param priority a hint of its priority in the neededReplication queue * @return if the block gets replicated or not */ private boolean computeReplicationWorkForBlock(Block block, int priority) { int requiredReplication, numEffectiveReplicas; List containingNodes; DatanodeDescriptor srcNode; INodeFile fileINode = null; int additionalReplRequired; namesystem.writeLock(); try { synchronized (neededReplications) { // block should belong to a file fileINode = blocksMap.getINode(block); // abandoned block or block reopened for append if(fileINode == null || fileINode.isUnderConstruction()) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; return false; } requiredReplication = fileINode.getReplication(); // get a source data-node containingNodes = new ArrayList(); NumberReplicas numReplicas = new NumberReplicas(); srcNode = chooseSourceDatanode(block, containingNodes, numReplicas); if ((numReplicas.liveReplicas() + numReplicas.decommissionedReplicas()) <= 0) { missingBlocksInCurIter++; } if(srcNode == null) // block can not be replicated from any node return false; // do not schedule more if enough replicas is already pending numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(block); if (numEffectiveReplicas >= requiredReplication) { if ( (pendingReplications.getNumReplicas(block) > 0) || (blockHasEnoughRacks(block)) ) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; NameNode.stateChangeLog.info("BLOCK* " + "Removing block " + block + " from neededReplications as it has enough replicas."); return false; } } if (numReplicas.liveReplicas() < requiredReplication) { additionalReplRequired = requiredReplication - numEffectiveReplicas; } else { additionalReplRequired = 1; //Needed on a new rack } } } finally { namesystem.writeUnlock(); } // choose replication targets: NOT HOLDING THE GLOBAL LOCK // It is costly to extract the filename for which chooseTargets is called, // so for now we pass in the Inode itself. 
DatanodeDescriptor targets[] = replicator.chooseTarget(fileINode, additionalReplRequired, srcNode, containingNodes, block.getNumBytes()); if(targets.length == 0) return false; namesystem.writeLock(); try { synchronized (neededReplications) { // Recheck since global lock was released // block should belong to a file fileINode = blocksMap.getINode(block); // abandoned block or block reopened for append if(fileINode == null || fileINode.isUnderConstruction()) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; return false; } requiredReplication = fileINode.getReplication(); // do not schedule more if enough replicas is already pending NumberReplicas numReplicas = countNodes(block); numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(block); if (numEffectiveReplicas >= requiredReplication) { if ( (pendingReplications.getNumReplicas(block) > 0) || (blockHasEnoughRacks(block)) ) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; NameNode.stateChangeLog.info("BLOCK* " + "Removing block " + block + " from neededReplications as it has enough replicas."); return false; } } if ( (numReplicas.liveReplicas() >= requiredReplication) && (!blockHasEnoughRacks(block)) ) { if (srcNode.getNetworkLocation().equals(targets[0].getNetworkLocation())) { //No use continuing, unless a new rack in this case return false; } } // Add block to the to be replicated list srcNode.addBlockToBeReplicated(block, targets); for (DatanodeDescriptor dn : targets) { dn.incBlocksScheduled(); } // Move the block-replication into a "pending" state. // The reason we use 'pending' is so we can retry // replications that fail after an appropriate amount of time. pendingReplications.add(block, targets.length); if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* block " + block + " is moved from neededReplications to pendingReplications"); } // remove from neededReplications if(numEffectiveReplicas + targets.length >= requiredReplication) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; } if (NameNode.stateChangeLog.isInfoEnabled()) { StringBuilder targetList = new StringBuilder("datanode(s)"); for (int k = 0; k < targets.length; k++) { targetList.append(' '); targetList.append(targets[k].getName()); } NameNode.stateChangeLog.info( "BLOCK* ask " + srcNode.getName() + " to replicate " + block + " to " + targetList); if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* neededReplications = " + neededReplications.size() + " pendingReplications = " + pendingReplications.size()); } } } } finally { namesystem.writeUnlock(); } return true; } /** * Parse the data-nodes the block belongs to and choose one, * which will be the replication source. * * We prefer nodes that are in DECOMMISSION_INPROGRESS state to other nodes * since the former do not have write traffic and hence are less busy. * We do not use already decommissioned nodes as a source. * Otherwise we choose a random node among those that did not reach their * replication limit. * * In addition form a list of all nodes containing the block * and calculate its replication numbers. 
*/ private DatanodeDescriptor chooseSourceDatanode( Block block, List containingNodes, NumberReplicas numReplicas) { containingNodes.clear(); DatanodeDescriptor srcNode = null; int live = 0; int decommissioned = 0; int corrupt = 0; int excess = 0; Iterator it = blocksMap.nodeIterator(block); Collection nodesCorrupt = corruptReplicas.getNodes(block); while(it.hasNext()) { DatanodeDescriptor node = it.next(); Collection excessBlocks = excessReplicateMap.get(node.getStorageID()); if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) corrupt++; else if (node.isDecommissionInProgress() || node.isDecommissioned()) decommissioned++; else if (excessBlocks != null && excessBlocks.contains(block)) { excess++; } else { live++; } containingNodes.add(node); // Check if this replica is corrupt // If so, do not select the node as src node if ((nodesCorrupt != null) && nodesCorrupt.contains(node)) continue; if(node.getNumberOfBlocksToBeReplicated() >= maxReplicationStreams) continue; // already reached replication limit // the block must not be scheduled for removal on srcNode if(excessBlocks != null && excessBlocks.contains(block)) continue; // never use already decommissioned nodes if(node.isDecommissioned()) continue; // we prefer nodes that are in DECOMMISSION_INPROGRESS state if(node.isDecommissionInProgress() || srcNode == null) { srcNode = node; continue; } if(srcNode.isDecommissionInProgress()) continue; // switch to a different node randomly // this to prevent from deterministically selecting the same node even // if the node failed to replicate the block on previous iterations if(r.nextBoolean()) srcNode = node; } if(numReplicas != null) numReplicas.initialize(live, decommissioned, corrupt, excess); return srcNode; } /** * If there were any replication requests that timed out, reap them * and put them back into the neededReplication queue */ void processPendingReplications() { Block[] timedOutItems = pendingReplications.getTimedOutBlocks(); if (timedOutItems != null) { namesystem.writeLock(); try { for (int i = 0; i < timedOutItems.length; i++) { NumberReplicas num = countNodes(timedOutItems[i]); if (isNeededReplication(timedOutItems[i], getReplication(timedOutItems[i]), num.liveReplicas())) { neededReplications.add(timedOutItems[i], num.liveReplicas(), num.decommissionedReplicas(), getReplication(timedOutItems[i])); } } } finally { namesystem.writeUnlock(); } /* If we know the target datanodes where the replication timedout, * we could invoke decBlocksScheduled() on it. Its ok for now. */ } } /** * The given node is reporting all its blocks. Use this info to * update the (machine-->blocklist) and (block-->machinelist) tables. */ public void processReport(DatanodeDescriptor node, BlockListAsLongs report) throws IOException { // // Modify the (block-->datanode) map, according to the difference // between the old and new block report. 
// Collection toAdd = new LinkedList(); Collection toRemove = new LinkedList(); Collection toInvalidate = new LinkedList(); Collection toCorrupt = new LinkedList(); node.reportDiff(this, report, toAdd, toRemove, toInvalidate, toCorrupt); for (Block b : toRemove) { removeStoredBlock(b, node); } for (Block b : toAdd) { addStoredBlock(b, node, null); } for (Block b : toInvalidate) { NameNode.stateChangeLog.info("BLOCK* NameSystem.processReport: block " + b + " on " + node.getName() + " size " + b.getNumBytes() + " does not belong to any file."); addToInvalidates(b, node); } for (BlockInfo b : toCorrupt) { markBlockAsCorrupt(b, node); } } /** * Modify (block-->datanode) map. Remove block from set of * needed replications if this takes care of the problem. * @return the block that is stored in blockMap. */ private Block addStoredBlock(final Block block, DatanodeDescriptor node, DatanodeDescriptor delNodeHint) throws IOException { assert (namesystem.hasWriteLock()); BlockInfo storedBlock = blocksMap.getStoredBlock(block); if (storedBlock == null || storedBlock.getINode() == null) { // If this block does not belong to anyfile, then we are done. NameNode.stateChangeLog.info("BLOCK* NameSystem.addStoredBlock: " + "addStoredBlock request received for " + block + " on " + node.getName() + " size " + block.getNumBytes() + " But it does not belong to any file."); // we could add this block to invalidate set of this datanode. // it will happen in next block report otherwise. return block; } assert storedBlock != null : "Block must be stored by now"; INodeFile fileINode = storedBlock.getINode(); assert fileINode != null : "Block must belong to a file"; // add block to the data-node boolean added = node.addBlock(storedBlock); int curReplicaDelta = 0; if (added) { curReplicaDelta = 1; // // At startup time, because too many new blocks come in // they take up lots of space in the log file. // So, we log only when namenode is out of safemode. // if (!namesystem.isInSafeMode()) { NameNode.stateChangeLog.info("BLOCK* NameSystem.addStoredBlock: " + "blockMap updated: " + node.getName() + " is added to " + storedBlock + " size " + storedBlock.getNumBytes()); } } else { NameNode.stateChangeLog.warn("BLOCK* NameSystem.addStoredBlock: " + "Redundant addStoredBlock request received for " + storedBlock + " on " + node.getName() + " size " + storedBlock.getNumBytes()); } // filter out containingNodes that are marked for decommission. 
NumberReplicas num = countNodes(storedBlock); int numLiveReplicas = num.liveReplicas(); int numCurrentReplica = numLiveReplicas + pendingReplications.getNumReplicas(storedBlock); if(storedBlock.getBlockUCState() == BlockUCState.COMMITTED && numLiveReplicas >= minReplication) storedBlock = completeBlock(fileINode, storedBlock); // check whether safe replication is reached for the block // only complete blocks are counted towards that if(storedBlock.isComplete()) namesystem.incrementSafeBlockCount(numCurrentReplica); // if file is under construction, then check whether the block // can be completed if (fileINode.isUnderConstruction()) { return storedBlock; } // do not handle mis-replicated blocks during startup if (namesystem.isInSafeMode()) return storedBlock; // handle underReplication/overReplication short fileReplication = fileINode.getReplication(); if (!isNeededReplication(storedBlock, fileReplication, numCurrentReplica)) { neededReplications.remove(storedBlock, numCurrentReplica, num.decommissionedReplicas, fileReplication); } else { updateNeededReplications(storedBlock, curReplicaDelta, 0); } if (numCurrentReplica > fileReplication) { processOverReplicatedBlock(storedBlock, fileReplication, node, delNodeHint); } // If the file replication has reached desired value // we can remove any corrupt replicas the block may have int corruptReplicasCount = corruptReplicas.numCorruptReplicas(storedBlock); int numCorruptNodes = num.corruptReplicas(); if (numCorruptNodes != corruptReplicasCount) { FSNamesystem.LOG.warn("Inconsistent number of corrupt replicas for " + storedBlock + "blockMap has " + numCorruptNodes + " but corrupt replicas map has " + corruptReplicasCount); } if ((corruptReplicasCount > 0) && (numLiveReplicas >= fileReplication)) invalidateCorruptReplicas(storedBlock); return storedBlock; } /** * Invalidate corrupt replicas. *

* This will remove the replicas from the block's location list, * add them to {@link #recentInvalidateSets} so that they could be further * deleted from the respective data-nodes, * and remove the block from corruptReplicasMap. *

* This method should be called when the block has sufficient * number of live replicas. * * @param blk Block whose corrupt replicas need to be invalidated */ private void invalidateCorruptReplicas(Block blk) { Collection nodes = corruptReplicas.getNodes(blk); boolean gotException = false; if (nodes == null) return; // make a copy of the array of nodes in order to avoid // ConcurrentModificationException, when the block is removed from the node DatanodeDescriptor[] nodesCopy = nodes.toArray(new DatanodeDescriptor[0]); for (DatanodeDescriptor node : nodesCopy) { try { invalidateBlock(blk, node); } catch (IOException e) { NameNode.stateChangeLog.info("NameNode.invalidateCorruptReplicas " + "error in deleting bad block " + blk + " on " + node + e); gotException = true; } } // Remove the block from corruptReplicasMap if (!gotException) corruptReplicas.removeFromCorruptReplicasMap(blk); } /** * For each block in the name-node verify whether it belongs to any file, * over or under replicated. Place it into the respective queue. */ void processMisReplicatedBlocks() { long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0; namesystem.writeLock(); try { neededReplications.clear(); for (BlockInfo block : blocksMap.getBlocks()) { INodeFile fileINode = block.getINode(); if (fileINode == null) { // block does not belong to any file nrInvalid++; addToInvalidates(block); continue; } // calculate current replication short expectedReplication = fileINode.getReplication(); NumberReplicas num = countNodes(block); int numCurrentReplica = num.liveReplicas(); // add to under-replicated queue if need to be if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { if (neededReplications.add(block, numCurrentReplica, num .decommissionedReplicas(), expectedReplication)) { nrUnderReplicated++; } } if (numCurrentReplica > expectedReplication) { // over-replicated block nrOverReplicated++; processOverReplicatedBlock(block, expectedReplication, null, null); } } } finally { namesystem.writeUnlock(); } FSNamesystem.LOG.info("Total number of blocks = " + blocksMap.size()); FSNamesystem.LOG.info("Number of invalid blocks = " + nrInvalid); FSNamesystem.LOG.info("Number of under-replicated blocks = " + nrUnderReplicated); FSNamesystem.LOG.info("Number of over-replicated blocks = " + nrOverReplicated); } /** * Find how many of the containing nodes are "extra", if any. * If there are any extras, call chooseExcessReplicates() to * mark them in the excessReplicateMap. 
*/ void processOverReplicatedBlock(Block block, short replication, DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint) { if (addedNode == delNodeHint) { delNodeHint = null; } Collection nonExcess = new ArrayList(); Collection corruptNodes = corruptReplicas .getNodes(block); for (Iterator it = blocksMap.nodeIterator(block); it.hasNext();) { DatanodeDescriptor cur = it.next(); Collection excessBlocks = excessReplicateMap.get(cur .getStorageID()); if (excessBlocks == null || !excessBlocks.contains(block)) { if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { // exclude corrupt replicas if (corruptNodes == null || !corruptNodes.contains(cur)) { nonExcess.add(cur); } } } } namesystem.chooseExcessReplicates(nonExcess, block, replication, addedNode, delNodeHint, replicator); } void addToExcessReplicate(DatanodeInfo dn, Block block) { Collection excessBlocks = excessReplicateMap.get(dn.getStorageID()); if (excessBlocks == null) { excessBlocks = new TreeSet(); excessReplicateMap.put(dn.getStorageID(), excessBlocks); } if (excessBlocks.add(block)) { excessBlocksCount++; if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("BLOCK* NameSystem.chooseExcessReplicates:" + " (" + dn.getName() + ", " + block + ") is added to excessReplicateMap"); } } } /** * Modify (block-->datanode) map. Possibly generate replication tasks, if the * removed block is still valid. */ void removeStoredBlock(Block block, DatanodeDescriptor node) { if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: " + block + " from " + node.getName()); } assert (namesystem.hasWriteLock()); { if (!blocksMap.removeNode(block, node)) { if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: " + block + " has already been removed from node " + node); } return; } // // It's possible that the block was removed because of a datanode // failure. If the block is still valid, check if replication is // necessary. In that case, put block on a possibly-will- // be-replicated list. // INode fileINode = blocksMap.getINode(block); if (fileINode != null) { namesystem.decrementSafeBlockCount(block); updateNeededReplications(block, -1, 0); } // // We've removed a block from a node, so it's definitely no longer // in "excess" there. // Collection excessBlocks = excessReplicateMap.get(node .getStorageID()); if (excessBlocks != null) { if (excessBlocks.remove(block)) { excessBlocksCount--; if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* NameSystem.removeStoredBlock: " + block + " is removed from excessBlocks"); } if (excessBlocks.size() == 0) { excessReplicateMap.remove(node.getStorageID()); } } } // Remove the replica from corruptReplicas corruptReplicas.removeFromCorruptReplicasMap(block, node); } } /** * The given node is reporting that it received a certain block. */ void addBlock(DatanodeDescriptor node, Block block, String delHint) throws IOException { // decrement number of blocks scheduled to this datanode. node.decBlocksScheduled(); // get the deletion hint node DatanodeDescriptor delHintNode = null; if (delHint != null && delHint.length() != 0) { delHintNode = namesystem.getDatanode(delHint); if (delHintNode == null) { NameNode.stateChangeLog.warn("BLOCK* NameSystem.blockReceived: " + block + " is expected to be removed from an unrecorded node " + delHint); } } // // Modify the blocks->datanode map and node's map. 
// pendingReplications.remove(block); // blockReceived reports a finalized block Collection toAdd = new LinkedList(); Collection toInvalidate = new LinkedList(); Collection toCorrupt = new LinkedList(); node.processReportedBlock(this, block, ReplicaState.FINALIZED, toAdd, toInvalidate, toCorrupt); // the block is only in one of the lists // if it is in none then data-node already has it assert toAdd.size() + toInvalidate.size() <= 1 : "The block should be only in one of the lists."; for (Block b : toAdd) { addStoredBlock(b, node, delHintNode); } for (Block b : toInvalidate) { NameNode.stateChangeLog.info("BLOCK* NameSystem.addBlock: block " + b + " on " + node.getName() + " size " + b.getNumBytes() + " does not belong to any file."); addToInvalidates(b, node); } for (BlockInfo b : toCorrupt) { markBlockAsCorrupt(b, node); } } /** * Return the number of nodes that are live and decommissioned. */ NumberReplicas countNodes(Block b) { int count = 0; int live = 0; int corrupt = 0; int excess = 0; Iterator nodeIter = blocksMap.nodeIterator(b); Collection nodesCorrupt = corruptReplicas.getNodes(b); while (nodeIter.hasNext()) { DatanodeDescriptor node = nodeIter.next(); if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) { corrupt++; } else if (node.isDecommissionInProgress() || node.isDecommissioned()) { count++; } else { Collection blocksExcess = excessReplicateMap.get(node.getStorageID()); if (blocksExcess != null && blocksExcess.contains(b)) { excess++; } else { live++; } } } return new NumberReplicas(live, count, corrupt, excess); } private void logBlockReplicationInfo(Block block, DatanodeDescriptor srcNode, NumberReplicas num) { int curReplicas = num.liveReplicas(); int curExpectedReplicas = getReplication(block); INode fileINode = blocksMap.getINode(block); Iterator nodeIter = blocksMap.nodeIterator(block); StringBuilder nodeList = new StringBuilder(); while (nodeIter.hasNext()) { DatanodeDescriptor node = nodeIter.next(); nodeList.append(node.name); nodeList.append(" "); } FSNamesystem.LOG.info("Block: " + block + ", Expected Replicas: " + curExpectedReplicas + ", live replicas: " + curReplicas + ", corrupt replicas: " + num.corruptReplicas() + ", decommissioned replicas: " + num.decommissionedReplicas() + ", excess replicas: " + num.excessReplicas() + ", Is Open File: " + fileINode.isUnderConstruction() + ", Datanodes having this block: " + nodeList + ", Current Datanode: " + srcNode.name + ", Is current datanode decommissioning: " + srcNode.isDecommissionInProgress()); } /** * Return true if there are any blocks on this node that have not * yet reached their replication factor. Otherwise returns false. 
*/ boolean isReplicationInProgress(DatanodeDescriptor srcNode) { boolean status = false; int underReplicatedBlocks = 0; int decommissionOnlyReplicas = 0; int underReplicatedInOpenFiles = 0; final Iterator it = srcNode.getBlockIterator(); while(it.hasNext()) { final Block block = it.next(); INode fileINode = blocksMap.getINode(block); if (fileINode != null) { NumberReplicas num = countNodes(block); int curReplicas = num.liveReplicas(); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, curReplicas)) { if (curExpectedReplicas > curReplicas) { //Log info about one block for this node which needs replication if (!status) { status = true; logBlockReplicationInfo(block, srcNode, num); } underReplicatedBlocks++; if ((curReplicas == 0) && (num.decommissionedReplicas() > 0)) { decommissionOnlyReplicas++; } if (fileINode.isUnderConstruction()) { underReplicatedInOpenFiles++; } } if (!neededReplications.contains(block) && pendingReplications.getNumReplicas(block) == 0) { // // These blocks have been reported from the datanode // after the startDecommission method has been executed. These // blocks were in flight when the decommissioning was started. // neededReplications.add(block, curReplicas, num.decommissionedReplicas(), curExpectedReplicas); } } } } srcNode.decommissioningStatus.set(underReplicatedBlocks, decommissionOnlyReplicas, underReplicatedInOpenFiles); return status; } int getActiveBlockCount() { return blocksMap.size() - (int)pendingDeletionBlocksCount; } DatanodeDescriptor[] getNodes(BlockInfo block) { DatanodeDescriptor[] nodes = new DatanodeDescriptor[block.numNodes()]; Iterator it = blocksMap.nodeIterator(block); for (int i = 0; it != null && it.hasNext(); i++) { nodes[i] = it.next(); } return nodes; } int getTotalBlocks() { return blocksMap.size(); } void removeBlock(Block block) { addToInvalidates(block); corruptReplicas.removeFromCorruptReplicasMap(block); blocksMap.removeBlock(block); } BlockInfo getStoredBlock(Block block) { return blocksMap.getStoredBlock(block); } /* updates a block in under replication queue */ void updateNeededReplications(Block block, int curReplicasDelta, int expectedReplicasDelta) { namesystem.writeLock(); try { NumberReplicas repl = countNodes(block); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, repl.liveReplicas())) { neededReplications.update(block, repl.liveReplicas(), repl .decommissionedReplicas(), curExpectedReplicas, curReplicasDelta, expectedReplicasDelta); } else { int oldReplicas = repl.liveReplicas()-curReplicasDelta; int oldExpectedReplicas = curExpectedReplicas-expectedReplicasDelta; neededReplications.remove(block, oldReplicas, repl.decommissionedReplicas(), oldExpectedReplicas); } } finally { namesystem.writeUnlock(); } } void checkReplication(Block block, int numExpectedReplicas) { // filter out containingNodes that are marked for decommission. 
NumberReplicas number = countNodes(block); if (isNeededReplication(block, numExpectedReplicas, number.liveReplicas())) { neededReplications.add(block, number.liveReplicas(), number.decommissionedReplicas, numExpectedReplicas); } } /* get replication factor of a block */ private int getReplication(Block block) { INodeFile fileINode = blocksMap.getINode(block); if (fileINode == null) { // block does not belong to any file return 0; } assert !fileINode.isDirectory() : "Block cannot belong to a directory."; return fileINode.getReplication(); } /** * Remove a datanode from the invalidatesSet * @param n datanode */ void removeFromInvalidates(String storageID) { Collection blocks = recentInvalidateSets.remove(storageID); if (blocks != null) { pendingDeletionBlocksCount -= blocks.size(); } } /** * Get blocks to invalidate for nodeId * in {@link #recentInvalidateSets}. * * @return number of blocks scheduled for removal during this iteration. */ private int invalidateWorkForOneNode(String nodeId) { namesystem.writeLock(); try { // blocks should not be replicated or removed if safe mode is on if (namesystem.isInSafeMode()) return 0; // get blocks to invalidate for the nodeId assert nodeId != null; DatanodeDescriptor dn = namesystem.getDatanode(nodeId); if (dn == null) { removeFromInvalidates(nodeId); return 0; } Collection invalidateSet = recentInvalidateSets.get(nodeId); if (invalidateSet == null) return 0; ArrayList blocksToInvalidate = new ArrayList( namesystem.blockInvalidateLimit); // # blocks that can be sent in one message is limited Iterator it = invalidateSet.iterator(); for (int blkCount = 0; blkCount < namesystem.blockInvalidateLimit && it.hasNext(); blkCount++) { blocksToInvalidate.add(it.next()); it.remove(); } // If we send everything in this message, remove this node entry if (!it.hasNext()) { removeFromInvalidates(nodeId); } dn.addBlocksToBeInvalidated(blocksToInvalidate); if (NameNode.stateChangeLog.isInfoEnabled()) { StringBuilder blockList = new StringBuilder(); for (Block blk : blocksToInvalidate) { blockList.append(' '); blockList.append(blk); } NameNode.stateChangeLog.info("BLOCK* ask " + dn.getName() + " to delete " + blockList); } pendingDeletionBlocksCount -= blocksToInvalidate.size(); return blocksToInvalidate.size(); } finally { namesystem.writeUnlock(); } } //Returns the number of racks over which a given block is replicated //decommissioning/decommissioned nodes are not counted. 
corrupt replicas //are also ignored int getNumberOfRacks(Block b) { HashSet rackSet = new HashSet(0); Collection corruptNodes = corruptReplicas.getNodes(b); for (Iterator it = blocksMap.nodeIterator(b); it.hasNext();) { DatanodeDescriptor cur = it.next(); if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if ((corruptNodes == null ) || !corruptNodes.contains(cur)) { String rackName = cur.getNetworkLocation(); if (!rackSet.contains(rackName)) { rackSet.add(rackName); } } } } return rackSet.size(); } boolean blockHasEnoughRacks(Block b) { if (!this.shouldCheckForEnoughRacks) { return true; } boolean enoughRacks = false;; Collection corruptNodes = corruptReplicas.getNodes(b); int numExpectedReplicas = getReplication(b); String rackName = null; for (Iterator it = blocksMap.nodeIterator(b); it.hasNext();) { DatanodeDescriptor cur = it.next(); if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if ((corruptNodes == null ) || !corruptNodes.contains(cur)) { if (numExpectedReplicas == 1) { enoughRacks = true; break; } String rackNameNew = cur.getNetworkLocation(); if (rackName == null) { rackName = rackNameNew; } else if (!rackName.equals(rackNameNew)) { enoughRacks = true; break; } } } } return enoughRacks; } boolean isNeededReplication(Block b, int expectedReplication, int curReplicas) { if ((curReplicas >= expectedReplication) && (blockHasEnoughRacks(b))) { return false; } else { return true; } } long getMissingBlocksCount() { // not locking return Math.max(missingBlocksInPrevIter, missingBlocksInCurIter); } BlockInfo addINode(BlockInfo block, INodeFile iNode) { return blocksMap.addINode(block, iNode); } INodeFile getINode(Block b) { return blocksMap.getINode(b); } void removeFromCorruptReplicasMap(Block block) { corruptReplicas.removeFromCorruptReplicasMap(block); } int numCorruptReplicas(Block block) { return corruptReplicas.numCorruptReplicas(block); } void removeBlockFromMap(Block block) { blocksMap.removeBlock(block); } int getCapacity() { namesystem.readLock(); try { return blocksMap.getCapacity(); } finally { namesystem.readUnlock(); } } /** * Return a range of corrupt replica block ids. Up to numExpectedBlocks * blocks starting at the next block after startingBlockId are returned * (fewer if numExpectedBlocks blocks are unavailable). If startingBlockId * is null, up to numExpectedBlocks blocks are returned from the beginning. * If startingBlockId cannot be found, null is returned. * * @param numExpectedBlocks Number of block ids to return. * 0 <= numExpectedBlocks <= 100 * @param startingBlockId Block id from which to start. If null, start at * beginning. * @return Up to numExpectedBlocks blocks from startingBlockId if it exists * */ long[] getCorruptReplicaBlockIds(int numExpectedBlocks, Long startingBlockId) { return corruptReplicas.getCorruptReplicaBlockIds(numExpectedBlocks, startingBlockId); } /** * Return an iterator over the set of blocks for which there are no replicas. */ BlockIterator getCorruptReplicaBlockIterator() { return neededReplications .iterator(UnderReplicatedBlocks.QUEUE_WITH_CORRUPT_BLOCKS); } }




