All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hdfs.server.blockmanagement.BlockManager Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.blockmanagement;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportIterator;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;

import static org.apache.hadoop.util.ExitUtil.terminate;

import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.server.namenode.INodeFileUnderConstruction;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.util.LightWeightLinkedSet;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.util.Daemon;

import com.google.common.annotations.VisibleForTesting;

/**
 * Keeps information related to the blocks stored in the Hadoop cluster.
 */
@InterfaceAudience.Private
public class BlockManager {

  static final Log LOG = LogFactory.getLog(BlockManager.class);
  static final Log blockLog = NameNode.blockStateChangeLog;

  /** Default load factor of map */
  public static final float DEFAULT_MAP_LOAD_FACTOR = 0.75f;

  private final Namesystem namesystem;

  private final DatanodeManager datanodeManager;
  private final HeartbeatManager heartbeatManager;
  private final BlockTokenSecretManager blockTokenSecretManager;

  private volatile long pendingReplicationBlocksCount = 0L;
  private volatile long corruptReplicaBlocksCount = 0L;
  private volatile long underReplicatedBlocksCount = 0L;
  private volatile long scheduledReplicationBlocksCount = 0L;
  private volatile long excessBlocksCount = 0L;
  
  /** Used by metrics */
  public long getPendingReplicationBlocksCount() {
    return pendingReplicationBlocksCount;
  }
  /** Used by metrics */
  public long getUnderReplicatedBlocksCount() {
    return underReplicatedBlocksCount;
  }
  /** Used by metrics */
  public long getCorruptReplicaBlocksCount() {
    return corruptReplicaBlocksCount;
  }
  /** Used by metrics */
  public long getScheduledReplicationBlocksCount() {
    return scheduledReplicationBlocksCount;
  }
  /** Used by metrics */
  public long getPendingDeletionBlocksCount() {
    return invalidateBlocks.numBlocks();
  }
  /** Used by metrics */
  public long getExcessBlocksCount() {
    return excessBlocksCount;
  }

  /**replicationRecheckInterval is how often namenode checks for new replication work*/
  private final long replicationRecheckInterval;
  
  /**
   * Mapping: Block -> { INode, datanodes, self ref }
   * Updated only in response to client-sent information.
   */
  final BlocksMap blocksMap;

  /** Replication thread. */
  final Daemon replicationThread = new Daemon(new ReplicationMonitor());
  
  /** Store blocks -> datanodedescriptor(s) map of corrupt replicas */
  final CorruptReplicasMap corruptReplicas = new CorruptReplicasMap();

  /** Blocks to be invalidated. */
  private final InvalidateBlocks invalidateBlocks;

  //
  // Keeps a TreeSet for every named node. Each treeset contains
  // a list of the blocks that are "extra" at that location. We'll
  // eventually remove these extras.
  // Mapping: StorageID -> TreeSet
  //
  public final Map> excessReplicateMap =
    new TreeMap>();

  //
  // Store set of Blocks that need to be replicated 1 or more times.
  // We also store pending replication-orders.
  //
  public final UnderReplicatedBlocks neededReplications = new UnderReplicatedBlocks();
  @VisibleForTesting
  final PendingReplicationBlocks pendingReplications;

  /** The maximum number of replicas allowed for a block */
  public final short maxReplication;
  /**
   * The maximum number of outgoing replication streams a given node should have
   * at one time considering all but the highest priority replications needed.
   */
  int maxReplicationStreams;
  /**
   * The maximum number of outgoing replication streams a given node should have
   * at one time.
   */
  int replicationStreamsHardLimit;
  /** Minimum copies needed or else write is disallowed */
  public final short minReplication;
  /** Default number of replicas */
  public final int defaultReplication;
  /** The maximum number of entries returned by getCorruptInodes() */
  final int maxCorruptFilesReturned;

  /** variable to enable check for enough racks */
  final boolean shouldCheckForEnoughRacks;

  /** for block replicas placement */
  private BlockPlacementPolicy blockplacement;
  
  public BlockManager(final Namesystem namesystem, final FSClusterStats stats,
      final Configuration conf) throws IOException {
    this.namesystem = namesystem;
    datanodeManager = new DatanodeManager(this, namesystem, conf);
    heartbeatManager = datanodeManager.getHeartbeatManager();
    invalidateBlocks = new InvalidateBlocks(datanodeManager);

    blocksMap = new BlocksMap(DEFAULT_MAP_LOAD_FACTOR);
    blockplacement = BlockPlacementPolicy.getInstance(
        conf, stats, datanodeManager.getNetworkTopology());
    pendingReplications = new PendingReplicationBlocks(conf.getInt(
      DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
      DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_DEFAULT) * 1000L);

    blockTokenSecretManager = createBlockTokenSecretManager(conf);

    this.maxCorruptFilesReturned = conf.getInt(
      DFSConfigKeys.DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED_KEY,
      DFSConfigKeys.DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED);
    this.defaultReplication = conf.getInt(DFSConfigKeys.DFS_REPLICATION_KEY, 
                                          DFSConfigKeys.DFS_REPLICATION_DEFAULT);

    final int maxR = conf.getInt(DFSConfigKeys.DFS_REPLICATION_MAX_KEY, 
                                 DFSConfigKeys.DFS_REPLICATION_MAX_DEFAULT);
    final int minR = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY,
                                 DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
    if (minR <= 0)
      throw new IOException("Unexpected configuration parameters: "
          + DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY
          + " = " + minR + " <= 0");
    if (maxR > Short.MAX_VALUE)
      throw new IOException("Unexpected configuration parameters: "
          + DFSConfigKeys.DFS_REPLICATION_MAX_KEY
          + " = " + maxR + " > " + Short.MAX_VALUE);
    if (minR > maxR)
      throw new IOException("Unexpected configuration parameters: "
          + DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY
          + " = " + minR + " > "
          + DFSConfigKeys.DFS_REPLICATION_MAX_KEY
          + " = " + maxR);
    this.minReplication = (short)minR;
    this.maxReplication = (short)maxR;

    this.maxReplicationStreams =
        conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY,
            DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_DEFAULT);
    this.replicationStreamsHardLimit =
        conf.getInt(
            DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_KEY,
            DFSConfigKeys.DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_DEFAULT);
    this.shouldCheckForEnoughRacks =
        conf.get(DFSConfigKeys.NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY) != null;
    
    this.replicationRecheckInterval = 
      conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 
                  DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_DEFAULT) * 1000L;
    LOG.info("defaultReplication = " + defaultReplication);
    LOG.info("maxReplication     = " + maxReplication);
    LOG.info("minReplication     = " + minReplication);
    LOG.info("maxReplicationStreams      = " + maxReplicationStreams);
    LOG.info("shouldCheckForEnoughRacks  = " + shouldCheckForEnoughRacks);
    LOG.info("replicationRecheckInterval = " + replicationRecheckInterval);
  }

  private static BlockTokenSecretManager createBlockTokenSecretManager(
      final Configuration conf) throws IOException {
    final boolean isEnabled = conf.getBoolean(
        DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, 
        DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_DEFAULT);
    LOG.info(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY + "=" + isEnabled);

    if (!isEnabled) {
      return null;
    }

    final long updateMin = conf.getLong(
        DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY, 
        DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_DEFAULT);
    final long lifetimeMin = conf.getLong(
        DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY, 
        DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_DEFAULT);
    LOG.info(DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY
        + "=" + updateMin + " min(s), "
        + DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY
        + "=" + lifetimeMin + " min(s)");
    return new BlockTokenSecretManager(true,
        updateMin*60*1000L, lifetimeMin*60*1000L);
  }

  /** get the BlockTokenSecretManager */
  BlockTokenSecretManager getBlockTokenSecretManager() {
    return blockTokenSecretManager;
  }

  private boolean isBlockTokenEnabled() {
    return blockTokenSecretManager != null;
  }

  /** Should the access keys be updated? */
  boolean shouldUpdateBlockKey(final long updateTime) throws IOException {
    return isBlockTokenEnabled()? blockTokenSecretManager.updateKeys(updateTime)
        : false;
  }

  public void activate(Configuration conf) {
    pendingReplications.start();
    datanodeManager.activate(conf);
    this.replicationThread.start();
  }

  public void close() {
    if (pendingReplications != null) pendingReplications.stop();
    blocksMap.close();
    datanodeManager.close();
    if (replicationThread != null) replicationThread.interrupt();
  }

  /** @return the datanodeManager */
  public DatanodeManager getDatanodeManager() {
    return datanodeManager;
  }

  /** @return the BlockPlacementPolicy */
  public BlockPlacementPolicy getBlockPlacementPolicy() {
    return blockplacement;
  }

  /** Set BlockPlacementPolicy */
  public void setBlockPlacementPolicy(BlockPlacementPolicy newpolicy) {
    if (newpolicy == null) {
      throw new HadoopIllegalArgumentException("newpolicy == null");
    }
    this.blockplacement = newpolicy;
  }

  /** Dump meta data to out. */
  public void metaSave(PrintWriter out) {
    assert namesystem.hasWriteLock();
    final List live = new ArrayList();
    final List dead = new ArrayList();
    datanodeManager.fetchDatanodes(live, dead, false);
    out.println("Live Datanodes: " + live.size());
    out.println("Dead Datanodes: " + dead.size());
    //
    // Dump contents of neededReplication
    //
    synchronized (neededReplications) {
      out.println("Metasave: Blocks waiting for replication: " + 
                  neededReplications.size());
      for (Block block : neededReplications) {
        List containingNodes =
                                          new ArrayList();
        List containingLiveReplicasNodes =
          new ArrayList();
        
        NumberReplicas numReplicas = new NumberReplicas();
        // source node returned is not used
        chooseSourceDatanode(block, containingNodes,
            containingLiveReplicasNodes, numReplicas,
            UnderReplicatedBlocks.LEVEL);
        assert containingLiveReplicasNodes.size() == numReplicas.liveReplicas();
        int usableReplicas = numReplicas.liveReplicas() +
                             numReplicas.decommissionedReplicas();
       
        if (block instanceof BlockInfo) {
          String fileName = ((BlockInfo)block).getINode().getFullPathName();
          out.print(fileName + ": ");
        }
        // l: == live:, d: == decommissioned c: == corrupt e: == excess
        out.print(block + ((usableReplicas > 0)? "" : " MISSING") + 
                  " (replicas:" +
                  " l: " + numReplicas.liveReplicas() +
                  " d: " + numReplicas.decommissionedReplicas() +
                  " c: " + numReplicas.corruptReplicas() +
                  " e: " + numReplicas.excessReplicas() + ") "); 

        Collection corruptNodes = 
                                      corruptReplicas.getNodes(block);
        
        for (Iterator jt = blocksMap.nodeIterator(block);
             jt.hasNext();) {
          DatanodeDescriptor node = jt.next();
          String state = "";
          if (corruptNodes != null && corruptNodes.contains(node)) {
            state = "(corrupt)";
          } else if (node.isDecommissioned() || 
              node.isDecommissionInProgress()) {
            state = "(decommissioned)";
          }          
          out.print(" " + node + state + " : ");
        }
        out.println("");
      }
    }

    // Dump blocks from pendingReplication
    pendingReplications.metaSave(out);

    // Dump blocks that are waiting to be deleted
    invalidateBlocks.dump(out);

    // Dump all datanodes
    getDatanodeManager().datanodeDump(out);
  }

  /** @return maxReplicationStreams */
  public int getMaxReplicationStreams() {
    return maxReplicationStreams;
  }

  /**
   * @param block
   * @return true if the block has minimum replicas
   */
  public boolean checkMinReplication(Block block) {
    return (countNodes(block).liveReplicas() >= minReplication);
  }

  /**
   * Commit a block of a file
   * 
   * @param block block to be committed
   * @param commitBlock - contains client reported block length and generation
   * @return true if the block is changed to committed state.
   * @throws IOException if the block does not have at least a minimal number
   * of replicas reported from data-nodes.
   */
  private boolean commitBlock(final BlockInfoUnderConstruction block,
      final Block commitBlock) throws IOException {
    if (block.getBlockUCState() == BlockUCState.COMMITTED)
      return false;
    assert block.getNumBytes() <= commitBlock.getNumBytes() :
      "commitBlock length is less than the stored one "
      + commitBlock.getNumBytes() + " vs. " + block.getNumBytes();
    block.commitBlock(commitBlock);
    return true;
  }
  
  /**
   * Commit the last block of the file and mark it as complete if it has
   * meets the minimum replication requirement
   * 
   * @param fileINode file inode
   * @param commitBlock - contains client reported block length and generation
   * @return true if the last block is changed to committed state.
   * @throws IOException if the block does not have at least a minimal number
   * of replicas reported from data-nodes.
   */
  public boolean commitOrCompleteLastBlock(INodeFileUnderConstruction fileINode, 
      Block commitBlock) throws IOException {
    if(commitBlock == null)
      return false; // not committing, this is a block allocation retry
    BlockInfo lastBlock = fileINode.getLastBlock();
    if(lastBlock == null)
      return false; // no blocks in file yet
    if(lastBlock.isComplete())
      return false; // already completed (e.g. by syncBlock)
    
    final boolean b = commitBlock((BlockInfoUnderConstruction)lastBlock, commitBlock);
    if(countNodes(lastBlock).liveReplicas() >= minReplication)
      completeBlock(fileINode,fileINode.numBlocks()-1);
    return b;
  }

  /**
   * Convert a specified block of the file to a complete block.
   * @param fileINode file
   * @param blkIndex  block index in the file
   * @throws IOException if the block does not have at least a minimal number
   * of replicas reported from data-nodes.
   */
  private BlockInfo completeBlock(final INodeFile fileINode,
      final int blkIndex) throws IOException {
    return completeBlock(fileINode, blkIndex, false);
  }

  public BlockInfo completeBlock(final INodeFile fileINode, 
      final int blkIndex, final boolean force) throws IOException {
    if(blkIndex < 0)
      return null;
    BlockInfo curBlock = fileINode.getBlocks()[blkIndex];
    if(curBlock.isComplete())
      return curBlock;
    BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)curBlock;
    if(!force && ucBlock.numNodes() < minReplication)
      throw new IOException("Cannot complete block: " +
          "block does not satisfy minimal replication requirement.");
    if(!force && ucBlock.getBlockUCState() != BlockUCState.COMMITTED)
      throw new IOException(
          "Cannot complete block: block has not been COMMITTED by the client");
    BlockInfo completeBlock = ucBlock.convertToCompleteBlock();
    // replace penultimate block in file
    fileINode.setBlock(blkIndex, completeBlock);
    // replace block in the blocksMap
    return blocksMap.replaceBlock(completeBlock);
  }

  private BlockInfo completeBlock(final INodeFile fileINode,
      final BlockInfo block) throws IOException {
    BlockInfo[] fileBlocks = fileINode.getBlocks();
    for(int idx = 0; idx < fileBlocks.length; idx++)
      if(fileBlocks[idx] == block) {
        return completeBlock(fileINode, idx);
      }
    return block;
  }

  /**
   * Convert the last block of the file to an under construction block.

* The block is converted only if the file has blocks and the last one * is a partial block (its size is less than the preferred block size). * The converted block is returned to the client. * The client uses the returned block locations to form the data pipeline * for this block.
* The methods returns null if there is no partial block at the end. * The client is supposed to allocate a new block with the next call. * * @param fileINode file * @return the last block locations if the block is partial or null otherwise */ public LocatedBlock convertLastBlockToUnderConstruction( INodeFileUnderConstruction fileINode) throws IOException { BlockInfo oldBlock = fileINode.getLastBlock(); if(oldBlock == null || fileINode.getPreferredBlockSize() == oldBlock.getNumBytes()) return null; assert oldBlock == getStoredBlock(oldBlock) : "last block of the file is not in blocksMap"; DatanodeDescriptor[] targets = getNodes(oldBlock); BlockInfoUnderConstruction ucBlock = fileINode.setLastBlock(oldBlock, targets); blocksMap.replaceBlock(ucBlock); // Remove block from replication queue. updateNeededReplications(oldBlock, 0, 0); // remove this block from the list of pending blocks to be deleted. for (DatanodeDescriptor dd : targets) { String datanodeId = dd.getStorageID(); invalidateBlocks.remove(datanodeId, oldBlock); } final long fileLength = fileINode.computeContentSummary().getLength(); final long pos = fileLength - ucBlock.getNumBytes(); return createLocatedBlock(ucBlock, pos, AccessMode.WRITE); } /** * Get all valid locations of the block */ private List getValidLocations(Block block) { ArrayList machineSet = new ArrayList(blocksMap.numNodes(block)); for(Iterator it = blocksMap.nodeIterator(block); it.hasNext();) { String storageID = it.next().getStorageID(); // filter invalidate replicas if(!invalidateBlocks.contains(storageID, block)) { machineSet.add(storageID); } } return machineSet; } private List createLocatedBlockList(final BlockInfo[] blocks, final long offset, final long length, final int nrBlocksToReturn, final AccessMode mode) throws IOException { int curBlk = 0; long curPos = 0, blkSize = 0; int nrBlocks = (blocks[0].getNumBytes() == 0) ? 0 : blocks.length; for (curBlk = 0; curBlk < nrBlocks; curBlk++) { blkSize = blocks[curBlk].getNumBytes(); assert blkSize > 0 : "Block of size 0"; if (curPos + blkSize > offset) { break; } curPos += blkSize; } if (nrBlocks > 0 && curBlk == nrBlocks) // offset >= end of file return Collections.emptyList(); long endOff = offset + length; List results = new ArrayList(blocks.length); do { results.add(createLocatedBlock(blocks[curBlk], curPos, mode)); curPos += blocks[curBlk].getNumBytes(); curBlk++; } while (curPos < endOff && curBlk < blocks.length && results.size() < nrBlocksToReturn); return results; } private LocatedBlock createLocatedBlock(final BlockInfo blk, final long pos, final BlockTokenSecretManager.AccessMode mode) throws IOException { final LocatedBlock lb = createLocatedBlock(blk, pos); if (mode != null) { setBlockToken(lb, mode); } return lb; } /** @return a LocatedBlock for the given block */ private LocatedBlock createLocatedBlock(final BlockInfo blk, final long pos ) throws IOException { if (blk instanceof BlockInfoUnderConstruction) { if (blk.isComplete()) { throw new IOException( "blk instanceof BlockInfoUnderConstruction && blk.isComplete()" + ", blk=" + blk); } final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)blk; final DatanodeDescriptor[] locations = uc.getExpectedLocations(); final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk); return new LocatedBlock(eb, locations, pos, false); } // get block locations final int numCorruptNodes = countNodes(blk).corruptReplicas(); final int numCorruptReplicas = corruptReplicas.numCorruptReplicas(blk); if (numCorruptNodes != numCorruptReplicas) { LOG.warn("Inconsistent number of corrupt replicas for " + blk + " blockMap has " + numCorruptNodes + " but corrupt replicas map has " + numCorruptReplicas); } final int numNodes = blocksMap.numNodes(blk); final boolean isCorrupt = numCorruptNodes == numNodes; final int numMachines = isCorrupt ? numNodes: numNodes - numCorruptNodes; final DatanodeDescriptor[] machines = new DatanodeDescriptor[numMachines]; if (numMachines > 0) { int j = 0; for(Iterator it = blocksMap.nodeIterator(blk); it.hasNext();) { final DatanodeDescriptor d = it.next(); final boolean replicaCorrupt = corruptReplicas.isReplicaCorrupt(blk, d); if (isCorrupt || (!isCorrupt && !replicaCorrupt)) machines[j++] = d; } } final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk); return new LocatedBlock(eb, machines, pos, isCorrupt); } /** Create a LocatedBlocks. */ public LocatedBlocks createLocatedBlocks(final BlockInfo[] blocks, final long fileSizeExcludeBlocksUnderConstruction, final boolean isFileUnderConstruction, final long offset, final long length, final boolean needBlockToken ) throws IOException { assert namesystem.hasReadOrWriteLock(); if (blocks == null) { return null; } else if (blocks.length == 0) { return new LocatedBlocks(0, isFileUnderConstruction, Collections.emptyList(), null, false); } else { if (LOG.isDebugEnabled()) { LOG.debug("blocks = " + java.util.Arrays.asList(blocks)); } final AccessMode mode = needBlockToken? AccessMode.READ: null; final List locatedblocks = createLocatedBlockList( blocks, offset, length, Integer.MAX_VALUE, mode); final BlockInfo last = blocks[blocks.length - 1]; final long lastPos = last.isComplete()? fileSizeExcludeBlocksUnderConstruction - last.getNumBytes() : fileSizeExcludeBlocksUnderConstruction; final LocatedBlock lastlb = createLocatedBlock(last, lastPos, mode); return new LocatedBlocks( fileSizeExcludeBlocksUnderConstruction, isFileUnderConstruction, locatedblocks, lastlb, last.isComplete()); } } /** @return current access keys. */ public ExportedBlockKeys getBlockKeys() { return isBlockTokenEnabled()? blockTokenSecretManager.exportKeys() : ExportedBlockKeys.DUMMY_KEYS; } /** Generate a block token for the located block. */ public void setBlockToken(final LocatedBlock b, final BlockTokenSecretManager.AccessMode mode) throws IOException { if (isBlockTokenEnabled()) { b.setBlockToken(blockTokenSecretManager.generateToken(b.getBlock(), EnumSet.of(mode))); } } void addKeyUpdateCommand(final List cmds, final DatanodeDescriptor nodeinfo) { // check access key update if (isBlockTokenEnabled() && nodeinfo.needKeyUpdate) { cmds.add(new KeyUpdateCommand(blockTokenSecretManager.exportKeys())); nodeinfo.needKeyUpdate = false; } } /** * Clamp the specified replication between the minimum and the maximum * replication levels. */ public short adjustReplication(short replication) { return replication < minReplication? minReplication : replication > maxReplication? maxReplication: replication; } /** * Check whether the replication parameter is within the range * determined by system configuration. */ public void verifyReplication(String src, short replication, String clientName) throws IOException { if (replication >= minReplication && replication <= maxReplication) { //common case. avoid building 'text' return; } String text = "file " + src + ((clientName != null) ? " on client " + clientName : "") + ".\n" + "Requested replication " + replication; if (replication > maxReplication) throw new IOException(text + " exceeds maximum " + maxReplication); if (replication < minReplication) throw new IOException(text + " is less than the required minimum " + minReplication); } /** * return a list of blocks & their locations on datanode whose * total size is size * * @param datanode on which blocks are located * @param size total size of blocks */ public BlocksWithLocations getBlocks(DatanodeID datanode, long size ) throws IOException { namesystem.readLock(); try { namesystem.checkSuperuserPrivilege(); return getBlocksWithLocations(datanode, size); } finally { namesystem.readUnlock(); } } /** Get all blocks with location information from a datanode. */ private BlocksWithLocations getBlocksWithLocations(final DatanodeID datanode, final long size) throws UnregisteredNodeException { final DatanodeDescriptor node = getDatanodeManager().getDatanode(datanode); if (node == null) { blockLog.warn("BLOCK* getBlocks: " + "Asking for blocks from an unrecorded node " + datanode.getName()); throw new HadoopIllegalArgumentException( "Datanode " + datanode.getName() + " not found."); } int numBlocks = node.numBlocks(); if(numBlocks == 0) { return new BlocksWithLocations(new BlockWithLocations[0]); } Iterator iter = node.getBlockIterator(); int startBlock = DFSUtil.getRandom().nextInt(numBlocks); // starting from a random block // skip blocks for(int i=0; i results = new ArrayList(); long totalSize = 0; BlockInfo curBlock; while(totalSize it = node.getBlockIterator(); while(it.hasNext()) { removeStoredBlock(it.next(), node); } node.resetBlocks(); invalidateBlocks.remove(node.getStorageID()); } /** * Adds block to list of blocks which will be invalidated on specified * datanode and log the operation */ void addToInvalidates(final Block block, final DatanodeInfo datanode) { invalidateBlocks.add(block, datanode, true); } /** * Adds block to list of blocks which will be invalidated on all its * datanodes. */ private void addToInvalidates(Block b) { StringBuilder datanodes = new StringBuilder(); for (Iterator it = blocksMap.nodeIterator(b); it .hasNext();) { DatanodeDescriptor node = it.next(); invalidateBlocks.add(b, node, false); datanodes.append(node.getName()).append(" "); } if (datanodes.length() != 0) { blockLog.info("BLOCK* addToInvalidates: " + b + " to " + datanodes.toString()); } } /** * Mark the block belonging to datanode as corrupt * @param blk Block to be marked as corrupt * @param dn Datanode which holds the corrupt replica * @param reason a textual reason why the block should be marked corrupt, * for logging purposes */ public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk, final DatanodeInfo dn, String reason) throws IOException { namesystem.writeLock(); try { final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock()); if (storedBlock == null) { // Check if the replica is in the blockMap, if not // ignore the request for now. This could happen when BlockScanner // thread of Datanode reports bad block before Block reports are sent // by the Datanode on startup blockLog.info("BLOCK* findAndMarkBlockAsCorrupt: " + blk + " not found."); return; } markBlockAsCorrupt(storedBlock, dn, reason); } finally { namesystem.writeUnlock(); } } private void markBlockAsCorrupt(BlockInfo storedBlock, DatanodeInfo dn, String reason) throws IOException { assert storedBlock != null : "storedBlock should not be null"; DatanodeDescriptor node = getDatanodeManager().getDatanode(dn); if (node == null) { throw new IOException("Cannot mark block " + storedBlock.getBlockName() + " as corrupt because datanode " + dn.getName() + " does not exist. "); } INodeFile inode = storedBlock.getINode(); if (inode == null) { blockLog.info("BLOCK markBlockAsCorrupt: " + "block " + storedBlock + " could not be marked as corrupt as it" + " does not belong to any file"); addToInvalidates(storedBlock, node); return; } // Add replica to the data-node if it is not already there node.addBlock(storedBlock); // Add this replica to corruptReplicas Map corruptReplicas.addToCorruptReplicasMap(storedBlock, node, reason); if (countNodes(storedBlock).liveReplicas() >= inode.getReplication()) { // the block is over-replicated so invalidate the replicas immediately invalidateBlock(storedBlock, node); } else if (namesystem.isPopulatingReplQueues()) { // add the block to neededReplication updateNeededReplications(storedBlock, -1, 0); } } /** * Invalidates the given block on the given datanode. */ private void invalidateBlock(Block blk, DatanodeInfo dn) throws IOException { blockLog.info("BLOCK* invalidateBlock: " + blk + " on " + dn.getName()); DatanodeDescriptor node = getDatanodeManager().getDatanode(dn); if (node == null) { throw new IOException("Cannot invalidate block " + blk + " because datanode " + dn.getName() + " does not exist."); } // Check how many copies we have of the block. If we have at least one // copy on a live node, then we can delete it. int count = countNodes(blk).liveReplicas(); if (count >= 1) { addToInvalidates(blk, dn); removeStoredBlock(blk, node); if(blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* invalidateBlocks: " + blk + " on " + dn.getName() + " listed for deletion."); } } else { blockLog.info("BLOCK* invalidateBlocks: " + blk + " on " + dn.getName() + " is the only copy and was not deleted."); } } void updateState() { pendingReplicationBlocksCount = pendingReplications.size(); underReplicatedBlocksCount = neededReplications.size(); corruptReplicaBlocksCount = corruptReplicas.size(); } /** Return number of under-replicated but not missing blocks */ public int getUnderReplicatedNotMissingBlocks() { return neededReplications.getUnderReplicatedBlockCount(); } /** * Schedule blocks for deletion at datanodes * @param nodesToProcess number of datanodes to schedule deletion work * @return total number of block for deletion */ int computeInvalidateWork(int nodesToProcess) { final List nodes = invalidateBlocks.getStorageIDs(); Collections.shuffle(nodes); nodesToProcess = Math.min(nodes.size(), nodesToProcess); int blockCnt = 0; for(int nodeCnt = 0; nodeCnt < nodesToProcess; nodeCnt++ ) { blockCnt += invalidateWorkForOneNode(nodes.get(nodeCnt)); } return blockCnt; } /** * Scan blocks in {@link #neededReplications} and assign replication * work to data-nodes they belong to. * * The number of process blocks equals either twice the number of live * data-nodes or the number of under-replicated blocks whichever is less. * * @return number of blocks scheduled for replication during this iteration. */ private int computeReplicationWork(int blocksToProcess) throws IOException { List> blocksToReplicate = null; namesystem.writeLock(); try { // Choose the blocks to be replicated blocksToReplicate = neededReplications .chooseUnderReplicatedBlocks(blocksToProcess); } finally { namesystem.writeUnlock(); } return computeReplicationWorkForBlocks(blocksToReplicate); } /** Replicate a set of blocks * * @param blocksToReplicate blocks to be replicated, for each priority * @return the number of blocks scheduled for replication */ @VisibleForTesting int computeReplicationWorkForBlocks(List> blocksToReplicate) { int requiredReplication, numEffectiveReplicas; List containingNodes, liveReplicaNodes; DatanodeDescriptor srcNode; INodeFile fileINode = null; int additionalReplRequired; int scheduledWork = 0; List work = new LinkedList(); namesystem.writeLock(); try { synchronized (neededReplications) { for (int priority = 0; priority < blocksToReplicate.size(); priority++) { for (Block block : blocksToReplicate.get(priority)) { // block should belong to a file fileINode = blocksMap.getINode(block); // abandoned block or block reopened for append if(fileINode == null || fileINode.isUnderConstruction()) { neededReplications.remove(block, priority); // remove from neededReplications neededReplications.decrementReplicationIndex(priority); continue; } requiredReplication = fileINode.getReplication(); // get a source data-node containingNodes = new ArrayList(); liveReplicaNodes = new ArrayList(); NumberReplicas numReplicas = new NumberReplicas(); srcNode = chooseSourceDatanode( block, containingNodes, liveReplicaNodes, numReplicas, priority); if(srcNode == null) // block can not be replicated from any node continue; assert liveReplicaNodes.size() == numReplicas.liveReplicas(); // do not schedule more if enough replicas is already pending numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(block); if (numEffectiveReplicas >= requiredReplication) { if ( (pendingReplications.getNumReplicas(block) > 0) || (blockHasEnoughRacks(block)) ) { neededReplications.remove(block, priority); // remove from neededReplications neededReplications.decrementReplicationIndex(priority); blockLog.info("BLOCK* " + "Removing block " + block + " from neededReplications as it has enough replicas."); continue; } } if (numReplicas.liveReplicas() < requiredReplication) { additionalReplRequired = requiredReplication - numEffectiveReplicas; } else { additionalReplRequired = 1; // Needed on a new rack } work.add(new ReplicationWork(block, fileINode, srcNode, containingNodes, liveReplicaNodes, additionalReplRequired, priority)); } } } } finally { namesystem.writeUnlock(); } HashMap excludedNodes = new HashMap(); for(ReplicationWork rw : work){ // Exclude all of the containing nodes from being targets. // This list includes decommissioning or corrupt nodes. excludedNodes.clear(); for (DatanodeDescriptor dn : rw.containingNodes) { excludedNodes.put(dn, dn); } // choose replication targets: NOT HOLDING THE GLOBAL LOCK // It is costly to extract the filename for which chooseTargets is called, // so for now we pass in the Inode itself. rw.targets = blockplacement.chooseTarget(rw.fileINode, rw.additionalReplRequired, rw.srcNode, rw.liveReplicaNodes, excludedNodes, rw.block.getNumBytes()); } namesystem.writeLock(); try { for(ReplicationWork rw : work){ DatanodeDescriptor[] targets = rw.targets; if(targets == null || targets.length == 0){ rw.targets = null; continue; } synchronized (neededReplications) { Block block = rw.block; int priority = rw.priority; // Recheck since global lock was released // block should belong to a file fileINode = blocksMap.getINode(block); // abandoned block or block reopened for append if(fileINode == null || fileINode.isUnderConstruction()) { neededReplications.remove(block, priority); // remove from neededReplications rw.targets = null; neededReplications.decrementReplicationIndex(priority); continue; } requiredReplication = fileINode.getReplication(); // do not schedule more if enough replicas is already pending NumberReplicas numReplicas = countNodes(block); numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(block); if (numEffectiveReplicas >= requiredReplication) { if ( (pendingReplications.getNumReplicas(block) > 0) || (blockHasEnoughRacks(block)) ) { neededReplications.remove(block, priority); // remove from neededReplications neededReplications.decrementReplicationIndex(priority); rw.targets = null; NameNode.stateChangeLog.info("BLOCK* " + "Removing block " + block + " from neededReplications as it has enough replicas."); continue; } } if ( (numReplicas.liveReplicas() >= requiredReplication) && (!blockHasEnoughRacks(block)) ) { if (rw.srcNode.getNetworkLocation().equals(targets[0].getNetworkLocation())) { //No use continuing, unless a new rack in this case continue; } } // Add block to the to be replicated list rw.srcNode.addBlockToBeReplicated(block, targets); scheduledWork++; for (DatanodeDescriptor dn : targets) { dn.incBlocksScheduled(); } // Move the block-replication into a "pending" state. // The reason we use 'pending' is so we can retry // replications that fail after an appropriate amount of time. pendingReplications.increment(block, targets.length); if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* block " + block + " is moved from neededReplications to pendingReplications"); } // remove from neededReplications if(numEffectiveReplicas + targets.length >= requiredReplication) { neededReplications.remove(block, priority); // remove from neededReplications neededReplications.decrementReplicationIndex(priority); } } } } finally { namesystem.writeUnlock(); } if (blockLog.isInfoEnabled()) { // log which blocks have been scheduled for replication for(ReplicationWork rw : work){ DatanodeDescriptor[] targets = rw.targets; if (targets != null && targets.length != 0) { StringBuilder targetList = new StringBuilder("datanode(s)"); for (int k = 0; k < targets.length; k++) { targetList.append(' '); targetList.append(targets[k].getName()); } blockLog.info( "BLOCK* ask " + rw.srcNode.getName() + " to replicate " + rw.block + " to " + targetList); } } } if(NameNode.stateChangeLog.isDebugEnabled()) { NameNode.stateChangeLog.debug( "BLOCK* neededReplications = " + neededReplications.size() + " pendingReplications = " + pendingReplications.size()); } return scheduledWork; } /** * Choose target datanodes according to the replication policy. * * @throws IOException * if the number of targets < minimum replication. * @see BlockPlacementPolicy#chooseTarget(String, int, DatanodeDescriptor, * List, boolean, HashMap, long) */ public DatanodeDescriptor[] chooseTarget(final String src, final int numOfReplicas, final DatanodeDescriptor client, final HashMap excludedNodes, final long blocksize) throws IOException { // choose targets for the new block to be allocated. final DatanodeDescriptor targets[] = blockplacement.chooseTarget( src, numOfReplicas, client, excludedNodes, blocksize); if (targets.length < minReplication) { throw new IOException("File " + src + " could only be replicated to " + targets.length + " nodes instead of minReplication (=" + minReplication + "). There are " + getDatanodeManager().getNetworkTopology().getNumOfLeaves() + " datanode(s) running and " + (excludedNodes == null? "no": excludedNodes.size()) + " node(s) are excluded in this operation."); } return targets; } /** * Parse the data-nodes the block belongs to and choose one, * which will be the replication source. * * We prefer nodes that are in DECOMMISSION_INPROGRESS state to other nodes * since the former do not have write traffic and hence are less busy. * We do not use already decommissioned nodes as a source. * Otherwise we choose a random node among those that did not reach their * replication limits. However, if the replication is of the highest priority * and all nodes have reached their replication limits, we will choose a * random node despite the replication limit. * * In addition form a list of all nodes containing the block * and calculate its replication numbers. * * @param block Block for which a replication source is needed * @param containingNodes List to be populated with nodes found to contain the * given block * @param nodesContainingLiveReplicas List to be populated with nodes found to * contain live replicas of the given block * @param numReplicas NumberReplicas instance to be initialized with the * counts of live, corrupt, excess, and * decommissioned replicas of the given * block. * @param priority integer representing replication priority of the given * block * @return the DatanodeDescriptor of the chosen node from which to replicate * the given block */ @VisibleForTesting DatanodeDescriptor chooseSourceDatanode( Block block, List containingNodes, List nodesContainingLiveReplicas, NumberReplicas numReplicas, int priority) { containingNodes.clear(); nodesContainingLiveReplicas.clear(); DatanodeDescriptor srcNode = null; int live = 0; int decommissioned = 0; int corrupt = 0; int excess = 0; Iterator it = blocksMap.nodeIterator(block); Collection nodesCorrupt = corruptReplicas.getNodes(block); while(it.hasNext()) { DatanodeDescriptor node = it.next(); LightWeightLinkedSet excessBlocks = excessReplicateMap.get(node.getStorageID()); if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) corrupt++; else if (node.isDecommissionInProgress() || node.isDecommissioned()) decommissioned++; else if (excessBlocks != null && excessBlocks.contains(block)) { excess++; } else { nodesContainingLiveReplicas.add(node); live++; } containingNodes.add(node); // Check if this replica is corrupt // If so, do not select the node as src node if ((nodesCorrupt != null) && nodesCorrupt.contains(node)) continue; if(priority != UnderReplicatedBlocks.QUEUE_HIGHEST_PRIORITY && node.getNumberOfBlocksToBeReplicated() >= maxReplicationStreams) { continue; // already reached replication limit } if (node.getNumberOfBlocksToBeReplicated() >= replicationStreamsHardLimit) { continue; } // the block must not be scheduled for removal on srcNode if(excessBlocks != null && excessBlocks.contains(block)) continue; // never use already decommissioned nodes if(node.isDecommissioned()) continue; // we prefer nodes that are in DECOMMISSION_INPROGRESS state if(node.isDecommissionInProgress() || srcNode == null) { srcNode = node; continue; } if(srcNode.isDecommissionInProgress()) continue; // switch to a different node randomly // this to prevent from deterministically selecting the same node even // if the node failed to replicate the block on previous iterations if(DFSUtil.getRandom().nextBoolean()) srcNode = node; } if(numReplicas != null) numReplicas.initialize(live, decommissioned, corrupt, excess); return srcNode; } /** * If there were any replication requests that timed out, reap them * and put them back into the neededReplication queue */ private void processPendingReplications() { Block[] timedOutItems = pendingReplications.getTimedOutBlocks(); if (timedOutItems != null) { namesystem.writeLock(); try { for (int i = 0; i < timedOutItems.length; i++) { NumberReplicas num = countNodes(timedOutItems[i]); if (isNeededReplication(timedOutItems[i], getReplication(timedOutItems[i]), num.liveReplicas())) { neededReplications.add(timedOutItems[i], num.liveReplicas(), num.decommissionedReplicas(), getReplication(timedOutItems[i])); } } } finally { namesystem.writeUnlock(); } /* If we know the target datanodes where the replication timedout, * we could invoke decBlocksScheduled() on it. Its ok for now. */ } } /** * StatefulBlockInfo is used to build the "toUC" list, which is a list of * updates to the information about under-construction blocks. * Besides the block in question, it provides the ReplicaState * reported by the datanode in the block report. */ private static class StatefulBlockInfo { final BlockInfoUnderConstruction storedBlock; final ReplicaState reportedState; StatefulBlockInfo(BlockInfoUnderConstruction storedBlock, ReplicaState reportedState) { this.storedBlock = storedBlock; this.reportedState = reportedState; } } /** * BlockToMarkCorrupt is used to build the "toCorrupt" list, which is a * list of blocks that should be considered corrupt due to a block report. */ private static class BlockToMarkCorrupt { final BlockInfo blockInfo; final String reason; BlockToMarkCorrupt(BlockInfo blockInfo, String reason) { super(); this.blockInfo = blockInfo; this.reason = reason; } } /** * The given datanode is reporting all its blocks. * Update the (machine-->blocklist) and (block-->machinelist) maps. */ public void processReport(final DatanodeID nodeID, final String poolId, final BlockListAsLongs newReport) throws IOException { namesystem.writeLock(); final long startTime = Util.now(); //after acquiring write lock final long endTime; try { final DatanodeDescriptor node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isAlive) { throw new IOException("ProcessReport from dead or unregistered node: " + nodeID.getName()); } // To minimize startup time, we discard any second (or later) block reports // that we receive while still in startup phase. if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) { blockLog.info("BLOCK* processReport: " + "discarded non-initial block report from " + nodeID.getName() + " because namenode still in startup phase"); return; } if (node.numBlocks() == 0) { // The first block report can be processed a lot more efficiently than // ordinary block reports. This shortens restart times. processFirstBlockReport(node, newReport); } else { processReport(node, newReport); } node.receivedBlockReport(); } finally { endTime = Util.now(); namesystem.writeUnlock(); } // Log the block report processing stats from Namenode perspective final NameNodeMetrics metrics = NameNode.getNameNodeMetrics(); if (metrics != null) { metrics.addBlockReport((int) (endTime - startTime)); } blockLog.info("BLOCK* processReport: from " + nodeID.getName() + ", blocks: " + newReport.getNumberOfBlocks() + ", processing time: " + (endTime - startTime) + " msecs"); } private void processReport(final DatanodeDescriptor node, final BlockListAsLongs report) throws IOException { // Normal case: // Modify the (block-->datanode) map, according to the difference // between the old and new block report. // Collection toAdd = new LinkedList(); Collection toRemove = new LinkedList(); Collection toInvalidate = new LinkedList(); Collection toCorrupt = new LinkedList(); Collection toUC = new LinkedList(); reportDiff(node, report, toAdd, toRemove, toInvalidate, toCorrupt, toUC); // Process the blocks on each queue for (StatefulBlockInfo b : toUC) { addStoredBlockUnderConstruction(b.storedBlock, node, b.reportedState); } for (Block b : toRemove) { removeStoredBlock(b, node); } for (BlockInfo b : toAdd) { addStoredBlock(b, node, null, true); } for (Block b : toInvalidate) { blockLog.info("BLOCK* processReport: block " + b + " on " + node.getName() + " size " + b.getNumBytes() + " does not belong to any file."); addToInvalidates(b, node); } for (BlockToMarkCorrupt b : toCorrupt) { markBlockAsCorrupt(b.blockInfo, node, b.reason); } } /** * processFirstBlockReport is intended only for processing "initial" block * reports, the first block report received from a DN after it registers. * It just adds all the valid replicas to the datanode, without calculating * a toRemove list (since there won't be any). It also silently discards * any invalid blocks, thereby deferring their processing until * the next block report. * @param node - DatanodeDescriptor of the node that sent the report * @param report - the initial block report, to be processed * @throws IOException */ private void processFirstBlockReport(final DatanodeDescriptor node, final BlockListAsLongs report) throws IOException { if (report == null) return; assert (namesystem.hasWriteLock()); assert (node.numBlocks() == 0); BlockReportIterator itBR = report.getBlockReportIterator(); while(itBR.hasNext()) { Block iblk = itBR.next(); ReplicaState reportedState = itBR.getCurrentReplicaState(); BlockInfo storedBlock = blocksMap.getStoredBlock(iblk); // If block does not belong to any file, we are done. if (storedBlock == null) continue; // If block is corrupt, mark it and continue to next block. BlockUCState ucState = storedBlock.getBlockUCState(); BlockToMarkCorrupt c = checkReplicaCorrupt( iblk, reportedState, storedBlock, ucState, node); if (c != null) { markBlockAsCorrupt(c.blockInfo, node, c.reason); continue; } // If block is under construction, add this replica to its list if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { ((BlockInfoUnderConstruction)storedBlock).addReplicaIfNotPresent( node, iblk, reportedState); //and fall through to next clause } //add replica if appropriate if (reportedState == ReplicaState.FINALIZED) { addStoredBlockImmediate(storedBlock, node); } } } private void reportDiff(DatanodeDescriptor dn, BlockListAsLongs newReport, Collection toAdd, // add to DatanodeDescriptor Collection toRemove, // remove from DatanodeDescriptor Collection toInvalidate, // should be removed from DN Collection toCorrupt, // add to corrupt replicas list Collection toUC) { // add to under-construction list // place a delimiter in the list which separates blocks // that have been reported from those that have not BlockInfo delimiter = new BlockInfo(new Block(), 1); boolean added = dn.addBlock(delimiter); assert added : "Delimiting block cannot be present in the node"; int headIndex = 0; //currently the delimiter is in the head of the list int curIndex; if (newReport == null) newReport = new BlockListAsLongs(); // scan the report and process newly reported blocks BlockReportIterator itBR = newReport.getBlockReportIterator(); while(itBR.hasNext()) { Block iblk = itBR.next(); ReplicaState iState = itBR.getCurrentReplicaState(); BlockInfo storedBlock = processReportedBlock(dn, iblk, iState, toAdd, toInvalidate, toCorrupt, toUC); // move block to the head of the list if (storedBlock != null && (curIndex = storedBlock.findDatanode(dn)) >= 0) { headIndex = dn.moveBlockToHead(storedBlock, curIndex, headIndex); } } // collect blocks that have not been reported // all of them are next to the delimiter Iterator it = new DatanodeDescriptor.BlockIterator( delimiter.getNext(0), dn); while(it.hasNext()) toRemove.add(it.next()); dn.removeBlock(delimiter); } /** * Process a block replica reported by the data-node. * No side effects except adding to the passed-in Collections. * *

    *
  1. If the block is not known to the system (not in blocksMap) then the * data-node should be notified to invalidate this block.
  2. *
  3. If the reported replica is valid that is has the same generation stamp * and length as recorded on the name-node, then the replica location should * be added to the name-node.
  4. *
  5. If the reported replica is not valid, then it is marked as corrupt, * which triggers replication of the existing valid replicas. * Corrupt replicas are removed from the system when the block * is fully replicated.
  6. *
  7. If the reported replica is for a block currently marked "under * construction" in the NN, then it should be added to the * BlockInfoUnderConstruction's list of replicas.
  8. *
* * @param dn descriptor for the datanode that made the report * @param block reported block replica * @param reportedState reported replica state * @param toAdd add to DatanodeDescriptor * @param toInvalidate missing blocks (not in the blocks map) * should be removed from the data-node * @param toCorrupt replicas with unexpected length or generation stamp; * add to corrupt replicas * @param toUC replicas of blocks currently under construction * @return */ private BlockInfo processReportedBlock(final DatanodeDescriptor dn, final Block block, final ReplicaState reportedState, final Collection toAdd, final Collection toInvalidate, final Collection toCorrupt, final Collection toUC) { if(LOG.isDebugEnabled()) { LOG.debug("Reported block " + block + " on " + dn.getName() + " size " + block.getNumBytes() + " replicaState = " + reportedState); } // find block by blockId BlockInfo storedBlock = blocksMap.getStoredBlock(block); if(storedBlock == null) { // If blocksMap does not contain reported block id, // the replica should be removed from the data-node. toInvalidate.add(new Block(block)); return null; } BlockUCState ucState = storedBlock.getBlockUCState(); // Block is on the NN if(LOG.isDebugEnabled()) { LOG.debug("In memory blockUCState = " + ucState); } // Ignore replicas already scheduled to be removed from the DN if(invalidateBlocks.contains(dn.getStorageID(), block)) { return storedBlock; } BlockToMarkCorrupt c = checkReplicaCorrupt( block, reportedState, storedBlock, ucState, dn); if (c != null) { toCorrupt.add(c); return storedBlock; } if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { toUC.add(new StatefulBlockInfo( (BlockInfoUnderConstruction)storedBlock, reportedState)); return storedBlock; } //add replica if appropriate if (reportedState == ReplicaState.FINALIZED && storedBlock.findDatanode(dn) < 0) { toAdd.add(storedBlock); } return storedBlock; } /** * The next two methods test the various cases under which we must conclude * the replica is corrupt, or under construction. These are laid out * as switch statements, on the theory that it is easier to understand * the combinatorics of reportedState and ucState that way. It should be * at least as efficient as boolean expressions. * * @return a BlockToMarkCorrupt object, or null if the replica is not corrupt */ private BlockToMarkCorrupt checkReplicaCorrupt( Block iblk, ReplicaState reportedState, BlockInfo storedBlock, BlockUCState ucState, DatanodeDescriptor dn) { switch(reportedState) { case FINALIZED: switch(ucState) { case COMPLETE: case COMMITTED: if (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()) { return new BlockToMarkCorrupt(storedBlock, "block is " + ucState + " and reported genstamp " + iblk.getGenerationStamp() + " does not match " + "genstamp in block map " + storedBlock.getGenerationStamp()); } else if (storedBlock.getNumBytes() != iblk.getNumBytes()) { return new BlockToMarkCorrupt(storedBlock, "block is " + ucState + " and reported length " + iblk.getNumBytes() + " does not match " + "length in block map " + storedBlock.getNumBytes()); } else { return null; // not corrupt } default: return null; } case RBW: case RWR: if (!storedBlock.isComplete()) { return null; // not corrupt } else if (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()) { return new BlockToMarkCorrupt(storedBlock, "reported " + reportedState + " replica with genstamp " + iblk.getGenerationStamp() + " does not match COMPLETE block's " + "genstamp in block map " + storedBlock.getGenerationStamp()); } else { // COMPLETE block, same genstamp if (reportedState == ReplicaState.RBW) { // If it's a RBW report for a COMPLETE block, it may just be that // the block report got a little bit delayed after the pipeline // closed. So, ignore this report, assuming we will get a // FINALIZED replica later. See HDFS-2791 LOG.info("Received an RBW replica for block " + storedBlock + " on " + dn.getName() + ": ignoring it, since the block is " + "complete with the same generation stamp."); return null; } else { return new BlockToMarkCorrupt(storedBlock, "reported replica has invalid state " + reportedState); } } case RUR: // should not be reported case TEMPORARY: // should not be reported default: String msg = "Unexpected replica state " + reportedState + " for block: " + storedBlock + " on " + dn.getName() + " size " + storedBlock.getNumBytes(); // log here at WARN level since this is really a broken HDFS // invariant LOG.warn(msg); return new BlockToMarkCorrupt(storedBlock, msg); } } private boolean isBlockUnderConstruction(BlockInfo storedBlock, BlockUCState ucState, ReplicaState reportedState) { switch(reportedState) { case FINALIZED: switch(ucState) { case UNDER_CONSTRUCTION: case UNDER_RECOVERY: return true; default: return false; } case RBW: case RWR: return (!storedBlock.isComplete()); case RUR: // should not be reported case TEMPORARY: // should not be reported default: return false; } } void addStoredBlockUnderConstruction( BlockInfoUnderConstruction block, DatanodeDescriptor node, ReplicaState reportedState) throws IOException { block.addReplicaIfNotPresent(node, block, reportedState); if (reportedState == ReplicaState.FINALIZED && block.findDatanode(node) < 0) { addStoredBlock(block, node, null, true); } } /** * Faster version of * {@link #addStoredBlock(BlockInfo, DatanodeDescriptor, DatanodeDescriptor, boolean)} * , intended for use with initial block report at startup. If not in startup * safe mode, will call standard addStoredBlock(). Assumes this method is * called "immediately" so there is no need to refresh the storedBlock from * blocksMap. Doesn't handle underReplication/overReplication, or worry about * pendingReplications or corruptReplicas, because it's in startup safe mode. * Doesn't log every block, because there are typically millions of them. * * @throws IOException */ private void addStoredBlockImmediate(BlockInfo storedBlock, DatanodeDescriptor node) throws IOException { assert (storedBlock != null && namesystem.hasWriteLock()); if (!namesystem.isInStartupSafeMode() || namesystem.isPopulatingReplQueues()) { addStoredBlock(storedBlock, node, null, false); return; } // just add it node.addBlock(storedBlock); // Now check for completion of blocks and safe block count int numCurrentReplica = countLiveNodes(storedBlock); if (storedBlock.getBlockUCState() == BlockUCState.COMMITTED && numCurrentReplica >= minReplication) storedBlock = completeBlock(storedBlock.getINode(), storedBlock); // check whether safe replication is reached for the block // only complete blocks are counted towards that if(storedBlock.isComplete()) namesystem.incrementSafeBlockCount(numCurrentReplica); } /** * Modify (block-->datanode) map. Remove block from set of * needed replications if this takes care of the problem. * @return the block that is stored in blockMap. */ private Block addStoredBlock(final BlockInfo block, DatanodeDescriptor node, DatanodeDescriptor delNodeHint, boolean logEveryBlock) throws IOException { assert block != null && namesystem.hasWriteLock(); BlockInfo storedBlock; if (block instanceof BlockInfoUnderConstruction) { //refresh our copy in case the block got completed in another thread storedBlock = blocksMap.getStoredBlock(block); } else { storedBlock = block; } if (storedBlock == null || storedBlock.getINode() == null) { // If this block does not belong to anyfile, then we are done. blockLog.info("BLOCK* addStoredBlock: " + block + " on " + node.getName() + " size " + block.getNumBytes() + " but it does not belong to any file."); // we could add this block to invalidate set of this datanode. // it will happen in next block report otherwise. return block; } assert storedBlock != null : "Block must be stored by now"; INodeFile fileINode = storedBlock.getINode(); assert fileINode != null : "Block must belong to a file"; // add block to the datanode boolean added = node.addBlock(storedBlock); int curReplicaDelta; if (added) { curReplicaDelta = 1; if (logEveryBlock) { blockLog.info("BLOCK* addStoredBlock: " + "blockMap updated: " + node.getName() + " is added to " + storedBlock + " size " + storedBlock.getNumBytes()); } } else { curReplicaDelta = 0; blockLog.warn("BLOCK* addStoredBlock: " + "Redundant addStoredBlock request received for " + storedBlock + " on " + node.getName() + " size " + storedBlock.getNumBytes()); } // Now check for completion of blocks and safe block count NumberReplicas num = countNodes(storedBlock); int numLiveReplicas = num.liveReplicas(); int numCurrentReplica = numLiveReplicas + pendingReplications.getNumReplicas(storedBlock); if(storedBlock.getBlockUCState() == BlockUCState.COMMITTED && numLiveReplicas >= minReplication) storedBlock = completeBlock(fileINode, storedBlock); // check whether safe replication is reached for the block // only complete blocks are counted towards that // Is no-op if not in safe mode. if(storedBlock.isComplete()) namesystem.incrementSafeBlockCount(numCurrentReplica); // if file is under construction, then done for now if (fileINode.isUnderConstruction()) { return storedBlock; } // do not try to handle over/under-replicated blocks during safe mode if (!namesystem.isPopulatingReplQueues()) { return storedBlock; } // handle underReplication/overReplication short fileReplication = fileINode.getReplication(); if (!isNeededReplication(storedBlock, fileReplication, numCurrentReplica)) { neededReplications.remove(storedBlock, numCurrentReplica, num.decommissionedReplicas(), fileReplication); } else { updateNeededReplications(storedBlock, curReplicaDelta, 0); } if (numCurrentReplica > fileReplication) { processOverReplicatedBlock(storedBlock, fileReplication, node, delNodeHint); } // If the file replication has reached desired value // we can remove any corrupt replicas the block may have int corruptReplicasCount = corruptReplicas.numCorruptReplicas(storedBlock); int numCorruptNodes = num.corruptReplicas(); if (numCorruptNodes != corruptReplicasCount) { LOG.warn("Inconsistent number of corrupt replicas for " + storedBlock + "blockMap has " + numCorruptNodes + " but corrupt replicas map has " + corruptReplicasCount); } if ((corruptReplicasCount > 0) && (numLiveReplicas >= fileReplication)) invalidateCorruptReplicas(storedBlock); return storedBlock; } /** * Invalidate corrupt replicas. *

* This will remove the replicas from the block's location list, * add them to {@link #invalidateBlocks} so that they could be further * deleted from the respective data-nodes, * and remove the block from corruptReplicasMap. *

* This method should be called when the block has sufficient * number of live replicas. * * @param blk Block whose corrupt replicas need to be invalidated */ private void invalidateCorruptReplicas(Block blk) { Collection nodes = corruptReplicas.getNodes(blk); boolean gotException = false; if (nodes == null) return; // make a copy of the array of nodes in order to avoid // ConcurrentModificationException, when the block is removed from the node DatanodeDescriptor[] nodesCopy = nodes.toArray(new DatanodeDescriptor[0]); for (DatanodeDescriptor node : nodesCopy) { try { invalidateBlock(blk, node); } catch (IOException e) { blockLog.info("NameNode.invalidateCorruptReplicas " + "error in deleting bad block " + blk + " on " + node + e); gotException = true; } } // Remove the block from corruptReplicasMap if (!gotException) corruptReplicas.removeFromCorruptReplicasMap(blk); } /** * For each block in the name-node verify whether it belongs to any file, * over or under replicated. Place it into the respective queue. */ public void processMisReplicatedBlocks() { assert namesystem.hasWriteLock(); long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0, nrUnderConstruction = 0; neededReplications.clear(); for (BlockInfo block : blocksMap.getBlocks()) { INodeFile fileINode = block.getINode(); if (fileINode == null) { // block does not belong to any file nrInvalid++; addToInvalidates(block); continue; } if (!block.isComplete()) { // Incomplete blocks are never considered mis-replicated -- // they'll be reached when they are completed or recovered. nrUnderConstruction++; continue; } // calculate current replication short expectedReplication = fileINode.getReplication(); NumberReplicas num = countNodes(block); int numCurrentReplica = num.liveReplicas(); // add to under-replicated queue if need to be if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { if (neededReplications.add(block, numCurrentReplica, num .decommissionedReplicas(), expectedReplication)) { nrUnderReplicated++; } } if (numCurrentReplica > expectedReplication) { // over-replicated block nrOverReplicated++; processOverReplicatedBlock(block, expectedReplication, null, null); } } LOG.info("Total number of blocks = " + blocksMap.size()); LOG.info("Number of invalid blocks = " + nrInvalid); LOG.info("Number of under-replicated blocks = " + nrUnderReplicated); LOG.info("Number of over-replicated blocks = " + nrOverReplicated); LOG.info("Number of blocks being written = " + nrUnderConstruction); } /** Set replication for the blocks. */ public void setReplication(final short oldRepl, final short newRepl, final String src, final Block... blocks) throws IOException { if (newRepl == oldRepl) { return; } // update needReplication priority queues for(Block b : blocks) { updateNeededReplications(b, 0, newRepl-oldRepl); } if (oldRepl > newRepl) { // old replication > the new one; need to remove copies LOG.info("Decreasing replication from " + oldRepl + " to " + newRepl + " for " + src); for(Block b : blocks) { processOverReplicatedBlock(b, newRepl, null, null); } } else { // replication factor is increased LOG.info("Increasing replication from " + oldRepl + " to " + newRepl + " for " + src); } } /** * Find how many of the containing nodes are "extra", if any. * If there are any extras, call chooseExcessReplicates() to * mark them in the excessReplicateMap. */ private void processOverReplicatedBlock(final Block block, final short replication, final DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint) { assert namesystem.hasWriteLock(); if (addedNode == delNodeHint) { delNodeHint = null; } Collection nonExcess = new ArrayList(); Collection corruptNodes = corruptReplicas .getNodes(block); for (Iterator it = blocksMap.nodeIterator(block); it.hasNext();) { DatanodeDescriptor cur = it.next(); LightWeightLinkedSet excessBlocks = excessReplicateMap.get(cur .getStorageID()); if (excessBlocks == null || !excessBlocks.contains(block)) { if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { // exclude corrupt replicas if (corruptNodes == null || !corruptNodes.contains(cur)) { nonExcess.add(cur); } } } } chooseExcessReplicates(nonExcess, block, replication, addedNode, delNodeHint, blockplacement); } /** * We want "replication" replicates for the block, but we now have too many. * In this method, copy enough nodes from 'srcNodes' into 'dstNodes' such that: * * srcNodes.size() - dstNodes.size() == replication * * We pick node that make sure that replicas are spread across racks and * also try hard to pick one with least free space. * The algorithm is first to pick a node with least free space from nodes * that are on a rack holding more than one replicas of the block. * So removing such a replica won't remove a rack. * If no such a node is available, * then pick a node with least free space */ private void chooseExcessReplicates(Collection nonExcess, Block b, short replication, DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint, BlockPlacementPolicy replicator) { assert namesystem.hasWriteLock(); // first form a rack to datanodes map and INodeFile inode = getINode(b); final Map> rackMap = new HashMap>(); for(final Iterator iter = nonExcess.iterator(); iter.hasNext(); ) { final DatanodeDescriptor node = iter.next(); final String rackName = node.getNetworkLocation(); List datanodeList = rackMap.get(rackName); if (datanodeList == null) { datanodeList = new ArrayList(); rackMap.put(rackName, datanodeList); } datanodeList.add(node); } // split nodes into two sets // priSet contains nodes on rack with more than one replica // remains contains the remaining nodes final List priSet = new ArrayList(); final List remains = new ArrayList(); for(List datanodeList : rackMap.values()) { if (datanodeList.size() == 1 ) { remains.add(datanodeList.get(0)); } else { priSet.addAll(datanodeList); } } // pick one node to delete that favors the delete hint // otherwise pick one with least space from priSet if it is not empty // otherwise one node with least space from remains boolean firstOne = true; while (nonExcess.size() - replication > 0) { // check if we can delete delNodeHint final DatanodeInfo cur; if (firstOne && delNodeHint !=null && nonExcess.contains(delNodeHint) && (priSet.contains(delNodeHint) || (addedNode != null && !priSet.contains(addedNode))) ) { cur = delNodeHint; } else { // regular excessive replica removal cur = replicator.chooseReplicaToDelete(inode, b, replication, priSet, remains); } firstOne = false; // adjust rackmap, priSet, and remains String rack = cur.getNetworkLocation(); final List datanodes = rackMap.get(rack); datanodes.remove(cur); if (datanodes.isEmpty()) { rackMap.remove(rack); } if (priSet.remove(cur)) { if (datanodes.size() == 1) { priSet.remove(datanodes.get(0)); remains.add(datanodes.get(0)); } } else { remains.remove(cur); } nonExcess.remove(cur); addToExcessReplicate(cur, b); // // The 'excessblocks' tracks blocks until we get confirmation // that the datanode has deleted them; the only way we remove them // is when we get a "removeBlock" message. // // The 'invalidate' list is used to inform the datanode the block // should be deleted. Items are removed from the invalidate list // upon giving instructions to the namenode. // addToInvalidates(b, cur); blockLog.info("BLOCK* chooseExcessReplicates: " +"("+cur.getName()+", "+b+") is added to invalidated blocks set."); } } private void addToExcessReplicate(DatanodeInfo dn, Block block) { assert namesystem.hasWriteLock(); LightWeightLinkedSet excessBlocks = excessReplicateMap.get(dn.getStorageID()); if (excessBlocks == null) { excessBlocks = new LightWeightLinkedSet(); excessReplicateMap.put(dn.getStorageID(), excessBlocks); } if (excessBlocks.add(block)) { excessBlocksCount++; if(blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* addToExcessReplicate:" + " (" + dn.getName() + ", " + block + ") is added to excessReplicateMap"); } } } /** * Modify (block-->datanode) map. Possibly generate replication tasks, if the * removed block is still valid. */ public void removeStoredBlock(Block block, DatanodeDescriptor node) { if(blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: " + block + " from " + node.getName()); } assert (namesystem.hasWriteLock()); { if (!blocksMap.removeNode(block, node)) { if(blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: " + block + " has already been removed from node " + node); } return; } // // It's possible that the block was removed because of a datanode // failure. If the block is still valid, check if replication is // necessary. In that case, put block on a possibly-will- // be-replicated list. // INode fileINode = blocksMap.getINode(block); if (fileINode != null) { namesystem.decrementSafeBlockCount(block); updateNeededReplications(block, -1, 0); } // // We've removed a block from a node, so it's definitely no longer // in "excess" there. // LightWeightLinkedSet excessBlocks = excessReplicateMap.get(node .getStorageID()); if (excessBlocks != null) { if (excessBlocks.remove(block)) { excessBlocksCount--; if(blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* removeStoredBlock: " + block + " is removed from excessBlocks"); } if (excessBlocks.size() == 0) { excessReplicateMap.remove(node.getStorageID()); } } } // Remove the replica from corruptReplicas corruptReplicas.removeFromCorruptReplicasMap(block, node); } } /** * Get all valid locations of the block & add the block to results * return the length of the added block; 0 if the block is not added */ private long addBlock(Block block, List results) { final List machineSet = getValidLocations(block); if(machineSet.size() == 0) { return 0; } else { results.add(new BlockWithLocations(block, machineSet.toArray(new String[machineSet.size()]))); return block.getNumBytes(); } } /** * The given node is reporting that it received a certain block. */ @VisibleForTesting void addBlock(DatanodeDescriptor node, Block block, String delHint) throws IOException { // decrement number of blocks scheduled to this datanode. node.decBlocksScheduled(); // get the deletion hint node DatanodeDescriptor delHintNode = null; if (delHint != null && delHint.length() != 0) { delHintNode = datanodeManager.getDatanode(delHint); if (delHintNode == null) { blockLog.warn("BLOCK* blockReceived: " + block + " is expected to be removed from an unrecorded node " + delHint); } } // // Modify the blocks->datanode map and node's map. // pendingReplications.decrement(block); // blockReceived reports a finalized block Collection toAdd = new LinkedList(); Collection toInvalidate = new LinkedList(); Collection toCorrupt = new LinkedList(); Collection toUC = new LinkedList(); processReportedBlock(node, block, ReplicaState.FINALIZED, toAdd, toInvalidate, toCorrupt, toUC); // the block is only in one of the to-do lists // if it is in none then data-node already has it assert toUC.size() + toAdd.size() + toInvalidate.size() + toCorrupt.size() <= 1 : "The block should be only in one of the lists."; for (StatefulBlockInfo b : toUC) { addStoredBlockUnderConstruction(b.storedBlock, node, b.reportedState); } for (BlockInfo b : toAdd) { addStoredBlock(b, node, delHintNode, true); } for (Block b : toInvalidate) { blockLog.info("BLOCK* addBlock: block " + b + " on " + node.getName() + " size " + b.getNumBytes() + " does not belong to any file."); addToInvalidates(b, node); } for (BlockToMarkCorrupt b : toCorrupt) { markBlockAsCorrupt(b.blockInfo, node, b.reason); } } /** The given node is reporting that it received/deleted certain blocks. */ public void blockReceivedAndDeleted(final DatanodeID nodeID, final String poolId, final ReceivedDeletedBlockInfo receivedAndDeletedBlocks[] ) throws IOException { namesystem.writeLock(); int received = 0; int deleted = 0; try { final DatanodeDescriptor node = datanodeManager.getDatanode(nodeID); if (node == null || !node.isAlive) { blockLog.warn("BLOCK* blockReceivedDeleted" + " is received from dead or unregistered node " + nodeID.getName()); throw new IOException( "Got blockReceivedDeleted message from unregistered or dead node"); } for (int i = 0; i < receivedAndDeletedBlocks.length; i++) { if (receivedAndDeletedBlocks[i].isDeletedBlock()) { removeStoredBlock( receivedAndDeletedBlocks[i].getBlock(), node); deleted++; } else { addBlock(node, receivedAndDeletedBlocks[i].getBlock(), receivedAndDeletedBlocks[i].getDelHints()); received++; } if (blockLog.isDebugEnabled()) { blockLog.debug("BLOCK* block" + (receivedAndDeletedBlocks[i].isDeletedBlock() ? "Deleted" : "Received") + ": " + receivedAndDeletedBlocks[i].getBlock() + " is received from " + nodeID.getName()); } } } finally { namesystem.writeUnlock(); if (blockLog.isDebugEnabled()) { blockLog.debug("*BLOCK* NameNode.blockReceivedAndDeleted: " + "from " + nodeID.getName() + " received: " + received + ", " + " deleted: " + deleted); } } } /** * Return the number of nodes that are live and decommissioned. */ public NumberReplicas countNodes(Block b) { int count = 0; int live = 0; int corrupt = 0; int excess = 0; Iterator nodeIter = blocksMap.nodeIterator(b); Collection nodesCorrupt = corruptReplicas.getNodes(b); while (nodeIter.hasNext()) { DatanodeDescriptor node = nodeIter.next(); if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) { corrupt++; } else if (node.isDecommissionInProgress() || node.isDecommissioned()) { count++; } else { LightWeightLinkedSet blocksExcess = excessReplicateMap.get(node .getStorageID()); if (blocksExcess != null && blocksExcess.contains(b)) { excess++; } else { live++; } } } return new NumberReplicas(live, count, corrupt, excess); } /** * Simpler, faster form of {@link #countNodes(Block)} that only returns the number * of live nodes. If in startup safemode (or its 30-sec extension period), * then it gains speed by ignoring issues of excess replicas or nodes * that are decommissioned or in process of becoming decommissioned. * If not in startup, then it calls {@link countNodes()} instead. * * @param b - the block being tested * @return count of live nodes for this block */ int countLiveNodes(BlockInfo b) { if (!namesystem.isInStartupSafeMode()) { return countNodes(b).liveReplicas(); } // else proceed with fast case int live = 0; Iterator nodeIter = blocksMap.nodeIterator(b); Collection nodesCorrupt = corruptReplicas.getNodes(b); while (nodeIter.hasNext()) { DatanodeDescriptor node = nodeIter.next(); if ((nodesCorrupt == null) || (!nodesCorrupt.contains(node))) live++; } return live; } private void logBlockReplicationInfo(Block block, DatanodeDescriptor srcNode, NumberReplicas num) { int curReplicas = num.liveReplicas(); int curExpectedReplicas = getReplication(block); INode fileINode = blocksMap.getINode(block); Iterator nodeIter = blocksMap.nodeIterator(block); StringBuilder nodeList = new StringBuilder(); while (nodeIter.hasNext()) { DatanodeDescriptor node = nodeIter.next(); nodeList.append(node.name); nodeList.append(" "); } LOG.info("Block: " + block + ", Expected Replicas: " + curExpectedReplicas + ", live replicas: " + curReplicas + ", corrupt replicas: " + num.corruptReplicas() + ", decommissioned replicas: " + num.decommissionedReplicas() + ", excess replicas: " + num.excessReplicas() + ", Is Open File: " + fileINode.isUnderConstruction() + ", Datanodes having this block: " + nodeList + ", Current Datanode: " + srcNode.name + ", Is current datanode decommissioning: " + srcNode.isDecommissionInProgress()); } /** * On stopping decommission, check if the node has excess replicas. * If there are any excess replicas, call processOverReplicatedBlock() */ void processOverReplicatedBlocksOnReCommission( final DatanodeDescriptor srcNode) { final Iterator it = srcNode.getBlockIterator(); int numOverReplicated = 0; while(it.hasNext()) { final Block block = it.next(); INodeFile fileINode = blocksMap.getINode(block); short expectedReplication = fileINode.getReplication(); NumberReplicas num = countNodes(block); int numCurrentReplica = num.liveReplicas(); if (numCurrentReplica > expectedReplication) { // over-replicated block processOverReplicatedBlock(block, expectedReplication, null, null); numOverReplicated++; } } LOG.info("Invalidated " + numOverReplicated + " over-replicated blocks on " + srcNode + " during recommissioning"); } /** * Return true if there are any blocks on this node that have not * yet reached their replication factor. Otherwise returns false. */ boolean isReplicationInProgress(DatanodeDescriptor srcNode) { boolean status = false; int underReplicatedBlocks = 0; int decommissionOnlyReplicas = 0; int underReplicatedInOpenFiles = 0; final Iterator it = srcNode.getBlockIterator(); while(it.hasNext()) { final Block block = it.next(); INode fileINode = blocksMap.getINode(block); if (fileINode != null) { NumberReplicas num = countNodes(block); int curReplicas = num.liveReplicas(); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, curReplicas)) { if (curExpectedReplicas > curReplicas) { //Log info about one block for this node which needs replication if (!status) { status = true; logBlockReplicationInfo(block, srcNode, num); } underReplicatedBlocks++; if ((curReplicas == 0) && (num.decommissionedReplicas() > 0)) { decommissionOnlyReplicas++; } if (fileINode.isUnderConstruction()) { underReplicatedInOpenFiles++; } } if (!neededReplications.contains(block) && pendingReplications.getNumReplicas(block) == 0) { // // These blocks have been reported from the datanode // after the startDecommission method has been executed. These // blocks were in flight when the decommissioning was started. // neededReplications.add(block, curReplicas, num.decommissionedReplicas(), curExpectedReplicas); } } } } srcNode.decommissioningStatus.set(underReplicatedBlocks, decommissionOnlyReplicas, underReplicatedInOpenFiles); return status; } public int getActiveBlockCount() { return blocksMap.size() - (int)invalidateBlocks.numBlocks(); } public DatanodeDescriptor[] getNodes(BlockInfo block) { DatanodeDescriptor[] nodes = new DatanodeDescriptor[block.numNodes()]; Iterator it = blocksMap.nodeIterator(block); for (int i = 0; it != null && it.hasNext(); i++) { nodes[i] = it.next(); } return nodes; } public int getTotalBlocks() { return blocksMap.size(); } public void removeBlock(Block block) { block.setNumBytes(BlockCommand.NO_ACK); addToInvalidates(block); corruptReplicas.removeFromCorruptReplicasMap(block); blocksMap.removeBlock(block); // Remove the block from pendingReplications pendingReplications.remove(block); } public BlockInfo getStoredBlock(Block block) { return blocksMap.getStoredBlock(block); } /** updates a block in under replication queue */ private void updateNeededReplications(final Block block, final int curReplicasDelta, int expectedReplicasDelta) { namesystem.writeLock(); try { NumberReplicas repl = countNodes(block); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, repl.liveReplicas())) { neededReplications.update(block, repl.liveReplicas(), repl .decommissionedReplicas(), curExpectedReplicas, curReplicasDelta, expectedReplicasDelta); } else { int oldReplicas = repl.liveReplicas()-curReplicasDelta; int oldExpectedReplicas = curExpectedReplicas-expectedReplicasDelta; neededReplications.remove(block, oldReplicas, repl.decommissionedReplicas(), oldExpectedReplicas); } } finally { namesystem.writeUnlock(); } } public void checkReplication(Block block, short numExpectedReplicas) { // filter out containingNodes that are marked for decommission. NumberReplicas number = countNodes(block); if (isNeededReplication(block, numExpectedReplicas, number.liveReplicas())) { neededReplications.add(block, number.liveReplicas(), number.decommissionedReplicas(), numExpectedReplicas); return; } if (number.liveReplicas() > numExpectedReplicas) { processOverReplicatedBlock(block, numExpectedReplicas, null, null); } } /* get replication factor of a block */ private int getReplication(Block block) { INodeFile fileINode = blocksMap.getINode(block); if (fileINode == null) { // block does not belong to any file return 0; } assert !fileINode.isDirectory() : "Block cannot belong to a directory."; return fileINode.getReplication(); } /** * Get blocks to invalidate for nodeId * in {@link #invalidateBlocks}. * * @return number of blocks scheduled for removal during this iteration. */ private int invalidateWorkForOneNode(String nodeId) { namesystem.writeLock(); try { // blocks should not be replicated or removed if safe mode is on if (namesystem.isInSafeMode()) return 0; // get blocks to invalidate for the nodeId assert nodeId != null; return invalidateBlocks.invalidateWork(nodeId); } finally { namesystem.writeUnlock(); } } boolean blockHasEnoughRacks(Block b) { if (!this.shouldCheckForEnoughRacks) { return true; } boolean enoughRacks = false;; Collection corruptNodes = corruptReplicas.getNodes(b); int numExpectedReplicas = getReplication(b); String rackName = null; for (Iterator it = blocksMap.nodeIterator(b); it.hasNext();) { DatanodeDescriptor cur = it.next(); if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if ((corruptNodes == null ) || !corruptNodes.contains(cur)) { if (numExpectedReplicas == 1 || (numExpectedReplicas > 1 && !datanodeManager.hasClusterEverBeenMultiRack())) { enoughRacks = true; break; } String rackNameNew = cur.getNetworkLocation(); if (rackName == null) { rackName = rackNameNew; } else if (!rackName.equals(rackNameNew)) { enoughRacks = true; break; } } } } return enoughRacks; } boolean isNeededReplication(Block b, int expectedReplication, int curReplicas) { if ((curReplicas >= expectedReplication) && (blockHasEnoughRacks(b))) { return false; } else { return true; } } public long getMissingBlocksCount() { // not locking return this.neededReplications.getCorruptBlockSize(); } public BlockInfo addINode(BlockInfo block, INodeFile iNode) { return blocksMap.addINode(block, iNode); } public INodeFile getINode(Block b) { return blocksMap.getINode(b); } /** @return an iterator of the datanodes. */ public Iterator datanodeIterator(final Block block) { return blocksMap.nodeIterator(block); } public int numCorruptReplicas(Block block) { return corruptReplicas.numCorruptReplicas(block); } public void removeBlockFromMap(Block block) { blocksMap.removeBlock(block); // If block is removed from blocksMap remove it from corruptReplicasMap corruptReplicas.removeFromCorruptReplicasMap(block); } public int getCapacity() { namesystem.readLock(); try { return blocksMap.getCapacity(); } finally { namesystem.readUnlock(); } } /** * Return a range of corrupt replica block ids. Up to numExpectedBlocks * blocks starting at the next block after startingBlockId are returned * (fewer if numExpectedBlocks blocks are unavailable). If startingBlockId * is null, up to numExpectedBlocks blocks are returned from the beginning. * If startingBlockId cannot be found, null is returned. * * @param numExpectedBlocks Number of block ids to return. * 0 <= numExpectedBlocks <= 100 * @param startingBlockId Block id from which to start. If null, start at * beginning. * @return Up to numExpectedBlocks blocks from startingBlockId if it exists * */ public long[] getCorruptReplicaBlockIds(int numExpectedBlocks, Long startingBlockId) { return corruptReplicas.getCorruptReplicaBlockIds(numExpectedBlocks, startingBlockId); } /** * Return an iterator over the set of blocks for which there are no replicas. */ public Iterator getCorruptReplicaBlockIterator() { return neededReplications.iterator( UnderReplicatedBlocks.QUEUE_WITH_CORRUPT_BLOCKS); } /** @return the size of UnderReplicatedBlocks */ public int numOfUnderReplicatedBlocks() { return neededReplications.size(); } /** * Periodically calls computeReplicationWork(). */ private class ReplicationMonitor implements Runnable { private static final int INVALIDATE_WORK_PCT_PER_ITERATION = 32; private static final int REPLICATION_WORK_MULTIPLIER_PER_ITERATION = 2; @Override public void run() { while (namesystem.isRunning()) { try { computeDatanodeWork(); processPendingReplications(); Thread.sleep(replicationRecheckInterval); } catch (InterruptedException ie) { LOG.warn("ReplicationMonitor thread received InterruptedException.", ie); break; } catch (IOException ie) { LOG.warn("ReplicationMonitor thread received exception. " , ie); } catch (Throwable t) { LOG.fatal("ReplicationMonitor thread received Runtime exception. ", t); terminate(1); } } } } /** * Compute block replication and block invalidation work that can be scheduled * on data-nodes. The datanode will be informed of this work at the next * heartbeat. * * @return number of blocks scheduled for replication or removal. * @throws IOException */ int computeDatanodeWork() throws IOException { int workFound = 0; // Blocks should not be replicated or removed if in safe mode. // It's OK to check safe mode here w/o holding lock, in the worst // case extra replications will be scheduled, and these will get // fixed up later. if (namesystem.isInSafeMode()) return workFound; final int numlive = heartbeatManager.getLiveDatanodeCount(); final int blocksToProcess = numlive * ReplicationMonitor.REPLICATION_WORK_MULTIPLIER_PER_ITERATION; final int nodesToProcess = (int) Math.ceil(numlive * ReplicationMonitor.INVALIDATE_WORK_PCT_PER_ITERATION / 100.0); workFound = this.computeReplicationWork(blocksToProcess); // Update counters namesystem.writeLock(); try { this.updateState(); this.scheduledReplicationBlocksCount = workFound; } finally { namesystem.writeUnlock(); } workFound += this.computeInvalidateWork(nodesToProcess); return workFound; } private static class ReplicationWork { private Block block; private INodeFile fileINode; private DatanodeDescriptor srcNode; private List containingNodes; private List liveReplicaNodes; private int additionalReplRequired; private DatanodeDescriptor targets[]; private int priority; public ReplicationWork(Block block, INodeFile fileINode, DatanodeDescriptor srcNode, List containingNodes, List liveReplicaNodes, int additionalReplRequired, int priority) { this.block = block; this.fileINode = fileINode; this.srcNode = srcNode; this.containingNodes = containingNodes; this.liveReplicaNodes = liveReplicaNodes; this.additionalReplRequired = additionalReplRequired; this.priority = priority; this.targets = null; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy