/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.StopWatch;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
/**
* Manage the heartbeats received from datanodes.
* The datanode list and statistics are synchronized
* by the heartbeat manager lock.
*/
class HeartbeatManager implements DatanodeStatistics {
static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
private static final String REPORT_DELTA_STALE_DN_HEADER =
"StaleNodes Report: [New Stale Nodes]: %d";
private static final String REPORT_STALE_DN_LINE_ENTRY = "%n\t %s";
private static final String REPORT_STALE_DN_LINE_TAIL = ", %s";
private static final String REPORT_REMOVE_DEAD_NODE_ENTRY =
"StaleNodes Report: [Remove DeadNode]: %s";
private static final String REPORT_REMOVE_STALE_NODE_ENTRY =
"StaleNodes Report: [Remove StaleNode]: %s";
private static final int REPORT_STALE_NODE_NODES_PER_LINE = 10;
/**
* Stores a subset of the datanodeMap in DatanodeManager,
* containing nodes that are considered alive.
* The HeartbeatMonitor periodically checks for outdated entries,
* and removes them from the list.
* It is synchronized by the heartbeat manager lock.
*/
private final List<DatanodeDescriptor> datanodes = new ArrayList<>();
/** Statistics, which are synchronized by the heartbeat manager lock. */
private final DatanodeStats stats = new DatanodeStats();
/** The time period to check for expired datanodes. */
private final long heartbeatRecheckInterval;
/** Heartbeat monitor thread. */
private final Daemon heartbeatThread = new Daemon(new Monitor());
private final StopWatch heartbeatStopWatch = new StopWatch();
final Namesystem namesystem;
final BlockManager blockManager;
/** Enable log for datanode staleness. */
private final boolean enableLogStaleNodes;
/** The set of datanodes currently detected as stale. */
private final Set<DatanodeDescriptor> staleDataNodes = new HashSet<>();
HeartbeatManager(final Namesystem namesystem,
final BlockManager blockManager, final Configuration conf) {
this.namesystem = namesystem;
this.blockManager = blockManager;
boolean avoidStaleDataNodesForWrite = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY,
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT);
long recheckInterval = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT); // 5 min
long staleInterval = conf.getLong(
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);// 30s
enableLogStaleNodes = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_ENABLE_LOG_STALE_DATANODE_KEY,
DFSConfigKeys.DFS_NAMENODE_ENABLE_LOG_STALE_DATANODE_DEFAULT);
if (avoidStaleDataNodesForWrite && staleInterval < recheckInterval) {
this.heartbeatRecheckInterval = staleInterval;
LOG.info("Setting heartbeat recheck interval to " + staleInterval
+ " since " + DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY
+ " is less than "
+ DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY);
} else {
this.heartbeatRecheckInterval = recheckInterval;
}
}
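/** Start the heartbeat monitor daemon thread. */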
void activate() {
heartbeatThread.start();
}
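/**
* Interrupt the heartbeat monitor thread and wait up to three seconds for
* it to terminate.
*/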
void close() {
heartbeatThread.interrupt();
try {
// This will have no effect if the thread hasn't yet been started.
heartbeatThread.join(3000);
} catch (InterruptedException ignored) {
}
}
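/** @return the number of datanodes currently considered alive. */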
synchronized int getLiveDatanodeCount() {
return datanodes.size();
}
@Override
public long getCapacityTotal() {
return stats.getCapacityTotal();
}
@Override
public long getCapacityUsed() {
return stats.getCapacityUsed();
}
@Override
public float getCapacityUsedPercent() {
return stats.getCapacityUsedPercent();
}
@Override
public long getCapacityRemaining() {
return stats.getCapacityRemaining();
}
@Override
public float getCapacityRemainingPercent() {
return stats.getCapacityRemainingPercent();
}
@Override
public long getBlockPoolUsed() {
return stats.getBlockPoolUsed();
}
@Override
public float getPercentBlockPoolUsed() {
return stats.getPercentBlockPoolUsed();
}
@Override
public long getCapacityUsedNonDFS() {
return stats.getCapacityUsedNonDFS();
}
@Override
public int getXceiverCount() {
return stats.getXceiverCount();
}
@Override
public int getInServiceXceiverCount() {
return stats.getNodesInServiceXceiverCount();
}
@Override
public int getNumDatanodesInService() {
return stats.getNodesInService();
}
@Override
public long getCacheCapacity() {
return stats.getCacheCapacity();
}
@Override
public long getCacheUsed() {
return stats.getCacheUsed();
}
@Override
public synchronized long[] getStats() {
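// Only capacity-related statistics are tracked here; the block-level slots
// (e.g. under-replicated, corrupt, and missing block counts) are left as -1
// and are expected to be filled in by the caller.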
return new long[] {getCapacityTotal(),
getCapacityUsed(),
getCapacityRemaining(),
-1L,
-1L,
-1L,
-1L,
-1L,
-1L};
}
@Override
public int getExpiredHeartbeats() {
return stats.getExpiredHeartbeats();
}
@Override
public Map<StorageType, StorageTypeStats> getStorageTypeStats() {
return stats.getStatsMap();
}
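/**
* Register a datanode that is not currently alive: add it to the live list,
* reset its heartbeat state with an empty storage report, and add it to the
* aggregate statistics.
*/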
synchronized void register(final DatanodeDescriptor d) {
if (!d.isAlive()) {
addDatanode(d);
//update its timestamp
d.updateHeartbeatState(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0, null);
stats.add(d);
}
}
synchronized DatanodeDescriptor[] getDatanodes() {
return datanodes.toArray(new DatanodeDescriptor[datanodes.size()]);
}
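/** Add the node to the live list and mark it alive. */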
synchronized void addDatanode(final DatanodeDescriptor d) {
// add to the live list; stats are updated separately via updateDnStat()
datanodes.add(d);
d.setAlive(true);
}
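/**
* Add the node's usage to the aggregate statistics. Unlike most mutators in
* this class, this method is not synchronized; callers are expected to hold
* the necessary lock.
*/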
void updateDnStat(final DatanodeDescriptor d){
stats.add(d);
}
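/**
* Remove a live datanode: subtract its usage from the aggregate statistics,
* drop it from the live and stale lists, and mark it dead.
*/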
synchronized void removeDatanode(DatanodeDescriptor node) {
if (node.isAlive()) {
stats.subtract(node);
datanodes.remove(node);
removeNodeFromStaleList(node);
node.setAlive(false);
}
}
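/**
* Apply a heartbeat from a datanode. The node's old usage is subtracted
* from the aggregate statistics before the update and its new usage is
* added back afterwards, keeping the totals consistent.
*/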
synchronized void updateHeartbeat(final DatanodeDescriptor node,
StorageReport[] reports, long cacheCapacity, long cacheUsed,
int xceiverCount, int failedVolumes,
VolumeFailureSummary volumeFailureSummary) {
stats.subtract(node);
node.updateHeartbeat(reports, cacheCapacity, cacheUsed,
xceiverCount, failedVolumes, volumeFailureSummary);
stats.add(node);
}
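/**
* Apply a lifeline message. Statistics are updated as for a heartbeat, but
* the node's registration state is deliberately left untouched.
*/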
synchronized void updateLifeline(final DatanodeDescriptor node,
StorageReport[] reports, long cacheCapacity, long cacheUsed,
int xceiverCount, int failedVolumes,
VolumeFailureSummary volumeFailureSummary) {
stats.subtract(node);
// This intentionally calls updateHeartbeatState instead of
// updateHeartbeat, because we don't want to modify the
// heartbeatedSinceRegistration flag. Arrival of a lifeline message does
// not count as arrival of the first heartbeat.
node.updateHeartbeatState(reports, cacheCapacity, cacheUsed,
xceiverCount, failedVolumes, volumeFailureSummary);
stats.add(node);
}
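/**
* Begin decommissioning the given node. A dead node transitions directly to
* the decommissioned state.
*/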
synchronized void startDecommission(final DatanodeDescriptor node) {
if (!node.isAlive()) {
LOG.info("Dead node {} is decommissioned immediately.", node);
node.setDecommissioned();
} else {
stats.subtract(node);
node.startDecommission();
stats.add(node);
}
}
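/**
* Begin the maintenance process for the given node. Dead nodes,
* decommissioned nodes, and nodes with no minimum maintenance replication
* requirement are put into the maintenance state immediately.
*/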
synchronized void startMaintenance(final DatanodeDescriptor node) {
if (!node.isAlive()) {
LOG.info("Dead node {} is put in maintenance state immediately.", node);
node.setInMaintenance();
} else {
stats.subtract(node);
if (node.isDecommissioned()) {
LOG.info("Decommissioned node " + node + " is put in maintenance state"
+ " immediately.");
node.setInMaintenance();
} else if (blockManager.getMinReplicationToBeInMaintenance() == 0) {
LOG.info("MinReplicationToBeInMaintenance is set to zero. " + node +
" is put in maintenance state" + " immediately.");
node.setInMaintenance();
} else {
node.startMaintenance();
}
stats.add(node);
}
}
synchronized void stopMaintenance(final DatanodeDescriptor node) {
LOG.info("Stopping maintenance of {} node {}",
node.isAlive() ? "live" : "dead", node);
if (!node.isAlive()) {
node.stopMaintenance();
} else {
stats.subtract(node);
node.stopMaintenance();
stats.add(node);
}
}
synchronized void stopDecommission(final DatanodeDescriptor node) {
LOG.info("Stopping decommissioning of {} node {}",
node.isAlive() ? "live" : "dead", node);
if (!node.isAlive()) {
node.stopDecommission();
} else {
stats.subtract(node);
node.stopDecommission();
stats.add(node);
}
}
@VisibleForTesting
void restartHeartbeatStopWatch() {
heartbeatStopWatch.reset().start();
}
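/**
* Check whether the monitor thread has been paused, for example by a long
* GC, for longer than the heartbeat recheck interval.
*
* @param offset milliseconds added to the elapsed time; may be negative to
* discount an expected sleep.
* @return true if the elapsed time plus the offset exceeds the heartbeat
* recheck interval.
*/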
@VisibleForTesting
boolean shouldAbortHeartbeatCheck(long offset) {
long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS);
return elapsed + offset > heartbeatRecheckInterval;
}
/**
* Remove deadNode from StaleNodeList if it exists.
* This method assumes that it is called inside a synchronized block.
*
* @param d node descriptor to be marked as dead.
* @return true if the node was already on the stale list.
*/
private boolean removeNodeFromStaleList(DatanodeDescriptor d) {
return removeNodeFromStaleList(d, true);
}
/**
* Remove node from StaleNodeList if it exists.
* If stale-node logging is enabled, the log indicates whether the node was
* removed because it is dead or because it is no longer stale.
* This method assumes that it is called inside a synchronized block.
*
* @param d the node descriptor to remove from the stale list.
* @param isDead true if the node is being removed because it is dead.
* @return true if the node was already in the stale list.
*/
private boolean removeNodeFromStaleList(DatanodeDescriptor d,
boolean isDead) {
boolean result = staleDataNodes.remove(d);
if (enableLogStaleNodes && result) {
LOG.info(String.format(isDead ?
REPORT_REMOVE_DEAD_NODE_ENTRY : REPORT_REMOVE_STALE_NODE_ENTRY,
d));
}
return result;
}
/**
* Dump the new stale data nodes added since last heartbeat check.
*
* @param staleNodes list of datanodes added in the last heartbeat check.
*/
private void dumpStaleNodes(List staleNodes) {
// log nodes detected as stale
if (enableLogStaleNodes && (!staleNodes.isEmpty())) {
StringBuilder staleLogMSG =
new StringBuilder(String.format(REPORT_DELTA_STALE_DN_HEADER,
staleNodes.size()));
for (int ind = 0; ind < staleNodes.size(); ind++) {
String logFormat = (ind % REPORT_STALE_NODE_NODES_PER_LINE == 0) ?
REPORT_STALE_DN_LINE_ENTRY : REPORT_STALE_DN_LINE_TAIL;
staleLogMSG.append(String.format(logFormat, staleNodes.get(ind)));
}
LOG.info(staleLogMSG.toString());
}
}
/**
* Check if there are any expired heartbeats, and if so,
* whether any blocks have to be re-replicated.
* While removing dead datanodes, make sure that only one datanode is marked
* dead at a time within the synchronized section. Otherwise, a cascading
* effect causes more datanodes to be declared dead.
* Check if there are any failed storages and, if so,
* remove all the blocks on those storages. It also covers the following less
* common scenarios. After DatanodeStorage is marked FAILED, it is still
* possible to receive IBR for this storage.
* 1) DN could deliver IBR for failed storage due to its implementation.
* a) DN queues a pending IBR request.
* b) The storage of the block fails.
* c) DN first sends HB, NN will mark the storage FAILED.
* d) DN then sends the pending IBR request.
* 2) SBN processes block request from pendingDNMessages.
* It is possible to have messages in pendingDNMessages that refer
* to some failed storage.
* a) SBN receives an IBR and puts it in pendingDNMessages.
* b) The storage of the block fails.
* c) Edit log replay gets the IBR from pendingDNMessages.
* Alternatively, we can resolve these scenarios with the following approaches.
* A. Make sure the DN doesn't deliver IBRs for failed storage.
* B. Remove all blocks in PendingDataNodeMessages for the failed storage
* when we remove all blocks from BlocksMap for that storage.
*/
@VisibleForTesting
void heartbeatCheck() {
final DatanodeManager dm = blockManager.getDatanodeManager();
// It's OK to check safe mode w/o taking the lock here, we re-check
// for safe mode after taking the lock before removing a datanode.
if (namesystem.isInStartupSafeMode()) {
return;
}
boolean allAlive = false;
while (!allAlive) {
// locate the first dead node.
DatanodeDescriptor dead = null;
// locate the first failed storage that isn't on a dead node.
DatanodeStorageInfo failedStorage = null;
// check the number of stale storages
int numOfStaleStorages = 0;
List<DatanodeDescriptor> staleNodes = new ArrayList<>();
synchronized(this) {
for (DatanodeDescriptor d : datanodes) {
// check if an excessive GC pause has occurred
if (shouldAbortHeartbeatCheck(0)) {
return;
}
if (dead == null && dm.isDatanodeDead(d)) {
stats.incrExpiredHeartbeats();
dead = d;
// remove the node from stale list to adjust the stale list size
// before setting the stale count of the DatanodeManager
removeNodeFromStaleList(d);
} else {
if (d.isStale(dm.getStaleInterval())) {
if (staleDataNodes.add(d)) {
// the node is newly detected as stale; record it for reporting
staleNodes.add(d);
}
} else {
// remove the node if it is no longer stale
removeNodeFromStaleList(d, false);
}
}
DatanodeStorageInfo[] storageInfos = d.getStorageInfos();
for(DatanodeStorageInfo storageInfo : storageInfos) {
if (storageInfo.areBlockContentsStale()) {
numOfStaleStorages++;
}
if (failedStorage == null &&
storageInfo.areBlocksOnFailedStorage() &&
d != dead) {
failedStorage = storageInfo;
}
}
}
// Set the number of stale nodes in the DatanodeManager
dm.setNumStaleNodes(staleDataNodes.size());
dm.setNumStaleStorages(numOfStaleStorages);
}
// log nodes newly detected as stale since the last heartbeat check
dumpStaleNodes(staleNodes);
allAlive = dead == null && failedStorage == null;
if (!allAlive && namesystem.isInStartupSafeMode()) {
return;
}
if (dead != null) {
// acquire the fsnamesystem lock, and then remove the dead node.
namesystem.writeLock();
try {
dm.removeDeadDatanode(dead, !dead.isMaintenance());
} finally {
namesystem.writeUnlock();
}
}
if (failedStorage != null) {
// acquire the fsnamesystem lock, and remove blocks on the storage.
namesystem.writeLock();
try {
blockManager.removeBlocksAssociatedTo(failedStorage);
} finally {
namesystem.writeUnlock();
}
}
}
}
/** Periodically check heartbeats and update block keys. */
private class Monitor implements Runnable {
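/** Monotonic timestamp (ms) of the last completed heartbeat check. */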
private long lastHeartbeatCheck;
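/** Monotonic timestamp (ms) of the last block key update. */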
private long lastBlockKeyUpdate;
@Override
public void run() {
while(namesystem.isRunning()) {
restartHeartbeatStopWatch();
try {
final long now = Time.monotonicNow();
if (lastHeartbeatCheck + heartbeatRecheckInterval < now) {
heartbeatCheck();
lastHeartbeatCheck = now;
}
if (blockManager.shouldUpdateBlockKey(now - lastBlockKeyUpdate)) {
synchronized(HeartbeatManager.this) {
for(DatanodeDescriptor d : datanodes) {
d.setNeedKeyUpdate(true);
}
}
lastBlockKeyUpdate = now;
}
} catch (Exception e) {
LOG.error("Exception while checking heartbeat", e);
}
try {
Thread.sleep(5000); // 5 seconds
} catch (InterruptedException ignored) {
}
// avoid declaring nodes dead for another cycle if a GC pause lasts
// longer than the node recheck interval
if (shouldAbortHeartbeatCheck(-5000)) {
LOG.warn("Skipping next heartbeat scan due to excessive pause");
lastHeartbeatCheck = Time.monotonicNow();
}
}
}
}
}