org.apache.hadoop.hdfs.server.datanode.BlockRecoveryWorker Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-hdfs Show documentation
Apache Hadoop HDFS
The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringStripedBlock;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
import org.apache.hadoop.hdfs.util.StripedBlockUtil;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.util.Daemon;
import org.slf4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BLOCK_GROUP_INDEX_MASK;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getInternalBlockLength;

/**
 * This class handles the block recovery work commands.
 */
@InterfaceAudience.Private
public class BlockRecoveryWorker {
  public static final Logger LOG = DataNode.LOG;

  private final DataNode datanode;
  private final Configuration conf;
  private final DNConf dnConf;

  BlockRecoveryWorker(DataNode datanode) {
    this.datanode = datanode;
    conf = datanode.getConf();
    dnConf = datanode.getDnConf();
  }

  /** A convenient class used in block recovery. */
  static class BlockRecord {
    private final DatanodeID id;
    private final InterDatanodeProtocol datanode;
    private final ReplicaRecoveryInfo rInfo;

    private String storageID;

    BlockRecord(DatanodeID id, InterDatanodeProtocol datanode,
        ReplicaRecoveryInfo rInfo) {
      this.id = id;
      this.datanode = datanode;
      this.rInfo = rInfo;
    }

    private void updateReplicaUnderRecovery(String bpid, long recoveryId,
        long newBlockId, long newLength) throws IOException {
      final ExtendedBlock b = new ExtendedBlock(bpid, rInfo);
      storageID = datanode.updateReplicaUnderRecovery(b, recoveryId, newBlockId,
          newLength);
    }

    public ReplicaRecoveryInfo getReplicaRecoveryInfo(){
      return rInfo;
    }

    @Override
    public String toString() {
      return "block:" + rInfo + " node:" + id;
    }
  }

  /** A block recovery task for a contiguous block. */
  class RecoveryTaskContiguous {
    private final RecoveringBlock rBlock;
    private final ExtendedBlock block;
    private final String bpid;
    private final DatanodeInfo[] locs;
    private final long recoveryId;

    RecoveryTaskContiguous(RecoveringBlock rBlock) {
      this.rBlock = rBlock;
      block = rBlock.getBlock();
      bpid = block.getBlockPoolId();
      locs = rBlock.getLocations();
      recoveryId = rBlock.getNewGenerationStamp();
    }

    protected void recover() throws IOException {
      List syncList = new ArrayList<>(locs.length);
      int errorCount = 0;
      int candidateReplicaCnt = 0;

      // Check generation stamps, replica size and state. Replica must satisfy
      // the following criteria to be included in syncList for recovery:
      // - Valid generation stamp
      // - Non-zero length
      // - Original state is RWR or better
      for(DatanodeID id : locs) {
        try {
          DatanodeID bpReg = getDatanodeID(bpid);
          InterDatanodeProtocol proxyDN = bpReg.equals(id)?
              datanode: DataNode.createInterDataNodeProtocolProxy(id, conf,
              dnConf.socketTimeout, dnConf.connectToDnViaHostname);
          ReplicaRecoveryInfo info = callInitReplicaRecovery(proxyDN, rBlock);
          if (info != null &&
              info.getGenerationStamp() >= block.getGenerationStamp() &&
              info.getNumBytes() > 0) {
            // Count the number of candidate replicas received.
            ++candidateReplicaCnt;
            if (info.getOriginalReplicaState().getValue() <=
                ReplicaState.RWR.getValue()) {
              syncList.add(new BlockRecord(id, proxyDN, info));
            } else {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Block recovery: Ignored replica with invalid " +
                    "original state: " + info + " from DataNode: " + id);
              }
            }
          } else {
            if (LOG.isDebugEnabled()) {
              if (info == null) {
                LOG.debug("Block recovery: DataNode: " + id + " does not have "
                    + "replica for block: " + block);
              } else {
                LOG.debug("Block recovery: Ignored replica with invalid "
                    + "generation stamp or length: " + info + " from " +
                    "DataNode: " + id);
              }
            }
          }
        } catch (RecoveryInProgressException ripE) {
          InterDatanodeProtocol.LOG.warn(
              "Recovery for replica " + block + " on data-node " + id
                  + " is already in progress. Recovery id = "
                  + rBlock.getNewGenerationStamp() + " is aborted.", ripE);
          return;
        } catch (IOException e) {
          ++errorCount;
          InterDatanodeProtocol.LOG.warn("Failed to recover block (block="
              + block + ", datanode=" + id + ")", e);
        }
      }

      if (errorCount == locs.length) {
        throw new IOException("All datanodes failed: block=" + block
            + ", datanodeids=" + Arrays.asList(locs));
      }

      // None of the replicas reported by DataNodes has the required original
      // state, report the error.
      if (candidateReplicaCnt > 0 && syncList.isEmpty()) {
        throw new IOException("Found " + candidateReplicaCnt +
            " replica(s) for block " + block + " but none is in " +
            ReplicaState.RWR.name() + " or better state. datanodeids=" +
            Arrays.asList(locs));
      }

      syncBlock(syncList);
    }

    /** Block synchronization. */
    void syncBlock(List syncList) throws IOException {
      DatanodeProtocolClientSideTranslatorPB nn =
          getActiveNamenodeForBP(block.getBlockPoolId());

      boolean isTruncateRecovery = rBlock.getNewBlock() != null;
      long blockId = (isTruncateRecovery) ?
          rBlock.getNewBlock().getBlockId() : block.getBlockId();

      LOG.info("BlockRecoveryWorker: block={} (length={}),"
              + " isTruncateRecovery={}, syncList={}", block,
          block.getNumBytes(), isTruncateRecovery, syncList);

      // syncList.isEmpty() means that all data-nodes do not have the block
      // or their replicas have 0 length.
      // The block can be deleted.
      if (syncList.isEmpty()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("syncBlock for block " + block + ", all datanodes don't " +
              "have the block or their replicas have 0 length. The block can " +
              "be deleted.");
        }
        nn.commitBlockSynchronization(block, recoveryId, 0,
            true, true, DatanodeID.EMPTY_ARRAY, null);
        return;
      }

      // Calculate the best available replica state.
      ReplicaState bestState = ReplicaState.RWR;
      long finalizedLength = -1;
      for (BlockRecord r : syncList) {
        assert r.rInfo.getNumBytes() > 0 : "zero length replica";
        ReplicaState rState = r.rInfo.getOriginalReplicaState();
        if (rState.getValue() < bestState.getValue()) {
          bestState = rState;
        }
        if(rState == ReplicaState.FINALIZED) {
          if (finalizedLength > 0 && finalizedLength != r.rInfo.getNumBytes()) {
            throw new IOException("Inconsistent size of finalized replicas. " +
                "Replica " + r.rInfo + " expected size: " + finalizedLength);
          }
          finalizedLength = r.rInfo.getNumBytes();
        }
      }

      // Calculate list of nodes that will participate in the recovery
      // and the new block size
      List participatingList = new ArrayList<>();
      final ExtendedBlock newBlock = new ExtendedBlock(bpid, blockId,
          -1, recoveryId);
      switch(bestState) {
      case FINALIZED:
        assert finalizedLength > 0 : "finalizedLength is not positive";
        for(BlockRecord r : syncList) {
          ReplicaState rState = r.rInfo.getOriginalReplicaState();
          if (rState == ReplicaState.FINALIZED ||
              rState == ReplicaState.RBW &&
                  r.rInfo.getNumBytes() == finalizedLength) {
            participatingList.add(r);
          }
          if (LOG.isDebugEnabled()) {
            LOG.debug("syncBlock replicaInfo: block=" + block +
                ", from datanode " + r.id + ", receivedState=" + rState.name() +
                ", receivedLength=" + r.rInfo.getNumBytes() +
                ", bestState=FINALIZED, finalizedLength=" + finalizedLength);
          }
        }
        newBlock.setNumBytes(finalizedLength);
        break;
      case RBW:
      case RWR:
        long minLength = Long.MAX_VALUE;
        for(BlockRecord r : syncList) {
          ReplicaState rState = r.rInfo.getOriginalReplicaState();
          if(rState == bestState) {
            minLength = Math.min(minLength, r.rInfo.getNumBytes());
            participatingList.add(r);
          }
          if (LOG.isDebugEnabled()) {
            LOG.debug("syncBlock replicaInfo: block=" + block +
                ", from datanode " + r.id + ", receivedState=" + rState.name() +
                ", receivedLength=" + r.rInfo.getNumBytes() + ", bestState=" +
                bestState.name());
          }
        }
        // recover() guarantees syncList will have at least one replica with RWR
        // or better state.
        if (minLength == Long.MAX_VALUE) {
          throw new IOException("Incorrect block size");
        }
        newBlock.setNumBytes(minLength);
        break;
      case RUR:
      case TEMPORARY:
        assert false : "bad replica state: " + bestState;
      default:
        break; // we have 'case' all enum values
      }
      if (isTruncateRecovery) {
        newBlock.setNumBytes(rBlock.getNewBlock().getNumBytes());
      }

      LOG.info("BlockRecoveryWorker: block={} (length={}), bestState={},"
              + " newBlock={} (length={}), participatingList={}",
          block, block.getNumBytes(), bestState.name(), newBlock,
          newBlock.getNumBytes(), participatingList);

      List failedList = new ArrayList<>();
      final List successList = new ArrayList<>();
      for (BlockRecord r : participatingList) {
        try {
          r.updateReplicaUnderRecovery(bpid, recoveryId, blockId,
              newBlock.getNumBytes());
          successList.add(r);
        } catch (IOException e) {
          InterDatanodeProtocol.LOG.warn("Failed to updateBlock (newblock="
              + newBlock + ", datanode=" + r.id + ")", e);
          failedList.add(r.id);
        }
      }

      // Abort if all failed.
      if (successList.isEmpty()) {
        throw new IOException("Cannot recover " + block
            + ", the following datanodes failed: " + failedList);
      }

      // Notify the name-node about successfully recovered replicas.
      final DatanodeID[] datanodes = new DatanodeID[successList.size()];
      final String[] storages = new String[datanodes.length];
      for (int i = 0; i < datanodes.length; i++) {
        final BlockRecord r = successList.get(i);
        datanodes[i] = r.id;
        storages[i] = r.storageID;
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug("Datanode triggering commitBlockSynchronization, block=" +
            block + ", newGs=" + newBlock.getGenerationStamp() +
            ", newLength=" + newBlock.getNumBytes());
      }

      nn.commitBlockSynchronization(block,
          newBlock.getGenerationStamp(), newBlock.getNumBytes(), true, false,
          datanodes, storages);
    }
  }

  /**
   * blk_0  blk_1  blk_2  blk_3  blk_4  blk_5  blk_6  blk_7  blk_8
   *  64k    64k    64k    64k    64k    64k    64k    64k    64k   <-- stripe_0
   *  64k    64k    64k    64k    64k    64k    64k    64k    64k
   *  64k    64k    64k    64k    64k    64k    64k    61k    <-- startStripeIdx
   *  64k    64k    64k    64k    64k    64k    64k
   *  64k    64k    64k    64k    64k    64k    59k
   *  64k    64k    64k    64k    64k    64k
   *  64k    64k    64k    64k    64k    64k                <-- last full stripe
   *  64k    64k    13k    64k    55k     3k              <-- target last stripe
   *  64k    64k           64k     1k
   *  64k    64k           58k
   *  64k    64k
   *  64k    19k
   *  64k                                               <-- total visible stripe
   *
   *  Due to different speed of streamers, the internal blocks in a block group
   *  could have different lengths when the block group isn't ended normally.
   *  The purpose of this class is to recover the UnderConstruction block group,
   *  so all internal blocks end at the same stripe.
   *
   * The steps:
   * 1. get all blocks lengths from DataNodes.
   * 2. calculate safe length, which is at the target last stripe.
   * 3. decode and feed blk_6~8, make them end at last full stripe. (the last
   * full stripe means the last decodable stripe.)
   * 4. encode the target last stripe, with the remaining sequential data. In
   * this case, the sequential data is 64k+64k+13k. Feed blk_6~8 the parity cells.
   * Overwrite the parity cell if have to.
   * 5. truncate the stripes from visible stripe, to target last stripe.
   * TODO: implement step 3,4
   */
  public class RecoveryTaskStriped {
    private final RecoveringBlock rBlock;
    private final ExtendedBlock block;
    private final String bpid;
    private final DatanodeInfo[] locs;
    private final long recoveryId;

    private final byte[] blockIndices;
    private final ErasureCodingPolicy ecPolicy;

    RecoveryTaskStriped(RecoveringStripedBlock rBlock) {
      this.rBlock = rBlock;
      // TODO: support truncate
      Preconditions.checkArgument(rBlock.getNewBlock() == null);

      block = rBlock.getBlock();
      bpid = block.getBlockPoolId();
      locs = rBlock.getLocations();
      recoveryId = rBlock.getNewGenerationStamp();
      blockIndices = rBlock.getBlockIndices();
      ecPolicy = rBlock.getErasureCodingPolicy();
    }

    protected void recover() throws IOException {
      checkLocations(locs.length);

      Map syncBlocks = new HashMap<>(locs.length);
      final int dataBlkNum = ecPolicy.getNumDataUnits();
      final int totalBlkNum = dataBlkNum + ecPolicy.getNumParityUnits();
      //check generation stamps
      for (int i = 0; i < locs.length; i++) {
        DatanodeID id = locs[i];
        try {
          DatanodeID bpReg = getDatanodeID(bpid);
          InterDatanodeProtocol proxyDN = bpReg.equals(id) ?
              datanode : DataNode.createInterDataNodeProtocolProxy(id, conf,
              dnConf.socketTimeout, dnConf.connectToDnViaHostname);
          ExtendedBlock internalBlk = new ExtendedBlock(block);
          final long blockId = block.getBlockId() + blockIndices[i];
          internalBlk.setBlockId(blockId);
          ReplicaRecoveryInfo info = callInitReplicaRecovery(proxyDN,
              new RecoveringBlock(internalBlk, null, recoveryId));

          if (info != null &&
              info.getGenerationStamp() >= block.getGenerationStamp() &&
              info.getNumBytes() > 0) {
            final BlockRecord existing = syncBlocks.get(blockId);
            if (existing == null ||
                info.getNumBytes() > existing.rInfo.getNumBytes()) {
              // if we have >1 replicas for the same internal block, we
              // simply choose the one with larger length.
              // TODO: better usage of redundant replicas
              syncBlocks.put(blockId, new BlockRecord(id, proxyDN, info));
            }
          }
        } catch (RecoveryInProgressException ripE) {
          InterDatanodeProtocol.LOG.warn(
              "Recovery for replica " + block + " on data-node " + id
                  + " is already in progress. Recovery id = "
                  + rBlock.getNewGenerationStamp() + " is aborted.", ripE);
          return;
        } catch (IOException e) {
          InterDatanodeProtocol.LOG.warn("Failed to recover block (block="
              + block + ", datanode=" + id + ")", e);
        }
      }
      checkLocations(syncBlocks.size());

      final long safeLength = getSafeLength(syncBlocks);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Recovering block " + block
            + ", length=" + block.getNumBytes() + ", safeLength=" + safeLength
            + ", syncList=" + syncBlocks);
      }

      // If some internal blocks reach the safe length, convert them to RUR
      List rurList = new ArrayList<>(locs.length);
      for (BlockRecord r : syncBlocks.values()) {
        int blockIndex = (int) (r.rInfo.getBlockId() & BLOCK_GROUP_INDEX_MASK);
        long newSize = getInternalBlockLength(safeLength, ecPolicy.getCellSize(),
            dataBlkNum, blockIndex);
        if (r.rInfo.getNumBytes() >= newSize) {
          rurList.add(r);
        }
      }
      assert rurList.size() >= dataBlkNum : "incorrect safe length";

      // Recovery the striped block by truncating internal blocks to the safe
      // length. Abort if there is any failure in this step.
      truncatePartialBlock(rurList, safeLength);

      // notify Namenode the new size and locations
      final DatanodeID[] newLocs = new DatanodeID[totalBlkNum];
      final String[] newStorages = new String[totalBlkNum];
      for (int i = 0; i < totalBlkNum; i++) {
        newLocs[blockIndices[i]] = DatanodeID.EMPTY_DATANODE_ID;
        newStorages[blockIndices[i]] = "";
      }
      for (BlockRecord r : rurList) {
        int index = (int) (r.rInfo.getBlockId() &
            HdfsServerConstants.BLOCK_GROUP_INDEX_MASK);
        newLocs[index] = r.id;
        newStorages[index] = r.storageID;
      }
      ExtendedBlock newBlock = new ExtendedBlock(bpid, block.getBlockId(),
          safeLength, recoveryId);
      DatanodeProtocolClientSideTranslatorPB nn = getActiveNamenodeForBP(bpid);
      nn.commitBlockSynchronization(block, newBlock.getGenerationStamp(),
          newBlock.getNumBytes(), true, false, newLocs, newStorages);
    }

    private void truncatePartialBlock(List rurList,
        long safeLength) throws IOException {
      int cellSize = ecPolicy.getCellSize();
      int dataBlkNum = ecPolicy.getNumDataUnits();
      List failedList = new ArrayList<>();
      for (BlockRecord r : rurList) {
        int blockIndex = (int) (r.rInfo.getBlockId() & BLOCK_GROUP_INDEX_MASK);
        long newSize = getInternalBlockLength(safeLength, cellSize, dataBlkNum,
            blockIndex);
        try {
          r.updateReplicaUnderRecovery(bpid, recoveryId, r.rInfo.getBlockId(),
              newSize);
        } catch (IOException e) {
          InterDatanodeProtocol.LOG.warn("Failed to updateBlock (newblock="
              + ", datanode=" + r.id + ")", e);
          failedList.add(r.id);
        }
      }

      // If any of the data-nodes failed, the recovery fails, because
      // we never know the actual state of the replica on failed data-nodes.
      // The recovery should be started over.
      if (!failedList.isEmpty()) {
        throw new IOException("Cannot recover " + block
            + ", the following datanodes failed: " + failedList);
      }
    }

    /**
     * TODO: the current implementation depends on the assumption that the
     * parity cells are only generated based on the full stripe. This is not
     * true after we support hflush.
     */
    @VisibleForTesting
    long getSafeLength(Map syncBlocks) {
      final int dataBlkNum = ecPolicy.getNumDataUnits();
      Preconditions.checkArgument(syncBlocks.size() >= dataBlkNum);
      long[] blockLengths = new long[syncBlocks.size()];
      int i = 0;
      for (BlockRecord r : syncBlocks.values()) {
        ReplicaRecoveryInfo rInfo = r.getReplicaRecoveryInfo();
        blockLengths[i++] = rInfo.getNumBytes();
      }
      return StripedBlockUtil.getSafeLength(ecPolicy, blockLengths);
    }

    private void checkLocations(int locationCount)
        throws IOException {
      if (locationCount < ecPolicy.getNumDataUnits()) {
        throw new IOException(block + " has no enough internal blocks" +
            ", unable to start recovery. Locations=" + Arrays.asList(locs));
      }
    }
  }

  private DatanodeID getDatanodeID(String bpid) throws IOException {
    BPOfferService bpos = datanode.getBPOfferService(bpid);
    if (bpos == null) {
      throw new IOException("No block pool offer service for bpid=" + bpid);
    }
    return new DatanodeID(bpos.bpRegistration);
  }

  private static void logRecoverBlock(String who, RecoveringBlock rb) {
    ExtendedBlock block = rb.getBlock();
    DatanodeInfo[] targets = rb.getLocations();

    LOG.info("BlockRecoveryWorker: " + who + " calls recoverBlock(" + block
        + ", targets=[" + Joiner.on(", ").join(targets) + "]"
        + ", newGenerationStamp=" + rb.getNewGenerationStamp()
        + ", newBlock=" + rb.getNewBlock()
        + ", isStriped=" + rb.isStriped()
        + ")");
  }

  /**
   * Convenience method, which unwraps RemoteException.
   * @throws IOException not a RemoteException.
   */
  private static ReplicaRecoveryInfo callInitReplicaRecovery(
      InterDatanodeProtocol datanode, RecoveringBlock rBlock)
      throws IOException {
    try {
      return datanode.initReplicaRecovery(rBlock);
    } catch(RemoteException re) {
      throw re.unwrapRemoteException();
    }
  }

  /**
   * Get the NameNode corresponding to the given block pool.
   *
   * @param bpid Block pool Id
   * @return Namenode corresponding to the bpid
   * @throws IOException if unable to get the corresponding NameNode
   */
  DatanodeProtocolClientSideTranslatorPB getActiveNamenodeForBP(
      String bpid) throws IOException {
    BPOfferService bpos = datanode.getBPOfferService(bpid);
    if (bpos == null) {
      throw new IOException("No block pool offer service for bpid=" + bpid);
    }

    DatanodeProtocolClientSideTranslatorPB activeNN = bpos.getActiveNN();
    if (activeNN == null) {
      throw new IOException(
          "Block pool " + bpid + " has not recognized an active NN");
    }
    return activeNN;
  }

  public Daemon recoverBlocks(final String who,
      final Collection blocks) {
    Daemon d = new Daemon(datanode.threadGroup, new Runnable() {
      @Override
      public void run() {
        for(RecoveringBlock b : blocks) {
          try {
            logRecoverBlock(who, b);
            if (b.isStriped()) {
              new RecoveryTaskStriped((RecoveringStripedBlock) b).recover();
            } else {
              new RecoveryTaskContiguous(b).recover();
            }
          } catch (IOException e) {
            LOG.warn("recoverBlocks FAILED: " + b, e);
          }
        }
      }
    });
    d.start();
    return d;
  }
}