/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.StandardMBean;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.DF;
import org.apache.hadoop.fs.DU;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.ReplicaState;
import org.apache.hadoop.io.IOUtils;

/**************************************************
 * FSDataset manages a set of data blocks.  Each block
 * has a unique name and an extent on disk.
 *
 ***************************************************/
@InterfaceAudience.Private
public class FSDataset implements FSConstants, FSDatasetInterface {


  /**
   * A node type that can be built into a tree reflecting the
   * hierarchy of blocks on the local disk.
   */
  class FSDir {
    File dir;
    int numBlocks = 0;
    FSDir children[];
    int lastChildIdx = 0;
    /**
     */
    public FSDir(File dir) 
      throws IOException {
      this.dir = dir;
      this.children = null;
      if (!dir.exists()) {
        if (!dir.mkdirs()) {
          throw new IOException("Mkdirs failed to create " + 
                                dir.toString());
        }
      } else {
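        // The directory already exists: count the block files stored directly
        // in it and recursively build FSDir nodes for its subdirectories.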
        File[] files = dir.listFiles();
        int numChildren = 0;
        for (int idx = 0; idx < files.length; idx++) {
          if (files[idx].isDirectory()) {
            numChildren++;
          } else if (Block.isBlockFilename(files[idx])) {
            numBlocks++;
          }
        }
        if (numChildren > 0) {
          children = new FSDir[numChildren];
          int curdir = 0;
          for (int idx = 0; idx < files.length; idx++) {
            if (files[idx].isDirectory()) {
              children[curdir] = new FSDir(files[idx]);
              curdir++;
            }
          }
        }
      }
    }
        
    public File addBlock(Block b, File src) throws IOException {
      //First try without creating subdirectories
      File file = addBlock(b, src, false, false);          
      return (file != null) ? file : addBlock(b, src, true, true);
    }

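    // Placement strategy: first try to place the block directly in this
    // directory (if it holds fewer than maxBlocksPerDir blocks); otherwise
    // walk the child subdirectories round-robin starting at lastChildIdx,
    // and only create a new level of "subdir" children when createOk is set.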
    private File addBlock(Block b, File src, boolean createOk, 
                          boolean resetIdx) throws IOException {
      if (numBlocks < maxBlocksPerDir) {
        File dest = new File(dir, b.getBlockName());
        File metaData = getMetaFile( src, b );
        File newmeta = getMetaFile(dest, b);
        if ( ! metaData.renameTo( newmeta ) ||
            ! src.renameTo( dest ) ) {
          throw new IOException( "could not move files for " + b +
                                 " from " + src + " to " + 
                                 dest.getAbsolutePath() + " or from "
                                 + metaData + " to " + newmeta);
        }
        if (DataNode.LOG.isDebugEnabled()) {
          DataNode.LOG.debug("addBlock: Moved " + metaData + " to " + newmeta);
          DataNode.LOG.debug("addBlock: Moved " + src + " to " + dest);
        }

        numBlocks += 1;
        return dest;
      }
            
      if (lastChildIdx < 0 && resetIdx) {
        //reset so that all children will be checked
        lastChildIdx = random.nextInt(children.length);              
      }
            
      if (lastChildIdx >= 0 && children != null) {
        //Check if any child-tree has room for a block.
        for (int i=0; i < children.length; i++) {
          int idx = (lastChildIdx + i)%children.length;
          File file = children[idx].addBlock(b, src, false, resetIdx);
          if (file != null) {
            lastChildIdx = idx;
            return file; 
          }
        }
        lastChildIdx = -1;
      }
            
      if (!createOk) {
        return null;
      }
            
      if (children == null || children.length == 0) {
        children = new FSDir[maxBlocksPerDir];
        for (int idx = 0; idx < maxBlocksPerDir; idx++) {
          children[idx] = new FSDir(new File(dir, DataStorage.BLOCK_SUBDIR_PREFIX+idx));
        }
      }
            
      //now pick a child randomly for creating a new set of subdirs.
      lastChildIdx = random.nextInt(children.length);
      return children[ lastChildIdx ].addBlock(b, src, true, false); 
    }

    void getVolumeMap(ReplicasMap volumeMap, FSVolume volume) 
    throws IOException {
      if (children != null) {
        for (int i = 0; i < children.length; i++) {
          children[i].getVolumeMap(volumeMap, volume);
        }
      }

      recoverTempUnlinkedBlock();
      volume.addToReplicasMap(volumeMap, dir, true);
    }
        
    /**
     * Recover unlinked tmp files on datanode restart. If the original block
     * does not exist, then the tmp file is renamed to be the
     * original file name; otherwise the tmp file is deleted.
     */
    private void recoverTempUnlinkedBlock() throws IOException {
      File files[] = dir.listFiles();
      for (File file : files) {
        if (!FSDataset.isUnlinkTmpFile(file)) {
          continue;
        }
        File blockFile = getOrigFile(file);
        if (blockFile.exists()) {
          //
          // If the original block file still exists, then no recovery
          // is needed.
          //
          if (!file.delete()) {
            throw new IOException("Unable to cleanup unlinked tmp file " +
                file);
          }
        } else {
          if (!file.renameTo(blockFile)) {
            throw new IOException("Unable to cleanup detached file " +
                file);
          }
        }
      }
    }
    
    /**
     * Check if a data directory is healthy.
     * @throws DiskErrorException
     */
    public void checkDirTree() throws DiskErrorException {
      DiskChecker.checkDir(dir);
            
      if (children != null) {
        for (int i = 0; i < children.length; i++) {
          children[i].checkDirTree();
        }
      }
    }
        
    void clearPath(File f) {
      String root = dir.getAbsolutePath();
      String dir = f.getAbsolutePath();
      if (dir.startsWith(root)) {
        String[] dirNames = dir.substring(root.length()).
          split(File.separator + "subdir");
        if (clearPath(f, dirNames, 1))
          return;
      }
      clearPath(f, null, -1);
    }
        
    /*
     * dirNames is an array of string integers derived from the
     * usual directory structure data/subdirN/subdirXY/subdirM ...
     * If the dirNames array is non-null, we only check the child at
     * children[dirNames[idx]]. This avoids iterating over all
     * children in the common case. If the directory structure changes
     * in later versions, we need to revisit this.
     */
    private boolean clearPath(File f, String[] dirNames, int idx) {
      if ((dirNames == null || idx == dirNames.length) &&
          dir.compareTo(f) == 0) {
        numBlocks--;
        return true;
      }
          
      if (dirNames != null) {
        //guess the child index from the directory name
        if (idx > (dirNames.length - 1) || children == null) {
          return false;
        }
        int childIdx; 
        try {
          childIdx = Integer.parseInt(dirNames[idx]);
        } catch (NumberFormatException ignored) {
          // layout changed? we could print a warning.
          return false;
        }
        return (childIdx >= 0 && childIdx < children.length) ?
          children[childIdx].clearPath(f, dirNames, idx+1) : false;
      }

      //guesses failed. back to blind iteration.
      if (children != null) {
        for(int i=0; i < children.length; i++) {
          if (children[i].clearPath(f, null, -1)){
            return true;
          }
        }
      }
      return false;
    }
        
    public String toString() {
      return "FSDir{" +
        "dir=" + dir +
        ", children=" + (children == null ? null : Arrays.asList(children)) +
        "}";
    }
  }

  class FSVolume {
    private File currentDir;
    private FSDir dataDir;      // directory storing finalized replicas
    private File rbwDir;        // directory storing replicas being written (rbw)
    private File tmpDir;        // directory storing temporary replicas
    private DF usage;
    private DU dfsUsage;
    private long reserved;

    
    FSVolume(File currentDir, Configuration conf) throws IOException {
      this.reserved = conf.getLong("dfs.datanode.du.reserved", 0);
      this.currentDir = currentDir; 
      File parent = currentDir.getParentFile();
      final File finalizedDir = new File(
          currentDir, DataStorage.STORAGE_DIR_FINALIZED);

      // Files that were being written when the datanode was last shutdown
      // are now moved back to the data directory. It is possible that
      // in the future, we might want to do some sort of datanode-local
      // recovery for these blocks. For example, crc validation.
      //
      this.tmpDir = new File(parent, "tmp");
      if (tmpDir.exists()) {
        FileUtil.fullyDelete(tmpDir);
      }
      this.rbwDir = new File(currentDir, DataStorage.STORAGE_DIR_RBW);
      if (rbwDir.exists() && !supportAppends) {
        FileUtil.fullyDelete(rbwDir);
      }
      this.dataDir = new FSDir(finalizedDir);
      if (!rbwDir.mkdirs()) {  // create rbw directory if it does not exist
        if (!rbwDir.isDirectory()) {
          throw new IOException("Mkdirs failed to create " + rbwDir.toString());
        }
      }
      if (!tmpDir.mkdirs()) {
        if (!tmpDir.isDirectory()) {
          throw new IOException("Mkdirs failed to create " + tmpDir.toString());
        }
      }
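      // DF reports the capacity/available space of the filesystem containing
      // this volume; DU periodically measures the space actually used under it.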
      this.usage = new DF(parent, conf);
      this.dfsUsage = new DU(parent, conf);
      this.dfsUsage.start();
    }

    File getCurrentDir() {
      return currentDir;
    }
    
    void decDfsUsed(long value) {
      // The caller of this method (BlockFileDeleteTask.run()) has not
      // locked FSDataset.this yet.
      synchronized(FSDataset.this) {
        dfsUsage.decDfsUsed(value);
      }
    }
    
    long getDfsUsed() throws IOException {
      return dfsUsage.getUsed();
    }
    
    /**
     * Calculate the capacity of the filesystem, after removing any
     * reserved capacity.
     * @return the unreserved number of bytes left in this filesystem. May be zero.
     */
    long getCapacity() throws IOException {
      long remaining = usage.getCapacity() - reserved;
      return remaining > 0 ? remaining : 0;
    }
      
    long getAvailable() throws IOException {
      long remaining = getCapacity()-getDfsUsed();
      long available = usage.getAvailable();
      if (remaining>available) {
        remaining = available;
      }
      return (remaining > 0) ? remaining : 0;
    }
      
    long getReserved(){
      return reserved;
    }
    
    String getMount() throws IOException {
      return usage.getMount();
    }
      
    File getDir() {
      return dataDir.dir;
    }
    
    /**
     * Temporary files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createTmpFile(Block b) throws IOException {
      File f = new File(tmpDir, b.getBlockName());
      return FSDataset.createTmpFile(b, f);
    }

    /**
     * RBW files. They get moved to the finalized block directory when
     * the block is finalized.
     */
    File createRbwFile(Block b) throws IOException {
      File f = new File(rbwDir, b.getBlockName());
      return FSDataset.createTmpFile(b, f);
    }

    File addBlock(Block b, File f) throws IOException {
      File blockFile = dataDir.addBlock(b, f);
      File metaFile = getMetaFile( blockFile , b);
      dfsUsage.incDfsUsed(b.getNumBytes()+metaFile.length());
      return blockFile;
    }
      
    void checkDirs() throws DiskErrorException {
      dataDir.checkDirTree();
      DiskChecker.checkDir(tmpDir);
      DiskChecker.checkDir(rbwDir);
    }
      
    void getVolumeMap(ReplicasMap volumeMap) throws IOException {
      // add finalized replicas
      dataDir.getVolumeMap(volumeMap, this);
      // add rbw replicas
      addToReplicasMap(volumeMap, rbwDir, false);
    }

    /**
     * Add replicas under the given directory to the volume map
     * @param volumeMap the replicas map
     * @param dir an input directory
     * @param isFinalized true if the directory has finalized replicas;
     *                    false if the directory has rbw replicas
     */
    private void addToReplicasMap(ReplicasMap volumeMap, 
        File dir, boolean isFinalized) {
      File blockFiles[] = dir.listFiles();
      for (File blockFile : blockFiles) {
        if (!Block.isBlockFilename(blockFile))
          continue;
        
        long genStamp = getGenerationStampFromFile(blockFiles, blockFile);
        long blockId = Block.filename2id(blockFile.getName());
        ReplicaInfo newReplica = null;
        if (isFinalized) {
          newReplica = new FinalizedReplica(blockId, 
              blockFile.length(), genStamp, this, blockFile.getParentFile());
        } else {
          newReplica = new ReplicaWaitingToBeRecovered(blockId,
              validateIntegrity(blockFile, genStamp), 
              genStamp, this, blockFile.getParentFile());
        }

        ReplicaInfo oldReplica = volumeMap.add(newReplica);
        if (oldReplica != null) {
          DataNode.LOG.warn("Two block files with the same block id exist " +
              "on disk: " + oldReplica.getBlockFile() +
              " and " + blockFile );
        }
      }
    }
    
    /**
     * Find out the number of bytes in the block that match its crc.
     * 
     * This algorithm assumes that data corruption caused by unexpected 
     * datanode shutdown occurs only in the last crc chunk. So it checks
     * only the last chunk.
     * 
     * @param blockFile the block file
     * @param genStamp generation stamp of the block
     * @return the number of valid bytes
     */
    private long validateIntegrity(File blockFile, long genStamp) {
      DataInputStream checksumIn = null;
      InputStream blockIn = null;
      try {
        File metaFile = new File(getMetaFileName(blockFile.toString(), genStamp));
        long blockFileLen = blockFile.length();
        long metaFileLen = metaFile.length();
        int crcHeaderLen = DataChecksum.getChecksumHeaderSize();
        if (!blockFile.exists() || blockFileLen == 0 ||
            !metaFile.exists() || metaFileLen < (long)crcHeaderLen) {
          return 0;
        }
        checksumIn = new DataInputStream(
            new BufferedInputStream(new FileInputStream(metaFile),
                BUFFER_SIZE));

        // read and handle the common header here. For now just a version
        BlockMetadataHeader header = BlockMetadataHeader.readHeader(checksumIn);
        short version = header.getVersion();
        if (version != FSDataset.METADATA_VERSION) {
          DataNode.LOG.warn("Wrong version (" + version + ") for metadata file "
              + metaFile + " ignoring ...");
        }
        DataChecksum checksum = header.getChecksum();
        int bytesPerChecksum = checksum.getBytesPerChecksum();
        int checksumSize = checksum.getChecksumSize();
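        // The number of verifiable chunks is bounded both by the block data
        // length and by the number of checksums actually present in the meta
        // file (excluding its header).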
        long numChunks = Math.min(
            (blockFileLen + bytesPerChecksum - 1)/bytesPerChecksum, 
            (metaFileLen - crcHeaderLen)/checksumSize);
        if (numChunks == 0) {
          return 0;
        }
        IOUtils.skipFully(checksumIn, (numChunks-1)*checksumSize);
        blockIn = new FileInputStream(blockFile);
        long lastChunkStartPos = (numChunks-1)*bytesPerChecksum;
        IOUtils.skipFully(blockIn, lastChunkStartPos);
        int lastChunkSize = (int)Math.min(
            bytesPerChecksum, blockFileLen-lastChunkStartPos);
        byte[] buf = new byte[lastChunkSize+checksumSize];
        checksumIn.readFully(buf, lastChunkSize, checksumSize);
        IOUtils.readFully(blockIn, buf, 0, lastChunkSize);

        checksum.update(buf, 0, lastChunkSize);
        if (checksum.compare(buf, lastChunkSize)) { // last chunk matches crc
          return lastChunkStartPos + lastChunkSize;
        } else { // last chunk is corrupt
          return lastChunkStartPos;
        }
      } catch (IOException e) {
        DataNode.LOG.warn(e);
        return 0;
      } finally {
        IOUtils.closeStream(checksumIn);
        IOUtils.closeStream(blockIn);
      }
    }
      
    void clearPath(File f) {
      dataDir.clearPath(f);
    }
      
    public String toString() {
      return getDir().getAbsolutePath();
    }
  }
    
  static class FSVolumeSet {
    FSVolume[] volumes = null;
    int curVolume = 0;
    int numFailedVolumes = 0;

    FSVolumeSet(FSVolume[] volumes) {
      this.volumes = volumes;
    }
    
    private int numberOfVolumes() {
      return volumes.length;
    }

    private int numberOfFailedVolumes() {
      return numFailedVolumes;
    }

    synchronized FSVolume getNextVolume(long blockSize) throws IOException {
      
      if(volumes.length < 1) {
        throw new DiskOutOfSpaceException("No more available volumes");
      }
      
      // since volumes could've been removed because of the failure
      // make sure we are not out of bounds
      if(curVolume >= volumes.length) {
        curVolume = 0;
      }
      
      int startVolume = curVolume;
      
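      // Round-robin over the volumes starting at curVolume; return the first
      // volume with enough available space, or fail once we wrap around.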
      while (true) {
        FSVolume volume = volumes[curVolume];
        curVolume = (curVolume + 1) % volumes.length;
        if (volume.getAvailable() > blockSize) { return volume; }
        if (curVolume == startVolume) {
          throw new DiskOutOfSpaceException("Insufficient space for an additional block");
        }
      }
    }
      
    long getDfsUsed() throws IOException {
      long dfsUsed = 0L;
      for (int idx = 0; idx < volumes.length; idx++) {
        dfsUsed += volumes[idx].getDfsUsed();
      }
      return dfsUsed;
    }

    long getCapacity() throws IOException {
      long capacity = 0L;
      for (int idx = 0; idx < volumes.length; idx++) {
        capacity += volumes[idx].getCapacity();
      }
      return capacity;
    }
      
    long getRemaining() throws IOException {
      long remaining = 0L;
      for (int idx = 0; idx < volumes.length; idx++) {
        remaining += volumes[idx].getAvailable();
      }
      return remaining;
    }
      
    synchronized void getVolumeMap(ReplicasMap volumeMap) throws IOException {
      for (int idx = 0; idx < volumes.length; idx++) {
        volumes[idx].getVolumeMap(volumeMap);
      }
    }
      
    /**
     * Calls {@link FSVolume#checkDirs()} on each volume, removing any
     * volumes from the active list that result in a DiskErrorException.
     * @return list of all the removed volumes.
     */
    synchronized List<FSVolume> checkDirs() {
      ArrayList<FSVolume> removedVols = null;
      
      for (int idx = 0; idx < volumes.length; idx++) {
        FSVolume fsv = volumes[idx];
        try {
          fsv.checkDirs();
        } catch (DiskErrorException e) {
          DataNode.LOG.warn("Removing failed volume " + fsv + ": ",e);
          if (removedVols == null) {
            removedVols = new ArrayList<FSVolume>(1);
          }
          removedVols.add(volumes[idx]);
          volumes[idx] = null; // Remove the volume
          numFailedVolumes++;
        }
      }
      
      // Remove null volumes from the volumes array
      if (removedVols != null && removedVols.size() > 0) {
        FSVolume newVols[] = new FSVolume[volumes.length - removedVols.size()];
        int i = 0;
        for (FSVolume vol : volumes) {
          if (vol != null) {
            newVols[i++] = vol;
          }
        }
        volumes = newVols; // Replace array of volumes
        DataNode.LOG.info("Completed FSVolumeSet.checkDirs. Removed "
            + removedVols.size() + " volumes. List of current volumes: "
            + this);
      }

      return removedVols;
    }
      
    public String toString() {
      StringBuilder sb = new StringBuilder();
      for (int idx = 0; idx < volumes.length; idx++) {
        sb.append(volumes[idx].toString());
        if (idx != volumes.length - 1) { sb.append(","); }
      }
      return sb.toString();
    }

    public boolean isValid(FSVolume volume) {
      for (int idx = 0; idx < volumes.length; idx++) {
        if (volumes[idx] == volume) {
          return true;
        }
      }
      return false;
    }
  }
  
  //////////////////////////////////////////////////////
  //
  // FSDataSet
  //
  //////////////////////////////////////////////////////

  //Find better place?
  public static final String METADATA_EXTENSION = ".meta";
  public static final short METADATA_VERSION = 1;
  static final String UNLINK_BLOCK_SUFFIX = ".unlinked";

  private static boolean isUnlinkTmpFile(File f) {
    String name = f.getName();
    return name.endsWith(UNLINK_BLOCK_SUFFIX);
  }
  
  static File getUnlinkTmpFile(File f) {
    return new File(f.getParentFile(), f.getName()+UNLINK_BLOCK_SUFFIX);
  }
  
  private static File getOrigFile(File unlinkTmpFile) {
    String fileName = unlinkTmpFile.getName();
    return new File(unlinkTmpFile.getParentFile(),
        fileName.substring(0, fileName.length()-UNLINK_BLOCK_SUFFIX.length()));
  }
  
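  // For example, a block file named "blk_123" with generation stamp 1001
  // gets the meta file name "blk_123_1001.meta".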
  static String getMetaFileName(String blockFileName, long genStamp) {
    return blockFileName + "_" + genStamp + METADATA_EXTENSION;
  }
  
  static File getMetaFile(File f , Block b) {
    return new File(getMetaFileName(f.getAbsolutePath(),
                                    b.getGenerationStamp())); 
  }
  protected File getMetaFile(Block b) throws IOException {
    return getMetaFile(getBlockFile(b), b);
  }

  /** Find the metadata file for the specified block file in the given
   * directory listing and return the generation stamp encoded in its name.
   */
  private static long getGenerationStampFromFile(File[] listdir, File blockFile) {
    String blockName = blockFile.getName();
    for (int j = 0; j < listdir.length; j++) {
      String path = listdir[j].getName();
      if (!path.startsWith(blockName)) {
        continue;
      }
      if (blockFile == listdir[j]) {
        continue;
      }
      return Block.getGenerationStamp(listdir[j].getName());
    }
    DataNode.LOG.warn("Block " + blockFile + 
                      " does not have a metafile!");
    return GenerationStamp.GRANDFATHER_GENERATION_STAMP;
  }

  /** Find the corresponding meta data file from a given block file */
  private static File findMetaFile(final File blockFile) throws IOException {
    final String prefix = blockFile.getName() + "_";
    final File parent = blockFile.getParentFile();
    File[] matches = parent.listFiles(new FilenameFilter() {
      public boolean accept(File dir, String name) {
        return dir.equals(parent)
            && name.startsWith(prefix) && name.endsWith(METADATA_EXTENSION);
      }
    });

    if (matches == null || matches.length == 0) {
      throw new IOException("Meta file not found, blockFile=" + blockFile);
    }
    else if (matches.length > 1) {
      throw new IOException("Found more than one meta file: " 
          + Arrays.asList(matches));
    }
    return matches[0];
  }
  
  /** Parse the generation stamp from the name of a given meta file. */
  private static long parseGenerationStamp(File blockFile, File metaFile
      ) throws IOException {
    String metaname = metaFile.getName();
    String gs = metaname.substring(blockFile.getName().length() + 1,
        metaname.length() - METADATA_EXTENSION.length());
    try {
      return Long.parseLong(gs);
    } catch(NumberFormatException nfe) {
      throw (IOException)new IOException("blockFile=" + blockFile
          + ", metaFile=" + metaFile).initCause(nfe);
    }
  }

  /** Return the block file for the given ID */ 
  public File findBlockFile(long blockId) {
    return getFile(blockId);
  }

  @Override // FSDatasetInterface
  public synchronized Block getStoredBlock(long blkid) throws IOException {
    File blockfile = findBlockFile(blkid);
    if (blockfile == null) {
      return null;
    }
    File metafile = findMetaFile(blockfile);
    return new Block(blkid, blockfile.length(),
        parseGenerationStamp(blockfile, metafile));
  }

  /**
   * Returns a clone of a replica stored in data-node memory.
   * Should be primarily used for testing.
   * @param blockId the id of the replica to look up
   * @return a clone of the replica, or null if no such replica exists
   */
  synchronized ReplicaInfo fetchReplicaInfo(long blockId) {
    ReplicaInfo r = volumeMap.get(blockId);
    if(r == null)
      return null;
    switch(r.getState()) {
    case FINALIZED:
      return new FinalizedReplica((FinalizedReplica)r);
    case RBW:
      return new ReplicaBeingWritten((ReplicaBeingWritten)r);
    case RWR:
      return new ReplicaWaitingToBeRecovered((ReplicaWaitingToBeRecovered)r);
    case RUR:
      return new ReplicaUnderRecovery((ReplicaUnderRecovery)r);
    case TEMPORARY:
      return new ReplicaInPipeline((ReplicaInPipeline)r);
    }
    return null;
  }

  @Override // FSDatasetInterface
  public boolean metaFileExists(Block b) throws IOException {
    return getMetaFile(b).exists();
  }
  
  @Override // FSDatasetInterface
  public long getMetaDataLength(Block b) throws IOException {
    File checksumFile = getMetaFile( b );
    return checksumFile.length();
  }

  @Override // FSDatasetInterface
  public MetaDataInputStream getMetaDataInputStream(Block b)
      throws IOException {
    File checksumFile = getMetaFile( b );
    return new MetaDataInputStream(new FileInputStream(checksumFile),
                                                    checksumFile.length());
  }

  static File createTmpFile(Block b, File f) throws IOException {
    if (f.exists()) {
      throw new IOException("Unexpected problem in creating temporary file for "+
                            b + ".  File " + f + " should not be present, but is.");
    }
    // Create the zero-length temp file
    //
    boolean fileCreated = false;
    try {
      fileCreated = f.createNewFile();
    } catch (IOException ioe) {
      throw (IOException)new IOException(DISK_ERROR +f).initCause(ioe);
    }
    if (!fileCreated) {
      throw new IOException("Unexpected problem in creating temporary file for "+
                            b + ".  File " + f + " should be creatable, but is already present.");
    }
    return f;
  }
    
  FSVolumeSet volumes;
  private int maxBlocksPerDir = 0;
  ReplicasMap volumeMap = new ReplicasMap();
  static  Random random = new Random();
  FSDatasetAsyncDiskService asyncDiskService;
  private int validVolsRequired;

  // Used for synchronizing access to usage stats
  private Object statsLock = new Object();

  boolean supportAppends = true;

  /**
   * An FSDataset has a directory where it loads its data files.
   */
  public FSDataset(DataStorage storage, Configuration conf) throws IOException {
    this.maxBlocksPerDir = conf.getInt("dfs.datanode.numblocks", 64);
    this.supportAppends = conf.getBoolean(DFSConfigKeys.DFS_SUPPORT_APPEND_KEY,
                                      DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT);
    // The number of volumes required for operation is the total number 
    // of volumes minus the number of failed volumes we can tolerate.
    final int volFailuresTolerated =
      conf.getInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY,
                  DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_DEFAULT);
    this.validVolsRequired = storage.getNumStorageDirs() - volFailuresTolerated; 
    if (validVolsRequired < 1 ||
        validVolsRequired > storage.getNumStorageDirs()) {
      DataNode.LOG.error("Invalid value " + volFailuresTolerated + " for " +
          DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY);
    }
    FSVolume[] volArray = new FSVolume[storage.getNumStorageDirs()];
    for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
      volArray[idx] = new FSVolume(storage.getStorageDir(idx).getCurrentDir(), conf);
    }
    volumes = new FSVolumeSet(volArray);
    volumes.getVolumeMap(volumeMap);
    File[] roots = new File[storage.getNumStorageDirs()];
    for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
      roots[idx] = storage.getStorageDir(idx).getCurrentDir();
    }
    asyncDiskService = new FSDatasetAsyncDiskService(roots);
    registerMBean(storage.getStorageID());
  }

  /**
   * Return the total space used by dfs datanode
   */
  public long getDfsUsed() throws IOException {
    synchronized(statsLock) {
      return volumes.getDfsUsed();
    }
  }

  /**
   * Return true if there are still enough valid volumes on the DataNode. 
   */
  @Override // FSDatasetInterface
  public boolean hasEnoughResource() {
    return volumes.numberOfVolumes() >= validVolsRequired; 
  }

  /**
   * Return total capacity, used and unused
   */
  public long getCapacity() throws IOException {
    synchronized(statsLock) {
      return volumes.getCapacity();
    }
  }

  /**
   * Return how many bytes can still be stored in the FSDataset
   */
  public long getRemaining() throws IOException {
    synchronized(statsLock) {
      return volumes.getRemaining();
    }
  }

  /**
   * Return the number of failed volumes in the FSDataset.
   */
  public int getNumFailedVolumes() {
    return volumes.numberOfFailedVolumes();
  }

  /**
   * Find the block's on-disk length
   */
  @Override // FSDatasetInterface
  public long getLength(Block b) throws IOException {
    return getBlockFile(b).length();
  }

  /**
   * Get File name for a given block.
   */
  public synchronized File getBlockFile(Block b) throws IOException {
    File f = validateBlockFile(b);
    if(f == null) {
      if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
        InterDatanodeProtocol.LOG.debug("b=" + b + ", volumeMap=" + volumeMap);
      }
      throw new IOException("Block " + b + " is not valid.");
    }
    return f;
  }
  
  @Override // FSDatasetInterface
  public synchronized InputStream getBlockInputStream(Block b) throws IOException {
    return new FileInputStream(getBlockFile(b));
  }

  @Override // FSDatasetInterface
  public synchronized InputStream getBlockInputStream(Block b, long seekOffset) throws IOException {

    File blockFile = getBlockFile(b);
    RandomAccessFile blockInFile = new RandomAccessFile(blockFile, "r");
    if (seekOffset > 0) {
      blockInFile.seek(seekOffset);
    }
    return new FileInputStream(blockInFile.getFD());
  }

  /**
   * Get the meta info of a block stored in volumeMap
   * @param b block
   * @return the replica information
   * @throws IOException if no entry for the block is found in the map
   */
  private ReplicaInfo getReplicaInfo(Block b) throws IOException {
    ReplicaInfo info = volumeMap.get(b);
    if (info == null) {
      throw new IOException("Block " + b + " does not exist in volumeMap.");
    }
    return info;
  }
  
  /**
   * Returns handles to the block file and its metadata file
   */
  @Override // FSDatasetInterface
  public synchronized BlockInputStreams getTmpInputStreams(Block b, 
                          long blkOffset, long ckoff) throws IOException {

    ReplicaInfo info = getReplicaInfo(b);
    File blockFile = info.getBlockFile();
    RandomAccessFile blockInFile = new RandomAccessFile(blockFile, "r");
    if (blkOffset > 0) {
      blockInFile.seek(blkOffset);
    }
    File metaFile = info.getMetaFile();
    RandomAccessFile metaInFile = new RandomAccessFile(metaFile, "r");
    if (ckoff > 0) {
      metaInFile.seek(ckoff);
    }
    return new BlockInputStreams(new FileInputStream(blockInFile.getFD()),
                                new FileInputStream(metaInFile.getFD()));
  }
    
  /**
   * Make a copy of the block if this block is linked to an existing
   * snapshot. This ensures that modifying this block does not modify
   * data in any existing snapshots.
   * @param block Block
   * @param numLinks Unlink if the number of links exceeds this value
   * @throws IOException
   * @return true if the specified block was unlinked or the block
   *         is not in any snapshot.
   */
  public boolean unlinkBlock(Block block, int numLinks) throws IOException {
    ReplicaInfo info = null;

    synchronized (this) {
      info = getReplicaInfo(block);
    }
   return info.unlinkBlock(numLinks);
  }

  static private void truncateBlock(File blockFile, File metaFile,
      long oldlen, long newlen) throws IOException {
    DataNode.LOG.info("truncateBlock: blockFile=" + blockFile
        + ", metaFile=" + metaFile
        + ", oldlen=" + oldlen
        + ", newlen=" + newlen);

    if (newlen == oldlen) {
      return;
    }
    if (newlen > oldlen) {
      throw new IOException("Cannot truncate block from oldlen (=" + oldlen
          + ") to newlen (=" + newlen + ")");
    }

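    // Recompute the checksum of the last (possibly partial) chunk of the
    // truncated block and rewrite it at the end of the shortened meta file.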
    DataChecksum dcs = BlockMetadataHeader.readHeader(metaFile).getChecksum(); 
    int checksumsize = dcs.getChecksumSize();
    int bpc = dcs.getBytesPerChecksum();
    long n = (newlen - 1)/bpc + 1;
    long newmetalen = BlockMetadataHeader.getHeaderSize() + n*checksumsize;
    long lastchunkoffset = (n - 1)*bpc;
    int lastchunksize = (int)(newlen - lastchunkoffset); 
    byte[] b = new byte[Math.max(lastchunksize, checksumsize)]; 

    RandomAccessFile blockRAF = new RandomAccessFile(blockFile, "rw");
    try {
      //truncate blockFile 
      blockRAF.setLength(newlen);
 
      //read last chunk
      blockRAF.seek(lastchunkoffset);
      blockRAF.readFully(b, 0, lastchunksize);
    } finally {
      blockRAF.close();
    }

    //compute checksum
    dcs.update(b, 0, lastchunksize);
    dcs.writeValue(b, 0, false);

    //update metaFile 
    RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw");
    try {
      metaRAF.setLength(newmetalen);
      metaRAF.seek(newmetalen - checksumsize);
      metaRAF.write(b, 0, checksumsize);
    } finally {
      metaRAF.close();
    }
  }

  private final static String DISK_ERROR = "Possible disk error on file creation: ";
  /** Get the cause of an I/O exception if caused by a possible disk error
   * @param ioe an I/O exception
   * @return cause if the I/O exception is caused by a possible disk error;
   *         null otherwise.
   */ 
  static IOException getCauseIfDiskError(IOException ioe) {
    if (ioe.getMessage()!=null && ioe.getMessage().startsWith(DISK_ERROR)) {
      return (IOException)ioe.getCause();
    } else {
      return null;
    }
  }

  @Override  // FSDatasetInterface
  public synchronized ReplicaInPipelineInterface append(Block b,
      long newGS, long expectedBlockLen) throws IOException {
    // This can happen if the block was successfully finalized because all
    // packets were successfully processed at the DataNode, but the acks for
    // some of the packets were not received by the client; the client then
    // re-opens the connection and retries sending those packets.
    // The other reason is that an "append" is occurring to this block.
    
    // check the validity of the parameter
    if (newGS < b.getGenerationStamp()) {
      throw new IOException("The new generation stamp " + newGS + 
          " should be greater than the replica " + b + "'s generation stamp");
    }
    ReplicaInfo replicaInfo = volumeMap.get(b);
    if (replicaInfo == null) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.NON_EXISTENT_REPLICA + b);
    }  
    DataNode.LOG.info("Appending to replica " + replicaInfo);
    if (replicaInfo.getState() != ReplicaState.FINALIZED) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.UNFINALIZED_REPLICA + b);
    }
    if (replicaInfo.getNumBytes() != expectedBlockLen) {
      throw new IOException("Corrupted replica " + replicaInfo + 
          " with a length of " + replicaInfo.getNumBytes() + 
          " expected length is " + expectedBlockLen);
    }

    return append((FinalizedReplica)replicaInfo, newGS, b.getNumBytes());
  }
  
  /** Append to a finalized replica:
   * change a finalized replica to be an RBW replica and 
   * bump its generation stamp to be the newGS.
   * 
   * @param replicaInfo a finalized replica
   * @param newGS new generation stamp
   * @param estimateBlockLen estimated block length after the append
   * @return a RBW replica
   * @throws IOException if moving the replica from finalized directory 
   *         to rbw directory fails
   */
  private synchronized ReplicaBeingWritten append(FinalizedReplica replicaInfo, 
      long newGS, long estimateBlockLen) throws IOException {
    // unlink the finalized replica
    replicaInfo.unlinkBlock(1);
    
    // construct a RBW replica with the new GS
    File blkfile = replicaInfo.getBlockFile();
    FSVolume v = replicaInfo.getVolume();
    if (v.getAvailable() < estimateBlockLen - replicaInfo.getNumBytes()) {
      throw new DiskOutOfSpaceException("Insufficient space for appending to "
          + replicaInfo);
    }
    File newBlkFile = new File(v.rbwDir, replicaInfo.getBlockName());
    File oldmeta = replicaInfo.getMetaFile();
    ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten(
        replicaInfo.getBlockId(), replicaInfo.getNumBytes(), newGS,
        v, newBlkFile.getParentFile(), Thread.currentThread());
    File newmeta = newReplicaInfo.getMetaFile();

    // rename meta file to rbw directory
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("Renaming " + oldmeta + " to " + newmeta);
    }
    if (!oldmeta.renameTo(newmeta)) {
      throw new IOException("Block " + replicaInfo + " reopen failed. " +
                            " Unable to move meta file  " + oldmeta +
                            " to rbw dir " + newmeta);
    }

    // rename block file to rbw directory
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("Renaming " + blkfile + " to " + newBlkFile);
      DataNode.LOG.debug("Old block file length is " + blkfile.length());
    }
    if (!blkfile.renameTo(newBlkFile)) {
      if (!newmeta.renameTo(oldmeta)) {  // restore the meta file
        DataNode.LOG.warn("Cannot move meta file " + newmeta + 
            " back to the finalized directory " + oldmeta);
      }
      throw new IOException("Block " + replicaInfo + " reopen failed. " +
                              " Unable to move block file " + blkfile +
                              " to rbw dir " + newBlkFile);
    }
    
    // Replace finalized replica by a RBW replica in replicas map
    volumeMap.add(newReplicaInfo);
    
    return newReplicaInfo;
  }

  private ReplicaInfo recoverCheck(Block b, long newGS, 
      long expectedBlockLen) throws IOException {
    ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId());
    if (replicaInfo == null) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.NON_EXISTENT_REPLICA + b);
    }
    
    // check state
    if (replicaInfo.getState() != ReplicaState.FINALIZED &&
        replicaInfo.getState() != ReplicaState.RBW) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.UNFINALIZED_AND_NONRBW_REPLICA + replicaInfo);
    }

    // check generation stamp
    long replicaGenerationStamp = replicaInfo.getGenerationStamp();
    if (replicaGenerationStamp < b.getGenerationStamp() ||
        replicaGenerationStamp > newGS) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.UNEXPECTED_GS_REPLICA + replicaGenerationStamp
          + ". Expected GS range is [" + b.getGenerationStamp() + ", " + 
          newGS + "].");
    }
    
    // stop the previous writer before checking a replica's length
    long replicaLen = replicaInfo.getNumBytes();
    if (replicaInfo.getState() == ReplicaState.RBW) {
      ReplicaBeingWritten rbw = (ReplicaBeingWritten)replicaInfo;
      // kill the previous writer
      rbw.stopWriter();
      rbw.setWriter(Thread.currentThread());
      // check length: bytesRcvd, bytesOnDisk, and bytesAcked should be the same
      if (replicaLen != rbw.getBytesOnDisk() 
          || replicaLen != rbw.getBytesAcked()) {
        throw new ReplicaAlreadyExistsException("RBW replica " + replicaInfo + 
            ": bytesRcvd(" + rbw.getNumBytes() + "), bytesOnDisk(" + 
            rbw.getBytesOnDisk() + "), and bytesAcked(" + rbw.getBytesAcked() +
            ") are not the same.");
      }
    }
    
    // check block length
    if (replicaLen != expectedBlockLen) {
      throw new IOException("Corrupted replica " + replicaInfo + 
          " with a length of " + replicaLen + 
          " expected length is " + expectedBlockLen);
    }
    
    return replicaInfo;
  }
  @Override  // FSDatasetInterface
  public synchronized ReplicaInPipelineInterface recoverAppend(Block b,
      long newGS, long expectedBlockLen) throws IOException {
    DataNode.LOG.info("Recover failed append to " + b);

    ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen);

    // change the replica's state/gs etc.
    if (replicaInfo.getState() == ReplicaState.FINALIZED ) {
      return append((FinalizedReplica)replicaInfo, newGS, b.getNumBytes());
    } else { //RBW
      bumpReplicaGS(replicaInfo, newGS);
      return (ReplicaBeingWritten)replicaInfo;
    }
  }

  @Override // FSDatasetInterface
  public void recoverClose(Block b, long newGS,
      long expectedBlockLen) throws IOException {
    DataNode.LOG.info("Recover failed close " + b);
    // check replica's state
    ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen);
    // bump the replica's GS
    bumpReplicaGS(replicaInfo, newGS);
    // finalize the replica if RBW
    if (replicaInfo.getState() == ReplicaState.RBW) {
      finalizeBlock(replicaInfo);
    }
  }
  
  /**
   * Bump a replica's generation stamp to a new one.
   * Its on-disk meta file is renamed to match the new generation stamp.
   * 
   * @param replicaInfo a replica
   * @param newGS new generation stamp
   * @throws IOException if rename fails
   */
  private void bumpReplicaGS(ReplicaInfo replicaInfo, 
      long newGS) throws IOException { 
    long oldGS = replicaInfo.getGenerationStamp();
    File oldmeta = replicaInfo.getMetaFile();
    replicaInfo.setGenerationStamp(newGS);
    File newmeta = replicaInfo.getMetaFile();

    // rename meta file to new GS
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("Renaming " + oldmeta + " to " + newmeta);
    }
    if (!oldmeta.renameTo(newmeta)) {
      replicaInfo.setGenerationStamp(oldGS); // restore old GS
      throw new IOException("Block " + (Block)replicaInfo + " reopen failed. " +
                            " Unable to move meta file  " + oldmeta +
                            " to " + newmeta);
    }
  }

  @Override // FSDatasetInterface
  public synchronized ReplicaInPipelineInterface createRbw(Block b)
      throws IOException {
    ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId());
    if (replicaInfo != null) {
      throw new ReplicaAlreadyExistsException("Block " + b +
      " already exists in state " + replicaInfo.getState() +
      " and thus cannot be created.");
    }
    // create a new block
    FSVolume v = volumes.getNextVolume(b.getNumBytes());
    // create a rbw file to hold block in the designated volume
    File f = v.createRbwFile(b);
    ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten(b.getBlockId(), 
        b.getGenerationStamp(), v, f.getParentFile());
    volumeMap.add(newReplicaInfo);
    return newReplicaInfo;
  }
  
  @Override // FSDatasetInterface
  public synchronized ReplicaInPipelineInterface recoverRbw(Block b,
      long newGS, long minBytesRcvd, long maxBytesRcvd)
      throws IOException {
    DataNode.LOG.info("Recover the RBW replica " + b);

    ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId());
    if (replicaInfo == null) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.NON_EXISTENT_REPLICA + b);
    }
    
    // check the replica's state
    if (replicaInfo.getState() != ReplicaState.RBW) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.NON_RBW_REPLICA + replicaInfo);
    }
    ReplicaBeingWritten rbw = (ReplicaBeingWritten)replicaInfo;
    
    DataNode.LOG.info("Recovering replica " + rbw);

    // Stop the previous writer
    rbw.stopWriter();
    rbw.setWriter(Thread.currentThread());

    // check generation stamp
    long replicaGenerationStamp = rbw.getGenerationStamp();
    if (replicaGenerationStamp < b.getGenerationStamp() ||
        replicaGenerationStamp > newGS) {
      throw new ReplicaNotFoundException(
          ReplicaNotFoundException.UNEXPECTED_GS_REPLICA + b +
          ". Expected GS range is [" + b.getGenerationStamp() + ", " + 
          newGS + "].");
    }
    
    // check replica length
    if (rbw.getBytesAcked() < minBytesRcvd || rbw.getNumBytes() > maxBytesRcvd){
      throw new ReplicaNotFoundException("Unmatched length replica " + 
          replicaInfo + ": BytesAcked = " + rbw.getBytesAcked() + 
          " BytesRcvd = " + rbw.getNumBytes() + " are not in the range of [" + 
          minBytesRcvd + ", " + maxBytesRcvd + "].");
    }

    // bump the replica's generation stamp to newGS
    bumpReplicaGS(rbw, newGS);
    
    return rbw;
  }
  
  @Override // FSDatasetInterface
  public synchronized ReplicaInPipelineInterface createTemporary(Block b)
      throws IOException {
    ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId());
    if (replicaInfo != null) {
      throw new ReplicaAlreadyExistsException("Block " + b +
          " already exists in state " + replicaInfo.getState() +
          " and thus cannot be created.");
    }
    
    FSVolume v = volumes.getNextVolume(b.getNumBytes());
    // create a temporary file to hold block in the designated volume
    File f = v.createTmpFile(b);
    ReplicaInPipeline newReplicaInfo = new ReplicaInPipeline(b.getBlockId(), 
        b.getGenerationStamp(), v, f.getParentFile());
    volumeMap.add(newReplicaInfo);
    
    return newReplicaInfo;
  }

  /**
   * Sets the offset in the meta file so that the
   * last checksum will be overwritten.
   */
  @Override // FSDatasetInterface
  public void adjustCrcChannelPosition(Block b, BlockWriteStreams streams, 
      int checksumSize) throws IOException {
    FileOutputStream file = (FileOutputStream) streams.checksumOut;
    FileChannel channel = file.getChannel();
    long oldPos = channel.position();
    long newPos = oldPos - checksumSize;
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("Changing meta file offset of block " + b + " from " +
          oldPos + " to " + newPos);
    }
    channel.position(newPos);
  }

  synchronized File createTmpFile( FSVolume vol, Block blk ) throws IOException {
    if ( vol == null ) {
      vol = getReplicaInfo( blk ).getVolume();
      if ( vol == null ) {
        throw new IOException("Could not find volume for block " + blk);
      }
    }
    return vol.createTmpFile(blk);
  }

  //
  // REMIND - mjc - eventually we should have a timeout system
  // in place to clean up block files left by abandoned clients.
  // We should have some timer in place, so that if a blockfile
  // is created but non-valid, and has been idle for >48 hours,
  // we can GC it safely.
  //

  /**
   * Complete the block write!
   */
  @Override // FSDatasetInterface
  public synchronized void finalizeBlock(Block b) throws IOException {
    ReplicaInfo replicaInfo = getReplicaInfo(b);
    if (replicaInfo.getState() == ReplicaState.FINALIZED) {
      // this is legal, when recovery happens on a file that has
      // been opened for append but never modified
      return;
    }
    finalizeReplica(replicaInfo);
  }
  
  private synchronized FinalizedReplica finalizeReplica(ReplicaInfo replicaInfo)
  throws IOException {
    FinalizedReplica newReplicaInfo = null;
    if (replicaInfo.getState() == ReplicaState.RUR &&
       ((ReplicaUnderRecovery)replicaInfo).getOrignalReplicaState() == 
         ReplicaState.FINALIZED) {
      newReplicaInfo = (FinalizedReplica)
             ((ReplicaUnderRecovery)replicaInfo).getOriginalReplica();
    } else {
      FSVolume v = replicaInfo.getVolume();
      File f = replicaInfo.getBlockFile();
      if (v == null) {
        throw new IOException("No volume for temporary file " + f + 
            " for block " + replicaInfo);
      }

      File dest = v.addBlock(replicaInfo, f);
      newReplicaInfo = new FinalizedReplica(replicaInfo, v, dest.getParentFile());
    }
    volumeMap.add(newReplicaInfo);
    return newReplicaInfo;
  }

  /**
   * Remove the temporary block file (if any)
   */
  @Override // FSDatasetInterface
  public synchronized void unfinalizeBlock(Block b) throws IOException {
    ReplicaInfo replicaInfo = volumeMap.get(b);
    if (replicaInfo != null && replicaInfo.getState() == ReplicaState.TEMPORARY) {
      // remove from volumeMap
      volumeMap.remove(b);
      
      // delete the on-disk temp file
      if (delBlockFromDisk(replicaInfo.getBlockFile(), 
          replicaInfo.getMetaFile(), b)) {
        DataNode.LOG.warn("Block " + b + " unfinalized and removed. " );
      }
    }
  }

  /**
   * Remove a block from disk
   * @param blockFile block file
   * @param metaFile block meta file
   * @param b a block
   * @return true if on-disk files are deleted; false otherwise
   */
  private boolean delBlockFromDisk(File blockFile, File metaFile, Block b) {
    if (blockFile == null) {
      DataNode.LOG.warn("No file exists for block: " + b);
      return true;
    }
    
    if (!blockFile.delete()) {
      DataNode.LOG.warn("Not able to delete the block file: " + blockFile);
      return false;
    } else { // remove the meta file
      if (metaFile != null && !metaFile.delete()) {
        DataNode.LOG.warn(
            "Not able to delete the meta block file: " + metaFile);
        return false;
      }
    }
    return true;
  }

  /**
   * Generates a block report from the in-memory block map.
   */
  @Override // FSDatasetInterface
  public BlockListAsLongs getBlockReport() {
    ArrayList<ReplicaInfo> finalized =
      new ArrayList<ReplicaInfo>(volumeMap.size());
    ArrayList<ReplicaInfo> uc = new ArrayList<ReplicaInfo>();
    synchronized(this) {
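      // Classify replicas: finalized replicas and under-construction replicas
      // (RBW/RWR) are reported separately; an RUR replica is reported as its
      // original replica, and TEMPORARY replicas are not reported at all.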
      for (ReplicaInfo b : volumeMap.replicas()) {
        switch(b.getState()) {
        case FINALIZED:
          finalized.add(b);
          break;
        case RBW:
        case RWR:
          uc.add(b);
          break;
        case RUR:
          ReplicaUnderRecovery rur = (ReplicaUnderRecovery)b;
          uc.add(rur.getOriginalReplica());
          break;
        case TEMPORARY:
          break;
        default:
          assert false : "Illegal ReplicaInfo state.";
        }
      }
      return new BlockListAsLongs(finalized, uc);
    }
  }

  /**
   * Get the block list from the in-memory blockmap. Note that if deepcopy
   * is false, references to the blocks in the volumeMap are returned. These
   * blocks should not be changed. Suitable synchronization using {@link FSDataset}
   * is needed to handle concurrent modification to the blocks.
   */
  synchronized Block[] getBlockList(boolean deepcopy) {
    Block[] list = volumeMap.replicas().toArray(new Block[volumeMap.size()]);
    if (deepcopy) {
      for (int i = 0; i < list.length; i++) {
        list[i] = new Block(list[i]);
      }
    }
    return list;
  }

  /**
   * Get the list of finalized blocks from in-memory blockmap.
   */
  synchronized List<Block> getFinalizedBlocks() {
    ArrayList<Block> finalized = new ArrayList<Block>(volumeMap.size());
    for (ReplicaInfo b : volumeMap.replicas()) {
      if(b.getState() == ReplicaState.FINALIZED) {
        finalized.add(new Block(b));
      }
    }
    return finalized;
  }

  /**
   * Check whether the given block is a valid one.
   * Valid here means finalized.
   */
  @Override // FSDatasetInterface
  public boolean isValidBlock(Block b) {
    ReplicaInfo replicaInfo = volumeMap.get(b);
    if (replicaInfo == null || 
        replicaInfo.getState() != ReplicaState.FINALIZED) {
      return false;
    }
    return replicaInfo.getBlockFile().exists();
  }

  /**
   * Find the file corresponding to the block and return it if it exists.
   */
  File validateBlockFile(Block b) throws IOException {
    //Should we check for metadata file too?
    File f = getFile(b);
    
    if(f != null ) {
      if(f.exists())
        return f;
   
      // if file is not null, but doesn't exist - possibly disk failed
      DataNode datanode = DataNode.getDataNode();
      datanode.checkDiskError();
    }
    
    if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
      InterDatanodeProtocol.LOG.debug("b=" + b + ", f=" + f);
    }
    return null;
  }

  /** Check the files of a replica. */
  static void checkReplicaFiles(final ReplicaInfo r) throws IOException {
    //check replica's file
    final File f = r.getBlockFile();
    if (!f.exists()) {
      throw new FileNotFoundException("File " + f + " not found, r=" + r);
    }
    if (r.getBytesOnDisk() != f.length()) {
      throw new IOException("File length mismatched.  The length of "
          + f + " is " + f.length() + " but r=" + r);
    }

    //check replica's meta file
    final File metafile = getMetaFile(f, r);
    if (!metafile.exists()) {
      throw new IOException("Metafile " + metafile + " does not exist, r=" + r);
    }
    if (metafile.length() == 0) {
      throw new IOException("Metafile " + metafile + " is empty, r=" + r);
    }
  }

  /**
   * We're informed that a block is no longer valid.  We
   * could lazily garbage-collect the block, but why bother?
   * Just get rid of it.
   */
  @Override // FSDatasetInterface
  public void invalidate(Block invalidBlks[]) throws IOException {
    boolean error = false;
    for (int i = 0; i < invalidBlks.length; i++) {
      File f = null;
      FSVolume v;
      synchronized (this) {
        f = getFile(invalidBlks[i]);
        ReplicaInfo dinfo = volumeMap.get(invalidBlks[i]);
        if (dinfo == null || 
            dinfo.getGenerationStamp() != invalidBlks[i].getGenerationStamp()) {
          DataNode.LOG.warn("Unexpected error trying to delete block "
                           + invalidBlks[i] + 
                           ". BlockInfo not found in volumeMap.");
          error = true;
          continue;
        }
        v = dinfo.getVolume();
        if (f == null) {
          DataNode.LOG.warn("Unexpected error trying to delete block "
                            + invalidBlks[i] + 
                            ". Block not found in blockMap." +
                            ((v == null) ? " " : " Block found in volumeMap."));
          error = true;
          continue;
        }
        if (v == null) {
          DataNode.LOG.warn("Unexpected error trying to delete block "
                            + invalidBlks[i] + 
                            ". No volume for this block." +
                            " Block found in blockMap. " + f + ".");
          error = true;
          continue;
        }
        File parent = f.getParentFile();
        if (parent == null) {
          DataNode.LOG.warn("Unexpected error trying to delete block "
                            + invalidBlks[i] + 
                            ". Parent not found for file " + f + ".");
          error = true;
          continue;
        }
        ReplicaState replicaState = dinfo.getState();
        if (replicaState == ReplicaState.FINALIZED || 
            (replicaState == ReplicaState.RUR && 
                ((ReplicaUnderRecovery)dinfo).getOrignalReplicaState() == 
                  ReplicaState.FINALIZED)) {
          v.clearPath(parent);
        }
        volumeMap.remove(invalidBlks[i]);
      }
      File metaFile = getMetaFile( f, invalidBlks[i] );
      long dfsBytes = f.length() + metaFile.length();
      
      // Delete the block asynchronously to make sure we can do it fast enough
      asyncDiskService.deleteAsync(v, f, metaFile, dfsBytes, invalidBlks[i].toString());
    }
    if (error) {
      throw new IOException("Error in deleting blocks.");
    }
  }
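
  // Illustrative sketch (hypothetical caller): each valid entry is removed from
  // volumeMap under the lock and its files are deleted asynchronously, so disk
  // space is reclaimed shortly after the call returns rather than immediately.
  //
  //   dataset.invalidate(new Block[] { staleBlock });  // throws IOException if any entry was unknown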

  /**
   * Turn the block identifier into a filename; the generation stamp is ignored.
   */
  public synchronized File getFile(Block b) {
    return getFile(b.getBlockId());
  }

  /**
   * Turn the block identifier into a filename
   * @param blockId a block's id
   * @return on disk data file path; null if the replica does not exist
   */
  private File getFile(long blockId) {
    ReplicaInfo info = volumeMap.get(blockId);
    if (info != null) {
      return info.getBlockFile();
    }
    return null;    
  }

  /**
   * Check whether the data directories are healthy.
   * If any volumes have failed, remove all blocks that belong to those
   * volumes from the in-memory replica map.
   * @throws DiskErrorException if one or more volumes have failed
   */
  @Override // FSDatasetInterface
  public void checkDataDir() throws DiskErrorException {
    long totalBlocks=0, removedBlocks=0;
    List<FSVolume> failedVols = volumes.checkDirs();
    
    // If there are no failed volumes, return
    if (failedVols == null) { 
      return;
    }
    
    // Otherwise remove blocks for the failed volumes
    long mlsec = System.currentTimeMillis();
    synchronized (this) {
      Iterator<ReplicaInfo> ib = volumeMap.replicas().iterator();
      while (ib.hasNext()) {
        ReplicaInfo b = ib.next();
        totalBlocks++;
        // check if the volume this block belongs to is still valid
        FSVolume vol = b.getVolume();
        for (FSVolume fv: failedVols) {
          if (vol == fv) {
            DataNode.LOG.warn("Removing replica info for block " + 
              b.getBlockId() + " on failed volume " + 
              vol.dataDir.dir.getAbsolutePath());
            ib.remove();
            removedBlocks++;
            break;
          }
        }
      }
    } // end of sync
    mlsec = System.currentTimeMillis() - mlsec;
    DataNode.LOG.warn("Removed " + removedBlocks + " out of " + totalBlocks +
        "(took " + mlsec + " millisecs)");

    // report the error
    StringBuilder sb = new StringBuilder();
    for (FSVolume fv : failedVols) {
      sb.append(fv.dataDir.dir.getAbsolutePath() + ";");
    }

    throw new DiskErrorException("DataNode failed volumes: " + sb);
  }
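
  // Illustrative sketch (hypothetical caller): this method never returns
  // normally once a volume has failed, so callers react to the exception
  // rather than to a return value.
  //
  //   try {
  //     dataset.checkDataDir();
  //   } catch (DiskErrorException e) {
  //     // one or more volumes failed; their replicas were already dropped from volumeMap
  //   }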
    

  @Override // FSDatasetInterface
  public String toString() {
    return "FSDataset{dirpath='"+volumes+"'}";
  }

  private ObjectName mbeanName;
  private Random rand = new Random();
  
  /**
   * Register the FSDataset MBean using the name
   *        "hadoop:service=DataNode,name=FSDatasetState-"
   */
  void registerMBean(final String storageId) {
    // We wrap to bypass the standard MBean naming convention.
    // This wrapping can be removed in Java 6, as it is more flexible in
    // package naming for MBeans and their implementations.
    StandardMBean bean;
    String storageName;
    if (storageId == null || storageId.equals("")) {// Temp fix for the uninitialized storage
      storageName = "UndefinedStorageId" + rand.nextInt();
    } else {
      storageName = storageId;
    }
    try {
      bean = new StandardMBean(this,FSDatasetMBean.class);
      mbeanName = MBeanUtil.registerMBean("DataNode", "FSDatasetState-" + storageName, bean);
    } catch (NotCompliantMBeanException e) {
      e.printStackTrace();
    }
 
    DataNode.LOG.info("Registered FSDatasetStatusMBean");
  }
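
  // Illustrative sketch (assumed JMX usage, not part of the original class):
  // once registered, the dataset statistics can be read back via the platform
  // MBean server under the name documented above. The attribute name below is
  // an assumption based on the FSDatasetMBean interface.
  //
  //   MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
  //   ObjectName name = new ObjectName(
  //       "hadoop:service=DataNode,name=FSDatasetState-" + storageId);
  //   Long remaining = (Long) mbs.getAttribute(name, "Remaining");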

  @Override // FSDatasetInterface
  public void shutdown() {
    if (mbeanName != null)
      MBeanUtil.unregisterMBean(mbeanName);
    
    if (asyncDiskService != null) {
      asyncDiskService.shutdown();
    }
    
    if(volumes != null) {
      for (FSVolume volume : volumes.volumes) {
        if(volume != null) {
          volume.dfsUsage.shutdown();
        }
      }
    }
  }

  public String getStorageInfo() {
    return toString();
  }

  /**
   * Reconcile the difference between blocks on the disk and blocks in
   * volumeMap
   *
   * Check the given block for inconsistencies. Look at the
   * current state of the block and reconcile the differences as follows:
   * 
   * <ul>
   * <li>If the block file is missing, delete the block from volumeMap</li>
   * <li>If the block file exists and the block is missing in volumeMap,
   * add the block to volumeMap</li>
   * <li>If the generation stamp does not match, then update the block with the
   * right generation stamp</li>
   * <li>If the block length in memory does not match the actual block file
   * length, then mark the block as corrupt and update the block length in
   * memory</li>
   * <li>If the file in {@link ReplicaInfo} does not match the file on
   * the disk, update {@link ReplicaInfo} with the correct file</li>
   * </ul>
   *
   * @param blockId Block that differs
   * @param diskFile Block file on the disk
   * @param diskMetaFile Metadata file found on the disk
   * @param vol Volume of the block file
   */
  public void checkAndUpdate(long blockId, File diskFile,
      File diskMetaFile, FSVolume vol) {
    DataNode datanode = DataNode.getDataNode();
    Block corruptBlock = null;
    ReplicaInfo memBlockInfo;
    synchronized (this) {
      memBlockInfo = volumeMap.get(blockId);
      if (memBlockInfo != null && memBlockInfo.getState() != ReplicaState.FINALIZED) {
        // Block is not finalized - ignore the difference
        return;
      }

      final long diskGS = diskMetaFile != null && diskMetaFile.exists() ?
          Block.getGenerationStamp(diskMetaFile.getName()) :
          GenerationStamp.GRANDFATHER_GENERATION_STAMP;

      if (diskFile == null || !diskFile.exists()) {
        if (memBlockInfo == null) {
          // Block file does not exist and block does not exist in memory
          // If metadata file exists then delete it
          if (diskMetaFile != null && diskMetaFile.exists()
              && diskMetaFile.delete()) {
            DataNode.LOG.warn("Deleted a metadata file without a block "
                + diskMetaFile.getAbsolutePath());
          }
          return;
        }
        if (!memBlockInfo.getBlockFile().exists()) {
          // Block is in memory and not on the disk
          // Remove the block from volumeMap
          volumeMap.remove(blockId);
          if (datanode.blockScanner != null) {
            datanode.blockScanner.deleteBlock(new Block(blockId));
          }
          DataNode.LOG.warn("Removed block " + blockId
              + " from memory with missing block file on the disk");
          // Finally remove the metadata file
          if (diskMetaFile != null && diskMetaFile.exists()
              && diskMetaFile.delete()) {
            DataNode.LOG.warn("Deleted a metadata file for the deleted block "
                + diskMetaFile.getAbsolutePath());
          }
        }
        return;
      }
      /*
       * Block file exists on the disk
       */
      if (memBlockInfo == null) {
        // Block is missing in memory - add the block to volumeMap
        ReplicaInfo diskBlockInfo = new FinalizedReplica(blockId,
            diskFile.length(), diskGS, vol, diskFile.getParentFile());
        volumeMap.add(diskBlockInfo);
        if (datanode.blockScanner != null) {
          datanode.blockScanner.addBlock(diskBlockInfo);
        }
        DataNode.LOG.warn("Added missing block to memory " + (Block)diskBlockInfo);
        return;
      }
      /*
       * Block exists in volumeMap and the block file exists on the disk
       */
      // Compare block files
      File memFile = memBlockInfo.getBlockFile();
      if (memFile.exists()) {
        if (memFile.compareTo(diskFile) != 0) {
          DataNode.LOG.warn("Block file " + memFile.getAbsolutePath()
              + " does not match file found by scan "
              + diskFile.getAbsolutePath());
          // TODO: Should the diskFile be deleted?
        }
      } else {
        // Block refers to a block file that does not exist.
        // Update the block with the file found on the disk. Since the block
        // file and metadata file are found as a pair on the disk, update
        // the block based on the metadata file found on the disk
        DataNode.LOG.warn("Block file in volumeMap "
            + memFile.getAbsolutePath()
            + " does not exist. Updating it to the file found during scan "
            + diskFile.getAbsolutePath());
        memBlockInfo.setDir(diskFile.getParentFile());
        memFile = diskFile;

        DataNode.LOG.warn("Updating generation stamp for block " + blockId
            + " from " + memBlockInfo.getGenerationStamp() + " to " + diskGS);
        memBlockInfo.setGenerationStamp(diskGS);
      }

      // Compare generation stamp
      if (memBlockInfo.getGenerationStamp() != diskGS) {
        File memMetaFile = getMetaFile(diskFile, memBlockInfo);
        if (memMetaFile.exists()) {
          if (memMetaFile.compareTo(diskMetaFile) != 0) {
            DataNode.LOG.warn("Metadata file in memory "
                + memMetaFile.getAbsolutePath()
                + " does not match file found by scan "
                + diskMetaFile.getAbsolutePath());
          }
        } else {
          // Metadata file corresponding to block in memory is missing
          // If metadata file found during the scan is on the same directory
          // as the block file, then use the generation stamp from it
          long gs = diskMetaFile != null && diskMetaFile.exists()
              && diskMetaFile.getParent().equals(memFile.getParent()) ?
              diskGS : GenerationStamp.GRANDFATHER_GENERATION_STAMP;

          DataNode.LOG.warn("Updating generation stamp for block " + blockId
              + " from " + memBlockInfo.getGenerationStamp() + " to " + gs);
          memBlockInfo.setGenerationStamp(gs);
        }
      }

      // Compare block size
      if (memBlockInfo.getNumBytes() != memFile.length()) {
        // Update the length based on the block file
        corruptBlock = new Block(memBlockInfo);
        DataNode.LOG.warn("Updating size of block " + blockId + " from "
            + memBlockInfo.getNumBytes() + " to " + memFile.length());
        memBlockInfo.setNumBytes(memFile.length());
      }
    }

    // Send corrupt block report outside the lock
    if (corruptBlock != null) {
      DatanodeInfo[] dnArr = { new DatanodeInfo(datanode.dnRegistration) };
      LocatedBlock[] blocks = { new LocatedBlock(corruptBlock, dnArr) };
      try {
        datanode.namenode.reportBadBlocks(blocks);
        DataNode.LOG.warn("Reporting the block " + corruptBlock
            + " as corrupt due to length mismatch");
      } catch (IOException e) {
        DataNode.LOG.warn("Failed to report bad block " + corruptBlock
            + ". Exception: " + StringUtils.stringifyException(e));
      }
    }
  }

  /**
   * @deprecated use {@link #fetchReplicaInfo(long)} instead.
   */
  @Override // FSDatasetInterface
  @Deprecated
  public ReplicaInfo getReplica(long blockId) {
    assert(Thread.holdsLock(this));
    return volumeMap.get(blockId);
  }

  @Override // FSDatasetInterface
  public synchronized ReplicaRecoveryInfo initReplicaRecovery(
      RecoveringBlock rBlock) throws IOException {
    return initReplicaRecovery(
        volumeMap, rBlock.getBlock(), rBlock.getNewGenerationStamp());
  }

  /** static version of {@link #initReplicaRecovery(Block, long)}. */
  static ReplicaRecoveryInfo initReplicaRecovery(
      ReplicasMap map, Block block, long recoveryId) throws IOException {
    final ReplicaInfo replica = map.get(block.getBlockId());
    DataNode.LOG.info("initReplicaRecovery: block=" + block
        + ", recoveryId=" + recoveryId
        + ", replica=" + replica);

    //check replica
    if (replica == null) {
      return null;
    }

    //stop writer if there is any
    if (replica instanceof ReplicaInPipeline) {
      final ReplicaInPipeline rip = (ReplicaInPipeline)replica;
      rip.stopWriter();

      //check replica bytes on disk
      if (rip.getBytesOnDisk() < rip.getVisibleLength()) {
        throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:"
            + " getBytesOnDisk() < getVisibleLength(), rip=" + rip);
      }

      //check the replica's files
      checkReplicaFiles(rip);
    }

    //check generation stamp
    if (replica.getGenerationStamp() < block.getGenerationStamp()) {
      throw new IOException(
          "replica.getGenerationStamp() < block.getGenerationStamp(), block="
          + block + ", replica=" + replica);
    }

    //check recovery id
    if (replica.getGenerationStamp() >= recoveryId) {
      throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:"
          + " replica.getGenerationStamp() >= recoveryId = " + recoveryId
          + ", block=" + block + ", replica=" + replica);
    }

    //check RUR
    final ReplicaUnderRecovery rur;
    if (replica.getState() == ReplicaState.RUR) {
      rur = (ReplicaUnderRecovery)replica;
      if (rur.getRecoveryID() >= recoveryId) {
        throw new RecoveryInProgressException(
            "rur.getRecoveryID() >= recoveryId = " + recoveryId
            + ", block=" + block + ", rur=" + rur);
      }
      final long oldRecoveryID = rur.getRecoveryID();
      rur.setRecoveryID(recoveryId);
      DataNode.LOG.info("initReplicaRecovery: update recovery id for " + block
          + " from " + oldRecoveryID + " to " + recoveryId);
    } else {
      rur = new ReplicaUnderRecovery(replica, recoveryId);
      map.add(rur);
      DataNode.LOG.info("initReplicaRecovery: changing replica state for "
          + block + " from " + replica.getState() + " to " + rur.getState());
    }
    return rur.createInfo();
  }

  @Override // FSDatasetInterface
  public synchronized ReplicaInfo updateReplicaUnderRecovery(
      final Block oldBlock, final long recoveryId, final long newlength)
      throws IOException {
    //get replica
    final ReplicaInfo replica = volumeMap.get(oldBlock.getBlockId());
    DataNode.LOG.info("updateReplica: block=" + oldBlock
        + ", recoveryId=" + recoveryId
        + ", length=" + newlength
        + ", replica=" + replica);

    //check replica
    if (replica == null) {
      throw new ReplicaNotFoundException(oldBlock);
    }

    //check replica state
    if (replica.getState() != ReplicaState.RUR) {
      throw new IOException("replica.getState() != " + ReplicaState.RUR
          + ", replica=" + replica);
    }

    //check replica's bytes on disk
    if (replica.getBytesOnDisk() != oldBlock.getNumBytes()) {
      throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:"
          + " replica.getBytesOnDisk() != block.getNumBytes(), block="
          + oldBlock + ", replica=" + replica);
    }

    //check replica files before update
    checkReplicaFiles(replica);

    //update replica
    final FinalizedReplica finalized = updateReplicaUnderRecovery(
        (ReplicaUnderRecovery)replica, recoveryId, newlength);

    //check replica files after update
    checkReplicaFiles(finalized);
    return finalized;
  }

  private FinalizedReplica updateReplicaUnderRecovery(
      ReplicaUnderRecovery rur, long recoveryId, long newlength)
      throws IOException {
    //check recovery id
    if (rur.getRecoveryID() != recoveryId) {
      throw new IOException("rur.getRecoveryID() != recoveryId = " + recoveryId
          + ", rur=" + rur);
    }

    // bump rur's GS to be recovery id
    bumpReplicaGS(rur, recoveryId);

    //update length
    final File replicafile = rur.getBlockFile();
    if (rur.getNumBytes() < newlength) {
      throw new IOException("rur.getNumBytes() < newlength = " + newlength
          + ", rur=" + rur);
    }
    if (rur.getNumBytes() > newlength) {
      rur.unlinkBlock(1);
      truncateBlock(replicafile, rur.getMetaFile(), rur.getNumBytes(), newlength);
      // update RUR with the new length
      rur.setNumBytes(newlength);
    }

    // finalize the block
    return finalizeReplica(rur);
  }

  @Override // FSDatasetInterface
  public synchronized long getReplicaVisibleLength(final Block block)
      throws IOException {
    final Replica replica = volumeMap.get(block.getBlockId());
    if (replica == null) {
      throw new ReplicaNotFoundException(block);
    }
    if (replica.getGenerationStamp() < block.getGenerationStamp()) {
      throw new IOException(
          "replica.getGenerationStamp() < block.getGenerationStamp(), block="
          + block + ", replica=" + replica);
    }
    return replica.getVisibleLength();
  }

  /**
   * Class for representing the Datanode volume information
   */
  static class VolumeInfo {
    final String directory;
    final long usedSpace;
    final long freeSpace;
    final long reservedSpace;

    VolumeInfo(String dir, long usedSpace, long freeSpace, long reservedSpace) {
      this.directory = dir;
      this.usedSpace = usedSpace;
      this.freeSpace = freeSpace;
      this.reservedSpace = reservedSpace;
    }
  }

  synchronized Collection<VolumeInfo> getVolumeInfo() {
    Collection<VolumeInfo> info = new ArrayList<VolumeInfo>();
    synchronized(volumes.volumes) {
      for (FSVolume volume : volumes.volumes) {
        long used = 0;
        try {
          used = volume.getDfsUsed();
        } catch (IOException e) {
          DataNode.LOG.warn(e.getMessage());
        }

        long free = 0;
        try {
          free = volume.getAvailable();
        } catch (IOException e) {
          DataNode.LOG.warn(e.getMessage());
        }

        info.add(new VolumeInfo(volume.toString(), used, free,
            volume.getReserved()));
      }
      return info;
    }
  }
}
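
// Illustrative sketch (hypothetical recovery coordinator, not part of this
// class): block recovery above is a two-step exchange per replica. The first
// call freezes the replica under the new recovery id; the second, made after
// all replicas agree on a length (oldBlock and agreedLength are assumed names
// describing the replica's current id/length and the agreed recovery length),
// commits that length and finalizes the replica.
//
//   ReplicaRecoveryInfo info = dataset.initReplicaRecovery(recoveringBlock);
//   if (info != null) {
//     dataset.updateReplicaUnderRecovery(oldBlock,
//         recoveringBlock.getNewGenerationStamp(), agreedLength);
//   }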



