/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * gnodet: this file mainly contains modifications for a clean shutdown,
 * mostly related to unregistering MBeans correctly.
 */
package org.apache.hadoop.hdfs.server.datanode;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;
import java.security.NoSuchAlgorithmException;
import java.security.PrivilegedExceptionAction;
import java.security.SecureRandom;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import javax.management.InstanceAlreadyExistsException;
import javax.management.MBeanRegistrationException;
import javax.management.MalformedObjectNameException;
import javax.management.ObjectName;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HDFSPolicyProvider;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.UnregisteredDatanodeException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.IncorrectVersionException;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.datanode.FSDataset.VolumeInfo;
import org.apache.hadoop.hdfs.server.datanode.SecureDataNodeStarter.SecureResources;
import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeInstrumentation;
import org.apache.hadoop.hdfs.server.datanode.web.resources.DatanodeWebHdfsMethods;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.FileChecksumServlets;
import org.apache.hadoop.hdfs.server.namenode.JspHelper;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.StreamFile;
import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockMetaDataInfo;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand;
import org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.hdfs.web.resources.Param;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
import org.apache.hadoop.security.authorize.ServiceAuthorizationManager;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.VersionInfo;
import org.mortbay.util.ajax.JSON;

/**********************************************************
 * DataNode is a class (and program) that stores a set of
 * blocks for a DFS deployment.  A single deployment can
 * have one or many DataNodes.  Each DataNode communicates
 * regularly with a single NameNode.  It also communicates
 * with client code and other DataNodes from time to time.
 *
 * DataNodes store a series of named blocks.  The DataNode
 * allows client code to read these blocks, or to write new
 * block data.  The DataNode may also, in response to instructions
 * from its NameNode, delete blocks or copy blocks to/from other
 * DataNodes.
 *
 * The DataNode maintains just one critical table:
 *   block-> stream of bytes (of BLOCK_SIZE or less)
 *
 * This info is stored on a local disk.  The DataNode
 * reports the table's contents to the NameNode upon startup
 * and every so often afterwards.
 *
 * DataNodes spend their lives in an endless loop of asking
 * the NameNode for something to do.  A NameNode cannot connect
 * to a DataNode directly; a NameNode simply returns values from
 * functions invoked by a DataNode.
 *
 * DataNodes maintain an open server socket so that client code 
 * or other DataNodes can read/write data.  The host/port for
 * this server is reported to the NameNode, which then sends that
 * information to clients or other DataNodes that might be interested.
 *
 **********************************************************/
public class DataNode extends Configured 
    implements InterDatanodeProtocol, ClientDatanodeProtocol, FSConstants, 
    Runnable, DataNodeMXBean {
  public static final Log LOG = LogFactory.getLog(DataNode.class);
  
  static{
    Configuration.addDefaultResource("hdfs-default.xml");
    Configuration.addDefaultResource("hdfs-site.xml");
  }
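  /*
   * Illustrative sketch (not part of the original class): the usual embedded or
   * standalone startup path goes through the static factories defined later in
   * this file. The directory value and the empty argument array below are
   * hypothetical placeholders.
   *
   *   Configuration conf = new Configuration();
   *   conf.set("dfs.data.dir", "/tmp/dfs/data");      // hypothetical path
   *   DataNode dn = DataNode.createDataNode(new String[0], conf);
   *   if (dn != null) {
   *     dn.join();                                    // block until shutdown
   *   }
   */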

  public static final String DN_CLIENTTRACE_FORMAT =
        "src: %s" +      // src IP
        ", dest: %s" +   // dst IP
        ", bytes: %s" +  // byte count
        ", op: %s" +     // operation
        ", cliID: %s" +  // DFSClient id
        ", offset: %s" + // offset
        ", srvID: %s" +  // DatanodeRegistration
        ", blockid: %s" + // block id
        ", duration: %s"; // duration time

  static final Log ClientTraceLog =
    LogFactory.getLog(DataNode.class.getName() + ".clienttrace");
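  /*
   * Illustrative sketch (not part of the original class): DN_CLIENTTRACE_FORMAT
   * above is meant to be filled via String.format by the transfer code
   * (e.g. DataXceiver), one argument per %s in the listed order. All values
   * below are hypothetical.
   *
   *   String trace = String.format(DN_CLIENTTRACE_FORMAT,
   *       "10.0.0.1:50010", "10.0.0.2:40123", 65536, "HDFS_READ",
   *       "DFSClient_12345", 0, "DS-example", 1073741825L, 12);
   *   ClientTraceLog.info(trace);
   */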

  /**
   * Use {@link NetUtils#createSocketAddr(String)} instead.
   */
  @Deprecated
  public static InetSocketAddress createSocketAddr(String target
                                                   ) throws IOException {
    return NetUtils.createSocketAddr(target);
  }
  
  public DatanodeProtocol namenode = null;
  public FSDatasetInterface data = null;
  public DatanodeRegistration dnRegistration = null;

  volatile boolean shouldRun = true;
  private LinkedList<Block> receivedBlockList = new LinkedList<Block>();
  /** list of blocks being recovered */
  private final Map<Block, Block> ongoingRecovery = new HashMap<Block, Block>();
  private LinkedList<String> delHints = new LinkedList<String>();
  public final static String EMPTY_DEL_HINT = "";
  AtomicInteger xmitsInProgress = new AtomicInteger();
  Daemon dataXceiverServer = null;
  ThreadGroup threadGroup = null;
  long blockReportInterval;
  //disallow the sending of BR before instructed to do so
  long lastBlockReport = 0;
  boolean resetBlockReportTime = true;
  long initialBlockReportDelay = BLOCKREPORT_INITIAL_DELAY * 1000L;
  long lastHeartbeat = 0;
  long heartBeatInterval;
  private DataStorage storage = null;
  private HttpServer infoServer = null;
  DataNodeInstrumentation myMetrics;
  private static InetSocketAddress nameNodeAddr;
  private InetSocketAddress selfAddr;
  private static DataNode datanodeObject = null;
  private Thread dataNodeThread = null;
  String machineName;
  private static String dnThreadName;
  int socketTimeout;
  int socketWriteTimeout = 0;  
  boolean transferToAllowed = true;
  int writePacketSize = 0;
  private boolean supportAppends;
  boolean isBlockTokenEnabled;
  BlockTokenSecretManager blockTokenSecretManager;
  boolean isBlockTokenInitialized = false;
  final String userWithLocalPathAccess;

  /**
   * Testing hook that allows tests to delay the sending of blockReceived RPCs
   * to the namenode. This can help find bugs in append.
   */
  int artificialBlockReceivedDelay = 0;
  
  public DataBlockScanner blockScanner = null;
  public Daemon blockScannerThread = null;
  
  private static final Random R = new Random();
  
  public static final String DATA_DIR_KEY = "dfs.data.dir";
  public final static String DATA_DIR_PERMISSION_KEY = 
    "dfs.datanode.data.dir.perm";
  private static final String DEFAULT_DATA_DIR_PERMISSION = "755";

  // Thresholds for when we start to log when a block report is
  // taking a long time to generate. Under heavy disk load and
  // memory pressure, it's normal for block reports to take
  // several minutes, since they cause many disk seeks.
  private static final long LATE_BLOCK_REPORT_WARN_THRESHOLD =
      10 * 60 * 1000; // 10m
  private static final long LATE_BLOCK_REPORT_INFO_THRESHOLD =
      3 * 60 * 1000; // 3m

  // For InterDataNodeProtocol
  public Server ipcServer;

  private SecureResources secureResources = null;
  
  /**
   * Current system time.
   * @return current time in msec.
   */
  static long now() {
    return System.currentTimeMillis();
  }
  
  /**
   * Create the DataNode given a configuration and an array of dataDirs.
   * 'dataDirs' is where the blocks are stored.
   */
  DataNode(final Configuration conf, 
           final AbstractList<File> dataDirs) throws IOException {
    this(conf, dataDirs, null);
  }
  
  /**
   * Start a Datanode with specified server sockets for secure environments
   * where they are run with privileged ports and injected from a higher
   * level of capability
   */
  DataNode(final Configuration conf,
           final AbstractList<File> dataDirs, SecureResources resources) throws IOException {
    super(conf);
    SecurityUtil.login(conf, DFSConfigKeys.DFS_DATANODE_KEYTAB_FILE_KEY, 
        DFSConfigKeys.DFS_DATANODE_USER_NAME_KEY);

    datanodeObject = this;
    supportAppends = conf.getBoolean("dfs.support.append", false);
    this.userWithLocalPathAccess = conf
        .get(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY);
    try {
      startDataNode(conf, dataDirs, resources);
    } catch (IOException ie) {
      shutdown();
      throw ie;
    }   
  }
    
  
  /**
   * This method starts the data node with the specified conf.
   * 
   * @param conf - the configuration
   *  if conf's CONFIG_PROPERTY_SIMULATED property is set
   *  then a simulated storage based data node is created.
   * 
   * @param dataDirs - only for a non-simulated storage data node
   * @throws IOException
   * @throws MalformedObjectNameException 
   * @throws MBeanRegistrationException 
   * @throws InstanceAlreadyExistsException 
   */
  void startDataNode(Configuration conf, 
                     AbstractList<File> dataDirs, SecureResources resources
                     ) throws IOException {
    if(UserGroupInformation.isSecurityEnabled() && resources == null)
      throw new RuntimeException("Cannot start secure cluster without " +
      		"privileged resources.");
    
    this.secureResources = resources;
    // use configured nameserver & interface to get local hostname
    if (conf.get("slave.host.name") != null) {
      machineName = conf.get("slave.host.name");   
    }
    if (machineName == null) {
      machineName = DNS.getDefaultHost(
                                     conf.get("dfs.datanode.dns.interface","default"),
                                     conf.get("dfs.datanode.dns.nameserver","default"));
    }
    InetSocketAddress nameNodeAddr = NameNode.getServiceAddress(conf, true);
    
    this.socketTimeout =  conf.getInt("dfs.socket.timeout",
                                      HdfsConstants.READ_TIMEOUT);
    this.socketWriteTimeout = conf.getInt("dfs.datanode.socket.write.timeout",
                                          HdfsConstants.WRITE_TIMEOUT);
    /* Based on results on different platforms, we might need to set the default 
     * to false on some of them. */
    this.transferToAllowed = conf.getBoolean("dfs.datanode.transferTo.allowed", 
                                             true);
    this.writePacketSize = conf.getInt("dfs.write.packet.size", 64*1024);

    InetSocketAddress socAddr = DataNode.getStreamingAddr(conf);
    int tmpPort = socAddr.getPort();
    storage = new DataStorage();
    // construct registration
    this.dnRegistration = new DatanodeRegistration(machineName + ":" + tmpPort);

    // connect to name node
    this.namenode = (DatanodeProtocol) 
      RPC.waitForProxy(DatanodeProtocol.class,
                       DatanodeProtocol.versionID,
                       nameNodeAddr, 
                       conf);
    // get version and id info from the name-node
    NamespaceInfo nsInfo = handshake();
    StartupOption startOpt = getStartupOption(conf);
    assert startOpt != null : "Startup option must be set.";
    
    boolean simulatedFSDataset = 
        conf.getBoolean("dfs.datanode.simulateddatastorage", false);
    if (simulatedFSDataset) {
        setNewStorageID(dnRegistration);
        dnRegistration.storageInfo.layoutVersion = FSConstants.LAYOUT_VERSION;
        dnRegistration.storageInfo.namespaceID = nsInfo.namespaceID;
        // it would have been better to pass storage as a parameter to
        // constructor below - need to augment ReflectionUtils used below.
        conf.set("StorageId", dnRegistration.getStorageID());
        try {
          //Equivalent of following (can't do because Simulated is in test dir)
          //  this.data = new SimulatedFSDataset(conf);
          this.data = (FSDatasetInterface) ReflectionUtils.newInstance(
              Class.forName("org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset"), conf);
        } catch (ClassNotFoundException e) {
          throw new IOException(StringUtils.stringifyException(e));
        }
    } else { // real storage
      // read storage info, lock data dirs and transition fs state if necessary
      storage.recoverTransitionRead(nsInfo, dataDirs, startOpt);
      // adjust
      this.dnRegistration.setStorageInfo(storage);
      // initialize data node internal structure
      this.data = new FSDataset(storage, conf);
    }
      
    // register datanode MXBean
    this.registerMXBean(conf); // register the MXBean for DataNode
    
    // Allow configuration to delay block reports to find bugs
    artificialBlockReceivedDelay = conf.getInt(
        "dfs.datanode.artificialBlockReceivedDelay", 0);

    // find free port or use privileged port provided
    ServerSocket ss;
    if(secureResources == null) {
      ss = (socketWriteTimeout > 0) ? 
        ServerSocketChannel.open().socket() : new ServerSocket();
      Server.bind(ss, socAddr, 0);
    } else {
      ss = resources.getStreamingSocket();
    }
    ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE); 
    // adjust machine name with the actual port
    tmpPort = ss.getLocalPort();
    selfAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(),
                                     tmpPort);
    this.dnRegistration.setName(machineName + ":" + tmpPort);
    LOG.info("Opened streaming server at " + tmpPort);
      
    this.threadGroup = new ThreadGroup("dataXceiverServer");
    this.dataXceiverServer = new Daemon(threadGroup, 
        new DataXceiverServer(ss, conf, this));
    this.threadGroup.setDaemon(true); // auto destroy when empty

    this.blockReportInterval =
      conf.getLong("dfs.blockreport.intervalMsec", BLOCKREPORT_INTERVAL);
    this.initialBlockReportDelay = conf.getLong("dfs.blockreport.initialDelay",
                                            BLOCKREPORT_INITIAL_DELAY)* 1000L; 
    if (this.initialBlockReportDelay >= blockReportInterval) {
      this.initialBlockReportDelay = 0;
      LOG.info("dfs.blockreport.initialDelay is greater than " +
        "dfs.blockreport.intervalMsec. Setting initial delay to 0 msec.");
    }
    this.heartBeatInterval = conf.getLong("dfs.heartbeat.interval", HEARTBEAT_INTERVAL) * 1000L;
    DataNode.nameNodeAddr = nameNodeAddr;

    //initialize periodic block scanner
    String reason = null;
    if (conf.getInt("dfs.datanode.scan.period.hours", 0) < 0) {
      reason = "verification is turned off by configuration";
    } else if ( !(data instanceof FSDataset) ) {
      reason = "verification is supported only with FSDataset";
    } 
    if ( reason == null ) {
      blockScanner = new DataBlockScanner(this, (FSDataset)data, conf);
    } else {
      LOG.info("Periodic Block Verification is disabled because " +
               reason + ".");
    }

    //create a servlet to serve full-file content
    InetSocketAddress infoSocAddr = DataNode.getInfoAddr(conf);
    String infoHost = infoSocAddr.getHostName();
    int tmpInfoPort = infoSocAddr.getPort();
    this.infoServer = (secureResources == null) 
       ? new HttpServer("datanode", infoHost, tmpInfoPort, tmpInfoPort == 0, 
           conf, SecurityUtil.getAdminAcls(conf, DFSConfigKeys.DFS_ADMIN))
       : new HttpServer("datanode", infoHost, tmpInfoPort, tmpInfoPort == 0,
           conf, SecurityUtil.getAdminAcls(conf, DFSConfigKeys.DFS_ADMIN),
           secureResources.getListener());
    if (conf.getBoolean("dfs.https.enable", false)) {
      boolean needClientAuth = conf.getBoolean("dfs.https.need.client.auth", false);
      InetSocketAddress secInfoSocAddr = NetUtils.createSocketAddr(conf.get(
          "dfs.datanode.https.address", infoHost + ":" + 0));
      Configuration sslConf = new Configuration(false);
      sslConf.addResource(conf.get("dfs.https.server.keystore.resource",
          "ssl-server.xml"));
      this.infoServer.addSslListener(secInfoSocAddr, sslConf, needClientAuth);
    }
    this.infoServer.addInternalServlet(null, "/streamFile/*", StreamFile.class);
    this.infoServer.addInternalServlet(null, "/getFileChecksum/*",
        FileChecksumServlets.GetServlet.class);

    this.infoServer.setAttribute("datanode", this);
    this.infoServer.setAttribute("datanode.blockScanner", blockScanner);
    this.infoServer.setAttribute(JspHelper.CURRENT_CONF, conf);
    this.infoServer.addServlet(null, "/blockScannerReport", 
                               DataBlockScanner.Servlet.class);

    if (WebHdfsFileSystem.isEnabled(conf, LOG)) {
      infoServer.addJerseyResourcePackage(DatanodeWebHdfsMethods.class
          .getPackage().getName() + ";" + Param.class.getPackage().getName(),
          WebHdfsFileSystem.PATH_PREFIX + "/*");
    }
    this.infoServer.start();
    // adjust info port
    this.dnRegistration.setInfoPort(this.infoServer.getPort());
    myMetrics = DataNodeInstrumentation.create(conf,
                                               dnRegistration.getStorageID());
    
    // set service-level authorization security policy
    if (conf.getBoolean(
          ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, false)) {
      ServiceAuthorizationManager.refresh(conf, new HDFSPolicyProvider());
    }

    // BlockTokenSecretManager is created here, but it shouldn't be
    // used until it is initialized in register().
    this.blockTokenSecretManager = new BlockTokenSecretManager(false,
        0, 0);
    //init ipc server
    InetSocketAddress ipcAddr = NetUtils.createSocketAddr(
        conf.get("dfs.datanode.ipc.address"));
    ipcServer = RPC.getServer(this, ipcAddr.getHostName(), ipcAddr.getPort(), 
        conf.getInt("dfs.datanode.handler.count", 3), false, conf,
        blockTokenSecretManager);
    dnRegistration.setIpcPort(ipcServer.getListenerAddress().getPort());

    LOG.info("dnRegistration = " + dnRegistration);
  }
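  /*
   * Illustrative sketch (not part of the original class): a minimal subset of the
   * configuration keys read by startDataNode() above, expressed as programmatic
   * overrides. The values are hypothetical examples, not recommended settings.
   *
   *   Configuration conf = new Configuration();
   *   conf.set("dfs.data.dir", "/data/1/dfs,/data/2/dfs");    // block storage dirs
   *   conf.set("dfs.datanode.ipc.address", "0.0.0.0:50020");  // IPC server
   *   conf.setInt("dfs.socket.timeout", HdfsConstants.READ_TIMEOUT);
   *   conf.setLong("dfs.blockreport.intervalMsec", 60 * 60 * 1000L);
   *   conf.setLong("dfs.heartbeat.interval", 3);               // seconds
   */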
  
  private ObjectName mxBean = null;
  /**
   * Register the DataNode MXBean using the name
   *        "hadoop:service=DataNode,name=DataNodeInfo"
   */
  void registerMXBean(Configuration conf) {
    // We wrap to bypass the standard MBean naming convention.
    // This wrapping can be removed in Java 6, which is more flexible in
    // package naming for MBeans and their implementations.
    mxBean = MBeans.register("DataNode", "DataNodeInfo", this);
  }
  
  public void unRegisterMXBean() {
    if (mxBean != null)
      MBeans.unregister(mxBean);
  }
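  /*
   * Illustrative sketch (not part of the original class): reading the MXBean
   * registered above through the platform MBean server. The exact JMX domain and
   * key casing depend on the MBeans helper, so the ObjectName below is an
   * assumption based on the registerMXBean() javadoc (exception handling omitted).
   *
   *   MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
   *   ObjectName name = new ObjectName("hadoop:service=DataNode,name=DataNodeInfo");
   *   Object namenode = mbs.getAttribute(name, "Namenode");  // maps to getNamenode()
   */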
  
  /**
   * Determine the HTTP server's effective address
   */
  public static InetSocketAddress getInfoAddr(Configuration conf) {
    String infoAddr = NetUtils.getServerAddress(conf, 
        "dfs.datanode.info.bindAddress", 
        "dfs.datanode.info.port",
        "dfs.datanode.http.address");
    
    return NetUtils.createSocketAddr(infoAddr); 
  }

  /**
   * Creates either an NIO or a regular socket, depending on socketWriteTimeout.
   */
  protected Socket newSocket() throws IOException {
    return (socketWriteTimeout > 0) ? 
           SocketChannel.open().socket() : new Socket();                                   
  }
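  /*
   * Illustrative sketch (not part of the original class): newSocket() is paired
   * with NetUtils.connect and NetUtils.getOutputStream in DataTransfer.run()
   * further below. The assumption here is that write timeouts are only honored
   * for channel-backed (NIO) sockets, which is why a SocketChannel socket is
   * preferred whenever socketWriteTimeout > 0. The target address is hypothetical.
   *
   *   Socket sock = newSocket();
   *   NetUtils.connect(sock, new InetSocketAddress("10.0.0.2", 50010), socketTimeout);
   *   OutputStream out = NetUtils.getOutputStream(sock, socketWriteTimeout);
   */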
  
  private NamespaceInfo handshake() throws IOException {
    NamespaceInfo nsInfo = new NamespaceInfo();
    while (shouldRun) {
      try {
        nsInfo = namenode.versionRequest();
        break;
      } catch(SocketTimeoutException e) {  // namenode is busy
        LOG.info("Problem connecting to server: " + getNameNodeAddr());
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ie) {}
      }
    }
    String errorMsg = null;
    // verify build version
    if( ! nsInfo.getBuildVersion().equals( Storage.getBuildVersion() )) {
      errorMsg = "Incompatible build versions: namenode BV = " 
        + nsInfo.getBuildVersion() + "; datanode BV = "
        + Storage.getBuildVersion();
      LOG.fatal( errorMsg );
      notifyNamenode(DatanodeProtocol.NOTIFY, errorMsg);  
      throw new IOException( errorMsg );
    }
    assert FSConstants.LAYOUT_VERSION == nsInfo.getLayoutVersion() :
      "Data-node and name-node layout versions must be the same. "
      + "Expected: " + FSConstants.LAYOUT_VERSION + " actual " + nsInfo.getLayoutVersion();
    return nsInfo;
  }

  /** Return the DataNode object
   * 
   */
  public static DataNode getDataNode() {
    return datanodeObject;
  } 

  public static InterDatanodeProtocol createInterDataNodeProtocolProxy(
      DatanodeID datanodeid, final Configuration conf, final int socketTimeout) throws IOException {
    final InetSocketAddress addr = NetUtils.createSocketAddr(
        datanodeid.getHost() + ":" + datanodeid.getIpcPort());
    if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
      InterDatanodeProtocol.LOG.debug("InterDatanodeProtocol addr=" + addr);
    }

    UserGroupInformation loginUgi = UserGroupInformation.getLoginUser();
    try {
      return loginUgi
          .doAs(new PrivilegedExceptionAction<InterDatanodeProtocol>() {
            public InterDatanodeProtocol run() throws IOException {
              return (InterDatanodeProtocol) RPC.getProxy(
                  InterDatanodeProtocol.class, InterDatanodeProtocol.versionID,
                  addr, conf, socketTimeout);
            }
          });
    } catch (InterruptedException ie) {
      throw new IOException(ie.getMessage());
    }
  }

  public InetSocketAddress getNameNodeAddr() {
    return nameNodeAddr;
  }
  
  public InetSocketAddress getSelfAddr() {
    return selfAddr;
  }
    
  DataNodeInstrumentation getMetrics() {
    return myMetrics;
  }
  
  /**
   * Return the namenode's identifier
   */
  public String getNamenode() {
    //return namenode.toString();
    return "";
  }

  public static void setNewStorageID(DatanodeRegistration dnReg) {
    /* Sets dnReg.storageID to
     * "DS-randInt-ipaddr-port-currentTimeMillis"
     * It is considered extremely rare for all these numbers to match
     * on a different machine accidentally, for the following reasons:
     * a) SecureRandom(INT_MAX) is pretty much random (1 in 2 billion), and
     * b) Good chance the ip address would be different, and
     * c) Even on the same machine, Datanodes are designed to use different ports.
     * d) Good chance that these are started at different times.
     * For a conflict to occur, all four of the above have to match!
     * The format of this string can be changed anytime in the future without
     * affecting its functionality.
     */
    String ip = "unknownIP";
    try {
      ip = DNS.getDefaultIP("default");
    } catch (UnknownHostException ignored) {
      LOG.warn("Could not find ip address of \"default\" interface.");
    }
    
    int rand = 0;
    try {
      rand = SecureRandom.getInstance("SHA1PRNG").nextInt(Integer.MAX_VALUE);
    } catch (NoSuchAlgorithmException e) {
      LOG.warn("Could not use SecureRandom");
      rand = R.nextInt(Integer.MAX_VALUE);
    }
    dnReg.storageID = "DS-" + rand + "-"+ ip + "-" + dnReg.getPort() + "-" + 
                      System.currentTimeMillis();
  }
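  /*
   * Illustrative sketch (not part of the original class): with the format described
   * above, a generated storage ID looks like the following (all component values
   * are hypothetical):
   *
   *   "DS-" + 1193050120 + "-" + "10.0.0.5" + "-" + 50010 + "-" + 1318003432000L
   *   ==> "DS-1193050120-10.0.0.5-50010-1318003432000"
   */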
  /**
   * Register datanode
   *
   * The datanode needs to register with the namenode on startup in order
   * 1) to report which storage it is serving now and
   * 2) to receive a registrationID
   * issued by the namenode to recognize registered datanodes.
   *
   * @see FSNamesystem#registerDatanode(DatanodeRegistration)
   * @throws IOException
   */
  private void register() throws IOException {
    if (dnRegistration.getStorageID().equals("")) {
      setNewStorageID(dnRegistration);
    }
    while(shouldRun) {
      try {
        // reset name to machineName. Mainly for web interface.
        dnRegistration.name = machineName + ":" + dnRegistration.getPort();
        dnRegistration = namenode.register(dnRegistration);
        break;
      } catch(SocketTimeoutException e) {  // namenode is busy
        LOG.info("Problem connecting to server: " + getNameNodeAddr());
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ie) {}
      }
    }
    assert ("".equals(storage.getStorageID())
            && !"".equals(dnRegistration.getStorageID()))
            || storage.getStorageID().equals(dnRegistration.getStorageID()) :
            "New storageID can be assigned only if data-node is not formatted";
    if (storage.getStorageID().equals("")) {
      storage.setStorageID(dnRegistration.getStorageID());
      storage.writeAll();
      LOG.info("New storage id " + dnRegistration.getStorageID()
          + " is assigned to data-node " + dnRegistration.getName());
    }
    if(! storage.getStorageID().equals(dnRegistration.getStorageID())) {
      throw new IOException("Inconsistent storage IDs. Name-node returned "
          + dnRegistration.getStorageID()
          + ". Expecting " + storage.getStorageID());
    }

    if (!isBlockTokenInitialized) {
      /* first time registering with NN */
      ExportedBlockKeys keys = dnRegistration.exportedKeys;
      this.isBlockTokenEnabled = keys.isBlockTokenEnabled();
      if (isBlockTokenEnabled) {
        long blockKeyUpdateInterval = keys.getKeyUpdateInterval();
        long blockTokenLifetime = keys.getTokenLifetime();
        LOG.info("Block token params received from NN: keyUpdateInterval="
            + blockKeyUpdateInterval / (60 * 1000) + " min(s), tokenLifetime="
            + blockTokenLifetime / (60 * 1000) + " min(s)");
        blockTokenSecretManager.setTokenLifetime(blockTokenLifetime);
      }
      isBlockTokenInitialized = true;
    }

    if (isBlockTokenEnabled) {
      blockTokenSecretManager.setKeys(dnRegistration.exportedKeys);
      dnRegistration.exportedKeys = ExportedBlockKeys.DUMMY_KEYS;
    }

    if (supportAppends) {
      Block[] bbwReport = data.getBlocksBeingWrittenReport();
      long[] blocksBeingWritten = BlockListAsLongs
          .convertToArrayLongs(bbwReport);
      namenode.blocksBeingWrittenReport(dnRegistration, blocksBeingWritten);
    }
    // random short delay - helps scatter the BR from all DNs
    // - but we can start generating the block report immediately
    data.requestAsyncBlockReport();
    scheduleBlockReport(initialBlockReportDelay);
  }

  /**
   * Shut down this instance of the datanode.
   * Returns only after shutdown is complete.
   * This method can only be called by the offerService thread.
   * Otherwise, deadlock might occur.
   */
  public void shutdown() {
    if (!this.shouldRun) {
      return;
    }
    this.shouldRun = false;
    this.unRegisterMXBean();
    if (infoServer != null) {
      try {
        infoServer.stop();
      } catch (Exception e) {
        LOG.warn("Exception shutting down DataNode", e);
      }
    }
    if (ipcServer != null) {
      ipcServer.stop();
    }
    if (dataXceiverServer != null) {
      ((DataXceiverServer) this.dataXceiverServer.getRunnable()).kill();
      this.dataXceiverServer.interrupt();
      // wait for all data receiver threads to exit
      if (this.threadGroup != null) {
        while (true) {
          this.threadGroup.interrupt();
          LOG.info("Waiting for threadgroup to exit, active threads is " +
                   this.threadGroup.activeCount());
          if (this.threadGroup.activeCount() == 0) {
            break;
          }
          try {
            Thread.sleep(1000);
          } catch (InterruptedException e) {}
        }
      }
      // wait for dataXceiverServer to terminate
      try {
        this.dataXceiverServer.join();
      } catch (InterruptedException ie) {
      }
    }

    RPC.stopProxy(namenode); // stop the RPC threads

    if(upgradeManager != null)
      upgradeManager.shutdownUpgrade();
    if (blockScannerThread != null) {
      blockScannerThread.interrupt();
      try {
        blockScannerThread.join(3600000L); // wait for at most 1 hour
      } catch (InterruptedException ie) {
      }
    }
    if (storage != null) {
      try {
        this.storage.unlockAll();
      } catch (IOException ie) {
      }
    }
    if (dataNodeThread != null) {
      dataNodeThread.interrupt();
      try {
        dataNodeThread.join();
      } catch (InterruptedException ie) {
      }
    }
    if (data != null) {
      data.shutdown();
    }
    if (myMetrics != null) {
      myMetrics.shutdown();
    }
  }

  /** Check if there is no space in disk
   *  @param e the exception that caused this checkDiskError call
   **/
  protected void checkDiskError(Exception e ) throws IOException {
    LOG.warn("checkDiskError: exception: ", e);
    if (e.getMessage() != null &&
        e.getMessage().startsWith("No space left on device")) {
      throw new DiskOutOfSpaceException("No space left on device");
    } else {
      checkDiskError();
    }
  }

  /**
   *  Check if there is a disk failure and if so, handle the error
   *
   **/
  protected void checkDiskError( ) {
    try {
      data.checkDataDir();
    } catch(DiskErrorException de) {
      handleDiskError(de.getMessage());
    }
  }

  private void notifyNamenode(int dpCode, String msg) {
    //inform NameNode
    try {
      namenode.errorReport(dnRegistration, dpCode, msg);
    } catch( SocketTimeoutException e ) {  // namenode is busy
      LOG.info("Problem connecting to server: " + getNameNodeAddr());
    } catch(IOException ignored) {
    }
  }

  private void handleDiskError(String errMsgr) {
    boolean hasEnoughResource = data.hasEnoughResource();
    LOG.warn("DataNode.handleDiskError: Keep Running: " + hasEnoughResource);

    //if hasEnoughResource = true - more volumes are available, so we don't want
    // to shutdown DN completely and don't want NN to remove it.
    int dp_error = DatanodeProtocol.DISK_ERROR;
    if(hasEnoughResource == false) {
      // DN will be shutdown and NN should remove it
      dp_error = DatanodeProtocol.FATAL_DISK_ERROR;
    }
    //inform NameNode
    notifyNamenode(dp_error, errMsgr);

    if(hasEnoughResource) {
      scheduleBlockReport(0);
      return; // do not shutdown
    }

    LOG.warn("DataNode is shutting down.\n" + errMsgr);
    shouldRun = false;
  }

  /** Number of concurrent xceivers per node. */
  int getXceiverCount() {
    return threadGroup == null ? 0 : threadGroup.activeCount();
  }

  /**
   * Main loop for the DataNode.  Runs until shutdown,
   * forever calling remote NameNode functions.
   */
  public void offerService() throws Exception {

    LOG.info("using BLOCKREPORT_INTERVAL of " + blockReportInterval + "msec" +
       " Initial delay: " + initialBlockReportDelay + "msec");

    //
    // Now loop for a long time....
    //
    while (shouldRun) {
      try {
        long startTime = now();

        //
        // Every so often, send heartbeat or block-report
        //
        if (startTime - lastHeartbeat > heartBeatInterval) {
          //
          // All heartbeat messages include following info:
          // -- Datanode name
          // -- data transfer port
          // -- Total capacity
          // -- Bytes remaining
          //
          lastHeartbeat = startTime;
          DatanodeCommand[] cmds = namenode.sendHeartbeat(dnRegistration,
                                                       data.getCapacity(),
                                                       data.getDfsUsed(),
                                                       data.getRemaining(),
                                                       xmitsInProgress.get(),
                                                       getXceiverCount());
          myMetrics.addHeartBeat(now() - startTime);
          //LOG.info("Just sent heartbeat, with name " + localName);
          if (!processCommand(cmds))
            continue;
        }

        // check if there are newly received blocks
        Block [] blockArray=null;
        String [] delHintArray=null;
        synchronized(receivedBlockList) {
          synchronized(delHints) {
            int numBlocks = receivedBlockList.size();
            if (numBlocks > 0) {
              if(numBlocks!=delHints.size()) {
                LOG.warn("Panic: receiveBlockList and delHints are not of the same length" );
              }
              //
              // Send newly-received blockids to namenode
              //
              blockArray = receivedBlockList.toArray(new Block[numBlocks]);
              delHintArray = delHints.toArray(new String[numBlocks]);
            }
          }
        }
        if (blockArray != null) {
          if(delHintArray == null || delHintArray.length != blockArray.length ) {
            LOG.warn("Panic: block array & delHintArray are not the same" );
          }
          namenode.blockReceived(dnRegistration, blockArray, delHintArray);
          synchronized (receivedBlockList) {
            synchronized (delHints) {
              for(int i=0; i<blockArray.length; i++) {
                receivedBlockList.remove(blockArray[i]);
                delHints.remove(delHintArray[i]);
              }
            }
          }
        }

        // send block report if timer has expired
        if (startTime - lastBlockReport > blockReportInterval) {
          if (data.isAsyncBlockReportReady()) {
            // Create block report
            long brCreateStartTime = now();
            Block[] bReport = data.retrieveAsyncBlockReport();

            // Send block report
            long brSendStartTime = now();
            DatanodeCommand cmd = namenode.blockReport(dnRegistration,
                    BlockListAsLongs.convertToArrayLongs(bReport));

            // Log the block report processing stats from Datanode perspective
            long brSendCost = now() - brSendStartTime;
            long brCreateCost = brSendStartTime - brCreateStartTime;
            myMetrics.addBlockReport(brSendCost);
            LOG.info("BlockReport of " + bReport.length
                + " blocks took " + brCreateCost + " msec to generate and "
                + brSendCost + " msecs for RPC and NN processing");

            //
            // If we have sent the first block report, then wait a random
            // time before we start the periodic block reports.
            //
            if (resetBlockReportTime) {
              lastBlockReport = startTime - R.nextInt((int)(blockReportInterval));
              resetBlockReportTime = false;
            } else {
              /* say the last block report was at 8:20:14. The current report
               * should have started around 9:20:14 (default 1 hour interval).
               * If current time is :
               *   1) normal like 9:20:18, next report should be at 10:20:14
               *   2) unexpected like 11:35:43, next report should be at
               *      12:20:14
               */
              lastBlockReport += (now() - lastBlockReport) /
                  blockReportInterval * blockReportInterval;
            }
            processCommand(cmd);
          } else {
            data.requestAsyncBlockReport();
            if (lastBlockReport > 0) { // this isn't the first report
              long waitingFor =
                  startTime - lastBlockReport - blockReportInterval;
              String msg = "Block report is due, and been waiting for it for " +
                  (waitingFor/1000) + " seconds...";
              if (waitingFor > LATE_BLOCK_REPORT_WARN_THRESHOLD) {
                LOG.warn(msg);
              } else if (waitingFor > LATE_BLOCK_REPORT_INFO_THRESHOLD) {
                LOG.info(msg);
              } else if (LOG.isDebugEnabled()) {
                LOG.debug(msg);
              }
            }
          }
        }

        // start block scanner
        if (blockScanner != null && blockScannerThread == null &&
            upgradeManager.isUpgradeCompleted()) {
          LOG.info("Starting Periodic block scanner.");
          blockScannerThread = new Daemon(blockScanner);
          blockScannerThread.start();
        }

        //
        // There is no work to do;  sleep until heartbeat timer elapses,
        // or work arrives, and then iterate again.
        //
        long waitTime = heartBeatInterval -
                        (System.currentTimeMillis() - lastHeartbeat);
        synchronized(receivedBlockList) {
          if (waitTime > 0 && receivedBlockList.size() == 0) {
            try {
              receivedBlockList.wait(waitTime);
            } catch (InterruptedException ie) {
            }
            delayBeforeBlockReceived();
          }
        } // synchronized
      } catch(RemoteException re) {
        String reClass = re.getClassName();
        if (UnregisteredDatanodeException.class.getName().equals(reClass) ||
            DisallowedDatanodeException.class.getName().equals(reClass) ||
            IncorrectVersionException.class.getName().equals(reClass)) {
          LOG.warn("DataNode is shutting down: " +
                   StringUtils.stringifyException(re));
          shutdown();
          return;
        }
        LOG.warn(StringUtils.stringifyException(re));
      } catch (IOException e) {
        LOG.warn(StringUtils.stringifyException(e));
      }
    } // while (shouldRun)
  } // offerService

  /**
   * When a block has been received, we can delay some period of time before
   * reporting it to the NN, for the purpose of testing. This simulates
   * the actual latency of blockReceived on a real network (where the client
   * may be closer to the NN than the DNs).
   */
  private void delayBeforeBlockReceived() {
    if (artificialBlockReceivedDelay > 0 && !receivedBlockList.isEmpty()) {
      try {
        long sleepFor = (long)R.nextInt(artificialBlockReceivedDelay);
        LOG.debug("DataNode " + dnRegistration + " sleeping for " +
                  "artificial delay: " + sleepFor + " ms");
        Thread.sleep(sleepFor);
      } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
      }
    }
  }

  /**
   * Process an array of datanode commands
   *
   * @param cmds an array of datanode commands
   * @return true if further processing may be required or false otherwise.
   */
  private boolean processCommand(DatanodeCommand[] cmds) {
    if (cmds != null) {
      for (DatanodeCommand cmd : cmds) {
        try {
          if (processCommand(cmd) == false) {
            return false;
          }
        } catch (IOException ioe) {
          LOG.warn("Error processing datanode Command", ioe);
        }
      }
    }
    return true;
  }

  /**
   *
   * @param cmd
   * @return true if further processing may be required or false otherwise.
   * @throws IOException
   */
  private boolean processCommand(DatanodeCommand cmd) throws IOException {
    if (cmd == null)
      return true;
    final BlockCommand bcmd = cmd instanceof BlockCommand?
        (BlockCommand)cmd: null;
    switch(cmd.getAction()) {
    case DatanodeProtocol.DNA_TRANSFER:
      // Send a copy of a block to another datanode
      transferBlocks(bcmd.getBlocks(), bcmd.getTargets());
      myMetrics.incrBlocksReplicated(bcmd.getBlocks().length);
      break;
    case DatanodeProtocol.DNA_INVALIDATE:
      //
      // Some local block(s) are obsolete and can be
      // safely garbage-collected.
      //
      Block toDelete[] = bcmd.getBlocks();
      try {
        if (blockScanner != null) {
          blockScanner.deleteBlocks(toDelete);
        }
        data.invalidate(toDelete);
      } catch(IOException e) {
        checkDiskError();
        throw e;
      }
      myMetrics.incrBlocksRemoved(toDelete.length);
      break;
    case DatanodeProtocol.DNA_SHUTDOWN:
      // shut down the data node
      this.shutdown();
      return false;
    case DatanodeProtocol.DNA_REGISTER:
      // namenode requested a registration - at start or if NN lost contact
      LOG.info("DatanodeCommand action: DNA_REGISTER");
      if (shouldRun) {
        register();
      }
      break;
    case DatanodeProtocol.DNA_FINALIZE:
      storage.finalizeUpgrade();
      break;
    case UpgradeCommand.UC_ACTION_START_UPGRADE:
      // start distributed upgrade here
      processDistributedUpgradeCommand((UpgradeCommand)cmd);
      break;
    case DatanodeProtocol.DNA_RECOVERBLOCK:
      recoverBlocks(bcmd.getBlocks(), bcmd.getTargets());
      break;
    case DatanodeProtocol.DNA_ACCESSKEYUPDATE:
      LOG.info("DatanodeCommand action: DNA_ACCESSKEYUPDATE");
      if (isBlockTokenEnabled) {
        blockTokenSecretManager.setKeys(((KeyUpdateCommand) cmd).getExportedKeys());
      }
      break;
    case DatanodeProtocol.DNA_BALANCERBANDWIDTHUPDATE:
      LOG.info("DatanodeCommand action: DNA_BALANCERBANDWIDTHUPDATE");
      int vsn = ((BalancerBandwidthCommand) cmd).getBalancerBandwidthVersion();
      if (vsn >= 1) {
        long bandwidth =
                   ((BalancerBandwidthCommand) cmd).getBalancerBandwidthValue();
        if (bandwidth > 0) {
          DataXceiverServer dxcs =
                       (DataXceiverServer) this.dataXceiverServer.getRunnable();
          dxcs.balanceThrottler.setBandwidth(bandwidth);
        }
      }
      break;
    default:
      LOG.warn("Unknown DatanodeCommand action: " + cmd.getAction());
    }
    return true;
  }

  // Distributed upgrade manager
  UpgradeManagerDatanode upgradeManager = new UpgradeManagerDatanode(this);

  private void processDistributedUpgradeCommand(UpgradeCommand comm
                                               ) throws IOException {
    assert upgradeManager != null : "DataNode.upgradeManager is null.";
    upgradeManager.processUpgradeCommand(comm);
  }

  /**
   * Start distributed upgrade if it should be initiated by the data-node.
   */
  private void startDistributedUpgradeIfNeeded() throws IOException {
    UpgradeManagerDatanode um = DataNode.getDataNode().upgradeManager;
    assert um != null : "DataNode.upgradeManager is null.";
    if(!um.getUpgradeState())
      return;
    um.setUpgradeState(false, um.getUpgradeVersion());
    um.startUpgrade();
    return;
  }

  private void transferBlock( Block block,
                              DatanodeInfo xferTargets[]
                              ) throws IOException {
    if (!data.isValidBlock(block)) {
      // block does not exist or is under-construction
      String errStr = "Can't send invalid block " + block;
      LOG.info(errStr);
      notifyNamenode(DatanodeProtocol.INVALID_BLOCK, errStr);
      return;
    }

    // Check if NN recorded length matches on-disk length
    long onDiskLength = data.getLength(block);
    if (block.getNumBytes() > onDiskLength) {
      // Shorter on-disk len indicates corruption so report NN the corrupt block
      namenode.reportBadBlocks(new LocatedBlock[]{
          new LocatedBlock(block, new DatanodeInfo[] {
              new DatanodeInfo(dnRegistration)})});
      LOG.info("Can't replicate block " + block
          + " because on-disk length " + onDiskLength
          + " is shorter than NameNode recorded length " + block.getNumBytes());
      return;
    }

    int numTargets = xferTargets.length;
    if (numTargets > 0) {
      if (LOG.isInfoEnabled()) {
        StringBuilder xfersBuilder = new StringBuilder();
        for (int i = 0; i < numTargets; i++) {
          xfersBuilder.append(xferTargets[i].getName());
          xfersBuilder.append(" ");
        }
        LOG.info(dnRegistration + " Starting thread to transfer block " +
                 block + " to " + xfersBuilder);
      }

      new Daemon(new DataTransfer(xferTargets, block, this)).start();
    }
  }

  private void transferBlocks( Block blocks[],
                               DatanodeInfo xferTargets[][]
                               ) {
    for (int i = 0; i < blocks.length; i++) {
      try {
        transferBlock(blocks[i], xferTargets[i]);
      } catch (IOException ie) {
        LOG.warn("Failed to transfer block " + blocks[i], ie);
      }
    }
  }

  /*
   * Informing the name node could take a long long time! Should we wait
   * till namenode is informed before responding with success to the
   * client? For now we don't.
   */
  protected void notifyNamenodeReceivedBlock(Block block, String delHint) {
    if(block==null || delHint==null) {
      throw new IllegalArgumentException(block==null?"Block is null":"delHint is null");
    }
    synchronized (receivedBlockList) {
      synchronized (delHints) {
        receivedBlockList.add(block);
        delHints.add(delHint);
        receivedBlockList.notifyAll();
      }
    }
  }

  /* ********************************************************************
  Protocol when a client reads data from Datanode (Cur Ver: 9):

  Client's Request :
  =================

     Processed in DataXceiver:
     +----------------------------------------------+
     | Common Header   | 1 byte OP == OP_READ_BLOCK |
     +----------------------------------------------+

     Processed in readBlock() :
     +-------------------------------------------------------------------------+
     | 8 byte Block ID | 8 byte genstamp | 8 byte start offset | 8 byte length |
     +-------------------------------------------------------------------------+
     |   vInt length   |  <DFSClient id> |
     +-----------------------------------+

     Client sends optional response only at the end of receiving data.

  DataNode Response :
  ===================

    In readBlock() :
    If there is an error while initializing BlockSender :
       +---------------------------+
       | 2 byte OP_STATUS_ERROR    |   and connection will be closed.
       +---------------------------+
       Otherwise
       +---------------------------+
       | 2 byte OP_STATUS_SUCCESS  |
       +---------------------------+

    Actual data, sent by BlockSender.sendBlock() :

      ChecksumHeader :
      +--------------------------------------------------+
      | 1 byte CHECKSUM_TYPE | 4 byte BYTES_PER_CHECKSUM |
      +--------------------------------------------------+
      Followed by actual data in the form of PACKETS:
      +------------------------------------+
      | Sequence of data PACKETs ....      |
      +------------------------------------+

    A "PACKET" is defined further below.

    The client reads data until it receives a packet with
    "LastPacketInBlock" set to true or with a zero length. If there is
    no checksum error, it replies to DataNode with OP_STATUS_CHECKSUM_OK:

    Client optional response at the end of data transmission :
      +------------------------------+
      | 2 byte OP_STATUS_CHECKSUM_OK |
      +------------------------------+

    PACKET : Contains a packet header, checksum and data. Amount of data
    ======== carried is set by BUFFER_SIZE.

      +-----------------------------------------------------+
      | 4 byte packet length (excluding packet header)      |
      +-----------------------------------------------------+
      | 8 byte offset in the block | 8 byte sequence number |
      +-----------------------------------------------------+
      | 1 byte isLastPacketInBlock                          |
      +-----------------------------------------------------+
      | 4 byte Length of actual data                        |
      +-----------------------------------------------------+
      | x byte checksum data. x is defined below            |
      +-----------------------------------------------------+
      | actual data ......                                  |
      +-----------------------------------------------------+

      x = (length of data + BYTES_PER_CHECKSUM - 1)/BYTES_PER_CHECKSUM *
          CHECKSUM_SIZE

      CHECKSUM_SIZE depends on CHECKSUM_TYPE (usually, 4 for CRC32)

    The above packet format is used while writing data to DFS also.
    Not all the fields might be used while reading.

   ************************************************************************ */

  /** Header size for a packet */
  public static final int PKT_HEADER_LEN = ( 4 + /* Packet payload length */
                                             8 + /* offset in block */
                                             8 + /* seqno */
                                             1   /* isLastPacketInBlock */);

  /**
   * Used for transferring a block of data.  This class
   * sends a piece of data to another DataNode.
   */
  class DataTransfer implements Runnable {
    DatanodeInfo targets[];
    Block b;
    DataNode datanode;

    /**
     * Connect to the first item in the target list.  Pass along the
     * entire target list, the block, and the data.
     */
    public DataTransfer(DatanodeInfo targets[], Block b, DataNode datanode) throws IOException {
      this.targets = targets;
      this.b = b;
      this.datanode = datanode;
    }

    /**
     * Do the deed, write the bytes
     */
    public void run() {
      xmitsInProgress.getAndIncrement();
      Socket sock = null;
      DataOutputStream out = null;
      BlockSender blockSender = null;

      try {
        InetSocketAddress curTarget =
          NetUtils.createSocketAddr(targets[0].getName());
        sock = newSocket();
        NetUtils.connect(sock, curTarget, socketTimeout);
        sock.setSoTimeout(targets.length * socketTimeout);

        long writeTimeout = socketWriteTimeout +
                            HdfsConstants.WRITE_TIMEOUT_EXTENSION * (targets.length-1);
        OutputStream baseStream = NetUtils.getOutputStream(sock, writeTimeout);
        out = new DataOutputStream(new BufferedOutputStream(baseStream,
                                                            SMALL_BUFFER_SIZE));

        blockSender = new BlockSender(b, 0, b.getNumBytes(), false, false, false,
            datanode);
        DatanodeInfo srcNode = new DatanodeInfo(dnRegistration);

        //
        // Header info
        //
        out.writeShort(DataTransferProtocol.DATA_TRANSFER_VERSION);
        out.writeByte(DataTransferProtocol.OP_WRITE_BLOCK);
        out.writeLong(b.getBlockId());
        out.writeLong(b.getGenerationStamp());
        out.writeInt(0);           // no pipelining
        out.writeBoolean(false);   // not part of recovery
        Text.writeString(out, ""); // client
        out.writeBoolean(true); // sending src node information
        srcNode.write(out); // Write src node DatanodeInfo
        // write targets
        out.writeInt(targets.length - 1);
        for (int i = 1; i < targets.length; i++) {
          targets[i].write(out);
        }
        Token<BlockTokenIdentifier> accessToken = BlockTokenSecretManager.DUMMY_TOKEN;
        if (isBlockTokenEnabled) {
          accessToken = blockTokenSecretManager.generateToken(null, b,
              EnumSet.of(BlockTokenSecretManager.AccessMode.WRITE));
        }
        accessToken.write(out);
        // send data & checksum
        blockSender.sendBlock(out, baseStream, null);

        // no response necessary
        LOG.info(dnRegistration + ":Transmitted block " + b + " to " + curTarget);

      } catch (IOException ie) {
        LOG.warn(dnRegistration + ":Failed to transfer " + b + " to " + targets[0].getName()
            + " got " + StringUtils.stringifyException(ie));
        // check if there are any disk problem
        datanode.checkDiskError();

      } finally {
        xmitsInProgress.getAndDecrement();
        IOUtils.closeStream(blockSender);
        IOUtils.closeStream(out);
        IOUtils.closeSocket(sock);
      }
    }
  }

  /**
   * No matter what kind of exception we get, keep retrying to offerService().
   * That's the loop that connects to the NameNode and provides basic DataNode
   * functionality.
   *
   * Only stop when "shouldRun" is turned off (which can only happen at shutdown).
   */
  public void run() {
    LOG.info(dnRegistration + "In DataNode.run, data = " + data);

    // start dataXceiverServer
    dataXceiverServer.start();
    ipcServer.start();

    while (shouldRun) {
      try {
        startDistributedUpgradeIfNeeded();
        offerService();
      } catch (Exception ex) {
        LOG.error("Exception: " + StringUtils.stringifyException(ex));
        if (shouldRun) {
          try {
            Thread.sleep(5000);
          } catch (InterruptedException ie) {
          }
        }
      }
    }

    LOG.info(dnRegistration + ":Finishing DataNode in: "+data);
    shutdown();
  }

  /** Start a single datanode daemon and wait for it to finish.
   *  If this thread is specifically interrupted, it will stop waiting.
   */
  public static void runDatanodeDaemon(DataNode dn) throws IOException {
    if (dn != null) {
      //register datanode
      dn.register();
      dn.dataNodeThread = new Thread(dn, dnThreadName);
      dn.dataNodeThread.setDaemon(true); // needed for JUnit testing
      dn.dataNodeThread.start();
    }
  }

  static boolean isDatanodeUp(DataNode dn) {
    return dn.dataNodeThread != null && dn.dataNodeThread.isAlive();
  }

  /** Instantiate a single datanode object.
   *  This must be run by invoking
   *  {@link DataNode#runDatanodeDaemon(DataNode)} subsequently.
   */
  public static DataNode instantiateDataNode(String args[],
                                      Configuration conf) throws IOException {
    return instantiateDataNode(args, conf, null);
  }

  /** Instantiate a single datanode object. This must be run by invoking
   *  {@link DataNode#runDatanodeDaemon(DataNode)} subsequently.
   * @param resources Secure resources needed to run under Kerberos
   */
  public static DataNode instantiateDataNode(String args[],
                                      Configuration conf,
                                      SecureResources resources) throws IOException {
    if (conf == null)
      conf = new Configuration();
    if (!parseArguments(args, conf)) {
      printUsage();
      return null;
    }
    if (conf.get("dfs.network.script") != null) {
      LOG.error("This configuration for rack identification is not supported" +
          " anymore. RackID resolution is handled by the NameNode.");
      System.exit(-1);
    }
    String[] dataDirs = conf.getStrings(DATA_DIR_KEY);
    dnThreadName = "DataNode: [" +
                        StringUtils.arrayToString(dataDirs) + "]";
    DefaultMetricsSystem.initialize("DataNode");
    return makeInstance(dataDirs, conf, resources);
  }

  /** Instantiate & Start a single datanode daemon and wait for it to finish.
   *  If this thread is specifically interrupted, it will stop waiting.
   */
  public static DataNode createDataNode(String args[],
                                 Configuration conf) throws IOException {
    return createDataNode(args, conf, null);
  }

  /** Instantiate & Start a single datanode daemon and wait for it to finish.
   *  If this thread is specifically interrupted, it will stop waiting.
   *  LimitedPrivate for creating secure datanodes
   */
  public static DataNode createDataNode(String args[],
            Configuration conf, SecureResources resources) throws IOException {
    DataNode dn = instantiateDataNode(args, conf, resources);
    runDatanodeDaemon(dn);
    return dn;
  }

  void join() {
    if (dataNodeThread != null) {
      try {
        dataNodeThread.join();
      } catch (InterruptedException e) {}
    }
  }

  /**
   * Make an instance of DataNode after ensuring that at least one of the
   * given data directories (and their parent directories, if necessary)
   * can be created.
   * @param dataDirs List of directories, where the new DataNode instance should
   * keep its files.
   * @param conf Configuration instance to use.
   * @param resources Secure resources needed to run under Kerberos
   * @return DataNode instance for given list of data dirs and conf, or null if
   * no directory from this directory list can be created.
   * @throws IOException
   */
  public static DataNode makeInstance(String[] dataDirs, Configuration conf,
      SecureResources resources) throws IOException {
    UserGroupInformation.setConfiguration(conf);
    LocalFileSystem localFS = FileSystem.getLocal(conf);
    ArrayList<File> dirs = new ArrayList<File>();
    FsPermission dataDirPermission =
      new FsPermission(conf.get(DATA_DIR_PERMISSION_KEY,
                                DEFAULT_DATA_DIR_PERMISSION));
    for (String dir : dataDirs) {
      try {
        DiskChecker.checkDir(localFS, new Path(dir), dataDirPermission);
        dirs.add(new File(dir));
      } catch(IOException e) {
        LOG.warn("Invalid directory in " + DATA_DIR_KEY +  ": " +
                 e.getMessage());
      }
    }
    if (dirs.size() > 0)
      return new DataNode(conf, dirs, resources);
    LOG.error("All directories in " + DATA_DIR_KEY + " are invalid.");
    return null;
  }

  @Override
  public String toString() {
    return "DataNode{" +
      "data=" + data +
      ", localName='" + dnRegistration.getName() + "'" +
      ", storageID='" + dnRegistration.getStorageID() + "'" +
      ", xmitsInProgress=" + xmitsInProgress.get() +
      "}";
  }

  private static void printUsage() {
    System.err.println("Usage: java DataNode");
    System.err.println("           [-rollback]");
  }

  /**
   * Parse and verify command line arguments and set configuration parameters.
   *
   * @return false if passed arguments are incorrect
   */
  private static boolean parseArguments(String args[],
                                        Configuration conf) {
    int argsLen = (args == null) ? 0 : args.length;
    StartupOption startOpt = StartupOption.REGULAR;
    for(int i=0; i < argsLen; i++) {
      String cmd = args[i];
      if ("-r".equalsIgnoreCase(cmd) || "--rack".equalsIgnoreCase(cmd)) {
        LOG.error("-r, --rack arguments are not supported anymore. RackID " +
            "resolution is handled by the NameNode.");
        System.exit(-1);
      } else if ("-rollback".equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.ROLLBACK;
      } else if ("-regular".equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.REGULAR;
      } else
        return false;
    }
    setStartupOption(conf, startOpt);
    return true;
  }

  private static void setStartupOption(Configuration conf, StartupOption opt) {
    conf.set("dfs.datanode.startup", opt.toString());
  }

  static StartupOption getStartupOption(Configuration conf) {
    return StartupOption.valueOf(conf.get("dfs.datanode.startup",
                                          StartupOption.REGULAR.toString()));
  }

  /**
   * This method arranges for the data node to send the block report at the next heartbeat.
   */
  public void scheduleBlockReport(long delay) {
    if (delay > 0) { // send BR after random delay
      lastBlockReport = System.currentTimeMillis()
                            - ( blockReportInterval - R.nextInt((int)(delay)));
    } else { // send at next heartbeat
      lastBlockReport = lastHeartbeat - blockReportInterval;
    }
    resetBlockReportTime = true; // reset future BRs for randomness
  }

  /**
   * This method is used for testing.
   * Examples are adding and deleting blocks directly.
   * The most common usage will be when the data node's storage is simulated.
   *
   * @return the fsdataset that stores the blocks
   */
  public FSDatasetInterface getFSDataset() {
    return data;
  }

  public static void secureMain(String [] args, SecureResources resources) {
    try {
      StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
      DataNode datanode = createDataNode(args, null, resources);
      if (datanode != null)
        datanode.join();
    } catch (Throwable e) {
      LOG.error(StringUtils.stringifyException(e));
      System.exit(-1);
    } finally {
      // We need to add System.exit here because either shutdown was called or
      // some disk related conditions like volumes tolerated or volumes required
      // condition was not met. Also, in secure mode, control will go to Jsvc and
      // the process hangs without System.exit.
  public static void secureMain(String [] args, SecureResources resources) {
    try {
      StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
      DataNode datanode = createDataNode(args, null, resources);
      if (datanode != null)
        datanode.join();
    } catch (Throwable e) {
      LOG.error(StringUtils.stringifyException(e));
      System.exit(-1);
    } finally {
      // We need to add System.exit here because either shutdown was called or
      // some disk related conditions like volumes tolerated or volumes required
      // condition was not met. Also, in secure mode, control will go to Jsvc and
      // the process hangs without System.exit.
      LOG.info("Exiting Datanode");
      System.exit(0);
    }
  }

  public static void main(String args[]) {
    secureMain(args, null);
  }

  // InterDataNodeProtocol implementation
  /** {@inheritDoc} */
  public BlockMetaDataInfo getBlockMetaDataInfo(Block block
      ) throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("block=" + block);
    }
    Block stored = data.getStoredBlock(block.getBlockId());

    if (stored == null) {
      return null;
    }
    BlockMetaDataInfo info = new BlockMetaDataInfo(stored,
                                 blockScanner.getLastScanTime(stored));
    if (LOG.isDebugEnabled()) {
      LOG.debug("getBlockMetaDataInfo successful block=" + stored +
                " length " + stored.getNumBytes() +
                " genstamp " + stored.getGenerationStamp());
    }

    // paranoia! verify that the contents of the stored block
    // matches the block file on disk.
    data.validateBlockMetadata(stored);
    return info;
  }

  @Override
  public BlockRecoveryInfo startBlockRecovery(Block block) throws IOException {
    return data.startBlockRecovery(block.getBlockId());
  }

  public Daemon recoverBlocks(final Block[] blocks, final DatanodeInfo[][] targets) {
    Daemon d = new Daemon(threadGroup, new Runnable() {
      /** Recover a list of blocks. It is run by the primary datanode. */
      public void run() {
        for(int i = 0; i < blocks.length; i++) {
          try {
            logRecoverBlock("NameNode", blocks[i], targets[i]);
            recoverBlock(blocks[i], false, targets[i], true);
          } catch (IOException e) {
            LOG.warn("recoverBlocks FAILED, blocks[" + i + "]=" + blocks[i], e);
          }
        }
      }
    });
    d.start();
    return d;
  }

  /** {@inheritDoc} */
  public void updateBlock(Block oldblock, Block newblock,
      boolean finalize) throws IOException {
    LOG.info("oldblock=" + oldblock + "(length=" + oldblock.getNumBytes()
        + "), newblock=" + newblock + "(length=" + newblock.getNumBytes()
        + "), datanode=" + dnRegistration.getName());
    data.updateBlock(oldblock, newblock);
    if (finalize) {
      data.finalizeBlockIfNeeded(newblock);
      myMetrics.incrBlocksWritten();
      notifyNamenodeReceivedBlock(newblock, EMPTY_DEL_HINT);
      LOG.info("Received block " + newblock +
                " of size " + newblock.getNumBytes() +
                " as part of lease recovery.");
    }
  }

  /** {@inheritDoc} */
  public long getProtocolVersion(String protocol, long clientVersion
      ) throws IOException {
    if (protocol.equals(InterDatanodeProtocol.class.getName())) {
      return InterDatanodeProtocol.versionID;
    } else if (protocol.equals(ClientDatanodeProtocol.class.getName())) {
      return ClientDatanodeProtocol.versionID;
    }
    throw new IOException("Unknown protocol to " + getClass().getSimpleName()
        + ": " + protocol);
  }

  /** Ensure the authentication method is kerberos */
  private void checkKerberosAuthMethod(String msg) throws IOException {
    // User invoking the call must be same as the datanode user
    if (!UserGroupInformation.isSecurityEnabled()) {
      return;
    }
    if (UserGroupInformation.getCurrentUser().getAuthenticationMethod() !=
        AuthenticationMethod.KERBEROS) {
      throw new AccessControlException("Error in "+msg+". Only "
          + "kerberos based authentication is allowed.");
    }
  }
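  // Local block path lookups bypass the normal streaming read path, so the
  // methods below gate access twice: the caller must have authenticated via
  // Kerberos (checked above) and must match the single configured user that
  // is allowed to resolve local block paths (checked below).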
  private void checkBlockLocalPathAccess() throws IOException {
    checkKerberosAuthMethod("getBlockLocalPathInfo()");
    String currentUser = UserGroupInformation.getCurrentUser().getShortUserName();
    if (!currentUser.equals(this.userWithLocalPathAccess)) {
      throw new AccessControlException(
          "Can't continue with getBlockLocalPathInfo() "
              + "authorization. The user " + currentUser
              + " is not allowed to call getBlockLocalPathInfo");
    }
  }

  @Override
  public BlockLocalPathInfo getBlockLocalPathInfo(Block block,
      Token<BlockTokenIdentifier> token) throws IOException {
    checkBlockLocalPathAccess();
    checkBlockToken(block, token, BlockTokenSecretManager.AccessMode.READ);
    BlockLocalPathInfo info = data.getBlockLocalPathInfo(block);
    if (LOG.isDebugEnabled()) {
      if (info != null) {
        if (LOG.isTraceEnabled()) {
          LOG.trace("getBlockLocalPathInfo successful block=" + block
              + " blockfile " + info.getBlockPath() + " metafile "
              + info.getMetaPath());
        }
      } else {
        if (LOG.isTraceEnabled()) {
          LOG.trace("getBlockLocalPathInfo for block=" + block
              + " returning null");
        }
      }
    }
    myMetrics.incrBlocksGetLocalPathInfo();
    return info;
  }

  private void checkBlockToken(Block block, Token<BlockTokenIdentifier> token,
      AccessMode accessMode) throws IOException {
    if (isBlockTokenEnabled && UserGroupInformation.isSecurityEnabled()) {
      BlockTokenIdentifier id = new BlockTokenIdentifier();
      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
      DataInputStream in = new DataInputStream(buf);
      id.readFields(in);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Got: " + id.toString());
      }
      blockTokenSecretManager.checkAccess(id, null, block, accessMode);
    }
  }

  /** Check block access token for the given access mode */
  private void checkBlockToken(Block block,
      BlockTokenSecretManager.AccessMode accessMode) throws IOException {
    if (isBlockTokenEnabled && UserGroupInformation.isSecurityEnabled()) {
      Set<TokenIdentifier> tokenIds = UserGroupInformation.getCurrentUser()
          .getTokenIdentifiers();
      if (tokenIds.size() != 1) {
        throw new IOException("Can't continue with "
            + "authorization since " + tokenIds.size()
            + " BlockTokenIdentifier " + "is found.");
      }
      for (TokenIdentifier tokenId : tokenIds) {
        BlockTokenIdentifier id = (BlockTokenIdentifier) tokenId;
        if (LOG.isDebugEnabled()) {
          LOG.debug("Got: " + id.toString());
        }
        blockTokenSecretManager.checkAccess(id, null, block, accessMode);
      }
    }
  }

  /** A convenient class used in lease recovery */
  private static class BlockRecord {
    final DatanodeID id;
    final InterDatanodeProtocol datanode;
    final BlockRecoveryInfo info;

    BlockRecord(DatanodeID id, InterDatanodeProtocol datanode,
        BlockRecoveryInfo info) {
      this.id = id;
      this.datanode = datanode;
      this.info = info;
    }

    /** {@inheritDoc} */
    public String toString() {
      return "BlockRecord(info=" + info + " node=" + id + ")";
    }
  }

  /** Recover a block
   * @param keepLength if true, will only recover replicas that have the same length
   * as the block passed in. Otherwise, will calculate the minimum length of the
   * replicas and truncate the rest to that length.
   **/
  private LocatedBlock recoverBlock(Block block, boolean keepLength,
      DatanodeInfo[] targets, boolean closeFile) throws IOException {
    DatanodeID[] datanodeids = (DatanodeID[])targets;

    // If the block is already being recovered, then skip recovering it.
    // This can happen if the namenode and client start recovering the same
    // file at the same time.
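    // The lookup below uses a wildcard generation stamp so that two recovery
    // attempts for the same block id collide even if their generation stamps
    // differ; the second caller gets an IOException instead of starting a
    // duplicate recovery.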
    synchronized (ongoingRecovery) {
      Block tmp = new Block();
      tmp.set(block.getBlockId(), block.getNumBytes(), GenerationStamp.WILDCARD_STAMP);
      if (ongoingRecovery.get(tmp) != null) {
        String msg = "Block " + block + " is already being recovered, " +
                     " ignoring this request to recover it.";
        LOG.info(msg);
        throw new IOException(msg);
      }
      ongoingRecovery.put(block, block);
    }
    try {
      int errorCount = 0;

      // Number of "replicasBeingWritten" in 0.21 parlance - these are replicas
      // on DNs that are still alive from when the write was happening
      int rbwCount = 0;
      // Number of "replicasWaitingRecovery" in 0.21 parlance - these replicas
      // have survived a DN restart, and thus might be truncated (eg if the
      // DN died because of a machine power failure, and when the ext3 journal
      // replayed, it truncated the file)
      int rwrCount = 0;

      List<BlockRecord> blockRecords = new ArrayList<BlockRecord>();
      for(DatanodeID id : datanodeids) {
        try {
          InterDatanodeProtocol datanode = dnRegistration.equals(id)?
              this: DataNode.createInterDataNodeProtocolProxy(id, getConf(),
                  socketTimeout);
          BlockRecoveryInfo info = datanode.startBlockRecovery(block);
          if (info == null) {
            LOG.info("No block metadata found for block " + block +
                " on datanode " + id);
            continue;
          }
          if (info.getBlock().getGenerationStamp() < block.getGenerationStamp()) {
            LOG.info("Only old generation stamp " +
                info.getBlock().getGenerationStamp() + " found on datanode " +
                id + " (needed block=" + block + ")");
            continue;
          }
          blockRecords.add(new BlockRecord(id, datanode, info));

          if (info.wasRecoveredOnStartup()) {
            rwrCount++;
          } else {
            rbwCount++;
          }
        } catch (IOException e) {
          ++errorCount;
          InterDatanodeProtocol.LOG.warn(
              "Failed to getBlockMetaDataInfo for block (=" + block
              + ") from datanode (=" + id + ")", e);
        }
      }

      // If we *only* have replicas from post-DN-restart, then we should
      // include them in determining length. Otherwise they might cause us
      // to truncate too short.
      boolean shouldRecoverRwrs = (rbwCount == 0);

      List<BlockRecord> syncList = new ArrayList<BlockRecord>();
      long minlength = Long.MAX_VALUE;

      for (BlockRecord record : blockRecords) {
        BlockRecoveryInfo info = record.info;
        assert (info != null &&
            info.getBlock().getGenerationStamp() >= block.getGenerationStamp());
        if (!shouldRecoverRwrs && info.wasRecoveredOnStartup()) {
          LOG.info("Not recovering replica " + record + " since it was recovered on "
              + "startup and we have better replicas");
          continue;
        }
        if (keepLength) {
          if (info.getBlock().getNumBytes() == block.getNumBytes()) {
            syncList.add(record);
          }
        } else {
          syncList.add(record);
          if (info.getBlock().getNumBytes() < minlength) {
            minlength = info.getBlock().getNumBytes();
          }
        }
      }

      if (syncList.isEmpty() && errorCount > 0) {
        throw new IOException("All datanodes failed: block=" + block
            + ", datanodeids=" + Arrays.asList(datanodeids));
      }
      if (!keepLength) {
        block.setNumBytes(minlength);
      }
      return syncBlock(block, syncList, targets, closeFile);
    } finally {
      synchronized (ongoingRecovery) {
        ongoingRecovery.remove(block);
      }
    }
  }
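  // syncBlock drives the actual synchronization for recoverBlock(): it asks
  // the NameNode for a fresh generation stamp, pushes the agreed-upon block
  // (new stamp, possibly truncated length) to every replica in syncList via
  // updateBlock(), and then commits the result back to the NameNode with
  // commitBlockSynchronization(). If no replica accepts the update, the
  // recovery fails with an IOException.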
  /** Block synchronization */
  private LocatedBlock syncBlock(Block block, List<BlockRecord> syncList,
      DatanodeInfo[] targets, boolean closeFile) throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("block=" + block + ", (length=" + block.getNumBytes()
          + "), syncList=" + syncList + ", closeFile=" + closeFile);
    }

    //syncList.isEmpty() means that all datanodes do not have the block
    //so the block can be deleted.
    if (syncList.isEmpty()) {
      namenode.commitBlockSynchronization(block, 0, 0, closeFile, true,
          DatanodeID.EMPTY_ARRAY);
      //always return a new access token even if everything else stays the same
      LocatedBlock b = new LocatedBlock(block, targets);
      if (isBlockTokenEnabled) {
        b.setBlockToken(blockTokenSecretManager.generateToken(null, b.getBlock(),
            EnumSet.of(BlockTokenSecretManager.AccessMode.WRITE)));
      }
      return b;
    }

    List<DatanodeID> successList = new ArrayList<DatanodeID>();

    long generationstamp = namenode.nextGenerationStamp(block, closeFile);
    Block newblock = new Block(block.getBlockId(), block.getNumBytes(), generationstamp);

    for(BlockRecord r : syncList) {
      try {
        r.datanode.updateBlock(r.info.getBlock(), newblock, closeFile);
        successList.add(r.id);
      } catch (IOException e) {
        InterDatanodeProtocol.LOG.warn("Failed to updateBlock (newblock="
            + newblock + ", datanode=" + r.id + ")", e);
      }
    }

    if (!successList.isEmpty()) {
      DatanodeID[] nlist = successList.toArray(new DatanodeID[successList.size()]);

      namenode.commitBlockSynchronization(block,
          newblock.getGenerationStamp(), newblock.getNumBytes(), closeFile, false,
          nlist);
      DatanodeInfo[] info = new DatanodeInfo[nlist.length];
      for (int i = 0; i < nlist.length; i++) {
        info[i] = new DatanodeInfo(nlist[i]);
      }
      LocatedBlock b = new LocatedBlock(newblock, info); // success
      // should have used client ID to generate access token, but since
      // owner ID is not checked, we simply pass null for now.
      if (isBlockTokenEnabled) {
        b.setBlockToken(blockTokenSecretManager.generateToken(null, b.getBlock(),
            EnumSet.of(BlockTokenSecretManager.AccessMode.WRITE)));
      }
      return b;
    }

    //failed
    StringBuilder b = new StringBuilder();
    for(BlockRecord r : syncList) {
      b.append("\n " + r.id);
    }
    throw new IOException("Cannot recover " + block + ", none of these "
        + syncList.size() + " datanodes success {" + b + "\n}");
  }

  // ClientDataNodeProtocol implementation
  /** {@inheritDoc} */
  public LocatedBlock recoverBlock(Block block, boolean keepLength,
      DatanodeInfo[] targets) throws IOException {
    logRecoverBlock("Client", block, targets);
    checkBlockToken(block, BlockTokenSecretManager.AccessMode.WRITE);
    return recoverBlock(block, keepLength, targets, false);
  }

  /** {@inheritDoc} */
  public Block getBlockInfo(Block block) throws IOException {
    checkBlockToken(block, BlockTokenSecretManager.AccessMode.READ);
    Block stored = data.getStoredBlock(block.getBlockId());
    return stored;
  }

  private static void logRecoverBlock(String who,
      Block block, DatanodeID[] targets) {
    StringBuilder msg = new StringBuilder(targets[0].getName());
    for (int i = 1; i < targets.length; i++) {
      msg.append(", " + targets[i].getName());
    }
    LOG.info(who + " calls recoverBlock(block=" + block
        + ", targets=[" + msg + "])");
  }

  public static InetSocketAddress getStreamingAddr(Configuration conf) {
    String address =
      NetUtils.getServerAddress(conf,
                                "dfs.datanode.bindAddress",
                                "dfs.datanode.port",
                                "dfs.datanode.address");
    return NetUtils.createSocketAddr(address);
  }

  @Override // DataNodeMXBean
  public String getHostName() {
    return this.machineName;
  }

  @Override // DataNodeMXBean
  public String getVersion() {
    return VersionInfo.getVersion();
  }

  @Override // DataNodeMXBean
  public String getRpcPort(){
    InetSocketAddress ipcAddr = NetUtils.createSocketAddr(
        this.getConf().get("dfs.datanode.ipc.address"));
    return Integer.toString(ipcAddr.getPort());
  }

  @Override // DataNodeMXBean
  public String getHttpPort(){
    return this.getConf().get("dfs.datanode.info.port");
  }

  @Override // DataNodeMXBean
  public String getNamenodeAddress(){
    return nameNodeAddr.getHostName();
  }
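  // The JMX attribute below serializes per-volume usage as a JSON map keyed
  // by volume directory. With two hypothetical volumes the output looks
  // roughly like (paths and byte counts are made-up examples):
  //   {"/data/1/dfs/dn":{"usedSpace":1024,"freeSpace":2048,"reservedSpace":0},
  //    "/data/2/dfs/dn":{"usedSpace":4096,"freeSpace":512,"reservedSpace":0}}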
  /**
   * Returned information is a JSON representation of a map with
   * volume name as the key and value is a map of volume attribute
   * keys to its values
   */
  @Override // DataNodeMXBean
  public String getVolumeInfo() {
    final Map<String, Object> info = new HashMap<String, Object>();
    Collection<VolumeInfo> volumes = ((FSDataset)this.data).getVolumeInfo();
    for (VolumeInfo v : volumes) {
      final Map<String, Object> innerInfo = new HashMap<String, Object>();
      innerInfo.put("usedSpace", v.usedSpace);
      innerInfo.put("freeSpace", v.freeSpace);
      innerInfo.put("reservedSpace", v.reservedSpace);
      info.put(v.directory, innerInfo);
    }
    return JSON.toString(info);
  }

  /**
   * Get current value of the max balancer bandwidth in bytes per second.
   *
   * @return Balancer bandwidth in bytes per second for this datanode.
   */
  public Long getBalancerBandwidth() {
    DataXceiverServer dxcs =
        (DataXceiverServer) this.dataXceiverServer.getRunnable();
    return dxcs.balanceThrottler.getBandwidth();
  }
}




