/**
 * Copyright 2014 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.minicluster;

import com.google.common.base.Preconditions;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An HDFS minicluster service implementation.
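 *
 * <p>
 * Typical usage is through the {@code MiniCluster} builder rather than by
 * instantiating this class directly. The snippet below is an illustrative
 * sketch only: the {@code workDir}, {@code clean} and {@code addService}
 * builder methods are assumptions about the {@link MiniCluster} API, which is
 * not defined in this file.
 *
 * <pre>{@code
 * MiniCluster cluster = new MiniCluster.Builder()
 *     .workDir("/tmp/kite-minicluster")   // local scratch directory
 *     .clean(true)                        // wipe any previous DFS data
 *     .addService(HdfsService.class)      // register this HDFS service
 *     .build();
 * cluster.start();
 * // ... run code against the embedded HDFS ...
 * cluster.stop();
 * }</pre>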
 */
public class HdfsService implements Service {

  private static final Logger logger = LoggerFactory
      .getLogger(HdfsService.class);

  /**
   * Service registration for MiniCluster factory
   */
  static {
    MiniCluster.registerService(HdfsService.class);
  }

  /**
   * Service configuration keys
   */
  public static final String NAMENODE_HTTP_PORT = "hdfs-namenode-http-port";
  public static final String DATANODE_PORT = "hdfs-datanode-port";
  public static final String DATANODE_IPC_PORT = "hdfs-datanode-ipc-port";
  public static final String DATANODE_HTTP_PORT = "hdfs-datanode-http-port";
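
  // Illustrative sketch of overriding the default ports via these keys. The
  // set(String, String) call is an assumption about the ServiceConfig API,
  // which is not defined in this file; only get/contains/getHadoopConf are
  // used below.
  //
  //   ServiceConfig serviceConfig = ...;
  //   serviceConfig.set(HdfsService.NAMENODE_HTTP_PORT, "50071");
  //   serviceConfig.set(HdfsService.DATANODE_PORT, "50011");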

  /**
   * Configuration settings
   */
  private Configuration hadoopConf;
  private String workDir;
  private String bindIP = "127.0.0.1";
  private int namenodeRpcPort = 8020;
  private int namenodeHttpPort = 50070;
  private int datanodePort = 50010;
  private int datanodeIpcPort = 50020;
  private int datanodeHttpPort = 50075;
  private boolean clean = false;

  /**
   * Embedded HDFS cluster
   */
  private MiniDFSCluster miniDfsCluster;

  public HdfsService() {
  }

  @Override
  public void configure(ServiceConfig serviceConfig) {
    this.workDir = serviceConfig.get(MiniCluster.WORK_DIR_KEY);
    if (serviceConfig.contains(MiniCluster.BIND_IP_KEY)) {
      bindIP = serviceConfig.get(MiniCluster.BIND_IP_KEY);
    }
    if (serviceConfig.contains(MiniCluster.CLEAN_KEY)) {
      clean = Boolean.parseBoolean(serviceConfig.get(MiniCluster.CLEAN_KEY));
    }
    if (serviceConfig.contains(MiniCluster.NAMENODE_RPC_PORT)) {
      namenodeRpcPort = Integer.parseInt(serviceConfig
          .get(MiniCluster.NAMENODE_RPC_PORT));
    }
    if (serviceConfig.contains(NAMENODE_HTTP_PORT)) {
      namenodeHttpPort = Integer
          .parseInt(serviceConfig.get(NAMENODE_HTTP_PORT));
    }
    if (serviceConfig.contains(DATANODE_PORT)) {
      datanodePort = Integer.parseInt(serviceConfig.get(DATANODE_PORT));
    }
    if (serviceConfig.contains(DATANODE_IPC_PORT)) {
      datanodeIpcPort = Integer.parseInt(serviceConfig.get(DATANODE_IPC_PORT));
    }
    if (serviceConfig.contains(DATANODE_HTTP_PORT)) {
      datanodeHttpPort = Integer
          .parseInt(serviceConfig.get(DATANODE_HTTP_PORT));
    }
    hadoopConf = serviceConfig.getHadoopConf();
  }

  @Override
  public Configuration getHadoopConf() {
    return hadoopConf;
  }

  @Override
  public void start() throws IOException {
    Preconditions.checkState(workDir != null,
        "The work dir must be set before starting the cluster.");

    if (hadoopConf == null) {
      hadoopConf = new Configuration();
    }

    // If clean is set, remove the local DFS data directory so we start fresh.
    String localDFSLocation = getDFSLocation(workDir);
    if (clean) {
      logger.info("Cleaning HDFS cluster data at: " + localDFSLocation
          + " and starting fresh.");
      File file = new File(localDFSLocation);
      FileUtils.deleteDirectory(file);
    }

    // Configure and start the HDFS cluster
    boolean format = shouldFormatDFSCluster(localDFSLocation, clean);
    hadoopConf = configureDFSCluster(hadoopConf, localDFSLocation, bindIP,
        namenodeRpcPort, namenodeHttpPort, datanodePort, datanodeIpcPort,
        datanodeHttpPort);
    miniDfsCluster = new MiniDFSCluster.Builder(hadoopConf).numDataNodes(1)
        .format(format).checkDataNodeAddrConfig(true)
        .checkDataNodeHostConfig(true).build();
    logger.info("HDFS Minicluster service started.");
  }

  @Override
  public void stop() throws IOException {
    miniDfsCluster.shutdown();
    logger.info("HDFS Minicluster service shut down.");
    miniDfsCluster = null;
    hadoopConf = null;
  }

  /**
   * Get the location on the local FS where we store the HDFS data.
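   * For example, a work dir of {@code /tmp/kite-minicluster} (an illustrative
   * path) yields {@code /tmp/kite-minicluster/dfs}.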
   * 
   * @param baseFsLocation
   *          The base location on the local filesystem where we have write
   *          access to create directories.
   * @return The location for HDFS data.
   */
  private static String getDFSLocation(String baseFsLocation) {
    return baseFsLocation + Path.SEPARATOR + "dfs";
  }

  /**
   * Returns true if we should format the DFS cluster. We'll format if clean is
   * true, or if the local DFS location does not exist yet.
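   * For example, if the local DFS location already exists from a previous run
   * and clean is false, the existing data is reused and no format occurs.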
   * 
   * @param localDFSLocation
   *          The location on the local FS to hold the HDFS metadata and block
   *          data
   * @param clean
   *          Specifies if we want to start a clean cluster
   * @return Returns true if we should format the DFS cluster, otherwise false
   */
  private static boolean shouldFormatDFSCluster(String localDFSLocation,
      boolean clean) {
    boolean format = true;
    File f = new File(localDFSLocation);
    if (f.exists() && f.isDirectory() && !clean) {
      format = false;
    }
    return format;
  }

  /**
   * Configure the DFS Cluster before launching it.
   * 
   * @param config
   *          The already created Hadoop configuration we'll further configure
   *          for HDFS
   * @param localDFSLocation
   *          The location on the local filesystem where cluster data is stored
   * @param bindIP
   *          An IP address we want to force the datanode and namenode to bind
   *          to.
   * @param namenodeRpcPort
   *          The namenode RPC port
   * @param namenodeHttpPort
   *          The namenode HTTP port
   * @param datanodePort
   *          The datanode data transfer port
   * @param datanodeIpcPort
   *          The datanode IPC port
   * @param datanodeHttpPort
   *          The datanode HTTP port
   * @return The updated Configuration object.
   */
  private static Configuration configureDFSCluster(Configuration config,
      String localDFSLocation, String bindIP, int namenodeRpcPort,
      int namenodeHttpPort, int datanodePort, int datanodeIpcPort,
      int datanodeHttpPort) {

    logger.info("HDFS force binding to ip: " + bindIP);
    config = new KiteCompatibleConfiguration(config, bindIP, namenodeRpcPort,
        namenodeHttpPort);
    config.set(DFSConfigKeys.FS_DEFAULT_NAME_KEY, "hdfs://" + bindIP + ":"
        + namenodeRpcPort);
    config.set(DFSConfigKeys.DFS_DATANODE_ADDRESS_KEY, bindIP + ":"
        + datanodePort);
    config.set(DFSConfigKeys.DFS_DATANODE_IPC_ADDRESS_KEY, bindIP + ":"
        + datanodeIpcPort);
    config.set(DFSConfigKeys.DFS_DATANODE_HTTP_ADDRESS_KEY, bindIP + ":"
        + datanodeHttpPort);
    // When a datanode registers with the namenode, the namenode does a
    // hostname check of the datanode, which will fail on OpenShift due to
    // reverse DNS issues with the internal IP addresses. This config disables
    // that check and allows a datanode to register regardless.
    config.setBoolean("dfs.namenode.datanode.registration.ip-hostname-check",
        false);
    config.set("hdfs.minidfs.basedir", localDFSLocation);
    // allow current user to impersonate others
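    // For example, when the JVM runs as user "jenkins" (an illustrative name
    // only), the two calls below set hadoop.proxyuser.jenkins.groups=* and
    // hadoop.proxyuser.jenkins.hosts=*.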
    String user = System.getProperty("user.name");
    config.set("hadoop.proxyuser." + user + ".groups", "*");
    config.set("hadoop.proxyuser." + user + ".hosts", "*");
    return config;
  }

  /**
   * A Hadoop Configuration class that prevents the Namenode RPC and Namenode
   * HTTP bind addresses from being overridden. The mini DFS cluster normally
   * forces these bind addresses to 127.0.0.1, which does not work in
   * environments (such as OpenShift) where you cannot bind to 127.0.0.1, so
   * this class pins them to the configured bind address and ignores later
   * attempts to override them.
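   *
   * <p>
   * Illustrative sketch (the bind address {@code 10.0.0.5} is only an example
   * value):
   *
   * <pre>{@code
   * Configuration conf = new KiteCompatibleConfiguration(
   *     new Configuration(), "10.0.0.5", 8020, 50070);
   * // A later attempt to rebind the namenode to 127.0.0.1 is ignored:
   * conf.set(DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, "127.0.0.1:8020");
   * // conf still reports 10.0.0.5:8020 for the namenode RPC address.
   * }</pre>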
   */
  private static class KiteCompatibleConfiguration extends Configuration {

    public KiteCompatibleConfiguration(Configuration config,
        String bindAddress, int namenodeRpcPort, int namenodeHttpPort) {
      super(config);
      super.set(DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, bindAddress + ":"
          + namenodeRpcPort);
      super.set(DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY, bindAddress + ":"
          + namenodeHttpPort);
    }

    @Override
    public void set(String key, String value) {
      if (!key.equals(DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY)
          && !key.equals(DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY)) {
        super.set(key, value);
      }
    }
  }

  @Override
  public List<Class<? extends Service>> dependencies() {
    // no dependencies
    return null;
  }
}