/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred.uploader;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.SystemErasureCodingPolicies;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.NotLinkException;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

import static org.apache.hadoop.fs.FileSystem.FS_DEFAULT_NAME_KEY;

/**
 * Upload a MapReduce framework tarball to HDFS.
 * Usage:
 * sudo -u mapred mapred frameworkuploader -fs hdfs://`hostname`:8020 -target
 * /tmp/upload.tar.gz#mr-framework
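 *
 * On success the tool prints the configuration values to use; for the
 * usage above the output would be (illustrative):
 *
 * Suggested mapreduce.application.framework.path
 * hdfs://`hostname`:8020/tmp/upload.tar.gz#mr-framework
 * Suggested classpath $PWD/mr-framework/*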
 */
public class FrameworkUploader implements Runnable {
  private static final Pattern VAR_SUBBER =
      Pattern.compile(Shell.getEnvironmentVariableRegex());
  private static final Logger LOG =
      LoggerFactory.getLogger(FrameworkUploader.class);
  private Configuration conf = new Configuration();

  // Minimal required permissions for the uploaded framework
  private static final FsPermission FRAMEWORK_PERMISSION =
      new FsPermission(0644);

  @VisibleForTesting
  String input = null;
  @VisibleForTesting
  String whitelist = null;
  @VisibleForTesting
  String blacklist = null;
  @VisibleForTesting
  String target = null;
  @VisibleForTesting
  Path targetPath = null;
  @VisibleForTesting
  short initialReplication = 3;
  @VisibleForTesting
  short finalReplication = 10;
  @VisibleForTesting
  short acceptableReplication = 9;
  @VisibleForTesting
  int timeout = 10;
  private boolean ignoreSymlink = false;

  @VisibleForTesting
  Set<String> filteredInputFiles = new HashSet<>();
  @VisibleForTesting
  List<Pattern> whitelistedFiles = new LinkedList<>();
  @VisibleForTesting
  List<Pattern> blacklistedFiles = new LinkedList<>();

  private OutputStream targetStream = null;
  private FSDataOutputStream fsDataStream = null;
  private String alias = null;

  @VisibleForTesting
  void setConf(Configuration configuration) {
    conf = configuration;
  }

  private void printHelp(Options options) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("mapred frameworkuploader", options);
  }

  public void run() {
    try {
      collectPackages();
      buildPackage();
      LOG.info("Uploaded " + target);
      System.out.println("Suggested mapreduce.application.framework.path " +
          target);
      LOG.info(
          "Suggested mapreduce.application.classpath $PWD/" + alias + "/*");
      System.out.println("Suggested classpath $PWD/" + alias + "/*");
    } catch (UploaderException|IOException|InterruptedException e) {
      LOG.error("Error in execution " + e.getMessage());
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }

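  /**
   * Collect the set of jars to package. Each classpath entry in
   * {@code input} is expanded with {@link #expandEnvironmentVariables};
   * entries ending in {@code *} are treated as directories whose jar
   * files are enumerated, other entries must be jar files themselves.
   * Candidates are then filtered through {@link #addJar}.
   * @throws UploaderException on filtering errors
   */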
  @VisibleForTesting
  void collectPackages() throws UploaderException {
    parseLists();
    String[] list = StringUtils.split(input, File.pathSeparatorChar);
    for (String item : list) {
      LOG.info("Original source " + item);
      String expanded = expandEnvironmentVariables(item, System.getenv());
      LOG.info("Expanded source " + expanded);
      if (expanded.endsWith("*")) {
        File path = new File(expanded.substring(0, expanded.length() - 1));
        if (path.isDirectory()) {
          File[] files = path.listFiles();
          if (files != null) {
            for (File jar : files) {
              if (!jar.isDirectory()) {
                addJar(jar);
              } else {
                LOG.info("Ignored " + jar + " because it is a directory");
              }
            }
          } else {
            LOG.warn("Could not list directory " + path);
          }
        } else {
          LOG.warn("Ignored " + expanded + ". It is not a directory");
        }
      } else if (expanded.endsWith(".jar")) {
        File jarFile = new File(expanded);
        addJar(jarFile);
      } else if (!expanded.isEmpty()) {
        LOG.warn("Ignored " + expanded + " only jars are supported");
      }
    }
  }

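  /**
   * Open the target tarball for writing. The target may carry an alias
   * after {@code #} (for example
   * {@code /usr/lib/mr-framework.tar.gz#mr-framework}) that names the
   * directory the tarball is localized under. On HDFS the file is
   * created with the initial replication count and with erasure coding
   * disabled, and each parent directory is checked for EXECUTE
   * permissions so that every user can reach the uploaded framework.
   * @throws IOException on I/O errors
   * @throws UploaderException if the target path is not hdfs or local
   */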
  @VisibleForTesting
  void beginUpload() throws IOException, UploaderException {
    if (targetStream == null) {
      validateTargetPath();
      int lastIndex = target.indexOf('#');
      targetPath =
          new Path(
              target.substring(
                  0, lastIndex == -1 ? target.length() : lastIndex));
      alias = lastIndex != -1 ?
          target.substring(lastIndex + 1) :
          targetPath.getName();
      LOG.info("Target " + targetPath);
      FileSystem fileSystem = targetPath.getFileSystem(conf);

      targetStream = null;
      if (fileSystem instanceof DistributedFileSystem) {
        LOG.info("Set replication to " +
            initialReplication + " for path: " + targetPath);
        LOG.info("Disabling Erasure Coding for path: " + targetPath);
        DistributedFileSystem dfs = (DistributedFileSystem)fileSystem;
        DistributedFileSystem.HdfsDataOutputStreamBuilder builder =
            dfs.createFile(targetPath)
            .overwrite(true)
            .ecPolicyName(
                SystemErasureCodingPolicies.getReplicationPolicy().getName());
        if (initialReplication > 0) {
          builder.replication(initialReplication);
        }
        targetStream = builder.build();
      } else {
        LOG.warn("Cannot set replication to " +
            initialReplication + " for path: " + targetPath +
            " on a non-distributed fileystem " +
            fileSystem.getClass().getName());
      }
      if (targetStream == null) {
        targetStream = fileSystem.create(targetPath, true);
      }

      if (!FRAMEWORK_PERMISSION.equals(
          FRAMEWORK_PERMISSION.applyUMask(FsPermission.getUMask(conf)))) {
        LOG.info("Modifying permissions to " + FRAMEWORK_PERMISSION);
        fileSystem.setPermission(targetPath, FRAMEWORK_PERMISSION);
      }

      fsDataStream = (FSDataOutputStream) targetStream;
      if (targetPath.getName().endsWith("gz") ||
          targetPath.getName().endsWith("tgz")) {
        LOG.info("Creating GZip");
        targetStream = new GZIPOutputStream(targetStream);
      }

      Path current = targetPath.getParent();
      // Walk the path backwards to verify that the uploaded
      // framework is accessible for all users
      while (current != null) {
        try {
          FileStatus fstat = fileSystem.getFileStatus(current);
          FsPermission perm = fstat.getPermission();

          // Note: READ is not necessary to enter the directory.
          // We need to check only the EXECUTE flag
          boolean userCanEnter = perm.getUserAction()
              .implies(FsAction.EXECUTE);
          boolean groupCanEnter = perm.getGroupAction()
              .implies(FsAction.EXECUTE);
          boolean othersCanEnter = perm.getOtherAction()
              .implies(FsAction.EXECUTE);

          if (!userCanEnter || !groupCanEnter || !othersCanEnter) {
            LOG.warn("Path " + current + " is not accessible"
                + " for all users. Current permissions are: " + perm);
            LOG.warn("Please set EXECUTE permissions on this directory");
          }
          current = current.getParent();
        } catch (AccessControlException e) {
          LOG.warn("Path " + current + " is not accessible,"
              + " cannot retrieve permissions");
          LOG.warn("Please set EXECUTE permissions on this directory");
          LOG.debug("Stack trace", e);
          break;
        }
      }
    }
  }

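  /**
   * Determine the replica count of the least replicated block of the
   * target file. The tarball is only as available as its least
   * replicated block, so this is the number compared against the
   * acceptable replication count.
   * @return the minimum replica count across all blocks
   * @throws IOException on I/O errors
   */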
  private long getSmallestReplicatedBlockCount()
      throws IOException {
    FileSystem fileSystem = targetPath.getFileSystem(conf);
    FileStatus status = fileSystem.getFileStatus(targetPath);
    long length = status.getLen();
    HashMap<Long, Integer> blockCount = new HashMap<>();

    // Start with 0s for each offset
    for (long offset = 0; offset < length; offset += status.getBlockSize()) {
      blockCount.put(offset, 0);
    }

    // Count blocks
    BlockLocation[] locations = fileSystem.getFileBlockLocations(
        targetPath, 0, length);
    for (BlockLocation location : locations) {
      final int replicas = location.getHosts().length;
      blockCount.compute(
          location.getOffset(),
          (key, value) -> value == null ? 0 : value + replicas);
    }

    // Print out the results
    for (long offset = 0; offset < length; offset += status.getBlockSize()) {
      LOG.info(String.format(
          "Replication counts offset:%d blocks:%d",
          offset, blockCount.get(offset)));
    }

    return Collections.min(blockCount.values());
  }

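  /**
   * Finish the upload: raise the replication of the target to the final
   * count and poll once per second until the least replicated block
   * reaches the acceptable count or the timeout (in seconds) elapses.
   * A timeout is logged as an error but does not fail the upload.
   */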
  private void endUpload()
      throws IOException, InterruptedException {
    FileSystem fileSystem = targetPath.getFileSystem(conf);
    if (fileSystem instanceof DistributedFileSystem) {
      fileSystem.setReplication(targetPath, finalReplication);
      LOG.info("Set replication to " +
          finalReplication + " for path: " + targetPath);
      long startTime = System.currentTimeMillis();
      long endTime = startTime;
      long currentReplication = 0;
      while (endTime - startTime < timeout * 1000 &&
           currentReplication < acceptableReplication) {
        Thread.sleep(1000);
        endTime = System.currentTimeMillis();
        currentReplication = getSmallestReplicatedBlockCount();
      }
      if (endTime - startTime >= timeout * 1000) {
        LOG.error(String.format(
            "Timed out after %d seconds while waiting for acceptable" +
                " replication of %d (current replication is %d)",
            timeout, acceptableReplication, currentReplication));
      }
    } else {
      LOG.info("Cannot set replication to " +
          finalReplication + " for path: " + targetPath +
          " on a non-distributed fileystem " +
          fileSystem.getClass().getName());
    }
  }

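  /**
   * Build and upload the tarball: open the target stream, add each
   * collected jar as a tar entry, flush, and wait for the replication
   * to settle. The stream is gzip-compressed when the target name ends
   * in gz or tgz.
   */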
  @VisibleForTesting
  void buildPackage()
      throws IOException, UploaderException, InterruptedException {
    beginUpload();
    LOG.info("Compressing tarball");
    try (TarArchiveOutputStream out = new TarArchiveOutputStream(
        targetStream)) {
      for (String fullPath : filteredInputFiles) {
        LOG.info("Adding " + fullPath);
        File file = new File(fullPath);
        try (FileInputStream inputStream = new FileInputStream(file)) {
          ArchiveEntry entry = out.createArchiveEntry(file, file.getName());
          out.putArchiveEntry(entry);
          IOUtils.copyBytes(inputStream, out, 1024 * 1024);
          out.closeArchiveEntry();
        }
      }

      // Necessary to see proper replication counts in endUpload()
      fsDataStream.hflush();

      endUpload();
    } finally {
      if (targetStream != null) {
        targetStream.close();
      }
    }
  }

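  /**
   * Compile the whitelist and blacklist into anchored regular
   * expressions, expanding environment variable references first. Each
   * resulting pattern is matched against the absolute path of a
   * candidate jar in {@link #addJar}.
   * @throws UploaderException if a referenced variable is not set
   */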
  private void parseLists() throws UploaderException {
    Map<String, String> env = System.getenv();
    for (Map.Entry<String, String> item : env.entrySet()) {
      LOG.info("Environment " + item.getKey() + " " + item.getValue());
    }
    String[] whiteListItems = StringUtils.split(whitelist);
    for (String pattern : whiteListItems) {
      String expandedPattern =
          expandEnvironmentVariables(pattern, env);
      Pattern compiledPattern =
          Pattern.compile("^" + expandedPattern + "$");
      LOG.info("Whitelisted " + compiledPattern.toString());
      whitelistedFiles.add(compiledPattern);
    }
    String[] blacklistItems = StringUtils.split(blacklist);
    for (String pattern : blacklistItems) {
      String expandedPattern =
          expandEnvironmentVariables(pattern, env);
      Pattern compiledPattern =
          Pattern.compile("^" + expandedPattern + "$");
      LOG.info("Blacklisted " + compiledPattern.toString());
      blacklistedFiles.add(compiledPattern);
    }
  }

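  /**
   * Expand environment variable references in a string, repeating the
   * substitution pass until no references remain, so that variables
   * whose values contain further references are fully resolved.
   *
   * <p>Illustrative example on a Unix-like system: with
   * {@code HADOOP_HOME=/opt/hadoop} in the environment, the input
   * {@code $HADOOP_HOME/share/hadoop/mapreduce/*} expands to
   * {@code /opt/hadoop/share/hadoop/mapreduce/*}.
   * @param innerInput the string to expand
   * @param env the environment to resolve references against
   * @return the fully expanded string
   * @throws UploaderException if a referenced variable is not set
   */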
  @VisibleForTesting
  String expandEnvironmentVariables(String innerInput, Map<String, String> env)
      throws UploaderException {
    boolean found;
    do {
      found = false;
      Matcher matcher = VAR_SUBBER.matcher(innerInput);
      StringBuffer stringBuffer = new StringBuffer();
      while (matcher.find()) {
        found = true;
        String var = matcher.group(1);
        String replace = env.get(var);
        // A reference to an unset variable cannot be resolved and would
        // yield a broken classpath entry, so fail fast
        if (replace == null) {
          throw new UploaderException("Environment variable does not exist " +
              var);
        }
        matcher.appendReplacement(
            stringBuffer, Matcher.quoteReplacement(replace));
      }
      matcher.appendTail(stringBuffer);
      innerInput = stringBuffer.toString();
    } while (found);
    return innerInput;
  }

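  /**
   * Consider a jar for inclusion. A jar is packaged only if its
   * absolute path matches a whitelist pattern and no blacklist pattern;
   * with {@code -nosymlink}, symlinks pointing into their own directory
   * are excluded as well.
   * @param jar the candidate jar file
   * @throws UploaderException if the same jar path is added twice
   */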
  private void addJar(File jar) throws UploaderException {
    boolean found = false;
    if (!jar.getName().endsWith(".jar")) {
      LOG.info("Ignored non-jar " + jar.getAbsolutePath());
      return;
    }
    for (Pattern pattern : whitelistedFiles) {
      Matcher matcher = pattern.matcher(jar.getAbsolutePath());
      if (matcher.matches()) {
        LOG.info("Whitelisted " + jar.getAbsolutePath());
        found = true;
        break;
      }
    }
    boolean excluded = false;
    for (Pattern pattern : blacklistedFiles) {
      Matcher matcher = pattern.matcher(jar.getAbsolutePath());
      if (matcher.matches()) {
        LOG.info("Blacklisted " + jar.getAbsolutePath());
        excluded = true;
        break;
      }
    }
    if (ignoreSymlink && !excluded) {
      excluded = checkSymlink(jar);
    }
    if (found && !excluded) {
      if (!filteredInputFiles.add(jar.getAbsolutePath())) {
        throw new UploaderException("Duplicate jar " + jar.getAbsolutePath());
      }
    }
    }
    if (!found) {
      LOG.info("Ignored " + jar.getAbsolutePath() + " because it is missing " +
          "from the whitelist");
    } else if (excluded) {
      LOG.info("Ignored " + jar.getAbsolutePath() + " because it is on " +
          "the the blacklist");
    }
  }

  /**
   * Check whether the file is a symlink pointing to another file in the
   * same directory; such a link would add the same jar to the tarball
   * twice under two names.
   * @param jar the file to check
   * @return true if the file is a same-directory symlink and should be
   *         ignored
   */
  @VisibleForTesting
  boolean checkSymlink(File jar) {
    if (Files.isSymbolicLink(jar.toPath())) {
      try {
        java.nio.file.Path link = Files.readSymbolicLink(jar.toPath());
        java.nio.file.Path jarPath = Paths.get(jar.getAbsolutePath());
        String linkString = link.toString();
        java.nio.file.Path jarParent = jarPath.getParent();
        java.nio.file.Path linkPath =
            jarParent == null ? null : jarParent.resolve(linkString);
        java.nio.file.Path linkPathParent =
            linkPath == null ? null : linkPath.getParent();
        java.nio.file.Path normalizedLinkPath =
            linkPathParent == null ? null : linkPathParent.normalize();
        if (normalizedLinkPath != null && jarParent.normalize().equals(
            normalizedLinkPath)) {
          LOG.info(String.format("Ignoring same directory link %s to %s",
              jarPath.toString(), link.toString()));
          return true;
        }
      } catch (NotLinkException ex) {
        LOG.debug("Not a link {}", jar);
      } catch (IOException ex) {
        LOG.warn("Cannot read symbolic link on", jar);
      }
    }
    return false;
  }

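  /**
   * Verify that the target refers to an hdfs or local (file) URI;
   * other schemes are rejected.
   * @throws UploaderException if the target has an unsupported scheme
   */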
  private void validateTargetPath() throws UploaderException {
    if (!target.startsWith("hdfs:/") &&
        !target.startsWith("file:/")) {
      throw new UploaderException("Target path is not hdfs or local " + target);
    }
  }

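  /**
   * Parse the command line. If {@code -fs} is omitted and the target is
   * not a full URI, the default file system from the configuration
   * ({@code fs.defaultFS}) is prepended to build an absolute target.
   * @param args the command line arguments
   * @return true if the upload should proceed, false if help was
   *         printed or the arguments were invalid
   * @throws IOException on parser errors
   */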
  @VisibleForTesting
  boolean parseArguments(String[] args) throws IOException {
    Options opts = new Options();
    opts.addOption(OptionBuilder.create("h"));
    opts.addOption(OptionBuilder.create("help"));
    opts.addOption(OptionBuilder
        .withDescription("Input class path. Defaults to the default classpath.")
        .hasArg().create("input"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Regex specifying the full path of jars to include in the" +
                " framework tarball. Default is a hardcoded set of jars" +
                " considered necessary to include")
        .hasArg().create("whitelist"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Regex specifying the full path of jars to exclude in the" +
                " framework tarball. Default is a hardcoded set of jars" +
                " considered unnecessary to include")
        .hasArg().create("blacklist"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Target file system to upload to." +
            " Example: hdfs://foo.com:8020")
        .hasArg().create("fs"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Target file to upload to with a reference name." +
                " Example: /usr/mr-framework.tar.gz#mr-framework")
        .hasArg().create("target"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Desired initial replication count. Default 3.")
        .hasArg().create("initialReplication"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Desired final replication count. Default 10.")
        .hasArg().create("finalReplication"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Desired acceptable replication count. Default 9.")
        .hasArg().create("acceptableReplication"));
    opts.addOption(OptionBuilder
        .withDescription(
            "Desired timeout for the acceptable" +
                " replication in seconds. Default 10")
        .hasArg().create("timeout"));
    opts.addOption(OptionBuilder
        .withDescription("Ignore symlinks into the same directory")
        .create("nosymlink"));
    GenericOptionsParser parser = new GenericOptionsParser(opts, args);
    if (parser.getCommandLine().hasOption("help") ||
        parser.getCommandLine().hasOption("h")) {
      printHelp(opts);
      return false;
    }
    input = parser.getCommandLine().getOptionValue(
        "input", System.getProperty("java.class.path"));
    whitelist = parser.getCommandLine().getOptionValue(
        "whitelist", DefaultJars.DEFAULT_MR_JARS);
    blacklist = parser.getCommandLine().getOptionValue(
        "blacklist", DefaultJars.DEFAULT_EXCLUDED_MR_JARS);
    initialReplication =
        Short.parseShort(parser.getCommandLine().getOptionValue(
            "initialReplication", "3"));
    finalReplication =
        Short.parseShort(parser.getCommandLine().getOptionValue(
            "finalReplication", "10"));
    acceptableReplication =
        Short.parseShort(
            parser.getCommandLine().getOptionValue(
                "acceptableReplication", "9"));
    timeout =
        Integer.parseInt(
            parser.getCommandLine().getOptionValue("timeout", "10"));
    if (parser.getCommandLine().hasOption("nosymlink")) {
      ignoreSymlink = true;
    }
    String fs = parser.getCommandLine()
        .getOptionValue("fs", null);
    String path = parser.getCommandLine().getOptionValue("target",
        "/usr/lib/mr-framework.tar.gz#mr-framework");
    boolean isFullPath =
        path.startsWith("hdfs://") ||
        path.startsWith("file://");

    if (fs == null) {
      fs = conf.get(FS_DEFAULT_NAME_KEY);
      if (fs == null && !isFullPath) {
        LOG.error("No filesystem specified in either fs or target.");
        printHelp(opts);
        return false;
      } else {
        LOG.info(String.format(
            "Target file system not specified. Using default %s", fs));
      }
    }
    if (path.isEmpty()) {
      LOG.error("Target directory not specified");
      printHelp(opts);
      return false;
    }
    StringBuilder absolutePath = new StringBuilder();
    if (!isFullPath) {
      absolutePath.append(fs);
      absolutePath.append(path.startsWith("/") ? "" : "/");
    }
    absolutePath.append(path);
    target = absolutePath.toString();

    if (parser.getRemainingArgs().length > 0) {
      LOG.warn("Unexpected parameters");
      printHelp(opts);
      return false;
    }
    return true;
  }

  /**
   * Tool entry point.
   * @param args arguments
   * @throws IOException thrown on configuration errors
   */
  public static void main(String[] args) throws IOException {
    FrameworkUploader uploader = new FrameworkUploader();
    if (uploader.parseArguments(args)) {
      uploader.run();
    }
  }
}