org.apache.hadoop.tools.dynamometer.Client

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.tools.dynamometer;

import org.apache.hadoop.classification.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.base.Joiner;
import org.apache.hadoop.util.Preconditions;
import org.apache.hadoop.thirdparty.com.google.common.base.Splitter;
import org.apache.hadoop.util.Lists;
import java.util.Optional;
import java.util.function.Supplier;
import org.apache.hadoop.tools.dynamometer.workloadgenerator.audit.AuditReplayMapper;
import org.apache.hadoop.tools.dynamometer.workloadgenerator.WorkloadDriver;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionGroup;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.ClassUtil;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse;
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.QueueInfo;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.api.records.YarnClusterMetrics;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.Apps;
import org.apache.hadoop.yarn.util.Records;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Client for submitting a Dynamometer YARN application, and optionally, a
 * workload MapReduce job. This client uploads resources to HDFS as necessary
 * for them to be accessed by the YARN app, then launches an
 * {@link ApplicationMaster}, which is responsible for managing the lifetime of
 * the application.
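 * <p/>
 * As an illustrative sketch only (mirroring what {@link #main(String[])}
 * does), the client can also be driven programmatically through
 * {@link ToolRunner}; the option values below are placeholder paths, not
 * required settings:
 * <pre>{@code
 * Client client = new Client(
 *     ClassUtil.findContainingJar(ApplicationMaster.class));
 * int exitCode = ToolRunner.run(new YarnConfiguration(), client,
 *     new String[] {
 *         "-conf_path", "hdfs:///dyno/conf.zip",
 *         "-block_list_path", "hdfs:///dyno/blocks",
 *         "-fs_image_dir", "hdfs:///dyno/fsimage",
 *         "-hadoop_binary_path", "hdfs:///dyno/hadoop.tar.gz"});
 * }</pre>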
 * <p/>
 * The Dynamometer YARN application starts up the DataNodes of an HDFS
 * cluster. If the namenode_servicerpc_addr option is specified, it should
 * point to the service RPC address of an existing namenode, which the
 * datanodes will talk to. Else, a namenode will be launched internal to this
 * YARN application. The ApplicationMaster's logs contain links to the NN / DN
 * containers to be able to access their logs. Some of this information is
 * also printed by the client.
 * <p/>
 * The application will store files in the submitting user's home directory
 * under a `.dynamometer/applicationID/` folder. This is mostly for uses
 * internal to the application, but if the NameNode is launched through YARN,
 * the NameNode's metrics will also be uploaded to a file `namenode_metrics`
 * within this folder. This file is also accessible as part of the NameNode's
 * logs, but this centralized location is easier to access for subsequent
 * parsing.
 * <p/>
 * If the NameNode is launched internally, this Client will monitor the
 * status of the NameNode, printing information about its availability as the
 * DataNodes register (e.g., outstanding under replicated blocks as block
 * reports arrive). If this is configured to launch the workload job, once the
 * NameNode has gathered information from all of its DataNodes, the client
 * will launch a workload job which is configured to act against the newly
 * launched NameNode. Once the workload job completes, the infrastructure
 * application will be shut down. At this time only the audit log replay
 * ({@link AuditReplayMapper}) workload is supported.
 * <p/>
 * If there is no workload job configured, this application will, by
 * default, persist indefinitely until killed by YARN. You can specify the
 * timeout option to have it exit automatically after some time. This timeout
 * will be enforced if there is a workload job configured as well.
 */
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class Client extends Configured implements Tool {

  private static final Logger LOG = LoggerFactory.getLogger(Client.class);

  public static final String APPNAME_ARG = "appname";
  public static final String APPNAME_DEFAULT = "DynamometerTest";
  public static final String QUEUE_ARG = "queue";
  public static final String QUEUE_DEFAULT = "default";
  public static final String TIMEOUT_ARG = "timeout";
  public static final String TIMEOUT_DEFAULT = "-1";
  public static final String HADOOP_VERSION_ARG = "hadoop_version";
  public static final String HADOOP_BINARY_PATH_ARG = "hadoop_binary_path";
  public static final String NAMENODE_SERVICERPC_ADDR_ARG =
      "namenode_servicerpc_addr";
  public static final String FS_IMAGE_DIR_ARG = "fs_image_dir";
  public static final String BLOCK_LIST_PATH_ARG = "block_list_path";
  public static final String CONF_PATH_ARG = "conf_path";
  public static final String MASTER_VCORES_ARG = "master_vcores";
  public static final String MASTER_VCORES_DEFAULT = "1";
  public static final String MASTER_MEMORY_MB_ARG = "master_memory_mb";
  public static final String MASTER_MEMORY_MB_DEFAULT = "2048";
  public static final String TOKEN_FILE_LOCATION_ARG = "token_file_location";
  public static final String WORKLOAD_REPLAY_ENABLE_ARG =
      "workload_replay_enable";
  public static final String WORKLOAD_INPUT_PATH_ARG = "workload_input_path";
  public static final String WORKLOAD_OUTPUT_PATH_ARG = "workload_output_path";
  public static final String WORKLOAD_THREADS_PER_MAPPER_ARG =
      "workload_threads_per_mapper";
  public static final String WORKLOAD_START_DELAY_ARG = "workload_start_delay";
  public static final String WORKLOAD_RATE_FACTOR_ARG = "workload_rate_factor";
  public static final String WORKLOAD_RATE_FACTOR_DEFAULT = "1.0";
  public static final String WORKLOAD_CONFIG_ARG = "workload_config";

  private static final String[] ARCHIVE_FILE_TYPES =
      {".zip", ".tar", ".tgz", ".tar.gz"};
  private static final String START_SCRIPT_LOCATION = Client.class
      .getClassLoader()
      .getResource(DynoConstants.START_SCRIPT.getResourcePath()).toString();

  private YarnClient yarnClient;
  // Application master specific info to register a new Application with RM/ASM
  private String appName = "";
  // Queue for App master
  private String amQueue = "";
  // Amt. of memory resource to request for to run the App Master
  private int amMemory = 10;
  // Amt. of virtual core resource to request for to run the App Master
  private int amVCores = 1;
  // Dependency JARs.
  // Should include, at minimum, the JAR for the App Master
  private final String[] dependencyJars;
  private String hadoopBinary = "";
  // Location of DN conf zip
  private String confPath = "";
  // Location of root dir for DN block image zips
  private String blockListPath = "";
  // Location of NN fs image
  private String fsImagePath = "";
  // Location of NN fs image md5 file
  private String fsImageMD5Path = "";
  // Location of NN VERSION file
  private String versionFilePath = "";
  // Service RPC address of the NameNode, if it is external
  private String remoteNameNodeRpcAddress = "";
  // True iff the NameNode should be launched within YARN
  private boolean launchNameNode;
  // The path to the file which contains the delegation tokens to be used for
  // the launched containers (may be null)
  private String tokenFileLocation;
  // Holds all of the options which are passed to the AM
  private AMOptions amOptions;
  // The ApplicationId of the YARN infrastructure application.
  private ApplicationId infraAppId;
  // The current state of the YARN infrastructure application.
  private volatile YarnApplicationState infraAppState =
      YarnApplicationState.NEW;
  private volatile JobStatus.State workloadAppState = JobStatus.State.PREP;
  // Total number of DataNodes which will be launched.
  private int numTotalDataNodes;
  // Whether or not the workload job should be launched.
  private boolean launchWorkloadJob = false;
  // The workload job itself.
  private volatile Job workloadJob;
  // The input path for the workload job.
  private String workloadInputPath = "";
  // The output path for the workload job metric results.
  private String workloadOutputPath = "";
  // The number of threads to use per mapper for the workload job.
  private int workloadThreadsPerMapper;
  // The startup delay for the workload job.
  private long workloadStartDelayMs;
  private double workloadRateFactor = 0.0;
  private Map<String, String> workloadExtraConfigs;
  // Start time for client
  private final long clientStartTime = System.currentTimeMillis();
  // Timeout threshold for client. Kill app after time interval expires.
  private long clientTimeout;
  // Command line options
  private Options opts;

  /**
   * @param args Command line arguments
   * @throws Exception on error
   */
  public static void main(String[] args) throws Exception {
    Client client = new Client(
        ClassUtil.findContainingJar(ApplicationMaster.class));
    System.exit(ToolRunner.run(new YarnConfiguration(), client, args));
  }

  public int run(String[] args) {
    boolean result;
    try {
      LOG.info("Initializing Client");
      try {
        boolean doRun = init(args);
        if (!doRun) {
          return 0;
        }
      } catch (IllegalArgumentException e) {
        System.err.println(e.getLocalizedMessage());
        printUsage();
        return -1;
      }
      result = run();
    } catch (Throwable t) {
      LOG.error("Error running Client", t);
      return 1;
    }
    if (result) {
      LOG.info("Application completed successfully");
      return 0;
    }
    LOG.error("Application failed to complete successfully");
    return 2;
  }

  public Client(String... dependencyJars) {
    Preconditions.checkArgument(
        dependencyJars != null && dependencyJars.length > 0,
        "Must specify at least one dependency JAR for the ApplicationMaster");
    this.dependencyJars = dependencyJars;
    opts = new Options();
    opts.addOption(APPNAME_ARG, true,
        "Application Name. (default '" + APPNAME_DEFAULT + "')");
    opts.addOption(QUEUE_ARG, true, "RM Queue in which this application is "
        + "to be submitted (default '" + QUEUE_DEFAULT + "')");
    opts.addOption(TIMEOUT_ARG, true, "Application timeout in milliseconds "
        + "(default " + TIMEOUT_DEFAULT + " = unlimited)");
    opts.addOption(MASTER_MEMORY_MB_ARG, true, "Amount of memory in MB to be "
        + "requested to run the application master (default "
        + MASTER_MEMORY_MB_DEFAULT + ")");
    opts.addOption(MASTER_VCORES_ARG, true, "Amount of virtual cores to be "
        + "requested to run the application master (default "
        + MASTER_VCORES_DEFAULT + ")");
    // Dynamometer
    opts.addOption(CONF_PATH_ARG, true,
        "Location of the directory or archive "
            + "containing the Hadoop configuration. If this is already on a "
            + "remote FS, will save the copy step, but must be an archive "
            + "file. This must have the standard Hadoop conf layout "
            + "containing e.g. etc/hadoop/*-site.xml");
    opts.addOption(BLOCK_LIST_PATH_ARG, true,
        "Location on HDFS of the files containing the DN block lists.");
    opts.addOption(FS_IMAGE_DIR_ARG, true, "Location of the directory "
        + "containing, at minimum, the VERSION file for the namenode. If "
        + "running the namenode within YARN (namenode_info_path is not "
        + "specified), this must also include the fsimage file and its md5 "
        + "hash with names conforming to: `fsimage_XXXXXXXX[.md5]`.");
    for (String option : new String[] {CONF_PATH_ARG, BLOCK_LIST_PATH_ARG,
        FS_IMAGE_DIR_ARG}) {
      opts.getOption(option).setRequired(true);
    }
    OptionGroup hadoopBinaryGroup = new OptionGroup();
    hadoopBinaryGroup.addOption(new Option(HADOOP_BINARY_PATH_ARG, true,
        "Location of Hadoop binary to be deployed (archive). One of this or "
            + "hadoop_version is required."));
    hadoopBinaryGroup.addOption(new Option(HADOOP_VERSION_ARG, true,
        "Version of Hadoop (like '2.7.4' or '3.0.0-beta1') for which to "
            + "download a binary. If this is specified, a Hadoop tarball "
            + "will be downloaded from an Apache mirror. By default the "
            + "Berkeley OCF mirror is used; specify "
            + DynoInfraUtils.APACHE_DOWNLOAD_MIRROR_KEY
            + " as a configuration or system property to change which mirror "
            + "is used. The tarball will be downloaded to the working "
            + "directory. One of this or hadoop_binary_path is required."));
    hadoopBinaryGroup.setRequired(true);
    opts.addOptionGroup(hadoopBinaryGroup);
    opts.addOption(NAMENODE_SERVICERPC_ADDR_ARG, true, "Specify this option "
        + "to run the NameNode external to YARN. This is the service RPC "
        + "address of the NameNode, e.g. localhost:9020.");
    opts.addOption(TOKEN_FILE_LOCATION_ARG, true, "If specified, this file "
        + "will be used as the delegation token(s) for the launched "
        + "containers. Otherwise, the delegation token(s) for the default "
        + "FileSystem will be used.");
    AMOptions.setOptions(opts);
    opts.addOption(WORKLOAD_REPLAY_ENABLE_ARG, false, "If specified, this "
        + "client will additionally launch the workload replay job to replay "
        + "audit logs against the HDFS cluster which is started.");
    opts.addOption(WORKLOAD_INPUT_PATH_ARG, true,
        "Location of the audit traces to replay (Required for workload)");
    opts.addOption(WORKLOAD_OUTPUT_PATH_ARG, true,
        "Location of the metrics output (Required for workload)");
    opts.addOption(WORKLOAD_THREADS_PER_MAPPER_ARG, true, "Number of threads "
        + "per mapper to use to replay the workload. "
        + "(default " + AuditReplayMapper.NUM_THREADS_DEFAULT + ")");
    opts.addOption(WORKLOAD_START_DELAY_ARG, true, "Delay between launching "
        + "the Workload MR job and starting the audit logic replay; this is "
        + "used in an attempt to allow all mappers to be launched before any "
        + "of them start replaying. Workloads with more mappers may need a "
        + "longer delay to get all of the containers allocated. "
        + "Human-readable units accepted (e.g. 30s, 10m). (default "
        + WorkloadDriver.START_TIME_OFFSET_DEFAULT + ")");
    opts.addOption(WORKLOAD_RATE_FACTOR_ARG, true, "Rate factor "
        + "(multiplicative speed factor) to apply to workload replay (Default "
        + WORKLOAD_RATE_FACTOR_DEFAULT + ")");
    opts.addOption(WORKLOAD_CONFIG_ARG, true, "Additional configurations to "
        + "pass only to the workload job. This can be used multiple times "
        + "and should be specified as a key=value pair, e.g. '-"
        + WORKLOAD_CONFIG_ARG + " conf.one=val1 -" + WORKLOAD_CONFIG_ARG
        + " conf.two=val2'");
  }

  /**
   * Helper function to print out usage.
   */
  private void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    // Option names are long so increasing the width is helpful
    formatter.setWidth(100);
    formatter.printHelp("Client", opts);
  }

  /**
   * Parse command line options.
   *
   * @param args Parsed command line options
   * @return Whether the init was successful to run the client
   * @throws ParseException on error while parsing
   * @throws IOException for other errors
   */
  public boolean init(String[] args) throws ParseException, IOException {
    List<String> list = Arrays.asList(args);
    if (list.contains("-h") || list.contains("--help")) {
      printUsage();
      return false;
    }

    CommandLineParser parser = new GnuParser();
    CommandLine commandLine = parser.parse(opts, args);

    yarnClient = YarnClient.createYarnClient();
    yarnClient.init(getConf());

    LOG.info("Starting with arguments: [\"{}\"]",
        Joiner.on("\" \"").join(args));

    Path fsImageDir = new Path(commandLine.getOptionValue(FS_IMAGE_DIR_ARG,
        ""));
    versionFilePath = new Path(fsImageDir, "VERSION").toString();
    if (commandLine.hasOption(NAMENODE_SERVICERPC_ADDR_ARG)) {
      launchNameNode = false;
      remoteNameNodeRpcAddress =
          commandLine.getOptionValue(NAMENODE_SERVICERPC_ADDR_ARG);
    } else {
      launchNameNode = true;
      FileSystem localFS = FileSystem.getLocal(getConf());
      fsImageDir = fsImageDir.makeQualified(localFS.getUri(),
          localFS.getWorkingDirectory());
      FileSystem fsImageFS = fsImageDir.getFileSystem(getConf());
      FileStatus[] fsImageFiles = fsImageFS.listStatus(fsImageDir,
          (path) -> path.getName().matches("^fsimage_(\\d)+$"));
      if (fsImageFiles.length != 1) {
        throw new IllegalArgumentException(
            "Must be exactly one fsimage file present in fs_image_dir");
      }
      fsImagePath = fsImageFiles[0].getPath().toString();
      fsImageMD5Path = fsImageFiles[0].getPath().suffix(".md5").toString();
    }
    if (amMemory < 0) {
      throw new IllegalArgumentException("Invalid memory specified for "
          + "application master, exiting. Specified memory=" + amMemory);
    }
    if (amVCores < 0) {
      throw new IllegalArgumentException("Invalid virtual cores specified for "
          + "application master, exiting. "
          + "Specified virtual cores=" + amVCores);
    }
    this.appName = commandLine.getOptionValue(APPNAME_ARG, APPNAME_DEFAULT);
    this.amQueue = commandLine.getOptionValue(QUEUE_ARG, QUEUE_DEFAULT);
    this.amMemory = Integer.parseInt(commandLine
        .getOptionValue(MASTER_MEMORY_MB_ARG, MASTER_MEMORY_MB_DEFAULT));
    this.amVCores = Integer.parseInt(
        commandLine.getOptionValue(MASTER_VCORES_ARG, MASTER_VCORES_DEFAULT));
    this.confPath = commandLine.getOptionValue(CONF_PATH_ARG);
    this.blockListPath = commandLine.getOptionValue(BLOCK_LIST_PATH_ARG);
    if (commandLine.hasOption(HADOOP_BINARY_PATH_ARG)) {
      this.hadoopBinary = commandLine.getOptionValue(HADOOP_BINARY_PATH_ARG);
    } else {
      this.hadoopBinary = DynoInfraUtils.fetchHadoopTarball(
          new File(".").getAbsoluteFile(),
          commandLine.getOptionValue(HADOOP_VERSION_ARG), getConf(), LOG)
          .toString();
    }
    this.amOptions = AMOptions.initFromParser(commandLine);
    this.clientTimeout = Integer
        .parseInt(commandLine.getOptionValue(TIMEOUT_ARG, TIMEOUT_DEFAULT));
    this.tokenFileLocation = commandLine
        .getOptionValue(TOKEN_FILE_LOCATION_ARG);

    amOptions.verify();

    Path blockPath = new Path(blockListPath);
    FileSystem blockListFS = blockPath.getFileSystem(getConf());
    if (blockListFS.getUri().equals(FileSystem.getLocal(getConf()).getUri())
        || !blockListFS.exists(blockPath)) {
      throw new IllegalArgumentException(
          "block list path must already exist on remote fs!");
    }
    numTotalDataNodes = blockListFS.listStatus(blockPath,
        DynoConstants.BLOCK_LIST_FILE_FILTER).length;

    if (commandLine.hasOption(WORKLOAD_REPLAY_ENABLE_ARG)) {
      if (!commandLine.hasOption(WORKLOAD_INPUT_PATH_ARG)
          || !commandLine.hasOption(WORKLOAD_START_DELAY_ARG)) {
        throw new IllegalArgumentException("workload_replay_enable was "
            + "specified; must include all required workload_ parameters.");
      }
      launchWorkloadJob = true;
      workloadInputPath = commandLine.getOptionValue(WORKLOAD_INPUT_PATH_ARG);
      workloadOutputPath = commandLine.getOptionValue(WORKLOAD_OUTPUT_PATH_ARG);
      workloadThreadsPerMapper = Integer
          .parseInt(commandLine.getOptionValue(WORKLOAD_THREADS_PER_MAPPER_ARG,
              String.valueOf(AuditReplayMapper.NUM_THREADS_DEFAULT)));
      workloadRateFactor = Double.parseDouble(commandLine.getOptionValue(
          WORKLOAD_RATE_FACTOR_ARG, WORKLOAD_RATE_FACTOR_DEFAULT));
      workloadExtraConfigs = new HashMap<>();
      if (commandLine.getOptionValues(WORKLOAD_CONFIG_ARG) != null) {
        for (String opt : commandLine.getOptionValues(WORKLOAD_CONFIG_ARG)) {
          Iterator<String> kvPair =
              Splitter.on("=").trimResults().split(opt).iterator();
          workloadExtraConfigs.put(kvPair.next(), kvPair.next());
        }
      }
      String delayString = commandLine.getOptionValue(WORKLOAD_START_DELAY_ARG,
          WorkloadDriver.START_TIME_OFFSET_DEFAULT);
      // Store a temporary config to leverage Configuration's time duration
      // parsing.
      getConf().set("___temp___", delayString);
      workloadStartDelayMs = getConf().getTimeDuration("___temp___", 0,
          TimeUnit.MILLISECONDS);
    }

    return true;
  }

  /**
   * Main run function for the client.
   *
   * @return true if application completed successfully
   * @throws IOException for general issues
   * @throws YarnException for issues while contacting YARN daemons
   */
  public boolean run() throws IOException, YarnException {
    LOG.info("Running Client");
    yarnClient.start();

    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
    LOG.info("Got Cluster metric info from ASM, numNodeManagers={}",
        clusterMetrics.getNumNodeManagers());

    QueueInfo queueInfo = yarnClient.getQueueInfo(this.amQueue);
    LOG.info("Queue info: queueName={}, queueCurrentCapacity={}, "
        + "queueMaxCapacity={}, queueApplicationCount={}, "
        + "queueChildQueueCount={}",
        queueInfo.getQueueName(), queueInfo.getCurrentCapacity(),
        queueInfo.getMaximumCapacity(), queueInfo.getApplications().size(),
        queueInfo.getChildQueues().size());

    // Get a new application id
    YarnClientApplication app = yarnClient.createApplication();
    GetNewApplicationResponse appResponse = app.getNewApplicationResponse();
    long maxMem = appResponse.getMaximumResourceCapability().getMemorySize();
    LOG.info("Max mem capability of resources in this cluster " + maxMem);
    int maxVCores = appResponse.getMaximumResourceCapability()
        .getVirtualCores();
    LOG.info("Max virtual cores capability of resources in this cluster {}",
        maxVCores);

    if (amMemory > maxMem || amMemory < 0 || amVCores > maxVCores
        || amVCores < 0) {
      throw new IllegalArgumentException("Invalid AM memory or vcores: memory="
          + amMemory + ", vcores=" + amVCores);
    }

    amOptions.verify(maxMem, maxVCores);

    // set the application name
    ApplicationSubmissionContext appContext =
        app.getApplicationSubmissionContext();
    infraAppId = appContext.getApplicationId();
    appContext.setApplicationName(appName);

    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records
        .newRecord(ContainerLaunchContext.class);
    Map<ApplicationAccessType, String> acls = new HashMap<>();
    acls.put(ApplicationAccessType.VIEW_APP, getConf().get(
        MRJobConfig.JOB_ACL_VIEW_JOB, MRJobConfig.DEFAULT_JOB_ACL_VIEW_JOB));
    amContainer.setApplicationACLs(acls);

    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(getRemoteStoragePath(getConf(), infraAppId));

    // Set the env variables to be setup in the env where the application
    // master will be run
    Map<String, String> env = setupRemoteResourcesGetEnv();
    amContainer.setEnvironment(env);

    // All of the resources for both AM and NN/DNs have been put on remote
    // storage. Only the application master JAR is needed as a local resource
    // for the AM so we explicitly add it here.
    Map<String, LocalResource> localResources = new HashMap<>();
    LocalResource scRsrc = LocalResource.newInstance(
        org.apache.hadoop.yarn.api.records.URL
            .fromPath(DynoConstants.DYNO_DEPENDENCIES.getPath(env)),
        LocalResourceType.ARCHIVE, LocalResourceVisibility.APPLICATION,
        DynoConstants.DYNO_DEPENDENCIES.getLength(env),
        DynoConstants.DYNO_DEPENDENCIES.getTimestamp(env));
    localResources.put(DynoConstants.DYNO_DEPENDENCIES.getResourcePath(),
        scRsrc);
    // Set local resource info into app master container launch context
    amContainer.setLocalResources(localResources);

    // Set the necessary command to execute the application master
    amContainer.setCommands(getAMCommand());

    Resource capability = Records.newRecord(Resource.class);
    capability.setMemorySize(amMemory);
    capability.setVirtualCores(amVCores);
    appContext.setResource(capability);

    // Setup security tokens
    if (UserGroupInformation.isSecurityEnabled()) {
      ByteBuffer fsTokens;
      if (tokenFileLocation != null) {
        fsTokens = ByteBuffer
            .wrap(Files.readAllBytes(Paths.get(tokenFileLocation)));
      } else {
        Credentials credentials = new Credentials();
        String tokenRenewer = getConf().get(YarnConfiguration.RM_PRINCIPAL);
        if (tokenRenewer == null || tokenRenewer.length() == 0) {
          throw new IOException("Can't get Master Kerberos principal for the "
              + "RM to use as renewer");
        }

        // For now, only getting tokens for the default file-system.
        final Token<?>[] tokens =
            fs.addDelegationTokens(tokenRenewer, credentials);
        if (tokens != null) {
          for (Token<?> token : tokens) {
            LOG.info("Got dt for " + fs.getUri() + "; " + token);
          }
        }
        DataOutputBuffer dob = new DataOutputBuffer();
        credentials.writeTokenStorageToStream(dob);
        fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
      }
      amContainer.setTokens(fsTokens);
    }

    appContext.setAMContainerSpec(amContainer);

    // Set the queue to which this application is to be submitted in the RM
    appContext.setQueue(amQueue);

    LOG.info("Submitting application to RM");
    yarnClient.submitApplication(appContext);

    // Monitor the application
    return monitorInfraApplication();
  }

  /**
   * Set up the remote resources for the application. Upload them to remote
   * storage as necessary, and set up the requisite environment variables. Does
   * not set up any local resources.
   *
   * @return A Map representing the environment to be used for the
   *         ApplicationMaster containing the information about all of the
   *         remote resources.
   */
  private Map<String, String> setupRemoteResourcesGetEnv() throws IOException {
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<>();

    // Copy local resources to a remote FS to prepare them for localization
    // by containers. We do not need to set them as local resources here as
    // the AM does not need them.
    if (launchNameNode) {
      setupRemoteResource(infraAppId, DynoConstants.FS_IMAGE, env,
          fsImagePath);
      setupRemoteResource(infraAppId, DynoConstants.FS_IMAGE_MD5, env,
          fsImageMD5Path);
    } else {
      env.put(DynoConstants.REMOTE_NN_RPC_ADDR_ENV, remoteNameNodeRpcAddress);
    }
    setupRemoteResource(infraAppId, DynoConstants.VERSION, env,
        versionFilePath);
    setupRemoteResource(infraAppId, DynoConstants.CONF_ZIP, env, confPath);
    setupRemoteResource(infraAppId, DynoConstants.START_SCRIPT, env,
        START_SCRIPT_LOCATION);
    setupRemoteResource(infraAppId, DynoConstants.HADOOP_BINARY, env,
        hadoopBinary);
    setupRemoteResource(infraAppId, DynoConstants.DYNO_DEPENDENCIES, env,
        dependencyJars);
    env.put(DynoConstants.BLOCK_LIST_PATH_ENV, blockListPath);
    env.put(DynoConstants.JOB_ACL_VIEW_ENV, getConf().get(
        MRJobConfig.JOB_ACL_VIEW_JOB, MRJobConfig.DEFAULT_JOB_ACL_VIEW_JOB));
    env.put(DynoConstants.REMOTE_STORAGE_PATH_ENV,
        getRemoteStoragePath(getConf(), infraAppId).toString());

    env.put(Environment.CLASSPATH.key(), getAMClassPathEnv());
    return env;
  }

  private String getAMClassPathEnv() {
    // Add AppMaster.jar location to classpath
    // At some point we should not be required to add
    // the hadoop specific classpaths to the env.
    // It should be provided out of the box.
    // For now setting all required classpaths including
    // the classpath to "."
    // for the application jar
    StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$())
        .append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./")
        .append(DynoConstants.DYNO_DEPENDENCIES.getResourcePath())
        .append("/*");
    for (String c : getConf().getStrings(
        YarnConfiguration.YARN_APPLICATION_CLASSPATH,
        YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
      classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR);
      classPathEnv.append(c.trim());
    }
    classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR)
        .append("./log4j.properties");

    // add the runtime classpath needed for tests to work
    if (getConf().getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
      classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR);
      classPathEnv.append(System.getProperty("java.class.path"));
    }
    return classPathEnv.toString();
  }

  private List<String> getAMCommand() {
    List<String> vargs = new ArrayList<>();

    // Set java executable command
    vargs.add(Environment.JAVA_HOME.$() + "/bin/java");
    // Set Xmx based on am memory size
    long appMasterHeapSize = Math.round(amMemory * 0.85);
    vargs.add("-Xmx" + appMasterHeapSize + "m");
    // Set class name
    vargs.add(ApplicationMaster.class.getCanonicalName());
    // Set params for Application Master
    amOptions.addToVargs(vargs);

    vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");
    vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");

    LOG.info("Completed setting up app master command: " + vargs);
    return Lists.newArrayList(Joiner.on(" ").join(vargs));
  }

  /**
   * Upload a local resource to HDFS, or if it is nonlocal, just set
   * environment appropriately. The location, length and timestamp information
   * is added to AM container's environment, so it can launch containers later
   * with the correct resource settings.
   */
  private void setupRemoteResource(ApplicationId appId, DynoResource resource,
      Map<String, String> env,
      String... srcPaths) throws IOException {
    FileStatus remoteFileStatus;
    Path dstPath;
    Preconditions.checkArgument(srcPaths.length > 0,
        "Must supply at least one source path");
    Preconditions.checkArgument(
        resource.getType() == LocalResourceType.ARCHIVE
            || srcPaths.length == 1,
        "Can only specify multiple source paths if using an ARCHIVE type");
    List<URI> srcURIs = Arrays.stream(srcPaths).map(URI::create)
        .collect(Collectors.toList());
    Set<String> srcSchemes = srcURIs.stream().map(URI::getScheme)
        .collect(Collectors.toSet());
    Preconditions.checkArgument(srcSchemes.size() == 1,
        "All source paths must have the same scheme");
    String srcScheme = srcSchemes.iterator().next();
    String srcPathString = "[" + Joiner.on(",").join(srcPaths) + "]";

    if (srcScheme == null
        || srcScheme.equals(FileSystem.getLocal(getConf()).getScheme())
        || srcScheme.equals("jar")) {
      // Need to upload this resource to remote storage
      List<File> srcFiles = srcURIs.stream().map(URI::getSchemeSpecificPart)
          .map(File::new).collect(Collectors.toList());
      Path dstPathBase = getRemoteStoragePath(getConf(), appId);
      boolean shouldArchive = srcFiles.size() > 1
          || srcFiles.get(0).isDirectory()
          || (resource.getType() == LocalResourceType.ARCHIVE
              && Arrays.stream(ARCHIVE_FILE_TYPES).noneMatch(
                  suffix -> srcFiles.get(0).getName().endsWith(suffix)));
      if (shouldArchive) {
        if ("jar".equals(srcScheme)) {
          throw new IllegalArgumentException(String.format("Resources in JARs "
              + "can't be zipped; resource %s is ARCHIVE and src is: %s",
              resource.getResourcePath(), srcPathString));
        } else if (resource.getType() != LocalResourceType.ARCHIVE) {
          throw new IllegalArgumentException(
              String.format("Resource type is %s but srcPaths were: %s",
                  resource.getType(), srcPathString));
        }
        dstPath = new Path(dstPathBase, resource.getResourcePath())
            .suffix(".zip");
      } else {
        dstPath = new Path(dstPathBase, srcFiles.get(0).getName());
      }
      FileSystem remoteFS = dstPath.getFileSystem(getConf());
      LOG.info("Uploading resource " + resource + " from " + srcPathString
          + " to " + dstPath);
      try (OutputStream outputStream = remoteFS.create(dstPath, true)) {
        if ("jar".equals(srcScheme)) {
          try (InputStream inputStream = new URL(srcPaths[0]).openStream()) {
            IOUtils.copyBytes(inputStream, outputStream, getConf());
          }
        } else if (shouldArchive) {
          List<File> filesToZip;
          if (srcFiles.size() == 1 && srcFiles.get(0).isDirectory()) {
            File[] childFiles = srcFiles.get(0).listFiles();
            if (childFiles == null || childFiles.length == 0) {
              throw new IllegalArgumentException(
                  "Specified a directory to archive with no contents");
            }
            filesToZip = Lists.newArrayList(childFiles);
          } else {
            filesToZip = srcFiles;
          }
          ZipOutputStream zout = new ZipOutputStream(outputStream);
          for (File fileToZip : filesToZip) {
            addFileToZipRecursively(fileToZip.getParentFile(), fileToZip,
                zout);
          }
          zout.close();
        } else {
          try (InputStream inputStream =
              new FileInputStream(srcFiles.get(0))) {
            IOUtils.copyBytes(inputStream, outputStream, getConf());
          }
        }
      }
      remoteFileStatus = remoteFS.getFileStatus(dstPath);
    } else {
      if (srcPaths.length > 1) {
        throw new IllegalArgumentException("If resource is on remote, must be "
            + "a single file: " + srcPathString);
      }
      LOG.info("Using resource {} directly from current location: {}",
          resource, srcPaths[0]);
      dstPath = new Path(srcPaths[0]);
      // non-local file system; we can just use it directly from where it is
      remoteFileStatus = FileSystem.get(dstPath.toUri(), getConf())
          .getFileStatus(dstPath);
      if (remoteFileStatus.isDirectory()) {
        throw new IllegalArgumentException("If resource is on remote "
            + "filesystem, must be a file: "
            + srcPaths[0]);
      }
    }
    env.put(resource.getLocationEnvVar(), dstPath.toString());
    env.put(resource.getTimestampEnvVar(),
        String.valueOf(remoteFileStatus.getModificationTime()));
    env.put(resource.getLengthEnvVar(),
        String.valueOf(remoteFileStatus.getLen()));
  }

  /**
   * Get the directory on the default FS which will be used for storing files
   * relevant to this Dynamometer application. This is inside of the
   * {@value DynoConstants#DYNAMOMETER_STORAGE_DIR} directory within the
   * submitter's home directory.
   *
   * @param conf Configuration for this application.
   * @param appId This application's ID.
   * @return Fully qualified path on the default FS.
   */
  private static Path getRemoteStoragePath(Configuration conf,
      ApplicationId appId) throws IOException {
    FileSystem remoteFS = FileSystem.get(conf);
    return remoteFS.makeQualified(new Path(remoteFS.getHomeDirectory(),
        DynoConstants.DYNAMOMETER_STORAGE_DIR + "/" + appId));
  }

  private void addFileToZipRecursively(File root, File file,
      ZipOutputStream out) throws IOException {
    File[] files = file.listFiles();
    if (files == null) { // Not a directory
      String relativePath = file.getAbsolutePath()
          .substring(root.getAbsolutePath().length() + 1);
      try {
        try (FileInputStream in =
            new FileInputStream(file.getAbsolutePath())) {
          out.putNextEntry(new ZipEntry(relativePath));
          IOUtils.copyBytes(in, out, getConf(), false);
          out.closeEntry();
        }
      } catch (FileNotFoundException fnfe) {
        LOG.warn("Skipping file; it is a symlink with a nonexistent target: {}",
            file);
      }
    } else {
      for (File containedFile : files) {
        addFileToZipRecursively(root, containedFile, out);
      }
    }
  }

  /**
   * Monitor the submitted application for completion. Kill application if
   * time expires.
   *
   * @return true if application completed successfully
   */
  private boolean monitorInfraApplication() throws YarnException, IOException {

    boolean loggedApplicationInfo = false;
    boolean success = false;

    Thread namenodeMonitoringThread = new Thread(() -> {
      Supplier<Boolean> exitCritera =
          () -> Apps.isApplicationFinalState(infraAppState);
      Optional<Properties> namenodeProperties = Optional.empty();
      while (!exitCritera.get()) {
        try {
          if (!namenodeProperties.isPresent()) {
            namenodeProperties = DynoInfraUtils
                .waitForAndGetNameNodeProperties(exitCritera, getConf(),
                    getNameNodeInfoPath(), LOG);
            if (namenodeProperties.isPresent()) {
              Properties props = namenodeProperties.get();
              LOG.info("NameNode can be reached via HDFS at: {}",
                  DynoInfraUtils.getNameNodeHdfsUri(props));
              LOG.info("NameNode web UI available at: {}",
                  DynoInfraUtils.getNameNodeWebUri(props));
              LOG.info("NameNode can be tracked at: {}",
                  DynoInfraUtils.getNameNodeTrackingUri(props));
            } else {
              // Only happens if we should be shutting down
              break;
            }
          }
          DynoInfraUtils.waitForNameNodeStartup(namenodeProperties.get(),
              exitCritera, LOG);
          DynoInfraUtils.waitForNameNodeReadiness(namenodeProperties.get(),
              numTotalDataNodes, false, exitCritera, getConf(), LOG);
          break;
        } catch (IOException ioe) {
          LOG.error(
              "Unexpected exception while waiting for NameNode readiness",
              ioe);
        } catch (InterruptedException ie) {
          return;
        }
      }
      if (!Apps.isApplicationFinalState(infraAppState) && launchWorkloadJob) {
        launchAndMonitorWorkloadDriver(namenodeProperties.get());
      }
    });
    if (launchNameNode) {
      namenodeMonitoringThread.start();
    }

    while (true) {
      // Check app status every 1 second.
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        LOG.debug("Thread sleep in monitoring loop interrupted");
      }

      // Get application report for the appId we are interested in
      ApplicationReport report = yarnClient.getApplicationReport(infraAppId);

      if (report.getTrackingUrl() != null && !loggedApplicationInfo) {
        loggedApplicationInfo = true;
        LOG.info("Track the application at: " + report.getTrackingUrl());
        LOG.info("Kill the application using: yarn application -kill "
            + report.getApplicationId());
      }

      LOG.debug("Got application report from ASM for: appId={}, "
          + "clientToAMToken={}, appDiagnostics={}, appMasterHost={}, "
          + "appQueue={}, appMasterRpcPort={}, appStartTime={}, "
          + "yarnAppState={}, distributedFinalState={}, appTrackingUrl={}, "
          + "appUser={}", infraAppId.getId(), report.getClientToAMToken(),
          report.getDiagnostics(), report.getHost(), report.getQueue(),
          report.getRpcPort(), report.getStartTime(),
          report.getYarnApplicationState(),
          report.getFinalApplicationStatus(), report.getTrackingUrl(),
          report.getUser());

      infraAppState = report.getYarnApplicationState();
      if (infraAppState == YarnApplicationState.KILLED) {
        if (!launchWorkloadJob) {
          success = true;
        } else if (workloadJob == null) {
          LOG.error("Infra app was killed before workload job was launched.");
        } else if (!workloadJob.isComplete()) {
          LOG.error("Infra app was killed before workload job completed.");
        } else if (workloadJob.isSuccessful()) {
          success = true;
        }
        LOG.info("Infra app was killed; exiting from client.");
        break;
      } else if (infraAppState == YarnApplicationState.FINISHED
          || infraAppState == YarnApplicationState.FAILED) {
        LOG.info("Infra app exited unexpectedly. YarnState="
            + infraAppState.toString() + ". Exiting from client.");
        break;
      }

      if ((clientTimeout != -1) && (System.currentTimeMillis()
          > (clientStartTime + clientTimeout))) {
        LOG.info("Reached client specified timeout of {} ms for application. "
            + "Killing application", clientTimeout);
        attemptCleanup();
        break;
      }

      if (isCompleted(workloadAppState)) {
        LOG.info("Killing infrastructure app");
        try {
          forceKillApplication(infraAppId);
        } catch (YarnException | IOException e) {
          LOG.error("Exception encountered while killing infra app", e);
        }
      }
    }

    if (launchNameNode) {
      try {
        namenodeMonitoringThread.interrupt();
        namenodeMonitoringThread.join();
      } catch (InterruptedException ie) {
        LOG.warn("Interrupted while joining workload job thread; "
            + "continuing to cleanup.");
      }
    }

    attemptCleanup();
    return success;
  }

  /**
   * Return the path to the property file containing information about the
   * launched NameNode.
   */
  @VisibleForTesting
  Path getNameNodeInfoPath() throws IOException {
    return new Path(getRemoteStoragePath(getConf(), infraAppId),
        DynoConstants.NN_INFO_FILE_NAME);
  }

  /**
   * Launch the workload driver ({@link WorkloadDriver}) and monitor the job.
   * Waits for the launched job to complete.
   *
   * @param nameNodeProperties The set of properties with information about the
   *                           launched NameNode.
   */
  private void launchAndMonitorWorkloadDriver(Properties nameNodeProperties) {
    URI nameNodeURI = DynoInfraUtils.getNameNodeHdfsUri(nameNodeProperties);
    LOG.info("Launching workload job using input path: " + workloadInputPath);
    try {
      long workloadStartTime = System.currentTimeMillis()
          + workloadStartDelayMs;
      Configuration workloadConf = new Configuration(getConf());
      workloadConf.set(AuditReplayMapper.INPUT_PATH_KEY, workloadInputPath);
      workloadConf.set(AuditReplayMapper.OUTPUT_PATH_KEY, workloadOutputPath);
      workloadConf.setInt(AuditReplayMapper.NUM_THREADS_KEY,
          workloadThreadsPerMapper);
      workloadConf.setDouble(AuditReplayMapper.RATE_FACTOR_KEY,
          workloadRateFactor);
      for (Map.Entry<String, String> configPair : workloadExtraConfigs
          .entrySet()) {
        workloadConf.set(configPair.getKey(), configPair.getValue());
      }
      workloadJob = WorkloadDriver.getJobForSubmission(workloadConf,
          nameNodeURI.toString(), workloadStartTime, AuditReplayMapper.class);
      workloadJob.submit();
      while (!Apps.isApplicationFinalState(infraAppState)
          && !isCompleted(workloadAppState)) {
        workloadJob.monitorAndPrintJob();
        Thread.sleep(5000);
        workloadAppState = workloadJob.getJobState();
      }
      if (isCompleted(workloadAppState)) {
        LOG.info("Workload job completed successfully!");
      } else {
        LOG.warn("Workload job failed.");
      }
    } catch (Exception e) {
      LOG.error("Exception encountered while running workload job", e);
    }
  }

  /**
   * Best-effort attempt to clean up any remaining applications
   * (infrastructure or workload).
   */
  public void attemptCleanup() {
    LOG.info("Attempting to clean up remaining running applications.");
    if (workloadJob != null) {
      try {
        workloadAppState = workloadJob.getJobState();
      } catch (IOException ioe) {
        LOG.warn("Unable to fetch completion status of workload job. Will "
            + "proceed to attempt to kill it.", ioe);
      } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
        return;
      }
      if (!isCompleted(workloadAppState)) {
        try {
          LOG.info("Attempting to kill workload app: {}",
              workloadJob.getJobID());
          workloadJob.killJob();
          LOG.info("Killed workload app");
        } catch (IOException ioe) {
          LOG.error("Unable to kill workload app ({})",
              workloadJob.getJobID(), ioe);
        }
      }
    }
    if (infraAppId != null && !Apps.isApplicationFinalState(infraAppState)) {
      try {
        LOG.info("Attempting to kill infrastructure app: " + infraAppId);
        forceKillApplication(infraAppId);
        LOG.info("Killed infrastructure app");
      } catch (YarnException | IOException e) {
        LOG.error("Unable to kill infrastructure app ({})", infraAppId, e);
      }
    }
  }

  /**
   * Check if the input state represents completion.
   */
  private static boolean isCompleted(JobStatus.State state) {
    return state == JobStatus.State.SUCCEEDED
        || state == JobStatus.State.FAILED || state == JobStatus.State.KILLED;
  }

  /**
   * Kill a submitted application by sending a call to the AM.
   *
   * @param appId Application Id to be killed.
   */
  private void forceKillApplication(ApplicationId appId)
      throws YarnException, IOException {
    // Response can be ignored as it is non-null on success or
    // throws an exception in case of failures
    yarnClient.killApplication(appId);
  }

  @VisibleForTesting
  Job getWorkloadJob() {
    return workloadJob;
  }
}




