/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.applications.distributedshell;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.Arrays;
import java.util.Base64;

import com.google.common.base.Joiner;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.ApplicationClientProtocol;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse;
import org.apache.hadoop.yarn.api.protocolrecords.KillApplicationRequest;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.LogAggregationContext;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.QueueACL;
import org.apache.hadoop.yarn.api.records.QueueInfo;
import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
import org.apache.hadoop.yarn.api.records.URL;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.api.records.YarnClusterMetrics;
import org.apache.hadoop.yarn.api.records.timeline.TimelineDomain;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
import org.apache.hadoop.yarn.client.util.YarnClientUtils;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
import org.apache.hadoop.yarn.exceptions.YARNFeatureNotEnabledException;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.DockerClientConfigHandler;
import org.apache.hadoop.yarn.util.UnitsConversionUtil;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.apache.hadoop.yarn.util.timeline.TimelineUtils;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Client for Distributed Shell application submission to YARN.
 *
 * <p>The distributed shell client launches an application master that in turn
 * runs the provided shell command on a set of containers.</p>
 *
 * <p>This client is meant to act as an example of how to write YARN-based
 * applications.</p>
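 *
 * <p>For illustration, a typical command-line invocation looks like the
 * following; the jar path, version and option values are placeholders rather
 * than values taken from this file:</p>
 *
 * <pre>
 * $ bin/hadoop jar hadoop-yarn-applications-distributedshell-&lt;version&gt;.jar \
 *     org.apache.hadoop.yarn.applications.distributedshell.Client \
 *     -jar hadoop-yarn-applications-distributedshell-&lt;version&gt;.jar \
 *     -shell_command date \
 *     -num_containers 2 \
 *     -master_memory 1024
 * </pre>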

 * <p>To submit an application, a client first needs to connect to the
 * ResourceManager, aka ApplicationsManager or ASM, via the
 * {@link ApplicationClientProtocol}. The {@link ApplicationClientProtocol}
 * provides a way for the client to get access to cluster information and to
 * request a new {@link ApplicationId}.</p>
 *
 * <p>For the actual job submission, the client first has to create an
 * {@link ApplicationSubmissionContext}. The
 * {@link ApplicationSubmissionContext} defines the application details, such
 * as the {@link ApplicationId}, the application name, the priority assigned to
 * the application and the queue to which the application should be assigned.
 * In addition, the {@link ApplicationSubmissionContext} also defines the
 * {@link ContainerLaunchContext}, which describes the container with which
 * the {@link ApplicationMaster} is launched.</p>
 *
 * <p>The {@link ContainerLaunchContext} in this scenario defines the resources
 * to be allocated for the {@link ApplicationMaster}'s container, the local
 * resources (jars, configuration files) to be made available, the environment
 * to be set for the {@link ApplicationMaster}, and the commands to be executed
 * to run the {@link ApplicationMaster}.</p>
 *
 * <p>Using the {@link ApplicationSubmissionContext}, the client submits the
 * application to the ResourceManager and then monitors the application by
 * requesting an {@link ApplicationReport} from the ResourceManager at regular
 * intervals. If the application takes too long, the client kills it by
 * submitting a {@link KillApplicationRequest} to the ResourceManager. A
 * condensed sketch of this flow appears below.</p>
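 *
 * <p>The following sketch condenses the flow described above. It is
 * illustrative only: error handling and most options are omitted, and
 * {@code localResources}, {@code env} and {@code commands} are assumed to be
 * prepared as in {@link #run()}.</p>
 *
 * <pre>{@code
 * YarnClient yarnClient = YarnClient.createYarnClient();
 * yarnClient.init(new YarnConfiguration());
 * yarnClient.start();
 *
 * // Ask the ResourceManager for a new ApplicationId and submission context.
 * YarnClientApplication app = yarnClient.createApplication();
 * ApplicationSubmissionContext appContext =
 *     app.getApplicationSubmissionContext();
 * appContext.setApplicationName("DistributedShell");
 *
 * // Describe the container in which the ApplicationMaster runs.
 * ContainerLaunchContext amContainer = ContainerLaunchContext.newInstance(
 *     localResources, env, commands, null, null, null);
 * appContext.setAMContainerSpec(amContainer);
 * appContext.setResource(Resource.newInstance(1024, 1));
 *
 * // Submit, then poll the ResourceManager for progress.
 * ApplicationId appId = yarnClient.submitApplication(appContext);
 * ApplicationReport report = yarnClient.getApplicationReport(appId);
 * }</pre>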

 */
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class Client {

  private static final Logger LOG = LoggerFactory.getLogger(Client.class);

  private static final int DEFAULT_AM_MEMORY = 100;
  private static final int DEFAULT_AM_VCORES = 1;
  private static final int DEFAULT_CONTAINER_MEMORY = 10;
  private static final int DEFAULT_CONTAINER_VCORES = 1;

  // Configuration
  private Configuration conf;
  private YarnClient yarnClient;
  // Application master specific info to register a new Application with RM/ASM
  private String appName = "";
  // App master priority
  private int amPriority = 0;
  // Queue for App master
  private String amQueue = "";
  // Amount of memory to request for running the App Master
  private long amMemory = DEFAULT_AM_MEMORY;
  // Amount of virtual cores to request for running the App Master
  private int amVCores = DEFAULT_AM_VCORES;
  // Additional resources to request for running the App Master
  private Map<String, Long> amResources = new HashMap<>();
  // AM resource profile
  private String amResourceProfile = "";
  // Application master jar file
  private String appMasterJar = "";
  // Main class to invoke application master
  private final String appMasterMainClass;
  // Shell command to be executed
  private String shellCommand = "";
  // Location of shell script
  private String shellScriptPath = "";
  // Args to be passed to the shell command
  private String[] shellArgs = new String[] {};
  // Env variables to be set up for the shell command
  private Map<String, String> shellEnv = new HashMap<>();
  // Priority of the containers running the shell command
  private int shellCmdPriority = 0;
  // Amount of memory to request for the container in which the shell script
  // will be executed
  private long containerMemory = DEFAULT_CONTAINER_MEMORY;
  // Amount of virtual cores to request for the container in which the shell
  // script will be executed
  private int containerVirtualCores = DEFAULT_CONTAINER_VCORES;
  // Additional resources to request for the container in which the shell
  // script will be executed
  private Map<String, Long> containerResources = new HashMap<>();
  // Container resource profile
  private String containerResourceProfile = "";
  // No. of containers in which the shell script needs to be executed
  private int numContainers = 1;
  private String nodeLabelExpression = null;
  // Container type, default GUARANTEED.
  private ExecutionType containerType = ExecutionType.GUARANTEED;
  // Whether to auto-promote opportunistic containers
  private boolean autoPromoteContainers = false;
  // Whether to enforce the execution type of containers
  private boolean enforceExecType = false;
  // Placement specification
  private String placementSpec = "";
  // Node attribute specification
  private String nodeAttributeSpec = "";
  // log4j.properties file
  // if available, add to local resources and set into classpath
  private String log4jPropFile = "";
  // Pattern for log files to be aggregated in a rolling fashion
  private String rollingFilesPattern = "";
  // Start time for client
  private final long clientStartTime = System.currentTimeMillis();
  // Timeout threshold for client. Kill app after time interval expires.
  private long clientTimeout = 600000;
  // Flag to indicate whether to keep containers across application attempts.
  private boolean keepContainers = false;

  private long attemptFailuresValidityInterval = -1;

  private Vector<CharSequence> containerRetryOptions = new Vector<>(5);

  // Debug flag
  boolean debugFlag = false;

  // Timeline domain ID
  private String domainId = null;
  // Flag to indicate whether to create the domain of the given ID
  private boolean toCreateDomain = false;
  // Timeline domain reader access control
  private String viewACLs = null;
  // Timeline domain writer access control
  private String modifyACLs = null;

  private String flowName = null;
  private String flowVersion = null;
  private long flowRunId = 0L;

  // Docker client configuration
  private String dockerClientConfig = null;

  // Application tags
  private Set<String> applicationTags = new HashSet<>();

  // Command line options
  private Options opts;

  private static final String shellCommandPath = "shellCommands";
  private static final String shellArgsPath = "shellArgs";
  private static final String appMasterJarPath = "AppMaster.jar";
  // Hardcoded path to custom log_properties
  private static final String log4jPath = "log4j.properties";

  public static final String SCRIPT_PATH = "ExecScript";

  /**
   * @param args Command line arguments
   */
  public static void main(String[] args) {
    boolean result = false;
    try {
      Client client = new Client();
      LOG.info("Initializing Client");
      try {
        boolean doRun = client.init(args);
        if (!doRun) {
          System.exit(0);
        }
      } catch (IllegalArgumentException e) {
        System.err.println(e.getLocalizedMessage());
        client.printUsage();
        System.exit(-1);
      }
      result = client.run();
    } catch (Throwable t) {
      LOG.error("Error running Client", t);
      System.exit(1);
    }
    if (result) {
      LOG.info("Application completed successfully");
      System.exit(0);
    }
    LOG.error("Application failed to complete successfully");
    System.exit(2);
  }

  public Client(Configuration conf) throws Exception {
    this(
        "org.apache.hadoop.yarn.applications.distributedshell.ApplicationMaster",
        conf);
  }

  Client(String appMasterMainClass, Configuration conf) {
    this.conf = conf;
    this.conf.setBoolean(
        YarnConfiguration.YARN_CLIENT_LOAD_RESOURCETYPES_FROM_SERVER, true);
    this.appMasterMainClass = appMasterMainClass;
    yarnClient = YarnClient.createYarnClient();
    yarnClient.init(conf);
    opts = new Options();
    opts.addOption("appname", true,
        "Application Name. Default value - DistributedShell");
    opts.addOption("priority", true, "Application Priority. Default 0");
    opts.addOption("queue", true,
        "RM Queue in which this application is to be submitted");
    opts.addOption("timeout", true, "Application timeout in milliseconds");
    opts.addOption("master_memory", true,
        "Amount of memory in MB to be requested to run the application master");
    opts.addOption("master_vcores", true,
        "Amount of virtual cores to be requested to run the application master");
    opts.addOption("master_resources", true,
        "Amount of resources to be requested to run the application master. "
            + "Specified as resource type=value pairs separated by commas. "
            + "E.g. -master_resources memory-mb=512,vcores=2");
    opts.addOption("jar", true, "Jar file containing the application master");
    opts.addOption("master_resource_profile", true,
        "Resource profile for the application master");
    opts.addOption("shell_command", true,
        "Shell command to be executed by the Application Master. "
            + "Can only specify either --shell_command or --shell_script");
    opts.addOption("shell_script", true,
        "Location of the shell script to be executed. "
            + "Can only specify either --shell_command or --shell_script");
+ "Multiple args can be separated by empty space."); opts.getOption("shell_args").setArgs(Option.UNLIMITED_VALUES); opts.addOption("shell_env", true, "Environment for shell script. Specified as env_key=env_val pairs"); opts.addOption("shell_cmd_priority", true, "Priority for the shell command containers"); opts.addOption("container_type", true, "Container execution type, GUARANTEED or OPPORTUNISTIC"); opts.addOption("container_memory", true, "Amount of memory in MB " + "to be requested to run the shell command"); opts.addOption("container_vcores", true, "Amount of virtual cores " + "to be requested to run the shell command"); opts.addOption("container_resources", true, "Amount of resources " + "to be requested to run the shell command. " + "Specified as resource type=value pairs separated by commas. " + "E.g. -container_resources memory-mb=256,vcores=1"); opts.addOption("container_resource_profile", true, "Resource profile for the shell command"); opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed"); opts.addOption("promote_opportunistic_after_start", false, "Flag to indicate whether to automatically promote opportunistic" + " containers to guaranteed."); opts.addOption("enforce_execution_type", false, "Flag to indicate whether to enforce execution type of containers"); opts.addOption("log_properties", true, "log4j.properties file"); opts.addOption("rolling_log_pattern", true, "pattern for files that should be aggregated in a rolling fashion"); opts.addOption("keep_containers_across_application_attempts", false, "Flag to indicate whether to keep containers across application " + "attempts." + " If the flag is true, running containers will not be killed when" + " application attempt fails and these containers will be " + "retrieved by" + " the new application attempt "); opts.addOption("attempt_failures_validity_interval", true, "when attempt_failures_validity_interval in milliseconds is set to > 0," + "the failure number will not take failures which happen out of " + "the validityInterval into failure count. 
" + "If failure count reaches to maxAppAttempts, " + "the application will be failed."); opts.addOption("debug", false, "Dump out debug information"); opts.addOption("domain", true, "ID of the timeline domain where the " + "timeline entities will be put"); opts.addOption("view_acls", true, "Users and groups that allowed to " + "view the timeline entities in the given domain"); opts.addOption("modify_acls", true, "Users and groups that allowed to " + "modify the timeline entities in the given domain"); opts.addOption("create", false, "Flag to indicate whether to create the " + "domain specified with -domain."); opts.addOption("flow_name", true, "Flow name which the distributed shell " + "app belongs to"); opts.addOption("flow_version", true, "Flow version which the distributed " + "shell app belongs to"); opts.addOption("flow_run_id", true, "Flow run ID which the distributed " + "shell app belongs to"); opts.addOption("help", false, "Print usage"); opts.addOption("node_label_expression", true, "Node label expression to determine the nodes" + " where all the containers of this application" + " will be allocated, \"\" means containers" + " can be allocated anywhere, if you don't specify the option," + " default node_label_expression of queue will be used."); opts.addOption("container_retry_policy", true, "Retry policy when container fails to run, " + "0: NEVER_RETRY, 1: RETRY_ON_ALL_ERRORS, " + "2: RETRY_ON_SPECIFIC_ERROR_CODES"); opts.addOption("container_retry_error_codes", true, "When retry policy is set to RETRY_ON_SPECIFIC_ERROR_CODES, error " + "codes is specified with this option, " + "e.g. --container_retry_error_codes 1,2,3"); opts.addOption("container_max_retries", true, "If container could retry, it specifies max retires"); opts.addOption("container_retry_interval", true, "Interval between each retry, unit is milliseconds"); opts.addOption("container_failures_validity_interval", true, "Failures which are out of the time window will not be added to" + " the number of container retry attempts"); opts.addOption("docker_client_config", true, "The docker client configuration path. The scheme should be supplied" + " (i.e. file:// or hdfs://)." + " Only used when the Docker runtime is enabled and requested."); opts.addOption("placement_spec", true, "Placement specification. Please note, if this option is specified," + " The \"num_containers\" option will be ignored. All requested" + " containers will be of type GUARANTEED" ); opts.addOption("application_tags", true, "Application tags."); } /** */ public Client() throws Exception { this(new YarnConfiguration()); } /** * Helper function to print out usage */ private void printUsage() { new HelpFormatter().printHelp("Client", opts); } /** * Parse command line options * @param args Parsed command line options * @return Whether the init was successful to run the client * @throws ParseException */ public boolean init(String[] args) throws ParseException { CommandLine cliParser = new GnuParser().parse(opts, args); if (args.length == 0) { throw new IllegalArgumentException("No args specified for client to initialize"); } if (cliParser.hasOption("log_properties")) { String log4jPath = cliParser.getOptionValue("log_properties"); try { Log4jPropertyHelper.updateLog4jConfiguration(Client.class, log4jPath); } catch (Exception e) { LOG.warn("Can not set up custom log4j properties. 
" + e); } } if (cliParser.hasOption("rolling_log_pattern")) { rollingFilesPattern = cliParser.getOptionValue("rolling_log_pattern"); } if (cliParser.hasOption("help")) { printUsage(); return false; } if (cliParser.hasOption("debug")) { debugFlag = true; } if (cliParser.hasOption("keep_containers_across_application_attempts")) { LOG.info("keep_containers_across_application_attempts"); keepContainers = true; } if (cliParser.hasOption("placement_spec")) { placementSpec = cliParser.getOptionValue("placement_spec"); // Check if it is parsable PlacementSpec.parse(this.placementSpec); } appName = cliParser.getOptionValue("appname", "DistributedShell"); amPriority = Integer.parseInt(cliParser.getOptionValue("priority", "0")); amQueue = cliParser.getOptionValue("queue", "default"); amMemory = Integer.parseInt(cliParser.getOptionValue("master_memory", "-1")); amVCores = Integer.parseInt(cliParser.getOptionValue("master_vcores", "-1")); if (cliParser.hasOption("master_resources")) { Map masterResources = parseResourcesString(cliParser.getOptionValue("master_resources")); for (Map.Entry entry : masterResources.entrySet()) { if (entry.getKey().equals(ResourceInformation.MEMORY_URI)) { amMemory = entry.getValue(); } else if (entry.getKey().equals(ResourceInformation.VCORES_URI)) { amVCores = entry.getValue().intValue(); } else { amResources.put(entry.getKey(), entry.getValue()); } } } amResourceProfile = cliParser.getOptionValue("master_resource_profile", ""); if (!cliParser.hasOption("jar")) { throw new IllegalArgumentException("No jar file specified for application master"); } appMasterJar = cliParser.getOptionValue("jar"); if (!cliParser.hasOption("shell_command") && !cliParser.hasOption("shell_script")) { throw new IllegalArgumentException( "No shell command or shell script specified to be executed by application master"); } else if (cliParser.hasOption("shell_command") && cliParser.hasOption("shell_script")) { throw new IllegalArgumentException("Can not specify shell_command option " + "and shell_script option at the same time"); } else if (cliParser.hasOption("shell_command")) { shellCommand = cliParser.getOptionValue("shell_command"); } else { shellScriptPath = cliParser.getOptionValue("shell_script"); } if (cliParser.hasOption("shell_args")) { shellArgs = cliParser.getOptionValues("shell_args"); } if (cliParser.hasOption("shell_env")) { String envs[] = cliParser.getOptionValues("shell_env"); for (String env : envs) { env = env.trim(); int index = env.indexOf('='); if (index == -1) { shellEnv.put(env, ""); continue; } String key = env.substring(0, index); String val = ""; if (index < (env.length()-1)) { val = env.substring(index+1); } shellEnv.put(key, val); } } shellCmdPriority = Integer.parseInt(cliParser.getOptionValue("shell_cmd_priority", "0")); if (cliParser.hasOption("container_type")) { String containerTypeStr = cliParser.getOptionValue("container_type"); if (Arrays.stream(ExecutionType.values()).noneMatch( executionType -> executionType.toString() .equals(containerTypeStr))) { throw new IllegalArgumentException("Invalid container_type: " + containerTypeStr); } containerType = ExecutionType.valueOf(containerTypeStr); } if (cliParser.hasOption("promote_opportunistic_after_start")) { autoPromoteContainers = true; } if (cliParser.hasOption("enforce_execution_type")) { enforceExecType = true; } containerMemory = Integer.parseInt(cliParser.getOptionValue("container_memory", "-1")); containerVirtualCores = Integer.parseInt(cliParser.getOptionValue("container_vcores", "-1")); if 
    if (cliParser.hasOption("container_resources")) {
      Map<String, Long> resources =
          parseResourcesString(cliParser.getOptionValue("container_resources"));
      for (Map.Entry<String, Long> entry : resources.entrySet()) {
        if (entry.getKey().equals(ResourceInformation.MEMORY_URI)) {
          containerMemory = entry.getValue();
        } else if (entry.getKey().equals(ResourceInformation.VCORES_URI)) {
          containerVirtualCores = entry.getValue().intValue();
        } else {
          containerResources.put(entry.getKey(), entry.getValue());
        }
      }
    }
    containerResourceProfile =
        cliParser.getOptionValue("container_resource_profile", "");
    numContainers =
        Integer.parseInt(cliParser.getOptionValue("num_containers", "1"));
    if (numContainers < 1) {
      throw new IllegalArgumentException("Invalid no. of containers specified,"
          + " exiting. Specified numContainer=" + numContainers);
    }

    nodeLabelExpression =
        cliParser.getOptionValue("node_label_expression", null);

    clientTimeout =
        Integer.parseInt(cliParser.getOptionValue("timeout", "600000"));

    attemptFailuresValidityInterval = Long.parseLong(
        cliParser.getOptionValue("attempt_failures_validity_interval", "-1"));

    log4jPropFile = cliParser.getOptionValue("log_properties", "");

    // Get timeline domain options
    if (cliParser.hasOption("domain")) {
      domainId = cliParser.getOptionValue("domain");
      toCreateDomain = cliParser.hasOption("create");
      if (cliParser.hasOption("view_acls")) {
        viewACLs = cliParser.getOptionValue("view_acls");
      }
      if (cliParser.hasOption("modify_acls")) {
        modifyACLs = cliParser.getOptionValue("modify_acls");
      }
    }

    // Get container retry options
    if (cliParser.hasOption("container_retry_policy")) {
      containerRetryOptions.add("--container_retry_policy "
          + cliParser.getOptionValue("container_retry_policy"));
    }
    if (cliParser.hasOption("container_retry_error_codes")) {
      containerRetryOptions.add("--container_retry_error_codes "
          + cliParser.getOptionValue("container_retry_error_codes"));
    }
    if (cliParser.hasOption("container_max_retries")) {
      containerRetryOptions.add("--container_max_retries "
          + cliParser.getOptionValue("container_max_retries"));
    }
    if (cliParser.hasOption("container_retry_interval")) {
      containerRetryOptions.add("--container_retry_interval "
          + cliParser.getOptionValue("container_retry_interval"));
    }
    if (cliParser.hasOption("container_failures_validity_interval")) {
      containerRetryOptions.add("--container_failures_validity_interval "
          + cliParser.getOptionValue("container_failures_validity_interval"));
    }

    if (cliParser.hasOption("flow_name")) {
      flowName = cliParser.getOptionValue("flow_name");
    }
    if (cliParser.hasOption("flow_version")) {
      flowVersion = cliParser.getOptionValue("flow_version");
    }
    if (cliParser.hasOption("flow_run_id")) {
      try {
        flowRunId = Long.parseLong(cliParser.getOptionValue("flow_run_id"));
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException(
            "Flow run ID is not a valid long value", e);
      }
    }
    if (cliParser.hasOption("docker_client_config")) {
      dockerClientConfig = cliParser.getOptionValue("docker_client_config");
    }

    if (cliParser.hasOption("application_tags")) {
      String applicationTagsStr = cliParser.getOptionValue("application_tags");
      String[] appTags = applicationTagsStr.split(",");
      for (String appTag : appTags) {
        this.applicationTags.add(appTag.trim());
      }
    }
    return true;
  }

  /**
   * Main run function for the client.
   * @return true if application completed successfully
   * @throws IOException
   * @throws YarnException
   */
  public boolean run() throws IOException, YarnException {

    LOG.info("Running Client");
    yarnClient.start();

    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
LOG.info("Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers()); List clusterNodeReports = yarnClient.getNodeReports( NodeState.RUNNING); LOG.info("Got Cluster node info from ASM"); for (NodeReport node : clusterNodeReports) { LOG.info("Got node report from ASM for" + ", nodeId=" + node.getNodeId() + ", nodeAddress=" + node.getHttpAddress() + ", nodeRackName=" + node.getRackName() + ", nodeNumContainers=" + node.getNumContainers()); } QueueInfo queueInfo = yarnClient.getQueueInfo(this.amQueue); if (queueInfo == null) { throw new IllegalArgumentException(String .format("Queue %s not present in scheduler configuration.", this.amQueue)); } LOG.info("Queue info" + ", queueName=" + queueInfo.getQueueName() + ", queueCurrentCapacity=" + queueInfo.getCurrentCapacity() + ", queueMaxCapacity=" + queueInfo.getMaximumCapacity() + ", queueApplicationCount=" + queueInfo.getApplications().size() + ", queueChildQueueCount=" + queueInfo.getChildQueues().size()); List listAclInfo = yarnClient.getQueueAclsInfo(); for (QueueUserACLInfo aclInfo : listAclInfo) { for (QueueACL userAcl : aclInfo.getUserAcls()) { LOG.info("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName() + ", userAcl=" + userAcl.name()); } } if (domainId != null && domainId.length() > 0 && toCreateDomain) { prepareTimelineDomain(); } Map profiles; try { profiles = yarnClient.getResourceProfiles(); } catch (YARNFeatureNotEnabledException re) { profiles = null; } List appProfiles = new ArrayList<>(2); appProfiles.add(amResourceProfile); appProfiles.add(containerResourceProfile); for (String appProfile : appProfiles) { if (appProfile != null && !appProfile.isEmpty()) { if (profiles == null) { String message = "Resource profiles is not enabled"; LOG.error(message); throw new IOException(message); } if (!profiles.containsKey(appProfile)) { String message = "Unknown resource profile '" + appProfile + "'. Valid resource profiles are " + profiles.keySet(); LOG.error(message); throw new IOException(message); } } } // Get a new application id YarnClientApplication app = yarnClient.createApplication(); GetNewApplicationResponse appResponse = app.getNewApplicationResponse(); // TODO get min/max resource capabilities from RM and change memory ask if needed // If we do not have min/max, we may not be able to correctly request // the required resources from the RM for the app master // Memory ask has to be a multiple of min and less than max. // Dump out information about cluster capability as seen by the resource manager long maxMem = appResponse.getMaximumResourceCapability().getMemorySize(); LOG.info("Max mem capability of resources in this cluster " + maxMem); // A resource ask cannot exceed the max. if (amMemory > maxMem) { LOG.info("AM memory specified above max threshold of cluster. Using max value." + ", specified=" + amMemory + ", max=" + maxMem); amMemory = maxMem; } int maxVCores = appResponse.getMaximumResourceCapability().getVirtualCores(); LOG.info("Max virtual cores capability of resources in this cluster " + maxVCores); if (amVCores > maxVCores) { LOG.info("AM virtual cores specified above max threshold of cluster. " + "Using max value." 
+ ", specified=" + amVCores + ", max=" + maxVCores); amVCores = maxVCores; } // set the application name ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext(); ApplicationId appId = appContext.getApplicationId(); // Set up resource type requirements // For now, both memory and vcores are supported, so we set memory and // vcores requirements List resourceTypes = yarnClient.getResourceTypeInfo(); setAMResourceCapability(appContext, profiles, resourceTypes); setContainerResources(profiles, resourceTypes); appContext.setKeepContainersAcrossApplicationAttempts(keepContainers); appContext.setApplicationName(appName); if (attemptFailuresValidityInterval >= 0) { appContext .setAttemptFailuresValidityInterval(attemptFailuresValidityInterval); } Set tags = new HashSet(); if (applicationTags != null) { tags.addAll(applicationTags); } if (flowName != null) { tags.add(TimelineUtils.generateFlowNameTag(flowName)); } if (flowVersion != null) { tags.add(TimelineUtils.generateFlowVersionTag(flowVersion)); } if (flowRunId != 0) { tags.add(TimelineUtils.generateFlowRunIdTag(flowRunId)); } appContext.setApplicationTags(tags); // set local resources for the application master // local files or archives as needed // In this scenario, the jar file for the application master is part of the local resources Map localResources = new HashMap(); LOG.info("Copy App Master jar from local filesystem and add to local environment"); // Copy the application master jar to the filesystem // Create a local resource to point to the destination jar path FileSystem fs = FileSystem.get(conf); addToLocalResources(fs, appMasterJar, appMasterJarPath, appId.toString(), localResources, null); // Set the log4j properties if needed if (!log4jPropFile.isEmpty()) { addToLocalResources(fs, log4jPropFile, log4jPath, appId.toString(), localResources, null); } // The shell script has to be made available on the final container(s) // where it will be executed. // To do this, we need to first copy into the filesystem that is visible // to the yarn framework. // We do not need to set this as a local resource for the application // master as the application master does not need it. 
    String hdfsShellScriptLocation = "";
    long hdfsShellScriptLen = 0;
    long hdfsShellScriptTimestamp = 0;
    if (!shellScriptPath.isEmpty()) {
      Path shellSrc = new Path(shellScriptPath);
      String shellPathSuffix =
          appName + "/" + appId.toString() + "/" + SCRIPT_PATH;
      Path shellDst = new Path(fs.getHomeDirectory(), shellPathSuffix);
      fs.copyFromLocalFile(false, true, shellSrc, shellDst);
      hdfsShellScriptLocation = shellDst.toUri().toString();
      FileStatus shellFileStatus = fs.getFileStatus(shellDst);
      hdfsShellScriptLen = shellFileStatus.getLen();
      hdfsShellScriptTimestamp = shellFileStatus.getModificationTime();
    }

    if (!shellCommand.isEmpty()) {
      addToLocalResources(fs, null, shellCommandPath, appId.toString(),
          localResources, shellCommand);
    }

    if (shellArgs.length > 0) {
      addToLocalResources(fs, null, shellArgsPath, appId.toString(),
          localResources, StringUtils.join(shellArgs, " "));
    }

    // Set the necessary security tokens as needed
    //amContainer.setContainerTokens(containerToken);

    // Set the env variables to be setup in the env where the application
    // master will be run
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<>();

    // put location of shell script into env
    // using the env info, the application master will create the correct
    // local resource for the eventual containers that will be launched to
    // execute the shell scripts
    env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION, hdfsShellScriptLocation);
    env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP,
        Long.toString(hdfsShellScriptTimestamp));
    env.put(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN,
        Long.toString(hdfsShellScriptLen));
    if (domainId != null && domainId.length() > 0) {
      env.put(DSConstants.DISTRIBUTEDSHELLTIMELINEDOMAIN, domainId);
    }

    // Add AppMaster.jar location to classpath
    // At some point we should not be required to add
    // the hadoop specific classpaths to the env.
    // It should be provided out of the box.
    // For now setting all required classpaths including
    // the classpath to "." for the application jar
    StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$$())
        .append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./*");
    for (String c : conf.getStrings(
        YarnConfiguration.YARN_APPLICATION_CLASSPATH,
        YarnConfiguration.DEFAULT_YARN_CROSS_PLATFORM_APPLICATION_CLASSPATH)) {
      classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR);
      classPathEnv.append(c.trim());
    }
    classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR)
        .append("./log4j.properties");

    // add the runtime classpath needed for tests to work
    if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
      classPathEnv.append(':');
      classPathEnv.append(System.getProperty("java.class.path"));
    }

    env.put("CLASSPATH", classPathEnv.toString());

    // Set the necessary command to execute the application master
    Vector<CharSequence> vargs = new Vector<>(30);

    // Set java executable command
    LOG.info("Setting up app master command");
    // Need extra quote here because JAVA_HOME might contain space on Windows,
    // e.g. C:/Program Files/Java...
vargs.add("\"" + Environment.JAVA_HOME.$$() + "/bin/java\""); // Set Xmx based on am memory size vargs.add("-Xmx" + amMemory + "m"); // Set class name vargs.add(appMasterMainClass); // Set params for Application Master if (containerType != null) { vargs.add("--container_type " + String.valueOf(containerType)); } if (autoPromoteContainers) { vargs.add("--promote_opportunistic_after_start"); } if (enforceExecType) { vargs.add("--enforce_execution_type"); } if (containerMemory > 0) { vargs.add("--container_memory " + String.valueOf(containerMemory)); } if (containerVirtualCores > 0) { vargs.add("--container_vcores " + String.valueOf(containerVirtualCores)); } if (!containerResources.isEmpty()) { Joiner.MapJoiner joiner = Joiner.on(',').withKeyValueSeparator("="); vargs.add("--container_resources " + joiner.join(containerResources)); } if (containerResourceProfile != null && !containerResourceProfile .isEmpty()) { vargs.add("--container_resource_profile " + containerResourceProfile); } vargs.add("--num_containers " + String.valueOf(numContainers)); if (placementSpec != null && placementSpec.length() > 0) { // Encode the spec to avoid passing special chars via shell arguments. String encodedSpec = Base64.getEncoder() .encodeToString(placementSpec.getBytes(StandardCharsets.UTF_8)); LOG.info("Encode placement spec: " + encodedSpec); vargs.add("--placement_spec " + encodedSpec); } if (null != nodeLabelExpression) { appContext.setNodeLabelExpression(nodeLabelExpression); } vargs.add("--priority " + String.valueOf(shellCmdPriority)); if (keepContainers) { vargs.add("--keep_containers_across_application_attempts"); } for (Map.Entry entry : shellEnv.entrySet()) { vargs.add("--shell_env " + entry.getKey() + "=" + entry.getValue()); } if (debugFlag) { vargs.add("--debug"); } vargs.addAll(containerRetryOptions); vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout"); vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr"); // Get final commmand StringBuilder command = new StringBuilder(); for (CharSequence str : vargs) { command.append(str).append(" "); } LOG.info("Completed setting up app master command " + command.toString()); List commands = new ArrayList(); commands.add(command.toString()); // Set up the container launch context for the application master ContainerLaunchContext amContainer = ContainerLaunchContext.newInstance( localResources, env, commands, null, null, null); // Service data is a binary blob that can be passed to the application // Not needed in this scenario // amContainer.setServiceData(serviceData); // Setup security tokens Credentials rmCredentials = null; if (UserGroupInformation.isSecurityEnabled()) { // Note: Credentials class is marked as LimitedPrivate for HDFS and MapReduce rmCredentials = new Credentials(); String tokenRenewer = YarnClientUtils.getRmPrincipal(conf); if (tokenRenewer == null || tokenRenewer.length() == 0) { throw new IOException( "Can't get Master Kerberos principal for the RM to use as renewer"); } // For now, only getting tokens for the default file-system. final Token tokens[] = fs.addDelegationTokens(tokenRenewer, rmCredentials); if (tokens != null) { for (Token token : tokens) { LOG.info("Got dt for " + fs.getUri() + "; " + token); } } } // Add the docker client config credentials if supplied. 
    Credentials dockerCredentials = null;
    if (dockerClientConfig != null) {
      dockerCredentials =
          DockerClientConfigHandler.readCredentialsFromConfigFile(
              new Path(dockerClientConfig), conf, appId.toString());
    }

    if (rmCredentials != null || dockerCredentials != null) {
      DataOutputBuffer dob = new DataOutputBuffer();
      if (rmCredentials != null) {
        rmCredentials.writeTokenStorageToStream(dob);
      }

      if (dockerCredentials != null) {
        dockerCredentials.writeTokenStorageToStream(dob);
      }

      ByteBuffer tokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
      amContainer.setTokens(tokens);
    }

    appContext.setAMContainerSpec(amContainer);

    // Set the priority for the application master
    // TODO - what is the range for priority? how to decide?
    Priority pri = Priority.newInstance(amPriority);
    appContext.setPriority(pri);

    // Set the queue to which this application is to be submitted in the RM
    appContext.setQueue(amQueue);

    specifyLogAggregationContext(appContext);

    // Submit the application to the applications manager
    // SubmitApplicationResponse submitResp = applicationsManager.submitApplication(appRequest);
    // Ignore the response as either a valid response object is returned on success
    // or an exception thrown to denote some form of a failure
    LOG.info("Submitting application to ASM");

    yarnClient.submitApplication(appContext);

    // TODO
    // Try submitting the same request again
    // app submission failure?

    // Monitor the application
    return monitorApplication(appId);
  }

  @VisibleForTesting
  void specifyLogAggregationContext(ApplicationSubmissionContext appContext) {
    if (!rollingFilesPattern.isEmpty()) {
      LogAggregationContext logAggregationContext = LogAggregationContext
          .newInstance(null, null, rollingFilesPattern, "");
      appContext.setLogAggregationContext(logAggregationContext);
    }
  }

  /**
   * Monitor the submitted application for completion.
   * Kill application if time expires.
   * @param appId Application Id of application to be monitored
   * @return true if application completed successfully
   * @throws YarnException
   * @throws IOException
   */
  private boolean monitorApplication(ApplicationId appId)
      throws YarnException, IOException {

    while (true) {

      // Check app status every 1 second.
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        LOG.debug("Thread sleep in monitoring loop interrupted");
      }

      // Get application report for the appId we are interested in
      ApplicationReport report = yarnClient.getApplicationReport(appId);

      LOG.info("Got application report from ASM for"
          + ", appId=" + appId.getId()
          + ", clientToAMToken=" + report.getClientToAMToken()
          + ", appDiagnostics=" + report.getDiagnostics()
          + ", appMasterHost=" + report.getHost()
          + ", appQueue=" + report.getQueue()
          + ", appMasterRpcPort=" + report.getRpcPort()
          + ", appStartTime=" + report.getStartTime()
          + ", yarnAppState=" + report.getYarnApplicationState().toString()
          + ", distributedFinalState="
          + report.getFinalApplicationStatus().toString()
          + ", appTrackingUrl=" + report.getTrackingUrl()
          + ", appUser=" + report.getUser());

      YarnApplicationState state = report.getYarnApplicationState();
      FinalApplicationStatus dsStatus = report.getFinalApplicationStatus();
      if (YarnApplicationState.FINISHED == state) {
        if (FinalApplicationStatus.SUCCEEDED == dsStatus) {
          LOG.info("Application has completed successfully."
              + " Breaking monitoring loop");
          return true;
        } else {
          LOG.info("Application finished unsuccessfully."
              + " YarnState=" + state.toString()
              + ", DSFinalStatus=" + dsStatus.toString()
              + ". Breaking monitoring loop");
          return false;
        }
      } else if (YarnApplicationState.KILLED == state
          || YarnApplicationState.FAILED == state) {
        LOG.info("Application did not finish."
            + " YarnState=" + state.toString()
            + ", DSFinalStatus=" + dsStatus.toString()
            + ". Breaking monitoring loop");
        return false;
      }

      // A value less than or equal to 0 means no timeout
      if (clientTimeout > 0
          && System.currentTimeMillis() > (clientStartTime + clientTimeout)) {
        LOG.info("Reached client specified timeout for application. "
            + "Killing application");
        forceKillApplication(appId);
        return false;
      }
    }
  }

  /**
   * Kill a submitted application by sending a call to the ASM.
   * @param appId Application Id to be killed.
   * @throws YarnException
   * @throws IOException
   */
  private void forceKillApplication(ApplicationId appId)
      throws YarnException, IOException {
    // TODO clarify whether multiple jobs with the same app id can be submitted
    // and be running at the same time.
    // If yes, can we kill a particular attempt only?

    // Response can be ignored as it is non-null on success or
    // throws an exception in case of failures
    yarnClient.killApplication(appId);
  }

  private void addToLocalResources(FileSystem fs, String fileSrcPath,
      String fileDstPath, String appId,
      Map<String, LocalResource> localResources, String resources)
      throws IOException {
    String suffix = appName + "/" + appId + "/" + fileDstPath;
    Path dst = new Path(fs.getHomeDirectory(), suffix);
    if (fileSrcPath == null) {
      FSDataOutputStream ostream = null;
      try {
        ostream = FileSystem.create(fs, dst, new FsPermission((short) 0710));
        ostream.writeUTF(resources);
      } finally {
        IOUtils.closeQuietly(ostream);
      }
    } else {
      fs.copyFromLocalFile(new Path(fileSrcPath), dst);
    }
    FileStatus scFileStatus = fs.getFileStatus(dst);
    LocalResource scRsrc = LocalResource.newInstance(
        URL.fromURI(dst.toUri()),
        LocalResourceType.FILE, LocalResourceVisibility.APPLICATION,
        scFileStatus.getLen(), scFileStatus.getModificationTime());
    localResources.put(fileDstPath, scRsrc);
  }

  private void prepareTimelineDomain() {
    TimelineClient timelineClient = null;
    if (conf.getBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED,
        YarnConfiguration.DEFAULT_TIMELINE_SERVICE_ENABLED)) {
      timelineClient = TimelineClient.createTimelineClient();
      timelineClient.init(conf);
      timelineClient.start();
    } else {
      LOG.warn("Cannot put the domain " + domainId
          + " because the timeline service is not enabled");
      return;
    }
    try {
      //TODO: we need to check and combine the existing timeline domain ACLs,
      //but let's do it once we have client java library to query domains.
      TimelineDomain domain = new TimelineDomain();
      domain.setId(domainId);
      domain.setReaders(
          viewACLs != null && viewACLs.length() > 0 ? viewACLs : " ");
      domain.setWriters(
          modifyACLs != null && modifyACLs.length() > 0 ? modifyACLs : " ");
      timelineClient.putDomain(domain);
      LOG.info("Put the timeline domain: "
          + TimelineUtils.dumpTimelineRecordtoJSON(domain));
    } catch (Exception e) {
      LOG.error("Error when putting the timeline domain", e);
    } finally {
      timelineClient.stop();
    }
  }

  private void setAMResourceCapability(ApplicationSubmissionContext appContext,
      Map<String, Resource> profiles, List<ResourceTypeInfo> resourceTypes)
      throws IllegalArgumentException, IOException, YarnException {
    if (amMemory < -1 || amMemory == 0) {
      throw new IllegalArgumentException("Invalid memory specified for"
          + " application master, exiting. Specified memory=" + amMemory);
    }
    if (amVCores < -1 || amVCores == 0) {
      throw new IllegalArgumentException("Invalid virtual cores specified for"
          + " application master, exiting."
          + " Specified virtual cores=" + amVCores);
    }
" + "Specified virtual cores=" + amVCores); } Resource capability = Resource.newInstance(0, 0); if (!amResourceProfile.isEmpty()) { if (!profiles.containsKey(amResourceProfile)) { throw new IllegalArgumentException( "Failed to find specified resource profile for application master=" + amResourceProfile); } capability = Resources.clone(profiles.get(amResourceProfile)); } if (appContext.getAMContainerResourceRequests() == null) { List amResourceRequests = new ArrayList(); amResourceRequests .add(ResourceRequest.newInstance(Priority.newInstance(amPriority), "*", Resources.clone(Resources.none()), 1)); appContext.setAMContainerResourceRequests(amResourceRequests); } validateResourceTypes(amResources.keySet(), resourceTypes); for (Map.Entry entry : amResources.entrySet()) { capability.setResourceValue(entry.getKey(), entry.getValue()); } // set amMemory because it's used to set Xmx param if (amMemory == -1) { amMemory = DEFAULT_AM_MEMORY; LOG.warn("AM Memory not specified, use " + DEFAULT_AM_MEMORY + " mb as AM memory"); } if (amVCores == -1) { amVCores = DEFAULT_AM_VCORES; LOG.warn("AM vcore not specified, use " + DEFAULT_AM_VCORES + " mb as AM vcores"); } capability.setMemorySize(amMemory); capability.setVirtualCores(amVCores); appContext.getAMContainerResourceRequests().get(0).setCapability( capability); LOG.warn("AM Resource capability=" + capability); } private void setContainerResources(Map profiles, List resourceTypes) throws IllegalArgumentException { if (containerMemory < -1 || containerMemory == 0) { throw new IllegalArgumentException("Container memory '" + containerMemory + "' has to be greated than 0"); } if (containerVirtualCores < -1 || containerVirtualCores == 0) { throw new IllegalArgumentException("Container vcores '" + containerVirtualCores + "' has to be greated than 0"); } validateResourceTypes(containerResources.keySet(), resourceTypes); if (profiles == null) { containerMemory = containerMemory == -1 ? DEFAULT_CONTAINER_MEMORY : containerMemory; containerVirtualCores = containerVirtualCores == -1 ? DEFAULT_CONTAINER_VCORES : containerVirtualCores; } } private void validateResourceTypes(Iterable resourceNames, List resourceTypes) { for (String resourceName : resourceNames) { if (!resourceTypes.stream().anyMatch(e -> e.getName().equals(resourceName))) { throw new ResourceNotFoundException("Unknown resource: " + resourceName); } } } static Map parseResourcesString(String resourcesStr) { Map resources = new HashMap<>(); // Ignore the grouping "[]" if (resourcesStr.startsWith("[")) { resourcesStr = resourcesStr.substring(1); } if (resourcesStr.endsWith("]")) { resourcesStr = resourcesStr.substring(0, resourcesStr.length() - 1); } for (String resource : resourcesStr.trim().split(",")) { resource = resource.trim(); if (!resource.matches("^[^=]+=\\d+\\s?\\w*$")) { throw new IllegalArgumentException("\"" + resource + "\" is not a " + "valid resource type/amount pair. " + "Please provide key=amount pairs separated by commas."); } String[] splits = resource.split("="); String key = splits[0], value = splits[1]; String units = ResourceUtils.getUnits(value); String valueWithoutUnit = value.substring( 0, value.length() - units.length()).trim(); Long resourceValue = Long.valueOf(valueWithoutUnit); if (!units.isEmpty()) { resourceValue = UnitsConversionUtil.convert(units, "Mi", resourceValue); } if (key.equals("memory")) { key = ResourceInformation.MEMORY_URI; } resources.put(key, resourceValue); } return resources; } }



