
org.apache.spark.launcher.SparkSubmitCommandBuilder

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.launcher;

import java.io.File;
import java.io.IOException;
import java.util.*;

import static org.apache.spark.launcher.CommandBuilderUtils.*;

/**
 * Special command builder for handling a CLI invocation of SparkSubmit.
 * <p>
 * This builder adds command line parsing compatible with SparkSubmit. It handles setting
 * driver-side options and the special parsing behavior needed for special-casing certain
 * internal Spark applications.
 * <p>
 * This class also has some special features to aid launching shells (pyspark and sparkR) and
 * examples.
 */
class SparkSubmitCommandBuilder extends AbstractCommandBuilder {

  /**
   * Name of the app resource used to identify the PySpark shell. The command line parser expects
   * the resource name to be the very first argument to spark-submit in this case.
   *
   * NOTE: this cannot be "pyspark-shell" since that identifies the PySpark shell to SparkSubmit
   * (see java_gateway.py), and can cause this code to enter into an infinite loop.
   */
  static final String PYSPARK_SHELL = "pyspark-shell-main";

  /**
   * This is the actual resource name that identifies the PySpark shell to SparkSubmit.
   */
  static final String PYSPARK_SHELL_RESOURCE = "pyspark-shell";

  /**
   * Name of the app resource used to identify the SparkR shell. The command line parser expects
   * the resource name to be the very first argument to spark-submit in this case.
   *
   * NOTE: this cannot be "sparkr-shell" since that identifies the SparkR shell to SparkSubmit
   * (see sparkR.R), and can cause this code to enter into an infinite loop.
   */
  static final String SPARKR_SHELL = "sparkr-shell-main";

  /**
   * This is the actual resource name that identifies the SparkR shell to SparkSubmit.
   */
  static final String SPARKR_SHELL_RESOURCE = "sparkr-shell";

  /**
   * Name of app resource used to identify examples. When running examples, args[0] should be
   * this name. The app resource will identify the example class to run.
   */
  static final String RUN_EXAMPLE = "run-example";

  /**
   * Prefix for example class names.
   */
  static final String EXAMPLE_CLASS_PREFIX = "org.apache.spark.examples.";

  /**
   * This map must match the class names for available special classes, since this modifies the way
   * command line parsing works. This maps the class name to the resource to use when calling
   * spark-submit.
   */
  private static final Map<String, String> specialClasses = new HashMap<>();
  static {
    specialClasses.put("org.apache.spark.repl.Main", "spark-shell");
    specialClasses.put("org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver",
      SparkLauncher.NO_RESOURCE);
    specialClasses.put("org.apache.spark.sql.hive.thriftserver.HiveThriftServer2",
      SparkLauncher.NO_RESOURCE);
  }
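
  // Illustrative note, not from the Spark sources: the first element of the argument list passed
  // to the constructor below selects the parsing mode. For example, assuming the standard
  // launcher scripts:
  //
  //   ["pyspark-shell-main", "--master", "local[2]"]         -> PySpark shell; mixed arguments allowed
  //   ["sparkr-shell-main", "--master", "local[2]"]          -> SparkR shell; mixed arguments allowed
  //   ["run-example", "--master", "local", "SparkPi", "100"] -> example runner; "SparkPi" is expanded
  //                                                             to "org.apache.spark.examples.SparkPi"
  //
  // Any other first element is parsed as a regular spark-submit argument list.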

  final List<String> sparkArgs;
  private final boolean isAppResourceReq;
  private final boolean isExample;

  /**
   * Controls whether mixing spark-submit arguments with app arguments is allowed. This is needed
   * to parse the command lines for things like bin/spark-shell, which allows users to mix and
   * match arguments (e.g. "bin/spark-shell SparkShellArg --master foo").
   */
  private boolean allowsMixedArguments;

  SparkSubmitCommandBuilder() {
    this.sparkArgs = new ArrayList<>();
    this.isAppResourceReq = true;
    this.isExample = false;
  }

  SparkSubmitCommandBuilder(List<String> args) {
    this.allowsMixedArguments = false;
    this.sparkArgs = new ArrayList<>();
    boolean isExample = false;
    List<String> submitArgs = args;

    if (args.size() > 0) {
      switch (args.get(0)) {
        case PYSPARK_SHELL:
          this.allowsMixedArguments = true;
          appResource = PYSPARK_SHELL;
          submitArgs = args.subList(1, args.size());
          break;

        case SPARKR_SHELL:
          this.allowsMixedArguments = true;
          appResource = SPARKR_SHELL;
          submitArgs = args.subList(1, args.size());
          break;

        case RUN_EXAMPLE:
          isExample = true;
          submitArgs = args.subList(1, args.size());
      }

      this.isExample = isExample;
      OptionParser parser = new OptionParser();
      parser.parse(submitArgs);
      this.isAppResourceReq = parser.isAppResourceReq;
    } else {
      this.isExample = isExample;
      this.isAppResourceReq = false;
    }
  }

  @Override
  public List<String> buildCommand(Map<String, String> env)
      throws IOException, IllegalArgumentException {
    if (PYSPARK_SHELL.equals(appResource) && isAppResourceReq) {
      return buildPySparkShellCommand(env);
    } else if (SPARKR_SHELL.equals(appResource) && isAppResourceReq) {
      return buildSparkRCommand(env);
    } else {
      return buildSparkSubmitCommand(env);
    }
  }

  List<String> buildSparkSubmitArgs() {
    List<String> args = new ArrayList<>();
    SparkSubmitOptionParser parser = new SparkSubmitOptionParser();

    if (!allowsMixedArguments && isAppResourceReq) {
      checkArgument(appResource != null, "Missing application resource.");
    }

    if (verbose) {
      args.add(parser.VERBOSE);
    }

    if (master != null) {
      args.add(parser.MASTER);
      args.add(master);
    }

    if (deployMode != null) {
      args.add(parser.DEPLOY_MODE);
      args.add(deployMode);
    }

    if (appName != null) {
      args.add(parser.NAME);
      args.add(appName);
    }

    for (Map.Entry<String, String> e : conf.entrySet()) {
      args.add(parser.CONF);
      args.add(String.format("%s=%s", e.getKey(), e.getValue()));
    }

    if (propertiesFile != null) {
      args.add(parser.PROPERTIES_FILE);
      args.add(propertiesFile);
    }

    if (isExample) {
      jars.addAll(findExamplesJars());
    }

    if (!jars.isEmpty()) {
      args.add(parser.JARS);
      args.add(join(",", jars));
    }

    if (!files.isEmpty()) {
      args.add(parser.FILES);
      args.add(join(",", files));
    }

    if (!pyFiles.isEmpty()) {
      args.add(parser.PY_FILES);
      args.add(join(",", pyFiles));
    }

    if (isAppResourceReq) {
      checkArgument(!isExample || mainClass != null, "Missing example class name.");
    }
    if (mainClass != null) {
      args.add(parser.CLASS);
      args.add(mainClass);
    }

    args.addAll(sparkArgs);
    if (appResource != null) {
      args.add(appResource);
    }
    args.addAll(appArgs);

    return args;
  }
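
  // Worked example (illustrative, not from the Spark sources): with master "yarn", deploy mode
  // "cluster", one conf entry "spark.executor.memory=2g", main class "com.example.MyApp", app
  // resource "/path/to/app.jar" and a single app argument "input.txt", buildSparkSubmitArgs()
  // above produces, in order:
  //
  //   [--master, yarn, --deploy-mode, cluster, --conf, spark.executor.memory=2g,
  //    --class, com.example.MyApp, /path/to/app.jar, input.txt]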

  private List<String> buildSparkSubmitCommand(Map<String, String> env)
      throws IOException, IllegalArgumentException {
    // Load the properties file and check whether spark-submit will be running the app's driver
    // or just launching a cluster app. When running the driver, the JVM's arguments will be
    // modified to cover the driver's configuration.
    Map<String, String> config = getEffectiveConfig();
    boolean isClientMode = isClientMode(config);
    String extraClassPath = isClientMode ? config.get(SparkLauncher.DRIVER_EXTRA_CLASSPATH) : null;

    List<String> cmd = buildJavaCommand(extraClassPath);
    // Take Thrift Server as daemon
    if (isThriftServer(mainClass)) {
      addOptionString(cmd, System.getenv("SPARK_DAEMON_JAVA_OPTS"));
    }
    addOptionString(cmd, System.getenv("SPARK_SUBMIT_OPTS"));
    addOptionString(cmd, System.getenv("SPARK_JAVA_OPTS"));

    // We don't want the client to specify Xmx. These have to be set by their corresponding
    // memory flag --driver-memory or configuration entry spark.driver.memory
    String driverExtraJavaOptions = config.get(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS);
    if (!isEmpty(driverExtraJavaOptions) && driverExtraJavaOptions.contains("Xmx")) {
      String msg = String.format("Not allowed to specify max heap(Xmx) memory settings through " +
                   "java options (was %s). Use the corresponding --driver-memory or " +
                   "spark.driver.memory configuration instead.", driverExtraJavaOptions);
      throw new IllegalArgumentException(msg);
    }

    if (isClientMode) {
      // Figuring out where the memory value comes from is a little tricky due to precedence.
      // Precedence is observed in the following order:
      // - explicit configuration (setConf()), which also covers --driver-memory cli argument.
      // - properties file.
      // - SPARK_DRIVER_MEMORY env variable
      // - SPARK_MEM env variable
      // - default value (1g)
      // Take Thrift Server as daemon
      String tsMemory =
        isThriftServer(mainClass) ? System.getenv("SPARK_DAEMON_MEMORY") : null;
      String memory = firstNonEmpty(tsMemory, config.get(SparkLauncher.DRIVER_MEMORY),
        System.getenv("SPARK_DRIVER_MEMORY"), System.getenv("SPARK_MEM"), DEFAULT_MEM);
      cmd.add("-Xmx" + memory);
      addOptionString(cmd, driverExtraJavaOptions);
      mergeEnvPathList(env, getLibPathEnvName(),
        config.get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH));
    }

    addPermGenSizeOpt(cmd);
    cmd.add("org.apache.spark.deploy.SparkSubmit");
    cmd.addAll(buildSparkSubmitArgs());
    return cmd;
  }
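
  // Worked example of the driver-memory precedence above (illustrative, not from the Spark
  // sources): in client mode, if spark.driver.memory is set to "2g" via setConf() or
  // --driver-memory while SPARK_DRIVER_MEMORY=4g is exported, the driver JVM gets -Xmx2g; if
  // only SPARK_MEM=512m is set, it gets -Xmx512m; with nothing set, it falls back to
  // DEFAULT_MEM ("1g").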

  private List<String> buildPySparkShellCommand(Map<String, String> env) throws IOException {
    // For backwards compatibility, if a script is specified in
    // the pyspark command line, then run it using spark-submit.
    if (!appArgs.isEmpty() && appArgs.get(0).endsWith(".py")) {
      System.err.println(
        "Running python applications through 'pyspark' is not supported as of Spark 2.0.\n" +
        "Use ./bin/spark-submit <python file>");
      System.exit(-1);
    }

    checkArgument(appArgs.isEmpty(), "pyspark does not support any application options.");

    // When launching the pyspark shell, the spark-submit arguments should be stored in the
    // PYSPARK_SUBMIT_ARGS env variable.
    appResource = PYSPARK_SHELL_RESOURCE;
    constructEnvVarArgs(env, "PYSPARK_SUBMIT_ARGS");

    // Will pick up the binary executable in the following order
    // 1. conf spark.pyspark.driver.python
    // 2. conf spark.pyspark.python
    // 3. environment variable PYSPARK_DRIVER_PYTHON
    // 4. environment variable PYSPARK_PYTHON
    // 5. python
    List<String> pyargs = new ArrayList<>();
    pyargs.add(firstNonEmpty(conf.get(SparkLauncher.PYSPARK_DRIVER_PYTHON),
      conf.get(SparkLauncher.PYSPARK_PYTHON),
      System.getenv("PYSPARK_DRIVER_PYTHON"),
      System.getenv("PYSPARK_PYTHON"),
      "python"));
    String pyOpts = System.getenv("PYSPARK_DRIVER_PYTHON_OPTS");
    if (conf.containsKey(SparkLauncher.PYSPARK_PYTHON)) {
      // Pass conf spark.pyspark.python to python by environment variable.
      env.put("PYSPARK_PYTHON", conf.get(SparkLauncher.PYSPARK_PYTHON));
    }
    if (!isEmpty(pyOpts)) {
      pyargs.addAll(parseOptionString(pyOpts));
    }

    return pyargs;
  }

  private List<String> buildSparkRCommand(Map<String, String> env) throws IOException {
    if (!appArgs.isEmpty() && appArgs.get(0).endsWith(".R")) {
      System.err.println(
        "Running R applications through 'sparkR' is not supported as of Spark 2.0.\n" +
        "Use ./bin/spark-submit <R file>");
      System.exit(-1);
    }
    // When launching the SparkR shell, store the spark-submit arguments in the SPARKR_SUBMIT_ARGS
    // env variable.
    appResource = SPARKR_SHELL_RESOURCE;
    constructEnvVarArgs(env, "SPARKR_SUBMIT_ARGS");

    // Set shell.R as R_PROFILE_USER to load the SparkR package when the shell comes up.
    String sparkHome = System.getenv("SPARK_HOME");
    env.put("R_PROFILE_USER",
      join(File.separator, sparkHome, "R", "lib", "SparkR", "profile", "shell.R"));

    List<String> args = new ArrayList<>();
    args.add(firstNonEmpty(conf.get(SparkLauncher.SPARKR_R_SHELL),
      System.getenv("SPARKR_DRIVER_R"), "R"));
    return args;
  }

  private void constructEnvVarArgs(
      Map<String, String> env,
      String submitArgsEnvVariable) throws IOException {
    mergeEnvPathList(env, getLibPathEnvName(),
      getEffectiveConfig().get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH));

    StringBuilder submitArgs = new StringBuilder();
    for (String arg : buildSparkSubmitArgs()) {
      if (submitArgs.length() > 0) {
        submitArgs.append(" ");
      }
      submitArgs.append(quoteForCommandString(arg));
    }
    env.put(submitArgsEnvVariable, submitArgs.toString());
  }

  private boolean isClientMode(Map<String, String> userProps) {
    String userMaster = firstNonEmpty(master, userProps.get(SparkLauncher.SPARK_MASTER));
    String userDeployMode = firstNonEmpty(deployMode, userProps.get(SparkLauncher.DEPLOY_MODE));
    // Default master is "local[*]", so assume client mode in that case
    return userMaster == null ||
      "client".equals(userDeployMode) ||
      (!userMaster.equals("yarn-cluster") && userDeployMode == null);
  }
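
  // Illustrative cases for isClientMode() above (not from the Spark sources):
  //   no master and no deploy mode set             -> client mode (master defaults to "local[*]")
  //   master "spark://host:7077", no deploy mode   -> client mode
  //   master "yarn", deploy mode "cluster"         -> cluster mode
  //   master "yarn-cluster", no deploy mode        -> cluster mode (legacy master string)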

  /**
   * Return whether the given main class represents a thrift server.
   */
  private boolean isThriftServer(String mainClass) {
    return (mainClass != null &&
      mainClass.equals("org.apache.spark.sql.hive.thriftserver.HiveThriftServer2"));
  }

  private List<String> findExamplesJars() {
    boolean isTesting = "1".equals(getenv("SPARK_TESTING"));
    List<String> examplesJars = new ArrayList<>();
    String sparkHome = getSparkHome();

    File jarsDir;
    if (new File(sparkHome, "RELEASE").isFile()) {
      jarsDir = new File(sparkHome, "examples/jars");
    } else {
      jarsDir = new File(sparkHome,
        String.format("examples/target/scala-%s/jars", getScalaVersion()));
    }

    boolean foundDir = jarsDir.isDirectory();
    checkState(isTesting || foundDir, "Examples jars directory '%s' does not exist.",
        jarsDir.getAbsolutePath());

    if (foundDir) {
      for (File f: jarsDir.listFiles()) {
        examplesJars.add(f.getAbsolutePath());
      }
    }
    return examplesJars;
  }

  private class OptionParser extends SparkSubmitOptionParser {

    boolean isAppResourceReq = true;

    @Override
    protected boolean handle(String opt, String value) {
      if (opt.equals(MASTER)) {
        master = value;
      } else if (opt.equals(DEPLOY_MODE)) {
        deployMode = value;
      } else if (opt.equals(PROPERTIES_FILE)) {
        propertiesFile = value;
      } else if (opt.equals(DRIVER_MEMORY)) {
        conf.put(SparkLauncher.DRIVER_MEMORY, value);
      } else if (opt.equals(DRIVER_JAVA_OPTIONS)) {
        conf.put(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, value);
      } else if (opt.equals(DRIVER_LIBRARY_PATH)) {
        conf.put(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, value);
      } else if (opt.equals(DRIVER_CLASS_PATH)) {
        conf.put(SparkLauncher.DRIVER_EXTRA_CLASSPATH, value);
      } else if (opt.equals(CONF)) {
        String[] setConf = value.split("=", 2);
        checkArgument(setConf.length == 2, "Invalid argument to %s: %s", CONF, value);
        conf.put(setConf[0], setConf[1]);
      } else if (opt.equals(CLASS)) {
        // The special classes require some special command line handling, since they allow
        // mixing spark-submit arguments with arguments that should be propagated to the shell
        // itself. Note that for this to work, the "--class" argument must come before any
        // non-spark-submit arguments.
        mainClass = value;
        if (specialClasses.containsKey(value)) {
          allowsMixedArguments = true;
          appResource = specialClasses.get(value);
        }
      } else if (opt.equals(KILL_SUBMISSION) || opt.equals(STATUS)) {
        isAppResourceReq = false;
        sparkArgs.add(opt);
        sparkArgs.add(value);
      } else if (opt.equals(HELP) || opt.equals(USAGE_ERROR)) {
        isAppResourceReq = false;
        sparkArgs.add(opt);
      } else if (opt.equals(VERSION)) {
        isAppResourceReq = false;
        sparkArgs.add(opt);
      } else {
        sparkArgs.add(opt);
        if (value != null) {
          sparkArgs.add(value);
        }
      }
      return true;
    }

    @Override
    protected boolean handleUnknown(String opt) {
      // When mixing arguments, add unrecognized parameters directly to the user arguments list. In
      // normal mode, any unrecognized parameter triggers the end of command line parsing, and the
      // parameter itself will be interpreted by SparkSubmit as the application resource. The
      // remaining params will be appended to the list of SparkSubmit arguments.
      if (allowsMixedArguments) {
        appArgs.add(opt);
        return true;
      } else if (isExample) {
        String className = opt;
        if (!className.startsWith(EXAMPLE_CLASS_PREFIX)) {
          className = EXAMPLE_CLASS_PREFIX + className;
        }
        mainClass = className;
        appResource = SparkLauncher.NO_RESOURCE;
        return false;
      } else {
        checkArgument(!opt.startsWith("-"), "Unrecognized option: %s", opt);
        checkState(appResource == null, "Found unrecognized argument but resource is already set.");
        appResource = opt;
        return false;
      }
    }

    @Override
    protected void handleExtraArgs(List<String> extra) {
      appArgs.addAll(extra);
    }

  }

}
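
For reference, here is a minimal usage sketch (not part of the file above). SparkSubmitCommandBuilder is package-private, so applications normally drive it indirectly through the public SparkLauncher API, which delegates to this builder to assemble the spark-submit command line. The Spark home, jar path, class name and app arguments below are placeholders.

import org.apache.spark.launcher.SparkLauncher;

public class LauncherExample {
  public static void main(String[] args) throws Exception {
    // Configure and launch an application; all values here are hypothetical.
    Process spark = new SparkLauncher()
        .setSparkHome("/opt/spark")                  // placeholder install location
        .setAppResource("/path/to/my-app.jar")       // placeholder application jar
        .setMainClass("com.example.MyApp")           // placeholder main class
        .setMaster("local[2]")
        .setConf(SparkLauncher.DRIVER_MEMORY, "2g")  // becomes --conf spark.driver.memory=2g
        .addAppArgs("input.txt")
        .launch();                                   // spawns bin/spark-submit

    // Wait for the child spark-submit process to exit.
    int exitCode = spark.waitFor();
    System.out.println("spark-submit exited with code " + exitCode);
  }
}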




