ml.shifu.guagua.yarn.GuaguaOptionsParser Maven / Gradle / Ivy
/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.yarn;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* GuaguaOptionsParser
is a utility to parse command line arguments generic to the Hadoop framework.
*
* GenericOptionsParser
recognizes several standard command line arguments, enabling applications to easily
* specify a namenode, a jobtracker, additional configuration resources etc.
*
* Generic Options
*
*
* The supported generic options are:
*
*
*
*
*
* -conf <configuration file> specify a configuration file
* -D <property=value> use value for given property
* -fs <local|namenode:port> specify a namenode
* -jt <local|jobtracker:port> specify a job tracker
* -files <comma separated list of files> specify comma separated
* files to be copied to the map reduce cluster
* -libjars <comma separated list of jars> specify comma separated
* jar files to include in the classpath.
* -archives <comma separated list of archives> specify comma
* separated archives to be unarchived on the compute machines.
*
*
*
*
*
*
*
* The general command line syntax is:
*
*
*
* bin/hadoop command [genericOptions] [commandOptions]
*
*
*
*
* Generic command line arguments might modify Configuration
objects, given to
* constructors.
*
*
*
* The functionality is implemented using Commons CLI.
*
*
*
* Examples:
*
*
*
*
*
* $ bin/hadoop dfs -fs darwin:8020 -ls /data
* list /data directory in dfs with namenode darwin:8020
*
* $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
* list /data directory in dfs with namenode darwin:8020
*
* $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
* list /data directory in dfs with conf specified in hadoop-site.xml
*
* $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
* submit a job to job tracker darwin:50020
*
* $ bin/hadoop job -jt darwin:50020 -submit job.xml
* submit a job to job tracker darwin:50020
*
* $ bin/hadoop job -jt local -submit job.xml
* submit a job to local runner
*
* $ bin/hadoop jar -libjars testlib.jar -archives test.tgz -files file.txt inputjar args
* job submission with libjars, files and archives
*
*
*
*
*
* Copy from hadoop GenericOptionsParser and extend it.
*
* @see Tool
* @see ToolRunner
*
* @deprecated use {@link ml.shifu.guagua.hadoop.io.GuaguaOptionsParser}
*/
@Deprecated
public class GuaguaOptionsParser {
private static final Logger LOG = LoggerFactory.getLogger(GuaguaOptionsParser.class);
private static final String FILE_SEPERATOR = ",";
private Configuration conf;
private CommandLine commandLine;
/**
* Create an options parser with the given options to parse the args.
*
* @param opts
* the options
* @param args
* the command line arguments
* @throws IOException
*/
public GuaguaOptionsParser(Options opts, String[] args) throws IOException {
this(new Configuration(), new Options(), args);
}
/**
* Create an options parser to parse the args.
*
* @param args
* the command line arguments
* @throws IOException
*/
public GuaguaOptionsParser(String[] args) throws IOException {
this(new Configuration(), new Options(), args);
}
/**
* Create a GuaguaOptionsParser to parse only the generic Hadoop
* arguments.
*
* The array of string arguments other than the generic arguments can be
* obtained by {@link #getRemainingArgs()}.
*
* @param conf
* the Configuration
to modify.
* @param args
* command-line arguments.
* @throws IOException
*/
public GuaguaOptionsParser(Configuration conf, String[] args) throws IOException {
this(conf, new Options(), args);
}
/**
* Create a GuaguaOptionsParser
to parse given options as well as generic Hadoop options.
*
* The resulting CommandLine
object can be obtained by {@link #getCommandLine()}.
*
* @param conf
* the configuration to modify
* @param options
* options built by the caller
* @param args
* User-specified arguments
* @throws IOException
*/
public GuaguaOptionsParser(Configuration conf, Options options, String[] args) throws IOException {
parseGeneralOptions(options, conf, args);
this.conf = conf;
}
/**
* Returns an array of Strings containing only application-specific arguments.
*
* @return array of String
s containing the un-parsed arguments or empty array if
* commandLine was not defined.
*/
public String[] getRemainingArgs() {
return (commandLine == null) ? new String[] {} : commandLine.getArgs();
}
/**
* Get the modified configuration
*
* @return the configuration that has the modified parameters.
*/
public Configuration getConfiguration() {
return conf;
}
/**
* Returns the commons-cli CommandLine
object to process the parsed arguments.
*
* Note: If the object is created with {@link #GuaguaOptionsParser(Configuration, String[])}, then returned
* object will only contain parsed generic options.
*
* @return CommandLine
representing list of arguments parsed against Options descriptor.
*/
public CommandLine getCommandLine() {
return commandLine;
}
/**
* Specify properties of each generic option
*/
@SuppressWarnings("static-access")
private static Options buildGeneralOptions(Options opts) {
Option fs = OptionBuilder.withArgName("local|namenode:port").hasArg().withDescription("specify a namenode")
.create("fs");
Option jt = OptionBuilder.withArgName("local|jobtracker:port").hasArg()
.withDescription("specify a job tracker").create("jt");
Option oconf = OptionBuilder.withArgName("configuration file").hasArg()
.withDescription("specify an application configuration file").create("conf");
Option property = OptionBuilder.withArgName("property=value").hasArg()
.withDescription("use value for given property").create('D');
Option libjars = OptionBuilder.withArgName("paths").hasArg()
.withDescription("comma separated jar files to include in the classpath.").create("libjars");
Option files = OptionBuilder.withArgName("paths").hasArg()
.withDescription("comma separated files to be copied to the " + "map reduce cluster").create("files");
Option archives = OptionBuilder.withArgName("paths").hasArg()
.withDescription("comma separated archives to be unarchived" + " on the compute machines.")
.create("archives");
// file with security tokens
Option tokensFile = OptionBuilder.withArgName("tokensFile").hasArg()
.withDescription("name of the file with the tokens").create("tokenCacheFile");
Option input = OptionBuilder.withArgName("paths").hasArg().withDescription("specify input folder").create("i");
Option zk = OptionBuilder.withArgName("zkserverhost:port,zkserverhost:port").hasArg()
.withDescription("specify zookeeper servers").create("z");
Option worker = OptionBuilder.withArgName("class name").hasArg().withDescription("specify worker class name")
.create("w");
Option master = OptionBuilder.withArgName("class name").hasArg().withDescription("specify master class name")
.create("m");
Option masterResult = OptionBuilder.withArgName("class name").hasArg()
.withDescription("specify master result class name").create("mr");
Option workerResult = OptionBuilder.withArgName("class name").hasArg()
.withDescription("specify worker result class name").create("wr");
Option iteration = OptionBuilder.withArgName("1").hasArg().withDescription("specify iteration count")
.create("c");
Option name = OptionBuilder.withArgName("job name").hasArg().withDescription("specify job name").create("n");
Option inputformat = OptionBuilder.withArgName("class name").hasArg()
.withDescription("specify input format class name").create("inputformat");
opts.addOption(fs);
opts.addOption(jt);
opts.addOption(oconf);
opts.addOption(property);
opts.addOption(libjars);
opts.addOption(files);
opts.addOption(archives);
opts.addOption(tokensFile);
opts.addOption(input);
opts.addOption(zk);
opts.addOption(worker);
opts.addOption(master);
opts.addOption(masterResult);
opts.addOption(workerResult);
opts.addOption(iteration);
opts.addOption(name);
opts.addOption(inputformat);
return opts;
}
/**
* Modify configuration according user-specified generic options
*
* @param conf
* Configuration to be modified
* @param line
* User-specified generic options
*/
private void processGeneralOptions(Configuration conf, CommandLine line) throws IOException {
if(line.hasOption("fs")) {
FileSystem.setDefaultUri(conf, line.getOptionValue("fs"));
}
if(line.hasOption("jt")) {
conf.set("mapred.job.tracker", line.getOptionValue("jt"));
}
if(line.hasOption("conf")) {
String[] values = line.getOptionValues("conf");
for(String value: values) {
conf.addResource(new Path(value));
}
}
if(line.hasOption("libjars")) {
conf.set("tmpjars", validateFiles(line.getOptionValue("libjars"), conf));
// setting libjars in client classpath
URL[] libjars = getLibJars(conf);
if(libjars != null && libjars.length > 0) {
conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader()));
Thread.currentThread().setContextClassLoader(
new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader()));
}
}
if(line.hasOption("files")) {
conf.set("tmpfiles", validateFiles(line.getOptionValue("files"), conf));
}
if(line.hasOption("archives")) {
conf.set("tmparchives", validateFiles(line.getOptionValue("archives"), conf));
}
if(line.hasOption('D')) {
String[] property = line.getOptionValues('D');
for(String prop: property) {
String[] keyval = prop.split("=", 2);
if(keyval.length == 2) {
conf.set(keyval[0], keyval[1]);
}
}
}
conf.setBoolean("mapred.used.genericoptionsparser", true);
// tokensFile
if(line.hasOption("tokenCacheFile")) {
String fileName = line.getOptionValue("tokenCacheFile");
// check if the local file exists
try {
FileSystem localFs = FileSystem.getLocal(conf);
Path p = new Path(fileName);
if(!localFs.exists(p)) {
throw new FileNotFoundException("File " + fileName + " does not exist.");
}
LOG.debug("setting conf tokensFile: {}", fileName);
conf.set("mapreduce.job.credentials.json", localFs.makeQualified(p).toString());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
/**
* If libjars are set in the conf, parse the libjars.
*/
public static URL[] getLibJars(Configuration conf) throws IOException {
String jars = conf.get("tmpjars");
if(jars == null) {
return null;
}
String[] files = jars.split(FILE_SEPERATOR);
List cp = new ArrayList();
for(String file: files) {
Path tmp = new Path(file);
if(tmp.getFileSystem(conf).equals(FileSystem.getLocal(conf))) {
cp.add(FileSystem.getLocal(conf).pathToFile(tmp).toURI().toURL());
}
}
return cp.toArray(new URL[0]);
}
/**
* Take input as a comma separated list of files and verifies if they exist. It defaults for file:/// if the files
* specified do not have a scheme. it returns the paths uri converted defaulting to file:///. So an input of
* /home/user/file1,/home/user/file2 would return file:///home/user/file1,file:///home/user/file2
*/
private String validateFiles(String files, Configuration conf) throws IOException {
if(files == null)
return null;
String[] fileArr = files.split(FILE_SEPERATOR);
String[] finalArr = new String[fileArr.length];
for(int i = 0; i < fileArr.length; i++) {
String tmp = fileArr[i];
String finalPath;
URI pathURI;
try {
pathURI = new URI(tmp);
} catch (URISyntaxException e) {
throw new IllegalArgumentException(e);
}
Path path = new Path(pathURI);
FileSystem localFs = FileSystem.getLocal(conf);
if(pathURI.getScheme() == null) {
// default to the local file system
// check if the file exists or not first
if(!localFs.exists(path)) {
throw new FileNotFoundException("File " + tmp + " does not exist.");
}
finalPath = path.makeQualified(localFs).toString();
} else {
// check if the file exists in this file system
// we need to recreate this filesystem object to copy
// these files to the file system jobtracker is running
// on.
FileSystem fs = path.getFileSystem(conf);
if(!fs.exists(path)) {
throw new FileNotFoundException("File " + tmp + " does not exist.");
}
finalPath = path.makeQualified(fs).toString();
}
finalArr[i] = finalPath;
}
return StringUtils.arrayToString(finalArr);
}
/**
* Parse the user-specified options, get the generic options, and modify configuration accordingly
*
* @param conf
* Configuration to be modified
* @param args
* User-specified arguments
* @return Command-specific arguments
*/
private String[] parseGeneralOptions(Options opts, Configuration conf, String[] args) throws IOException {
opts = buildGeneralOptions(opts);
CommandLineParser parser = new GnuParser();
try {
commandLine = parser.parse(opts, args, true);
processGeneralOptions(conf, commandLine);
return commandLine.getArgs();
} catch (ParseException e) {
LOG.warn("options parsing failed: {}", e.getMessage());
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("general options are: ", opts);
}
return args;
}
/**
* Print the usage message for generic command-line options supported.
*
* @param out
* stream to print the usage message to.
*/
public static void printGenericCommandUsage(PrintStream out) {
out.println("Generic options supported are");
out.println("-conf specify an application configuration file");
out.println("-D use value for given property");
out.println("-fs specify a namenode");
out.println("-jt specify a job tracker");
out.println("-files specify comma separated files to be copied to the map reduce cluster");
out.println("-libjars specify comma separated jar files to include in the classpath.");
out.println("-archives specify comma separated archives to be unarchived on the compute machines.");
out.println("-i specify input folder or input file.");
out.println("-z specify zookeeper servers.");
out.println("-w specify worker class name.");
out.println("-m specify master class name.");
out.println("-mr specify master result class name.");
out.println("-wr specify worker result class name.");
out.println("-c specify maximal iteration count.");
out.println("-n specify job name.");
out.println("-inputformat specify inputformat class name.\n");
out.println("The general command line syntax is");
out.println("bin/hadoop command [genericOptions] [commandOptions]\n");
}
}