/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.streaming;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.LazyOutputFormat;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorCombiner;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorReducer;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.streaming.io.IdentifierResolver;
import org.apache.hadoop.streaming.io.InputWriter;
import org.apache.hadoop.streaming.io.OutputReader;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.RunJar;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import static org.apache.hadoop.util.RunJar.MATCH_ANY;
/** All the client-side work happens here.
* (Jar packaging, MapRed job submission and monitoring)
*/
public class StreamJob implements Tool {
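// Usage sketch (illustrative only, not authoritative): this Tool is normally
// driven through ToolRunner, which applies the generic options before run()
// is invoked. The paths and commands below are hypothetical placeholders.
//
//   int exitCode = org.apache.hadoop.util.ToolRunner.run(
//       new org.apache.hadoop.conf.Configuration(),
//       new StreamJob(),
//       new String[] {
//           "-input", "/data/in",      // hypothetical DFS input path
//           "-output", "/data/out",    // hypothetical DFS output directory
//           "-mapper", "/bin/cat",
//           "-reducer", "/usr/bin/wc"
//       });
//   System.exit(exitCode);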
protected static final Logger LOG = LoggerFactory.getLogger(StreamJob.class.getName());
final static String REDUCE_NONE = "NONE";
/** -----------Streaming CLI Implementation **/
private CommandLineParser parser = new BasicParser();
private Options allOptions;
/**@deprecated use StreamJob() with ToolRunner or set the
* Configuration using {@link #setConf(Configuration)} and
* run with {@link #run(String[])}.
*/
@Deprecated
public StreamJob(String[] argv, boolean mayExit) {
this();
argv_ = Arrays.copyOf(argv, argv.length);
this.config_ = new Configuration();
}
public StreamJob() {
setupOptions();
this.config_ = new Configuration();
}
@Override
public Configuration getConf() {
return config_;
}
@Override
public void setConf(Configuration conf) {
this.config_ = conf;
}
@Override
public int run(String[] args) throws Exception {
try {
this.argv_ = Arrays.copyOf(args, args.length);
init();
preProcessArgs();
parseArgv();
if (printUsage) {
printUsage(detailedUsage_);
return 0;
}
postProcessArgs();
setJobConf();
} catch (IllegalArgumentException ex) {
//ignore, since log will already be printed
// print the log in debug mode.
LOG.debug("Error in streaming job", ex);
return 1;
}
return submitAndMonitorJob();
}
/**
* This method creates a streaming job from the given argument list.
* The created object can be used and/or submitted to a jobtracker for
* execution by a job agent such as JobControl
* @param argv the list args for creating a streaming job
* @return the created JobConf object
* @throws IOException
*/
static public JobConf createJob(String[] argv) throws IOException {
StreamJob job = new StreamJob();
job.argv_ = argv;
job.init();
job.preProcessArgs();
job.parseArgv();
job.postProcessArgs();
job.setJobConf();
return job.jobConf_;
}
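// Illustrative sketch of using createJob() with the classic JobClient API;
// the argument values are hypothetical placeholders.
//
//   JobConf streamConf = StreamJob.createJob(new String[] {
//       "-input", "/logs/0604*/*",     // hypothetical input glob
//       "-output", "/logs/filtered",   // hypothetical output directory
//       "-mapper", "filter.pl",
//       "-numReduceTasks", "0"
//   });
//   JobClient.runJob(streamConf);      // submits and waits for completion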
/**
* This is the method that actually
* initializes the job conf and submits the job
* to the jobtracker
* @throws IOException
* @deprecated use {@link #run(String[])} instead.
*/
@Deprecated
public int go() throws IOException {
try {
return run(argv_);
}
catch (Exception ex) {
throw new IOException(ex.getMessage());
}
}
protected void init() {
try {
env_ = new Environment();
} catch (IOException io) {
throw new RuntimeException(io);
}
}
void preProcessArgs() {
verbose_ = false;
// Unset HADOOP_ROOT_LOGGER in case streaming job
// invokes additional hadoop commands.
addTaskEnvironment_ = "HADOOP_ROOT_LOGGER=";
}
void postProcessArgs() throws IOException {
if (inputSpecs_.size() == 0) {
fail("Required argument: -input ");
}
if (output_ == null) {
fail("Required argument: -output ");
}
msg("addTaskEnvironment=" + addTaskEnvironment_);
for (final String packageFile : packageFiles_) {
File f = new File(packageFile);
if (f.isFile()) {
shippedCanonFiles_.add(f.getCanonicalPath());
}
}
msg("shippedCanonFiles_=" + shippedCanonFiles_);
// careful with class names..
mapCmd_ = unqualifyIfLocalPath(mapCmd_);
comCmd_ = unqualifyIfLocalPath(comCmd_);
redCmd_ = unqualifyIfLocalPath(redCmd_);
}
String unqualifyIfLocalPath(String cmd) throws IOException {
if (cmd == null) {
//
} else {
String prog = cmd;
String args = "";
int s = cmd.indexOf(" ");
if (s != -1) {
prog = cmd.substring(0, s);
args = cmd.substring(s + 1);
}
String progCanon;
try {
progCanon = new File(prog).getCanonicalPath();
} catch (IOException io) {
progCanon = prog;
}
boolean shipped = shippedCanonFiles_.contains(progCanon);
msg("shipped: " + shipped + " " + progCanon);
if (shipped) {
// Change path to simple filename.
// That way when PipeMapRed calls Runtime.exec(),
// it will look for the executable in Task's working dir.
// And this is where TaskRunner unjars our job jar.
prog = new File(prog).getName();
if (args.length() > 0) {
cmd = prog + " " + args;
} else {
cmd = prog;
}
}
}
msg("cmd=" + cmd);
return cmd;
}
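// Example of the rewrite above (hypothetical values): if "/local/filter.pl"
// was shipped with -file, the command "/local/filter.pl -v" becomes
// "filter.pl -v", so the task-side exec finds it in the task's working dir.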
void parseArgv() {
CommandLine cmdLine = null;
try {
cmdLine = parser.parse(allOptions, argv_);
} catch(Exception oe) {
LOG.error(oe.getMessage());
exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
}
if (cmdLine != null) {
@SuppressWarnings("unchecked")
List<String> args = cmdLine.getArgList();
if(args != null && args.size() > 0) {
fail("Found " + args.size() + " unexpected arguments on the " +
"command line " + args);
}
detailedUsage_ = cmdLine.hasOption("info");
if (cmdLine.hasOption("help") || detailedUsage_) {
printUsage = true;
return;
}
verbose_ = cmdLine.hasOption("verbose");
background_ = cmdLine.hasOption("background");
debug_ = cmdLine.hasOption("debug")? debug_ + 1 : debug_;
String[] values = cmdLine.getOptionValues("input");
if (values != null && values.length > 0) {
for (String input : values) {
inputSpecs_.add(input);
}
}
output_ = cmdLine.getOptionValue("output");
mapCmd_ = cmdLine.getOptionValue("mapper");
comCmd_ = cmdLine.getOptionValue("combiner");
redCmd_ = cmdLine.getOptionValue("reducer");
lazyOutput_ = cmdLine.hasOption("lazyOutput");
values = cmdLine.getOptionValues("file");
if (values != null && values.length > 0) {
LOG.warn("-file option is deprecated, please use generic option" +
" -files instead.");
StringBuffer fileList = new StringBuffer();
for (String file : values) {
packageFiles_.add(file);
try {
Path path = new Path(file);
FileSystem localFs = FileSystem.getLocal(config_);
Path qualifiedPath = path.makeQualified(
localFs.getUri(), localFs.getWorkingDirectory());
validate(qualifiedPath);
String finalPath = qualifiedPath.toString();
if(fileList.length() > 0) {
fileList.append(',');
}
fileList.append(finalPath);
} catch (Exception e) {
throw new IllegalArgumentException(e);
}
}
String tmpFiles = config_.get("tmpfiles", "");
if (tmpFiles.isEmpty()) {
tmpFiles = fileList.toString();
} else {
tmpFiles = tmpFiles + "," + fileList;
}
config_.set("tmpfiles", tmpFiles);
}
String fsName = cmdLine.getOptionValue("dfs");
if (null != fsName){
LOG.warn("-dfs option is deprecated, please use -fs instead.");
config_.set("fs.default.name", fsName);
}
additionalConfSpec_ = cmdLine.getOptionValue("additionalconfspec");
inputFormatSpec_ = cmdLine.getOptionValue("inputformat");
outputFormatSpec_ = cmdLine.getOptionValue("outputformat");
numReduceTasksSpec_ = cmdLine.getOptionValue("numReduceTasks");
partitionerSpec_ = cmdLine.getOptionValue("partitioner");
inReaderSpec_ = cmdLine.getOptionValue("inputreader");
mapDebugSpec_ = cmdLine.getOptionValue("mapdebug");
reduceDebugSpec_ = cmdLine.getOptionValue("reducedebug");
ioSpec_ = cmdLine.getOptionValue("io");
String[] car = cmdLine.getOptionValues("cacheArchive");
if (null != car && car.length > 0){
LOG.warn("-cacheArchive option is deprecated, please use -archives instead.");
for(String s : car){
cacheArchives = (cacheArchives == null)?s :cacheArchives + "," + s;
}
}
String[] caf = cmdLine.getOptionValues("cacheFile");
if (null != caf && caf.length > 0){
LOG.warn("-cacheFile option is deprecated, please use -files instead.");
for(String s : caf){
cacheFiles = (cacheFiles == null)?s :cacheFiles + "," + s;
}
}
String[] jobconf = cmdLine.getOptionValues("jobconf");
if (null != jobconf && jobconf.length > 0){
LOG.warn("-jobconf option is deprecated, please use -D instead.");
for(String s : jobconf){
String[] parts = s.split("=", 2);
config_.set(parts[0], parts[1]);
}
}
String[] cmd = cmdLine.getOptionValues("cmdenv");
if (null != cmd && cmd.length > 0){
for(String s : cmd) {
if (addTaskEnvironment_.length() > 0) {
addTaskEnvironment_ += " ";
}
addTaskEnvironment_ += s;
}
}
} else {
exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
}
}
protected void msg(String msg) {
if (verbose_) {
System.out.println("STREAM: " + msg);
}
}
private Option createOption(String name, String desc,
String argName, int max, boolean required){
return Option.builder(name)
.argName(argName)
.hasArgs()
.numberOfArgs(max)
.desc(desc)
.required(required)
.build();
}
private Option createBoolOption(String name, String desc){
return Option.builder(name).desc(desc).build();
}
private void validate(final Path path) throws IOException {
try {
path.getFileSystem(config_).access(path, FsAction.READ);
} catch (FileNotFoundException e) {
fail("File: " + path + " does not exist.");
} catch (AccessControlException e) {
fail("File: " + path + " is not readable.");
}
}
private void setupOptions(){
// input and output are not required for -info and -help options,
// though they are required for streaming job to be run.
Option input = createOption("input",
"DFS input file(s) for the Map step",
"path",
Integer.MAX_VALUE,
false);
Option output = createOption("output",
"DFS output directory for the Reduce step",
"path", 1, false);
Option mapper = createOption("mapper",
"The streaming command to run", "cmd", 1, false);
Option combiner = createOption("combiner",
"The streaming command to run", "cmd", 1, false);
// reducer could be NONE
Option reducer = createOption("reducer",
"The streaming command to run", "cmd", 1, false);
Option file = createOption("file",
"File to be shipped in the Job jar file",
"file", Integer.MAX_VALUE, false);
Option dfs = createOption("dfs",
"Optional. Override DFS configuration", "|local", 1, false);
Option additionalconfspec = createOption("additionalconfspec",
"Optional.", "spec", 1, false);
Option inputformat = createOption("inputformat",
"Optional.", "spec", 1, false);
Option outputformat = createOption("outputformat",
"Optional.", "spec", 1, false);
Option partitioner = createOption("partitioner",
"Optional.", "spec", 1, false);
Option numReduceTasks = createOption("numReduceTasks",
"Optional.", "spec",1, false );
Option inputreader = createOption("inputreader",
"Optional.", "spec", 1, false);
Option mapDebug = createOption("mapdebug",
"Optional.", "spec", 1, false);
Option reduceDebug = createOption("reducedebug",
"Optional", "spec",1, false);
Option jobconf =
createOption("jobconf",
"(n=v) Optional. Add or override a JobConf property.",
"spec", 1, false);
Option cmdenv =
createOption("cmdenv", "(n=v) Pass env.var to streaming commands.",
"spec", 1, false);
Option cacheFile = createOption("cacheFile",
"File name URI", "fileNameURI", Integer.MAX_VALUE, false);
Option cacheArchive = createOption("cacheArchive",
"File name URI", "fileNameURI", Integer.MAX_VALUE, false);
Option io = createOption("io",
"Optional.", "spec", 1, false);
// boolean properties
Option background = createBoolOption("background", "Submit the job and don't wait till it completes.");
Option verbose = createBoolOption("verbose", "print verbose output");
Option info = createBoolOption("info", "print verbose output");
Option help = createBoolOption("help", "print this help message");
Option debug = createBoolOption("debug", "print debug output");
Option lazyOutput = createBoolOption("lazyOutput", "create outputs lazily");
allOptions = new Options().
addOption(input).
addOption(output).
addOption(mapper).
addOption(combiner).
addOption(reducer).
addOption(file).
addOption(dfs).
addOption(additionalconfspec).
addOption(inputformat).
addOption(outputformat).
addOption(partitioner).
addOption(numReduceTasks).
addOption(inputreader).
addOption(mapDebug).
addOption(reduceDebug).
addOption(jobconf).
addOption(cmdenv).
addOption(cacheFile).
addOption(cacheArchive).
addOption(io).
addOption(background).
addOption(verbose).
addOption(info).
addOption(debug).
addOption(help).
addOption(lazyOutput);
}
public void exitUsage(boolean detailed) {
printUsage(detailed);
fail("");
}
private void printUsage(boolean detailed) {
System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
+ " [options]");
System.out.println("Options:");
System.out.println(" -input DFS input file(s) for the Map"
+ " step.");
System.out.println(" -output DFS output directory for the"
+ " Reduce step.");
System.out.println(" -mapper Optional. Command"
+ " to be run as mapper.");
System.out.println(" -combiner Optional. Command"
+ " to be run as combiner.");
System.out.println(" -reducer Optional. Command"
+ " to be run as reducer.");
System.out.println(" -file Optional. File/dir to be "
+ "shipped in the Job jar file.\n" +
" Deprecated. Use generic option \"-files\" instead.");
System.out.println(" -inputformat \n"
+ " Optional. The input format class.");
System.out.println(" -outputformat \n"
+ " Optional. The output format class.");
System.out.println(" -partitioner Optional. The"
+ " partitioner class.");
System.out.println(" -numReduceTasks Optional. Number of reduce "
+ "tasks.");
System.out.println(" -inputreader Optional. Input recordreader"
+ " spec.");
System.out.println(" -cmdenv = Optional. Pass env.var to"
+ " streaming commands.");
System.out.println(" -mapdebug Optional. "
+ "To run this script when a map task fails.");
System.out.println(" -reducedebug Optional."
+ " To run this script when a reduce task fails.");
System.out.println(" -io Optional. Format to use"
+ " for input to and output");
System.out.println(" from mapper/reducer commands");
System.out.println(" -lazyOutput Optional. Lazily create Output.");
System.out.println(" -background Optional. Submit the job and don't wait till it completes.");
System.out.println(" -verbose Optional. Print verbose output.");
System.out.println(" -info Optional. Print detailed usage.");
System.out.println(" -help Optional. Print help message.");
System.out.println();
GenericOptionsParser.printGenericCommandUsage(System.out);
if (!detailed) {
System.out.println();
System.out.println("For more details about these options:");
System.out.println("Use " +
"$HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar -info");
return;
}
System.out.println();
System.out.println("Usage tips:");
System.out.println("In -input: globbing on is supported and can "
+ "have multiple -input");
System.out.println();
System.out.println("Default Map input format: a line is a record in UTF-8 "
+ "the key part ends at first");
System.out.println(" TAB, the rest of the line is the value");
System.out.println();
System.out.println("To pass a Custom input format:");
System.out.println(" -inputformat package.MyInputFormat");
System.out.println();
System.out.println("Similarly, to pass a custom output format:");
System.out.println(" -outputformat package.MyOutputFormat");
System.out.println();
System.out.println("The files with extensions .class and .jar/.zip," +
" specified for the -file");
System.out.println(" argument[s], end up in \"classes\" and \"lib\" " +
"directories respectively inside");
System.out.println(" the working directory when the mapper and reducer are"
+ " run. All other files");
System.out.println(" specified for the -file argument[s]" +
" end up in the working directory when the");
System.out.println(" mapper and reducer are run. The location of this " +
"working directory is");
System.out.println(" unspecified.");
System.out.println();
System.out.println("To set the number of reduce tasks (num. of output " +
"files) as, say 10:");
System.out.println(" Use -numReduceTasks 10");
System.out.println("To skip the sort/combine/shuffle/sort/reduce step:");
System.out.println(" Use -numReduceTasks 0");
System.out.println(" Map output then becomes a 'side-effect " +
"output' rather than a reduce input.");
System.out.println(" This speeds up processing. This also feels " +
"more like \"in-place\" processing");
System.out.println(" because the input filename and the map " +
"input order are preserved.");
System.out.println(" This is equivalent to -reducer NONE");
System.out.println();
System.out.println("To speed up the last maps:");
System.out.println(" -D " + MRJobConfig.MAP_SPECULATIVE + "=true");
System.out.println("To speed up the last reduces:");
System.out.println(" -D " + MRJobConfig.REDUCE_SPECULATIVE + "=true");
System.out.println("To name the job (appears in the JobTracker Web UI):");
System.out.println(" -D " + MRJobConfig.JOB_NAME + "='My Job'");
System.out.println("To change the local temp directory:");
System.out.println(" -D dfs.data.dir=/tmp/dfs");
System.out.println(" -D stream.tmpdir=/tmp/streaming");
System.out.println("Additional local temp directories with -jt local:");
System.out.println(" -D " + MRConfig.LOCAL_DIR + "=/tmp/local");
System.out.println(" -D " + JTConfig.JT_SYSTEM_DIR + "=/tmp/system");
System.out.println(" -D " + MRConfig.TEMP_DIR + "=/tmp/temp");
System.out.println("To treat tasks with non-zero exit status as SUCCEDED:");
System.out.println(" -D stream.non.zero.exit.is.failure=false");
System.out.println("Use a custom hadoop streaming build along with standard"
+ " hadoop install:");
System.out.println(" $HADOOP_HOME/bin/hadoop jar " +
"/path/my-hadoop-streaming.jar [...]\\");
System.out.println(" [...] -D stream.shipped.hadoopstreaming=" +
"/path/my-hadoop-streaming.jar");
System.out.println("For more details about jobconf parameters see:");
System.out.println(" http://wiki.apache.org/hadoop/JobConfFile");
System.out.println("Truncate the values of the job configuration copied" +
"to the environment at the given length:");
System.out.println(" -D stream.jobconf.truncate.limit=-1");
System.out.println("To set an environment variable in a streaming " +
"command:");
System.out.println(" -cmdenv EXAMPLE_DIR=/home/example/dictionaries/");
System.out.println();
System.out.println("Shortcut:");
System.out.println(" setenv HSTREAMING \"$HADOOP_HOME/bin/hadoop jar " +
"hadoop-streaming.jar\"");
System.out.println();
System.out.println("Example: $HSTREAMING -mapper " +
"\"/usr/local/bin/perl5 filter.pl\"");
System.out.println(" -file /local/filter.pl -input " +
"\"/logs/0604*/*\" [...]");
System.out.println(" Ships a script, invokes the non-shipped perl " +
"interpreter. Shipped files go to");
System.out.println(" the working directory so filter.pl is found by perl. "
+ "Input files are all the");
System.out.println(" daily logs for days in month 2006-04");
}
public void fail(String message) {
System.err.println(message);
System.err.println("Try -help for more information");
throw new IllegalArgumentException(message);
}
// --------------------------------------------
protected String getHadoopClientHome() {
String h = env_.getProperty("HADOOP_HOME"); // standard Hadoop
if (h == null) {
//fail("Missing required environment variable: HADOOP_HOME");
h = "UNDEF";
}
return h;
}
protected boolean isLocalHadoop() {
return StreamUtil.isLocalJobTracker(jobConf_);
}
@Deprecated
protected String getClusterNick() {
return "default";
}
/** @return path to the created Jar file or null if no files are necessary.
*/
protected String packageJobJar() throws IOException {
ArrayList<String> unjarFiles = new ArrayList<String>();
// Runtime code: ship same version of code as self (job submitter code)
// usually found in: build/contrib or build/hadoop-<version>-dev-streaming.jar
// First try an explicit spec: it's too hard to find our own location in this case:
// $HADOOP_HOME/bin/hadoop jar /not/first/on/classpath/custom-hadoop-streaming.jar
// where findInClasspath() would find the version of hadoop-streaming.jar in $HADOOP_HOME
String runtimeClasses = config_.get("stream.shipped.hadoopstreaming"); // jar or class dir
if (runtimeClasses == null) {
runtimeClasses = StreamUtil.findInClasspath(StreamJob.class.getName());
}
if (runtimeClasses == null) {
throw new IOException("runtime classes not found: " + getClass().getPackage());
} else {
msg("Found runtime classes in: " + runtimeClasses);
}
if (isLocalHadoop()) {
// don't package class files (they might get unpackaged in "." and then
// hide the intended CLASSPATH entry)
// we still package everything else (so that scripts and executable are found in
// Task workdir like distributed Hadoop)
} else {
if (new File(runtimeClasses).isDirectory()) {
packageFiles_.add(runtimeClasses);
} else {
unjarFiles.add(runtimeClasses);
}
}
if (packageFiles_.size() + unjarFiles.size() == 0) {
return null;
}
String tmp = jobConf_.get("stream.tmpdir"); //, "/tmp/${mapreduce.job.user.name}/"
File tmpDir = (tmp == null) ? null : new File(tmp);
// tmpDir=null means OS default tmp dir
File jobJar = File.createTempFile("streamjob", ".jar", tmpDir);
System.out.println("packageJobJar: " + packageFiles_ + " " + unjarFiles + " " + jobJar
+ " tmpDir=" + tmpDir);
if (debug_ == 0) {
jobJar.deleteOnExit();
}
JarBuilder builder = new JarBuilder();
if (verbose_) {
builder.setVerbose(true);
}
String jobJarName = jobJar.getAbsolutePath();
builder.merge(packageFiles_, unjarFiles, jobJarName);
return jobJarName;
}
/**
* get the uris of all the files/caches
*/
protected void getURIs(String lcacheArchives, String lcacheFiles) {
String archives[] = StringUtils.getStrings(lcacheArchives);
String files[] = StringUtils.getStrings(lcacheFiles);
fileURIs = StringUtils.stringToURI(files);
archiveURIs = StringUtils.stringToURI(archives);
}
protected void setJobConf() throws IOException {
if (additionalConfSpec_ != null) {
LOG.warn("-additionalconfspec option is deprecated, please use -conf instead.");
config_.addResource(new Path(additionalConfSpec_));
}
// general MapRed job properties
jobConf_ = new JobConf(config_, StreamJob.class);
// All streaming jobs get the task timeout value
// from the configuration settings.
// The correct FS must be set before this is called!
// (to resolve local vs. dfs drive letter differences)
// (mapreduce.job.working.dir will be lazily initialized ONCE and depends on FS)
for (int i = 0; i < inputSpecs_.size(); i++) {
FileInputFormat.addInputPaths(jobConf_,
(String) inputSpecs_.get(i));
}
String defaultPackage = this.getClass().getPackage().getName();
Class c;
Class fmt = null;
if (inReaderSpec_ == null && inputFormatSpec_ == null) {
fmt = TextInputFormat.class;
} else if (inputFormatSpec_ != null) {
if (inputFormatSpec_.equals(TextInputFormat.class.getName())
|| inputFormatSpec_.equals(TextInputFormat.class.getCanonicalName())
|| inputFormatSpec_.equals(TextInputFormat.class.getSimpleName())) {
fmt = TextInputFormat.class;
} else if (inputFormatSpec_.equals(KeyValueTextInputFormat.class
.getName())
|| inputFormatSpec_.equals(KeyValueTextInputFormat.class
.getCanonicalName())
|| inputFormatSpec_.equals(KeyValueTextInputFormat.class.getSimpleName())) {
if (inReaderSpec_ == null) {
fmt = KeyValueTextInputFormat.class;
}
} else if (inputFormatSpec_.equals(SequenceFileInputFormat.class
.getName())
|| inputFormatSpec_
.equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class
.getCanonicalName())
|| inputFormatSpec_
.equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getSimpleName())) {
if (inReaderSpec_ == null) {
fmt = SequenceFileInputFormat.class;
}
} else if (inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class
.getName())
|| inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class
.getCanonicalName())
|| inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getSimpleName())) {
fmt = SequenceFileAsTextInputFormat.class;
} else {
c = StreamUtil.goodClassOrNull(jobConf_, inputFormatSpec_, defaultPackage);
if (c != null) {
fmt = c;
} else {
fail("-inputformat : class not found : " + inputFormatSpec_);
}
}
}
if (fmt == null) {
fmt = StreamInputFormat.class;
}
jobConf_.setInputFormat(fmt);
if (ioSpec_ != null) {
jobConf_.set("stream.map.input", ioSpec_);
jobConf_.set("stream.map.output", ioSpec_);
jobConf_.set("stream.reduce.input", ioSpec_);
jobConf_.set("stream.reduce.output", ioSpec_);
}
Class<? extends IdentifierResolver> idResolverClass =
jobConf_.getClass("stream.io.identifier.resolver.class",
IdentifierResolver.class, IdentifierResolver.class);
IdentifierResolver idResolver = ReflectionUtils.newInstance(idResolverClass, jobConf_);
idResolver.resolve(jobConf_.get("stream.map.input", IdentifierResolver.TEXT_ID));
jobConf_.setClass("stream.map.input.writer.class",
idResolver.getInputWriterClass(), InputWriter.class);
idResolver.resolve(jobConf_.get("stream.reduce.input", IdentifierResolver.TEXT_ID));
jobConf_.setClass("stream.reduce.input.writer.class",
idResolver.getInputWriterClass(), InputWriter.class);
jobConf_.set("stream.addenvironment", addTaskEnvironment_);
boolean isMapperACommand = false;
if (mapCmd_ != null) {
c = StreamUtil.goodClassOrNull(jobConf_, mapCmd_, defaultPackage);
if (c != null) {
jobConf_.setMapperClass(c);
} else {
isMapperACommand = true;
jobConf_.setMapperClass(PipeMapper.class);
jobConf_.setMapRunnerClass(PipeMapRunner.class);
jobConf_.set("stream.map.streamprocessor",
URLEncoder.encode(mapCmd_, "UTF-8"));
}
}
if (comCmd_ != null) {
c = StreamUtil.goodClassOrNull(jobConf_, comCmd_, defaultPackage);
if (c != null) {
jobConf_.setCombinerClass(c);
} else {
jobConf_.setCombinerClass(PipeCombiner.class);
jobConf_.set("stream.combine.streamprocessor", URLEncoder.encode(
comCmd_, "UTF-8"));
}
}
if (numReduceTasksSpec_!= null) {
int numReduceTasks = Integer.parseInt(numReduceTasksSpec_);
jobConf_.setNumReduceTasks(numReduceTasks);
}
boolean isReducerACommand = false;
if (redCmd_ != null) {
if (redCmd_.equals(REDUCE_NONE)) {
jobConf_.setNumReduceTasks(0);
}
if (jobConf_.getNumReduceTasks() != 0) {
if (redCmd_.compareToIgnoreCase("aggregate") == 0) {
jobConf_.setReducerClass(ValueAggregatorReducer.class);
jobConf_.setCombinerClass(ValueAggregatorCombiner.class);
} else {
c = StreamUtil.goodClassOrNull(jobConf_, redCmd_, defaultPackage);
if (c != null) {
jobConf_.setReducerClass(c);
} else {
isReducerACommand = true;
jobConf_.setReducerClass(PipeReducer.class);
jobConf_.set("stream.reduce.streamprocessor", URLEncoder.encode(
redCmd_, "UTF-8"));
}
}
}
}
idResolver.resolve(jobConf_.get("stream.map.output",
IdentifierResolver.TEXT_ID));
jobConf_.setClass("stream.map.output.reader.class",
idResolver.getOutputReaderClass(), OutputReader.class);
if (isMapperACommand || jobConf_.get("stream.map.output") != null) {
// if mapper is a command, then map output key/value classes come from the
// idResolver
jobConf_.setMapOutputKeyClass(idResolver.getOutputKeyClass());
jobConf_.setMapOutputValueClass(idResolver.getOutputValueClass());
if (jobConf_.getNumReduceTasks() == 0) {
jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
}
}
idResolver.resolve(jobConf_.get("stream.reduce.output",
IdentifierResolver.TEXT_ID));
jobConf_.setClass("stream.reduce.output.reader.class",
idResolver.getOutputReaderClass(), OutputReader.class);
if (isReducerACommand || jobConf_.get("stream.reduce.output") != null) {
// if reducer is a command, then output key/value classes come from the
// idResolver
jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
}
if (inReaderSpec_ != null) {
String[] args = inReaderSpec_.split(",");
String readerClass = args[0];
// this argument can only be a Java class
c = StreamUtil.goodClassOrNull(jobConf_, readerClass, defaultPackage);
if (c != null) {
jobConf_.set("stream.recordreader.class", c.getName());
} else {
fail("-inputreader: class not found: " + readerClass);
}
for (int i = 1; i < args.length; i++) {
String[] nv = args[i].split("=", 2);
String k = "stream.recordreader." + nv[0];
String v = (nv.length > 1) ? nv[1] : "";
jobConf_.set(k, v);
}
}
FileOutputFormat.setOutputPath(jobConf_, new Path(output_));
fmt = null;
if (outputFormatSpec_!= null) {
c = StreamUtil.goodClassOrNull(jobConf_, outputFormatSpec_, defaultPackage);
if (c != null) {
fmt = c;
} else {
fail("-outputformat : class not found : " + outputFormatSpec_);
}
}
if (fmt == null) {
fmt = TextOutputFormat.class;
}
if (lazyOutput_) {
LazyOutputFormat.setOutputFormatClass(jobConf_, fmt);
} else {
jobConf_.setOutputFormat(fmt);
}
if (partitionerSpec_!= null) {
c = StreamUtil.goodClassOrNull(jobConf_, partitionerSpec_, defaultPackage);
if (c != null) {
jobConf_.setPartitionerClass(c);
} else {
fail("-partitioner : class not found : " + partitionerSpec_);
}
}
if(mapDebugSpec_ != null){
jobConf_.setMapDebugScript(mapDebugSpec_);
}
if(reduceDebugSpec_ != null){
jobConf_.setReduceDebugScript(reduceDebugSpec_);
}
// last, allow user to override anything
// (although typically used with properties we didn't touch)
jar_ = packageJobJar();
if (jar_ != null) {
jobConf_.setJar(jar_);
}
if ((cacheArchives != null) || (cacheFiles != null)){
getURIs(cacheArchives, cacheFiles);
boolean b = DistributedCache.checkURIs(fileURIs, archiveURIs);
if (!b)
fail(LINK_URI);
}
// set the jobconf for the caching parameters
if (cacheArchives != null) {
Job.setCacheArchives(archiveURIs, jobConf_);
}
if (cacheFiles != null) {
Job.setCacheFiles(fileURIs, jobConf_);
}
if (verbose_) {
listJobConfProperties();
}
msg("submitting to jobconf: " + getJobTrackerHostPort());
}
/**
* Prints out the jobconf properties on stdout
* when verbose is specified.
*/
protected void listJobConfProperties()
{
msg("==== JobConf properties:");
TreeMap<String, String> sorted = new TreeMap<String, String>();
for (final Map.Entry<String, String> en : jobConf_) {
sorted.put(en.getKey(), en.getValue());
}
for (final Map.Entry<String, String> en : sorted.entrySet()) {
msg(en.getKey() + "=" + en.getValue());
}
msg("====");
}
protected String getJobTrackerHostPort() {
return jobConf_.get(JTConfig.JT_IPC_ADDRESS);
}
// Based on JobClient
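// Return codes used below: 0 = success, 1 = job did not complete successfully,
// 2 = bad input path, 3 = invalid job conf, 4 = output path already exists,
// 5 = other I/O error while launching, 6 = interrupted while monitoring.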
public int submitAndMonitorJob() throws IOException {
if (jar_ != null && isLocalHadoop()) {
// getAbs became required when shell and subvm have different working dirs...
File wd = new File(".").getAbsoluteFile();
RunJar.unJar(new File(jar_), wd, MATCH_ANY);
}
// if jobConf_ changes must recreate a JobClient
jc_ = new JobClient(jobConf_);
running_ = null;
try {
running_ = jc_.submitJob(jobConf_);
jobId_ = running_.getID();
if (background_) {
LOG.info("Job is running in background.");
} else if (!jc_.monitorAndPrintJob(jobConf_, running_)) {
LOG.error("Job not successful!");
return 1;
}
LOG.info("Output directory: " + output_);
} catch(FileNotFoundException fe) {
LOG.error("Error launching job , bad input path : " + fe.getMessage());
return 2;
} catch(InvalidJobConfException je) {
LOG.error("Error launching job , Invalid job conf : " + je.getMessage());
return 3;
} catch(FileAlreadyExistsException fae) {
LOG.error("Error launching job , Output path already exists : "
+ fae.getMessage());
return 4;
} catch(IOException ioe) {
LOG.error("Error Launching job : " + ioe.getMessage());
return 5;
} catch (InterruptedException ie) {
LOG.error("Error monitoring job : " + ie.getMessage());
return 6;
} finally {
jc_.close();
}
return 0;
}
protected String[] argv_;
protected boolean background_;
protected boolean verbose_;
protected boolean detailedUsage_;
protected boolean printUsage = false;
protected int debug_;
protected Environment env_;
protected String jar_;
protected boolean localHadoop_;
protected Configuration config_;
protected JobConf jobConf_;
protected JobClient jc_;
// command-line arguments
protected ArrayList<String> inputSpecs_ = new ArrayList<String>();
protected TreeSet<String> seenPrimary_ = new TreeSet<String>();
protected boolean hasSimpleInputSpecs_;
protected ArrayList<String> packageFiles_ = new ArrayList<String>();
protected ArrayList<String> shippedCanonFiles_ = new ArrayList<String>();
//protected TreeMap<String, String> userJobConfProps_ = new TreeMap<String, String>();
protected String output_;
protected String mapCmd_;
protected String comCmd_;
protected String redCmd_;
protected String cacheFiles;
protected String cacheArchives;
protected URI[] fileURIs;
protected URI[] archiveURIs;
protected String inReaderSpec_;
protected String inputFormatSpec_;
protected String outputFormatSpec_;
protected String partitionerSpec_;
protected String numReduceTasksSpec_;
protected String additionalConfSpec_;
protected String mapDebugSpec_;
protected String reduceDebugSpec_;
protected String ioSpec_;
protected boolean lazyOutput_;
// Use to communicate config to the external processes (ex env.var.HADOOP_USER)
// encoding "a=b c=d"
protected String addTaskEnvironment_;
protected boolean outputSingleNode_;
protected long minRecWrittenToEnableSkip_;
protected RunningJob running_;
protected JobID jobId_;
protected static final String LINK_URI = "You need to specify the uris as scheme://path#linkname. " +
"Please specify a different link name for all of your caching URIs";
}