
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.aliyun.odps.mapred.bridge.streaming;
import static com.aliyun.odps.mapred.utils.UTF8ByteArrayUtils.unescapeSeparator;
import java.io.File;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.UUID;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.Resource;
import com.aliyun.odps.conf.Configuration;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.bridge.MetaExplorer;
import com.aliyun.odps.mapred.bridge.MetaExplorerImpl;
import com.aliyun.odps.mapred.bridge.streaming.io.InputWriter;
import com.aliyun.odps.mapred.bridge.streaming.io.OutputReader;
import com.aliyun.odps.mapred.bridge.streaming.io.RecordInputWriter;
import com.aliyun.odps.mapred.bridge.streaming.io.RecordOutputReader;
import com.aliyun.odps.mapred.bridge.streaming.io.TextInputWriter;
import com.aliyun.odps.mapred.bridge.streaming.io.TextOutputReader;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.conf.SessionState;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
/**
* All the client-side work happens here.
* (Jar packaging, MapRed job submission and monitoring)
*/
public class StreamJob {
protected static final Log LOG = LogFactory.getLog(StreamJob.class.getName());
final static String REDUCE_NONE = "NONE";
/**
 * ----------- Streaming CLI Implementation -----------
 */
private CommandLineParser parser = new BasicParser();
private Options allOptions;
public StreamJob() {
setupOptions();
this.config_ = new JobConf();
}
public Configuration getConf() {
return config_;
}
public int run(String[] args) throws Exception {
for (String aa : args) {
LOG.debug("arg: '" + aa + "'");
}
try {
this.argv_ = Arrays.copyOf(args, args.length);
init();
preProcessArgs();
parseArgv();
if (printUsage) {
printUsage(detailedUsage_);
return 0;
}
postProcessArgs();
setJobConf();
} catch (IllegalArgumentException ex) {
// ignore: the usage/error message has already been printed;
// log the full stack trace in debug mode.
LOG.debug("Error in streaming job", ex);
ex.printStackTrace();
return 1;
}
return submitAndMonitorJob();
}
/**
* This method creates a streaming job from the given argument list.
* The created object can be used and/or submitted to a jobtracker for
* execution by a job agent such as JobControl
*
* @param argv
* the list of arguments for creating a streaming job
* @return the created JobConf object
* @throws IOException
*/
static public JobConf createJob(String[] argv) throws IOException {
StreamJob job = new StreamJob();
job.argv_ = argv;
job.init();
job.preProcessArgs();
job.parseArgv();
job.postProcessArgs();
job.setJobConf();
return job.jobConf_;
}
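// Example usage (hypothetical table names):
// JobConf conf = StreamJob.createJob(new String[]{
//     "-input", "src_tbl", "-output", "dst_tbl", "-mapper", "cat"});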
protected void init() {
try {
env_ = new Environment();
} catch (IOException io) {
throw new RuntimeException(io);
}
}
void preProcessArgs() {
verbose_ = false;
// Unset HADOOP_ROOT_LOGGER in case streaming job
// invokes additional hadoop commands.
addTaskEnvironment_ = "HADOOP_ROOT_LOGGER=";
}
void postProcessArgs() throws IOException {
msg("addTaskEnvironment=" + addTaskEnvironment_);
for (final String packageFile : packageFiles_) {
File f = new File(packageFile);
if (f.isFile()) {
shippedCanonFiles_.add(f.getCanonicalPath());
}
}
msg("shippedCanonFiles_=" + shippedCanonFiles_);
// careful with class names..
mapCmd_ = unqualifyIfLocalPath(mapCmd_);
comCmd_ = unqualifyIfLocalPath(comCmd_);
redCmd_ = unqualifyIfLocalPath(redCmd_);
}
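// If the command's program was shipped via -file, reduce it to a bare filename so
// the task-side Runtime.exec() resolves it in the task working directory,
// e.g. (hypothetical path): "/home/me/mapper.py -k1" becomes "mapper.py -k1".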
String unqualifyIfLocalPath(String cmd) throws IOException {
if (cmd == null) {
//
} else {
String prog = cmd;
String args = "";
int s = cmd.indexOf(" ");
if (s != -1) {
prog = cmd.substring(0, s);
args = cmd.substring(s + 1);
}
String progCanon;
try {
progCanon = new File(prog).getCanonicalPath();
} catch (IOException io) {
progCanon = prog;
}
boolean shipped = shippedCanonFiles_.contains(progCanon);
msg("shipped: " + shipped + " " + progCanon);
if (shipped) {
// Change path to simple filename.
// That way when PipeMapRed calls Runtime.exec(),
// it will look for the executable in Task's working dir.
// And this is where TaskRunner unjars our job jar.
prog = new File(prog).getName();
if (args.length() > 0) {
cmd = prog + " " + args;
} else {
cmd = prog;
}
}
}
msg("cmd=" + cmd);
return cmd;
}
void parseArgv() {
CommandLine cmdLine = null;
try {
cmdLine = parser.parse(allOptions, argv_);
} catch (Exception oe) {
LOG.error(oe.getMessage());
exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
}
if (cmdLine == null) {
exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
return;
}
@SuppressWarnings("unchecked")
List<String> args = cmdLine.getArgList();
if (args != null && args.size() > 0) {
fail("Found " + args.size() + " unexpected arguments on the " +
"command line " + args);
}
detailedUsage_ = cmdLine.hasOption("info");
if (cmdLine.hasOption("help") || detailedUsage_) {
printUsage = true;
return;
}
verbose_ = cmdLine.hasOption("verbose");
background_ = cmdLine.hasOption("background");
debug_ = cmdLine.hasOption("debug") ? debug_ + 1 : debug_;
output_ = cmdLine.getOptionValue("output");
comCmd_ = cmdLine.getOptionValue("combiner");
redCmd_ = cmdLine.getOptionValue("reducer");
lazyOutput_ = cmdLine.hasOption("lazyOutput");
String[] values = cmdLine.getOptionValues("file");
SessionState ss = SessionState.get();
MetaExplorer metaExplorer = new MetaExplorerImpl(ss.getOdps());
Map<String, String> aliasToTempResource = new HashMap<String, String>();
String padding = "_" + UUID.randomUUID().toString();
if (values != null && values.length > 0) {
for (int i = 0; i < values.length; i++) {
String file = values[i];
packageFiles_.add(file);
try {
aliasToTempResource.put(FilenameUtils.getName(file),
metaExplorer.addFileResourceWithRetry(file, Resource.Type.FILE,
padding, true));
} catch (OdpsException e) {
throw new RuntimeException(e);
}
}
config_.set("stream.temp.resource.alias",
new GsonBuilder().disableHtmlEscaping().create().toJson(aliasToTempResource));
String[] res = config_.getResources();
Set<String> resources = aliasToTempResource.keySet();
if (res != null) {
config_.setResources(StringUtils.join(res, ",") + "," + StringUtils.join(resources, ","));
} else {
config_.setResources(StringUtils.join(resources, ","));
}
}
additionalConfSpec_ = cmdLine.getOptionValue("additionalconfspec");
numReduceTasksSpec_ = cmdLine.getOptionValue("numReduceTasks");
partitionerSpec_ = cmdLine.getOptionValue("partitioner");
mapDebugSpec_ = cmdLine.getOptionValue("mapdebug");
reduceDebugSpec_ = cmdLine.getOptionValue("reducedebug");
ioSpec_ = cmdLine.getOptionValue("io");
String[] car = cmdLine.getOptionValues("cacheArchive");
if (null != car) {
fail("no -cacheArchive option any more, please use -resources instead.");
}
String[] caf = cmdLine.getOptionValues("cacheFile");
if (null != caf) {
fail("no -cacheFile option any more, please use -resources instead.");
}
mapCmd_ = cmdLine.getOptionValue("mapper");
String[] cmd = cmdLine.getOptionValues("cmdenv");
if (null != cmd && cmd.length > 0) {
for (String s : cmd) {
if (addTaskEnvironment_.length() > 0) {
addTaskEnvironment_ += " ";
}
addTaskEnvironment_ += s;
}
}
// per table input config
Map<String, Map<String, String>> inputConfigs =
new HashMap<String, Map<String, String>>();
String[] columns = null;
for (Option opt : cmdLine.getOptions()) {
if ("jobconf".equals(opt.getOpt())) {
String[] jobconf = opt.getValues();
if (null != jobconf && jobconf.length > 0) {
for (String s : jobconf) {
String[] parts = s.split("=", 2);
config_.set(parts[0], parts[1]);
}
}
} else if ("columns".equals(opt.getOpt())) {
String columnsValue = opt.getValue();
if (columnsValue.equals("ALL")) {
columns = null;
} else {
columns = columnsValue.split(",");
}
} else if ("input".equals(opt.getOpt())) {
values = opt.getValues();
if (values != null && values.length > 0) {
for (String input : values) {
TableInfo ti = parseTableInfo(input);
if (columns != null) {
ti.setCols(columns);
}
inputSpecs_.add(ti);
String inputKey = (ti.getProjectName() + "." + ti.getTableName()).toLowerCase();
// XXX only apply once per table
if (inputConfigs.get(inputKey) != null) {
continue;
}
Map<String, String> inputConfig = new HashMap<String, String>();
inputConfig.put("stream.map.input.field.separator",
config_.get("stream.map.input.field.separator", "\t"));
// TODO other per table input config: cols, etc.
inputConfigs.put(inputKey, inputConfig);
}
}
}
}
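// The serialized value looks like (illustrative):
// {"my_prj.my_tbl": {"stream.map.input.field.separator": "\t"}}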
try {
config_.set("stream.map.input.configs",
new GsonBuilder().disableHtmlEscaping().create().toJson(inputConfigs));
} catch (Exception e) {
throw new RuntimeException("fail to set input configs");
}
}
protected void msg(String msg) {
if (verbose_) {
System.out.println("STREAM: " + msg);
}
}
private Option createOption(String name, String desc,
String argName, int max, boolean required) {
return OptionBuilder
.withArgName(argName)
.hasArgs(max)
.withDescription(desc)
.isRequired(required)
.create(name);
}
private Option createBoolOption(String name, String desc) {
return OptionBuilder.withDescription(desc).create(name);
}
private void setupOptions() {
// input and output are not required for -info and -help options,
// though they are required for streaming job to be run.
Option input = createOption("input",
"Input tables/partitions for the Map step",
"path",
Integer.MAX_VALUE,
false);
Option columns = createOption("columns",
"Input table column names for the Map step",
"spec",
1,
false);
Option output = createOption("output",
"Result table/partition for the Reduce step",
"path", 1, false);
Option mapper = createOption("mapper",
"The streaming command to run", "cmd", 1, false);
Option combiner = createOption("combiner",
"The streaming command to run", "cmd", 1, false);
// reducer could be NONE
Option reducer = createOption("reducer",
"The streaming command to run", "cmd", 1, false);
Option file = createOption("file",
"File to be shipped in the Job jar file",
"file", Integer.MAX_VALUE, false);
Option additionalconfspec = createOption("additionalconfspec",
"Optional.", "spec", 1, false);
Option partitioner = createOption("partitioner",
"Optional.", "spec", 1, false);
Option numReduceTasks = createOption("numReduceTasks",
"Optional.", "spec", 1, false);
Option mapDebug = createOption("mapdebug",
"Optional.", "spec", 1, false);
Option reduceDebug = createOption("reducedebug",
"Optional", "spec", 1, false);
Option jobconf =
createOption("jobconf",
"(n=v) Optional. Add or override a JobConf property.",
"spec", 1, false);
Option cmdenv =
createOption("cmdenv", "(n=v) Pass env.var to streaming commands.",
"spec", 1, false);
Option cacheFile = createOption("cacheFile",
"File name URI", "fileNameURI", Integer.MAX_VALUE, false);
Option cacheArchive = createOption("cacheArchive",
"File name URI", "fileNameURI", Integer.MAX_VALUE, false);
Option io = createOption("io",
"Optional.", "spec", 1, false);
// boolean properties
Option background =
createBoolOption("background", "Submit the job and don't wait till it completes.");
Option verbose = createBoolOption("verbose", "print verbose output");
Option info = createBoolOption("info", "print detailed usage");
Option help = createBoolOption("help", "print this help message");
Option debug = createBoolOption("debug", "print debug output");
Option lazyOutput = createBoolOption("lazyOutput", "create outputs lazily");
allOptions = new Options().
addOption(input).
addOption(columns).
addOption(output).
addOption(mapper).
addOption(combiner).
addOption(reducer).
addOption(file).
addOption(additionalconfspec).
addOption(partitioner).
addOption(numReduceTasks).
addOption(mapDebug).
addOption(reduceDebug).
addOption(jobconf).
addOption(cmdenv).
addOption(cacheFile).
addOption(cacheArchive).
addOption(io).
addOption(background).
addOption(verbose).
addOption(info).
addOption(debug).
addOption(help).
addOption(lazyOutput);
}
public void exitUsage(boolean detailed) {
printUsage(detailed);
fail("");
}
private void printUsage(boolean detailed) {
System.out.println(
"Usage: jar [-classpath ...] [-resources ...] com.aliyun.odps.mapred.bridge.streaming.StreamJob"
+ " [options]");
System.out.println("Options:");
System.out.println(" -input <[/prj/]tbl/[pt=x[/ds=y]]> input table/partition for the Map step.");
System.out.println(" -output <[/prj/]tbl/[pt=x[/ds=y]]> output table/partition for the Reduce step.");
System.out.println(" -mapper <cmd|JavaClassName> Optional. Command to be run as mapper.");
System.out.println(" -combiner <cmd|JavaClassName> Optional. Command to be run as combiner.");
System.out.println(" -reducer <cmd|JavaClassName> Optional. Command to be run as reducer.");
System.out.println(" -file <file/dir> Optional. Local file/dir to be shipped with the streaming job.");
System.out.println(" -partitioner <JavaClassName> Optional. The partitioner class.");
System.out.println(" -numReduceTasks <num> Optional. Number of reduce tasks.");
System.out.println(" -cmdenv <n>=<v> Optional. Pass env.var to streaming commands.");
System.out.println(" -lazyOutput Optional. Lazily create Output.");
System.out.println(" -background Optional. Submit the job and don't wait till it completes.");
System.out.println(" -verbose Optional. Print verbose output.");
System.out.println(" -info Optional. Print detailed usage.");
System.out.println(" -help Optional. Print help message.");
System.out.println();
if (!detailed) {
System.out.println();
System.out.println("For more details about these options use -info option.");
return;
}
System.out.println();
System.out.println("Usage tips:");
System.out.println("To set the number of reduce tasks (num. of output " +
"files) as, say 10:");
System.out.println(" Use -numReduceTasks 10");
System.out.println("To skip the sort/combine/shuffle/sort/reduce step:");
System.out.println(" Use -numReduceTasks 0");
System.out.println(" Map output then becomes a 'side-effect " +
"output' rather than a reduce input.");
System.out.println(" This speeds up processing. This also feels " +
"more like \"in-place\" processing");
System.out.println(" because the input filename and the map " +
"input order are preserved.");
System.out.println(" This is equivalent to -reducer NONE");
System.out.println();
System.out.println("To treat tasks with non-zero exit status as SUCCEDED:");
System.out.println(" -D stream.non.zero.exit.is.failure=false");
System.out.println("To set an environement variable in a streaming " +
"command:");
System.out.println(" -cmdenv EXAMPLE_DIR=/home/example/dictionaries/");
}
public void fail(String message) {
System.err.println(message);
System.err.println("Try -help for more information");
throw new IllegalArgumentException(message);
}
// --------------------------------------------
/**
* Parse table input/output path to TableInfo.
* Supported pattern:
* /prj/tbl
* /prj/tbl/pt=x/ds=y
* tbl
* tbl/pt=x/ds=y
*/
private static TableInfo parseTableInfo(String tableInfoStr) {
String prj = SessionState.get().getOdps().getDefaultProject();
if (prj == null) {
// should not happen
throw new RuntimeException("default project should have been set");
}
String tbl = null;
String part = null;
if (tableInfoStr.startsWith("/")) {
String[] parts = tableInfoStr.substring(1).split("/", 3);
if (parts.length < 2) {
throw new IllegalArgumentException("invalid table info: " + tableInfoStr);
}
prj = parts[0];
tbl = parts[1];
if (parts.length == 3) {
part = parts[2];
}
} else {
String[] parts = tableInfoStr.split("/", 2);
if (parts.length == 0) {
throw new IllegalArgumentException("invalid table info: " + tableInfoStr);
}
tbl = parts[0];
if (parts.length == 2) {
part = parts[1];
}
}
TableInfo.TableInfoBuilder builder = TableInfo.builder();
builder.projectName(prj);
builder.tableName(tbl);
if (part != null) {
builder.partSpec(part);
}
return builder.build();
}
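// e.g. (hypothetical names): parseTableInfo("/my_prj/my_tbl/pt=1") yields project
// "my_prj", table "my_tbl", partition spec "pt=1"; parseTableInfo("my_tbl") falls
// back to the session's default project with no partition spec.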
protected void setJobConf() throws IOException {
// general MapRed job properties
jobConf_ = new JobConf(config_);
// All streaming jobs get the task timeout value
// from the configuration settings.
for (int i = 0; i < inputSpecs_.size(); i++) {
InputUtils.addTable(inputSpecs_.get(i), jobConf_);
}
String defaultPackage = this.getClass().getPackage().getName();
if (ioSpec_ != null) {
jobConf_.set("stream.map.input", ioSpec_);
jobConf_.set("stream.map.output", ioSpec_);
jobConf_.set("stream.reduce.input", ioSpec_);
jobConf_.set("stream.reduce.output", ioSpec_);
}
//Class<? extends IdentifierResolver> idResolverClass =
// jobConf_.getClass("stream.io.identifier.resolver.class",
// IdentifierResolver.class, IdentifierResolver.class);
//IdentifierResolver idResolver = ReflectionUtils.newInstance(idResolverClass, jobConf_);
//idResolver.resolve(jobConf_.get("stream.map.input", IdentifierResolver.TEXT_ID));
//jobConf_.setClass("stream.map.input.writer.class",
// idResolver.getInputWriterClass(), InputWriter.class);
jobConf_.setClass("stream.map.input.writer.class", RecordInputWriter.class, InputWriter.class);
//idResolver.resolve(jobConf_.get("stream.reduce.input", IdentifierResolver.TEXT_ID));
//jobConf_.setClass("stream.reduce.input.writer.class",
// idResolver.getInputWriterClass(), InputWriter.class);
jobConf_.setClass("stream.reduce.input.writer.class", TextInputWriter.class, InputWriter.class);
jobConf_.set("stream.addenvironment", addTaskEnvironment_);
boolean isMapperACommand = false;
Class c = null;
if (mapCmd_ != null) {
c = StreamUtil.goodClassOrNull(jobConf_, mapCmd_, defaultPackage);
if (c != null) {
jobConf_.setMapperClass(c);
} else {
isMapperACommand = true;
jobConf_.setMapperClass(PipeMapper.class);
//jobConf_.setMapRunnerClass(PipeMapRunner.class);
jobConf_.set("stream.map.streamprocessor",
URLEncoder.encode(mapCmd_, "UTF-8"));
}
}
if (comCmd_ != null) {
c = StreamUtil.goodClassOrNull(jobConf_, comCmd_, defaultPackage);
if (c != null) {
jobConf_.setCombinerClass(c);
} else {
jobConf_.setCombinerClass(PipeCombiner.class);
jobConf_.set("stream.combine.streamprocessor", URLEncoder.encode(
comCmd_, "UTF-8"));
}
}
if (numReduceTasksSpec_ != null) {
int numReduceTasks = Integer.parseInt(numReduceTasksSpec_);
jobConf_.setNumReduceTasks(numReduceTasks);
}
boolean isReducerACommand = false;
if (redCmd_ != null) {
if (redCmd_.equals(REDUCE_NONE)) {
jobConf_.setNumReduceTasks(0);
}
if (jobConf_.getNumReduceTasks() != 0) {
if (redCmd_.compareToIgnoreCase("aggregate") == 0) {
//jobConf_.setReducerClass(ValueAggregatorReducer.class);
//jobConf_.setCombinerClass(ValueAggregatorCombiner.class);
// TODO reducer lib
throw new UnsupportedOperationException("'aggregate' reducer not supported yet");
} else {
c = StreamUtil.goodClassOrNull(jobConf_, redCmd_, defaultPackage);
if (c != null) {
jobConf_.setReducerClass(c);
} else {
isReducerACommand = true;
jobConf_.setReducerClass(PipeReducer.class);
jobConf_.set("stream.reduce.streamprocessor", URLEncoder.encode(
redCmd_, "UTF-8"));
}
}
}
}
String mapOutputFieldSeparator =
unescapeSeparator(jobConf_.get("stream.map.output.field.separator", "\t"));
String reduceInputFieldSeparator =
unescapeSeparator(jobConf_.get("stream.reduce.input.field.separator", "\t"));
int numOfMapOutputKeyFields = jobConf_.getInt("stream.num.map.output.key.fields", 1);
if (numOfMapOutputKeyFields > 1 && !mapOutputFieldSeparator.equals(reduceInputFieldSeparator)) {
throw new IllegalArgumentException(
"for multiple-fields key, stream.reduce.input.field.separator should be the same as stream.map.output.field.separator to avoid confusion");
}
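// Illustrative example: with stream.num.map.output.key.fields=2 and
// stream.map.output.key.options=1nr, key field 1 becomes BIGINT sorted DESC
// and key field 2 stays STRING sorted ASC.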
Column[] mapOutputKeySchema = new Column[numOfMapOutputKeyFields];
Map<Integer, KeyDescription> keyOptions =
parseKeyOptions(jobConf_.get("stream.map.output.key.options", ""));
for (int i = 0; i < mapOutputKeySchema.length; i++) {
KeyDescription keyDesc = keyOptions.get(i + 1);
OdpsType t = (keyDesc == null || !keyDesc.numeric) ? OdpsType.STRING : OdpsType.BIGINT;
mapOutputKeySchema[i] = new Column("map_out_key" + i, t);
}
jobConf_.setMapOutputKeySchema(mapOutputKeySchema);
if (!keyOptions.isEmpty()) {
JobConf.SortOrder[] sortOrder = new JobConf.SortOrder[mapOutputKeySchema.length];
for (int i = 0; i < mapOutputKeySchema.length; i++) {
KeyDescription keyDesc = keyOptions.get(i + 1);
sortOrder[i] =
(keyDesc == null || !keyDesc.reverse) ? JobConf.SortOrder.ASC : JobConf.SortOrder.DESC;
}
jobConf_.setOutputKeySortOrder(sortOrder);
}
jobConf_.setMapOutputValueSchema(new Column[]{new Column("map_out_value", OdpsType.STRING)});
// use setPartitionColumns for KeyFieldBasedPartitioner
if (partitionerSpec_ != null) {
if (partitionerSpec_.equals("KeyFieldBasedPartitioner")) {
partitionerSpec_ = "com.aliyun.odps.mapred.lib.KeyFieldBasedPartitioner";
}
if (partitionerSpec_.equals("com.aliyun.odps.mapred.lib.KeyFieldBasedPartitioner")) {
String mapOutputKeyFieldSeparator =
unescapeSeparator(jobConf_.get("map.output.key.field.separator", "\t"));
if (mapOutputFieldSeparator.equals(mapOutputKeyFieldSeparator)) {
int numOfKeyFieldsForPartition = jobConf_.getInt("num.key.fields.for.partition", 1);
if (numOfKeyFieldsForPartition > numOfMapOutputKeyFields) {
throw new IllegalArgumentException(
"num.key.fields.for.partition should not be bigger than stream.num.map.output.key.fields");
}
if (numOfKeyFieldsForPartition < numOfMapOutputKeyFields) {
String[] partitionColumns = new String[numOfKeyFieldsForPartition];
for (int i = 0; i < numOfKeyFieldsForPartition; i++) {
partitionColumns[i] = mapOutputKeySchema[i].getName();
}
jobConf_.setPartitionColumns(partitionColumns);
}
} else {
// need to split the first field for partition, only for compatible with hadoop.
// FIXME this partitioner would be implemented by the StreamingOperator at runtime...
c = StreamUtil.goodClassOrNull(jobConf_, partitionerSpec_, defaultPackage);
if (c != null) {
jobConf_.setPartitionerClass(c);
}
}
} else {
throw new IllegalArgumentException(
"User defined partitioner not supported for streaming job");
}
}
Class<? extends OutputReader> mapOutputReaderClass = TextOutputReader.class;
Class<? extends OutputReader> reduceOutputReaderClass = RecordOutputReader.class;
if (jobConf_.getNumReduceTasks() > 0) {
boolean hasKey = jobConf_.getInt("stream.num.reduce.output.key.fields", 0) > 0;
reduceOutputReaderClass = hasKey ? TextOutputReader.class : RecordOutputReader.class;
} else {
boolean hasKey = jobConf_.getInt("stream.num.map.output.key.fields", 0) > 0;
mapOutputReaderClass = hasKey ? TextOutputReader.class : RecordOutputReader.class;
}
jobConf_.setClass("stream.map.output.reader.class", mapOutputReaderClass, OutputReader.class);
jobConf_
.setClass("stream.reduce.output.reader.class", reduceOutputReaderClass, OutputReader.class);
// XXX no-output allowed
if (output_ != null) {
OutputUtils.addTable(parseTableInfo(output_), jobConf_);
}
//if(mapDebugSpec_ != null){
// jobConf_.setMapDebugScript(mapDebugSpec_);
//}
//if(reduceDebugSpec_ != null){
// jobConf_.setReduceDebugScript(reduceDebugSpec_);
//}
// last, allow user to override anything
// (although typically used with properties we didn't touch)
// FIXME resources linkname
if (verbose_) {
listJobConfProperties();
}
}
/**
* Prints out the jobconf properties on stdout
* when verbose is specified.
*/
protected void listJobConfProperties() {
msg("==== JobConf properties:");
TreeMap<String, String> sorted = new TreeMap<String, String>();
for (final Map.Entry<String, String> en : jobConf_) {
sorted.put(en.getKey(), en.getValue());
}
for (final Map.Entry<String, String> en : sorted.entrySet()) {
msg(en.getKey() + "=" + en.getValue());
}
msg("====");
}
// Based on JobClient
public int submitAndMonitorJob() throws Exception {
running_ = JobClient.submitJob(jobConf_);
LOG.debug("submit job done");
if (background_) {
LOG.info("Job is running in background.");
} else {
running_.waitForCompletion();
if (!running_.isSuccessful()) {
return 1;
}
}
return 0;
}
private static class KeyDescription {
boolean numeric = false;
boolean reverse = false;
}
/**
* Parse key option;
* option is like f[n][r],f[n][r],...
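* e.g. "1nr,3n" marks field 1 as numeric and reversed, field 3 as numeric.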
*/
private Map<Integer, KeyDescription> parseKeyOptions(String options) {
Map<Integer, KeyDescription> keys = new HashMap<Integer, KeyDescription>();
StringTokenizer st = new StringTokenizer(options, "nr,", true);
while (st.hasMoreTokens()) {
String token = st.nextToken();
int fieldId;
try {
fieldId = Integer.parseInt(token);
} catch (NumberFormatException e) {
throw new IllegalArgumentException(
"invalid key options format, expect field number at '" + token + "'");
}
KeyDescription keyDesc = new KeyDescription();
while (st.hasMoreTokens()) {
token = st.nextToken();
if (token.equals(",")) {
break;
} else if (token.equals("n")) {
keyDesc.numeric = true;
} else if (token.equals("r")) {
keyDesc.reverse = true;
} else {
throw new IllegalArgumentException(
"invalid key options format, unknown option '" + token + "'");
}
}
keys.put(fieldId, keyDesc);
}
return keys;
}
protected String[] argv_;
protected boolean background_;
protected boolean verbose_;
protected boolean detailedUsage_;
protected boolean printUsage = false;
protected int debug_;
protected Environment env_;
protected JobConf config_;
protected JobConf jobConf_;
// command-line arguments
protected ArrayList<TableInfo> inputSpecs_ = new ArrayList<TableInfo>();
protected ArrayList<String> packageFiles_ = new ArrayList<String>();
protected ArrayList<String> shippedCanonFiles_ = new ArrayList<String>();
//protected TreeMap userJobConfProps_ = new TreeMap();
protected String output_;
protected String mapCmd_;
protected String comCmd_;
protected String redCmd_;
protected String partitionerSpec_;
protected String numReduceTasksSpec_;
protected String additionalConfSpec_;
protected String mapDebugSpec_;
protected String reduceDebugSpec_;
protected String ioSpec_;
protected boolean lazyOutput_;
// Used to communicate config to the external processes (ex env.var.HADOOP_USER)
// encoding "a=b c=d"
protected String addTaskEnvironment_;
protected RunningJob running_;
protected static final String LINK_URI =
"You need to specify the uris as scheme://path#linkname, " +
"please specify a different link name for all of your caching URIs";
public static void main(String[] args) {
StreamJob job = new StreamJob();
try {
System.exit(job.run(args));
} catch (Exception ex) {
ex.printStackTrace();
System.exit(1);
}
}
}