/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.streaming;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.LazyOutputFormat;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorCombiner;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorReducer;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.streaming.io.IdentifierResolver;
import org.apache.hadoop.streaming.io.InputWriter;
import org.apache.hadoop.streaming.io.OutputReader;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.RunJar;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;

import static org.apache.hadoop.util.RunJar.MATCH_ANY;

/** All the client-side work happens here.
 * (Jar packaging, MapRed job submission and monitoring)
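 *
 * <p>A minimal invocation sketch; the input/output paths and commands below
 * are illustrative placeholders, not defaults of this class:
 * <pre>{@code
 * int exitCode = ToolRunner.run(new Configuration(), new StreamJob(),
 *     new String[] { "-input", "/data/in", "-output", "/data/out",
 *                    "-mapper", "/bin/cat", "-reducer", "/usr/bin/wc" });
 * }</pre>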
 */
public class StreamJob implements Tool {

  protected static final Logger LOG = LoggerFactory.getLogger(StreamJob.class.getName());
  final static String REDUCE_NONE = "NONE";

  /** -----------Streaming CLI Implementation  **/
  private CommandLineParser parser = new BasicParser();
  private Options allOptions;
  /**@deprecated use StreamJob() with ToolRunner or set the
   * Configuration using {@link #setConf(Configuration)} and
   * run with {@link #run(String[])}.
   */
  @Deprecated
  public StreamJob(String[] argv, boolean mayExit) {
    this();
    argv_ = Arrays.copyOf(argv, argv.length);
    this.config_ = new Configuration();
  }

  public StreamJob() {
    setupOptions();
    this.config_ = new Configuration();
  }

  @Override
  public Configuration getConf() {
    return config_;
  }

  @Override
  public void setConf(Configuration conf) {
    this.config_ = conf;
  }

  @Override
  public int run(String[] args) throws Exception {
    try {
      this.argv_ = Arrays.copyOf(args, args.length);
      init();

      preProcessArgs();
      parseArgv();
      if (printUsage) {
        printUsage(detailedUsage_);
        return 0;
      }
      postProcessArgs();

      setJobConf();
    } catch (IllegalArgumentException ex) {
      // The error message was already printed by fail();
      // only log the full exception at debug level.
      LOG.debug("Error in streaming job", ex);
      return 1;
    }
    return submitAndMonitorJob();
  }

  /**
   * This method creates a streaming job from the given argument list.
   * The created object can be used and/or submitted to a jobtracker for
   * execution by a job agent such as JobControl
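   *
   * <p>For example (a sketch; the paths and mapper below are illustrative
   * placeholders only):
   * <pre>{@code
   * JobConf conf = StreamJob.createJob(new String[] {
   *     "-input", "/data/in", "-output", "/data/out", "-mapper", "/bin/cat" });
   * JobClient.runJob(conf);
   * }</pre>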
   * @param argv the command-line arguments for creating a streaming job
   * @return the created JobConf object
   * @throws IOException
   */
  static public JobConf createJob(String[] argv) throws IOException {
    StreamJob job = new StreamJob();
    job.argv_ = argv;
    job.init();
    job.preProcessArgs();
    job.parseArgv();
    job.postProcessArgs();
    job.setJobConf();
    return job.jobConf_;
  }

  /**
   * This is the method that actually
   * initializes the job conf and submits the job
   * to the jobtracker
   * @throws IOException
   * @deprecated use {@link #run(String[])} instead.
   */
  @Deprecated
  public int go() throws IOException {
    try {
      return run(argv_);
    }
    catch (Exception ex) {
      throw new IOException(ex.getMessage());
    }
  }

  protected void init() {
    try {
      env_ = new Environment();
    } catch (IOException io) {
      throw new RuntimeException(io);
    }
  }

  void preProcessArgs() {
    verbose_ = false;
    // Unset HADOOP_ROOT_LOGGER in case streaming job
    // invokes additional hadoop commands.
    addTaskEnvironment_ = "HADOOP_ROOT_LOGGER=";
  }

  void postProcessArgs() throws IOException {

    if (inputSpecs_.size() == 0) {
      fail("Required argument: -input ");
    }
    if (output_ == null) {
      fail("Required argument: -output ");
    }
    msg("addTaskEnvironment=" + addTaskEnvironment_);

    for (final String packageFile : packageFiles_) {
      File f = new File(packageFile);
      if (f.isFile()) {
        shippedCanonFiles_.add(f.getCanonicalPath());
      }
    }
    msg("shippedCanonFiles_=" + shippedCanonFiles_);

    // careful with class names..
    mapCmd_ = unqualifyIfLocalPath(mapCmd_);
    comCmd_ = unqualifyIfLocalPath(comCmd_);
    redCmd_ = unqualifyIfLocalPath(redCmd_);
  }

  String unqualifyIfLocalPath(String cmd) throws IOException {
    if (cmd == null) {
      //
    } else {
      String prog = cmd;
      String args = "";
      int s = cmd.indexOf(" ");
      if (s != -1) {
        prog = cmd.substring(0, s);
        args = cmd.substring(s + 1);
      }
      String progCanon;
      try {
        progCanon = new File(prog).getCanonicalPath();
      } catch (IOException io) {
        progCanon = prog;
      }
      boolean shipped = shippedCanonFiles_.contains(progCanon);
      msg("shipped: " + shipped + " " + progCanon);
      if (shipped) {
        // Change path to simple filename.
        // That way when PipeMapRed calls Runtime.exec(),
        // it will look for the executable in Task's working dir.
        // And this is where TaskRunner unjars our job jar.
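        // e.g. (illustrative) "/home/me/filter.pl arg1" becomes "filter.pl arg1"
        // when filter.pl was shipped via -file.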
        prog = new File(prog).getName();
        if (args.length() > 0) {
          cmd = prog + " " + args;
        } else {
          cmd = prog;
        }
      }
    }
    msg("cmd=" + cmd);
    return cmd;
  }

  void parseArgv() {
    CommandLine cmdLine = null;
    try {
      cmdLine = parser.parse(allOptions, argv_);
    } catch(Exception oe) {
      LOG.error(oe.getMessage());
      exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
    }

    if (cmdLine != null) {
      @SuppressWarnings("unchecked")
      List<String> args = cmdLine.getArgList();
      if(args != null && args.size() > 0) {
        fail("Found " + args.size() + " unexpected arguments on the " +
            "command line " + args);
      }
      
      detailedUsage_ = cmdLine.hasOption("info");
      if (cmdLine.hasOption("help") || detailedUsage_) {
        printUsage = true;
        return;
      }
      verbose_ =  cmdLine.hasOption("verbose");
      background_ =  cmdLine.hasOption("background");
      debug_ = cmdLine.hasOption("debug")? debug_ + 1 : debug_;

      String[] values = cmdLine.getOptionValues("input");
      if (values != null && values.length > 0) {
        for (String input : values) {
          inputSpecs_.add(input);
        }
      }
      output_ =  cmdLine.getOptionValue("output");

      mapCmd_ = cmdLine.getOptionValue("mapper");
      comCmd_ = cmdLine.getOptionValue("combiner");
      redCmd_ = cmdLine.getOptionValue("reducer");

      lazyOutput_ = cmdLine.hasOption("lazyOutput");

      values = cmdLine.getOptionValues("file");
      if (values != null && values.length > 0) {
        LOG.warn("-file option is deprecated, please use generic option" +
        		" -files instead.");

        StringBuffer fileList = new StringBuffer();
        for (String file : values) {
          packageFiles_.add(file);
          try {
            Path path = new Path(file);
            FileSystem localFs = FileSystem.getLocal(config_);
            Path qualifiedPath = path.makeQualified(
                localFs.getUri(), localFs.getWorkingDirectory());
            validate(qualifiedPath);
            String finalPath = qualifiedPath.toString();
            if(fileList.length() > 0) {
              fileList.append(',');
            }
            fileList.append(finalPath);
          } catch (Exception e) {
            throw new IllegalArgumentException(e);
          }
        }
        String tmpFiles = config_.get("tmpfiles", "");
        if (tmpFiles.isEmpty()) {
          tmpFiles = fileList.toString();
        } else {
          tmpFiles = tmpFiles + "," + fileList;
        }
        config_.set("tmpfiles", tmpFiles);
      }

      String fsName = cmdLine.getOptionValue("dfs");
      if (null != fsName){
        LOG.warn("-dfs option is deprecated, please use -fs instead.");
        config_.set("fs.default.name", fsName);
      }

      additionalConfSpec_ = cmdLine.getOptionValue("additionalconfspec");
      inputFormatSpec_ = cmdLine.getOptionValue("inputformat");
      outputFormatSpec_ = cmdLine.getOptionValue("outputformat");
      numReduceTasksSpec_ = cmdLine.getOptionValue("numReduceTasks");
      partitionerSpec_ = cmdLine.getOptionValue("partitioner");
      inReaderSpec_ = cmdLine.getOptionValue("inputreader");
      mapDebugSpec_ = cmdLine.getOptionValue("mapdebug");
      reduceDebugSpec_ = cmdLine.getOptionValue("reducedebug");
      ioSpec_ = cmdLine.getOptionValue("io");

      String[] car = cmdLine.getOptionValues("cacheArchive");
      if (null != car && car.length > 0){
        LOG.warn("-cacheArchive option is deprecated, please use -archives instead.");
        for(String s : car){
          cacheArchives = (cacheArchives == null)?s :cacheArchives + "," + s;
        }
      }

      String[] caf = cmdLine.getOptionValues("cacheFile");
      if (null != caf && caf.length > 0){
        LOG.warn("-cacheFile option is deprecated, please use -files instead.");
        for(String s : caf){
          cacheFiles = (cacheFiles == null)?s :cacheFiles + "," + s;
        }
      }

      String[] jobconf = cmdLine.getOptionValues("jobconf");
      if (null != jobconf && jobconf.length > 0){
        LOG.warn("-jobconf option is deprecated, please use -D instead.");
        for(String s : jobconf){
          String[] parts = s.split("=", 2);
          config_.set(parts[0], parts[1]);
        }
      }

      String[] cmd = cmdLine.getOptionValues("cmdenv");
      if (null != cmd && cmd.length > 0){
        for(String s : cmd) {
          if (addTaskEnvironment_.length() > 0) {
            addTaskEnvironment_ += " ";
          }
          addTaskEnvironment_ += s;
        }
      }
    } else {
      exitUsage(argv_.length > 0 && "-info".equals(argv_[0]));
    }
  }

  protected void msg(String msg) {
    if (verbose_) {
      System.out.println("STREAM: " + msg);
    }
  }

  private Option createOption(String name, String desc,
                              String argName, int max, boolean required){
    return OptionBuilder
           .withArgName(argName)
           .hasArgs(max)
           .withDescription(desc)
           .isRequired(required)
           .create(name);
  }

  private Option createBoolOption(String name, String desc){
    return OptionBuilder.withDescription(desc).create(name);
  }

  private void validate(final Path path) throws IOException {
    try {
      path.getFileSystem(config_).access(path, FsAction.READ);
    } catch (FileNotFoundException e) {
      fail("File: " + path + " does not exist.");
    } catch (AccessControlException e) {
      fail("File: " + path + " is not readable.");
    }
  }

  private void setupOptions(){

    // input and output are not required for -info and -help options,
    // though they are required for streaming job to be run.
    Option input   = createOption("input",
                                  "DFS input file(s) for the Map step",
                                  "path",
                                  Integer.MAX_VALUE,
                                  false);

    Option output  = createOption("output",
                                  "DFS output directory for the Reduce step",
                                  "path", 1, false);
    Option mapper  = createOption("mapper",
                                  "The streaming command to run", "cmd", 1, false);
    Option combiner = createOption("combiner",
                                   "The streaming command to run", "cmd", 1, false);
    // reducer could be NONE
    Option reducer = createOption("reducer",
                                  "The streaming command to run", "cmd", 1, false);
    Option file = createOption("file",
                               "File to be shipped in the Job jar file",
                               "file", Integer.MAX_VALUE, false);
    Option dfs = createOption("dfs",
                              "Optional. Override DFS configuration", "|local", 1, false);
    Option additionalconfspec = createOption("additionalconfspec",
                                             "Optional.", "spec", 1, false);
    Option inputformat = createOption("inputformat",
                                      "Optional.", "spec", 1, false);
    Option outputformat = createOption("outputformat",
                                       "Optional.", "spec", 1, false);
    Option partitioner = createOption("partitioner",
                                      "Optional.", "spec", 1, false);
    Option numReduceTasks = createOption("numReduceTasks",
        "Optional.", "spec",1, false );
    Option inputreader = createOption("inputreader",
                                      "Optional.", "spec", 1, false);
    Option mapDebug = createOption("mapdebug",
                                   "Optional.", "spec", 1, false);
    Option reduceDebug = createOption("reducedebug",
                                      "Optional", "spec",1, false);
    Option jobconf =
      createOption("jobconf",
                   "(n=v) Optional. Add or override a JobConf property.",
                   "spec", 1, false);

    Option cmdenv =
      createOption("cmdenv", "(n=v) Pass env.var to streaming commands.",
                   "spec", 1, false);
    Option cacheFile = createOption("cacheFile",
                                    "File name URI", "fileNameURI", Integer.MAX_VALUE, false);
    Option cacheArchive = createOption("cacheArchive",
                                       "File name URI", "fileNameURI", Integer.MAX_VALUE, false);
    Option io = createOption("io",
                             "Optional.", "spec", 1, false);

    // boolean properties

    Option background = createBoolOption("background", "Submit the job and don't wait till it completes.");
    Option verbose = createBoolOption("verbose", "print verbose output");
    Option info = createBoolOption("info", "print detailed usage");
    Option help = createBoolOption("help", "print this help message");
    Option debug = createBoolOption("debug", "print debug output");
    Option lazyOutput = createBoolOption("lazyOutput", "create outputs lazily");

    allOptions = new Options().
      addOption(input).
      addOption(output).
      addOption(mapper).
      addOption(combiner).
      addOption(reducer).
      addOption(file).
      addOption(dfs).
      addOption(additionalconfspec).
      addOption(inputformat).
      addOption(outputformat).
      addOption(partitioner).
      addOption(numReduceTasks).
      addOption(inputreader).
      addOption(mapDebug).
      addOption(reduceDebug).
      addOption(jobconf).
      addOption(cmdenv).
      addOption(cacheFile).
      addOption(cacheArchive).
      addOption(io).
      addOption(background).
      addOption(verbose).
      addOption(info).
      addOption(debug).
      addOption(help).
      addOption(lazyOutput);
  }

  public void exitUsage(boolean detailed) {
    printUsage(detailed);
    fail("");
  }

  private void printUsage(boolean detailed) {
    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
        + " [options]");
    System.out.println("Options:");
    System.out.println("  -input           DFS input file(s) for the Map"
        + " step.");
    System.out.println("  -output          DFS output directory for the"
        + " Reduce step.");
    System.out.println("  -mapper          Optional. Command"
        + " to be run as mapper.");
    System.out.println("  -combiner        Optional. Command"
        + " to be run as combiner.");
    System.out.println("  -reducer         Optional. Command"
        + " to be run as reducer.");
    System.out.println("  -file            Optional. File/dir to be "
        + "shipped in the Job jar file.\n" +
        "                  Deprecated. Use generic option \"-files\" instead.");
    System.out.println("  -inputformat    \n"
        + "                  Optional. The input format class.");
    System.out.println("  -outputformat   \n"
        + "                  Optional. The output format class.");
    System.out.println("  -partitioner      Optional. The"
        + " partitioner class.");
    System.out.println("  -numReduceTasks  Optional. Number of reduce "
        + "tasks.");
    System.out.println("  -inputreader     Optional. Input recordreader"
        + " spec.");
    System.out.println("  -cmdenv         = Optional. Pass env.var to"
        + " streaming commands.");
    System.out.println("  -mapdebug        Optional. "
        + "To run this script when a map task fails.");
    System.out.println("  -reducedebug     Optional."
        + " To run this script when a reduce task fails.");
    System.out.println("  -io              Optional. Format to use"
        + " for input to and output");
    System.out.println("                  from mapper/reducer commands");
    System.out.println("  -lazyOutput     Optional. Lazily create Output.");
    System.out.println("  -background     Optional. Submit the job and don't wait till it completes.");
    System.out.println("  -verbose        Optional. Print verbose output.");
    System.out.println("  -info           Optional. Print detailed usage.");
    System.out.println("  -help           Optional. Print help message.");
    System.out.println();
    GenericOptionsParser.printGenericCommandUsage(System.out);

    if (!detailed) {
      System.out.println();
      System.out.println("For more details about these options:");
      System.out.println("Use " +
          "$HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar -info");
      return;
    }
    System.out.println();
    System.out.println("Usage tips:");
    System.out.println("In -input: globbing on  is supported and can "
        + "have multiple -input");
    System.out.println();
    System.out.println("Default Map input format: a line is a record in UTF-8 "
        + "the key part ends at first");
    System.out.println("  TAB, the rest of the line is the value");
    System.out.println();
    System.out.println("To pass a Custom input format:");
    System.out.println("  -inputformat package.MyInputFormat");
    System.out.println();
    System.out.println("Similarly, to pass a custom output format:");
    System.out.println("  -outputformat package.MyOutputFormat");
    System.out.println();
    System.out.println("The files with extensions .class and .jar/.zip," +
        " specified for the -file");
    System.out.println("  argument[s], end up in \"classes\" and \"lib\" " +
        "directories respectively inside");
    System.out.println("  the working directory when the mapper and reducer are"
        + " run. All other files");
    System.out.println("  specified for the -file argument[s]" +
        " end up in the working directory when the");
    System.out.println("  mapper and reducer are run. The location of this " +
        "working directory is");
    System.out.println("  unspecified.");
    System.out.println();
    System.out.println("To set the number of reduce tasks (num. of output " +
        "files) as, say 10:");
    System.out.println("  Use -numReduceTasks 10");
    System.out.println("To skip the sort/combine/shuffle/sort/reduce step:");
    System.out.println("  Use -numReduceTasks 0");
    System.out.println("  Map output then becomes a 'side-effect " +
        "output' rather than a reduce input.");
    System.out.println("  This speeds up processing. This also feels " +
        "more like \"in-place\" processing");
    System.out.println("  because the input filename and the map " +
        "input order are preserved.");
    System.out.println("  This is equivalent to -reducer NONE");
    System.out.println();
    System.out.println("To speed up the last maps:");
    System.out.println("  -D " + MRJobConfig.MAP_SPECULATIVE + "=true");
    System.out.println("To speed up the last reduces:");
    System.out.println("  -D " + MRJobConfig.REDUCE_SPECULATIVE + "=true");
    System.out.println("To name the job (appears in the JobTracker Web UI):");
    System.out.println("  -D " + MRJobConfig.JOB_NAME + "='My Job'");
    System.out.println("To change the local temp directory:");
    System.out.println("  -D dfs.data.dir=/tmp/dfs");
    System.out.println("  -D stream.tmpdir=/tmp/streaming");
    System.out.println("Additional local temp directories with -jt local:");
    System.out.println("  -D " + MRConfig.LOCAL_DIR + "=/tmp/local");
    System.out.println("  -D " + JTConfig.JT_SYSTEM_DIR + "=/tmp/system");
    System.out.println("  -D " + MRConfig.TEMP_DIR + "=/tmp/temp");
    System.out.println("To treat tasks with non-zero exit status as SUCCEDED:");
    System.out.println("  -D stream.non.zero.exit.is.failure=false");
    System.out.println("Use a custom hadoop streaming build along with standard"
        + " hadoop install:");
    System.out.println("  $HADOOP_HOME/bin/hadoop jar " +
        "/path/my-hadoop-streaming.jar [...]\\");
    System.out.println("    [...] -D stream.shipped.hadoopstreaming=" +
        "/path/my-hadoop-streaming.jar");
    System.out.println("For more details about jobconf parameters see:");
    System.out.println("  http://wiki.apache.org/hadoop/JobConfFile");
    System.out.println("Truncate the values of the job configuration copied" +
        "to the environment at the given length:");
    System.out.println("   -D stream.jobconf.truncate.limit=-1");
    System.out.println("To set an environment variable in a streaming " +
        "command:");
    System.out.println("   -cmdenv EXAMPLE_DIR=/home/example/dictionaries/");
    System.out.println();
    System.out.println("Shortcut:");
    System.out.println("   setenv HSTREAMING \"$HADOOP_HOME/bin/hadoop jar " +
        "hadoop-streaming.jar\"");
    System.out.println();
    System.out.println("Example: $HSTREAMING -mapper " +
        "\"/usr/local/bin/perl5 filter.pl\"");
    System.out.println("           -file /local/filter.pl -input " +
        "\"/logs/0604*/*\" [...]");
    System.out.println("  Ships a script, invokes the non-shipped perl " +
        "interpreter. Shipped files go to");
    System.out.println("  the working directory so filter.pl is found by perl. "
        + "Input files are all the");
    System.out.println("  daily logs for days in month 2006-04");
  }

  public void fail(String message) {
    System.err.println(message);
    System.err.println("Try -help for more information");
    throw new IllegalArgumentException(message);
  }

  // --------------------------------------------

  protected String getHadoopClientHome() {
    String h = env_.getProperty("HADOOP_HOME"); // standard Hadoop
    if (h == null) {
      //fail("Missing required environment variable: HADOOP_HOME");
      h = "UNDEF";
    }
    return h;
  }

  protected boolean isLocalHadoop() {
    return StreamUtil.isLocalJobTracker(jobConf_);
  }

  @Deprecated
  protected String getClusterNick() {
    return "default";
  }

  /** @return path to the created Jar file or null if no files are necessary.
   */
  protected String packageJobJar() throws IOException {
    ArrayList<String> unjarFiles = new ArrayList<String>();

    // Runtime code: ship same version of code as self (job submitter code)
    // usually found in: build/contrib or build/hadoop-<version>-dev-streaming.jar

    // First try an explicit spec: it's too hard to find our own location in this case:
    // $HADOOP_HOME/bin/hadoop jar /not/first/on/classpath/custom-hadoop-streaming.jar
    // where findInClasspath() would find the version of hadoop-streaming.jar in $HADOOP_HOME
    String runtimeClasses = config_.get("stream.shipped.hadoopstreaming"); // jar or class dir

    if (runtimeClasses == null) {
      runtimeClasses = StreamUtil.findInClasspath(StreamJob.class.getName());
    }
    if (runtimeClasses == null) {
      throw new IOException("runtime classes not found: " + getClass().getPackage());
    } else {
      msg("Found runtime classes in: " + runtimeClasses);
    }
    if (isLocalHadoop()) {
      // don't package class files (they might get unpackaged in "." and then
      //  hide the intended CLASSPATH entry)
      // we still package everything else (so that scripts and executable are found in
      //  Task workdir like distributed Hadoop)
    } else {
      if (new File(runtimeClasses).isDirectory()) {
        packageFiles_.add(runtimeClasses);
      } else {
        unjarFiles.add(runtimeClasses);
      }
    }
    if (packageFiles_.size() + unjarFiles.size() == 0) {
      return null;
    }
    String tmp = jobConf_.get("stream.tmpdir"); //, "/tmp/${mapreduce.job.user.name}/"
    File tmpDir = (tmp == null) ? null : new File(tmp);
    // tmpDir=null means OS default tmp dir
    File jobJar = File.createTempFile("streamjob", ".jar", tmpDir);
    System.out.println("packageJobJar: " + packageFiles_ + " " + unjarFiles + " " + jobJar
                       + " tmpDir=" + tmpDir);
    if (debug_ == 0) {
      jobJar.deleteOnExit();
    }
    JarBuilder builder = new JarBuilder();
    if (verbose_) {
      builder.setVerbose(true);
    }
    String jobJarName = jobJar.getAbsolutePath();
    builder.merge(packageFiles_, unjarFiles, jobJarName);
    return jobJarName;
  }

  /**
   * get the uris of all the files/caches
   */
  protected void getURIs(String lcacheArchives, String lcacheFiles) {
    String archives[] = StringUtils.getStrings(lcacheArchives);
    String files[] = StringUtils.getStrings(lcacheFiles);
    fileURIs = StringUtils.stringToURI(files);
    archiveURIs = StringUtils.stringToURI(archives);
  }

  protected void setJobConf() throws IOException {
    if (additionalConfSpec_ != null) {
      LOG.warn("-additionalconfspec option is deprecated, please use -conf instead.");
      config_.addResource(new Path(additionalConfSpec_));
    }

    // general MapRed job properties
    jobConf_ = new JobConf(config_, StreamJob.class);

    // All streaming jobs get the task timeout value
    // from the configuration settings.

    // The correct FS must be set before this is called!
    // (to resolve local vs. dfs drive letter differences)
    // (mapreduce.job.working.dir will be lazily initialized ONCE and depends on FS)
    for (int i = 0; i < inputSpecs_.size(); i++) {
      FileInputFormat.addInputPaths(jobConf_,
                        (String) inputSpecs_.get(i));
    }

    String defaultPackage = this.getClass().getPackage().getName();
    Class c;
    Class fmt = null;
    if (inReaderSpec_ == null && inputFormatSpec_ == null) {
      fmt = TextInputFormat.class;
    } else if (inputFormatSpec_ != null) {
      if (inputFormatSpec_.equals(TextInputFormat.class.getName())
          || inputFormatSpec_.equals(TextInputFormat.class.getCanonicalName())
          || inputFormatSpec_.equals(TextInputFormat.class.getSimpleName())) {
        fmt = TextInputFormat.class;
      } else if (inputFormatSpec_.equals(KeyValueTextInputFormat.class
          .getName())
          || inputFormatSpec_.equals(KeyValueTextInputFormat.class
              .getCanonicalName())
          || inputFormatSpec_.equals(KeyValueTextInputFormat.class.getSimpleName())) {
        if (inReaderSpec_ == null) {
          fmt = KeyValueTextInputFormat.class;
        }
      } else if (inputFormatSpec_.equals(SequenceFileInputFormat.class
          .getName())
          || inputFormatSpec_
              .equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class
                  .getCanonicalName())
          || inputFormatSpec_
              .equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getSimpleName())) {
        if (inReaderSpec_ == null) {
          fmt = SequenceFileInputFormat.class;
        }
      } else if (inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class
          .getName())
          || inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class
              .getCanonicalName())
          || inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getSimpleName())) {
        fmt = SequenceFileAsTextInputFormat.class;
      } else {
        c = StreamUtil.goodClassOrNull(jobConf_, inputFormatSpec_, defaultPackage);
        if (c != null) {
          fmt = c;
        } else {
          fail("-inputformat : class not found : " + inputFormatSpec_);
        }
      }
    }
    if (fmt == null) {
      fmt = StreamInputFormat.class;
    }

    jobConf_.setInputFormat(fmt);

    if (ioSpec_ != null) {
      jobConf_.set("stream.map.input", ioSpec_);
      jobConf_.set("stream.map.output", ioSpec_);
      jobConf_.set("stream.reduce.input", ioSpec_);
      jobConf_.set("stream.reduce.output", ioSpec_);
    }

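    // Illustrative note: with "-io typedbytes" the resolver chosen below picks
    // the typed-bytes InputWriter/OutputReader and key/value classes; plain
    // text is the default identifier.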
    Class<? extends IdentifierResolver> idResolverClass =
      jobConf_.getClass("stream.io.identifier.resolver.class",
        IdentifierResolver.class, IdentifierResolver.class);
    IdentifierResolver idResolver = ReflectionUtils.newInstance(idResolverClass, jobConf_);

    idResolver.resolve(jobConf_.get("stream.map.input", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.map.input.writer.class",
      idResolver.getInputWriterClass(), InputWriter.class);

    idResolver.resolve(jobConf_.get("stream.reduce.input", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.reduce.input.writer.class",
      idResolver.getInputWriterClass(), InputWriter.class);

    jobConf_.set("stream.addenvironment", addTaskEnvironment_);

    boolean isMapperACommand = false;
    if (mapCmd_ != null) {
      c = StreamUtil.goodClassOrNull(jobConf_, mapCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setMapperClass(c);
      } else {
        isMapperACommand = true;
        jobConf_.setMapperClass(PipeMapper.class);
        jobConf_.setMapRunnerClass(PipeMapRunner.class);
        jobConf_.set("stream.map.streamprocessor",
                     URLEncoder.encode(mapCmd_, "UTF-8"));
      }
    }

    if (comCmd_ != null) {
      c = StreamUtil.goodClassOrNull(jobConf_, comCmd_, defaultPackage);
      if (c != null) {
        jobConf_.setCombinerClass(c);
      } else {
        jobConf_.setCombinerClass(PipeCombiner.class);
        jobConf_.set("stream.combine.streamprocessor", URLEncoder.encode(
                comCmd_, "UTF-8"));
      }
    }

    if (numReduceTasksSpec_!= null) {
      int numReduceTasks = Integer.parseInt(numReduceTasksSpec_);
      jobConf_.setNumReduceTasks(numReduceTasks);
    }

    boolean isReducerACommand = false;
    if (redCmd_ != null) {
      if (redCmd_.equals(REDUCE_NONE)) {
        jobConf_.setNumReduceTasks(0);
      }
      if (jobConf_.getNumReduceTasks() != 0) {
        if (redCmd_.compareToIgnoreCase("aggregate") == 0) {
          jobConf_.setReducerClass(ValueAggregatorReducer.class);
          jobConf_.setCombinerClass(ValueAggregatorCombiner.class);
        } else {

          c = StreamUtil.goodClassOrNull(jobConf_, redCmd_, defaultPackage);
          if (c != null) {
            jobConf_.setReducerClass(c);
          } else {
            isReducerACommand = true;
            jobConf_.setReducerClass(PipeReducer.class);
            jobConf_.set("stream.reduce.streamprocessor", URLEncoder.encode(
                redCmd_, "UTF-8"));
          }
        }
      }
    }

    idResolver.resolve(jobConf_.get("stream.map.output",
        IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.map.output.reader.class",
      idResolver.getOutputReaderClass(), OutputReader.class);
    if (isMapperACommand || jobConf_.get("stream.map.output") != null) {
      // if mapper is a command, then map output key/value classes come from the
      // idResolver
      jobConf_.setMapOutputKeyClass(idResolver.getOutputKeyClass());
      jobConf_.setMapOutputValueClass(idResolver.getOutputValueClass());

      if (jobConf_.getNumReduceTasks() == 0) {
        jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
        jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
      }
    }

    idResolver.resolve(jobConf_.get("stream.reduce.output",
        IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.reduce.output.reader.class",
      idResolver.getOutputReaderClass(), OutputReader.class);
    if (isReducerACommand || jobConf_.get("stream.reduce.output") != null) {
      // if reducer is a command, then output key/value classes come from the
      // idResolver
      jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
      jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
    }

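    // -inputreader spec format (an illustrative sketch): "ReaderClass,n1=v1,...";
    // e.g. "StreamXmlRecordReader,begin=<page>,end=</page>". Each n=v pair is
    // copied below into a stream.recordreader.* property.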
    if (inReaderSpec_ != null) {
      String[] args = inReaderSpec_.split(",");
      String readerClass = args[0];
      // this argument can only be a Java class
      c = StreamUtil.goodClassOrNull(jobConf_, readerClass, defaultPackage);
      if (c != null) {
        jobConf_.set("stream.recordreader.class", c.getName());
      } else {
        fail("-inputreader: class not found: " + readerClass);
      }
      for (int i = 1; i < args.length; i++) {
        String[] nv = args[i].split("=", 2);
        String k = "stream.recordreader." + nv[0];
        String v = (nv.length > 1) ? nv[1] : "";
        jobConf_.set(k, v);
      }
    }

    FileOutputFormat.setOutputPath(jobConf_, new Path(output_));
    fmt = null;
    if (outputFormatSpec_!= null) {
      c = StreamUtil.goodClassOrNull(jobConf_, outputFormatSpec_, defaultPackage);
      if (c != null) {
        fmt = c;
      } else {
        fail("-outputformat : class not found : " + outputFormatSpec_);
      }
    }
    if (fmt == null) {
      fmt = TextOutputFormat.class;
    }
    if (lazyOutput_) {
      LazyOutputFormat.setOutputFormatClass(jobConf_, fmt);
    } else {
      jobConf_.setOutputFormat(fmt);
    }

    if (partitionerSpec_!= null) {
      c = StreamUtil.goodClassOrNull(jobConf_, partitionerSpec_, defaultPackage);
      if (c != null) {
        jobConf_.setPartitionerClass(c);
      } else {
        fail("-partitioner : class not found : " + partitionerSpec_);
      }
    }

    if(mapDebugSpec_ != null){
    	jobConf_.setMapDebugScript(mapDebugSpec_);
    }
    if(reduceDebugSpec_ != null){
    	jobConf_.setReduceDebugScript(reduceDebugSpec_);
    }
    // last, allow user to override anything
    // (although typically used with properties we didn't touch)

    jar_ = packageJobJar();
    if (jar_ != null) {
      jobConf_.setJar(jar_);
    }

    if ((cacheArchives != null) || (cacheFiles != null)){
      getURIs(cacheArchives, cacheFiles);
      boolean b = DistributedCache.checkURIs(fileURIs, archiveURIs);
      if (!b)
        fail(LINK_URI);
    }
    // set the jobconf for the caching parameters
    if (cacheArchives != null)
      DistributedCache.setCacheArchives(archiveURIs, jobConf_);
    if (cacheFiles != null)
      DistributedCache.setCacheFiles(fileURIs, jobConf_);

    if (verbose_) {
      listJobConfProperties();
    }

    msg("submitting to jobconf: " + getJobTrackerHostPort());
  }

  /**
   * Prints out the jobconf properties on stdout
   * when verbose is specified.
   */
  protected void listJobConfProperties()
  {
    msg("==== JobConf properties:");
    TreeMap<String, String> sorted = new TreeMap<String, String>();
    for (final Map.Entry<String, String> en : jobConf_)  {
      sorted.put(en.getKey(), en.getValue());
    }
    for (final Map.Entry<String, String> en: sorted.entrySet()) {
      msg(en.getKey() + "=" + en.getValue());
    }
    msg("====");
  }

  protected String getJobTrackerHostPort() {
    return jobConf_.get(JTConfig.JT_IPC_ADDRESS);
  }

  // Based on JobClient
  public int submitAndMonitorJob() throws IOException {

    if (jar_ != null && isLocalHadoop()) {
      // getAbs became required when shell and subvm have different working dirs...
      File wd = new File(".").getAbsoluteFile();
      RunJar.unJar(new File(jar_), wd, MATCH_ANY);
    }

    // if jobConf_ changes, we must recreate a JobClient
    jc_ = new JobClient(jobConf_);
    running_ = null;
    try {
      running_ = jc_.submitJob(jobConf_);
      jobId_ = running_.getID();
      if (background_) {
        LOG.info("Job is running in background.");
      } else if (!jc_.monitorAndPrintJob(jobConf_, running_)) {
        LOG.error("Job not successful!");
        return 1;
      }
      LOG.info("Output directory: " + output_);
    } catch(FileNotFoundException fe) {
      LOG.error("Error launching job , bad input path : " + fe.getMessage());
      return 2;
    } catch(InvalidJobConfException je) {
      LOG.error("Error launching job , Invalid job conf : " + je.getMessage());
      return 3;
    } catch(FileAlreadyExistsException fae) {
      LOG.error("Error launching job , Output path already exists : "
                + fae.getMessage());
      return 4;
    } catch(IOException ioe) {
      LOG.error("Error Launching job : " + ioe.getMessage());
      return 5;
    } catch (InterruptedException ie) {
      LOG.error("Error monitoring job : " + ie.getMessage());
      return 6;
    } finally {
      jc_.close();
    }
    return 0;
  }

  protected String[] argv_;
  protected boolean background_;
  protected boolean verbose_;
  protected boolean detailedUsage_;
  protected boolean printUsage = false;
  protected int debug_;

  protected Environment env_;

  protected String jar_;
  protected boolean localHadoop_;
  protected Configuration config_;
  protected JobConf jobConf_;
  protected JobClient jc_;

  // command-line arguments
  protected ArrayList<String> inputSpecs_ = new ArrayList<String>();
  protected TreeSet<String> seenPrimary_ = new TreeSet<String>();
  protected boolean hasSimpleInputSpecs_;
  protected ArrayList<String> packageFiles_ = new ArrayList<String>();
  protected ArrayList<String> shippedCanonFiles_ = new ArrayList<String>();
  //protected TreeMap<String, String> userJobConfProps_ = new TreeMap<String, String>();
  protected String output_;
  protected String mapCmd_;
  protected String comCmd_;
  protected String redCmd_;
  protected String cacheFiles;
  protected String cacheArchives;
  protected URI[] fileURIs;
  protected URI[] archiveURIs;
  protected String inReaderSpec_;
  protected String inputFormatSpec_;
  protected String outputFormatSpec_;
  protected String partitionerSpec_;
  protected String numReduceTasksSpec_;
  protected String additionalConfSpec_;
  protected String mapDebugSpec_;
  protected String reduceDebugSpec_;
  protected String ioSpec_;
  protected boolean lazyOutput_;

  // Used to communicate config to the external processes (e.g. env.var HADOOP_USER)
  // encoding "a=b c=d"
  protected String addTaskEnvironment_;

  protected boolean outputSingleNode_;
  protected long minRecWrittenToEnableSkip_;

  protected RunningJob running_;
  protected JobID jobId_;
  protected static final String LINK_URI = "You need to specify the uris as scheme://path#linkname," +
    "Please specify a different link name for all of your caching URIs";

}