
net.librec.job.JobConf

/**
 * Copyright (C) 2016 LibRec
 * <p>
 * This file is part of LibRec.
 * LibRec is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * <p>
 * LibRec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * <p>
 * You should have received a copy of the GNU General Public License
 * along with LibRec. If not, see <http://www.gnu.org/licenses/>.
 */
package net.librec.job;

/**
 * A map/reduce job configuration.
 *
 * <p><code>JobConf</code> is the primary interface for a user to describe a
 * map-reduce job to the librec framework for execution. The framework tries to
 * faithfully execute the job as described by <code>JobConf</code>, however:
 * <ol>
 *   <li>Some configuration parameters might have been marked as final by
 *   administrators and hence cannot be altered.</li>
 *   <li>While some job parameters are straightforward to set
 *   (e.g. {@link #setNumReduceTasks(int)}), other parameters interact subtly
 *   with the rest of the framework and/or the job configuration and are
 *   relatively more complex for the user to control finely
 *   (e.g. {@link #setNumMapTasks(int)}).</li>
 * </ol>
 *
 * <p><code>JobConf</code> typically specifies the {@link Mapper}, combiner
 * (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
 * {@link OutputFormat} implementations to be used, etc.
 *
 * <p>Optionally, <code>JobConf</code> is used to specify other advanced facets
 * of the job such as the <code>Comparator</code>s to be used, files to be put
 * in the {@link DistributedCache}, whether intermediate and/or job outputs are
 * to be compressed (and how), and debuggability via user-provided scripts
 * ({@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)})
 * for doing post-processing on task logs, the task's stdout, stderr, syslog, etc.
 *
 * <p>Here is an example of how to configure a job via <code>JobConf</code>:
 * <pre>
 *     // Create a new JobConf
 *     JobConf job = new JobConf(new Configuration(), MyJob.class);
 *
 *     // Specify various job-specific parameters
 *     job.setJobName("myjob");
 *
 *     FileInputFormat.setInputPaths(job, new Path("in"));
 *     FileOutputFormat.setOutputPath(job, new Path("out"));
 *
 *     job.setMapperClass(MyJob.MyMapper.class);
 *     job.setCombinerClass(MyJob.MyReducer.class);
 *     job.setReducerClass(MyJob.MyReducer.class);
 *
 *     job.setInputFormat(SequenceFileInputFormat.class);
 *     job.setOutputFormat(SequenceFileOutputFormat.class);
 * </pre>
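 *
 * <p>(Illustrative addition, not part of the original Javadoc:) the configured job
 * would then typically be submitted for execution, e.g.:
 * <pre>
 *     RunningJob rj = JobClient.runJob(job);   // blocks until the job completes
 * </pre>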

* * @see JobClient * @see ClusterStatus * @see Tool * @see DistributedCache */ //@InterfaceAudience.Public //@InterfaceStability.Stable //public class JobConf extends Configuration { // // private static final Log LOG = LogFactory.getLog(JobConf.class); // private static final Pattern JAVA_OPTS_XMX_PATTERN = // Pattern.compile(".*(?:^|\\s)-Xmx(\\d+)([gGmMkK]?)(?:$|\\s).*"); // // static{ // ConfigUtil.loadResources(); // } // // /** // * @deprecated Use {@link #MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY} and // * {@link #MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY} // */ // @Deprecated // public static final String MAPRED_TASK_MAXVMEM_PROPERTY = // "mapred.task.maxvmem"; // // /** // * @deprecated // */ // @Deprecated // public static final String UPPER_LIMIT_ON_TASK_VMEM_PROPERTY = // "mapred.task.limit.maxvmem"; // // /** // * @deprecated // */ // @Deprecated // public static final String MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY = // "mapred.task.default.maxvmem"; // // /** // * @deprecated // */ // @Deprecated // public static final String MAPRED_TASK_MAXPMEM_PROPERTY = // "mapred.task.maxpmem"; // // /** // * A value which if set for memory related configuration options, // * indicates that the options are turned off. // * Deprecated because it makes no sense in the context of MR2. // */ // @Deprecated // public static final long DISABLED_MEMORY_LIMIT = -1L; // // /** // * Property name for the configuration property mapreduce.cluster.local.dir // */ // public static final String MAPRED_LOCAL_DIR_PROPERTY = MRConfig.LOCAL_DIR; // // /** // * Name of the queue to which jobs will be submitted, if no queue // * name is mentioned. // */ // public static final String DEFAULT_QUEUE_NAME = "default"; // // /** // * The variable is kept for M/R 1.x applications, while M/R 2.x applications // * should use {@link #MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY} // */ // @Deprecated // public static final String MAPRED_JOB_MAP_MEMORY_MB_PROPERTY = // "mapred.job.map.memory.mb"; // // /** // * The variable is kept for M/R 1.x applications, while M/R 2.x applications // * should use {@link #MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY} // */ // @Deprecated // public static final String MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY = // "mapred.job.reduce.memory.mb"; // // /** Pattern for the default unpacking behavior for job jars */ // public static final Pattern UNPACK_JAR_PATTERN_DEFAULT = // Pattern.compile("(?:classes/|lib/).*"); // // /** // * Configuration key to set the java command line options for the child // * map and reduce tasks. // * // * Java opts for the task tracker child processes. // * The following symbol, if present, will be interpolated: @taskid@. // * It is replaced by current TaskID. Any other occurrences of '@' will go // * unchanged. // * For example, to enable verbose gc logging to a file named for the taskid in // * /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of: // * -Xmx1024m -verbose:gc -Xloggc:/tmp/@[email protected] // * // * The configuration variable {@link #MAPRED_TASK_ENV} can be used to pass // * other environment variables to the child processes. // * // * @deprecated Use {@link #MAPRED_MAP_TASK_JAVA_OPTS} or // * {@link #MAPRED_REDUCE_TASK_JAVA_OPTS} // */ // @Deprecated // public static final String MAPRED_TASK_JAVA_OPTS = "mapred.child.java.opts"; // // public static final String DEFAULT_MAPRED_TASK_JAVA_OPTS = ""; // // /** // * @deprecated // * Configuration key to set the maximum virtual memory available to the child // * map and reduce tasks (in kilo-bytes). 
This has been deprecated and will no // * longer have any effect. // */ // @Deprecated // public static final String MAPRED_TASK_ULIMIT = "mapred.child.ulimit"; // // /** // * @deprecated // * Configuration key to set the maximum virtual memory available to the // * map tasks (in kilo-bytes). This has been deprecated and will no // * longer have any effect. // */ // @Deprecated // public static final String MAPRED_MAP_TASK_ULIMIT = "mapreduce.map.ulimit"; // // /** // * @deprecated // * Configuration key to set the maximum virtual memory available to the // * reduce tasks (in kilo-bytes). This has been deprecated and will no // * longer have any effect. // */ // @Deprecated // public static final String MAPRED_REDUCE_TASK_ULIMIT = // "mapreduce.reduce.ulimit"; // // // /** // * Configuration key to set the environment of the child map/reduce tasks. // * // * The format of the value is k1=v1,k2=v2. Further it can // * reference existing environment variables via $key on // * Linux or %key% on Windows. // * // * Example: // *
// * <ul>
// *   <li> A=foo - This will set the env variable A to foo. </li>
// *   <li> B=$X:c This inherits the tasktracker's X env variable on Linux. </li>
// *   <li> B=%X%;c This inherits the tasktracker's X env variable on Windows. </li>
// * </ul>
// * // * @deprecated Use {@link #MAPRED_MAP_TASK_ENV} or // * {@link #MAPRED_REDUCE_TASK_ENV} // */ // @Deprecated // public static final String MAPRED_TASK_ENV = "mapred.child.env"; // // /** // * Configuration key to set the environment of the child map tasks. // * // * The format of the value is k1=v1,k2=v2. Further it can // * reference existing environment variables via $key on // * Linux or %key% on Windows. // * // * Example: // *
// * <ul>
// *   <li> A=foo - This will set the env variable A to foo. </li>
// *   <li> B=$X:c This inherits the tasktracker's X env variable on Linux. </li>
// *   <li> B=%X%;c This inherits the tasktracker's X env variable on Windows. </li>
// * </ul>
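// *
// * (Illustrative addition, not in the original comment:) with a JobConf instance
// * this key is set like any other configuration property, e.g.:
// *   conf.set(JobConf.MAPRED_MAP_TASK_ENV, "A=foo,B=$X:c");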
// */ // public static final String MAPRED_MAP_TASK_ENV = JobContext.MAP_ENV; // // /** // * Configuration key to set the environment of the child reduce tasks. // * // * The format of the value is k1=v1,k2=v2. Further it can // * reference existing environment variables via $key on // * Linux or %key% on Windows. // * // * Example: // *
// * <ul>
// *   <li> A=foo - This will set the env variable A to foo. </li>
// *   <li> B=$X:c This inherits the tasktracker's X env variable on Linux. </li>
// *   <li> B=%X%;c This inherits the tasktracker's X env variable on Windows. </li>
// * </ul>
// */ // public static final String MAPRED_REDUCE_TASK_ENV = JobContext.REDUCE_ENV; // // private Credentials credentials = new Credentials(); // // /** // * Configuration key to set the logging {@link Level} for the map task. // * // * The allowed logging levels are: // * OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE and ALL. // */ // public static final String MAPRED_MAP_TASK_LOG_LEVEL = // JobContext.MAP_LOG_LEVEL; // // /** // * Configuration key to set the logging {@link Level} for the reduce task. // * // * The allowed logging levels are: // * OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE and ALL. // */ // public static final String MAPRED_REDUCE_TASK_LOG_LEVEL = // JobContext.REDUCE_LOG_LEVEL; // // /** // * Default logging level for map/reduce tasks. // */ // public static final Level DEFAULT_LOG_LEVEL = Level.INFO; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * use {@link MRJobConfig#WORKFLOW_ID} instead // */ // @Deprecated // public static final String WORKFLOW_ID = MRJobConfig.WORKFLOW_ID; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * use {@link MRJobConfig#WORKFLOW_NAME} instead // */ // @Deprecated // public static final String WORKFLOW_NAME = MRJobConfig.WORKFLOW_NAME; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * use {@link MRJobConfig#WORKFLOW_NODE_NAME} instead // */ // @Deprecated // public static final String WORKFLOW_NODE_NAME = // MRJobConfig.WORKFLOW_NODE_NAME; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * use {@link MRJobConfig#WORKFLOW_ADJACENCY_PREFIX_STRING} instead // */ // @Deprecated // public static final String WORKFLOW_ADJACENCY_PREFIX_STRING = // MRJobConfig.WORKFLOW_ADJACENCY_PREFIX_STRING; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * use {@link MRJobConfig#WORKFLOW_ADJACENCY_PREFIX_PATTERN} instead // */ // @Deprecated // public static final String WORKFLOW_ADJACENCY_PREFIX_PATTERN = // MRJobConfig.WORKFLOW_ADJACENCY_PREFIX_PATTERN; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * use {@link MRJobConfig#WORKFLOW_TAGS} instead // */ // @Deprecated // public static final String WORKFLOW_TAGS = MRJobConfig.WORKFLOW_TAGS; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * not use it // */ // @Deprecated // public static final String MAPREDUCE_RECOVER_JOB = // "mapreduce.job.restart.recover"; // // /** // * The variable is kept for M/R 1.x applications, M/R 2.x applications should // * not use it // */ // @Deprecated // public static final boolean DEFAULT_MAPREDUCE_RECOVER_JOB = true; // // /** // * Construct a map/reduce job configuration. // */ // public JobConf() { // checkAndWarnDeprecation(); // } // // /** // * Construct a map/reduce job configuration. // * // * @param exampleClass a class whose containing jar is used as the job's jar. // */ // public JobConf(Class exampleClass) { // setJarByClass(exampleClass); // checkAndWarnDeprecation(); // } // // /** // * Construct a map/reduce job configuration. // * // * @param conf a Configuration whose settings will be inherited. 
// */ // public JobConf(Configuration conf) { // super(conf); // // if (conf instanceof JobConf) { // JobConf that = (JobConf)conf; // credentials = that.credentials; // } // // checkAndWarnDeprecation(); // } // // // /** Construct a map/reduce job configuration. // * // * @param conf a Configuration whose settings will be inherited. // * @param exampleClass a class whose containing jar is used as the job's jar. // */ // public JobConf(Configuration conf, Class exampleClass) { // this(conf); // setJarByClass(exampleClass); // } // // // /** Construct a map/reduce configuration. // * // * @param config a Configuration-format XML job description file. // */ // public JobConf(String config) { // this(new Path(config)); // } // // /** Construct a map/reduce configuration. // * // * @param config a Configuration-format XML job description file. // */ // public JobConf(Path config) { // super(); // addResource(config); // checkAndWarnDeprecation(); // } // // /** A new map/reduce configuration where the behavior of reading from the // * default resources can be turned off. // *

// * If the parameter {@code loadDefaults} is false, the new instance // * will not load resources from the default files. // * // * @param loadDefaults specifies whether to load from the default files // */ // public JobConf(boolean loadDefaults) { // super(loadDefaults); // checkAndWarnDeprecation(); // } // // /** // * Get credentials for the job. // * @return credentials for the job // */ // public Credentials getCredentials() { // return credentials; // } // // @Private // public void setCredentials(Credentials credentials) { // this.credentials = credentials; // } // // /** // * Get the user jar for the map-reduce job. // * // * @return the user jar for the map-reduce job. // */ // public String getJar() { return get(JobContext.JAR); } // // /** // * Set the user jar for the map-reduce job. // * // * @param jar the user jar for the map-reduce job. // */ // public void setJar(String jar) { set(JobContext.JAR, jar); } // // /** // * Get the pattern for jar contents to unpack on the tasktracker // */ // public Pattern getJarUnpackPattern() { // return getPattern(JobContext.JAR_UNPACK_PATTERN, UNPACK_JAR_PATTERN_DEFAULT); // } // // // /** // * Set the job's jar file by finding an example class location. // * // * @param cls the example class. // */ // public void setJarByClass(Class cls) { // String jar = ClassUtil.findContainingJar(cls); // if (jar != null) { // setJar(jar); // } // } // // public String[] getLocalDirs() throws IOException { // return getTrimmedStrings(MRConfig.LOCAL_DIR); // } // // /** // * Use MRAsyncDiskService.moveAndDeleteAllVolumes instead. // */ // @Deprecated // public void deleteLocalFiles() throws IOException { // String[] localDirs = getLocalDirs(); // for (int i = 0; i < localDirs.length; i++) { // FileSystem.getLocal(this).delete(new Path(localDirs[i]), true); // } // } // // public void deleteLocalFiles(String subdir) throws IOException { // String[] localDirs = getLocalDirs(); // for (int i = 0; i < localDirs.length; i++) { // FileSystem.getLocal(this).delete(new Path(localDirs[i], subdir), true); // } // } // // /** // * Constructs a local file name. Files are distributed among configured // * local directories. // */ // public Path getLocalPath(String pathString) throws IOException { // return getLocalPath(MRConfig.LOCAL_DIR, pathString); // } // // /** // * Get the reported username for this job. // * // * @return the username // */ // public String getUser() { // return get(JobContext.USER_NAME); // } // // /** // * Set the reported username for this job. // * // * @param user the username for this job. // */ // public void setUser(String user) { // set(JobContext.USER_NAME, user); // } // // // // /** // * Set whether the framework should keep the intermediate files for // * failed tasks. // * // * @param keep true if framework should keep the intermediate files // * for failed tasks, false otherwise. // * // */ // public void setKeepFailedTaskFiles(boolean keep) { // setBoolean(JobContext.PRESERVE_FAILED_TASK_FILES, keep); // } // // /** // * Should the temporary files for failed tasks be kept? // * // * @return should the files be kept? // */ // public boolean getKeepFailedTaskFiles() { // return getBoolean(JobContext.PRESERVE_FAILED_TASK_FILES, false); // } // // /** // * Set a regular expression for task names that should be kept. // * The regular expression ".*_m_000123_0" would keep the files // * for the first instance of map 123 that ran. // * // * @param pattern the java.util.regex.Pattern to match against the // * task names. 
// */ // public void setKeepTaskFilesPattern(String pattern) { // set(JobContext.PRESERVE_FILES_PATTERN, pattern); // } // // /** // * Get the regular expression that is matched against the task names // * to see if we need to keep the files. // * // * @return the pattern as a string, if it was set, othewise null. // */ // public String getKeepTaskFilesPattern() { // return get(JobContext.PRESERVE_FILES_PATTERN); // } // // /** // * Set the current working directory for the default file system. // * // * @param dir the new current working directory. // */ // public void setWorkingDirectory(Path dir) { // dir = new Path(getWorkingDirectory(), dir); // set(JobContext.WORKING_DIR, dir.toString()); // } // // /** // * Get the current working directory for the default file system. // * // * @return the directory name. // */ // public Path getWorkingDirectory() { // String name = get(JobContext.WORKING_DIR); // if (name != null) { // return new Path(name); // } else { // try { // Path dir = FileSystem.get(this).getWorkingDirectory(); // set(JobContext.WORKING_DIR, dir.toString()); // return dir; // } catch (IOException e) { // throw new RuntimeException(e); // } // } // } // // /** // * Sets the number of tasks that a spawned task JVM should run // * before it exits // * @param numTasks the number of tasks to execute; defaults to 1; // * -1 signifies no limit // */ // public void setNumTasksToExecutePerJvm(int numTasks) { // setInt(JobContext.JVM_NUMTASKS_TORUN, numTasks); // } // // /** // * Get the number of tasks that a spawned JVM should execute // */ // public int getNumTasksToExecutePerJvm() { // return getInt(JobContext.JVM_NUMTASKS_TORUN, 1); // } // // /** // * Get the {@link InputFormat} implementation for the map-reduce job, // * defaults to {@link TextInputFormat} if not specified explicity. // * // * @return the {@link InputFormat} implementation for the map-reduce job. // */ // public InputFormat getInputFormat() { // return ReflectionUtils.newInstance(getClass("mapred.input.format.class", // TextInputFormat.class, // InputFormat.class), // this); // } // // /** // * Set the {@link InputFormat} implementation for the map-reduce job. // * // * @param theClass the {@link InputFormat} implementation for the map-reduce // * job. // */ // public void setInputFormat(Class theClass) { // setClass("mapred.input.format.class", theClass, InputFormat.class); // } // // /** // * Get the {@link OutputFormat} implementation for the map-reduce job, // * defaults to {@link TextOutputFormat} if not specified explicity. // * // * @return the {@link OutputFormat} implementation for the map-reduce job. // */ // public OutputFormat getOutputFormat() { // return ReflectionUtils.newInstance(getClass("mapred.output.format.class", // TextOutputFormat.class, // OutputFormat.class), // this); // } // // /** // * Get the {@link OutputCommitter} implementation for the map-reduce job, // * defaults to {@link FileOutputCommitter} if not specified explicitly. // * // * @return the {@link OutputCommitter} implementation for the map-reduce job. // */ // public OutputCommitter getOutputCommitter() { // return (OutputCommitter)ReflectionUtils.newInstance( // getClass("mapred.output.committer.class", FileOutputCommitter.class, // OutputCommitter.class), this); // } // // /** // * Set the {@link OutputCommitter} implementation for the map-reduce job. // * // * @param theClass the {@link OutputCommitter} implementation for the map-reduce // * job. 
// */ // public void setOutputCommitter(Class theClass) { // setClass("mapred.output.committer.class", theClass, OutputCommitter.class); // } // // /** // * Set the {@link OutputFormat} implementation for the map-reduce job. // * // * @param theClass the {@link OutputFormat} implementation for the map-reduce // * job. // */ // public void setOutputFormat(Class theClass) { // setClass("mapred.output.format.class", theClass, OutputFormat.class); // } // // /** // * Should the map outputs be compressed before transfer? // * // * @param compress should the map outputs be compressed? // */ // public void setCompressMapOutput(boolean compress) { // setBoolean(JobContext.MAP_OUTPUT_COMPRESS, compress); // } // // /** // * Are the outputs of the maps be compressed? // * // * @return true if the outputs of the maps are to be compressed, // * false otherwise. // */ // public boolean getCompressMapOutput() { // return getBoolean(JobContext.MAP_OUTPUT_COMPRESS, false); // } // // /** // * Set the given class as the {@link CompressionCodec} for the map outputs. // * // * @param codecClass the {@link CompressionCodec} class that will compress // * the map outputs. // */ // public void // setMapOutputCompressorClass(Class codecClass) { // setCompressMapOutput(true); // setClass(JobContext.MAP_OUTPUT_COMPRESS_CODEC, codecClass, // CompressionCodec.class); // } // // /** // * Get the {@link CompressionCodec} for compressing the map outputs. // * // * @param defaultValue the {@link CompressionCodec} to return if not set // * @return the {@link CompressionCodec} class that should be used to compress the // * map outputs. // * @throws IllegalArgumentException if the class was specified, but not found // */ // public Class // getMapOutputCompressorClass(Class defaultValue) { // Class codecClass = defaultValue; // String name = get(JobContext.MAP_OUTPUT_COMPRESS_CODEC); // if (name != null) { // try { // codecClass = getClassByName(name).asSubclass(CompressionCodec.class); // } catch (ClassNotFoundException e) { // throw new IllegalArgumentException("Compression codec " + name + // " was not found.", e); // } // } // return codecClass; // } // // /** // * Get the key class for the map output data. If it is not set, use the // * (final) output key class. This allows the map output key class to be // * different than the final output key class. // * // * @return the map output key class. // */ // public Class getMapOutputKeyClass() { // Class retv = getClass(JobContext.MAP_OUTPUT_KEY_CLASS, null, Object.class); // if (retv == null) { // retv = getOutputKeyClass(); // } // return retv; // } // // /** // * Set the key class for the map output data. This allows the user to // * specify the map output key class to be different than the final output // * value class. // * // * @param theClass the map output key class. // */ // public void setMapOutputKeyClass(Class theClass) { // setClass(JobContext.MAP_OUTPUT_KEY_CLASS, theClass, Object.class); // } // // /** // * Get the value class for the map output data. If it is not set, use the // * (final) output value class This allows the map output value class to be // * different than the final output value class. // * // * @return the map output value class. // */ // public Class getMapOutputValueClass() { // Class retv = getClass(JobContext.MAP_OUTPUT_VALUE_CLASS, null, // Object.class); // if (retv == null) { // retv = getOutputValueClass(); // } // return retv; // } // // /** // * Set the value class for the map output data. 
This allows the user to // * specify the map output value class to be different than the final output // * value class. // * // * @param theClass the map output value class. // */ // public void setMapOutputValueClass(Class theClass) { // setClass(JobContext.MAP_OUTPUT_VALUE_CLASS, theClass, Object.class); // } // // /** // * Get the key class for the job output data. // * // * @return the key class for the job output data. // */ // public Class getOutputKeyClass() { // return getClass(JobContext.OUTPUT_KEY_CLASS, // LongWritable.class, Object.class); // } // // /** // * Set the key class for the job output data. // * // * @param theClass the key class for the job output data. // */ // public void setOutputKeyClass(Class theClass) { // setClass(JobContext.OUTPUT_KEY_CLASS, theClass, Object.class); // } // // /** // * Get the {@link RawComparator} comparator used to compare keys. // * // * @return the {@link RawComparator} comparator used to compare keys. // */ // public RawComparator getOutputKeyComparator() { // Class theClass = getClass( // JobContext.KEY_COMPARATOR, null, RawComparator.class); // if (theClass != null) // return ReflectionUtils.newInstance(theClass, this); // return WritableComparator.get(getMapOutputKeyClass().asSubclass(WritableComparable.class), this); // } // // /** // * Set the {@link RawComparator} comparator used to compare keys. // * // * @param theClass the {@link RawComparator} comparator used to // * compare keys. // * @see #setOutputValueGroupingComparator(Class) // */ // public void setOutputKeyComparatorClass(Class theClass) { // setClass(JobContext.KEY_COMPARATOR, // theClass, RawComparator.class); // } // // /** // * Set the {@link KeyFieldBasedComparator} options used to compare keys. // * // * @param keySpec the key specification of the form -k pos1[,pos2], where, // * pos is of the form f[.c][opts], where f is the number // * of the key field to use, and c is the number of the first character from // * the beginning of the field. Fields and character posns are numbered // * starting with 1; a character position of zero in pos2 indicates the // * field's last character. If '.c' is omitted from pos1, it defaults to 1 // * (the beginning of the field); if omitted from pos2, it defaults to 0 // * (the end of the field). opts are ordering options. The supported options // * are: // * -n, (Sort numerically) // * -r, (Reverse the result of comparison) // */ // public void setKeyFieldComparatorOptions(String keySpec) { // setOutputKeyComparatorClass(KeyFieldBasedComparator.class); // set(KeyFieldBasedComparator.COMPARATOR_OPTIONS, keySpec); // } // // /** // * Get the {@link KeyFieldBasedComparator} options // */ // public String getKeyFieldComparatorOption() { // return get(KeyFieldBasedComparator.COMPARATOR_OPTIONS); // } // // /** // * Set the {@link KeyFieldBasedPartitioner} options used for // * {@link Partitioner} // * // * @param keySpec the key specification of the form -k pos1[,pos2], where, // * pos is of the form f[.c][opts], where f is the number // * of the key field to use, and c is the number of the first character from // * the beginning of the field. Fields and character posns are numbered // * starting with 1; a character position of zero in pos2 indicates the // * field's last character. If '.c' is omitted from pos1, it defaults to 1 // * (the beginning of the field); if omitted from pos2, it defaults to 0 // * (the end of the field). 
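// *
// * (Illustrative addition, not in the original comment:) e.g. to partition on the
// * second key field one might use:
// *   conf.setKeyFieldPartitionerOptions("-k2,2");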
// */ // public void setKeyFieldPartitionerOptions(String keySpec) { // setPartitionerClass(KeyFieldBasedPartitioner.class); // set(KeyFieldBasedPartitioner.PARTITIONER_OPTIONS, keySpec); // } // // /** // * Get the {@link KeyFieldBasedPartitioner} options // */ // public String getKeyFieldPartitionerOption() { // return get(KeyFieldBasedPartitioner.PARTITIONER_OPTIONS); // } // // /** // * Get the user defined {@link WritableComparable} comparator for // * grouping keys of inputs to the combiner. // * // * @return comparator set by the user for grouping values. // * @see #setCombinerKeyGroupingComparator(Class) for details. // */ // public RawComparator getCombinerKeyGroupingComparator() { // Class theClass = getClass( // JobContext.COMBINER_GROUP_COMPARATOR_CLASS, null, RawComparator.class); // if (theClass == null) { // return getOutputKeyComparator(); // } // // return ReflectionUtils.newInstance(theClass, this); // } // // /** // * Get the user defined {@link WritableComparable} comparator for // * grouping keys of inputs to the reduce. // * // * @return comparator set by the user for grouping values. // * @see #setOutputValueGroupingComparator(Class) for details. // */ // public RawComparator getOutputValueGroupingComparator() { // Class theClass = getClass( // JobContext.GROUP_COMPARATOR_CLASS, null, RawComparator.class); // if (theClass == null) { // return getOutputKeyComparator(); // } // // return ReflectionUtils.newInstance(theClass, this); // } // // /** // * Set the user defined {@link RawComparator} comparator for // * grouping keys in the input to the combiner. // *

// *

This comparator should be provided if the equivalence rules for keys // * for sorting the intermediates are different from those for grouping keys // * before each call to // * {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

// *

// *

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed // * in a single call to the reduce function if K1 and K2 compare as equal.

// *

// *

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control // * how keys are sorted, this can be used in conjunction to simulate // * secondary sort on values.

// *

// *

Note: This is not a guarantee of the combiner sort being // * stable in any sense. (In any case, with the order of available // * map-outputs to the combiner being non-deterministic, it wouldn't make // * that much sense.)

// * // * @param theClass the comparator class to be used for grouping keys for the // * combiner. It should implement RawComparator. // * @see #setOutputKeyComparatorClass(Class) // */ // public void setCombinerKeyGroupingComparator( // Class theClass) { // setClass(JobContext.COMBINER_GROUP_COMPARATOR_CLASS, // theClass, RawComparator.class); // } // // /** // * Set the user defined {@link RawComparator} comparator for // * grouping keys in the input to the reduce. // * // *

This comparator should be provided if the equivalence rules for keys // * for sorting the intermediates are different from those for grouping keys // * before each call to // * {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

// * // *

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed // * in a single call to the reduce function if K1 and K2 compare as equal.

// * // *

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control // * how keys are sorted, this can be used in conjunction to simulate // * secondary sort on values.

// * // *

Note: This is not a guarantee of the reduce sort being // * stable in any sense. (In any case, with the order of available // * map-outputs to the reduce being non-deterministic, it wouldn't make // * that much sense.)
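// *
// * (Illustrative sketch, added; not part of the original comment:) a typical
// * secondary-sort setup combines the two comparators, e.g.:
// *   conf.setOutputKeyComparatorClass(CompositeKeySortComparator.class);        // full key order
// *   conf.setOutputValueGroupingComparator(NaturalKeyGroupingComparator.class); // grouping only
// * where CompositeKeySortComparator and NaturalKeyGroupingComparator are hypothetical
// * RawComparator implementations supplied by the job.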

// * // * @param theClass the comparator class to be used for grouping keys. // * It should implement RawComparator. // * @see #setOutputKeyComparatorClass(Class) // * @see #setCombinerKeyGroupingComparator(Class) // */ // public void setOutputValueGroupingComparator( // Class theClass) { // setClass(JobContext.GROUP_COMPARATOR_CLASS, // theClass, RawComparator.class); // } // // /** // * Should the framework use the new context-object code for running // * the mapper? // * @return true, if the new api should be used // */ // public boolean getUseNewMapper() { // return getBoolean("mapred.mapper.new-api", false); // } // /** // * Set whether the framework should use the new api for the mapper. // * This is the default for jobs submitted with the new Job api. // * @param flag true, if the new api should be used // */ // public void setUseNewMapper(boolean flag) { // setBoolean("mapred.mapper.new-api", flag); // } // // /** // * Should the framework use the new context-object code for running // * the reducer? // * @return true, if the new api should be used // */ // public boolean getUseNewReducer() { // return getBoolean("mapred.reducer.new-api", false); // } // /** // * Set whether the framework should use the new api for the reducer. // * This is the default for jobs submitted with the new Job api. // * @param flag true, if the new api should be used // */ // public void setUseNewReducer(boolean flag) { // setBoolean("mapred.reducer.new-api", flag); // } // // /** // * Get the value class for job outputs. // * // * @return the value class for job outputs. // */ // public Class getOutputValueClass() { // return getClass(JobContext.OUTPUT_VALUE_CLASS, Text.class, Object.class); // } // // /** // * Set the value class for job outputs. // * // * @param theClass the value class for job outputs. // */ // public void setOutputValueClass(Class theClass) { // setClass(JobContext.OUTPUT_VALUE_CLASS, theClass, Object.class); // } // // /** // * Get the {@link Mapper} class for the job. // * // * @return the {@link Mapper} class for the job. // */ // public Class getMapperClass() { // return getClass("mapred.mapper.class", IdentityMapper.class, Mapper.class); // } // // /** // * Set the {@link Mapper} class for the job. // * // * @param theClass the {@link Mapper} class for the job. // */ // public void setMapperClass(Class theClass) { // setClass("mapred.mapper.class", theClass, Mapper.class); // } // // /** // * Get the {@link MapRunnable} class for the job. // * // * @return the {@link MapRunnable} class for the job. // */ // public Class getMapRunnerClass() { // return getClass("mapred.map.runner.class", // MapRunner.class, MapRunnable.class); // } // // /** // * Expert: Set the {@link MapRunnable} class for the job. // * // * Typically used to exert greater control on {@link Mapper}s. // * // * @param theClass the {@link MapRunnable} class for the job. // */ // public void setMapRunnerClass(Class theClass) { // setClass("mapred.map.runner.class", theClass, MapRunnable.class); // } // // /** // * Get the {@link Partitioner} used to partition {@link Mapper}-outputs // * to be sent to the {@link Reducer}s. // * // * @return the {@link Partitioner} used to partition map-outputs. // */ // public Class getPartitionerClass() { // return getClass("mapred.partitioner.class", // HashPartitioner.class, Partitioner.class); // } // // /** // * Set the {@link Partitioner} class used to partition // * {@link Mapper}-outputs to be sent to the {@link Reducer}s. 
// * // * @param theClass the {@link Partitioner} used to partition map-outputs. // */ // public void setPartitionerClass(Class theClass) { // setClass("mapred.partitioner.class", theClass, Partitioner.class); // } // // /** // * Get the {@link Reducer} class for the job. // * // * @return the {@link Reducer} class for the job. // */ // public Class getReducerClass() { // return getClass("mapred.reducer.class", // IdentityReducer.class, Reducer.class); // } // // /** // * Set the {@link Reducer} class for the job. // * // * @param theClass the {@link Reducer} class for the job. // */ // public void setReducerClass(Class theClass) { // setClass("mapred.reducer.class", theClass, Reducer.class); // } // // /** // * Get the user-defined combiner class used to combine map-outputs // * before being sent to the reducers. Typically the combiner is same as the // * the {@link Reducer} for the job i.e. {@link #getReducerClass()}. // * // * @return the user-defined combiner class used to combine map-outputs. // */ // public Class getCombinerClass() { // return getClass("mapred.combiner.class", null, Reducer.class); // } // // /** // * Set the user-defined combiner class used to combine map-outputs // * before being sent to the reducers. // * // *

The combiner is an application-specified aggregation operation, which // * can help cut down the amount of data transferred between the // * {@link Mapper} and the {@link Reducer}, leading to better performance.

// * // *

The framework may invoke the combiner 0, 1, or multiple times, in both // * the mapper and reducer tasks. In general, the combiner is called as the // * sort/merge result is written to disk. The combiner must: // *

// * <ul>
// *   <li> be side-effect free </li>
// *   <li> have the same input and output key types and the same input and
// *        output value types </li>
// * </ul>

// * // *

Typically the combiner is the same as the Reducer for the // * job, i.e. {@link #setReducerClass(Class)}.
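// *
// * (Added note, illustrative:) because the combiner may run zero, one, or many times,
// * the job must produce the same final result either way; e.g. summing partial counts
// * in a combiner is safe, while directly averaging values in a combiner is not.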

// * // * @param theClass the user-defined combiner class used to combine // * map-outputs. // */ // public void setCombinerClass(Class theClass) { // setClass("mapred.combiner.class", theClass, Reducer.class); // } // // /** // * Should speculative execution be used for this job? // * Defaults to true. // * // * @return true if speculative execution be used for this job, // * false otherwise. // */ // public boolean getSpeculativeExecution() { // return (getMapSpeculativeExecution() || getReduceSpeculativeExecution()); // } // // /** // * Turn speculative execution on or off for this job. // * // * @param speculativeExecution true if speculative execution // * should be turned on, else false. // */ // public void setSpeculativeExecution(boolean speculativeExecution) { // setMapSpeculativeExecution(speculativeExecution); // setReduceSpeculativeExecution(speculativeExecution); // } // // /** // * Should speculative execution be used for this job for map tasks? // * Defaults to true. // * // * @return true if speculative execution be // * used for this job for map tasks, // * false otherwise. // */ // public boolean getMapSpeculativeExecution() { // return getBoolean(JobContext.MAP_SPECULATIVE, true); // } // // /** // * Turn speculative execution on or off for this job for map tasks. // * // * @param speculativeExecution true if speculative execution // * should be turned on for map tasks, // * else false. // */ // public void setMapSpeculativeExecution(boolean speculativeExecution) { // setBoolean(JobContext.MAP_SPECULATIVE, speculativeExecution); // } // // /** // * Should speculative execution be used for this job for reduce tasks? // * Defaults to true. // * // * @return true if speculative execution be used // * for reduce tasks for this job, // * false otherwise. // */ // public boolean getReduceSpeculativeExecution() { // return getBoolean(JobContext.REDUCE_SPECULATIVE, true); // } // // /** // * Turn speculative execution on or off for this job for reduce tasks. // * // * @param speculativeExecution true if speculative execution // * should be turned on for reduce tasks, // * else false. // */ // public void setReduceSpeculativeExecution(boolean speculativeExecution) { // setBoolean(JobContext.REDUCE_SPECULATIVE, // speculativeExecution); // } // // /** // * Get configured the number of reduce tasks for this job. // * Defaults to 1. // * // * @return the number of reduce tasks for this job. // */ // public int getNumMapTasks() { return getInt(JobContext.NUM_MAPS, 1); } // // /** // * Set the number of map tasks for this job. // * // *

Note: This is only a hint to the framework. The actual // * number of spawned map tasks depends on the number of {@link InputSplit}s // * generated by the job's {@link InputFormat#getSplits(JobConf, int)}. // * // * A custom {@link InputFormat} is typically used to accurately control // * the number of map tasks for the job.
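// *
// * (Added example, illustrative:) the hint itself is a single call, e.g.
// *   conf.setNumMapTasks(100);
// * but the actual number of maps still follows from the InputFormat's splits.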

// * // *

How many maps?

// * // *

The number of maps is usually driven by the total size of the inputs // * i.e. total number of blocks of the input files.

// * // *

The right level of parallelism for maps seems to be around 10-100 maps // * per-node, although it has been set up to 300 or so for very cpu-light map // * tasks. Task setup takes a while, so it is best if the maps take at least a // * minute to execute.

// * // *

The default behavior of file-based {@link InputFormat}s is to split the // * input into logical {@link InputSplit}s based on the total size, in // * bytes, of input files. However, the {@link FileSystem} blocksize of the // * input files is treated as an upper bound for input splits. A lower bound // * on the split size can be set via // * // * mapreduce.input.fileinputformat.split.minsize.

// * // *

Thus, if you expect 10TB of input data and have a blocksize of 128MB, // * you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is // * used to set it even higher.
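// *
// * (Added note, illustrative arithmetic:) 10 TB / 128 MB = 10 * 1024 * 1024 / 128
// * = 81,920 splits, which is where the figure of roughly 82,000 maps comes from.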

// * // * @param n the number of map tasks for this job. // * @see InputFormat#getSplits(JobConf, int) // * @see FileInputFormat // * @see FileSystem#getDefaultBlockSize() // * @see FileStatus#getBlockSize() // */ // public void setNumMapTasks(int n) { setInt(JobContext.NUM_MAPS, n); } // // /** // * Get configured the number of reduce tasks for this job. Defaults to // * 1. // * // * @return the number of reduce tasks for this job. // */ // public int getNumReduceTasks() { return getInt(JobContext.NUM_REDUCES, 1); } // // /** // * Set the requisite number of reduce tasks for this job. // * // *

How many reduces?

// * // *

The right number of reduces seems to be 0.95 or // * 1.75 multiplied by (<no. of nodes> * // * // * mapreduce.tasktracker.reduce.tasks.maximum). // *

// * // *

With 0.95 all of the reduces can launch immediately and // * start transferring map outputs as the maps finish. With 1.75 // * the faster nodes will finish their first round of reduces and launch a // * second wave of reduces doing a much better job of load balancing.

// * // *

Increasing the number of reduces increases the framework overhead, but // * increases load balancing and lowers the cost of failures.

// * // *

The scaling factors above are slightly less than whole numbers to // * reserve a few reduce slots in the framework for speculative-tasks, failures // * etc.
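// *
// * (Added example, illustrative:) with 10 nodes and
// * mapreduce.tasktracker.reduce.tasks.maximum = 2, the guideline gives either
// * 0.95 * 10 * 2 = 19 reduces (one wave) or 1.75 * 10 * 2 = 35 reduces (two waves), e.g.:
// *   conf.setNumReduceTasks(19);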

// * // *

Reducer NONE

// * // *

It is legal to set the number of reduce-tasks to zero.

// * // *

In this case the output of the map-tasks goes directly to the distributed // * file-system, to the path set by // * {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the // * framework doesn't sort the map-outputs before writing them out to HDFS.
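// *
// * (Added example, illustrative:) a map-only job simply sets
// *   conf.setNumReduceTasks(0);
// * and lets the map output go straight to the configured output path.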

// * // * @param n the number of reduce tasks for this job. // */ // public void setNumReduceTasks(int n) { setInt(JobContext.NUM_REDUCES, n); } // // /** // * Get the configured number of maximum attempts that will be made to run a // * map task, as specified by the mapreduce.map.maxattempts // * property. If this property is not already set, the default is 4 attempts. // * // * @return the max number of attempts per map task. // */ // public int getMaxMapAttempts() { // return getInt(JobContext.MAP_MAX_ATTEMPTS, 4); // } // // /** // * Expert: Set the number of maximum attempts that will be made to run a // * map task. // * // * @param n the number of attempts per map task. // */ // public void setMaxMapAttempts(int n) { // setInt(JobContext.MAP_MAX_ATTEMPTS, n); // } // // /** // * Get the configured number of maximum attempts that will be made to run a // * reduce task, as specified by the mapreduce.reduce.maxattempts // * property. If this property is not already set, the default is 4 attempts. // * // * @return the max number of attempts per reduce task. // */ // public int getMaxReduceAttempts() { // return getInt(JobContext.REDUCE_MAX_ATTEMPTS, 4); // } // /** // * Expert: Set the number of maximum attempts that will be made to run a // * reduce task. // * // * @param n the number of attempts per reduce task. // */ // public void setMaxReduceAttempts(int n) { // setInt(JobContext.REDUCE_MAX_ATTEMPTS, n); // } // // /** // * Get the user-specified job name. This is only used to identify the // * job to the user. // * // * @return the job's name, defaulting to "". // */ // public String getJobName() { // return get(JobContext.JOB_NAME, ""); // } // // /** // * Set the user-specified job name. // * // * @param name the job's new name. // */ // public void setJobName(String name) { // set(JobContext.JOB_NAME, name); // } // // /** // * Get the user-specified session identifier. The default is the empty string. // * // * The session identifier is used to tag metric data that is reported to some // * performance metrics system via the org.apache.librec.metrics API. The // * session identifier is intended, in particular, for use by librec-On-Demand // * (HOD) which allocates a virtual librec cluster dynamically and transiently. // * HOD will set the session identifier by modifying the mapred-site.xml file // * before starting the cluster. // * // * When not running under HOD, this identifer is expected to remain set to // * the empty string. // * // * @return the session identifier, defaulting to "". // */ // @Deprecated // public String getSessionId() { // return get("session.id", ""); // } // // /** // * Set the user-specified session identifier. // * // * @param sessionId the new session id. // */ // @Deprecated // public void setSessionId(String sessionId) { // set("session.id", sessionId); // } // // /** // * Set the maximum no. of failures of a given job per tasktracker. // * If the no. of task failures exceeds noFailures, the // * tasktracker is blacklisted for this job. // * // * @param noFailures maximum no. of failures of a given job per tasktracker. // */ // public void setMaxTaskFailuresPerTracker(int noFailures) { // setInt(JobContext.MAX_TASK_FAILURES_PER_TRACKER, noFailures); // } // // /** // * Expert: Get the maximum no. of failures of a given job per tasktracker. // * If the no. of task failures exceeds this, the tasktracker is // * blacklisted for this job. // * // * @return the maximum no. of failures of a given job per tasktracker. 
// */ // public int getMaxTaskFailuresPerTracker() { // return getInt(JobContext.MAX_TASK_FAILURES_PER_TRACKER, 3); // } // // /** // * Get the maximum percentage of map tasks that can fail without // * the job being aborted. // * // * Each map task is executed a minimum of {@link #getMaxMapAttempts()} // * attempts before being declared as failed. // * // * Defaults to zero, i.e. any failed map-task results in // * the job being declared as {@link JobStatus#FAILED}. // * // * @return the maximum percentage of map tasks that can fail without // * the job being aborted. // */ // public int getMaxMapTaskFailuresPercent() { // return getInt(JobContext.MAP_FAILURES_MAX_PERCENT, 0); // } // // /** // * Expert: Set the maximum percentage of map tasks that can fail without the // * job being aborted. // * // * Each map task is executed a minimum of {@link #getMaxMapAttempts} attempts // * before being declared as failed. // * // * @param percent the maximum percentage of map tasks that can fail without // * the job being aborted. // */ // public void setMaxMapTaskFailuresPercent(int percent) { // setInt(JobContext.MAP_FAILURES_MAX_PERCENT, percent); // } // // /** // * Get the maximum percentage of reduce tasks that can fail without // * the job being aborted. // * // * Each reduce task is executed a minimum of {@link #getMaxReduceAttempts()} // * attempts before being declared as failed. // * // * Defaults to zero, i.e. any failed reduce-task results // * in the job being declared as {@link JobStatus#FAILED}. // * // * @return the maximum percentage of reduce tasks that can fail without // * the job being aborted. // */ // public int getMaxReduceTaskFailuresPercent() { // return getInt(JobContext.REDUCE_FAILURES_MAXPERCENT, 0); // } // // /** // * Set the maximum percentage of reduce tasks that can fail without the job // * being aborted. // * // * Each reduce task is executed a minimum of {@link #getMaxReduceAttempts()} // * attempts before being declared as failed. // * // * @param percent the maximum percentage of reduce tasks that can fail without // * the job being aborted. // */ // public void setMaxReduceTaskFailuresPercent(int percent) { // setInt(JobContext.REDUCE_FAILURES_MAXPERCENT, percent); // } // // /** // * Set {@link JobPriority} for this job. // * // * @param prio the {@link JobPriority} for this job. // */ // public void setJobPriority(JobPriority prio) { // set(JobContext.PRIORITY, prio.toString()); // } // // /** // * Get the {@link JobPriority} for this job. // * // * @return the {@link JobPriority} for this job. // */ // public JobPriority getJobPriority() { // String prio = get(JobContext.PRIORITY); // if(prio == null) { // return JobPriority.NORMAL; // } // // return JobPriority.valueOf(prio); // } // // /** // * Set JobSubmitHostName for this job. // * // * @param hostname the JobSubmitHostName for this job. // */ // void setJobSubmitHostName(String hostname) { // set(MRJobConfig.JOB_SUBMITHOST, hostname); // } // // /** // * Get the JobSubmitHostName for this job. // * // * @return the JobSubmitHostName for this job. // */ // String getJobSubmitHostName() { // String hostname = get(MRJobConfig.JOB_SUBMITHOST); // // return hostname; // } // // /** // * Set JobSubmitHostAddress for this job. // * // * @param hostadd the JobSubmitHostAddress for this job. // */ // void setJobSubmitHostAddress(String hostadd) { // set(MRJobConfig.JOB_SUBMITHOSTADDR, hostadd); // } // // /** // * Get JobSubmitHostAddress for this job. // * // * @return JobSubmitHostAddress for this job. 
// */ // String getJobSubmitHostAddress() { // String hostadd = get(MRJobConfig.JOB_SUBMITHOSTADDR); // // return hostadd; // } // // /** // * Get whether the task profiling is enabled. // * @return true if some tasks will be profiled // */ // public boolean getProfileEnabled() { // return getBoolean(JobContext.TASK_PROFILE, false); // } // // /** // * Set whether the system should collect profiler information for some of // * the tasks in this job? The information is stored in the user log // * directory. // * @param newValue true means it should be gathered // */ // public void setProfileEnabled(boolean newValue) { // setBoolean(JobContext.TASK_PROFILE, newValue); // } // // /** // * Set the boolean property for specifying which classpath takes precedence - // * the user's one or the system one, when the tasks are launched // * @param value pass true if user's classes should take precedence // */ // public void setUserClassesTakesPrecedence(boolean value) { // setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, value); // } // // /** // * Get the boolean value for the property that specifies which classpath // * takes precedence when tasks are launched. True - user's classes takes // * precedence. False - system's classes takes precedence. // * @return true if user's classes should take precedence // */ // public boolean userClassesTakesPrecedence() { // return getBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, false); // } // // /** // * Get the profiler configuration arguments. // * // * The default value for this property is // * "-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s" // * // * @return the parameters to pass to the task child to configure profiling // */ // public String getProfileParams() { // return get(JobContext.TASK_PROFILE_PARAMS, // MRJobConfig.DEFAULT_TASK_PROFILE_PARAMS); // } // // /** // * Set the profiler configuration arguments. If the string contains a '%s' it // * will be replaced with the name of the profiling output file when the task // * runs. // * // * This value is passed to the task child JVM on the command line. // * // * @param value the configuration string // */ // public void setProfileParams(String value) { // set(JobContext.TASK_PROFILE_PARAMS, value); // } // // /** // * Get the range of maps or reduces to profile. // * @param isMap is the task a map? // * @return the task ranges // */ // public IntegerRanges getProfileTaskRange(boolean isMap) { // return getRange((isMap ? JobContext.NUM_MAP_PROFILES : // JobContext.NUM_REDUCE_PROFILES), "0-2"); // } // // /** // * Set the ranges of maps or reduces to profile. setProfileEnabled(true) // * must also be called. // * @param newValue a set of integer ranges of the map ids // */ // public void setProfileTaskRange(boolean isMap, String newValue) { // // parse the value to make sure it is legal // new Configuration.IntegerRanges(newValue); // set((isMap ? JobContext.NUM_MAP_PROFILES : JobContext.NUM_REDUCE_PROFILES), // newValue); // } // // /** // * Set the debug script to run when the map tasks fail. // * // *

The debug script can aid debugging of failed map tasks. The script is // * given task's stdout, stderr, syslog, jobconf files as arguments.

// * // *

The debug command, run on the node where the map failed, is:

// *

// * $script $stdout $stderr $syslog $jobconf. // *

// * // *

The script file is distributed through {@link DistributedCache} // * APIs. The script needs to be symlinked.

// * // *

Here is an example on how to submit a script // *

//   * job.setMapDebugScript("./myscript");
//   * DistributedCache.createSymlink(job);
//   * DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
//   * 

// * // * @param mDbgScript the script name // */ // public void setMapDebugScript(String mDbgScript) { // set(JobContext.MAP_DEBUG_SCRIPT, mDbgScript); // } // // /** // * Get the map task's debug script. // * // * @return the debug Script for the mapred job for failed map tasks. // * @see #setMapDebugScript(String) // */ // public String getMapDebugScript() { // return get(JobContext.MAP_DEBUG_SCRIPT); // } // // /** // * Set the debug script to run when the reduce tasks fail. // * // *

The debug script can aid debugging of failed reduce tasks. The script // * is given task's stdout, stderr, syslog, jobconf files as arguments.

// * // *

The debug command, run on the node where the reduce failed, is:

// *

// * $script $stdout $stderr $syslog $jobconf. // *

// * // *

The script file is distributed through {@link DistributedCache} // * APIs. The script file needs to be symlinked.

// * // *

Here is an example on how to submit a script // *

//   * job.setReduceDebugScript("./myscript");
//   * DistributedCache.createSymlink(job);
//   * DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
//   * 

// * // * @param rDbgScript the script name // */ // public void setReduceDebugScript(String rDbgScript) { // set(JobContext.REDUCE_DEBUG_SCRIPT, rDbgScript); // } // // /** // * Get the reduce task's debug Script // * // * @return the debug script for the mapred job for failed reduce tasks. // * @see #setReduceDebugScript(String) // */ // public String getReduceDebugScript() { // return get(JobContext.REDUCE_DEBUG_SCRIPT); // } // // /** // * Get the uri to be invoked in-order to send a notification after the job // * has completed (success/failure). // * // * @return the job end notification uri, null if it hasn't // * been set. // * @see #setJobEndNotificationURI(String) // */ // public String getJobEndNotificationURI() { // return get(JobContext.MR_JOB_END_NOTIFICATION_URL); // } // // /** // * Set the uri to be invoked in-order to send a notification after the job // * has completed (success/failure). // * // *

The uri can contain 2 special parameters: $jobId and // * $jobStatus. Those, if present, are replaced by the job's // * identifier and completion-status respectively.

// * // *

This is typically used by application-writers to implement chaining of // * Map-Reduce jobs in an asynchronous manner.
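// *
// * (Illustrative addition, not in the original comment; example.com is a placeholder
// * endpoint:) e.g.:
// *   conf.setJobEndNotificationURI("http://example.com/notify?jobid=$jobId&status=$jobStatus");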

// * // * @param uri the job end notification uri // * @see JobStatus // * @see Job Completion and Chaining // */ // public void setJobEndNotificationURI(String uri) { // set(JobContext.MR_JOB_END_NOTIFICATION_URL, uri); // } // // /** // * Get job-specific shared directory for use as scratch space // * // *

// * When a job starts, a shared directory is created at location // * // * ${mapreduce.cluster.local.dir}/taskTracker/$user/jobcache/$jobid/work/ . // * This directory is exposed to the users through // * mapreduce.job.local.dir . // * So, the tasks can use this space // * as scratch space and share files among them.

// * This value is available as System property also. // * // * @return The localized job specific shared directory // */ // public String getJobLocalDir() { // return get(JobContext.JOB_LOCAL_DIR); // } // // /** // * Get memory required to run a map task of the job, in MB. // * // * If a value is specified in the configuration, it is returned. // * Else, it returns {@link JobContext#DEFAULT_MAP_MEMORY_MB}. // *

// * For backward compatibility, if the job configuration sets the // * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to a value different // * from {@link #DISABLED_MEMORY_LIMIT}, that value will be used // * after converting it from bytes to MB. // * @return memory required to run a map task of the job, in MB, // */ // public long getMemoryForMapTask() { // long value = getDeprecatedMemoryValue(); // if (value < 0) { // return getMemoryRequired(TaskType.MAP); // } // return value; // } // // public void setMemoryForMapTask(long mem) { // setLong(JobConf.MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY, mem); // // In case that M/R 1.x applications use the old property name // setLong(JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, mem); // } // // /** // * Get memory required to run a reduce task of the job, in MB. // * // * If a value is specified in the configuration, it is returned. // * Else, it returns {@link JobContext#DEFAULT_REDUCE_MEMORY_MB}. // *

// /**
//  * Get memory required to run a reduce task of the job, in MB.
//  *
//  * If a value is specified in the configuration, it is returned.
//  * Else, it returns {@link JobContext#DEFAULT_REDUCE_MEMORY_MB}.
//  *
//  * For backward compatibility, if the job configuration sets the
//  * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to a value different
//  * from {@link #DISABLED_MEMORY_LIMIT}, that value will be used
//  * after converting it from bytes to MB.
//  *
//  * @return memory required to run a reduce task of the job, in MB.
//  */
// public long getMemoryForReduceTask() {
//   long value = getDeprecatedMemoryValue();
//   if (value < 0) {
//     return getMemoryRequired(TaskType.REDUCE);
//   }
//   return value;
// }
//
// // Return the value set to the key MAPRED_TASK_MAXVMEM_PROPERTY,
// // converted into MBs.
// // Returns DISABLED_MEMORY_LIMIT if unset, or set to a negative value.
// private long getDeprecatedMemoryValue() {
//   long oldValue = getLong(MAPRED_TASK_MAXVMEM_PROPERTY,
//       DISABLED_MEMORY_LIMIT);
//   if (oldValue > 0) {
//     oldValue /= (1024 * 1024);
//   }
//   return oldValue;
// }
//
// public void setMemoryForReduceTask(long mem) {
//   setLong(JobConf.MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY, mem);
//   // In case that M/R 1.x applications use the old property name
//   setLong(JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY, mem);
// }
//
// /**
//  * Return the name of the queue to which this job is submitted.
//  * Defaults to 'default'.
//  *
//  * @return name of the queue
//  */
// public String getQueueName() {
//   return get(JobContext.QUEUE_NAME, DEFAULT_QUEUE_NAME);
// }
//
// /**
//  * Set the name of the queue to which this job should be submitted.
//  *
//  * @param queueName Name of the queue
//  */
// public void setQueueName(String queueName) {
//   set(JobContext.QUEUE_NAME, queueName);
// }
//
// /**
//  * Normalize negative values in the configuration.
//  *
//  * @param val the value to normalize
//  * @return normalized value
//  */
// public static long normalizeMemoryConfigValue(long val) {
//   if (val < 0) {
//     val = DISABLED_MEMORY_LIMIT;
//   }
//   return val;
// }
//
// /**
//  * Find a jar that contains a class of the same name, if any.
//  * It will return a jar file, even if that is not the first thing
//  * on the class path that has a class with the same name.
//  *
//  * @param my_class the class to find.
//  * @return a jar file that contains the class, or null.
//  * @throws IOException
//  */
// public static String findContainingJar(Class my_class) {
//   return ClassUtil.findContainingJar(my_class);
// }
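// Illustrative usage sketch (not part of the original JobConf source): requesting
// per-task memory and a submission queue with the setters above. Assumes a JobConf
// instance "job"; the queue name "research" is a hypothetical example.
//
//   job.setMemoryForMapTask(2048);     // map containers sized at 2048 MB
//   job.setMemoryForReduceTask(4096);  // reduce containers sized at 4096 MB
//   job.setQueueName("research");      // falls back to "default" when unset
//   long mapMb = job.getMemoryForMapTask();  // 2048, since mapred.task.maxvmem is unset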

// /**
//  * Get the memory required to run a task of this job, in bytes. See
//  * {@link #MAPRED_TASK_MAXVMEM_PROPERTY}
//  *
//  * This method is deprecated. Now, different memory limits can be
//  * set for map and reduce tasks of a job, in MB.
//  *
//  * For backward compatibility, if the job configuration sets the
//  * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY}, that value is returned.
//  * Otherwise, this method will return the larger of the values returned by
//  * {@link #getMemoryForMapTask()} and {@link #getMemoryForReduceTask()}
//  * after converting them into bytes.
//  *
//  * @return Memory required to run a task of this job, in bytes.
//  * @see #setMaxVirtualMemoryForTask(long)
//  * @deprecated Use {@link #getMemoryForMapTask()} and
//  *             {@link #getMemoryForReduceTask()}
//  */
// @Deprecated
// public long getMaxVirtualMemoryForTask() {
//   LOG.warn(
//       "getMaxVirtualMemoryForTask() is deprecated. " +
//       "Instead use getMemoryForMapTask() and getMemoryForReduceTask()");
//
//   long value = getLong(MAPRED_TASK_MAXVMEM_PROPERTY,
//       Math.max(getMemoryForMapTask(), getMemoryForReduceTask()) * 1024 * 1024);
//   return value;
// }
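// Worked example (not part of the original JobConf source) of the byte/MB
// conversion performed by the deprecated getter above. Assumes a JobConf instance
// "job" with mapred.task.maxvmem left unset:
//
//   job.setMemoryForMapTask(1024);     // 1024 MB
//   job.setMemoryForReduceTask(2048);  // 2048 MB
//   long vmem = job.getMaxVirtualMemoryForTask();
//   // vmem == 2048L * 1024 * 1024: the larger of the two values, converted from
//   // MB to bytes; a deprecation warning is also logged.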

// /**
//  * Set the maximum amount of memory any task of this job can use. See
//  * {@link #MAPRED_TASK_MAXVMEM_PROPERTY}
//  *
//  * mapred.task.maxvmem is split into mapreduce.map.memory.mb and
//  * mapreduce.reduce.memory.mb; each of the new keys is set to the given
//  * value converted from bytes to MB, as the new values are in MB.
//  *
//  * @param vmem Maximum amount of virtual memory in bytes any task of this job
//  *             can use.
//  * @see #getMaxVirtualMemoryForTask()
//  * @deprecated Use {@link #setMemoryForMapTask(long mem)} and
//  *             {@link #setMemoryForReduceTask(long mem)}
//  */
// @Deprecated
// public void setMaxVirtualMemoryForTask(long vmem) {
//   LOG.warn("setMaxVirtualMemoryForTask() is deprecated. " +
//       "Instead use setMemoryForMapTask() and setMemoryForReduceTask()");
//   if (vmem < 0) {
//     throw new IllegalArgumentException("Task memory allocation may not be < 0");
//   }
//
//   if (get(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY) == null) {
//     setMemoryForMapTask(vmem / (1024 * 1024));    // changing bytes to MB
//     setMemoryForReduceTask(vmem / (1024 * 1024)); // changing bytes to MB
//   } else {
//     this.setLong(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY, vmem);
//   }
// }
//
// /**
//  * @deprecated this variable is deprecated and no longer in use.
//  */
// @Deprecated
// public long getMaxPhysicalMemoryForTask() {
//   LOG.warn("The API getMaxPhysicalMemoryForTask() is deprecated."
//       + " Refer to the APIs getMemoryForMapTask() and"
//       + " getMemoryForReduceTask() for details.");
//   return -1;
// }
//
// /*
//  * @deprecated this API is deprecated and the value set is ignored.
//  */
// @Deprecated
// public void setMaxPhysicalMemoryForTask(long mem) {
//   LOG.warn("The API setMaxPhysicalMemoryForTask() is deprecated."
//       + " The value set is ignored. Refer to "
//       + " setMemoryForMapTask() and setMemoryForReduceTask() for details.");
// }
//
// static String deprecatedString(String key) {
//   return "The variable " + key + " is no longer used.";
// }
//
// private void checkAndWarnDeprecation() {
//   if (get(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY) != null) {
//     LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY)
//         + " Instead use " + JobConf.MAPREDUCE_JOB_MAP_MEMORY_MB_PROPERTY
//         + " and " + JobConf.MAPREDUCE_JOB_REDUCE_MEMORY_MB_PROPERTY);
//   }
//   if (get(JobConf.MAPRED_TASK_ULIMIT) != null) {
//     LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_TASK_ULIMIT));
//   }
//   if (get(JobConf.MAPRED_MAP_TASK_ULIMIT) != null) {
//     LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_MAP_TASK_ULIMIT));
//   }
//   if (get(JobConf.MAPRED_REDUCE_TASK_ULIMIT) != null) {
//     LOG.warn(JobConf.deprecatedString(JobConf.MAPRED_REDUCE_TASK_ULIMIT));
//   }
// }
//
// private String getConfiguredTaskJavaOpts(TaskType taskType) {
//   String userClasspath = "";
//   String adminClasspath = "";
//   if (taskType == TaskType.MAP) {
//     userClasspath = get(MAPRED_MAP_TASK_JAVA_OPTS,
//         get(MAPRED_TASK_JAVA_OPTS, DEFAULT_MAPRED_TASK_JAVA_OPTS));
//     adminClasspath = get(MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,
//         MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);
//   } else {
//     userClasspath = get(MAPRED_REDUCE_TASK_JAVA_OPTS,
//         get(MAPRED_TASK_JAVA_OPTS, DEFAULT_MAPRED_TASK_JAVA_OPTS));
//     adminClasspath = get(MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,
//         MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);
//   }
//
//   return adminClasspath + " " + userClasspath;
// }
//
// @Private
// public String getTaskJavaOpts(TaskType taskType) {
//   String javaOpts = getConfiguredTaskJavaOpts(taskType);
//
//   if (!javaOpts.contains("-Xmx")) {
//     float heapRatio = getFloat(MRJobConfig.HEAP_MEMORY_MB_RATIO,
//         MRJobConfig.DEFAULT_HEAP_MEMORY_MB_RATIO);
//
//     if (heapRatio > 1.0f || heapRatio < 0) {
//       LOG.warn("Invalid value for " + MRJobConfig.HEAP_MEMORY_MB_RATIO
//           + ", using the default.");
//       heapRatio = MRJobConfig.DEFAULT_HEAP_MEMORY_MB_RATIO;
//     }
//
//     int taskContainerMb = getMemoryRequired(taskType);
//     int taskHeapSize = (int) Math.ceil(taskContainerMb * heapRatio);
//
//     String xmxArg = String.format("-Xmx%dm", taskHeapSize);
//     LOG.info("Task java-opts do not specify heap size. Setting task attempt" +
//         " jvm max heap size to " + xmxArg);
//
//     javaOpts += " " + xmxArg;
//   }
//
//   return javaOpts;
// }
//
// /**
//  * Parse the maximum heap size from the java opts as specified by the -Xmx option.
//  * Format: -Xmx<size>[g|G|m|M|k|K]
//  *
//  * @param javaOpts String to parse to read maximum heap size
//  * @return Maximum heap size in MB or -1 if not specified
//  */
// @Private
// @VisibleForTesting
// public static int parseMaximumHeapSizeMB(String javaOpts) {
//   // Find the last matching -Xmx following word boundaries
//   Matcher m = JAVA_OPTS_XMX_PATTERN.matcher(javaOpts);
//   if (m.matches()) {
//     long size = Long.parseLong(m.group(1));
//     if (size <= 0) {
//       return -1;
//     }
//     if (m.group(2).isEmpty()) {
//       // -Xmx specified in bytes
//       return (int) (size / (1024 * 1024));
//     }
//     char unit = m.group(2).charAt(0);
//     switch (unit) {
//     case 'g':
//     case 'G':
//       // -Xmx specified in GB
//       return (int) (size * 1024);
//     case 'm':
//     case 'M':
//       // -Xmx specified in MB
//       return (int) size;
//     case 'k':
//     case 'K':
//       // -Xmx specified in KB
//       return (int) (size / 1024);
//     }
//   }
//   // -Xmx not specified
//   return -1;
// }
//
// private int getMemoryRequiredHelper(
//     String configName, int defaultValue, int heapSize, float heapRatio) {
//   int memory = getInt(configName, -1);
//   if (memory <= 0) {
//     if (heapSize > 0) {
//       memory = (int) Math.ceil(heapSize / heapRatio);
//       LOG.info("Figured value for " + configName + " from javaOpts");
//     } else {
//       memory = defaultValue;
//     }
//   }
//
//   return memory;
// }
//
// @Private
// public int getMemoryRequired(TaskType taskType) {
//   int memory = 1024;
//   int heapSize = parseMaximumHeapSizeMB(getConfiguredTaskJavaOpts(taskType));
//   float heapRatio = getFloat(MRJobConfig.HEAP_MEMORY_MB_RATIO,
//       MRJobConfig.DEFAULT_HEAP_MEMORY_MB_RATIO);
//   if (taskType == TaskType.MAP) {
//     return getMemoryRequiredHelper(MRJobConfig.MAP_MEMORY_MB,
//         MRJobConfig.DEFAULT_MAP_MEMORY_MB, heapSize, heapRatio);
//   } else if (taskType == TaskType.REDUCE) {
//     return getMemoryRequiredHelper(MRJobConfig.REDUCE_MEMORY_MB,
//         MRJobConfig.DEFAULT_REDUCE_MEMORY_MB, heapSize, heapRatio);
//   } else {
//     return memory;
//   }
// }
//}




