// org.apache.hadoop.mapred.JobConf Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Enumeration;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.KeyFieldBasedComparator;
import org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.ResourceCalculatorPlugin;
import org.apache.hadoop.util.Tool;
/**
 * A map/reduce job configuration.
 *
 * <p><code>JobConf</code> is the primary interface for a user to describe a
 * map-reduce job to the Hadoop framework for execution. The framework tries to
 * faithfully execute the job as described by <code>JobConf</code>, however:
 * <ol>
 *   <li>
 *   Some configuration parameters might have been marked as
 *   final by administrators and hence cannot be altered.
 *   </li>
 *   <li>
 *   While some job parameters are straight-forward to set
 *   (e.g. {@link #setNumReduceTasks(int)}), other parameters interact subtly
 *   with the rest of the framework and/or job-configuration and are relatively
 *   more complex for the user to control finely
 *   (e.g. {@link #setNumMapTasks(int)}).
 *   </li>
 * </ol>
 *
 * <p><code>JobConf</code> typically specifies the {@link Mapper}, combiner
 * (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
 * {@link OutputFormat} implementations to be used etc.
 *
 * <p>Optionally <code>JobConf</code> is used to specify other advanced facets
 * of the job such as <code>Comparator</code>s to be used, files to be put in
 * the {@link DistributedCache}, whether or not intermediate and/or job outputs
 * are to be compressed (and how), and debuggability via user-provided scripts
 * ({@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)})
 * for doing post-processing on task logs, the task's stdout, stderr and
 * syslog, etc.
 *
 * <p>Here is an example on how to configure a job via <code>JobConf</code>:
 * <blockquote><pre>
 *     // Create a new JobConf
 *     JobConf job = new JobConf(new Configuration(), MyJob.class);
 *
 *     // Specify various job-specific parameters
 *     job.setJobName("myjob");
 *
 *     FileInputFormat.setInputPaths(job, new Path("in"));
 *     FileOutputFormat.setOutputPath(job, new Path("out"));
 *
 *     job.setMapperClass(MyJob.MyMapper.class);
 *     job.setCombinerClass(MyJob.MyReducer.class);
 *     job.setReducerClass(MyJob.MyReducer.class);
 *
 *     job.setInputFormat(SequenceFileInputFormat.class);
 *     job.setOutputFormat(SequenceFileOutputFormat.class);
 * </pre></blockquote>
 *
 * @see JobClient
 * @see ClusterStatus
 * @see Tool
 * @see DistributedCache
 * @deprecated Use {@link Configuration} instead
 */
@Deprecated
public class JobConf extends Configuration {
/** Logger for this class. */
private static final Log LOG = LogFactory.getLog(JobConf.class);
/**
 * Configuration key <code>mapred.job.finish.when.reduces.done</code>.
 * NOTE(review): its readers are not visible in this part of the file.
 */
private static final String MAPRED_JOB_FINISH_WHEN_REDUCES_DONE = "mapred.job.finish.when.reduces.done";
// When this class is loaded, register the map/reduce resource files as
// default resources of Configuration.
static{
Configuration.addDefaultResource("mapred-default.xml");
Configuration.addDefaultResource("mapred-site.xml");
}
/**
 * Configuration key <code>mapred.task.maxvmem</code>.
 *
 * @deprecated Use {@link #MAPRED_JOB_MAP_MEMORY_MB_PROPERTY} and
 * {@link #MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY}
 */
@Deprecated
public static final String MAPRED_TASK_MAXVMEM_PROPERTY =
"mapred.task.maxvmem";
/**
 * Configuration key <code>mapred.task.limit.maxvmem</code>.
 *
 * @deprecated
 */
@Deprecated
public static final String UPPER_LIMIT_ON_TASK_VMEM_PROPERTY =
"mapred.task.limit.maxvmem";
/**
 * Configuration key <code>mapred.task.default.maxvmem</code>.
 *
 * @deprecated
 */
@Deprecated
public static final String MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY =
"mapred.task.default.maxvmem";
/**
 * Configuration key <code>mapred.task.maxpmem</code>.
 *
 * @deprecated
 */
@Deprecated
public static final String MAPRED_TASK_MAXPMEM_PROPERTY =
"mapred.task.maxpmem";
/** Configuration key <code>mapred.max.tracker.failures</code>. */
public static final String MAPRED_MAX_TRACKER_FAILURES_PROPERTY =
"mapred.max.tracker.failures";
/**
 * A value which, if set for memory related configuration options,
 * indicates that the options are turned off.
 */
public static final long DISABLED_MEMORY_LIMIT =
ResourceCalculatorPlugin.UNAVAILABLE;
/**
 * Name of the queue to which jobs will be submitted, if no queue
 * name is mentioned.
 */
public static final String DEFAULT_QUEUE_NAME = "default";
/** Configuration key <code>hive.query.source</code>. */
public static final String JOB_SOURCE_CONF = "hive.query.source";
/**
 * Configuration key <code>mapred.job.map.memory.mb</code>.
 * NOTE(review): presumably the per-map-task memory in MB; confirm against
 * the accessors that read it.
 */
static final String MAPRED_JOB_MAP_MEMORY_MB_PROPERTY =
"mapred.job.map.memory.mb";
/**
 * Configuration key <code>mapred.job.reduce.memory.mb</code>.
 * NOTE(review): presumably the per-reduce-task memory in MB; confirm against
 * the accessors that read it.
 */
static final String MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY =
"mapred.job.reduce.memory.mb";
/**
 * Configuration key to set the java command line options for the child
 * map and reduce tasks.
 *
 * <p>Java opts for the task tracker child processes.
 * The following symbol, if present, will be interpolated: @taskid@.
 * It is replaced by current TaskID. Any other occurrences of '@' will go
 * unchanged.
 * For example, to enable verbose gc logging to a file named for the taskid in
 * /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of:
 * <pre>
 *   -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
 * </pre>
 *
 * <p>The configuration variable {@link #MAPRED_TASK_ULIMIT} can be used to
 * control the maximum virtual memory of the child processes.
 *
 * <p>The configuration variable {@link #MAPRED_TASK_ENV} can be used to pass
 * other environment variables to the child processes.
 *
 * @deprecated Use {@link #MAPRED_MAP_TASK_JAVA_OPTS} or
 * {@link #MAPRED_REDUCE_TASK_JAVA_OPTS}
 */
@Deprecated
public static final String MAPRED_TASK_JAVA_OPTS = "mapred.child.java.opts";
/**
 * Configuration key to set the java command line options for the map tasks.
 *
 * <p>Java opts for the task tracker child map processes.
 * The following symbol, if present, will be interpolated: @taskid@.
 * It is replaced by current TaskID. Any other occurrences of '@' will go
 * unchanged.
 * For example, to enable verbose gc logging to a file named for the taskid in
 * /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of:
 * <pre>
 *   -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
 * </pre>
 *
 * <p>The configuration variable {@link #MAPRED_MAP_TASK_ULIMIT} can be used to
 * control the maximum virtual memory of the map processes.
 *
 * <p>The configuration variable {@link #MAPRED_MAP_TASK_ENV} can be used to pass
 * other environment variables to the map processes.
 */
public static final String MAPRED_MAP_TASK_JAVA_OPTS =
"mapred.map.child.java.opts";
/**
 * Configuration key to set the java command line options for the reduce tasks.
 *
 * <p>Java opts for the task tracker child reduce processes.
 * The following symbol, if present, will be interpolated: @taskid@.
 * It is replaced by current TaskID. Any other occurrences of '@' will go
 * unchanged.
 * For example, to enable verbose gc logging to a file named for the taskid in
 * /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of:
 * <pre>
 *   -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
 * </pre>
 *
 * <p>The configuration variable {@link #MAPRED_REDUCE_TASK_ULIMIT} can be used
 * to control the maximum virtual memory of the reduce processes.
 *
 * <p>The configuration variable {@link #MAPRED_REDUCE_TASK_ENV} can be used to
 * pass process environment variables to the reduce processes.
 */
public static final String MAPRED_REDUCE_TASK_JAVA_OPTS =
"mapred.reduce.child.java.opts";
/**
 * Java options value <code>-Xmx200m</code>.
 * NOTE(review): presumably the fallback used when no task java opts are
 * configured; its readers are not visible in this part of the file.
 */
public static final String DEFAULT_MAPRED_TASK_JAVA_OPTS = "-Xmx200m";
/**
 * Configuration key to set the maximum virtual memory available to the child
 * map and reduce tasks (in kilo-bytes).
 *
 * <p>Note: This must be greater than or equal to the -Xmx passed to the JavaVM
 * via {@link #MAPRED_TASK_JAVA_OPTS}, else the VM might not start.
 *
 * @deprecated Use {@link #MAPRED_MAP_TASK_ULIMIT} or
 * {@link #MAPRED_REDUCE_TASK_ULIMIT}
 */
@Deprecated
public static final String MAPRED_TASK_ULIMIT = "mapred.child.ulimit";
/**
 * Configuration key to set the maximum virtual memory available to the
 * map tasks (in kilo-bytes).
 *
 * <p>Note: This must be greater than or equal to the -Xmx passed to the JavaVM
 * via {@link #MAPRED_MAP_TASK_JAVA_OPTS}, else the VM might not start.
 */
public static final String MAPRED_MAP_TASK_ULIMIT = "mapred.map.child.ulimit";
/**
 * Configuration key to set the maximum virtual memory available to the
 * reduce tasks (in kilo-bytes).
 *
 * <p>Note: This must be greater than or equal to the -Xmx passed to the JavaVM
 * via {@link #MAPRED_REDUCE_TASK_JAVA_OPTS}, else the VM might not start.
 */
public static final String MAPRED_REDUCE_TASK_ULIMIT =
"mapred.reduce.child.ulimit";
/**
 * Configuration key to set the environment of the child map/reduce tasks.
 *
 * <p>The format of the value is <code>k1=v1,k2=v2</code>. Further it can
 * reference existing environment variables via <code>$key</code>.
 *
 * <p>Example:
 * <ul>
 *   <li> A=foo - This will set the env variable A to foo. </li>
 *   <li> B=$X:c - This will inherit the tasktracker's X env variable. </li>
 * </ul>
 *
 * @deprecated Use {@link #MAPRED_MAP_TASK_ENV} or
 * {@link #MAPRED_REDUCE_TASK_ENV}
 */
@Deprecated
public static final String MAPRED_TASK_ENV = "mapred.child.env";
/**
 * Configuration key to set the environment of the map tasks.
 *
 * <p>The format of the value is <code>k1=v1,k2=v2</code>. Further it can
 * reference existing environment variables via <code>$key</code>.
 *
 * <p>Example:
 * <ul>
 *   <li> A=foo - This will set the env variable A to foo. </li>
 *   <li> B=$X:c - This will inherit the tasktracker's X env variable. </li>
 * </ul>
 */
public static final String MAPRED_MAP_TASK_ENV = "mapred.map.child.env";
/**
 * Configuration key to set the environment of the reduce tasks.
 *
 * <p>The format of the value is <code>k1=v1,k2=v2</code>. Further it can
 * reference existing environment variables via <code>$key</code>.
 *
 * <p>Example:
 * <ul>
 *   <li> A=foo - This will set the env variable A to foo. </li>
 *   <li> B=$X:c - This will inherit the tasktracker's X env variable. </li>
 * </ul>
 */
public static final String MAPRED_REDUCE_TASK_ENV =
"mapred.reduce.child.env";
/**
 * Creates a map/reduce job configuration using the no-argument
 * {@link Configuration} constructor, then calls
 * <code>checkAndWarnDeprecation()</code>.
 */
public JobConf() {
  super();
  checkAndWarnDeprecation();
}
/**
 * Creates a map/reduce job configuration and records the job jar by
 * looking up the jar that contains the given class.
 *
 * @param exampleClass a class whose containing jar is used as the job's jar.
 */
public JobConf(Class exampleClass) {
  super();
  setJarByClass(exampleClass);
  checkAndWarnDeprecation();
}
/**
 * Creates a map/reduce job configuration that starts from an existing
 * configuration.
 *
 * @param conf a {@link Configuration} whose settings will be inherited.
 */
public JobConf(Configuration conf) {
  super(conf);
  checkAndWarnDeprecation();
}
/**
 * Creates a map/reduce job configuration that starts from an existing
 * configuration and records the job jar from the given class.
 *
 * @param conf a {@link Configuration} whose settings will be inherited.
 * @param exampleClass a class whose containing jar is used as the job's jar.
 */
public JobConf(Configuration conf, Class exampleClass) {
  this(conf);
  setJarByClass(exampleClass);
}
/**
 * Creates a map/reduce configuration from a job description file named by
 * a string; the string is wrapped in a {@link Path} and handed to
 * {@link #JobConf(Path)}.
 *
 * @param config a Configuration-format XML job description file.
 */
public JobConf(String config) {
  this(new Path(config));
}
/**
 * Creates a map/reduce configuration and adds the given job description
 * file as a resource.
 *
 * @param config a Configuration-format XML job description file.
 */
public JobConf(Path config) {
  // The no-argument Configuration constructor runs implicitly here.
  addResource(config);
  checkAndWarnDeprecation();
}
/**
 * Creates a map/reduce configuration where reading from the default
 * resources can be turned off.
 *
 * <p>If <code>loadDefaults</code> is false, the new instance will not load
 * resources from the default files.
 *
 * @param loadDefaults specifies whether to load from the default files
 */
public JobConf(boolean loadDefaults) {
  super(loadDefaults);
  checkAndWarnDeprecation();
}
/**
 * Returns the user jar for the map-reduce job, read from
 * <code>mapred.jar</code>.
 *
 * @return the user jar for the map-reduce job.
 */
public String getJar() {
  final String jarPath = get("mapred.jar");
  return jarPath;
}
/**
 * Stores the user jar for the map-reduce job under <code>mapred.jar</code>.
 *
 * @param jar the user jar for the map-reduce job.
 */
public void setJar(String jar) {
  final String key = "mapred.jar";
  set(key, jar);
}
/**
 * Sets the job's jar file by locating the jar that contains an example
 * class. The jar setting is left untouched when no containing jar is found.
 *
 * @param cls the example class.
 */
public void setJarByClass(Class cls) {
  final String containingJar = findContainingJar(cls);
  if (containingJar == null) {
    return;
  }
  setJar(containingJar);
}
/**
 * Returns the values of <code>mapred.local.dir</code> as produced by
 * <code>getStrings</code>.
 *
 * @return the configured local directories.
 * @throws IOException declared by the signature; not thrown by this body.
 */
public String[] getLocalDirs() throws IOException {
  final String[] localDirs = getStrings("mapred.local.dir");
  return localDirs;
}
/**
 * Deletes each path configured in <code>mapred.local.dir</code> from the
 * local file system. Does nothing when no local directories are configured.
 *
 * @throws IOException if the local file system cannot be obtained or a
 *                     delete fails.
 * @deprecated Use MRAsyncDiskService.moveAndDeleteAllVolumes instead.
 * @see org.apache.hadoop.util.MRAsyncDiskService#cleanupAllVolumes()
 */
@Deprecated
public void deleteLocalFiles() throws IOException {
  String[] localDirs = getLocalDirs();
  if (localDirs == null || localDirs.length == 0) {
    // The property may be unset, in which case there is nothing to delete.
    return;
  }
  // Look the local file system up once instead of once per directory.
  FileSystem localFs = FileSystem.getLocal(this);
  for (String localDir : localDirs) {
    localFs.delete(new Path(localDir));
  }
}
/**
 * Deletes <code>subdir</code> beneath each path configured in
 * <code>mapred.local.dir</code> on the local file system. Does nothing when
 * no local directories are configured.
 *
 * @param subdir the child path to delete under every local directory.
 * @throws IOException if the local file system cannot be obtained or a
 *                     delete fails.
 */
public void deleteLocalFiles(String subdir) throws IOException {
  String[] localDirs = getLocalDirs();
  if (localDirs == null || localDirs.length == 0) {
    // The property may be unset, in which case there is nothing to delete.
    return;
  }
  // Look the local file system up once instead of once per directory.
  FileSystem localFs = FileSystem.getLocal(this);
  for (String localDir : localDirs) {
    localFs.delete(new Path(localDir, subdir));
  }
}
/**
 * Constructs a local file name. Files are distributed among the local
 * directories configured in <code>mapred.local.dir</code>.
 *
 * @param pathString the path to resolve against the local directories.
 * @return the local path.
 * @throws IOException if the underlying lookup fails.
 */
public Path getLocalPath(String pathString) throws IOException {
  final String dirsKey = "mapred.local.dir";
  return getLocalPath(dirsKey, pathString);
}
/**
 * Returns the reported username for this job, read from
 * <code>user.name</code>.
 *
 * @return the username
 */
public String getUser() {
  final String userName = get("user.name");
  return userName;
}
/**
 * Stores the reported username for this job under <code>user.name</code>.
 *
 * @param user the username for this job.
 */
public void setUser(String user) {
  final String key = "user.name";
  set(key, user);
}
/**
 * Sets whether the framework should keep the intermediate files for
 * failed tasks.
 *
 * @param keep <code>true</code> if the framework should keep the
 *             intermediate files for failed tasks, <code>false</code>
 *             otherwise.
 */
public void setKeepFailedTaskFiles(boolean keep) {
  final String key = "keep.failed.task.files";
  setBoolean(key, keep);
}
/**
 * Should the temporary files for failed tasks be kept? Defaults to
 * <code>false</code>.
 *
 * @return should the files be kept?
 */
public boolean getKeepFailedTaskFiles() {
  final boolean keep = getBoolean("keep.failed.task.files", false);
  return keep;
}
/**
 * Sets a regular expression for task names whose files should be kept.
 * The regular expression ".*_m_000123_0" would keep the files
 * for the first instance of map 123 that ran.
 *
 * @param pattern the java.util.regex.Pattern to match against the
 *        task names.
 */
public void setKeepTaskFilesPattern(String pattern) {
  final String key = "keep.task.files.pattern";
  set(key, pattern);
}
/**
 * Returns the regular expression that is matched against the task names
 * to see if we need to keep the files.
 *
 * @return the pattern as a string, if it was set, otherwise null.
 */
public String getKeepTaskFilesPattern() {
  final String pattern = get("keep.task.files.pattern");
  return pattern;
}
/**
 * Sets the current working directory for the default file system. The
 * given path is resolved against {@link #getWorkingDirectory()} before it is
 * stored under <code>mapred.working.dir</code>; a non-absolute path is
 * logged first.
 *
 * @param dir the new current working directory.
 */
public void setWorkingDirectory(Path dir) {
  final boolean relative = !dir.isAbsolute();
  if (relative) {
    FileSystem.LogForCollect.info(
        "set job working directory to non absolute path: " + dir
            + " working directory: " + getWorkingDirectory());
  }
  final Path resolved = new Path(getWorkingDirectory(), dir);
  set("mapred.working.dir", resolved.toString());
}
/**
 * Returns the current working directory for the default file system.
 * When <code>mapred.working.dir</code> is not set, the file system's own
 * working directory is looked up, stored under that key and returned.
 *
 * @return the directory name.
 * @throws RuntimeException wrapping the {@link IOException} raised while
 *         querying the file system.
 */
public Path getWorkingDirectory() {
  final String configured = get("mapred.working.dir");
  if (configured != null) {
    return new Path(configured);
  }
  try {
    final Path fsWorkingDir = FileSystem.get(this).getWorkingDirectory();
    set("mapred.working.dir", fsWorkingDir.toString());
    return fsWorkingDir;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Sets the number of tasks that a spawned task JVM should run
 * before it exits.
 *
 * @param numTasks the number of tasks to execute; defaults to 1;
 *        -1 signifies no limit
 */
public void setNumTasksToExecutePerJvm(int numTasks) {
  final String key = "mapred.job.reuse.jvm.num.tasks";
  setInt(key, numTasks);
}
/**
 * Returns the number of tasks that a spawned JVM should execute.
 *
 * @return the configured value of
 *         <code>mapred.job.reuse.jvm.num.tasks</code>, or 1 when unset.
 */
public int getNumTasksToExecutePerJvm() {
  final int tasksPerJvm = getInt("mapred.job.reuse.jvm.num.tasks", 1);
  return tasksPerJvm;
}
/**
 * Instantiates the {@link InputFormat} implementation for the map-reduce
 * job; defaults to {@link TextInputFormat} if not specified explicitly.
 *
 * @return the {@link InputFormat} implementation for the map-reduce job.
 */
public InputFormat getInputFormat() {
  Class<? extends InputFormat> formatClass =
      getClass("mapred.input.format.class",
               TextInputFormat.class,
               InputFormat.class);
  return ReflectionUtils.newInstance(formatClass, this);
}
/**
* Set the {@link InputFormat} implementation for the map-reduce job.
*
* @param theClass the {@link InputFormat} implementation for the map-reduce
* job.
*/
public void setInputFormat(Class extends InputFormat> theClass) {
setClass("mapred.input.format.class", theClass, InputFormat.class);
}
/**
 * Instantiates the {@link OutputFormat} implementation for the map-reduce
 * job; defaults to {@link TextOutputFormat} if not specified explicitly.
 *
 * @return the {@link OutputFormat} implementation for the map-reduce job.
 */
public OutputFormat getOutputFormat() {
  Class<? extends OutputFormat> formatClass =
      getClass("mapred.output.format.class",
               TextOutputFormat.class,
               OutputFormat.class);
  return ReflectionUtils.newInstance(formatClass, this);
}
/**
 * Instantiates the {@link OutputCommitter} implementation for the
 * map-reduce job; defaults to {@link FileOutputCommitter} if not specified
 * explicitly.
 *
 * @return the {@link OutputCommitter} implementation for the map-reduce job.
 */
public OutputCommitter getOutputCommitter() {
  Class<? extends OutputCommitter> committerClass =
      getClass("mapred.output.committer.class", FileOutputCommitter.class,
               OutputCommitter.class);
  return (OutputCommitter) ReflectionUtils.newInstance(committerClass, this);
}
/**
* Set the {@link OutputCommitter} implementation for the map-reduce job.
*
* @param theClass the {@link OutputCommitter} implementation for the map-reduce
* job.
*/
public void setOutputCommitter(Class extends OutputCommitter> theClass) {
setClass("mapred.output.committer.class", theClass, OutputCommitter.class);
}
/**
* Set the {@link OutputFormat} implementation for the map-reduce job.
*
* @param theClass the {@link OutputFormat} implementation for the map-reduce
* job.
*/
public void setOutputFormat(Class extends OutputFormat> theClass) {
setClass("mapred.output.format.class", theClass, OutputFormat.class);
}
/**
 * Sets whether the map outputs should be compressed before transfer.
 * Uses the SequenceFile compression.
 *
 * @param compress should the map outputs be compressed?
 */
public void setCompressMapOutput(boolean compress) {
  final String key = "mapred.compress.map.output";
  setBoolean(key, compress);
}
/**
 * Are the outputs of the maps to be compressed? Defaults to
 * <code>false</code>.
 *
 * @return <code>true</code> if the outputs of the maps are to be compressed,
 *         <code>false</code> otherwise.
 */
public boolean getCompressMapOutput() {
  final boolean compress = getBoolean("mapred.compress.map.output", false);
  return compress;
}
/**
* Set the given class as the {@link CompressionCodec} for the map outputs.
*
* @param codecClass the {@link CompressionCodec} class that will compress
* the map outputs.
*/
public void
setMapOutputCompressorClass(Class extends CompressionCodec> codecClass) {
setCompressMapOutput(true);
setClass("mapred.map.output.compression.codec", codecClass,
CompressionCodec.class);
}
/**
* Get the {@link CompressionCodec} for compressing the map outputs.
*
* @param defaultValue the {@link CompressionCodec} to return if not set
* @return the {@link CompressionCodec} class that should be used to compress the
* map outputs.
* @throws IllegalArgumentException if the class was specified, but not found
*/
public Class extends CompressionCodec>
getMapOutputCompressorClass(Class extends CompressionCodec> defaultValue) {
Class extends CompressionCodec> codecClass = defaultValue;
String name = get("mapred.map.output.compression.codec");
if (name != null) {
try {
codecClass = getClassByName(name).asSubclass(CompressionCodec.class);
} catch (ClassNotFoundException e) {
throw new IllegalArgumentException("Compression codec " + name +
" was not found.", e);
}
}
return codecClass;
}
/**
* Get the key class for the map output data. If it is not set, use the
* (final) output key class. This allows the map output key class to be
* different than the final output key class.
*
* @return the map output key class.
*/
public Class> getMapOutputKeyClass() {
Class> retv = getClass("mapred.mapoutput.key.class", null, Object.class);
if (retv == null) {
retv = getOutputKeyClass();
}
return retv;
}
/**
* Set the key class for the map output data. This allows the user to
* specify the map output key class to be different than the final output
* value class.
*
* @param theClass the map output key class.
*/
public void setMapOutputKeyClass(Class> theClass) {
setClass("mapred.mapoutput.key.class", theClass, Object.class);
}
/**
* Get the value class for the map output data. If it is not set, use the
* (final) output value class This allows the map output value class to be
* different than the final output value class.
*
* @return the map output value class.
*/
public Class> getMapOutputValueClass() {
Class> retv = getClass("mapred.mapoutput.value.class", null,
Object.class);
if (retv == null) {
retv = getOutputValueClass();
}
return retv;
}
/**
* Set the value class for the map output data. This allows the user to
* specify the map output value class to be different than the final output
* value class.
*
* @param theClass the map output value class.
*/
public void setMapOutputValueClass(Class> theClass) {
setClass("mapred.mapoutput.value.class", theClass, Object.class);
}
/**
* Get the key class for the job output data.
*
* @return the key class for the job output data.
*/
public Class> getOutputKeyClass() {
return getClass("mapred.output.key.class",
LongWritable.class, Object.class);
}
/**
* Set the key class for the job output data.
*
* @param theClass the key class for the job output data.
*/
public void setOutputKeyClass(Class> theClass) {
setClass("mapred.output.key.class", theClass, Object.class);
}
/**
* Get the {@link RawComparator} comparator used to compare keys.
*
* @return the {@link RawComparator} comparator used to compare keys.
*/
public RawComparator getOutputKeyComparator() {
Class extends RawComparator> theClass = getClass("mapred.output.key.comparator.class",
null, RawComparator.class);
if (theClass != null)
return ReflectionUtils.newInstance(theClass, this);
return WritableComparator.get(getMapOutputKeyClass().asSubclass(WritableComparable.class));
}
/**
* Set the {@link RawComparator} comparator used to compare keys.
*
* @param theClass the {@link RawComparator} comparator used to
* compare keys.
* @see #setOutputValueGroupingComparator(Class)
*/
public void setOutputKeyComparatorClass(Class extends RawComparator> theClass) {
setClass("mapred.output.key.comparator.class",
theClass, RawComparator.class);
}
/**
 * Selects {@link KeyFieldBasedComparator} as the key comparator and stores
 * the options it uses to compare keys.
 *
 * @param keySpec the key specification of the form -k pos1[,pos2], where
 *  pos is of the form f[.c][opts], where f is the number
 *  of the key field to use, and c is the number of the first character from
 *  the beginning of the field. Fields and character posns are numbered
 *  starting with 1; a character position of zero in pos2 indicates the
 *  field's last character. If '.c' is omitted from pos1, it defaults to 1
 *  (the beginning of the field); if omitted from pos2, it defaults to 0
 *  (the end of the field). opts are ordering options. The supported options
 *  are:
 *    -n, (Sort numerically)
 *    -r, (Reverse the result of comparison)
 */
public void setKeyFieldComparatorOptions(String keySpec) {
  setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
  final String optionsKey = "mapred.text.key.comparator.options";
  set(optionsKey, keySpec);
}
/**
 * Returns the {@link KeyFieldBasedComparator} options.
 *
 * @return the value of <code>mapred.text.key.comparator.options</code>.
 */
public String getKeyFieldComparatorOption() {
  final String options = get("mapred.text.key.comparator.options");
  return options;
}
/**
 * Selects {@link KeyFieldBasedPartitioner} as the {@link Partitioner} and
 * stores the options it uses.
 *
 * @param keySpec the key specification of the form -k pos1[,pos2], where
 *  pos is of the form f[.c][opts], where f is the number
 *  of the key field to use, and c is the number of the first character from
 *  the beginning of the field. Fields and character posns are numbered
 *  starting with 1; a character position of zero in pos2 indicates the
 *  field's last character. If '.c' is omitted from pos1, it defaults to 1
 *  (the beginning of the field); if omitted from pos2, it defaults to 0
 *  (the end of the field).
 */
public void setKeyFieldPartitionerOptions(String keySpec) {
  setPartitionerClass(KeyFieldBasedPartitioner.class);
  final String optionsKey = "mapred.text.key.partitioner.options";
  set(optionsKey, keySpec);
}
/**
 * Returns the {@link KeyFieldBasedPartitioner} options.
 *
 * @return the value of <code>mapred.text.key.partitioner.options</code>.
 */
public String getKeyFieldPartitionerOption() {
  final String options = get("mapred.text.key.partitioner.options");
  return options;
}
/**
* Get the user defined {@link WritableComparable} comparator for
* grouping keys of inputs to the reduce.
*
* @return comparator set by the user for grouping values.
* @see #setOutputValueGroupingComparator(Class) for details.
*/
public RawComparator getOutputValueGroupingComparator() {
Class extends RawComparator> theClass = getClass("mapred.output.value.groupfn.class", null,
RawComparator.class);
if (theClass == null) {
return getOutputKeyComparator();
}
return ReflectionUtils.newInstance(theClass, this);
}
/**
* Set the user defined {@link RawComparator} comparator for
* grouping keys in the input to the reduce.
*
* This comparator should be provided if the equivalence rules for keys
* for sorting the intermediates are different from those for grouping keys
* before each call to
* {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
*
* For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
* in a single call to the reduce function if K1 and K2 compare as equal.
*
* Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
* how keys are sorted, this can be used in conjunction to simulate
* secondary sort on values.
*
* Note: This is not a guarantee of the reduce sort being
* stable in any sense. (In any case, with the order of available
* map-outputs to the reduce being non-deterministic, it wouldn't make
* that much sense.)
*
* @param theClass the comparator class to be used for grouping keys.
* It should implement RawComparator
.
* @see #setOutputKeyComparatorClass(Class)
*/
public void setOutputValueGroupingComparator(
Class extends RawComparator> theClass) {
setClass("mapred.output.value.groupfn.class",
theClass, RawComparator.class);
}
/**
 * Should the framework use the new context-object code for running
 * the mapper? Defaults to <code>false</code>.
 *
 * @return true, if the new api should be used
 */
public boolean getUseNewMapper() {
  final boolean useNewApi = getBoolean("mapred.mapper.new-api", false);
  return useNewApi;
}
/**
 * Sets whether the framework should use the new api for the mapper.
 * This is the default for jobs submitted with the new Job api.
 *
 * @param flag true, if the new api should be used
 */
public void setUseNewMapper(boolean flag) {
  final String key = "mapred.mapper.new-api";
  setBoolean(key, flag);
}
/**
 * Should the framework use the new context-object code for running
 * the reducer? Defaults to <code>false</code>.
 *
 * @return true, if the new api should be used
 */
public boolean getUseNewReducer() {
  final boolean useNewApi = getBoolean("mapred.reducer.new-api", false);
  return useNewApi;
}
/**
 * Sets whether the framework should use the new api for the reducer.
 * This is the default for jobs submitted with the new Job api.
 *
 * @param flag true, if the new api should be used
 */
public void setUseNewReducer(boolean flag) {
  final String key = "mapred.reducer.new-api";
  setBoolean(key, flag);
}
/**
* Get the value class for job outputs.
*
* @return the value class for job outputs.
*/
public Class> getOutputValueClass() {
return getClass("mapred.output.value.class", Text.class, Object.class);
}
/**
* Set the value class for job outputs.
*
* @param theClass the value class for job outputs.
*/
public void setOutputValueClass(Class> theClass) {
setClass("mapred.output.value.class", theClass, Object.class);
}
/**
* Get the {@link Mapper} class for the job.
*
* @return the {@link Mapper} class for the job.
*/
public Class extends Mapper> getMapperClass() {
return getClass("mapred.mapper.class", IdentityMapper.class, Mapper.class);
}
/**
* Set the {@link Mapper} class for the job.
*
* @param theClass the {@link Mapper} class for the job.
*/
public void setMapperClass(Class extends Mapper> theClass) {
setClass("mapred.mapper.class", theClass, Mapper.class);
}
/**
* Get the {@link MapRunnable} class for the job.
*
* @return the {@link MapRunnable} class for the job.
*/
public Class extends MapRunnable> getMapRunnerClass() {
return getClass("mapred.map.runner.class",
MapRunner.class, MapRunnable.class);
}
/**
* Expert: Set the {@link MapRunnable} class for the job.
*
* Typically used to exert greater control on {@link Mapper}s.
*
* @param theClass the {@link MapRunnable} class for the job.
*/
public void setMapRunnerClass(Class extends MapRunnable> theClass) {
setClass("mapred.map.runner.class", theClass, MapRunnable.class);
}
/**
* Get the {@link Partitioner} used to partition {@link Mapper}-outputs
* to be sent to the {@link Reducer}s.
*
* @return the {@link Partitioner} used to partition map-outputs.
*/
public Class extends Partitioner> getPartitionerClass() {
return getClass("mapred.partitioner.class",
HashPartitioner.class, Partitioner.class);
}
/**
* Set the {@link Partitioner} class used to partition
* {@link Mapper}-outputs to be sent to the {@link Reducer}s.
*
* @param theClass the {@link Partitioner} used to partition map-outputs.
*/
public void setPartitionerClass(Class extends Partitioner> theClass) {
setClass("mapred.partitioner.class", theClass, Partitioner.class);
}
/**
* Get the {@link Reducer} class for the job.
*
* @return the {@link Reducer} class for the job.
*/
public Class extends Reducer> getReducerClass() {
return getClass("mapred.reducer.class",
IdentityReducer.class, Reducer.class);
}
/**
* Set the {@link Reducer} class for the job.
*
* @param theClass the {@link Reducer} class for the job.
*/
public void setReducerClass(Class extends Reducer> theClass) {
setClass("mapred.reducer.class", theClass, Reducer.class);
}
/**
* Get the user-defined combiner class used to combine map-outputs
* before being sent to the reducers. Typically the combiner is same as the
* the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
*
* @return the user-defined combiner class used to combine map-outputs.
*/
public Class extends Reducer> getCombinerClass() {
return getClass("mapred.combiner.class", null, Reducer.class);
}
/**
* Set the user-defined combiner class used to combine map-outputs
* before being sent to the reducers.
*
* The combiner is an application-specified aggregation operation, which
* can help cut down the amount of data transferred between the
* {@link Mapper} and the {@link Reducer}, leading to better performance.
*
* The framework may invoke the combiner 0, 1, or multiple times, in both
* the mapper and reducer tasks. In general, the combiner is called as the
* sort/merge result is written to disk. The combiner must:
*
* - be side-effect free
* - have the same input and output key types and the same input and
* output value types
*
*
* Typically the combiner is same as the Reducer
for the
* job i.e. {@link #setReducerClass(Class)}.
*
* @param theClass the user-defined combiner class used to combine
* map-outputs.
*/
public void setCombinerClass(Class extends Reducer> theClass) {
setClass("mapred.combiner.class", theClass, Reducer.class);
}
/**
 * Should speculative execution be used for this job? True when it is
 * enabled for map tasks or for reduce tasks.
 *
 * @return <code>true</code> if speculative execution is to be used for this
 *         job, <code>false</code> otherwise.
 */
public boolean getSpeculativeExecution() {
  if (getMapSpeculativeExecution()) {
    return true;
  }
  return getReduceSpeculativeExecution();
}
/**
 * Turns speculative execution on or off for this job, for both map and
 * reduce tasks.
 *
 * @param speculativeExecution <code>true</code> if speculative execution
 *        should be turned on, else <code>false</code>.
 */
public void setSpeculativeExecution(boolean speculativeExecution) {
  final boolean enabled = speculativeExecution;
  setMapSpeculativeExecution(enabled);
  setReduceSpeculativeExecution(enabled);
}
/**
 * Should speculative execution be used for the map tasks of this job?
 * Defaults to {@code true}.
 *
 * @return {@code true} if speculative execution is to be used for the
 *         map tasks of this job, {@code false} otherwise.
 */
public boolean getMapSpeculativeExecution() {
  final boolean enabledByDefault = true;
  return getBoolean("mapred.map.tasks.speculative.execution",
      enabledByDefault);
}
/**
 * Turn speculative execution on or off for the map tasks of this job.
 *
 * @param speculativeExecution {@code true} if speculative execution
 *                             should be turned on for map tasks,
 *                             else {@code false}.
 */
public void setMapSpeculativeExecution(boolean speculativeExecution) {
  final String key = "mapred.map.tasks.speculative.execution";
  setBoolean(key, speculativeExecution);
}
/**
 * Should speculative execution be used for the reduce tasks of this job?
 * Defaults to {@code true}.
 *
 * @return {@code true} if speculative execution is to be used for the
 *         reduce tasks of this job, {@code false} otherwise.
 */
public boolean getReduceSpeculativeExecution() {
  final boolean enabledByDefault = true;
  return getBoolean("mapred.reduce.tasks.speculative.execution",
      enabledByDefault);
}
/**
 * Turn speculative execution on or off for the reduce tasks of this job.
 *
 * @param speculativeExecution {@code true} if speculative execution
 *                             should be turned on for reduce tasks,
 *                             else {@code false}.
 */
public void setReduceSpeculativeExecution(boolean speculativeExecution) {
  final String key = "mapred.reduce.tasks.speculative.execution";
  setBoolean(key, speculativeExecution);
}
/**
 * Get the time to wait before invoking speculative execution for maps.
 *
 * @return the value of {@code mapred.speculative.map.lag}, or
 *         {@code 60 * 1000} when the property is not set.
 */
public long getMapSpeculativeLag() {
  final long defaultLag = 60 * 1000;
  return getLong("mapred.speculative.map.lag", defaultLag);
}
/**
 * Set the time to wait before invoking speculative execution for maps.
 *
 * @param mapSpeculativeLag new value for the speculative lag.
 */
public void setMapSpeculativeLag(long mapSpeculativeLag) {
  // The value is stored in its decimal string form.
  final String lagText = Long.toString(mapSpeculativeLag);
  set("mapred.speculative.map.lag", lagText);
}
/**
 * Get the time to wait before invoking speculative execution for reduces.
 *
 * @return the value of {@code mapred.speculative.reduce.lag}, or
 *         {@code 60 * 1000} when the property is not set.
 */
public long getReduceSpeculativeLag() {
  final long defaultLag = 60 * 1000;
  return getLong("mapred.speculative.reduce.lag", defaultLag);
}
/**
 * Set the time to wait before invoking speculative execution for reduces.
 *
 * @param reduceSpeculativeLag new value for the speculative lag.
 */
public void setReduceSpeculativeLag(long reduceSpeculativeLag) {
  // The value is stored in its decimal string form.
  final String lagText = Long.toString(reduceSpeculativeLag);
  set("mapred.speculative.reduce.lag", lagText);
}
/**
 * Set the minimum projected task duration, in seconds, before speculative
 * execution is invoked on mappers.
 *
 * @param mapSpeculativeDuration new value for the speculative duration.
 */
public void setMapSpeculativeDuration(long mapSpeculativeDuration) {
  final String durationText = Long.toString(mapSpeculativeDuration);
  set("mapred.speculative.map.duration", durationText);
}
/**
 * Set the minimum projected task duration, in seconds, before speculative
 * execution is invoked on reducers.
 *
 * @param reduceSpeculativeDuration new value for the speculative duration.
 */
public void setReduceSpeculativeDuration(long reduceSpeculativeDuration) {
  final String durationText = Long.toString(reduceSpeculativeDuration);
  set("mapred.speculative.reduce.duration", durationText);
}
/**
 * Get the minimum projected task duration, in seconds, before speculative
 * execution is invoked on mappers.
 *
 * Disabled by default: the value is {@code 0} when the property is unset.
 *
 * @return the configured {@code mapred.speculative.map.duration}.
 */
public long getMapSpeculativeDuration() {
  final long disabled = 0L;
  return getLong("mapred.speculative.map.duration", disabled);
}
/**
 * Get the minimum projected task duration, in seconds, before speculative
 * execution is invoked on reducers.
 *
 * Disabled by default: the value is {@code 0} when the property is unset.
 *
 * @return the configured {@code mapred.speculative.reduce.duration}.
 */
public long getReduceSpeculativeDuration() {
  final long disabled = 0L;
  return getLong("mapred.speculative.reduce.duration", disabled);
}
/**
 * Get the configured number of map tasks for this job.
 * Defaults to {@code 1}.
 *
 * @return the number of map tasks for this job.
 */
public int getNumMapTasks() {
  final int defaultMaps = 1;
  return getInt("mapred.map.tasks", defaultMaps);
}
/**
 * Set the number of map tasks for this job.
 *
 * Note: This is only a hint to the framework. The actual number of
 * spawned map tasks depends on the number of {@link InputSplit}s
 * generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
 *
 * A custom {@link InputFormat} is typically used to accurately control
 * the number of map tasks for the job.
 *
 * How many maps?
 *
 * The number of maps is usually driven by the total size of the inputs,
 * i.e. the total number of blocks of the input files.
 *
 * The right level of parallelism for maps seems to be around 10-100 maps
 * per node, although it has been set up to 300 or so for very cpu-light
 * map tasks. Task setup takes a while, so it is best if the maps take at
 * least a minute to execute.
 *
 * The default behavior of file-based {@link InputFormat}s is to split the
 * input into logical {@link InputSplit}s based on the total size, in
 * bytes, of the input files. However, the {@link FileSystem} blocksize of
 * the input files is treated as an upper bound for input splits. A lower
 * bound on the split size can be set via {@code mapred.min.split.size}.
 *
 * Thus, if you expect 10TB of input data and have a blocksize of 128MB,
 * you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
 * used to set it even higher.
 *
 * @param n the number of map tasks for this job.
 * @see InputFormat#getSplits(JobConf, int)
 * @see FileInputFormat
 * @see FileSystem#getDefaultBlockSize()
 * @see FileStatus#getBlockSize()
 */
public void setNumMapTasks(int n) {
  final String key = "mapred.map.tasks";
  setInt(key, n);
}
/**
 * Get the configured number of reduce tasks for this job.
 * Defaults to {@code 1}.
 *
 * @return the number of reduce tasks for this job.
 */
public int getNumReduceTasks() {
  final int defaultReduces = 1;
  return getInt("mapred.reduce.tasks", defaultReduces);
}
/**
 * Set the requisite number of reduce tasks for this job.
 *
 * How many reduces?
 *
 * The right number of reduces seems to be {@code 0.95} or {@code 1.75}
 * multiplied by (&lt;no. of nodes&gt; *
 * {@code mapred.tasktracker.reduce.tasks.maximum}).
 *
 * With {@code 0.95} all of the reduces can launch immediately and start
 * transferring map outputs as the maps finish. With {@code 1.75} the
 * faster nodes will finish their first round of reduces and launch a
 * second wave of reduces, doing a much better job of load balancing.
 *
 * Increasing the number of reduces increases the framework overhead, but
 * increases load balancing and lowers the cost of failures.
 *
 * The scaling factors above are slightly less than whole numbers to
 * reserve a few reduce slots in the framework for speculative tasks,
 * failures etc.
 *
 * Reducer NONE
 *
 * It is legal to set the number of reduce-tasks to {@code zero}.
 *
 * In this case the output of the map-tasks goes directly to the
 * distributed file-system, to the path set by
 * {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
 * framework doesn't sort the map-outputs before writing them out to HDFS.
 *
 * @param n the number of reduce tasks for this job.
 */
public void setNumReduceTasks(int n) {
  final String key = "mapred.reduce.tasks";
  setInt(key, n);
}
/**
 * Specify whether job-setup and job-cleanup are needed for the job.
 *
 * @param needed if {@code true}, job-setup and job-cleanup will be
 *               considered from {@link OutputCommitter}, else ignored.
 */
public void setJobSetupCleanupNeeded(boolean needed) {
  final String key = "mapred.committer.job.setup.cleanup.needed";
  setBoolean(key, needed);
}
/**
 * Get whether job-setup and job-cleanup are needed for the job.
 *
 * @return the configured flag; {@code true} when the property is unset.
 */
public boolean getJobSetupCleanupNeeded() {
  final boolean neededByDefault = true;
  return getBoolean("mapred.committer.job.setup.cleanup.needed",
      neededByDefault);
}
/**
 * Get whether the job should finish when the reduces are done. The
 * unfinished mappers will be killed.
 *
 * @return {@code true} if the job finishes when the reduces are done;
 *         {@code false} when the property is unset.
 */
public boolean getJobFinishWhenReducesDone() {
  final boolean finishEarly =
      getBoolean(MAPRED_JOB_FINISH_WHEN_REDUCES_DONE, false);
  return finishEarly;
}
/**
 * Specify whether task-cleanup is needed for the job.
 *
 * @param needed if {@code true}, task-cleanup will be considered
 *               from {@link OutputCommitter}, else ignored.
 */
public void setTaskCleanupNeeded(boolean needed) {
  final String key = "mapred.committer.task.cleanup.needed";
  setBoolean(key, needed);
}
/**
 * Get whether task-cleanup is needed for the job.
 *
 * The purpose of the task-cleanup task is to perform
 * OutputCommitter.abort(). If there is no need to run this method,
 * task-cleanup can be disabled to improve latency.
 *
 * @return the configured flag; {@code true} when the property is unset.
 */
public boolean getTaskCleanupNeeded() {
  final boolean neededByDefault = true;
  return getBoolean("mapred.committer.task.cleanup.needed",
      neededByDefault);
}
/**
 * Get the configured number of maximum attempts that will be made to run
 * a map task, as specified by the {@code mapred.map.max.attempts}
 * property. If this property is not already set, the default is 4
 * attempts.
 *
 * @return the max number of attempts per map task.
 */
public int getMaxMapAttempts() {
  final int defaultAttempts = 4;
  return getInt("mapred.map.max.attempts", defaultAttempts);
}
/**
 * Expert: Set the number of maximum attempts that will be made to run a
 * map task.
 *
 * @param n the number of attempts per map task.
 */
public void setMaxMapAttempts(int n) {
  final String key = "mapred.map.max.attempts";
  setInt(key, n);
}
/**
 * Get the configured number of maximum attempts that will be made to run
 * a reduce task, as specified by the {@code mapred.reduce.max.attempts}
 * property. If this property is not already set, the default is 4
 * attempts.
 *
 * @return the max number of attempts per reduce task.
 */
public int getMaxReduceAttempts() {
  final int defaultAttempts = 4;
  return getInt("mapred.reduce.max.attempts", defaultAttempts);
}
/**
 * Expert: Set the number of maximum attempts that will be made to run a
 * reduce task.
 *
 * @param n the number of attempts per reduce task.
 */
public void setMaxReduceAttempts(int n) {
  final String key = "mapred.reduce.max.attempts";
  setInt(key, n);
}
/**
 * Get the user-specified job name. This is only used to identify the
 * job to the user.
 *
 * @return the job's name, defaulting to "".
 */
public String getJobName() {
  final String unnamed = "";
  return get("mapred.job.name", unnamed);
}
/**
 * Set the user-specified job name.
 *
 * @param name the job's new name.
 */
public void setJobName(String name) {
  final String key = "mapred.job.name";
  set(key, name);
}
/**
 * Get the user-specified session identifier. The default is the empty
 * string.
 *
 * The session identifier is used to tag metric data that is reported to
 * some performance metrics system via the org.apache.hadoop.metrics API.
 * The session identifier is intended, in particular, for use by
 * Hadoop-On-Demand (HOD), which allocates a virtual Hadoop cluster
 * dynamically and transiently. HOD will set the session identifier by
 * modifying the mapred-site.xml file before starting the cluster.
 *
 * When not running under HOD, this identifier is expected to remain set
 * to the empty string.
 *
 * @return the session identifier, defaulting to "".
 */
public String getSessionId() {
  final String noSession = "";
  return get("session.id", noSession);
}
/**
 * Set the user-specified session identifier.
 *
 * @param sessionId the new session id.
 */
public void setSessionId(String sessionId) {
  final String key = "session.id";
  set(key, sessionId);
}
/**
 * Set the maximum no. of failures of a given job per tasktracker.
 * If the no. of task failures exceeds {@code noFailures}, the
 * tasktracker is blacklisted for this job.
 *
 * The value is written under the same key constant that
 * {@link #getMaxTaskFailuresPerTracker()} reads, so the two accessors
 * cannot drift apart.
 *
 * @param noFailures maximum no. of failures of a given job per
 *                   tasktracker.
 */
public void setMaxTaskFailuresPerTracker(int noFailures) {
  setInt(MAPRED_MAX_TRACKER_FAILURES_PROPERTY, noFailures);
}
/**
 * Expert: Get the maximum no. of failures of a given job per tasktracker.
 * If the no. of task failures exceeds this, the tasktracker is
 * blacklisted for this job.
 *
 * @return the maximum no. of failures of a given job per tasktracker;
 *         4 when the property is unset.
 */
public int getMaxTaskFailuresPerTracker() {
  final int defaultFailures = 4;
  return getInt(MAPRED_MAX_TRACKER_FAILURES_PROPERTY, defaultFailures);
}
/**
 * Get the maximum percentage of map tasks that can fail without
 * the job being aborted.
 *
 * Each map task is attempted up to {@link #getMaxMapAttempts()} times
 * before being declared as failed.
 *
 * Defaults to {@code zero}, i.e. any failed map-task results in
 * the job being declared as {@link JobStatus#FAILED}.
 *
 * @return the maximum percentage of map tasks that can fail without
 *         the job being aborted.
 */
public int getMaxMapTaskFailuresPercent() {
  final int noneTolerated = 0;
  return getInt("mapred.max.map.failures.percent", noneTolerated);
}
/**
 * Expert: Set the maximum percentage of map tasks that can fail without
 * the job being aborted.
 *
 * Each map task is attempted up to {@link #getMaxMapAttempts()} times
 * before being declared as failed.
 *
 * @param percent the maximum percentage of map tasks that can fail
 *                without the job being aborted.
 */
public void setMaxMapTaskFailuresPercent(int percent) {
  final String key = "mapred.max.map.failures.percent";
  setInt(key, percent);
}
/**
 * Get the maximum percentage of reduce tasks that can fail without
 * the job being aborted.
 *
 * Each reduce task is attempted up to {@link #getMaxReduceAttempts()}
 * times before being declared as failed.
 *
 * Defaults to {@code zero}, i.e. any failed reduce-task results
 * in the job being declared as {@link JobStatus#FAILED}.
 *
 * @return the maximum percentage of reduce tasks that can fail without
 *         the job being aborted.
 */
public int getMaxReduceTaskFailuresPercent() {
  final int noneTolerated = 0;
  return getInt("mapred.max.reduce.failures.percent", noneTolerated);
}
/**
 * Set the maximum percentage of reduce tasks that can fail without the
 * job being aborted.
 *
 * Each reduce task is attempted up to {@link #getMaxReduceAttempts()}
 * times before being declared as failed.
 *
 * @param percent the maximum percentage of reduce tasks that can fail
 *                without the job being aborted.
 */
public void setMaxReduceTaskFailuresPercent(int percent) {
  final String key = "mapred.max.reduce.failures.percent";
  setInt(key, percent);
}
/**
 * Set the {@link JobPriority} for this job.
 *
 * The priority is stored as the text produced by its {@code toString()}.
 *
 * @param prio the {@link JobPriority} for this job.
 */
public void setJobPriority(JobPriority prio) {
  final String priorityText = prio.toString();
  set("mapred.job.priority", priorityText);
}
/**
 * Get the {@link JobPriority} for this job.
 *
 * @return the {@link JobPriority} parsed from {@code mapred.job.priority},
 *         or {@link JobPriority#NORMAL} when the property is not set.
 */
public JobPriority getJobPriority() {
  final String configured = get("mapred.job.priority");
  return (configured != null)
      ? JobPriority.valueOf(configured)
      : JobPriority.NORMAL;
}
/**
 * Get whether task profiling is enabled.
 *
 * @return {@code true} if some tasks will be profiled; {@code false}
 *         when the property is unset.
 */
public boolean getProfileEnabled() {
  final boolean profiling = getBoolean("mapred.task.profile", false);
  return profiling;
}
/**
 * Set whether the system should collect profiler information for some of
 * the tasks in this job. The information is stored in the user log
 * directory.
 *
 * @param newValue {@code true} means it should be gathered.
 */
public void setProfileEnabled(boolean newValue) {
  final String key = "mapred.task.profile";
  setBoolean(key, newValue);
}
/**
 * Get the profiler configuration arguments.
 *
 * The default value for this property is
 * "-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s"
 *
 * @return the parameters to pass to the task child to configure profiling.
 */
public String getProfileParams() {
  final String defaultParams =
      "-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y," +
      "verbose=n,file=%s";
  return get("mapred.task.profile.params", defaultParams);
}
/**
 * Set the profiler configuration arguments. If the string contains a
 * '%s' it will be replaced with the name of the profiling output file
 * when the task runs.
 *
 * This value is passed to the task child JVM on the command line.
 *
 * @param value the configuration string.
 */
public void setProfileParams(String value) {
  final String key = "mapred.task.profile.params";
  set(key, value);
}
/**
 * Get the range of maps or reduces to profile.
 *
 * @param isMap is the task a map?
 * @return the task ranges; "0-2" is used when the property is unset.
 */
public IntegerRanges getProfileTaskRange(boolean isMap) {
  final String key;
  if (isMap) {
    key = "mapred.task.profile.maps";
  } else {
    key = "mapred.task.profile.reduces";
  }
  return getRange(key, "0-2");
}
/**
 * Set the ranges of maps or reduces to profile. setProfileEnabled(true)
 * must also be called.
 *
 * @param isMap    {@code true} to set the map range, {@code false} to set
 *                 the reduce range.
 * @param newValue a set of integer ranges of the task ids.
 */
public void setProfileTaskRange(boolean isMap, String newValue) {
  // Parse the value first, purely to make sure it is legal; the parsed
  // object itself is discarded.
  new Configuration.IntegerRanges(newValue);
  final String key;
  if (isMap) {
    key = "mapred.task.profile.maps";
  } else {
    key = "mapred.task.profile.reduces";
  }
  set(key, newValue);
}
/**
 * Set the debug script to run when the map tasks fail.
 *
 * The debug script can aid debugging of failed map tasks. The script is
 * given the task's stdout, stderr, syslog and jobconf files as arguments.
 *
 * The debug command, run on the node where the map failed, is:
 *
 *   {@code $script $stdout $stderr $syslog $jobconf}
 *
 * The script file is distributed through {@link DistributedCache}
 * APIs. The script needs to be symlinked.
 *
 * Here is an example of how to submit a script:
 *
 *   {@code job.setMapDebugScript("./myscript");}
 *   {@code DistributedCache.createSymlink(job);}
 *   {@code DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");}
 *
 * @param mDbgScript the script name.
 */
public void setMapDebugScript(String mDbgScript) {
  final String key = "mapred.map.task.debug.script";
  set(key, mDbgScript);
}
/**
 * Get the map task's debug script.
 *
 * @return the debug script for the mapred job for failed map tasks.
 * @see #setMapDebugScript(String)
 */
public String getMapDebugScript() {
  final String script = get("mapred.map.task.debug.script");
  return script;
}
/**
 * Set the debug script to run when the reduce tasks fail.
 *
 * The debug script can aid debugging of failed reduce tasks. The script
 * is given the task's stdout, stderr, syslog and jobconf files as
 * arguments.
 *
 * The debug command, run on the node where the reduce failed, is:
 *
 *   {@code $script $stdout $stderr $syslog $jobconf}
 *
 * The script file is distributed through {@link DistributedCache}
 * APIs. The script file needs to be symlinked.
 *
 * Here is an example of how to submit a script:
 *
 *   {@code job.setReduceDebugScript("./myscript");}
 *   {@code DistributedCache.createSymlink(job);}
 *   {@code DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");}
 *
 * @param rDbgScript the script name.
 */
public void setReduceDebugScript(String rDbgScript) {
  final String key = "mapred.reduce.task.debug.script";
  set(key, rDbgScript);
}
/**
 * Get the reduce task's debug script.
 *
 * @return the debug script for the mapred job for failed reduce tasks.
 * @see #setReduceDebugScript(String)
 */
public String getReduceDebugScript() {
  final String script = get("mapred.reduce.task.debug.script");
  return script;
}
/**
 * Get the uri to be invoked in order to send a notification after the
 * job has completed (success/failure).
 *
 * @return the job end notification uri, {@code null} if it hasn't
 *         been set.
 * @see #setJobEndNotificationURI(String)
 */
public String getJobEndNotificationURI() {
  final String uri = get("job.end.notification.url");
  return uri;
}
/**
 * Set the uri to be invoked in order to send a notification after the
 * job has completed (success/failure).
 *
 * The uri can contain 2 special parameters: {@code $jobId} and
 * {@code $jobStatus}. Those, if present, are replaced by the job's
 * identifier and completion-status respectively.
 *
 * This is typically used by application-writers to implement chaining of
 * Map-Reduce jobs in an asynchronous manner.
 *
 * @param uri the job end notification uri.
 * @see JobStatus
 * @see "Job Completion and Chaining"
 */
public void setJobEndNotificationURI(String uri) {
  final String key = "job.end.notification.url";
  set(key, uri);
}
/**
 * Get the job-specific shared directory for use as scratch space.
 *
 * When a job starts, a shared directory is created at location
 * {@code ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/}.
 * This directory is exposed to the users through
 * {@code job.local.dir}, so the tasks can use this space as scratch
 * space and share files among them.
 * This value is available as a System property also.
 *
 * @return the localized job-specific shared directory.
 */
public String getJobLocalDir() {
  final String sharedDir = get("job.local.dir");
  return sharedDir;
}
/**
 * Get the memory required to run a map task of the job, in MB.
 *
 * If a value is specified in the configuration, it is returned.
 * Else, it returns {@link #DISABLED_MEMORY_LIMIT}.
 *
 * For backward compatibility, if the job configuration sets the
 * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to a value different
 * from {@link #DISABLED_MEMORY_LIMIT}, that value will be used
 * after converting it from bytes to MB.
 *
 * @return memory required to run a map task of the job, in MB,
 *         or {@link #DISABLED_MEMORY_LIMIT} if unset.
 */
public long getMemoryForMapTask() {
  // The deprecated job-wide key wins whenever it carries a usable value.
  final long legacyMb = getDeprecatedMemoryValue();
  if (legacyMb != DISABLED_MEMORY_LIMIT) {
    return legacyMb;
  }
  final long configuredMb = getLong(
      JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, DISABLED_MEMORY_LIMIT);
  return normalizeMemoryConfigValue(configuredMb);
}
/**
 * Set the memory required to run a map task of the job.
 *
 * The value is stored as-is under
 * {@link #MAPRED_JOB_MAP_MEMORY_MB_PROPERTY}.
 *
 * @param mem memory for a map task; {@link #getMemoryForMapTask()}
 *            reports this key in MB.
 */
public void setMemoryForMapTask(long mem) {
  final String key = JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY;
  setLong(key, mem);
}
/**
 * Get the memory required to run a reduce task of the job, in MB.
 *
 * If a value is specified in the configuration, it is returned.
 * Else, it returns {@link #DISABLED_MEMORY_LIMIT}.
 *
 * For backward compatibility, if the job configuration sets the
 * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to a value different
 * from {@link #DISABLED_MEMORY_LIMIT}, that value will be used
 * after converting it from bytes to MB.
 *
 * @return memory required to run a reduce task of the job, in MB,
 *         or {@link #DISABLED_MEMORY_LIMIT} if unset.
 */
public long getMemoryForReduceTask() {
  // The deprecated job-wide key wins whenever it carries a usable value.
  final long legacyMb = getDeprecatedMemoryValue();
  if (legacyMb != DISABLED_MEMORY_LIMIT) {
    return legacyMb;
  }
  final long configuredMb = getLong(
      JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY, DISABLED_MEMORY_LIMIT);
  return normalizeMemoryConfigValue(configuredMb);
}
// Reads the deprecated key MAPRED_TASK_MAXVMEM_PROPERTY and converts its
// value to MB by dividing by 1024*1024.
// Yields DISABLED_MEMORY_LIMIT when the key is unset or holds a negative
// value.
private long getDeprecatedMemoryValue() {
  final long legacyValue = normalizeMemoryConfigValue(
      getLong(MAPRED_TASK_MAXVMEM_PROPERTY, DISABLED_MEMORY_LIMIT));
  if (legacyValue == DISABLED_MEMORY_LIMIT) {
    return legacyValue;
  }
  return legacyValue / (1024 * 1024);
}
/**
 * Set the memory required to run a reduce task of the job.
 *
 * The value is stored as-is under
 * {@link #MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY}.
 *
 * @param mem memory for a reduce task; {@link #getMemoryForReduceTask()}
 *            reports this key in MB.
 */
public void setMemoryForReduceTask(long mem) {
  final String key = JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY;
  setLong(key, mem);
}
/**
 * Return the name of the queue to which this job is submitted.
 * Defaults to {@link #DEFAULT_QUEUE_NAME}.
 *
 * @return name of the queue.
 */
public String getQueueName() {
  final String queue = get("mapred.job.queue.name", DEFAULT_QUEUE_NAME);
  return queue;
}
/**
 * Set the name of the queue to which this job should be submitted.
 *
 * @param queueName name of the queue.
 */
public void setQueueName(String queueName) {
  final String key = "mapred.job.queue.name";
  set(key, queueName);
}
/**
 * Get the source of the job.
 * Useful for getting the context that the job runs in.
 *
 * @return the value stored under {@code JOB_SOURCE_CONF}.
 */
public String getJobSource() {
  final String source = get(JOB_SOURCE_CONF);
  return source;
}
/**
 * Normalize negative memory values found in the configuration.
 *
 * @param val the raw configured value.
 * @return {@link #DISABLED_MEMORY_LIMIT} when {@code val} is negative,
 *         otherwise {@code val} unchanged.
 */
public static long normalizeMemoryConfigValue(long val) {
  return (val < 0) ? DISABLED_MEMORY_LIMIT : val;
}
/**
 * Compute the number of slots required to run a single map task-attempt
 * of this job.
 *
 * @param slotSizePerMap cluster-wide value of the amount of memory
 *                       required to run a map-task.
 * @return the number of slots required to run a single map task-attempt;
 *         1 if memory parameters are disabled.
 */
int computeNumSlotsPerMap(long slotSizePerMap) {
  if (slotSizePerMap == DISABLED_MEMORY_LIMIT) {
    return 1;
  }
  final long taskMemory = getMemoryForMapTask();
  if (taskMemory == DISABLED_MEMORY_LIMIT) {
    return 1;
  }
  // Round up: a task that needs any fraction of a slot occupies all of it.
  final float slots = (float) taskMemory / (float) slotSizePerMap;
  return (int) Math.ceil(slots);
}
/**
 * Compute the number of slots required to run a single reduce
 * task-attempt of this job.
 *
 * @param slotSizePerReduce cluster-wide value of the amount of memory
 *                          required to run a reduce-task.
 * @return the number of slots required to run a single reduce
 *         task-attempt; 1 if memory parameters are disabled.
 */
int computeNumSlotsPerReduce(long slotSizePerReduce) {
  if (slotSizePerReduce == DISABLED_MEMORY_LIMIT) {
    return 1;
  }
  final long taskMemory = getMemoryForReduceTask();
  if (taskMemory == DISABLED_MEMORY_LIMIT) {
    return 1;
  }
  // Round up: a task that needs any fraction of a slot occupies all of it.
  final float slots = (float) taskMemory / (float) slotSizePerReduce;
  return (int) Math.ceil(slots);
}
/**
 * Find a jar that contains a class of the same name, if any.
 * It will return a jar file, even if that is not the first thing
 * on the class path that has a class with the same name.
 *
 * Classes whose {@link Class#getClassLoader()} is {@code null} are looked
 * up through the system class loader instead of failing.
 *
 * @param my_class the class to find.
 * @return a jar file that contains the class, or null.
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *                          enumerating resources or decoding the path.
 */
private static String findContainingJar(Class<?> my_class) {
  ClassLoader loader = my_class.getClassLoader();
  if (loader == null) {
    // getClassLoader() may return null for classes loaded by the
    // bootstrap loader; dereferencing it used to throw.
    loader = ClassLoader.getSystemClassLoader();
  }
  String class_file = my_class.getName().replace('.', '/') + ".class";
  try {
    for (Enumeration<URL> itr = loader.getResources(class_file);
         itr.hasMoreElements();) {
      URL url = itr.nextElement();
      if (!"jar".equals(url.getProtocol())) {
        continue;
      }
      String toReturn = url.getPath();
      if (toReturn.startsWith("file:")) {
        toReturn = toReturn.substring("file:".length());
      }
      // URLDecoder decodes application/x-www-form-urlencoded data, where
      // '+' means a space. In a URL path '+' is a literal plus, so escape
      // it first to keep it intact through the decode.
      toReturn = toReturn.replace("+", "%2B");
      toReturn = URLDecoder.decode(toReturn, "UTF-8");
      // Drop the "!/path/inside/jar" suffix, leaving the jar's own path.
      return toReturn.replaceAll("!.*$", "");
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return null;
}
/**
 * Get the memory required to run a task of this job, in bytes. See
 * {@link #MAPRED_TASK_MAXVMEM_PROPERTY}
 *
 * This method is deprecated. Now, different memory limits can be
 * set for map and reduce tasks of a job, in MB.
 *
 * For backward compatibility, if the job configuration sets the
 * key {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to a value different
 * from {@link #DISABLED_MEMORY_LIMIT}, that value is returned.
 * Otherwise, this method will return the larger of the values returned by
 * {@link #getMemoryForMapTask()} and {@link #getMemoryForReduceTask()}
 * after converting them into bytes.
 *
 * @return Memory required to run a task of this job, in bytes,
 *         or {@link #DISABLED_MEMORY_LIMIT}, if unset.
 * @see #setMaxVirtualMemoryForTask(long)
 * @deprecated Use {@link #getMemoryForMapTask()} and
 *             {@link #getMemoryForReduceTask()}
 */
@Deprecated
public long getMaxVirtualMemoryForTask() {
  LOG.warn(
      "getMaxVirtualMemoryForTask() is deprecated. " +
      "Instead use getMemoryForMapTask() and getMemoryForReduceTask()");

  final long legacyBytes = normalizeMemoryConfigValue(
      getLong(MAPRED_TASK_MAXVMEM_PROPERTY, DISABLED_MEMORY_LIMIT));
  if (legacyBytes != DISABLED_MEMORY_LIMIT) {
    return legacyBytes;
  }

  // Fall back to the larger of the two per-task-type limits, which are
  // expressed in MB.
  final long largestMb = normalizeMemoryConfigValue(
      Math.max(getMemoryForMapTask(), getMemoryForReduceTask()));
  if (largestMb == DISABLED_MEMORY_LIMIT) {
    return largestMb;
  }
  return largestMb * (1024 * 1024);
}
/**
 * Set the maximum amount of memory any task of this job can use. See
 * {@link #MAPRED_TASK_MAXVMEM_PROPERTY}
 *
 * {@code mapred.task.maxvmem} is split into
 * {@code mapred.job.map.memory.mb} and {@code mapred.job.reduce.memory.mb}:
 * when the deprecated key is not already present, each of the new keys is
 * set to {@code vmem / (1024 * 1024)}, as the new values are in MB.
 * When the deprecated key is already present, it is overwritten with
 * {@code vmem} instead.
 *
 * A negative or disabled {@code vmem} is never converted to MB; the new
 * keys receive {@link #DISABLED_MEMORY_LIMIT} in that case.
 *
 * @param vmem Maximum amount of virtual memory in bytes any task of this
 *             job can use.
 * @see #getMaxVirtualMemoryForTask()
 * @deprecated Use {@link #setMemoryForMapTask(long mem)} and
 *             {@link #setMemoryForReduceTask(long mem)}
 */
@Deprecated
public void setMaxVirtualMemoryForTask(long vmem) {
  LOG.warn("setMaxVirtualMemoryForTask() is deprecated. " +
      "Instead use setMemoryForMapTask() and setMemoryForReduceTask()");

  // True for any negative value as well as for the marker itself.
  final boolean disabled =
      normalizeMemoryConfigValue(vmem) == DISABLED_MEMORY_LIMIT;

  if (vmem != DISABLED_MEMORY_LIMIT && vmem < 0) {
    setMemoryForMapTask(DISABLED_MEMORY_LIMIT);
    setMemoryForReduceTask(DISABLED_MEMORY_LIMIT);
  }

  if (get(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY) == null) {
    // Dividing a negative/disabled value by 1024*1024 used to store 0 (or
    // some other meaningless number) and clobber the disabled marker.
    final long memMb =
        disabled ? DISABLED_MEMORY_LIMIT : vmem / (1024 * 1024);
    setMemoryForMapTask(memMb);
    setMemoryForReduceTask(memMb);
  } else {
    this.setLong(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY, vmem);
  }
}
/**
 * Always returns -1 after logging a deprecation warning.
 *
 * @return -1.
 * @deprecated this variable is deprecated and no longer in use.
 */
@Deprecated
public long getMaxPhysicalMemoryForTask() {
  final String warning = "The API getMaxPhysicalMemoryForTask() is deprecated."
      + " Refer to the APIs getMemoryForMapTask() and"
      + " getMemoryForReduceTask() for details.";
  LOG.warn(warning);
  return -1;
}
/**
 * Logs a deprecation warning and stores nothing; the supplied value is
 * ignored.
 *
 * @param mem ignored.
 * @deprecated the value set is ignored; see
 *             {@link #setMemoryForMapTask(long)} and
 *             {@link #setMemoryForReduceTask(long)}.
 */
@Deprecated
public void setMaxPhysicalMemoryForTask(long mem) {
  final String warning = "The API setMaxPhysicalMemoryForTask() is deprecated."
      + " The value set is ignored. Refer to "
      + " setMemoryForMapTask() and setMemoryForReduceTask() for details.";
  LOG.warn(warning);
}
/**
 * Build the standard "no longer used" message for a configuration key.
 *
 * @param key the name of the deprecated variable.
 * @return the text "The variable KEY is no longer used.".
 */
static String deprecatedString(String key) {
  final StringBuilder message = new StringBuilder("The variable ");
  message.append(key);
  message.append(" is no longer used.");
  return message.toString();
}
/**
 * Log a warning when the deprecated key
 * {@code MAPRED_TASK_MAXVMEM_PROPERTY} is present in this configuration,
 * pointing at the per-map and per-reduce memory keys that replace it.
 */
private void checkAndWarnDeprecation() {
  if (get(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY) == null) {
    return;
  }
  final String notice =
      JobConf.deprecatedString(JobConf.MAPRED_TASK_MAXVMEM_PROPERTY)
      + " Instead use " + JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY
      + " and " + JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY;
  LOG.warn(notice);
}
/**
 * Replace the jobtracker configuration with the configuration of
 * instance 0 or 1. This allows switching between two sets of
 * configurations with a command line option.
 *
 * For each overridable key, the value stored under "KEY-INSTANCE" is
 * copied onto "KEY"; a warning is logged when that value is absent.
 *
 * @param conf     the jobConf to be overwritten.
 * @param instance 0 or 1, the instance of the jobtracker.
 */
public static void overrideConfiguration(JobConf conf, int instance) {
  final String[] overridableKeys = {
      "mapred.job.tracker",
      "mapred.local.dir",
      "mapred.fairscheduler.server.address"
  };
  for (String baseKey : overridableKeys) {
    final String instanceKey = baseKey + "-" + instance;
    final String override = conf.get(instanceKey);
    if (override == null) {
      LOG.warn("Configuration " + instanceKey + " not found.");
      continue;
    }
    conf.set(baseKey, override);
  }
}
}