org.apache.hadoop.hive.ql.exec.MapRedTask Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-exec Show documentation
Show all versions of hive-exec Show documentation
Hive is a data warehouse infrastructure built on top of Hadoop see
http://wiki.apache.org/hadoop/Hive
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.CachingPrintStream;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.exec.Utilities.StreamPrinter;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;
/**
* Extension of ExecDriver:
* - can optionally spawn a map-reduce task from a separate jvm
* - will make last minute adjustments to map-reduce job parameters, viz:
* * estimating number of reducers
* * estimating whether job should run locally
**/
public class MapRedTask extends ExecDriver implements Serializable {
private static final long serialVersionUID = 1L;
static final String HADOOP_MEM_KEY = "HADOOP_HEAPSIZE";
static final String HADOOP_OPTS_KEY = "HADOOP_OPTS";
static final String HADOOP_CLIENT_OPTS = "HADOOP_CLIENT_OPTS";
static final String HIVE_DEBUG_RECURSIVE = "HIVE_DEBUG_RECURSIVE";
static final String HIVE_MAIN_CLIENT_DEBUG_OPTS = "HIVE_MAIN_CLIENT_DEBUG_OPTS";
static final String HIVE_CHILD_CLIENT_DEBUG_OPTS = "HIVE_CHILD_CLIENT_DEBUG_OPTS";
static final String[] HIVE_SYS_PROP = {"build.dir", "build.dir.hive"};
private transient ContentSummary inputSummary = null;
private transient boolean runningViaChild = false;
private transient boolean inputSizeEstimated = false;
private transient long totalInputFileSize;
private transient long totalInputNumFiles;
private Process executor;
public MapRedTask() {
super();
}
public MapRedTask(MapredWork plan, JobConf job, boolean isSilent) throws HiveException {
throw new RuntimeException("Illegal Constructor call");
}
@Override
public int execute(DriverContext driverContext) {
Context ctx = driverContext.getCtx();
boolean ctxCreated = false;
try {
if (ctx == null) {
ctx = new Context(conf);
ctxCreated = true;
}
// estimate number of reducers
setNumberOfReducers();
// auto-determine local mode if allowed
if (!ctx.isLocalOnlyExecutionMode() &&
conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {
if (inputSummary == null) {
inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work, null);
}
// set the values of totalInputFileSize and totalInputNumFiles, estimating them
// if percentage block sampling is being used
estimateInputSize();
// at this point the number of reducers is precisely defined in the plan
int numReducers = work.getNumReduceTasks();
if (LOG.isDebugEnabled()) {
LOG.debug("Task: " + getId() + ", Summary: " +
totalInputFileSize + "," + totalInputNumFiles + ","
+ numReducers);
}
String reason = MapRedTask.isEligibleForLocalMode(conf, numReducers,
totalInputFileSize, totalInputNumFiles);
if (reason == null) {
// clone configuration before modifying it on per-task basis
cloneConf();
ShimLoader.getHadoopShims().setJobLauncherRpcAddress(conf, "local");
console.printInfo("Selecting local mode for task: " + getId());
this.setLocalMode(true);
} else {
console.printInfo("Cannot run job locally: " + reason);
this.setLocalMode(false);
}
}
runningViaChild = ShimLoader.getHadoopShims().isLocalMode(conf) ||
conf.getBoolVar(HiveConf.ConfVars.SUBMITVIACHILD);
if(!runningViaChild) {
// we are not running this mapred task via child jvm
// so directly invoke ExecDriver
return super.execute(driverContext);
}
// we need to edit the configuration to setup cmdline. clone it first
cloneConf();
// propagate input format if necessary
super.setInputAttributes(conf);
// enable assertion
String hadoopExec = conf.getVar(HiveConf.ConfVars.HADOOPBIN);
String hiveJar = conf.getJar();
String libJarsOption;
String addedJars = Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR);
conf.setVar(ConfVars.HIVEADDEDJARS, addedJars);
String auxJars = conf.getAuxJars();
// Put auxjars and addedjars together into libjars
if (StringUtils.isEmpty(addedJars)) {
if (StringUtils.isEmpty(auxJars)) {
libJarsOption = " ";
} else {
libJarsOption = " -libjars " + auxJars + " ";
}
} else {
if (StringUtils.isEmpty(auxJars)) {
libJarsOption = " -libjars " + addedJars + " ";
} else {
libJarsOption = " -libjars " + addedJars + "," + auxJars + " ";
}
}
// Generate the hiveConfArgs after potentially adding the jars
String hiveConfArgs = generateCmdLine(conf, ctx);
// write out the plan to a local file
Path planPath = new Path(ctx.getLocalTmpFileURI(), "plan.xml");
OutputStream out = FileSystem.getLocal(conf).create(planPath);
MapredWork plan = getWork();
LOG.info("Generating plan file " + planPath.toString());
Utilities.serializeMapRedWork(plan, out);
String isSilent = "true".equalsIgnoreCase(System
.getProperty("test.silent")) ? "-nolog" : "";
String jarCmd;
if (ShimLoader.getHadoopShims().usesJobShell()) {
jarCmd = libJarsOption + hiveJar + " " + ExecDriver.class.getName();
} else {
jarCmd = hiveJar + " " + ExecDriver.class.getName() + libJarsOption;
}
String cmdLine = hadoopExec + " jar " + jarCmd + " -plan "
+ planPath.toString() + " " + isSilent + " " + hiveConfArgs;
String workDir = (new File(".")).getCanonicalPath();
String files = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);
if (!files.isEmpty()) {
cmdLine = cmdLine + " -files " + files;
workDir = (new Path(ctx.getLocalTmpFileURI())).toUri().getPath();
if (! (new File(workDir)).mkdir()) {
throw new IOException ("Cannot create tmp working dir: " + workDir);
}
for (String f: StringUtils.split(files, ',')) {
Path p = new Path(f);
String target = p.toUri().getPath();
String link = workDir + Path.SEPARATOR + p.getName();
if (FileUtil.symLink(target, link) != 0) {
throw new IOException ("Cannot link to added file: " + target + " from: " + link);
}
}
}
LOG.info("Executing: " + cmdLine);
// Inherit Java system variables
String hadoopOpts;
StringBuilder sb = new StringBuilder();
Properties p = System.getProperties();
for (String element : HIVE_SYS_PROP) {
if (p.containsKey(element)) {
sb.append(" -D" + element + "=" + p.getProperty(element));
}
}
hadoopOpts = sb.toString();
// Inherit the environment variables
String[] env;
Map variables = new HashMap(System.getenv());
// The user can specify the hadoop memory
if (ShimLoader.getHadoopShims().isLocalMode(conf)) {
// if we are running in local mode - then the amount of memory used
// by the child jvm can no longer default to the memory used by the
// parent jvm
int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
if (hadoopMem == 0) {
// remove env var that would default child jvm to use parent's memory
// as default. child jvm would use default memory for a hadoop client
variables.remove(HADOOP_MEM_KEY);
} else {
// user specified the memory for local mode hadoop run
variables.put(HADOOP_MEM_KEY, String.valueOf(hadoopMem));
}
} else {
// nothing to do - we are not running in local mode - only submitting
// the job via a child process. in this case it's appropriate that the
// child jvm use the same memory as the parent jvm
}
if (variables.containsKey(HADOOP_OPTS_KEY)) {
variables.put(HADOOP_OPTS_KEY, variables.get(HADOOP_OPTS_KEY)
+ hadoopOpts);
} else {
variables.put(HADOOP_OPTS_KEY, hadoopOpts);
}
if(variables.containsKey(HIVE_DEBUG_RECURSIVE)) {
configureDebugVariablesForChildJVM(variables);
}
env = new String[variables.size()];
int pos = 0;
for (Map.Entry entry : variables.entrySet()) {
String name = entry.getKey();
String value = entry.getValue();
env[pos++] = name + "=" + value;
}
// Run ExecDriver in another JVM
executor = Runtime.getRuntime().exec(cmdLine, env, new File(workDir));
CachingPrintStream errPrintStream =
new CachingPrintStream(SessionState.getConsole().getChildErrStream());
StreamPrinter outPrinter = new StreamPrinter(
executor.getInputStream(), null,
SessionState.getConsole().getChildOutStream());
StreamPrinter errPrinter = new StreamPrinter(
executor.getErrorStream(), null,
errPrintStream);
outPrinter.start();
errPrinter.start();
int exitVal = jobExecHelper.progressLocal(executor, getId());
if (exitVal != 0) {
LOG.error("Execution failed with exit status: " + exitVal);
if (SessionState.get() != null) {
SessionState.get().addLocalMapRedErrors(getId(), errPrintStream.getOutput());
}
} else {
LOG.info("Execution completed successfully");
}
return exitVal;
} catch (Exception e) {
e.printStackTrace();
LOG.error("Exception: " + e.getMessage());
return (1);
} finally {
try {
// creating the context can create a bunch of files. So make
// sure to clear it out
if(ctxCreated) {
ctx.clear();
}
} catch (Exception e) {
LOG.error("Exception: " + e.getMessage());
}
}
}
static void configureDebugVariablesForChildJVM(Map environmentVariables) {
// this method contains various asserts to warn if environment variables are in a buggy state
assert environmentVariables.containsKey(HADOOP_CLIENT_OPTS)
&& environmentVariables.get(HADOOP_CLIENT_OPTS) != null : HADOOP_CLIENT_OPTS
+ " environment variable must be set when JVM in debug mode";
String hadoopClientOpts = environmentVariables.get(HADOOP_CLIENT_OPTS);
assert environmentVariables.containsKey(HIVE_MAIN_CLIENT_DEBUG_OPTS)
&& environmentVariables.get(HIVE_MAIN_CLIENT_DEBUG_OPTS) != null : HIVE_MAIN_CLIENT_DEBUG_OPTS
+ " environment variable must be set when JVM in debug mode";
assert hadoopClientOpts.contains(environmentVariables.get(HIVE_MAIN_CLIENT_DEBUG_OPTS)) : HADOOP_CLIENT_OPTS
+ " environment variable must contain debugging parameters, when JVM in debugging mode";
assert "y".equals(environmentVariables.get(HIVE_DEBUG_RECURSIVE))
|| "n".equals(environmentVariables.get(HIVE_DEBUG_RECURSIVE)) : HIVE_DEBUG_RECURSIVE
+ " environment variable must be set to \"y\" or \"n\" when debugging";
if (environmentVariables.get(HIVE_DEBUG_RECURSIVE).equals("y")) {
// swap debug options in HADOOP_CLIENT_OPTS to those that the child JVM should have
assert environmentVariables.containsKey(HIVE_CHILD_CLIENT_DEBUG_OPTS)
&& environmentVariables.get(HIVE_MAIN_CLIENT_DEBUG_OPTS) != null : HIVE_CHILD_CLIENT_DEBUG_OPTS
+ " environment variable must be set when JVM in debug mode";
String newHadoopClientOpts = hadoopClientOpts.replace(
environmentVariables.get(HIVE_MAIN_CLIENT_DEBUG_OPTS),
environmentVariables.get(HIVE_CHILD_CLIENT_DEBUG_OPTS));
environmentVariables.put(HADOOP_CLIENT_OPTS, newHadoopClientOpts);
} else {
// remove from HADOOP_CLIENT_OPTS any debug related options
String newHadoopClientOpts = hadoopClientOpts.replace(
environmentVariables.get(HIVE_MAIN_CLIENT_DEBUG_OPTS), "").trim();
if (newHadoopClientOpts.isEmpty()) {
environmentVariables.remove(HADOOP_CLIENT_OPTS);
} else {
environmentVariables.put(HADOOP_CLIENT_OPTS, newHadoopClientOpts);
}
}
// child JVM won't need to change debug parameters when creating it's own children
environmentVariables.remove(HIVE_DEBUG_RECURSIVE);
}
@Override
public boolean mapStarted() {
boolean b = super.mapStarted();
return runningViaChild ? isdone : b;
}
@Override
public boolean reduceStarted() {
boolean b = super.reduceStarted();
return runningViaChild ? isdone : b;
}
@Override
public boolean mapDone() {
boolean b = super.mapDone();
return runningViaChild ? isdone : b;
}
@Override
public boolean reduceDone() {
boolean b = super.reduceDone();
return runningViaChild ? isdone : b;
}
/**
* Set the number of reducers for the mapred work.
*/
private void setNumberOfReducers() throws IOException {
// this is a temporary hack to fix things that are not fixed in the compiler
Integer numReducersFromWork = work.getNumReduceTasks();
if (work.getReducer() == null) {
console
.printInfo("Number of reduce tasks is set to 0 since there's no reduce operator");
work.setNumReduceTasks(Integer.valueOf(0));
} else {
if (numReducersFromWork >= 0) {
console.printInfo("Number of reduce tasks determined at compile time: "
+ work.getNumReduceTasks());
} else if (job.getNumReduceTasks() > 0) {
int reducers = job.getNumReduceTasks();
work.setNumReduceTasks(reducers);
console
.printInfo("Number of reduce tasks not specified. Defaulting to jobconf value of: "
+ reducers);
} else {
int reducers = estimateNumberOfReducers();
work.setNumReduceTasks(reducers);
console
.printInfo("Number of reduce tasks not specified. Estimated from input data size: "
+ reducers);
}
console
.printInfo("In order to change the average load for a reducer (in bytes):");
console.printInfo(" set " + HiveConf.ConfVars.BYTESPERREDUCER.varname
+ "=");
console.printInfo("In order to limit the maximum number of reducers:");
console.printInfo(" set " + HiveConf.ConfVars.MAXREDUCERS.varname
+ "=");
console.printInfo("In order to set a constant number of reducers:");
console.printInfo(" set " + HiveConf.ConfVars.HADOOPNUMREDUCERS
+ "=");
}
}
/**
* Estimate the number of reducers needed for this job, based on job input,
* and configuration parameters.
*
* The output of this method should only be used if the output of this
* MapRedTask is not being used to populate a bucketed table and the user
* has not specified the number of reducers to use.
*
* @return the number of reducers.
*/
private int estimateNumberOfReducers() throws IOException {
long bytesPerReducer = conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
int maxReducers = conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
if(inputSummary == null) {
// compute the summary and stash it away
inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work, null);
}
// if all inputs are sampled, we should shrink the size of reducers accordingly.
estimateInputSize();
if (totalInputFileSize != inputSummary.getLength()) {
LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
+ maxReducers + " estimated totalInputFileSize=" + totalInputFileSize);
} else {
LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
+ maxReducers + " totalInputFileSize=" + totalInputFileSize);
}
int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer);
reducers = Math.max(1, reducers);
reducers = Math.min(maxReducers, reducers);
// If this map reduce job writes final data to a table and bucketing is being inferred,
// and the user has configured Hive to do this, make sure the number of reducers is a
// power of two
if (conf.getBoolVar(HiveConf.ConfVars.HIVE_INFER_BUCKET_SORT_NUM_BUCKETS_POWER_TWO) &&
work.isFinalMapRed() && !work.getBucketedColsByDirectory().isEmpty()) {
int reducersLog = (int)(Math.log(reducers) / Math.log(2)) + 1;
int reducersPowerTwo = (int)Math.pow(2, reducersLog);
// If the original number of reducers was a power of two, use that
if (reducersPowerTwo / 2 == reducers) {
return reducers;
} else if (reducersPowerTwo > maxReducers) {
// If the next power of two greater than the original number of reducers is greater
// than the max number of reducers, use the preceding power of two, which is strictly
// less than the original number of reducers and hence the max
reducers = reducersPowerTwo / 2;
} else {
// Otherwise use the smallest power of two greater than the original number of reducers
reducers = reducersPowerTwo;
}
}
return reducers;
}
/**
* Sets the values of totalInputFileSize and totalInputNumFiles. If percentage
* block sampling is used, these values are estimates based on the highest
* percentage being used for sampling multiplied by the value obtained from the
* input summary. Otherwise, these values are set to the exact value obtained
* from the input summary.
*
* Once the function completes, inputSizeEstimated is set so that the logic is
* never run more than once.
*/
private void estimateInputSize() {
if (inputSizeEstimated) {
// If we've already run this function, return
return;
}
// Initialize the values to be those taken from the input summary
totalInputFileSize = inputSummary.getLength();
totalInputNumFiles = inputSummary.getFileCount();
if (work.getNameToSplitSample() == null || work.getNameToSplitSample().isEmpty()) {
// If percentage block sampling wasn't used, we don't need to do any estimation
inputSizeEstimated = true;
return;
}
// if all inputs are sampled, we should shrink the size of the input accordingly
double highestSamplePercentage = 0;
boolean allSample = false;
for (String alias : work.getAliasToWork().keySet()) {
if (work.getNameToSplitSample().containsKey(alias)) {
allSample = true;
Double rate = work.getNameToSplitSample().get(alias).getPercent();
if (rate != null && rate > highestSamplePercentage) {
highestSamplePercentage = rate;
}
} else {
allSample = false;
break;
}
}
if (allSample) {
// This is a little bit dangerous if inputs turns out not to be able to be sampled.
// In that case, we significantly underestimate the input.
// It's the same as estimateNumberOfReducers(). It's just our best
// guess and there is no guarantee.
totalInputFileSize = Math.min((long) (totalInputFileSize * highestSamplePercentage / 100D)
, totalInputFileSize);
totalInputNumFiles = Math.min((long) (totalInputNumFiles * highestSamplePercentage / 100D)
, totalInputNumFiles);
}
inputSizeEstimated = true;
}
/**
* Find out if a job can be run in local mode based on it's characteristics
*
* @param conf Hive Configuration
* @param numReducers total number of reducers for this job
* @param inputLength the size of the input
* @param inputFileCount the number of files of input
* @return String null if job is eligible for local mode, reason otherwise
*/
public static String isEligibleForLocalMode(HiveConf conf,
int numReducers,
long inputLength,
long inputFileCount) {
long maxBytes = conf.getLongVar(HiveConf.ConfVars.LOCALMODEMAXBYTES);
long maxInputFiles = conf.getIntVar(HiveConf.ConfVars.LOCALMODEMAXINPUTFILES);
// check for max input size
if (inputLength > maxBytes) {
return "Input Size (= " + inputLength + ") is larger than " +
HiveConf.ConfVars.LOCALMODEMAXBYTES.varname + " (= " + maxBytes + ")";
}
// ideally we would like to do this check based on the number of splits
// in the absence of an easy way to get the number of splits - do this
// based on the total number of files (pessimistically assumming that
// splits are equal to number of files in worst case)
if (inputFileCount > maxInputFiles) {
return "Number of Input Files (= " + inputFileCount +
") is larger than " +
HiveConf.ConfVars.LOCALMODEMAXINPUTFILES.varname + "(= " + maxInputFiles + ")";
}
// since local mode only runs with 1 reducers - make sure that the
// the number of reducers (set by user or inferred) is <=1
if (numReducers > 1) {
return "Number of reducers (= " + numReducers + ") is more than 1";
}
return null;
}
@Override
public Operator extends OperatorDesc> getReducer() {
return getWork().getReducer();
}
@Override
public void shutdown() {
super.shutdown();
if (executor != null) {
executor.destroy();
executor = null;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy