org.apache.hadoop.hive.ql.parse.MapReduceCompiler Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.parse;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.ConditionalTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1;
import org.apache.hadoop.hive.ql.optimizer.GenMROperator;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink1;
import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink2;
import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink3;
import org.apache.hadoop.hive.ql.optimizer.GenMRTableScan1;
import org.apache.hadoop.hive.ql.optimizer.GenMRUnion1;
import org.apache.hadoop.hive.ql.optimizer.MapJoinFactory;
import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext;
import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalOptimizer;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.shims.ShimLoader;
public class MapReduceCompiler extends TaskCompiler {
protected final Log LOG = LogFactory.getLog(MapReduceCompiler.class);
public MapReduceCompiler() {
}
// loop over all the tasks recursively
@Override
protected void setInputFormat(Task extends Serializable> task) {
if (task instanceof ExecDriver) {
MapWork work = ((MapredWork) task.getWork()).getMapWork();
HashMap> opMap = work.getAliasToWork();
if (!opMap.isEmpty()) {
for (Operator extends OperatorDesc> op : opMap.values()) {
setInputFormat(work, op);
}
}
} else if (task instanceof ConditionalTask) {
List> listTasks
= ((ConditionalTask) task).getListTasks();
for (Task extends Serializable> tsk : listTasks) {
setInputFormat(tsk);
}
}
if (task.getChildTasks() != null) {
for (Task extends Serializable> childTask : task.getChildTasks()) {
setInputFormat(childTask);
}
}
}
private void setInputFormat(MapWork work, Operator extends OperatorDesc> op) {
if (op.isUseBucketizedHiveInputFormat()) {
work.setUseBucketizedHiveInputFormat(true);
return;
}
if (op.getChildOperators() != null) {
for (Operator extends OperatorDesc> childOp : op.getChildOperators()) {
setInputFormat(work, childOp);
}
}
}
// loop over all the tasks recursively
private void breakTaskTree(Task extends Serializable> task) {
if (task instanceof ExecDriver) {
HashMap> opMap = ((MapredWork) task
.getWork()).getMapWork().getAliasToWork();
if (!opMap.isEmpty()) {
for (Operator extends OperatorDesc> op : opMap.values()) {
breakOperatorTree(op);
}
}
} else if (task instanceof ConditionalTask) {
List> listTasks = ((ConditionalTask) task)
.getListTasks();
for (Task extends Serializable> tsk : listTasks) {
breakTaskTree(tsk);
}
}
if (task.getChildTasks() == null) {
return;
}
for (Task extends Serializable> childTask : task.getChildTasks()) {
breakTaskTree(childTask);
}
}
// loop over all the operators recursively
private void breakOperatorTree(Operator extends OperatorDesc> topOp) {
if (topOp instanceof ReduceSinkOperator) {
topOp.setChildOperators(null);
}
if (topOp.getChildOperators() == null) {
return;
}
for (Operator extends OperatorDesc> op : topOp.getChildOperators()) {
breakOperatorTree(op);
}
}
/**
* Make a best guess at trying to find the number of reducers
*/
private static int getNumberOfReducers(MapredWork mrwork, HiveConf conf) {
if (mrwork.getReduceWork() == null) {
return 0;
}
if (mrwork.getReduceWork().getNumReduceTasks() >= 0) {
return mrwork.getReduceWork().getNumReduceTasks();
}
return conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
}
@Override
protected void decideExecMode(List> rootTasks, Context ctx,
GlobalLimitCtx globalLimitCtx)
throws SemanticException {
// bypass for explain queries for now
if (ctx.getExplain()) {
return;
}
// user has told us to run in local mode or doesn't want auto-local mode
if (ctx.isLocalOnlyExecutionMode() ||
!conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {
return;
}
final Context lCtx = ctx;
PathFilter p = new PathFilter() {
public boolean accept(Path file) {
return !lCtx.isMRTmpFileURI(file.toUri().getPath());
}
};
List mrtasks = Utilities.getMRTasks(rootTasks);
// map-reduce jobs will be run locally based on data size
// first find out if any of the jobs needs to run non-locally
boolean hasNonLocalJob = false;
for (ExecDriver mrtask : mrtasks) {
try {
ContentSummary inputSummary = Utilities.getInputSummary
(ctx, mrtask.getWork().getMapWork(), p);
int numReducers = getNumberOfReducers(mrtask.getWork(), conf);
long estimatedInput;
if (globalLimitCtx != null && globalLimitCtx.isEnable()) {
// If the global limit optimization is triggered, we will
// estimate input data actually needed based on limit rows.
// estimated Input = (num_limit * max_size_per_row) * (estimated_map + 2)
//
long sizePerRow = HiveConf.getLongVar(conf,
HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
estimatedInput = globalLimitCtx.getGlobalLimit() * sizePerRow;
long minSplitSize = HiveConf.getLongVar(conf,
HiveConf.ConfVars.MAPREDMINSPLITSIZE);
long estimatedNumMap = inputSummary.getLength() / minSplitSize + 1;
estimatedInput = estimatedInput * (estimatedNumMap + 1);
} else {
estimatedInput = inputSummary.getLength();
}
if (LOG.isDebugEnabled()) {
LOG.debug("Task: " + mrtask.getId() + ", Summary: " +
inputSummary.getLength() + "," + inputSummary.getFileCount() + ","
+ numReducers + ", estimated Input: " + estimatedInput);
}
if (MapRedTask.isEligibleForLocalMode(conf, numReducers,
estimatedInput, inputSummary.getFileCount()) != null) {
hasNonLocalJob = true;
break;
} else {
mrtask.setLocalMode(true);
}
} catch (IOException e) {
throw new SemanticException(e);
}
}
if (!hasNonLocalJob) {
// Entire query can be run locally.
// Save the current tracker value and restore it when done.
ctx.setOriginalTracker(ShimLoader.getHadoopShims().getJobLauncherRpcAddress(conf));
ShimLoader.getHadoopShims().setJobLauncherRpcAddress(conf, "local");
console.printInfo("Automatically selecting local only mode for query");
}
}
@Override
protected void optimizeTaskPlan(List> rootTasks,
ParseContext pCtx, Context ctx) throws SemanticException {
// reduce sink does not have any kids - since the plan by now has been
// broken up into multiple
// tasks, iterate over all tasks.
// For each task, go over all operators recursively
for (Task extends Serializable> rootTask : rootTasks) {
breakTaskTree(rootTask);
}
PhysicalContext physicalContext = new PhysicalContext(conf,
getParseContext(pCtx, rootTasks), ctx, rootTasks, pCtx.getFetchTask());
PhysicalOptimizer physicalOptimizer = new PhysicalOptimizer(
physicalContext, conf);
physicalOptimizer.optimize();
}
@Override
protected void generateTaskTree(List> rootTasks, ParseContext pCtx,
List> mvTask, Set inputs, Set outputs) throws SemanticException {
// generate map reduce plans
ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
GenMRProcContext procCtx = new GenMRProcContext(
conf,
new HashMap, Task extends Serializable>>(),
tempParseContext, mvTask, rootTasks,
new LinkedHashMap, GenMapRedCtx>(),
inputs, outputs);
// create a walker which walks the tree in a DFS manner while maintaining
// the operator stack.
// The dispatcher generates the plan from the operator tree
Map opRules = new LinkedHashMap();
opRules.put(new RuleRegExp(new String("R1"),
TableScanOperator.getOperatorName() + "%"),
new GenMRTableScan1());
opRules.put(new RuleRegExp(new String("R2"),
TableScanOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"),
new GenMRRedSink1());
opRules.put(new RuleRegExp(new String("R3"),
ReduceSinkOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"),
new GenMRRedSink2());
opRules.put(new RuleRegExp(new String("R4"),
FileSinkOperator.getOperatorName() + "%"),
new GenMRFileSink1());
opRules.put(new RuleRegExp(new String("R5"),
UnionOperator.getOperatorName() + "%"),
new GenMRUnion1());
opRules.put(new RuleRegExp(new String("R6"),
UnionOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"),
new GenMRRedSink3());
opRules.put(new RuleRegExp(new String("R7"),
MapJoinOperator.getOperatorName() + "%"),
MapJoinFactory.getTableScanMapJoin());
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
Dispatcher disp = new DefaultRuleDispatcher(new GenMROperator(), opRules,
procCtx);
GraphWalker ogw = new GenMapRedWalker(disp);
ArrayList topNodes = new ArrayList();
topNodes.addAll(pCtx.getTopOps().values());
ogw.startWalking(topNodes, null);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy