/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.parse;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.ColumnStatsTask;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.StatsTask;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc;
import org.apache.hadoop.hive.ql.plan.ColumnStatsWork;
import org.apache.hadoop.hive.ql.plan.CreateTableDesc;
import org.apache.hadoop.hive.ql.plan.DDLWork;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import com.google.common.collect.Interner;
import com.google.common.collect.Interners;
/**
 * TaskCompiler is the base class for classes that compile
* operator pipelines into tasks.
*/
public abstract class TaskCompiler {
protected final Log LOG = LogFactory.getLog(TaskCompiler.class);
protected Hive db;
protected LogHelper console;
protected HiveConf conf;
public void init(HiveConf conf, LogHelper console, Hive db) {
this.conf = conf;
this.db = db;
this.console = console;
}
@SuppressWarnings({"nls", "unchecked"})
  public void compile(final ParseContext pCtx, final List<Task<? extends Serializable>> rootTasks,
      final HashSet<ReadEntity> inputs, final HashSet<WriteEntity> outputs) throws SemanticException {
Context ctx = pCtx.getContext();
GlobalLimitCtx globalLimitCtx = pCtx.getGlobalLimitCtx();
QB qb = pCtx.getQB();
    List<Task<MoveWork>> mvTask = new ArrayList<Task<MoveWork>>();
    List<LoadTableDesc> loadTableWork = pCtx.getLoadTableWork();
    List<LoadFileDesc> loadFileWork = pCtx.getLoadFileWork();
boolean isCStats = qb.isAnalyzeRewrite();
if (pCtx.getFetchTask() != null) {
return;
}
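    // Give the concrete compiler a chance to optimize the operator plan before any tasks are generated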
optimizeOperatorPlan(pCtx, inputs, outputs);
/*
* In case of a select, use a fetch task instead of a move task.
* If the select is from analyze table column rewrite, don't create a fetch task. Instead create
* a column stats task later.
*/
if (pCtx.getQB().getIsQuery() && !isCStats) {
if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg());
}
LoadFileDesc loadFileDesc = loadFileWork.get(0);
String cols = loadFileDesc.getColumns();
String colTypes = loadFileDesc.getColumnTypes();
TableDesc resultTab = pCtx.getFetchTabledesc();
if (resultTab == null) {
String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat);
}
FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(),
resultTab, qb.getParseInfo().getOuterQueryLimit());
fetch.setSource(pCtx.getFetchSource());
fetch.setSink(pCtx.getFetchSink());
pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch, conf));
// For the FetchTask, the limit optimization requires we fetch all the rows
// in memory and count how many rows we get. It's not practical if the
// limit factor is too big
int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit
+ ". Doesn't qualify limit optimiztion.");
globalLimitCtx.disableOpt();
}
if (qb.getParseInfo().getOuterQueryLimit() == 0) {
        // Believe it or not, some tools do generate queries with limit 0 and then expect
        // the query to run quickly. Let's meet their requirement.
LOG.info("Limit 0. No query execution needed.");
return;
}
} else if (!isCStats) {
for (LoadTableDesc ltd : loadTableWork) {
        Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf);
mvTask.add(tsk);
        // Check to see if we are staling any indexes and auto-update them if configured to do so
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, inputs, conf);
try {
            List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater
                .generateUpdateTasks();
            for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
tsk.addDependentTask(updateTask);
}
} catch (HiveException e) {
console
.printInfo("WARNING: could not auto-update stale indexes, which are not in sync");
}
}
}
boolean oneLoadFile = true;
for (LoadFileDesc lfd : loadFileWork) {
if (qb.isCTAS()) {
assert (oneLoadFile); // should not have more than 1 load file for
// CTAS
// make the movetask's destination directory the table's destination.
Path location;
String loc = qb.getTableDesc().getLocation();
if (loc == null) {
// get the table's default location
Path targetPath;
try {
String[] names = Utilities.getDbTableName(qb.getTableDesc().getTableName());
if (!db.databaseExists(names[0])) {
throw new SemanticException("ERROR: The database " + names[0]
+ " does not exist.");
}
Warehouse wh = new Warehouse(conf);
targetPath = wh.getTablePath(db.getDatabase(names[0]), names[1]);
} catch (HiveException e) {
throw new SemanticException(e);
} catch (MetaException e) {
throw new SemanticException(e);
}
location = targetPath;
} else {
location = new Path(loc);
}
lfd.setTargetDir(location);
oneLoadFile = false;
}
mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false), conf));
}
}
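    // Hand the operator tree to the concrete compiler to generate the executable task tree,
    // chaining in the move tasks created above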
generateTaskTree(rootTasks, pCtx, mvTask, inputs, outputs);
/*
* If the query was the result of analyze table column compute statistics rewrite, create
* a column stats task instead of a fetch task to persist stats to the metastore.
*/
if (isCStats) {
genColumnStatsTask(qb, loadTableWork, loadFileWork, rootTasks);
}
// For each task, set the key descriptor for the reducer
    for (Task<? extends Serializable> rootTask : rootTasks) {
GenMapRedUtils.setKeyAndValueDescForTaskTree(rootTask);
}
    // If a task contains an operator which requires BucketizedHiveInputFormat,
    // set that input format on the task
    for (Task<? extends Serializable> rootTask : rootTasks) {
setInputFormat(rootTask);
}
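    // Let the concrete compiler optimize the generated task plan, then decide whether
    // tasks can be run in local mode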
optimizeTaskPlan(rootTasks, pCtx, ctx);
decideExecMode(rootTasks, ctx, globalLimitCtx);
if (qb.isCTAS()) {
// generate a DDL task and make it a dependent task of the leaf
CreateTableDesc crtTblDesc = qb.getTableDesc();
crtTblDesc.validate(conf);
// clear the mapredWork output file from outputs for CTAS
// DDLWork at the tail of the chain will have the output
      Iterator<WriteEntity> outIter = outputs.iterator();
while (outIter.hasNext()) {
switch (outIter.next().getType()) {
case DFS_DIR:
case LOCAL_DIR:
outIter.remove();
break;
default:
break;
}
}
      Task<? extends Serializable> crtTblTask = TaskFactory.get(new DDLWork(
          inputs, outputs, crtTblDesc), conf);
// find all leaf tasks and make the DDLTask as a dependent task of all of
// them
      HashSet<Task<? extends Serializable>> leaves = new LinkedHashSet<Task<? extends Serializable>>();
getLeafTasks(rootTasks, leaves);
assert (leaves.size() > 0);
      for (Task<? extends Serializable> task : leaves) {
if (task instanceof StatsTask) {
          // StatsTask requires the table to already exist
          for (Task<? extends Serializable> parentOfStatsTask : task.getParentTasks()) {
parentOfStatsTask.addDependentTask(crtTblTask);
}
          for (Task<? extends Serializable> parentOfCrtTblTask : crtTblTask.getParentTasks()) {
parentOfCrtTblTask.removeDependentTask(task);
}
crtTblTask.addDependentTask(task);
} else {
task.addDependentTask(crtTblTask);
}
}
}
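    // With the global limit optimization enabled, enforce a minimum row count on the fetch/limit
    // and mark the MR and Spark tasks so the command can be retried if that check fails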
if (globalLimitCtx.isEnable() && pCtx.getFetchTask() != null) {
LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
pCtx.getFetchTask().getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
}
if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());
      List<ExecDriver> mrTasks = Utilities.getMRTasks(rootTasks);
for (ExecDriver tsk : mrTasks) {
tsk.setRetryCmdWhenFail(true);
}
      List<SparkTask> sparkTasks = Utilities.getSparkTasks(rootTasks);
for (SparkTask sparkTask : sparkTasks) {
sparkTask.setRetryCmdWhenFail(true);
}
}
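    // Intern the table descriptors referenced by the tasks so identical descriptors share one instance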
    Interner<TableDesc> interner = Interners.newStrongInterner();
    for (Task<? extends Serializable> rootTask : rootTasks) {
GenMapRedUtils.internTableDesc(rootTask, interner);
}
}
/**
   * A helper function to generate a column stats task on top of a map-red task. The column stats
   * task fetches from the output of the map-red task, constructs the column stats object, and
   * persists it to the metastore.
   *
   * This method generates a plan with a column stats task on top of the map-red task and sets up
   * the appropriate metadata to be used during execution.
*
* @param qb
*/
@SuppressWarnings("unchecked")
  protected void genColumnStatsTask(QB qb, List<LoadTableDesc> loadTableWork,
      List<LoadFileDesc> loadFileWork, List<Task<? extends Serializable>> rootTasks) {
QBParseInfo qbParseInfo = qb.getParseInfo();
ColumnStatsTask cStatsTask = null;
ColumnStatsWork cStatsWork = null;
FetchWork fetch = null;
String tableName = qbParseInfo.getTableName();
    List<String> colName = qbParseInfo.getColName();
    List<String> colType = qbParseInfo.getColType();
boolean isTblLevel = qbParseInfo.isTblLvl();
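    // Build a FetchWork over the output of the rewritten analyze query; the ColumnStatsTask
    // reads those rows and persists the column statistics to the metastore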
String cols = loadFileWork.get(0).getColumns();
String colTypes = loadFileWork.get(0).getColumnTypes();
String resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
TableDesc resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat);
fetch = new FetchWork(loadFileWork.get(0).getSourcePath(),
resultTab, qb.getParseInfo().getOuterQueryLimit());
ColumnStatsDesc cStatsDesc = new ColumnStatsDesc(tableName,
colName, colType, isTblLevel);
cStatsWork = new ColumnStatsWork(fetch, cStatsDesc);
cStatsTask = (ColumnStatsTask) TaskFactory.get(cStatsWork, conf);
rootTasks.add(cStatsTask);
}
/**
* Find all leaf tasks of the list of root tasks.
*/
  protected void getLeafTasks(List<Task<? extends Serializable>> rootTasks,
      HashSet<Task<? extends Serializable>> leaves) {
    for (Task<? extends Serializable> root : rootTasks) {
getLeafTasks(root, leaves);
}
}
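  /**
   * A task with no dependent tasks is a leaf; otherwise recurse into its dependents.
   */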
  private void getLeafTasks(Task<? extends Serializable> task,
      HashSet<Task<? extends Serializable>> leaves) {
if (task.getDependentTasks() == null) {
if (!leaves.contains(task)) {
leaves.add(task);
}
} else {
getLeafTasks(task.getDependentTasks(), leaves);
}
}
/*
* Called to transform tasks into local tasks where possible/desirable
*/
  protected abstract void decideExecMode(List<Task<? extends Serializable>> rootTasks, Context ctx,
      GlobalLimitCtx globalLimitCtx) throws SemanticException;
/*
* Called at the beginning of the compile phase to have another chance to optimize the operator plan
*/
  protected void optimizeOperatorPlan(ParseContext pCtxSet, Set<ReadEntity> inputs,
      Set<WriteEntity> outputs) throws SemanticException {
}
/*
* Called after the tasks have been generated to run another round of optimization
*/
  protected abstract void optimizeTaskPlan(List<Task<? extends Serializable>> rootTasks,
      ParseContext pCtx, Context ctx) throws SemanticException;
/*
* Called to set the appropriate input format for tasks
*/
  protected abstract void setInputFormat(Task<? extends Serializable> rootTask);
/*
   * Called to generate the task tree from the parse context/operator tree
*/
  protected abstract void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
      List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException;
/**
* Create a clone of the parse context
*/
  public ParseContext getParseContext(ParseContext pCtx, List<Task<? extends Serializable>> rootTasks) {
ParseContext clone = new ParseContext(conf,
pCtx.getQB(), pCtx.getParseTree(),
pCtx.getOpToPartPruner(), pCtx.getOpToPartList(), pCtx.getTopOps(),
pCtx.getOpParseCtx(), pCtx.getJoinOps(), pCtx.getSmbMapJoinOps(),
pCtx.getLoadTableWork(), pCtx.getLoadFileWork(), pCtx.getContext(),
pCtx.getIdToTableNameMap(), pCtx.getDestTableId(), pCtx.getUCtx(),
pCtx.getListMapJoinOpsNoReducer(), pCtx.getGroupOpToInputTables(),
pCtx.getPrunedPartitions(), pCtx.getOpToSamplePruner(), pCtx.getGlobalLimitCtx(),
pCtx.getNameToSplitSample(), pCtx.getSemanticInputs(), rootTasks,
pCtx.getOpToPartToSkewedPruner(), pCtx.getViewAliasToInput(),
pCtx.getReduceSinkOperatorsAddedByEnforceBucketingSorting(),
pCtx.getQueryProperties());
clone.setFetchTask(pCtx.getFetchTask());
clone.setLineageInfo(pCtx.getLineageInfo());
clone.setMapJoinOps(pCtx.getMapJoinOps());
return clone;
}
}