/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.physical;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.ConditionalTask;
import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin;
import org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx;
import org.apache.hadoop.hive.ql.plan.ConditionalWork;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
/**
 * Iterates over each task. If the task has an SMB join, convert the task to a conditional task.
 * The conditional task will first try all mapjoin possibilities, and fall back to the SMB join
 * if the mapjoin fails. The SMB join will be a backup task for all the mapjoin tasks.
 */
public class SortMergeJoinTaskDispatcher extends AbstractJoinTaskDispatcher implements Dispatcher {
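// Illustrative sketch (not from the original comments): for a query such as
//   SELECT a.key, b.value FROM big_tbl a JOIN small_tbl b ON a.key = b.key
// over bucketed, sorted tables whose join was auto-converted to an SMB join, this
// dispatcher produces a conditional task of the form:
//   ConditionalTask
//     +-- MapRedTask (mapjoin, big table = a)  [backup: the original SMB task]
//     +-- MapRedTask (mapjoin, big table = b)  [backup: the original SMB task]
//     +-- MapRedTask (the original SMB join)
// ConditionalResolverCommonJoin picks one branch at runtime based on known input sizes;
// if a chosen mapjoin branch fails, its backup SMB task runs instead.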
public SortMergeJoinTaskDispatcher(PhysicalContext context) {
super(context);
}
// Convert the work in the SMB plan to a regular join.
// Note that the operator tree is not fixed; only the path/alias mappings in the
// plan are fixed. The operator tree will still contain the SMBMapJoinOperator.
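// Illustrative sketch with hypothetical aliases and paths (not from the original comments):
// before this call the SMB plan only scans the big table, with the small table held in
// the join's MapredLocalWork:
//   pathToAliases: {/warehouse/big -> [a]}   aliasToWork: {a -> TS[a]}
// afterwards the plan is remapped as if it were a regular join over all inputs:
//   pathToAliases: {/warehouse/big -> [a], /warehouse/small -> [b]}
//   aliasToWork:   {a -> TS[a], b -> TS[b]}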
private void genSMBJoinWork(MapWork currWork, SMBMapJoinOperator smbJoinOp) {
// Remove the paths which are not part of aliasToPartitionInfo
Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
List<String> removePaths = new ArrayList<String>();
for (Map.Entry<String, ArrayList<String>> entry : currWork.getPathToAliases().entrySet()) {
boolean keepPath = false;
for (String alias : entry.getValue()) {
if (aliasToPartitionInfo.containsKey(alias)) {
keepPath = true;
break;
}
}
// Remove if the path is not present
if (!keepPath) {
removePaths.add(entry.getKey());
}
}
List<String> removeAliases = new ArrayList<String>();
for (String removePath : removePaths) {
removeAliases.addAll(currWork.getPathToAliases().get(removePath));
currWork.getPathToAliases().remove(removePath);
currWork.getPathToPartitionInfo().remove(removePath);
}
for (String alias : removeAliases) {
currWork.getAliasToPartnInfo().remove(alias);
currWork.getAliasToWork().remove(alias);
}
// Get the MapredLocalWork
MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : localWork.getAliasToWork()
.entrySet()) {
String alias = entry.getKey();
Operator<? extends OperatorDesc> op = entry.getValue();
FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
// Add the entry in mapredwork
currWork.getAliasToWork().put(alias, op);
PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
if (fetchWork.getTblDir() != null) {
currWork.mergeAliasedInput(alias, fetchWork.getTblDir().toUri().toString(), partitionInfo);
} else {
for (Path pathDir : fetchWork.getPartDir()) {
currWork.mergeAliasedInput(alias, pathDir.toUri().toString(), partitionInfo);
}
}
}
// Remove the dummy store operator from the tree
for (Operator<? extends OperatorDesc> parentOp : smbJoinOp.getParentOperators()) {
if (parentOp instanceof DummyStoreOperator) {
Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
smbJoinOp.replaceParent(parentOp, grandParentOp);
grandParentOp.setChildOperators(parentOp.getChildOperators());
parentOp.setParentOperators(null);
parentOp.setChildOperators(null);
}
}
}
/*
 * Convert the work containing a sort-merge join into a work as if it had a regular join.
 * Note that the operator tree is not changed - it still contains the SMB join, but the
 * plan is changed (aliasToWork etc.) to contain all the paths as if it were a regular join.
 */
private MapredWork convertSMBWorkToJoinWork(MapredWork currWork, SMBMapJoinOperator oldSMBJoinOp)
throws SemanticException {
try {
// deep copy a new mapred work
MapredWork currJoinWork = Utilities.clonePlan(currWork);
SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
// change the newly created map-red plan as if it was a join operator
genSMBJoinWork(currJoinWork.getMapWork(), newSMBJoinOp);
return currJoinWork;
} catch (Exception e) {
e.printStackTrace();
throw new SemanticException("Generate Map Join Task Error: " + e.getMessage());
}
}
// create map join task and set big table as bigTablePosition
private MapRedTask convertSMBTaskToMapJoinTask(MapredWork origWork,
int bigTablePosition,
SMBMapJoinOperator smbJoinOp)
throws UnsupportedEncodingException, SemanticException {
// deep copy a new mapred work
MapredWork newWork = Utilities.clonePlan(origWork);
// create a mapred task for this work
MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext
.getParseContext().getConf());
// generate the map join operator; already checked the map join
MapJoinOperator newMapJoinOp =
getMapJoinOperator(newTask, newWork, smbJoinOp, bigTablePosition);
// The reducer needs to be restored - Consider a query like:
// select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
// The reducer contains a groupby, which needs to be restored.
ReduceWork rWork = newWork.getReduceWork();
// create the local work for this plan
MapJoinProcessor.genLocalWorkForMapJoin(newWork, newMapJoinOp, bigTablePosition);
// restore the reducer
newWork.setReduceWork(rWork);
return newTask;
}
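// Eligibility examples, illustrating the checks below (not from the original comments):
// - a user-hinted (non auto-converted) SMB join is never touched;
// - an SMB join feeding a ReduceSinkOperator (e.g. a downstream GROUP BY) is eligible,
//   since the sort/bucket properties are lost at the reducer boundary anyway;
// - an SMB join ending in a FileSinkOperator with removedReduceSinkBucketSort set is not
//   eligible, because the plan relies on the SMB join preserving sorted/bucketed output.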
private boolean isEligibleForOptimization(SMBMapJoinOperator originalSMBJoinOp) {
if (originalSMBJoinOp == null) {
return false;
}
// Only create a map-join if the user explicitly gave a join (without a mapjoin hint)
if (!originalSMBJoinOp.isConvertedAutomaticallySMBJoin()) {
return false;
}
Operator<? extends OperatorDesc> currOp = originalSMBJoinOp;
while (true) {
if ((currOp.getChildOperators() == null) || (currOp.getChildOperators().isEmpty())) {
if (currOp instanceof FileSinkOperator) {
FileSinkOperator fsOp = (FileSinkOperator)currOp;
// The query has enforced that a sort-merge join should be performed.
// For more details, look at 'removedReduceSinkBucketSort' in FileSinkDesc.java
return !fsOp.getConf().isRemovedReduceSinkBucketSort();
}
// If it contains a reducer, the optimization is always on.
// Since there exists a reducer, the sorting/bucketing properties due to the
// sort-merge join operator are lost anyway. So, the plan cannot be wrong by
// changing the sort-merge join to a map-join
if (currOp instanceof ReduceSinkOperator) {
return true;
}
return false;
}
if (currOp.getChildOperators().size() > 1) {
return true;
}
currOp = currOp.getChildOperators().get(0);
}
}
@Override
public Task<? extends Serializable> processCurrentTask(MapRedTask currTask,
ConditionalTask conditionalTask, Context context)
throws SemanticException {
// whether it contains a sort merge join operator
MapredWork currWork = currTask.getWork();
SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork);
if (!isEligibleForOptimization(originalSMBJoinOp)) {
return null;
}
currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN);
// get parseCtx for this Join Operator
ParseContext parseCtx = physicalContext.getParseContext();
// Convert the work containing a sort-merge join into a work as if it had a regular join.
// Note that the operator tree is not changed - it still contains the SMB join, but the
// plan is changed (aliasToWork etc.) to contain all the paths as if it were a regular join.
// This is used to convert the plan to a map-join, and then the original SMB join plan is used
// as a backup task.
MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp);
SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
currWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
currWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
currWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
currJoinWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
currJoinWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
currJoinWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
// create conditional work list and task list
List<Serializable> listWorks = new ArrayList<Serializable>();
List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
// create task to aliases mapping and alias to input file mapping for resolver
// Must be deterministic order map for consistent q-test output across Java versions
HashMap<Task<? extends Serializable>, Set<String>> taskToAliases =
new LinkedHashMap<Task<? extends Serializable>, Set<String>>();
// Note that pathToAlias will behave as if the original plan was a join plan
HashMap<String, ArrayList<String>> pathToAliases = currJoinWork.getMapWork().getPathToAliases();
// generate a map join task for the big table
SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
Byte[] order = originalSMBJoinDesc.getTagOrder();
int numAliases = order.length;
Set<Integer> bigTableCandidates =
MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
Configuration conf = context.getConf();
try {
long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currJoinWork.getMapWork(),
pathToAliases, aliasToSize);
long thresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf,
HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
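// Worked example with hypothetical sizes (assuming the default 25MB value of
// hive.mapjoin.smalltable.filesize): given aliases a=1GB, b=10MB and c=5MB, trying a as
// the big table leaves b+c = 15MB <= 25MB, so a mapjoin task is generated for a; trying
// b as the big table leaves a+c ~ 1GB > 25MB, so that position is skipped.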
for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) {
// this table cannot be big table
if (!bigTableCandidates.contains(bigTablePosition)) {
continue;
}
// create map join task for the given big table position
MapRedTask newTask = convertSMBTaskToMapJoinTask(
currJoinWork, bigTablePosition, newSMBJoinOp);
MapWork mapWork = newTask.getWork().getMapWork();
Operator<? extends OperatorDesc> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);
long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
if (aliasKnownSize > 0) {
long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
if (smallTblTotalKnownSize > thresholdOfSmallTblSizeSum) {
// this table is not good to be a big table.
continue;
}
}
// add into conditional task
listWorks.add(newTask.getWork());
listTasks.add(newTask);
newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
newTask.setFetchSource(currTask.isFetchSource());
// set up backup task
newTask.setBackupTask(currTask);
newTask.setBackupChildrenTasks(currTask.getChildTasks());
// put the mapping task to aliases
taskToAliases.put(newTask, aliases);
}
} catch (Exception e) {
e.printStackTrace();
throw new SemanticException("Generate Map Join Task Error: ", e);
}
// insert current common join task to conditional task
listWorks.add(currTask.getWork());
listTasks.add(currTask);
// clear JoinTree and OP Parse Context
currWork.getMapWork().setLeftInputJoin(false);
currWork.getMapWork().setBaseSrc(null);
currWork.getMapWork().setMapAliases(null);
// create conditional task and insert conditional task into task tree
ConditionalWork cndWork = new ConditionalWork(listWorks);
ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
cndTsk.setListTasks(listTasks);
// set resolver and resolver context
cndTsk.setResolver(new ConditionalResolverCommonJoin());
ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
resolverCtx.setPathToAliases(pathToAliases);
resolverCtx.setAliasToKnownSize(aliasToSize);
resolverCtx.setTaskToAliases(taskToAliases);
resolverCtx.setCommonJoinTask(currTask);
resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
cndTsk.setResolverCtx(resolverCtx);
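// Roughly speaking, at runtime the resolver re-checks input sizes using pathToAliases and
// aliasToKnownSize, picks the mapjoin task whose small tables fit under the threshold, and
// falls back to the common (SMB) join task set via setCommonJoinTask if none qualifies.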
// replace the current task with the new generated conditional task
replaceTaskWithConditionalTask(currTask, cndTsk);
return cndTsk;
}
/**
 * If a join/union is followed by an SMB join, this cannot be converted to a conditional task.
 */
private boolean reducerAllowedSMBJoinOp(Operator<? extends OperatorDesc> reducer) {
while (reducer != null) {
if (!reducer.opAllowedBeforeSortMergeJoin()) {
return false;
}
List<Operator<? extends OperatorDesc>> childOps = reducer.getChildOperators();
if ((childOps == null) || (childOps.isEmpty())) {
return true;
}
// multi-table inserts not supported
if (childOps.size() > 1) {
return false;
}
reducer = childOps.get(0);
}
return true;
}
private SMBMapJoinOperator getSMBMapJoinOp(Operator<? extends OperatorDesc> currOp,
Operator<? extends OperatorDesc> reducer) {
SMBMapJoinOperator ret = null;
while (true) {
if (currOp instanceof SMBMapJoinOperator) {
if (ret != null) {
return null;
}
ret = (SMBMapJoinOperator) currOp;
}
// Does any operator in the tree stop the task from being converted to a conditional task
if (!currOp.opAllowedBeforeSortMergeJoin()) {
return null;
}
List<Operator<? extends OperatorDesc>> childOps = currOp.getChildOperators();
if ((childOps == null) || (childOps.isEmpty())) {
return reducerAllowedSMBJoinOp(reducer) ? ret : null;
}
// multi-table inserts not supported
if (childOps.size() > 1) {
return null;
}
currOp = childOps.get(0);
}
}
private SMBMapJoinOperator getSMBMapJoinOp(MapredWork work) throws SemanticException {
if (work != null && work.getReduceWork() != null) {
Operator<? extends OperatorDesc> reducer = work.getReduceWork().getReducer();
for (Operator<? extends OperatorDesc> op : work.getMapWork().getAliasToWork().values()) {
SMBMapJoinOperator smbMapJoinOp = getSMBMapJoinOp(op, reducer);
if (smbMapJoinOp != null) {
return smbMapJoinOp;
}
}
}
return null;
}
private MapJoinOperator getMapJoinOperator(MapRedTask task,
MapredWork work,
SMBMapJoinOperator oldSMBJoinOp,
int mapJoinPos) throws SemanticException {
SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(task.getWork());
// generate the map join operator
return MapJoinProcessor.convertSMBJoinToMapJoin(
physicalContext.getConf(), newSMBJoinOp, mapJoinPos, true);
}
}