// org.apache.hadoop.hive.ql.optimizer.AbstractBucketJoinProc Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.QB;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
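/**
* Base class for node processors that perform the bucket map join optimization:
* it checks whether a map join can be converted to a bucket map join and
* performs the conversion on the map join descriptor.
*/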
abstract public class AbstractBucketJoinProc implements NodeProcessor {
// Logger named after this class.
private static final Log LOG =
LogFactory.getLog(AbstractBucketJoinProc.class.getName());
// Parse context consulted by the instance methods of this class for the known
// map join operators, the top operators and the pruned partitions.
protected ParseContext pGraphContext;
/**
 * Creates a processor bound to the given parse context.
 *
 * @param pGraphContext parse context stored in {@link #pGraphContext}
 */
public AbstractBucketJoinProc(ParseContext pGraphContext) {
this.pGraphContext = pGraphContext;
}
/**
 * Creates a processor without a parse context; {@link #pGraphContext} stays null.
 * NOTE(review): the instance methods dereference pGraphContext, so presumably
 * subclasses assign it before use -- confirm against the subclasses.
 */
public AbstractBucketJoinProc() {
}
/**
 * Node-processor entry point; left abstract so that each concrete bucket join
 * processor supplies its own handling.
 *
 * The stack is declared as {@code Stack<Node>}: a raw {@code Stack} parameter
 * would not be override-equivalent with implementations that declare
 * {@code Stack<Node>}.
 *
 * @param nd the node being processed
 * @param stack the stack of nodes passed in by the caller
 * @param procCtx the processor context
 * @param nodeOutputs additional outputs passed in by the caller
 * @return an implementation-defined result
 * @throws SemanticException if processing fails
 */
@Override
abstract public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
    Object... nodeOutputs) throws SemanticException;
/**
 * Lists the files found directly under a table or partition location, filtered
 * by {@code FileUtils.HIDDEN_FILES_PATH_FILTER}.
 *
 * @param location directory to list
 * @param pGraphContext parse context whose configuration is used to resolve the
 *          file system of {@code location}
 * @return the path of every listed file as a string, in the order reported by
 *         the file system; empty if the listing is null
 * @throws SemanticException wrapping any {@link IOException} raised while
 *           resolving the file system or listing the directory
 */
public static List<String> getBucketFilePathsOfPartition(
    Path location, ParseContext pGraphContext) throws SemanticException {
  List<String> fileNames = new ArrayList<String>();
  try {
    FileSystem fs = location.getFileSystem(pGraphContext.getConf());
    FileStatus[] files =
        fs.listStatus(new Path(location.toString()), FileUtils.HIDDEN_FILES_PATH_FILTER);
    if (files != null) {
      for (FileStatus file : files) {
        fileNames.add(file.getPath().toString());
      }
    }
  } catch (IOException e) {
    // Keep the I/O failure as the cause so the caller sees what went wrong.
    throw new SemanticException(e);
  }
  return fileNames;
}
/**
 * Checks that the join keys line up with the bucketing columns of one table or
 * partition, and that they do so in the same way as every table or partition
 * checked before it.
 *
 * For each join key the position of that key inside {@code bucketColumns} is
 * looked up (-1 when the key is not a bucketing column). The first call fills
 * {@code joinKeyOrders} with these positions; later calls sharing the same
 * array must yield exactly the same positions.
 *
 * @param bucketColumns bucketing columns of the table or partition
 * @param joinKeys join key column names for the same table
 * @param joinKeyOrders positions recorded so far (null entries mean "not yet
 *          recorded"); updated in place
 * @return true only if the positions are consistent with the recorded ones and
 *         every bucketing column is one of the join keys; false when either
 *         list is null or there are no bucketing columns
 */
private boolean checkBucketColumns(List<String> bucketColumns,
    List<String> joinKeys,
    Integer[] joinKeyOrders) {
  if (joinKeys == null || bucketColumns == null || bucketColumns.isEmpty()) {
    return false;
  }
  for (int i = 0; i < joinKeys.size(); i++) {
    int index = bucketColumns.indexOf(joinKeys.get(i));
    // Numeric comparison: the Integer is unboxed because 'index' is an int.
    if (joinKeyOrders[i] != null && joinKeyOrders[i] != index) {
      return false;
    }
    joinKeyOrders[i] = index;
  }
  // Check if the join columns contains all bucket columns.
  // If a table is bucketized on column B, but the join key is A and B,
  // it is easy to see joining on different buckets yield empty results.
  return joinKeys.containsAll(bucketColumns);
}
private boolean checkNumberOfBucketsAgainstBigTable(
Map> tblAliasToNumberOfBucketsInEachPartition,
int numberOfBucketsInPartitionOfBigTable) {
for (List bucketNums : tblAliasToNumberOfBucketsInEachPartition.values()) {
for (int nxt : bucketNums) {
boolean ok = (nxt >= numberOfBucketsInPartitionOfBigTable) ? nxt
% numberOfBucketsInPartitionOfBigTable == 0
: numberOfBucketsInPartitionOfBigTable % nxt == 0;
if (!ok) {
return false;
}
}
}
return true;
}
/**
 * Decides whether the given map join can be turned into a bucket map join.
 *
 * The join aliases are collected from the descriptor's left aliases followed by
 * its base sources, each qualified with the descriptor id through
 * {@code QB.getAppendedAliasFromId} and de-duplicated. A newly added alias that
 * is not one of the descriptor's map aliases becomes the big-table alias (the
 * last such alias wins). The decision itself is delegated to
 * {@link #checkConvertBucketMapJoin}, which also fills {@code context}.
 *
 * @param mapJoinOp the map join operator to examine
 * @param context context that receives the bucketing information when the
 *          conversion is possible
 * @return false if the operator is not among the parse context's map join
 *         operators, otherwise the result of {@code checkConvertBucketMapJoin}
 * @throws SemanticException propagated from {@code checkConvertBucketMapJoin}
 */
protected boolean canConvertMapJoinToBucketMapJoin(
    MapJoinOperator mapJoinOp,
    BucketJoinProcCtx context) throws SemanticException {
  if (!this.pGraphContext.getMapJoinOps().contains(mapJoinOp)) {
    return false;
  }

  MapJoinDesc joinDesc = mapJoinOp.getConf();
  List<String> mapAlias = joinDesc.getMapAliases();
  List<String> joinAliases = new ArrayList<String>();
  String baseBigAlias = null;

  // Left aliases are visited before the base sources; the order matters because
  // it fixes both the order of joinAliases and which alias ends up as the big one.
  String[][] aliasGroups = {joinDesc.getLeftAliases(), joinDesc.getBaseSrc()};
  for (String[] group : aliasGroups) {
    for (String s : group) {
      if (s == null) {
        continue;
      }
      String subQueryAlias = QB.getAppendedAliasFromId(joinDesc.getId(), s);
      if (joinAliases.contains(subQueryAlias)) {
        continue;
      }
      joinAliases.add(subQueryAlias);
      if (!mapAlias.contains(s)) {
        baseBigAlias = subQueryAlias;
      }
    }
  }

  Map<Byte, List<ExprNodeDesc>> keysMap = joinDesc.getKeys();
  return checkConvertBucketMapJoin(
      context,
      joinDesc.getAliasToOpInfo(),
      keysMap,
      baseBigAlias,
      joinAliases);
}
/*
* Can this mapjoin be converted to a bucketed mapjoin ?
* The following checks are performed:
* a. The join columns contains all the bucket columns.
* b. The join keys are not transformed in the sub-query.
* c. All partitions contain the expected number of files (number of buckets).
* d. The number of buckets in the big table can be divided by no of buckets in small tables.
*/
protected boolean checkConvertBucketMapJoin(
BucketJoinProcCtx context,
Map> aliasToOpInfo,
Map> keysMap,
String baseBigAlias,
List joinAliases) throws SemanticException {
LinkedHashMap> tblAliasToNumberOfBucketsInEachPartition =
new LinkedHashMap>();
LinkedHashMap>> tblAliasToBucketedFilePathsInEachPartition =
new LinkedHashMap>>();
HashMap> topOps = pGraphContext.getTopOps();
HashMap aliasToNewAliasMap = new HashMap();
// (partition to bucket file names) and (partition to bucket number) for
// the big table;
LinkedHashMap> bigTblPartsToBucketFileNames =
new LinkedHashMap>();
LinkedHashMap bigTblPartsToBucketNumber =
new LinkedHashMap();
Integer[] joinKeyOrder = null; // accessing order of join cols to bucket cols, should be same
boolean bigTablePartitioned = true;
for (int index = 0; index < joinAliases.size(); index++) {
String alias = joinAliases.get(index);
Operator extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
// The alias may not be present in case of a sub-query
if (topOp == null) {
return false;
}
List keys = toColumns(keysMap.get((byte) index));
if (keys == null || keys.isEmpty()) {
return false;
}
int oldKeySize = keys.size();
TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
if (tso == null) {
// We cannot get to root TableScan operator, likely because there is a join or group-by
// between topOp and root TableScan operator. We don't handle that case, and simply return
return false;
}
// For nested sub-queries, the alias mapping is not maintained in QB currently.
if (topOps.containsValue(tso)) {
for (Map.Entry> topOpEntry : topOps.entrySet()) {
if (topOpEntry.getValue() == tso) {
String newAlias = topOpEntry.getKey();
if (!newAlias.equals(alias)) {
joinAliases.set(index, newAlias);
if (baseBigAlias.equals(alias)) {
baseBigAlias = newAlias;
}
aliasToNewAliasMap.put(alias, newAlias);
alias = newAlias;
}
break;
}
}
}
else {
// Ideally, this should never happen, and this should be an assert.
return false;
}
// The join keys cannot be transformed in the sub-query currently.
// TableAccessAnalyzer.genRootTableScan will only return the base table scan
// if the join keys are constants or a column. Even a simple cast of the join keys
// will result in a null table scan operator. In case of constant join keys, they would
// be removed, and the size before and after the genRootTableScan will be different.
if (keys.size() != oldKeySize) {
return false;
}
if (joinKeyOrder == null) {
joinKeyOrder = new Integer[keys.size()];
}
Table tbl = tso.getConf().getTableMetadata();
if (tbl.isPartitioned()) {
PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
List partitions = prunedParts.getNotDeniedPartns();
// construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
if (partitions.isEmpty()) {
if (!alias.equals(baseBigAlias)) {
tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays. asList());
tblAliasToBucketedFilePathsInEachPartition.put(alias, new ArrayList>());
}
} else {
List buckets = new ArrayList();
List> files = new ArrayList>();
for (Partition p : partitions) {
if (!checkBucketColumns(p.getBucketCols(), keys, joinKeyOrder)) {
return false;
}
List fileNames =
getBucketFilePathsOfPartition(p.getDataLocation(), pGraphContext);
// The number of files for the table should be same as number of buckets.
int bucketCount = p.getBucketCount();
if (fileNames.size() != 0 && fileNames.size() != bucketCount) {
String msg = "The number of buckets for table " +
tbl.getTableName() + " partition " + p.getName() + " is " +
p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
throw new SemanticException(
ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
}
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(p, fileNames);
bigTblPartsToBucketNumber.put(p, bucketCount);
} else {
files.add(fileNames);
buckets.add(bucketCount);
}
}
if (!alias.equals(baseBigAlias)) {
tblAliasToNumberOfBucketsInEachPartition.put(alias, buckets);
tblAliasToBucketedFilePathsInEachPartition.put(alias, files);
}
}
} else {
if (!checkBucketColumns(tbl.getBucketCols(), keys, joinKeyOrder)) {
return false;
}
List fileNames =
getBucketFilePathsOfPartition(tbl.getDataLocation(), pGraphContext);
Integer num = new Integer(tbl.getNumBuckets());
// The number of files for the table should be same as number of buckets.
if (fileNames.size() != 0 && fileNames.size() != num) {
String msg = "The number of buckets for table " +
tbl.getTableName() + " is " + tbl.getNumBuckets() +
", whereas the number of files is " + fileNames.size();
throw new SemanticException(
ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
}
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(null, fileNames);
bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
bigTablePartitioned = false;
} else {
tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.asList(num));
tblAliasToBucketedFilePathsInEachPartition.put(alias, Arrays.asList(fileNames));
}
}
}
// All tables or partitions are bucketed, and their bucket number is
// stored in 'bucketNumbers', we need to check if the number of buckets in
// the big table can be divided by no of buckets in small tables.
for (Integer numBucketsInPartitionOfBigTable : bigTblPartsToBucketNumber.values()) {
if (!checkNumberOfBucketsAgainstBigTable(
tblAliasToNumberOfBucketsInEachPartition, numBucketsInPartitionOfBigTable)) {
return false;
}
}
context.setTblAliasToNumberOfBucketsInEachPartition(tblAliasToNumberOfBucketsInEachPartition);
context.setTblAliasToBucketedFilePathsInEachPartition(
tblAliasToBucketedFilePathsInEachPartition);
context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames);
context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber);
context.setJoinAliases(joinAliases);
context.setBaseBigAlias(baseBigAlias);
context.setBigTablePartitioned(bigTablePartitioned);
if (!aliasToNewAliasMap.isEmpty()) {
context.setAliasToNewAliasMap(aliasToNewAliasMap);
}
return true;
}
/*
* Convert mapjoin to a bucketed mapjoin.
* The operator tree is not changed, but the mapjoin descriptor in the big table is
* enhanced to keep the big table bucket -> small table buckets mapping.
*/
protected void convertMapJoinToBucketMapJoin(
MapJoinOperator mapJoinOp,
BucketJoinProcCtx context) throws SemanticException {
MapJoinDesc desc = mapJoinOp.getConf();
Map>> aliasBucketFileNameMapping =
new LinkedHashMap>>();
Map> tblAliasToNumberOfBucketsInEachPartition =
context.getTblAliasToNumberOfBucketsInEachPartition();
Map>> tblAliasToBucketedFilePathsInEachPartition =
context.getTblAliasToBucketedFilePathsInEachPartition();
Map> bigTblPartsToBucketFileNames =
context.getBigTblPartsToBucketFileNames();
Map bigTblPartsToBucketNumber =
context.getBigTblPartsToBucketNumber();
List joinAliases = context.getJoinAliases();
String baseBigAlias = context.getBaseBigAlias();
// sort bucket names for the big table
for (List partBucketNames : bigTblPartsToBucketFileNames.values()) {
Collections.sort(partBucketNames);
}
// go through all small tables and get the mapping from bucket file name
// in the big table to bucket file names in small tables.
for (int j = 0; j < joinAliases.size(); j++) {
String alias = joinAliases.get(j);
if (alias.equals(baseBigAlias)) {
continue;
}
for (List names : tblAliasToBucketedFilePathsInEachPartition.get(alias)) {
Collections.sort(names);
}
List smallTblBucketNums = tblAliasToNumberOfBucketsInEachPartition.get(alias);
List> smallTblFilesList = tblAliasToBucketedFilePathsInEachPartition.get(alias);
Map> mappingBigTableBucketFileNameToSmallTableBucketFileNames =
new LinkedHashMap>();
aliasBucketFileNameMapping.put(alias,
mappingBigTableBucketFileNameToSmallTableBucketFileNames);
// for each bucket file in big table, get the corresponding bucket file
// name in the small table.
// more than 1 partition in the big table, do the mapping for each partition
Iterator>> bigTblPartToBucketNames =
bigTblPartsToBucketFileNames.entrySet().iterator();
Iterator> bigTblPartToBucketNum = bigTblPartsToBucketNumber
.entrySet().iterator();
while (bigTblPartToBucketNames.hasNext()) {
assert bigTblPartToBucketNum.hasNext();
int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
List bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
fillMappingBigTableBucketFileNameToSmallTableBucketFileNames(smallTblBucketNums,
smallTblFilesList,
mappingBigTableBucketFileNameToSmallTableBucketFileNames, bigTblBucketNum,
bigTblBucketNameList,
desc.getBigTableBucketNumMapping());
}
}
desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
desc.setBigTableAlias(baseBigAlias);
boolean bigTablePartitioned = context.isBigTablePartitioned();
if (bigTablePartitioned) {
desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
}
Map> posToAliasMap = mapJoinOp.getPosToAliasMap();
Map aliasToNewAliasMap = context.getAliasToNewAliasMap();
if (aliasToNewAliasMap != null && posToAliasMap != null) {
for (Map.Entry entry: aliasToNewAliasMap.entrySet()) {
for (Set aliases: posToAliasMap.values()) {
if (aliases.remove(entry.getKey())) {
aliases.add(entry.getValue());
}
}
}
}
// successfully convert to bucket map join
desc.setBucketMapJoin(true);
}
// convert partition to partition spec string
private Map> convert(Map> mapping) {
Map> converted = new HashMap>();
for (Map.Entry> entry : mapping.entrySet()) {
converted.put(entry.getKey().getName(), entry.getValue());
}
return converted;
}
/**
 * Translates join key expressions into column names.
 *
 * A column expression contributes its column name. A constant expression
 * contributes the column it was folded from, if it records one. Any other
 * expression makes the whole translation fail.
 *
 * @param keys join key expressions; may be null
 * @return the column names in key order, or null if {@code keys} is null, if a
 *         key is neither a column nor a constant, or if a constant has no
 *         folded-from column
 */
public static List<String> toColumns(List<ExprNodeDesc> keys) {
  // Callers already treat a null result as "cannot convert"; a missing key list
  // is reported the same way instead of failing with a NullPointerException.
  if (keys == null) {
    return null;
  }
  List<String> columns = new ArrayList<String>(keys.size());
  for (ExprNodeDesc key : keys) {
    if (key instanceof ExprNodeColumnDesc) {
      columns.add(((ExprNodeColumnDesc) key).getColumn());
    } else if (key instanceof ExprNodeConstantDesc) {
      String colName = ((ExprNodeConstantDesc) key).getFoldedFromCol();
      if (colName == null) {
        return null;
      }
      columns.add(colName);
    } else {
      return null;
    }
  }
  return columns;
}
// called for each partition of big table and populates mapping for each file in the partition
private void fillMappingBigTableBucketFileNameToSmallTableBucketFileNames(
List smallTblBucketNums,
List> smallTblFilesList,
Map> bigTableBucketFileNameToSmallTableBucketFileNames,
int bigTblBucketNum, List bigTblBucketNameList,
Map bucketFileNameMapping) {
for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) {
ArrayList resultFileNames = new ArrayList();
for (int sindex = 0; sindex < smallTblBucketNums.size(); sindex++) {
int smallTblBucketNum = smallTblBucketNums.get(sindex);
List smallTblFileNames = smallTblFilesList.get(sindex);
if (bigTblBucketNum >= smallTblBucketNum) {
// if the big table has more buckets than the current small table,
// use "MOD" to get small table bucket names. For example, if the big
// table has 4 buckets and the small table has 2 buckets, then the
// mapping should be 0->0, 1->1, 2->0, 3->1.
int toAddSmallIndex = bindex % smallTblBucketNum;
resultFileNames.add(smallTblFileNames.get(toAddSmallIndex));
} else {
int jump = smallTblBucketNum / bigTblBucketNum;
for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) {
resultFileNames.add(smallTblFileNames.get(i));
}
}
}
String inputBigTBLBucket = bigTblBucketNameList.get(bindex);
bigTableBucketFileNameToSmallTableBucketFileNames.put(inputBigTBLBucket, resultFileNames);
bucketFileNameMapping.put(inputBigTBLBucket, bindex);
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy