Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveGBOpConvUtil Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.calcite.translator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.AggregateCall;
import org.apache.calcite.rex.RexInputRef;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import com.google.common.collect.ImmutableList;
/**
* TODO:
* 1. Change the output col/ExprNodeColumn names to external names.
* 2. Verify if we need to use the "KEY."/"VALUE." in RS cols; switch to
* external names if possible.
* 3. In ExprNode & in ColumnInfo the tableAlias/VirtualColumn is specified
* differently for different GB/RS in pipeline. Remove the different treatments.
* 4. VirtualColMap needs to be maintained
*
*/
public class HiveGBOpConvUtil {
// Physical execution strategy for the GROUP BY pipeline. Chosen in
// getAggOPMode() from the map-side-aggregation flag, the group-by skew flag,
// and whether grouping sets exceed the additional-MR-job threshold.
private static enum HIVEGBPHYSICALMODE {
MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB, MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB, MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT, MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT, NO_MAP_SIDE_GB_NO_SKEW, NO_MAP_SIDE_GB_SKEW
};
/**
 * Attributes of a single UDAF (aggregate call) collected during translation.
 * Raw collection types restored to their generic forms.
 */
private static class UDAFAttrs {
  // True if the UDAF was invoked with DISTINCT.
  private boolean isDistinctUDAF;
  private String udafName;
  private GenericUDAFEvaluator udafEvaluator;
  // Argument expressions of the UDAF, in call order.
  private final ArrayList<ExprNodeDesc> udafParams = new ArrayList<ExprNodeDesc>();
  // For distinct UDAFs: index of each parameter within GBInfo.distExprNodes.
  private List<Integer> udafParamsIndxInGBInfoDistExprs = new ArrayList<Integer>();
};
private static class GBInfo {
private final List outputColNames = new ArrayList();
private final List gbKeyColNamesInInput = new ArrayList();
private final List gbKeyTypes = new ArrayList();
private final List gbKeys = new ArrayList();
private final List grpSets = new ArrayList();
private boolean grpSetRqrAdditionalMRJob;
private boolean grpIdFunctionNeeded;
private final List distExprNames = new ArrayList();
private final List distExprTypes = new ArrayList();
private final List distExprNodes = new ArrayList();
private final List> distColIndices = new ArrayList>();
private final List deDupedNonDistIrefs = new ArrayList();
private final List udafAttrs = new ArrayList();
private boolean containsDistinctAggr = false;
float groupByMemoryUsage;
float memoryThreshold;
private HIVEGBPHYSICALMODE gbPhysicalPipelineMode;
};
/**
 * Picks the physical GB pipeline shape from the map-side-aggregation and
 * group-by-skew settings plus the collected GB info.
 */
private static HIVEGBPHYSICALMODE getAggOPMode(HiveConf hc, GBInfo gbInfo) {
  final boolean mapSideAgg = hc.getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE);
  final boolean skewed = hc.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW);
  if (!mapSideAgg) {
    // No map-side GB: only the skew flag matters.
    return skewed ? HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_SKEW
        : HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW;
  }
  if (!skewed) {
    // Map-side GB without skew: grouping-set cardinality decides whether an
    // extra MR job is required.
    return gbInfo.grpSetRqrAdditionalMRJob ? HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB
        : HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB;
  }
  // Map-side GB with skew: distinct aggregates or GB keys force the
  // keys/dist-UDAF variant.
  return (gbInfo.containsDistinctAggr || !gbInfo.gbKeys.isEmpty())
      ? HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT
      : HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT;
}
// For each of the GB op in the logical GB this should be called seperately;
// otherwise GBevaluator and expr nodes may get shared among multiple GB ops
private static GBInfo getGBInfo(HiveAggregate aggRel, OpAttr inputOpAf, HiveConf hc) throws SemanticException {
  GBInfo gbInfo = new GBInfo();

  // 0. Collect AggRel output col Names
  gbInfo.outputColNames.addAll(aggRel.getRowType().getFieldNames());

  // 1. Collect GB Keys
  RelNode aggInputRel = aggRel.getInput();
  ExprNodeConverter exprConv = new ExprNodeConverter(inputOpAf.tabAlias,
      aggInputRel.getRowType(), new HashSet<Integer>(), aggRel.getCluster().getTypeFactory());
  ExprNodeDesc tmpExprNodeDesc;
  for (int i : aggRel.getGroupSet()) {
    RexInputRef iRef = new RexInputRef(i, aggInputRel.getRowType().getFieldList()
        .get(i).getType());
    tmpExprNodeDesc = iRef.accept(exprConv);
    gbInfo.gbKeys.add(tmpExprNodeDesc);
    gbInfo.gbKeyColNamesInInput.add(aggInputRel.getRowType().getFieldNames().get(i));
    gbInfo.gbKeyTypes.add(tmpExprNodeDesc.getTypeInfo());
  }

  // 2. Collect Grouping Set info
  if (aggRel.indicator) {
    // 2.1 Translate each grouping set bitset into an int bitmap
    ImmutableList<ImmutableBitSet> lstGrpSet = aggRel.getGroupSets();
    int bitmap = 0;
    for (ImmutableBitSet grpSet : lstGrpSet) {
      bitmap = 0;
      for (Integer bitIdx : grpSet.asList()) {
        bitmap = SemanticAnalyzer.setBit(bitmap, bitIdx);
      }
      gbInfo.grpSets.add(bitmap);
    }
    Collections.sort(gbInfo.grpSets);

    // 2.2 Check if grouping sets require an additional MR Job
    gbInfo.grpSetRqrAdditionalMRJob = gbInfo.grpSets.size() > hc
        .getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);

    // 2.3 Check if GROUPING_ID needs to be projected out (when present it is
    // always the last aggregate call)
    if (!aggRel.getAggCallList().isEmpty()
        && (aggRel.getAggCallList().get(aggRel.getAggCallList().size() - 1).getAggregation() == HiveGroupingID.INSTANCE)) {
      gbInfo.grpIdFunctionNeeded = true;
    }
  }

  // 3. Walk through UDAF & Collect Distinct Info. Each distinct input ref is
  // recorded once; distParamInRefsToOutputPos maps input ref -> position in
  // gbInfo.distExprNodes.
  Set<Integer> distinctRefs = new HashSet<Integer>();
  Map<Integer, Integer> distParamInRefsToOutputPos = new HashMap<Integer, Integer>();
  for (AggregateCall aggCall : aggRel.getAggCallList()) {
    if ((aggCall.getAggregation() == HiveGroupingID.INSTANCE) || !aggCall.isDistinct()) {
      continue;
    }
    List<Integer> argLst = new ArrayList<Integer>(aggCall.getArgList());
    List<String> argNames = HiveCalciteUtil.getFieldNames(argLst, aggInputRel);
    ExprNodeDesc distinctExpr;
    for (int i = 0; i < argLst.size(); i++) {
      if (!distinctRefs.contains(argLst.get(i))) {
        distinctRefs.add(argLst.get(i));
        distParamInRefsToOutputPos.put(argLst.get(i), gbInfo.distExprNodes.size());
        distinctExpr = HiveCalciteUtil.getExprNode(argLst.get(i), aggInputRel, exprConv);
        gbInfo.distExprNodes.add(distinctExpr);
        gbInfo.distExprNames.add(argNames.get(i));
        gbInfo.distExprTypes.add(distinctExpr.getTypeInfo());
      }
    }
  }

  // 4. Walk through UDAF & Collect UDAF Info
  Set<Integer> deDupedNonDistIrefsSet = new HashSet<Integer>();
  for (AggregateCall aggCall : aggRel.getAggCallList()) {
    if (aggCall.getAggregation() == HiveGroupingID.INSTANCE) {
      continue;
    }
    UDAFAttrs udafAttrs = new UDAFAttrs();
    List<ExprNodeDesc> argExps = HiveCalciteUtil.getExprNodes(aggCall.getArgList(), aggInputRel,
        inputOpAf.tabAlias);
    udafAttrs.udafParams.addAll(argExps);
    udafAttrs.udafName = aggCall.getAggregation().getName();
    udafAttrs.isDistinctUDAF = aggCall.isDistinct();
    List<Integer> argLst = new ArrayList<Integer>(aggCall.getArgList());
    List<Integer> distColIndicesOfUDAF = new ArrayList<Integer>();
    List<Integer> distUDAFParamsIndxInDistExprs = new ArrayList<Integer>();
    for (int i = 0; i < argLst.size(); i++) {
      // NOTE: distinct expr can be part of the GB key
      if (udafAttrs.isDistinctUDAF) {
        ExprNodeDesc argExpr = argExps.get(i);
        Integer found = ExprNodeDescUtils.indexOf(argExpr, gbInfo.gbKeys);
        // If the distinct param is also a GB key, reuse the key position;
        // otherwise point past the GB keys (+1 if a grouping-set col exists).
        distColIndicesOfUDAF.add(found < 0 ? distParamInRefsToOutputPos.get(argLst.get(i)) + gbInfo.gbKeys.size() +
            (gbInfo.grpSets.size() > 0 ? 1 : 0) : found);
        distUDAFParamsIndxInDistExprs.add(distParamInRefsToOutputPos.get(argLst.get(i)));
      } else {
        // TODO: this seems wrong (following what Hive Regular does)
        if (!distParamInRefsToOutputPos.containsKey(argLst.get(i))
            && !deDupedNonDistIrefsSet.contains(argLst.get(i))) {
          // BUG FIX: de-dup on the input ref (argLst.get(i)), not the loop
          // index i; the contains() check above already keys on the ref, so
          // adding i made the de-duplication ineffective.
          deDupedNonDistIrefsSet.add(argLst.get(i));
          gbInfo.deDupedNonDistIrefs.add(udafAttrs.udafParams.get(i));
        }
      }
    }
    if (udafAttrs.isDistinctUDAF) {
      gbInfo.containsDistinctAggr = true;
      udafAttrs.udafParamsIndxInGBInfoDistExprs = distUDAFParamsIndxInDistExprs;
      gbInfo.distColIndices.add(distColIndicesOfUDAF);
    }
    // special handling for count, similar to PlanModifierForASTConv::replaceEmptyGroupAggr()
    udafAttrs.udafEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator(udafAttrs.udafName,
        new ArrayList<ExprNodeDesc>(udafAttrs.udafParams), new ASTNode(),
        udafAttrs.isDistinctUDAF, udafAttrs.udafParams.size() == 0 &&
        "count".equalsIgnoreCase(udafAttrs.udafName) ? true : false);
    gbInfo.udafAttrs.add(udafAttrs);
  }

  // 5. Gather GB Memory threshold
  gbInfo.groupByMemoryUsage = HiveConf.getFloatVar(hc, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
  gbInfo.memoryThreshold = HiveConf.getFloatVar(hc, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);

  // 6. Gather GB Physical pipeline (based on user config & Grouping Sets size)
  gbInfo.gbPhysicalPipelineMode = getAggOPMode(hc, gbInfo);

  return gbInfo;
}
/**
 * Entry point: translates a HiveAggregate into the Hive GB operator pipeline
 * dictated by the physical mode derived in getGBInfo().
 */
static OpAttr translateGB(OpAttr inputOpAf, HiveAggregate aggRel, HiveConf hc)
    throws SemanticException {
  GBInfo gbInfo = getGBInfo(aggRel, inputOpAf, hc);
  switch (gbInfo.gbPhysicalPipelineMode) {
  case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB:
    return genMapSideGBNoSkewNoAddMRJob(inputOpAf, aggRel, gbInfo);
  case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
    return genMapSideGBNoSkewAddMRJob(inputOpAf, aggRel, gbInfo);
  case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
    return genMapSideGBSkewGBKeysOrDistUDAFPresent(inputOpAf, aggRel, gbInfo);
  case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT:
    return genMapSideGBSkewGBKeysAndDistUDAFNotPresent(inputOpAf, aggRel, gbInfo);
  case NO_MAP_SIDE_GB_NO_SKEW:
    return genNoMapSideGBNoSkew(inputOpAf, aggRel, gbInfo);
  case NO_MAP_SIDE_GB_SKEW:
    return genNoMapSideGBSkew(inputOpAf, aggRel, gbInfo);
  default:
    // Unreachable: the enum is exhaustively covered above.
    return null;
  }
}
/**
* GB-RS-GB1
*
* Construct GB-RS-GB Pipe line. User has enabled Map Side GB, specified no
* skew and Grp Set is below the threshold.
*
* @param inputOpAf
* @param aggRel
* @param gbInfo
* @return
* @throws SemanticException
*/
private static OpAttr genMapSideGBNoSkewNoAddMRJob(OpAttr inputOpAf, HiveAggregate aggRel,
    GBInfo gbInfo) throws SemanticException {
  // GB-RS-GB1: map-side GB, shuffle, then a final merge-partial GB on the
  // reduce side (map-side aggregation enabled, no skew, grouping sets below
  // the additional-MR-job threshold).
  OpAttr mapGB = genMapSideGB(inputOpAf, gbInfo);
  OpAttr shuffle = genMapSideGBRS(mapGB, gbInfo);
  return genReduceSideGB1(shuffle, gbInfo, false, false, GroupByDesc.Mode.MERGEPARTIAL);
}
/**
* GB-RS-GB1-RS-GB2
*/
/**
 * Builds the 5-operator GB-RS-GB1-RS-GB2 pipeline shared by the
 * "additional MR job" and "skew with keys/distinct" modes.
 */
private static OpAttr genGBRSGBRSGBOpPipeLine(OpAttr inputOpAf, HiveAggregate aggRel,
    GBInfo gbInfo) throws SemanticException {
  // Map side: GB then RS.
  OpAttr mapGB = genMapSideGB(inputOpAf, gbInfo);
  OpAttr mapShuffle = genMapSideGBRS(mapGB, gbInfo);
  // Reduce side GB1: the grouping-set column must be computed here unless the
  // skew/distinct mode already produced it upstream.
  boolean computeGrpSet =
      gbInfo.gbPhysicalPipelineMode != HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT;
  OpAttr gb1 = genReduceSideGB1(mapShuffle, gbInfo, computeGrpSet, false, GroupByDesc.Mode.PARTIALS);
  // Second shuffle followed by the final reduce-side GB2.
  OpAttr reduceShuffle = genReduceGBRS(gb1, gbInfo);
  return genReduceSideGB2(reduceShuffle, gbInfo);
}
/**
* GB-RS-GB1-RS-GB2
*
* @param inputOpAf
* @param aggRel
* @param gbInfo
* @return
* @throws SemanticException
*/
private static OpAttr genMapSideGBNoSkewAddMRJob(OpAttr inputOpAf, HiveAggregate aggRel,
    GBInfo gbInfo) throws SemanticException {
  // GB-RS-GB1-RS-GB2. Distinct aggregates cannot be combined with the extra
  // grouping-sets MR job, so reject that combination up front.
  if (gbInfo.containsDistinctAggr) {
    String errorMsg = "The number of rows per input row due to grouping sets is "
        + gbInfo.grpSets.size();
    throw new SemanticException(
        ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_DISTINCTS.getMsg(errorMsg));
  }
  return genGBRSGBRSGBOpPipeLine(inputOpAf, aggRel, gbInfo);
}
/**
* GB-RS-GB1-RS-GB2
*
* @param inputOpAf
* @param aggRel
* @param gbInfo
* @return
* @throws SemanticException
*/
private static OpAttr genMapSideGBSkewGBKeysOrDistUDAFPresent(OpAttr inputOpAf,
    HiveAggregate aggRel, GBInfo gbInfo) throws SemanticException {
  // GB-RS-GB1-RS-GB2. Skew handling is incompatible with the additional
  // grouping-sets MR job, so reject that combination up front.
  if (gbInfo.grpSetRqrAdditionalMRJob) {
    String errorMsg = "The number of rows per input row due to grouping sets is "
        + gbInfo.grpSets.size();
    throw new SemanticException(
        ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg));
  }
  return genGBRSGBRSGBOpPipeLine(inputOpAf, aggRel, gbInfo);
}
/**
* GB-RS-GB2
*
* @param inputOpAf
* @param aggRel
* @param gbInfo
* @return
* @throws SemanticException
*/
private static OpAttr genMapSideGBSkewGBKeysAndDistUDAFNotPresent(OpAttr inputOpAf,
    HiveAggregate aggRel, GBInfo gbInfo) throws SemanticException {
  // GB-RS-GB2: no GB keys and no distinct UDAFs, so the first reduce-side GB
  // can be skipped. Skew handling is still incompatible with the additional
  // grouping-sets MR job.
  if (gbInfo.grpSetRqrAdditionalMRJob) {
    String errorMsg = "The number of rows per input row due to grouping sets is "
        + gbInfo.grpSets.size();
    throw new SemanticException(
        ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg));
  }
  OpAttr mapGB = genMapSideGB(inputOpAf, gbInfo);
  OpAttr shuffle = genMapSideGBRS(mapGB, gbInfo);
  return genReduceSideGB2(shuffle, gbInfo);
}
/**
* RS-Gb1
*
* @param inputOpAf
* @param aggRel
* @param gbInfo
* @return
* @throws SemanticException
*/
private static OpAttr genNoMapSideGBNoSkew(OpAttr inputOpAf, HiveAggregate aggRel, GBInfo gbInfo)
    throws SemanticException {
  // RS-GB1: no map-side aggregation, no skew — shuffle the raw rows and do a
  // single COMPLETE group-by on the reduce side.
  OpAttr shuffle = genMapSideRS(inputOpAf, gbInfo);
  return genReduceSideGB1NoMapGB(shuffle, gbInfo, GroupByDesc.Mode.COMPLETE);
}
/**
* RS-GB1-RS-GB2
*
* @param inputOpAf
* @param aggRel
* @param gbInfo
* @return
* @throws SemanticException
*/
private static OpAttr genNoMapSideGBSkew(OpAttr inputOpAf, HiveAggregate aggRel, GBInfo gbInfo)
    throws SemanticException {
  // RS-GB1-RS-GB2: no map-side aggregation with skew — partial aggregate on
  // the first reduce stage, reshuffle, then finalize in a second stage.
  OpAttr mapShuffle = genMapSideRS(inputOpAf, gbInfo);
  OpAttr gb1 = genReduceSideGB1NoMapGB(mapShuffle, gbInfo, GroupByDesc.Mode.PARTIAL1);
  OpAttr reduceShuffle = genReduceGBRS(gb1, gbInfo);
  return genReduceSideGB2(reduceShuffle, gbInfo);
}
/**
 * Reducer parallelism for the reduce-side RS: 1 when there are no GB keys
 * (global aggregate), otherwise -1 (let the engine decide). Only valid for
 * the three pipeline modes that contain a reduce-side RS.
 */
private static int getParallelismForReduceSideRS(GBInfo gbInfo) {
  switch (gbInfo.gbPhysicalPipelineMode) {
  case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
  case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
  case NO_MAP_SIDE_GB_SKEW:
    return gbInfo.gbKeys.isEmpty() ? 1 : -1;
  default:
    throw new RuntimeException(
        "Unable to determine Reducer Parallelism - Invalid Physical Mode: "
            + gbInfo.gbPhysicalPipelineMode);
  }
}
/**
 * Reducer parallelism for the map-side RS, per pipeline mode: 1 for a global
 * aggregate (or the skew no-keys/no-distinct mode), -1 otherwise.
 */
private static int getParallelismForMapSideRS(GBInfo gbInfo) {
  switch (gbInfo.gbPhysicalPipelineMode) {
  case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB:
  case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
  case NO_MAP_SIDE_GB_NO_SKEW:
    // Global aggregate (no keys) must go to a single reducer.
    return gbInfo.gbKeys.isEmpty() ? 1 : -1;
  case NO_MAP_SIDE_GB_SKEW:
  case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
    return -1;
  case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT:
    return 1;
  default:
    throw new RuntimeException(
        "Unable to determine Reducer Parallelism - Invalid Physical Mode: "
            + gbInfo.gbPhysicalPipelineMode);
  }
}
/**
 * Number of partition fields for the reduce-side RS: GB keys plus one extra
 * (grouping-set column) when an additional MR job handles grouping sets.
 */
private static int getNumPartFieldsForReduceSideRS(GBInfo gbInfo) {
  switch (gbInfo.gbPhysicalPipelineMode) {
  case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
    return gbInfo.gbKeys.size() + 1;
  case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
  case NO_MAP_SIDE_GB_SKEW:
    return gbInfo.gbKeys.size();
  default:
    throw new RuntimeException(
        "Unable to determine Number of Partition Fields - Invalid Physical Mode: "
            + gbInfo.gbPhysicalPipelineMode);
  }
}
/**
 * Number of partition fields for the map-side RS. Skew modes partition on
 * all columns (Integer.MAX_VALUE) when distinct aggregates are present,
 * otherwise on a random key (-1); all other modes partition on the GB keys.
 */
private static int getNumPartFieldsForMapSideRS(GBInfo gbInfo) {
  switch (gbInfo.gbPhysicalPipelineMode) {
  case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB:
  case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
  case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT:
  case NO_MAP_SIDE_GB_NO_SKEW:
    return gbInfo.gbKeys.size();
  case NO_MAP_SIDE_GB_SKEW:
  case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
    return gbInfo.containsDistinctAggr ? Integer.MAX_VALUE : -1;
  default:
    throw new RuntimeException(
        "Unable to determine Number of Partition Fields - Invalid Physical Mode: "
            + gbInfo.gbPhysicalPipelineMode);
  }
}
/**
 * True when the grouping-set column must be carried through the reduce-side
 * RS: grouping sets exist and the pipeline is one of the two modes with a
 * second reduce stage.
 */
private static boolean inclGrpSetInReduceSide(GBInfo gbInfo) {
  if (gbInfo.grpSets.isEmpty()) {
    return false;
  }
  return gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB
      || gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT;
}
/**
 * True when the grouping-set column must be carried through the map-side RS:
 * grouping sets exist and the map-side GB already produced the column.
 */
private static boolean inclGrpSetInMapSide(GBInfo gbInfo) {
  if (gbInfo.grpSets.isEmpty()) {
    return false;
  }
  return gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB
      || gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT;
}
/**
 * Generates the ReduceSink between reduce-side GB1 and GB2: GB1's keys (plus
 * the grouping-set column when required) become RS keys, GB1's aggregation
 * buffers become RS values. Raw collection types restored to generics.
 */
private static OpAttr genReduceGBRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  ArrayList<String> outputColumnNames = new ArrayList<String>();
  ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
  GroupByOperator reduceSideGB1 = (GroupByOperator) inputOpAf.inputs.get(0);
  List<ColumnInfo> gb1ColInfoLst = reduceSideGB1.getSchema().getSignature();
  // GB keys of GB1 become the RS keys.
  ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForRS(reduceSideGB1, 0,
      gbInfo.gbKeys.size() - 1, outputColumnNames, false, colInfoLst, colExprMap, true, true);
  if (inclGrpSetInReduceSide(gbInfo)) {
    addGrpSetCol(false, gb1ColInfoLst.get(reduceKeys.size()).getInternalName(), true, reduceKeys,
        outputColumnNames, colInfoLst, colExprMap);
  }
  // Everything after the keys (the partial aggregation buffers) ships as values.
  ArrayList<ExprNodeDesc> reduceValues = getValueKeysForRS(reduceSideGB1, reduceSideGB1.getConf()
      .getKeys().size(), outputColumnNames, colInfoLst, colExprMap, true, true);
  ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils
      .getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, true, -1,
          getNumPartFieldsForReduceSideRS(gbInfo), getParallelismForReduceSideRS(gbInfo),
          AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), reduceSideGB1);
  rsOp.setColumnExprMap(colExprMap);
  return new OpAttr("", new HashSet<Integer>(), rsOp);
}
/**
 * Generates the ReduceSink that follows a map-side GB: GB keys (optionally
 * plus the grouping-set column and distinct columns) become RS keys, the
 * remaining aggregation buffers become RS values. Raw collection types
 * restored to generics.
 */
private static OpAttr genMapSideGBRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  List<String> outputKeyColumnNames = new ArrayList<String>();
  List<String> outputValueColumnNames = new ArrayList<String>();
  ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
  GroupByOperator mapGB = (GroupByOperator) inputOpAf.inputs.get(0);
  ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForRS(mapGB, 0, gbInfo.gbKeys.size() - 1,
      outputKeyColumnNames, false, colInfoLst, colExprMap, false, false);
  int keyLength = reduceKeys.size();
  if (inclGrpSetInMapSide(gbInfo)) {
    addGrpSetCol(false, SemanticAnalyzer.getColumnInternalName(reduceKeys.size()), true,
        reduceKeys, outputKeyColumnNames, colInfoLst, colExprMap);
    keyLength++;
  }
  if (mapGB.getConf().getKeys().size() > reduceKeys.size()) {
    // NOTE: All dist cols have single output col name;
    reduceKeys.addAll(getReduceKeysForRS(mapGB, reduceKeys.size(), mapGB.getConf().getKeys()
        .size() - 1, outputKeyColumnNames, true, colInfoLst, colExprMap, false, false));
  } else if (!gbInfo.distColIndices.isEmpty()) {
    // This is the case where distinct cols are part of GB Keys in which case
    // we still need to add it to output col names
    outputKeyColumnNames.add(SemanticAnalyzer.getColumnInternalName(reduceKeys.size()));
  }
  ArrayList<ExprNodeDesc> reduceValues = getValueKeysForRS(mapGB, mapGB.getConf().getKeys()
      .size(), outputValueColumnNames, colInfoLst, colExprMap, false, false);
  ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils
      .getReduceSinkDesc(reduceKeys, keyLength, reduceValues, gbInfo.distColIndices,
          outputKeyColumnNames, outputValueColumnNames, true, -1,
          getNumPartFieldsForMapSideRS(gbInfo), getParallelismForMapSideRS(gbInfo),
          AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), mapGB);
  rsOp.setColumnExprMap(colExprMap);
  return new OpAttr("", new HashSet<Integer>(), rsOp);
}
/**
 * Generates the map-side ReduceSink for the no-map-side-GB pipelines: raw GB
 * keys plus distinct-UDAF arguments become RS keys, de-duplicated
 * non-distinct UDAF arguments become RS values. Raw collection types
 * restored to generics.
 */
private static OpAttr genMapSideRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  List<String> outputKeyColumnNames = new ArrayList<String>();
  List<String> outputValueColumnNames = new ArrayList<String>();
  ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
  String outputColName;

  // 1. Add GB Keys to reduce keys
  ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForRS(inputOpAf.inputs.get(0), 0,
      gbInfo.gbKeys.size() - 1, outputKeyColumnNames, false, colInfoLst, colExprMap, false, false);
  int keyLength = reduceKeys.size();

  // 2. Add Dist UDAF args to reduce keys
  if (gbInfo.containsDistinctAggr) {
    // TODO: Why is this needed (doesn't represent any cols)
    String udafName = SemanticAnalyzer.getColumnInternalName(reduceKeys.size());
    outputKeyColumnNames.add(udafName);
    for (int i = 0; i < gbInfo.distExprNodes.size(); i++) {
      reduceKeys.add(gbInfo.distExprNodes.get(i));
      outputColName = SemanticAnalyzer.getColumnInternalName(i);
      // Distinct key columns use the "KEY.<union-col>:<tag>.<col>" naming scheme.
      String field = Utilities.ReduceField.KEY.toString() + "." + udafName + ":" + i + "."
          + outputColName;
      ColumnInfo colInfo = new ColumnInfo(field, gbInfo.distExprNodes.get(i).getTypeInfo(), null,
          false);
      colInfoLst.add(colInfo);
      colExprMap.put(field, gbInfo.distExprNodes.get(i));
    }
  }

  // 3. Add UDAF args deduped to reduce values
  ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
  for (int i = 0; i < gbInfo.deDupedNonDistIrefs.size(); i++) {
    reduceValues.add(gbInfo.deDupedNonDistIrefs.get(i));
    outputColName = SemanticAnalyzer.getColumnInternalName(reduceValues.size() - 1);
    outputValueColumnNames.add(outputColName);
    String field = Utilities.ReduceField.VALUE.toString() + "." + outputColName;
    colInfoLst.add(new ColumnInfo(field, reduceValues.get(reduceValues.size() - 1).getTypeInfo(),
        null, false));
    colExprMap.put(field, reduceValues.get(reduceValues.size() - 1));
  }

  // 4. Gen RS
  ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(PlanUtils
      .getReduceSinkDesc(reduceKeys, keyLength, reduceValues,
          gbInfo.distColIndices, outputKeyColumnNames,
          outputValueColumnNames, true, -1, getNumPartFieldsForMapSideRS(gbInfo),
          getParallelismForMapSideRS(gbInfo), AcidUtils.Operation.NOT_ACID), new RowSchema(
          colInfoLst), inputOpAf.inputs.get(0));
  rsOp.setColumnExprMap(colExprMap);
  return new OpAttr("", new HashSet<Integer>(), rsOp);
}
/**
 * Generates the final (FINAL-mode) reduce-side GroupBy of a two-reduce-stage
 * pipeline. Raw collection types restored to generics; a stray empty
 * statement after the output-name assignment was removed.
 */
private static OpAttr genReduceSideGB2(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
  ArrayList<String> outputColNames = new ArrayList<String>();
  ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  String colOutputName = null;
  ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
  List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
  ColumnInfo ci;

  // 1. Build GB Keys, grouping set starting position
  // 1.1 First Add original GB Keys
  ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0,
      gbInfo.gbKeys.size() - 1, false, false);
  for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
    ci = rsColInfoLst.get(i);
    colOutputName = gbInfo.outputColNames.get(i);
    outputColNames.add(colOutputName);
    colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
    colExprMap.put(colOutputName, gbKeys.get(i));
  }

  // 1.2 Add GrpSet Col
  int groupingSetsPosition = -1;
  if (inclGrpSetInReduceSide(gbInfo) && gbInfo.grpIdFunctionNeeded) {
    groupingSetsPosition = gbKeys.size();
    ExprNodeDesc grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
        rsColInfoLst.get(groupingSetsPosition).getInternalName(), null, false);
    gbKeys.add(grpSetColExpr);
    // GROUPING__ID is always the last projected output column.
    colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
    outputColNames.add(colOutputName);
    colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
    colExprMap.put(colOutputName, grpSetColExpr);
  }

  // 2. Add UDAF
  UDAFAttrs udafAttr;
  ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
  // With grouping sets the original GB doubles the key columns, so UDAF
  // output names start after 2*keys; the input RS carries one extra key col.
  int udafStartPosInGBInfOutputColNames = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size()
      : gbInfo.gbKeys.size() * 2;
  int udafStartPosInInputRS = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size() : gbInfo.gbKeys.size() + 1;
  for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
    udafAttr = gbInfo.udafAttrs.get(i);
    ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
    aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafStartPosInInputRS + i)));
    colOutputName = gbInfo.outputColNames.get(udafStartPosInGBInfOutputColNames + i);
    outputColNames.add(colOutputName);
    Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.FINAL,
        udafAttr.isDistinctUDAF);
    GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode,
        aggParameters);
    aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(),
        udaf.genericUDAFEvaluator, udaf.convertedParameters, false, udafMode));
    colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
  }

  Operator rsGBOp2 = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.FINAL,
      outputColNames, gbKeys, aggregations, false, gbInfo.groupByMemoryUsage,
      gbInfo.memoryThreshold, null, false, groupingSetsPosition, gbInfo.containsDistinctAggr),
      new RowSchema(colInfoLst), rs);
  rsGBOp2.setColumnExprMap(colExprMap);
  // TODO: Shouldn't we propgate vc? is it vc col from tab or all vc
  return new OpAttr("", new HashSet<Integer>(), rsGBOp2);
}
private static OpAttr genReduceSideGB1(OpAttr inputOpAf, GBInfo gbInfo, boolean computeGrpSet,
boolean propagateConstInDistinctUDAF, GroupByDesc.Mode gbMode) throws SemanticException {
ArrayList outputColNames = new ArrayList();
ArrayList colInfoLst = new ArrayList();
Map colExprMap = new HashMap();
String colOutputName = null;
ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
List rsColInfoLst = rs.getSchema().getSignature();
ColumnInfo ci;
boolean finalGB = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB);
// 1. Build GB Keys, grouping set starting position
// 1.1 First Add original GB Keys
ArrayList gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0,
gbInfo.gbKeys.size() - 1, false, false);
for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
ci = rsColInfoLst.get(i);
if (finalGB) {
colOutputName = gbInfo.outputColNames.get(i);
} else {
colOutputName = SemanticAnalyzer.getColumnInternalName(i);
}
outputColNames.add(colOutputName);
colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
colExprMap.put(colOutputName, gbKeys.get(i));
}
// 1.2 Add GrpSet Col
int groupingSetsColPosition = -1;
if ((!finalGB && gbInfo.grpSets.size() > 0) || (finalGB && gbInfo.grpIdFunctionNeeded)) {
groupingSetsColPosition = gbInfo.gbKeys.size();
if (computeGrpSet) {
// GrpSet Col needs to be constructed
gbKeys.add(new ExprNodeConstantDesc("0"));
} else {
// GrpSet Col already part of input RS
// TODO: Can't we just copy the ExprNodeDEsc from input (Do we need to
// explicitly set table alias to null & VC to false
gbKeys.addAll(ExprNodeDescUtils.genExprNodeDesc(rs, groupingSetsColPosition,
groupingSetsColPosition, false, true));
}
colOutputName = SemanticAnalyzer.getColumnInternalName(groupingSetsColPosition);
if (finalGB) {
colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
}
outputColNames.add(colOutputName);
colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
colExprMap.put(colOutputName, gbKeys.get(groupingSetsColPosition));
}
// 2. Walk through UDAF and add them to GB
String lastReduceKeyColName = null;
if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames()
.get(rs.getConf().getOutputKeyColumnNames().size() - 1);
}
int numDistinctUDFs = 0;
int distinctStartPosInReduceKeys = gbKeys.size();
List reduceValues = rs.getConf().getValueCols();
ArrayList aggregations = new ArrayList();
int udafColStartPosInOriginalGB = (gbInfo.grpSets.size() > 0) ? gbInfo.gbKeys.size() * 2
: gbInfo.gbKeys.size();
int udafColStartPosInRS = rs.getConf().getKeyCols().size();
for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
ArrayList aggParameters = new ArrayList();
if (udafAttr.isDistinctUDAF) {
ColumnInfo rsDistUDAFParamColInfo;
ExprNodeDesc distinctUDAFParam;
ExprNodeDesc constantPropDistinctUDAFParam;
for (int j = 0; j < udafAttr.udafParamsIndxInGBInfoDistExprs.size(); j++) {
rsDistUDAFParamColInfo = rsColInfoLst.get(distinctStartPosInReduceKeys + j);
String rsDistUDAFParamName = rsDistUDAFParamColInfo.getInternalName();
// TODO: verify if this is needed
if (lastReduceKeyColName != null) {
rsDistUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName
+ ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
}
distinctUDAFParam = new ExprNodeColumnDesc(rsDistUDAFParamColInfo.getType(),
rsDistUDAFParamName, rsDistUDAFParamColInfo.getTabAlias(),
rsDistUDAFParamColInfo.getIsVirtualCol());
if (propagateConstInDistinctUDAF) {
// TODO: Implement propConstDistUDAFParams
constantPropDistinctUDAFParam = SemanticAnalyzer
.isConstantParameterInAggregationParameters(
rsDistUDAFParamColInfo.getInternalName(), reduceValues);
if (constantPropDistinctUDAFParam != null) {
distinctUDAFParam = constantPropDistinctUDAFParam;
}
}
aggParameters.add(distinctUDAFParam);
}
numDistinctUDFs++;
} else {
aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafColStartPosInRS + i)));
}
Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode,
aggParameters);
aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(),
udaf.genericUDAFEvaluator, udaf.convertedParameters,
(gbMode != GroupByDesc.Mode.FINAL && udafAttr.isDistinctUDAF), udafMode));
if (finalGB) {
colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + i);
} else {
colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size()
- 1);
}
colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
outputColNames.add(colOutputName);
}
// Nothing special needs to be done for grouping sets if
// this is the final group by operator, and multiple rows corresponding to
// the
// grouping sets have been generated upstream.
// However, if an addition MR job has been created to handle grouping sets,
// additional rows corresponding to grouping sets need to be created here.
//TODO: Clean up/refactor assumptions
boolean includeGrpSetInGBDesc = (gbInfo.grpSets.size() > 0)
&& !finalGB
&& !(gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT);
Operator rsGBOp = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames,
gbKeys, aggregations, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, gbInfo.grpSets,
includeGrpSetInGBDesc, groupingSetsColPosition,
gbInfo.containsDistinctAggr), new RowSchema(colInfoLst), rs);
rsGBOp.setColumnExprMap(colExprMap);
return new OpAttr("", new HashSet(), rsGBOp);
}
/**
 * RS-GB1 for the pipeline that has no map-side GB: builds the reduce-side
 * GroupByOperator directly on top of the ReduceSinkOperator.
 *
 * @param inputOpAf op attributes whose first input is the ReduceSinkOperator
 *          feeding this GB
 * @param gbInfo group-by metadata (GB keys, UDAFs, output names, memory settings)
 * @param gbMode mode for the generated GroupByDesc
 * @return OpAttr wrapping the newly created GroupByOperator
 * @throws SemanticException if expression generation or UDAF resolution fails
 */
private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo,
    GroupByDesc.Mode gbMode) throws SemanticException {
  ArrayList<String> outputColNames = new ArrayList<String>();
  ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  String colOutputName = null;
  ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
  List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
  ColumnInfo ci;
  // In the NO_MAP_SIDE_GB_NO_SKEW pipeline this GB is the final one, so reuse
  // the user-visible output column names instead of synthesizing internal ones.
  boolean useOriginalGBNames = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW);

  // 1. Build GB keys from the RS key columns.
  // 1.1 First add the original GB keys.
  ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0,
      gbInfo.gbKeys.size() - 1, true, false);
  for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
    ci = rsColInfoLst.get(i);
    if (useOriginalGBNames) {
      colOutputName = gbInfo.outputColNames.get(i);
    } else {
      colOutputName = SemanticAnalyzer.getColumnInternalName(i);
    }
    outputColNames.add(colOutputName);
    colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), null, false));
    colExprMap.put(colOutputName, gbKeys.get(i));
  }

  // 2. Walk through the UDAFs and add their aggregations to the GB.
  String lastReduceKeyColName = null;
  if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
    lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames()
        .get(rs.getConf().getOutputKeyColumnNames().size() - 1);
  }
  int numDistinctUDFs = 0;
  // Distinct-expression columns in the RS keys start right after the GB keys.
  int distinctStartPosInReduceKeys = gbKeys.size();
  List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
  ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
  int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
  for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
    UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
    ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
    ColumnInfo rsUDAFParamColInfo;
    ExprNodeDesc udafParam;
    ExprNodeDesc constantPropDistinctUDAFParam;
    for (int j = 0; j < udafAttr.udafParams.size(); j++) {
      rsUDAFParamColInfo = rsColInfoLst.get(distinctStartPosInReduceKeys + j);
      String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
      // TODO: verify if this is needed
      if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
        rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + ":"
            + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
      }
      udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName,
          rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol());
      // If the parameter is a constant that survived into the RS values,
      // use the constant instead of the column reference.
      constantPropDistinctUDAFParam = SemanticAnalyzer
          .isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(),
              reduceValues);
      if (constantPropDistinctUDAFParam != null) {
        udafParam = constantPropDistinctUDAFParam;
      }
      aggParameters.add(udafParam);
    }
    if (udafAttr.isDistinctUDAF) {
      numDistinctUDFs++;
    }
    Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
    GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode,
        aggParameters);
    aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(),
        udaf.genericUDAFEvaluator, udaf.convertedParameters, udafAttr.isDistinctUDAF, udafMode));
    if (useOriginalGBNames) {
      colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + i);
    } else {
      colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size()
          - 1);
    }
    colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
    outputColNames.add(colOutputName);
  }

  // 3. Create the GB operator (no grouping sets in this pipeline).
  Operator rsGB1 = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames,
      gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, null,
      false, -1, numDistinctUDFs > 0), new RowSchema(colInfoLst), rs);
  rsGB1.setColumnExprMap(colExprMap);

  return new OpAttr("", new HashSet<Integer>(), rsGB1);
}
/**
 * Generates the map-side (HASH mode) GroupByOperator: GB keys, the optional
 * grouping-set id column, distinct parameters as extra keys, and the map-side
 * aggregations.
 *
 * @param inputOpAf op attributes whose first input feeds the map-side GB
 * @param gbAttrs group-by metadata (keys, distinct exprs, UDAFs, memory settings)
 * @return OpAttr wrapping the newly created GroupByOperator
 * @throws SemanticException if UDAF resolution fails
 */
@SuppressWarnings("unchecked")
private static OpAttr genMapSideGB(OpAttr inputOpAf, GBInfo gbAttrs) throws SemanticException {
  ArrayList<String> outputColNames = new ArrayList<String>();
  ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  Set<String> gbKeyColsAsNamesFrmIn = new HashSet<String>();
  String colOutputName = null;

  // 1. Build GB Keys, grouping set starting position
  // 1.1 First Add original GB Keys
  ArrayList<ExprNodeDesc> gbKeys = new ArrayList<ExprNodeDesc>();
  for (int i = 0; i < gbAttrs.gbKeys.size(); i++) {
    gbKeys.add(gbAttrs.gbKeys.get(i));
    colOutputName = SemanticAnalyzer.getColumnInternalName(i);
    colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.gbKeyTypes.get(i), "", false));
    outputColNames.add(colOutputName);
    gbKeyColsAsNamesFrmIn.add(gbAttrs.gbKeyColNamesInInput.get(i));
    colExprMap.put(colOutputName, gbKeys.get(i));
  }

  // 1.2. Adjust GroupingSet Position, GBKeys for GroupingSet Position if
  // needed. NOTE: GroupingID is added to the map-side GB only if the grouping
  // sets don't require an additional MR job.
  int groupingSetsPosition = -1;
  boolean inclGrpID = inclGrpSetInMapSide(gbAttrs);
  if (inclGrpID) {
    groupingSetsPosition = gbKeys.size();
    addGrpSetCol(true, null, false, gbKeys, outputColNames, colInfoLst, colExprMap);
  }

  // 1.3. Add all distinct params as additional GB keys.
  // NOTE: a distinct expr cannot also be part of the GB key (we assume plan
  // gen would have prevented it); skip exprs that duplicate an input GB key.
  for (int i = 0; i < gbAttrs.distExprNodes.size(); i++) {
    if (!gbKeyColsAsNamesFrmIn.contains(gbAttrs.distExprNames.get(i))) {
      gbKeys.add(gbAttrs.distExprNodes.get(i));
      colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() - 1);
      colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.distExprTypes.get(i), "", false));
      outputColNames.add(colOutputName);
      gbKeyColsAsNamesFrmIn.add(gbAttrs.distExprNames.get(i));
      colExprMap.put(colOutputName, gbKeys.get(gbKeys.size() - 1));
    }
  }

  // 2. Build Aggregations
  ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
  for (UDAFAttrs udafAttr : gbAttrs.udafAttrs) {
    Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH,
        udafAttr.isDistinctUDAF);
    aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(), udafAttr.udafEvaluator,
        udafAttr.udafParams, udafAttr.isDistinctUDAF, amode));
    // The method already declares SemanticException, so propagate it directly
    // instead of wrapping it in an unchecked RuntimeException.
    GenericUDAFInfo udafInfo = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, amode,
        udafAttr.udafParams);
    colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size()
        - 1);
    colInfoLst.add(new ColumnInfo(colOutputName, udafInfo.returnType, "", false));
    outputColNames.add(colOutputName);
  }

  // 3. Create GB
  @SuppressWarnings("rawtypes")
  Operator gbOp = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.HASH,
      outputColNames, gbKeys, aggregations, false, gbAttrs.groupByMemoryUsage,
      gbAttrs.memoryThreshold, gbAttrs.grpSets, inclGrpID, groupingSetsPosition,
      gbAttrs.containsDistinctAggr), new RowSchema(colInfoLst), inputOpAf.inputs.get(0));

  // 4. Setup Expr Col Map
  // NOTE: UDAF output columns are intentionally not included in the ExprColMap
  gbOp.setColumnExprMap(colExprMap);

  return new OpAttr("", new HashSet<Integer>(), gbOp);
}
/**
 * Appends the grouping-set id column to a key expression list along with its
 * output-name / ColumnInfo / expr-map bookkeeping.
 *
 * @param createConstantExpr when true, add a constant "0" placeholder;
 *          otherwise reference the upstream column named grpSetIDExprName
 * @param grpSetIDExprName internal name of the upstream grouping-id column;
 *          only consulted when createConstantExpr is false
 * @param addReducePrefixToColInfoName when true, prefix the ColumnInfo name
 *          with "KEY." because the column originates from reduce-side keys
 * @param exprLst key expression list, appended to
 * @param outputColumnNames output column name list, appended to
 * @param colInfoLst column info list, appended to
 * @param colExprMap column name to expression map, updated
 * @throws SemanticException declared for uniformity with the other helpers
 */
private static void addGrpSetCol(boolean createConstantExpr, String grpSetIDExprName,
    boolean addReducePrefixToColInfoName, List<ExprNodeDesc> exprLst,
    List<String> outputColumnNames, List<ColumnInfo> colInfoLst,
    Map<String, ExprNodeDesc> colExprMap) throws SemanticException {
  String outputColName = null;
  ExprNodeDesc grpSetColExpr = null;

  if (createConstantExpr) {
    grpSetColExpr = new ExprNodeConstantDesc("0");
  } else {
    grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, grpSetIDExprName,
        null, false);
  }
  exprLst.add(grpSetColExpr);

  outputColName = SemanticAnalyzer.getColumnInternalName(exprLst.size() - 1);
  outputColumnNames.add(outputColName);
  String internalColName = outputColName;
  if (addReducePrefixToColInfoName) {
    internalColName = Utilities.ReduceField.KEY.toString() + "." + outputColName;
  }
  // Grouping-set id column is marked virtual (last ctor arg = true).
  colInfoLst.add(new ColumnInfo(internalColName, grpSetColExpr.getTypeInfo(), null, true));
  colExprMap.put(internalColName, grpSetColExpr);
}
/**
 * Get Reduce Keys for the RS following a map-side GB.
 *
 * @param inOp operator feeding the RS (assumed to expose a deduped list of exprs)
 * @param startPos first column position (inclusive) to turn into a reduce key
 * @param endPos last column position (inclusive); a negative value yields an
 *          empty key list
 * @param outputKeyColumnNames output key column names, appended to
 * @param addOnlyOneKeyColName when true only the first key contributes an
 *          output name (subsequent keys share that one output column)
 * @param colInfoLst column infos with "KEY."-prefixed names, appended to
 * @param colExprMap column name to expression map, updated
 * @param addEmptyTabAlias use "" as the table alias instead of null
 * @param setColToNonVirtual force generated column exprs to non-virtual
 * @return List of ExprNodeDesc of ReduceKeys
 * @throws SemanticException if expression generation fails
 */
private static ArrayList<ExprNodeDesc> getReduceKeysForRS(Operator inOp, int startPos,
    int endPos, List<String> outputKeyColumnNames, boolean addOnlyOneKeyColName,
    ArrayList<ColumnInfo> colInfoLst, Map<String, ExprNodeDesc> colExprMap,
    boolean addEmptyTabAlias, boolean setColToNonVirtual) throws SemanticException {
  ArrayList<ExprNodeDesc> reduceKeys = null;
  if (endPos < 0) {
    reduceKeys = new ArrayList<ExprNodeDesc>();
  } else {
    reduceKeys = ExprNodeDescUtils.genExprNodeDesc(inOp, startPos, endPos, addEmptyTabAlias,
        setColToNonVirtual);
    int outColNameIndx = startPos;
    for (int i = 0; i < reduceKeys.size(); ++i) {
      String outputColName = SemanticAnalyzer.getColumnInternalName(outColNameIndx);
      outColNameIndx++;
      if (!addOnlyOneKeyColName || i == 0) {
        outputKeyColumnNames.add(outputColName);
      }

      // TODO: Verify if this is needed (why can't it always be null/empty?)
      String tabAlias = addEmptyTabAlias ? "" : null;
      ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.KEY.toString() + "."
          + outputColName, reduceKeys.get(i).getTypeInfo(), tabAlias, false);
      colInfoLst.add(colInfo);
      colExprMap.put(colInfo.getInternalName(), reduceKeys.get(i));
    }
  }

  return reduceKeys;
}
/**
 * Get Value Keys for the RS following a map-side GB.
 *
 * @param inOp the map-side GroupByOperator feeding the RS
 * @param aggStartPos position of the first aggregation column in inOp's
 *          schema; columns at/after it become RS values
 * @param outputKeyColumnNames output value column names, appended to
 * @param colInfoLst column infos with "VALUE."-prefixed names, appended to
 * @param colExprMap column name to expression map, updated
 * @param addEmptyTabAlias use "" as the table alias instead of null
 * @param setColToNonVirtual force generated column exprs to non-virtual
 * @return List of ExprNodeDesc of Values
 * @throws SemanticException if expression generation fails
 */
private static ArrayList<ExprNodeDesc> getValueKeysForRS(Operator inOp, int aggStartPos,
    List<String> outputKeyColumnNames, ArrayList<ColumnInfo> colInfoLst,
    Map<String, ExprNodeDesc> colExprMap, boolean addEmptyTabAlias, boolean setColToNonVirtual)
    throws SemanticException {
  List<ColumnInfo> mapGBColInfoLst = inOp.getSchema().getSignature();
  ArrayList<ExprNodeDesc> valueKeys = null;

  if (aggStartPos >= mapGBColInfoLst.size()) {
    valueKeys = new ArrayList<ExprNodeDesc>();
  } else {
    // NOTE(review): genExprNodeDesc is called with a hard-coded true for its
    // tab-alias flag while addEmptyTabAlias only affects the ColumnInfo alias
    // below -- confirm this asymmetry is intended.
    valueKeys = ExprNodeDescUtils.genExprNodeDesc(inOp, aggStartPos, mapGBColInfoLst.size() - 1,
        true, setColToNonVirtual);
    for (int i = 0; i < valueKeys.size(); ++i) {
      String outputColName = SemanticAnalyzer.getColumnInternalName(i);
      outputKeyColumnNames.add(outputColName);
      // TODO: Verify if this is needed (why can't it always be null/empty?)
      String tabAlias = addEmptyTabAlias ? "" : null;
      ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.VALUE.toString() + "."
          + outputColName, valueKeys.get(i).getTypeInfo(), tabAlias, false);
      colInfoLst.add(colInfo);
      colExprMap.put(colInfo.getInternalName(), valueKeys.get(i));
    }
  }

  return valueKeys;
}
// TODO: Implement this
// Placeholder for propagating constant parameters into distinct-UDAF
// arguments (see the propagateConstInDistinctUDAF TODO earlier in this file).
// Currently unimplemented and always returns null.
private static ExprNodeDesc propConstDistUDAFParams() {
return null;
}
}