/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.physical;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.*;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.exec.tez.TezTask;
import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
import org.apache.hadoop.hive.ql.exec.vector.VectorGroupByOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorSMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.PreOrderWalker;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.udf.UDFAcos;
import org.apache.hadoop.hive.ql.udf.UDFAsin;
import org.apache.hadoop.hive.ql.udf.UDFAtan;
import org.apache.hadoop.hive.ql.udf.UDFBin;
import org.apache.hadoop.hive.ql.udf.UDFConv;
import org.apache.hadoop.hive.ql.udf.UDFCos;
import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth;
import org.apache.hadoop.hive.ql.udf.UDFDegrees;
import org.apache.hadoop.hive.ql.udf.UDFExp;
import org.apache.hadoop.hive.ql.udf.UDFHex;
import org.apache.hadoop.hive.ql.udf.UDFHour;
import org.apache.hadoop.hive.ql.udf.UDFLength;
import org.apache.hadoop.hive.ql.udf.UDFLike;
import org.apache.hadoop.hive.ql.udf.UDFLn;
import org.apache.hadoop.hive.ql.udf.UDFLog;
import org.apache.hadoop.hive.ql.udf.UDFLog10;
import org.apache.hadoop.hive.ql.udf.UDFLog2;
import org.apache.hadoop.hive.ql.udf.UDFMinute;
import org.apache.hadoop.hive.ql.udf.UDFMonth;
import org.apache.hadoop.hive.ql.udf.UDFRadians;
import org.apache.hadoop.hive.ql.udf.UDFRand;
import org.apache.hadoop.hive.ql.udf.UDFRegExp;
import org.apache.hadoop.hive.ql.udf.UDFSecond;
import org.apache.hadoop.hive.ql.udf.UDFSign;
import org.apache.hadoop.hive.ql.udf.UDFSin;
import org.apache.hadoop.hive.ql.udf.UDFSqrt;
import org.apache.hadoop.hive.ql.udf.UDFSubstr;
import org.apache.hadoop.hive.ql.udf.UDFTan;
import org.apache.hadoop.hive.ql.udf.UDFToBoolean;
import org.apache.hadoop.hive.ql.udf.UDFToByte;
import org.apache.hadoop.hive.ql.udf.UDFToDouble;
import org.apache.hadoop.hive.ql.udf.UDFToFloat;
import org.apache.hadoop.hive.ql.udf.UDFToInteger;
import org.apache.hadoop.hive.ql.udf.UDFToLong;
import org.apache.hadoop.hive.ql.udf.UDFToShort;
import org.apache.hadoop.hive.ql.udf.UDFToString;
import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
import org.apache.hadoop.hive.ql.udf.UDFYear;
import org.apache.hadoop.hive.ql.udf.generic.*;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
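// Physical-plan resolver that turns eligible Map and Reduce work into vectorized
// execution. It first validates each operator tree (input format, operators,
// expressions, data types) and only then rewrites validated trees into their
// vectorized counterparts.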
public class Vectorizer implements PhysicalPlanResolver {
protected static transient final Log LOG = LogFactory.getLog(Vectorizer.class);
Pattern supportedDataTypesPattern;
List<Task<? extends Serializable>> vectorizableTasks =
new ArrayList<Task<? extends Serializable>>();
Set<Class<?>> supportedGenericUDFs = new HashSet<Class<?>>();
Set<String> supportedAggregationUdfs = new HashSet<String>();
private PhysicalContext physicalContext = null;
private HiveConf hiveConf;
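// The constructor registers what the vectorizer can handle: a regex of
// vectorizable column type names, the GenericUDF classes that may be
// vectorized, and the aggregate function names that vectorized GROUP BY
// can evaluate.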
public Vectorizer() {
StringBuilder patternBuilder = new StringBuilder();
patternBuilder.append("int");
patternBuilder.append("|smallint");
patternBuilder.append("|tinyint");
patternBuilder.append("|bigint");
patternBuilder.append("|integer");
patternBuilder.append("|long");
patternBuilder.append("|short");
patternBuilder.append("|timestamp");
patternBuilder.append("|" + serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME);
patternBuilder.append("|" + serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME);
patternBuilder.append("|boolean");
patternBuilder.append("|binary");
patternBuilder.append("|string");
patternBuilder.append("|byte");
patternBuilder.append("|float");
patternBuilder.append("|double");
patternBuilder.append("|date");
patternBuilder.append("|void");
// Decimal types can be specified with different precision and scales e.g. decimal(10,5),
// as opposed to other data types which can be represented by constant strings.
// The regex matches only the "decimal" prefix of the type.
patternBuilder.append("|decimal.*");
// CHAR and VARCHAR types can be specified with maximum length.
patternBuilder.append("|char.*");
patternBuilder.append("|varchar.*");
supportedDataTypesPattern = Pattern.compile(patternBuilder.toString());
supportedGenericUDFs.add(GenericUDFOPPlus.class);
supportedGenericUDFs.add(GenericUDFOPMinus.class);
supportedGenericUDFs.add(GenericUDFOPMultiply.class);
supportedGenericUDFs.add(GenericUDFOPDivide.class);
supportedGenericUDFs.add(GenericUDFOPMod.class);
supportedGenericUDFs.add(GenericUDFOPNegative.class);
supportedGenericUDFs.add(GenericUDFOPPositive.class);
supportedGenericUDFs.add(GenericUDFOPEqualOrLessThan.class);
supportedGenericUDFs.add(GenericUDFOPEqualOrGreaterThan.class);
supportedGenericUDFs.add(GenericUDFOPGreaterThan.class);
supportedGenericUDFs.add(GenericUDFOPLessThan.class);
supportedGenericUDFs.add(GenericUDFOPNot.class);
supportedGenericUDFs.add(GenericUDFOPNotEqual.class);
supportedGenericUDFs.add(GenericUDFOPNotNull.class);
supportedGenericUDFs.add(GenericUDFOPNull.class);
supportedGenericUDFs.add(GenericUDFOPOr.class);
supportedGenericUDFs.add(GenericUDFOPAnd.class);
supportedGenericUDFs.add(GenericUDFOPEqual.class);
supportedGenericUDFs.add(UDFLength.class);
supportedGenericUDFs.add(UDFYear.class);
supportedGenericUDFs.add(UDFMonth.class);
supportedGenericUDFs.add(UDFDayOfMonth.class);
supportedGenericUDFs.add(UDFHour.class);
supportedGenericUDFs.add(UDFMinute.class);
supportedGenericUDFs.add(UDFSecond.class);
supportedGenericUDFs.add(UDFWeekOfYear.class);
supportedGenericUDFs.add(GenericUDFToUnixTimeStamp.class);
supportedGenericUDFs.add(GenericUDFDateAdd.class);
supportedGenericUDFs.add(GenericUDFDateSub.class);
supportedGenericUDFs.add(GenericUDFDate.class);
supportedGenericUDFs.add(GenericUDFDateDiff.class);
supportedGenericUDFs.add(UDFLike.class);
supportedGenericUDFs.add(UDFRegExp.class);
supportedGenericUDFs.add(UDFSubstr.class);
supportedGenericUDFs.add(GenericUDFLTrim.class);
supportedGenericUDFs.add(GenericUDFRTrim.class);
supportedGenericUDFs.add(GenericUDFTrim.class);
supportedGenericUDFs.add(UDFSin.class);
supportedGenericUDFs.add(UDFCos.class);
supportedGenericUDFs.add(UDFTan.class);
supportedGenericUDFs.add(UDFAsin.class);
supportedGenericUDFs.add(UDFAcos.class);
supportedGenericUDFs.add(UDFAtan.class);
supportedGenericUDFs.add(UDFDegrees.class);
supportedGenericUDFs.add(UDFRadians.class);
supportedGenericUDFs.add(GenericUDFFloor.class);
supportedGenericUDFs.add(GenericUDFCeil.class);
supportedGenericUDFs.add(UDFExp.class);
supportedGenericUDFs.add(UDFLn.class);
supportedGenericUDFs.add(UDFLog2.class);
supportedGenericUDFs.add(UDFLog10.class);
supportedGenericUDFs.add(UDFLog.class);
supportedGenericUDFs.add(GenericUDFPower.class);
supportedGenericUDFs.add(GenericUDFRound.class);
supportedGenericUDFs.add(GenericUDFPosMod.class);
supportedGenericUDFs.add(UDFSqrt.class);
supportedGenericUDFs.add(UDFSign.class);
supportedGenericUDFs.add(UDFRand.class);
supportedGenericUDFs.add(UDFBin.class);
supportedGenericUDFs.add(UDFHex.class);
supportedGenericUDFs.add(UDFConv.class);
supportedGenericUDFs.add(GenericUDFLower.class);
supportedGenericUDFs.add(GenericUDFUpper.class);
supportedGenericUDFs.add(GenericUDFConcat.class);
supportedGenericUDFs.add(GenericUDFAbs.class);
supportedGenericUDFs.add(GenericUDFBetween.class);
supportedGenericUDFs.add(GenericUDFIn.class);
supportedGenericUDFs.add(GenericUDFCase.class);
supportedGenericUDFs.add(GenericUDFWhen.class);
supportedGenericUDFs.add(GenericUDFCoalesce.class);
supportedGenericUDFs.add(GenericUDFElt.class);
supportedGenericUDFs.add(GenericUDFInitCap.class);
// For type casts
supportedGenericUDFs.add(UDFToLong.class);
supportedGenericUDFs.add(UDFToInteger.class);
supportedGenericUDFs.add(UDFToShort.class);
supportedGenericUDFs.add(UDFToByte.class);
supportedGenericUDFs.add(UDFToBoolean.class);
supportedGenericUDFs.add(UDFToFloat.class);
supportedGenericUDFs.add(UDFToDouble.class);
supportedGenericUDFs.add(UDFToString.class);
supportedGenericUDFs.add(GenericUDFTimestamp.class);
supportedGenericUDFs.add(GenericUDFToDecimal.class);
supportedGenericUDFs.add(GenericUDFToDate.class);
supportedGenericUDFs.add(GenericUDFToChar.class);
supportedGenericUDFs.add(GenericUDFToVarchar.class);
supportedGenericUDFs.add(GenericUDFToIntervalYearMonth.class);
supportedGenericUDFs.add(GenericUDFToIntervalDayTime.class);
// For conditional expressions
supportedGenericUDFs.add(GenericUDFIf.class);
supportedAggregationUdfs.add("min");
supportedAggregationUdfs.add("max");
supportedAggregationUdfs.add("count");
supportedAggregationUdfs.add("sum");
supportedAggregationUdfs.add("avg");
supportedAggregationUdfs.add("variance");
supportedAggregationUdfs.add("var_pop");
supportedAggregationUdfs.add("var_samp");
supportedAggregationUdfs.add("std");
supportedAggregationUdfs.add("stddev");
supportedAggregationUdfs.add("stddev_pop");
supportedAggregationUdfs.add("stddev_samp");
}
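// Dispatched once per task by the TaskGraphWalker. A MapRedTask gets map-side
// vectorization only; TezTask and SparkTask may also get reduce-side
// vectorization when HIVE_VECTORIZATION_REDUCE_ENABLED is set.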
class VectorizationDispatcher implements Dispatcher {
private final PhysicalContext physicalContext;
private List<String> reduceColumnNames;
private List<TypeInfo> reduceTypeInfos;
public VectorizationDispatcher(PhysicalContext physicalContext) {
this.physicalContext = physicalContext;
reduceColumnNames = null;
reduceTypeInfos = null;
}
@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs)
throws SemanticException {
Task<? extends Serializable> currTask = (Task<? extends Serializable>) nd;
if (currTask instanceof MapRedTask) {
convertMapWork(((MapRedTask) currTask).getWork().getMapWork(), false);
} else if (currTask instanceof TezTask) {
TezWork work = ((TezTask) currTask).getWork();
for (BaseWork w: work.getAllWork()) {
if (w instanceof MapWork) {
convertMapWork((MapWork) w, true);
} else if (w instanceof ReduceWork) {
// We are only vectorizing Reduce under Tez.
if (HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED)) {
convertReduceWork((ReduceWork) w, true);
}
}
}
} else if (currTask instanceof SparkTask) {
SparkWork sparkWork = (SparkWork) currTask.getWork();
for (BaseWork baseWork : sparkWork.getAllWork()) {
if (baseWork instanceof MapWork) {
convertMapWork((MapWork) baseWork, false);
} else if (baseWork instanceof ReduceWork
&& HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED)) {
convertReduceWork((ReduceWork) baseWork, false);
}
}
}
return null;
}
private void convertMapWork(MapWork mapWork, boolean isTez) throws SemanticException {
boolean ret = validateMapWork(mapWork, isTez);
if (ret) {
vectorizeMapWork(mapWork, isTez);
}
}
private void addMapWorkRules(Map<Rule, NodeProcessor> opRules, NodeProcessor np) {
opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + ".*"
+ FileSinkOperator.getOperatorName()), np);
opRules.put(new RuleRegExp("R2", TableScanOperator.getOperatorName() + ".*"
+ ReduceSinkOperator.getOperatorName()), np);
}
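// Validation pass: cheap structural checks first (a single TableScan, a
// vectorized input format), then a rule-driven walk that checks every
// operator between TableScan and FileSink/ReduceSink.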
private boolean validateMapWork(MapWork mapWork, boolean isTez) throws SemanticException {
LOG.info("Validating MapWork...");
// Eliminate MR plans with more than one TableScanOperator.
LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = mapWork.getAliasToWork();
if ((aliasToWork == null) || (aliasToWork.size() == 0)) {
return false;
}
int tableScanCount = 0;
for (Operator<? extends OperatorDesc> op : aliasToWork.values()) {
if (op == null) {
LOG.warn("Map work has invalid aliases to work with. Fail validation!");
return false;
}
if (op instanceof TableScanOperator) {
tableScanCount++;
}
}
if (tableScanCount > 1) {
LOG.warn("Map work has more than 1 TableScanOperator aliases to work with. Fail validation!");
return false;
}
// Validate the input format
for (String path : mapWork.getPathToPartitionInfo().keySet()) {
PartitionDesc pd = mapWork.getPathToPartitionInfo().get(path);
List<Class<?>> interfaceList =
Arrays.asList(pd.getInputFileFormatClass().getInterfaces());
if (!interfaceList.contains(VectorizedInputFormatInterface.class)) {
LOG.info("Input format: " + pd.getInputFileFormatClassName()
+ ", doesn't provide vectorized input");
return false;
}
}
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
MapWorkValidationNodeProcessor vnp = new MapWorkValidationNodeProcessor(mapWork, isTez);
addMapWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new DefaultGraphWalker(disp);
// iterate over the map operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(mapWork.getAliasToWork().values());
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
for (Node n : nodeOutput.keySet()) {
if (nodeOutput.get(n) != null) {
if (!((Boolean)nodeOutput.get(n)).booleanValue()) {
return false;
}
}
}
return true;
}
private void vectorizeMapWork(MapWork mapWork, boolean isTez) throws SemanticException {
LOG.info("Vectorizing MapWork...");
mapWork.setVectorMode(true);
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
MapWorkVectorizationNodeProcessor vnp = new MapWorkVectorizationNodeProcessor(mapWork, isTez);
addMapWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new PreOrderWalker(disp);
// iterate over the map operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(mapWork.getAliasToWork().values());
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
mapWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap());
mapWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap());
mapWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap());
if (LOG.isDebugEnabled()) {
debugDisplayAllMaps(mapWork);
}
return;
}
private void convertReduceWork(ReduceWork reduceWork, boolean isTez) throws SemanticException {
boolean ret = validateReduceWork(reduceWork);
if (ret) {
vectorizeReduceWork(reduceWork, isTez);
}
}
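// Collects the reduce-shuffle key and value columns (names prefixed with
// KEY. / VALUE.) and their TypeInfos; fails if either ObjectInspector is not
// struct-typed or if the plan needs tagging.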
private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork) throws SemanticException {
try {
// Check key ObjectInspector.
ObjectInspector keyObjectInspector = reduceWork.getKeyObjectInspector();
if (keyObjectInspector == null || !(keyObjectInspector instanceof StructObjectInspector)) {
return false;
}
StructObjectInspector keyStructObjectInspector = (StructObjectInspector)keyObjectInspector;
List<? extends StructField> keyFields = keyStructObjectInspector.getAllStructFieldRefs();
// Tez doesn't use tagging...
if (reduceWork.getNeedsTagging()) {
return false;
}
// Check value ObjectInspector.
ObjectInspector valueObjectInspector = reduceWork.getValueObjectInspector();
if (valueObjectInspector == null ||
!(valueObjectInspector instanceof StructObjectInspector)) {
return false;
}
StructObjectInspector valueStructObjectInspector = (StructObjectInspector)valueObjectInspector;
List<? extends StructField> valueFields = valueStructObjectInspector.getAllStructFieldRefs();
reduceColumnNames = new ArrayList<String>();
reduceTypeInfos = new ArrayList<TypeInfo>();
for (StructField field: keyFields) {
reduceColumnNames.add(Utilities.ReduceField.KEY.toString() + "." + field.getFieldName());
reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName()));
}
for (StructField field: valueFields) {
reduceColumnNames.add(Utilities.ReduceField.VALUE.toString() + "." + field.getFieldName());
reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName()));
}
} catch (Exception e) {
throw new SemanticException(e);
}
return true;
}
private void addReduceWorkRules(Map<Rule, NodeProcessor> opRules, NodeProcessor np) {
opRules.put(new RuleRegExp("R1", GroupByOperator.getOperatorName() + ".*"), np);
opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + ".*"), np);
}
private boolean validateReduceWork(ReduceWork reduceWork) throws SemanticException {
LOG.info("Validating ReduceWork...");
// Validate input to ReduceWork.
if (!getOnlyStructObjectInspectors(reduceWork)) {
return false;
}
// Now check the reduce operator tree.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
ReduceWorkValidationNodeProcessor vnp = new ReduceWorkValidationNodeProcessor();
addReduceWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new DefaultGraphWalker(disp);
// iterate over the reduce operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.add(reduceWork.getReducer());
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
for (Node n : nodeOutput.keySet()) {
if (nodeOutput.get(n) != null) {
if (!((Boolean)nodeOutput.get(n)).booleanValue()) {
return false;
}
}
}
return true;
}
private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez) throws SemanticException {
LOG.info("Vectorizing ReduceWork...");
reduceWork.setVectorMode(true);
// For some reason, the DefaultGraphWalker does not descend down from the reducer Operator as
// expected. We need to descend down, otherwise it breaks our algorithm that determines
// VectorizationContext... So we use PreOrderWalker instead of DefaultGraphWalker.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
ReduceWorkVectorizationNodeProcessor vnp =
new ReduceWorkVectorizationNodeProcessor(reduceColumnNames, reduceTypeInfos, isTez);
addReduceWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new PreOrderWalker(disp);
// iterate over the reduce operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.add(reduceWork.getReducer());
LOG.info("vectorizeReduceWork reducer Operator: " +
reduceWork.getReducer().getName() + "...");
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
// Necessary since we are vectorizing the root operator in reduce.
reduceWork.setReducer(vnp.getRootVectorOp());
reduceWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap());
reduceWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap());
reduceWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap());
if (LOG.isDebugEnabled()) {
debugDisplayAllMaps(reduceWork);
}
}
}
class MapWorkValidationNodeProcessor implements NodeProcessor {
private final MapWork mapWork;
private final boolean isTez;
public MapWorkValidationNodeProcessor(MapWork mapWork, boolean isTez) {
this.mapWork = mapWork;
this.isTez = isTez;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
for (Node n : stack) {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) n;
if (nonVectorizableChildOfGroupBy(op)) {
return new Boolean(true);
}
boolean ret = validateMapWorkOperator(op, mapWork, isTez);
if (!ret) {
LOG.info("MapWork Operator: " + op.getName() + " could not be vectorized.");
return new Boolean(false);
}
}
return new Boolean(true);
}
}
class ReduceWorkValidationNodeProcessor implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
for (Node n : stack) {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) n;
if (nonVectorizableChildOfGroupBy(op)) {
return new Boolean(true);
}
boolean ret = validateReduceWorkOperator(op);
if (!ret) {
LOG.info("ReduceWork Operator: " + op.getName() + " could not be vectorized.");
return new Boolean(false);
}
}
return new Boolean(true);
}
}
// This class has common code used by both MapWorkVectorizationNodeProcessor and
// ReduceWorkVectorizationNodeProcessor.
class VectorizationNodeProcessor implements NodeProcessor {
// The vectorization context for the Map or Reduce task.
protected VectorizationContext taskVectorizationContext;
// The input projection column type name map for the Map or Reduce task.
protected Map<Integer, String> taskColumnTypeNameMap;
VectorizationNodeProcessor() {
taskColumnTypeNameMap = new HashMap<Integer, String>();
}
public Map<String, Integer> getVectorColumnNameMap() {
return taskVectorizationContext.getProjectionColumnMap();
}
public Map<Integer, String> getVectorColumnTypeMap() {
return taskColumnTypeNameMap;
}
public Map<Integer, String> getVectorScratchColumnTypeMap() {
return taskVectorizationContext.getScratchColumnTypeMap();
}
protected final Set<Operator<? extends OperatorDesc>> opsDone =
new HashSet<Operator<? extends OperatorDesc>>();
protected final Map<Operator<? extends OperatorDesc>, Operator<? extends OperatorDesc>> opToVectorOpMap =
new HashMap<Operator<? extends OperatorDesc>, Operator<? extends OperatorDesc>>();
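// Finds the vectorization context for an operator by walking up its operator
// stack: the nearest already-vectorized ancestor that is a
// VectorizationContextRegion supplies a new context; otherwise the caller
// falls back to the task-level context.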
public VectorizationContext walkStackToFindVectorizationContext(Stack<Node> stack,
Operator<? extends OperatorDesc> op) throws SemanticException {
VectorizationContext vContext = null;
if (stack.size() <= 1) {
throw new SemanticException(
String.format("Expected operator stack for operator %s to have at least 2 operators",
op.getName()));
}
// Walk down the stack of operators until we find one willing to give us a context.
// At the bottom will be the root operator, which is guaranteed to have a context.
int i = stack.size() - 2;
while (vContext == null) {
if (i < 0) {
return null;
}
Operator<? extends OperatorDesc> opParent = (Operator<? extends OperatorDesc>) stack.get(i);
Operator<? extends OperatorDesc> vectorOpParent = opToVectorOpMap.get(opParent);
if (vectorOpParent != null) {
if (vectorOpParent instanceof VectorizationContextRegion) {
VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOpParent;
vContext = vcRegion.getOuputVectorizationContext();
LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " has new vectorization context " + vContext.toString());
} else {
LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " does not have new vectorization context");
}
} else {
LOG.info("walkStackToFindVectorizationContext " + opParent.getName() + " is not vectorized");
}
--i;
}
return vContext;
}
public Operator<? extends OperatorDesc> doVectorize(Operator<? extends OperatorDesc> op,
VectorizationContext vContext, boolean isTez) throws SemanticException {
Operator<? extends OperatorDesc> vectorOp = op;
try {
if (!opsDone.contains(op)) {
vectorOp = vectorizeOperator(op, vContext, isTez);
opsDone.add(op);
if (vectorOp != op) {
opToVectorOpMap.put(op, vectorOp);
opsDone.add(vectorOp);
}
}
} catch (HiveException e) {
throw new SemanticException(e);
}
return vectorOp;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
throw new SemanticException("Must be overridden");
}
}
class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor {
private final MapWork mWork;
private final boolean isTez;
public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTez) {
super();
this.mWork = mWork;
this.isTez = isTez;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd;
VectorizationContext vContext = null;
if (op instanceof TableScanOperator) {
if (taskVectorizationContext == null) {
taskVectorizationContext = getVectorizationContext(op.getSchema(), op.getName(),
taskColumnTypeNameMap);
}
vContext = taskVectorizationContext;
} else {
LOG.info("MapWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName());
vContext = walkStackToFindVectorizationContext(stack, op);
if (vContext == null) {
// No operator has "pushed" a new context -- so use the task vectorization context.
vContext = taskVectorizationContext;
}
}
assert vContext != null;
LOG.info("MapWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString());
// When Vectorized GROUPBY outputs rows instead of vectorized row batchs, we don't
// vectorize the operators below it.
if (nonVectorizableChildOfGroupBy(op)) {
// No need to vectorize
if (!opsDone.contains(op)) {
opsDone.add(op);
}
return null;
}
Operator<? extends OperatorDesc> vectorOp = doVectorize(op, vContext, isTez);
if (LOG.isDebugEnabled()) {
if (vectorOp instanceof VectorizationContextRegion) {
VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp;
VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext();
LOG.debug("Vectorized MapWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString());
}
}
return null;
}
}
class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor {
private final List<String> reduceColumnNames;
private final List<TypeInfo> reduceTypeInfos;
private boolean isTez;
private Operator<? extends OperatorDesc> rootVectorOp;
public Operator<? extends OperatorDesc> getRootVectorOp() {
return rootVectorOp;
}
public ReduceWorkVectorizationNodeProcessor(List<String> reduceColumnNames,
List<TypeInfo> reduceTypeInfos, boolean isTez) {
super();
this.reduceColumnNames = reduceColumnNames;
this.reduceTypeInfos = reduceTypeInfos;
rootVectorOp = null;
this.isTez = isTez;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd;
VectorizationContext vContext = null;
boolean saveRootVectorOp = false;
if (op.getParentOperators().size() == 0) {
LOG.info("ReduceWorkVectorizationNodeProcessor process reduceColumnNames " + reduceColumnNames.toString());
vContext = new VectorizationContext("__Reduce_Shuffle__", reduceColumnNames);
taskVectorizationContext = vContext;
int i = 0;
for (TypeInfo typeInfo : reduceTypeInfos) {
taskColumnTypeNameMap.put(i, typeInfo.getTypeName());
i++;
}
saveRootVectorOp = true;
if (LOG.isDebugEnabled()) {
LOG.debug("Vectorized ReduceWork reduce shuffle vectorization context " + vContext.toString());
}
} else {
LOG.info("ReduceWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName());
vContext = walkStackToFindVectorizationContext(stack, op);
if (vContext == null) {
// If we didn't find a context among the operators, assume the top -- reduce shuffle's
// vectorization context.
vContext = taskVectorizationContext;
}
}
assert vContext != null;
LOG.info("ReduceWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString());
// When Vectorized GROUPBY outputs rows instead of vectorized row batchs, we don't
// vectorize the operators below it.
if (nonVectorizableChildOfGroupBy(op)) {
// No need to vectorize
if (!opsDone.contains(op)) {
opsDone.add(op);
}
return null;
}
Operator<? extends OperatorDesc> vectorOp = doVectorize(op, vContext, isTez);
if (LOG.isDebugEnabled()) {
if (vectorOp instanceof VectorizationContextRegion) {
VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp;
VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext();
LOG.debug("Vectorized ReduceWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString());
}
}
if (vectorOp instanceof VectorGroupByOperator) {
VectorGroupByOperator groupBy = (VectorGroupByOperator) vectorOp;
VectorGroupByDesc vectorDesc = groupBy.getConf().getVectorDesc();
vectorDesc.setVectorGroupBatches(true);
}
if (saveRootVectorOp && op != vectorOp) {
rootVectorOp = vectorOp;
}
return null;
}
}
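// A throwaway context used only to test whether an expression or aggregate
// can be vectorized. Column indices are irrelevant for that test, so both
// lookups just return 0.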
private static class ValidatorVectorizationContext extends VectorizationContext {
private ValidatorVectorizationContext() {
super("No Name");
}
@Override
protected int getInputColumnIndex(String name) {
return 0;
}
@Override
protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
return 0;
}
}
@Override
public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException {
this.physicalContext = physicalContext;
hiveConf = physicalContext.getConf();
boolean vectorPath = HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED);
if (!vectorPath) {
LOG.info("Vectorization is disabled");
return physicalContext;
}
// create dispatcher and graph walker
Dispatcher disp = new VectorizationDispatcher(physicalContext);
TaskGraphWalker ogw = new TaskGraphWalker(disp);
// get all the tasks nodes from root task
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(physicalContext.getRootTasks());
// begin to walk through the task tree.
ogw.startWalking(topNodes, null);
return physicalContext;
}
boolean validateMapWorkOperator(Operator<? extends OperatorDesc> op, MapWork mWork, boolean isTez) {
boolean ret = false;
switch (op.getType()) {
case MAPJOIN:
if (op instanceof MapJoinOperator) {
ret = validateMapJoinOperator((MapJoinOperator) op);
} else if (op instanceof SMBMapJoinOperator) {
ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op);
}
break;
case GROUPBY:
ret = validateGroupByOperator((GroupByOperator) op, false, isTez);
break;
case FILTER:
ret = validateFilterOperator((FilterOperator) op);
break;
case SELECT:
ret = validateSelectOperator((SelectOperator) op);
break;
case REDUCESINK:
ret = validateReduceSinkOperator((ReduceSinkOperator) op);
break;
case TABLESCAN:
ret = validateTableScanOperator((TableScanOperator) op, mWork);
break;
case FILESINK:
case LIMIT:
case EVENT:
ret = true;
break;
default:
ret = false;
break;
}
return ret;
}
boolean validateReduceWorkOperator(Operator<? extends OperatorDesc> op) {
boolean ret = false;
switch (op.getType()) {
case MAPJOIN:
// Does MAPJOIN actually get planned in Reduce?
if (op instanceof MapJoinOperator) {
ret = validateMapJoinOperator((MapJoinOperator) op);
} else if (op instanceof SMBMapJoinOperator) {
ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op);
}
break;
case GROUPBY:
if (HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_GROUPBY_ENABLED)) {
ret = validateGroupByOperator((GroupByOperator) op, true, true);
} else {
ret = false;
}
break;
case FILTER:
ret = validateFilterOperator((FilterOperator) op);
break;
case SELECT:
ret = validateSelectOperator((SelectOperator) op);
break;
case REDUCESINK:
ret = validateReduceSinkOperator((ReduceSinkOperator) op);
break;
case FILESINK:
ret = validateFileSinkOperator((FileSinkOperator) op);
break;
case LIMIT:
case EVENT:
ret = true;
break;
default:
ret = false;
break;
}
return ret;
}
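// Walks up the (assumed single-parent) chain looking for a GROUP BY ancestor.
// If that GROUP BY emits rows rather than vectorized batches, the operators
// below it are left untouched.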
public Boolean nonVectorizableChildOfGroupBy(Operator<? extends OperatorDesc> op) {
Operator<? extends OperatorDesc> currentOp = op;
while (currentOp.getParentOperators().size() > 0) {
currentOp = currentOp.getParentOperators().get(0);
if (currentOp.getType().equals(OperatorType.GROUPBY)) {
GroupByDesc desc = (GroupByDesc)currentOp.getConf();
boolean isVectorOutput = desc.getVectorDesc().isVectorOutput();
if (isVectorOutput) {
// This GROUP BY does vectorize its output.
return false;
}
return true;
}
}
return false;
}
private boolean validateSMBMapJoinOperator(SMBMapJoinOperator op) {
SMBJoinDesc desc = op.getConf();
// Validation is the same as for map join, since the 'small' tables are not vectorized
return validateMapJoinDesc(desc);
}
private boolean validateTableScanOperator(TableScanOperator op, MapWork mWork) {
TableScanDesc desc = op.getConf();
if (desc.isGatherStats()) {
return false;
}
String columns = "";
String types = "";
String partitionColumns = "";
String partitionTypes = "";
boolean haveInfo = false;
// This over-reaches slightly, since we can have > 1 table scan per map work.
// To be accurate, it would need to go from path to partition and path to alias, and then
// check that the alias matches this table scan. That is a TODO item, to be fixed when we
// support > 1 TableScan per vectorized pipeline later.
LinkedHashMap<String, PartitionDesc> partitionDescs = mWork.getPathToPartitionInfo();
// For vectorization, compare each partition's information against the others.
// We assume the table information will be from one of the partitions, so it will
// work to focus on the partition information and not compare against the TableScanOperator
// columns (in the VectorizationContext)....
for (Map.Entry<String, PartitionDesc> entry : partitionDescs.entrySet()) {
PartitionDesc partDesc = entry.getValue();
if (partDesc.getPartSpec() == null || partDesc.getPartSpec().isEmpty()) {
// No partition information -- we match because we would default to using the table description.
continue;
}
Properties partProps = partDesc.getProperties();
if (!haveInfo) {
columns = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMNS);
types = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES);
partitionColumns = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
partitionTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
haveInfo = true;
} else {
String nextColumns = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMNS);
String nextTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_COLUMN_TYPES);
String nextPartitionColumns = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
String nextPartitionTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
if (!columns.equalsIgnoreCase(nextColumns)) {
LOG.info(
String.format("Could not vectorize partition %s. Its column names %s do not match the other column names %s",
entry.getKey(), nextColumns, columns));
return false;
}
if (!types.equalsIgnoreCase(nextTypes)) {
LOG.info(
String.format("Could not vectorize partition %s. Its column types %s do not match the other column types %s",
entry.getKey(), nextTypes, types));
return false;
}
if (!partitionColumns.equalsIgnoreCase(nextPartitionColumns)) {
LOG.info(
String.format("Could not vectorize partition %s. Its partition column names %s do not match the other partition column names %s",
entry.getKey(), nextPartitionColumns, partitionColumns));
return false;
}
if (!partitionTypes.equalsIgnoreCase(nextPartitionTypes)) {
LOG.info(
String.format("Could not vectorize partition %s. Its partition column types %s do not match the other partition column types %s",
entry.getKey(), nextPartitionTypes, partitionTypes));
return false;
}
}
}
return true;
}
private boolean validateMapJoinOperator(MapJoinOperator op) {
MapJoinDesc desc = op.getConf();
return validateMapJoinDesc(desc);
}
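// Map join validation only inspects the big-table side: filters must be
// vectorizable in FILTER mode, keys and values in projection mode. The
// 'small' tables come from the hash table and are not vectorized.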
private boolean validateMapJoinDesc(MapJoinDesc desc) {
byte posBigTable = (byte) desc.getPosBigTable();
List<ExprNodeDesc> filterExprs = desc.getFilters().get(posBigTable);
if (!validateExprNodeDesc(filterExprs, VectorExpressionDescriptor.Mode.FILTER)) {
LOG.info("Cannot vectorize map work filter expression");
return false;
}
List<ExprNodeDesc> keyExprs = desc.getKeys().get(posBigTable);
if (!validateExprNodeDesc(keyExprs)) {
LOG.info("Cannot vectorize map work key expression");
return false;
}
List<ExprNodeDesc> valueExprs = desc.getExprs().get(posBigTable);
if (!validateExprNodeDesc(valueExprs)) {
LOG.info("Cannot vectorize map work value expression");
return false;
}
return true;
}
private boolean validateReduceSinkOperator(ReduceSinkOperator op) {
List<ExprNodeDesc> keyDescs = op.getConf().getKeyCols();
List<ExprNodeDesc> partitionDescs = op.getConf().getPartitionCols();
List<ExprNodeDesc> valueDesc = op.getConf().getValueCols();
return validateExprNodeDesc(keyDescs) && validateExprNodeDesc(partitionDescs) &&
validateExprNodeDesc(valueDesc);
}
private boolean validateSelectOperator(SelectOperator op) {
List<ExprNodeDesc> descList = op.getConf().getColList();
for (ExprNodeDesc desc : descList) {
boolean ret = validateExprNodeDesc(desc);
if (!ret) {
LOG.info("Cannot vectorize select expression: " + desc.toString());
return false;
}
}
return true;
}
private boolean validateFilterOperator(FilterOperator op) {
ExprNodeDesc desc = op.getConf().getPredicate();
return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.FILTER);
}
private boolean validateGroupByOperator(GroupByOperator op, boolean isReduce, boolean isTez) {
GroupByDesc desc = op.getConf();
VectorGroupByDesc vectorDesc = desc.getVectorDesc();
if (desc.isGroupingSetsPresent()) {
LOG.info("Grouping sets not supported in vector mode");
return false;
}
if (desc.pruneGroupingSetId()) {
LOG.info("Pruning grouping set id not supported in vector mode");
return false;
}
boolean ret = validateExprNodeDesc(desc.getKeys());
if (!ret) {
LOG.info("Cannot vectorize groupby key expression");
return false;
}
ret = validateAggregationDesc(desc.getAggregators(), isReduce);
if (!ret) {
LOG.info("Cannot vectorize groupby aggregate expression");
return false;
}
if (isReduce) {
if (desc.isDistinct()) {
LOG.info("Distinct not supported in reduce vector mode");
return false;
}
// Sort-based GroupBy?
if (desc.getMode() != GroupByDesc.Mode.COMPLETE &&
desc.getMode() != GroupByDesc.Mode.PARTIAL1 &&
desc.getMode() != GroupByDesc.Mode.PARTIAL2 &&
desc.getMode() != GroupByDesc.Mode.MERGEPARTIAL) {
LOG.info("Reduce vector mode not supported when input for GROUP BY not sorted");
return false;
}
LOG.info("Reduce GROUP BY mode is " + desc.getMode().name());
if (!aggregatorsOutputIsPrimitive(desc.getAggregators(), isReduce)) {
LOG.info("Reduce vector mode only supported when aggregate outputs are primitive types");
return false;
}
if (desc.getKeys().size() > 0) {
if (op.getParentOperators().size() > 0) {
LOG.info("Reduce vector mode can only handle a key group GROUP BY operator when it is fed by reduce-shuffle");
return false;
}
LOG.info("Reduce-side GROUP BY will process key groups");
vectorDesc.setVectorGroupBatches(true);
} else {
LOG.info("Reduce-side GROUP BY will do global aggregation");
}
vectorDesc.setVectorOutput(true);
vectorDesc.setIsReduce(true);
}
return true;
}
private boolean validateFileSinkOperator(FileSinkOperator op) {
return true;
}
private boolean validateExprNodeDesc(List<ExprNodeDesc> descs) {
return validateExprNodeDesc(descs, VectorExpressionDescriptor.Mode.PROJECTION);
}
private boolean validateExprNodeDesc(List<ExprNodeDesc> descs,
VectorExpressionDescriptor.Mode mode) {
for (ExprNodeDesc d : descs) {
boolean ret = validateExprNodeDesc(d, mode);
if (!ret) {
return false;
}
}
return true;
}
private boolean validateAggregationDesc(List<AggregationDesc> descs, boolean isReduce) {
for (AggregationDesc d : descs) {
boolean ret = validateAggregationDesc(d, isReduce);
if (!ret) {
return false;
}
}
return true;
}
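// Recursive expression check: virtual columns are rejected, every node's type
// must match the supported-types pattern, and GenericUDFs must be on the
// supported list. Child expressions always use the looser FILTER mode.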
private boolean validateExprNodeDescRecursive(ExprNodeDesc desc, VectorExpressionDescriptor.Mode mode) {
if (desc instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc c = (ExprNodeColumnDesc) desc;
// Currently, we do not support vectorized virtual columns (see HIVE-5570).
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(c.getColumn())) {
LOG.info("Cannot vectorize virtual column " + c.getColumn());
return false;
}
}
String typeName = desc.getTypeInfo().getTypeName();
boolean ret = validateDataType(typeName, mode);
if (!ret) {
LOG.info("Cannot vectorize " + desc.toString() + " of type " + typeName);
return false;
}
if (desc instanceof ExprNodeGenericFuncDesc) {
ExprNodeGenericFuncDesc d = (ExprNodeGenericFuncDesc) desc;
boolean r = validateGenericUdf(d);
if (!r) {
return false;
}
}
if (desc.getChildren() != null) {
for (ExprNodeDesc d: desc.getChildren()) {
// Don't restrict child expressions for projection. Always use looser FILTER mode.
boolean r = validateExprNodeDescRecursive(d, VectorExpressionDescriptor.Mode.FILTER);
if (!r) {
return false;
}
}
}
return true;
}
private boolean validateExprNodeDesc(ExprNodeDesc desc) {
return validateExprNodeDesc(desc, VectorExpressionDescriptor.Mode.PROJECTION);
}
boolean validateExprNodeDesc(ExprNodeDesc desc, VectorExpressionDescriptor.Mode mode) {
if (!validateExprNodeDescRecursive(desc, mode)) {
return false;
}
try {
VectorizationContext vc = new ValidatorVectorizationContext();
if (vc.getVectorExpression(desc, mode) == null) {
// TODO: this cannot happen - VectorizationContext throws in such cases.
LOG.info("getVectorExpression returned null");
return false;
}
} catch (Exception e) {
LOG.info("Failed to vectorize", e);
return false;
}
return true;
}
private boolean validateGenericUdf(ExprNodeGenericFuncDesc genericUDFExpr) {
if (VectorizationContext.isCustomUDF(genericUDFExpr)) {
return true;
}
GenericUDF genericUDF = genericUDFExpr.getGenericUDF();
if (genericUDF instanceof GenericUDFBridge) {
Class<? extends UDF> udf = ((GenericUDFBridge) genericUDF).getUdfClass();
return supportedGenericUDFs.contains(udf);
} else {
return supportedGenericUDFs.contains(genericUDF.getClass());
}
}
private boolean validateAggregationDesc(AggregationDesc aggDesc, boolean isReduce) {
String udfName = aggDesc.getGenericUDAFName().toLowerCase();
if (!supportedAggregationUdfs.contains(udfName)) {
LOG.info("Cannot vectorize groupby aggregate expression: UDF " + udfName + " not supported");
return false;
}
if (aggDesc.getParameters() != null && !validateExprNodeDesc(aggDesc.getParameters())) {
LOG.info("Cannot vectorize groupby aggregate expression: UDF parameters not supported");
return false;
}
// See if we can vectorize the aggregation.
try {
VectorizationContext vc = new ValidatorVectorizationContext();
if (vc.getAggregatorExpression(aggDesc, isReduce) == null) {
// TODO: this cannot happen - VectorizationContext throws in such cases.
LOG.info("getAggregatorExpression returned null");
return false;
}
} catch (Exception e) {
LOG.info("Failed to vectorize", e);
return false;
}
return true;
}
private boolean aggregatorsOutputIsPrimitive(List<AggregationDesc> descs, boolean isReduce) {
for (AggregationDesc d : descs) {
boolean ret = aggregatorsOutputIsPrimitive(d, isReduce);
if (!ret) {
return false;
}
}
return true;
}
private boolean aggregatorsOutputIsPrimitive(AggregationDesc aggDesc, boolean isReduce) {
VectorizationContext vc = new ValidatorVectorizationContext();
VectorAggregateExpression vectorAggrExpr;
try {
vectorAggrExpr = vc.getAggregatorExpression(aggDesc, isReduce);
} catch (Exception e) {
// We should have already attempted to vectorize in validateAggregationDesc.
LOG.info("Vectorization of aggreation should have succeeded ", e);
return false;
}
ObjectInspector outputObjInspector = vectorAggrExpr.getOutputObjectInspector();
if (outputObjInspector.getCategory() == ObjectInspector.Category.PRIMITIVE) {
return true;
}
return false;
}
private boolean validateDataType(String type, VectorExpressionDescriptor.Mode mode) {
type = type.toLowerCase();
boolean result = supportedDataTypesPattern.matcher(type).matches();
if (result && mode == VectorExpressionDescriptor.Mode.PROJECTION && type.equals("void")) {
return false;
}
return result;
}
private VectorizationContext getVectorizationContext(RowSchema rowSchema, String contextName,
Map<Integer, String> typeNameMap) {
VectorizationContext vContext = new VectorizationContext(contextName);
// Add all non-virtual columns to make a vectorization context for
// the TableScan operator.
int i = 0;
for (ColumnInfo c : rowSchema.getSignature()) {
// Earlier, validation code should have eliminated virtual columns usage (HIVE-5560).
if (!isVirtualColumn(c)) {
vContext.addInitialColumn(c.getInternalName());
typeNameMap.put(i, c.getTypeName());
i++;
}
}
vContext.finishedAddingInitialColumns();
return vContext;
}
private void fixupParentChildOperators(Operator<? extends OperatorDesc> op,
Operator<? extends OperatorDesc> vectorOp) {
if (op.getParentOperators() != null) {
vectorOp.setParentOperators(op.getParentOperators());
for (Operator<? extends OperatorDesc> p : op.getParentOperators()) {
p.replaceChild(op, vectorOp);
}
}
if (op.getChildOperators() != null) {
vectorOp.setChildOperators(op.getChildOperators());
for (Operator<? extends OperatorDesc> c : op.getChildOperators()) {
c.replaceParent(op, vectorOp);
}
}
}
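// Returns true when the join result contains only big-table columns, i.e. no
// small-table values need to be deserialized. This is what lets an inner join
// be specialized to the "inner big-only" operators backed by a hash multi-set
// instead of a hash map.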
private boolean isBigTableOnlyResults(MapJoinDesc desc) {
Byte[] order = desc.getTagOrder();
byte posBigTable = (byte) desc.getPosBigTable();
Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]);
int[] smallTableIndices;
int smallTableIndicesSize;
List<ExprNodeDesc> smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable);
if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) {
smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable);
LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices " + Arrays.toString(smallTableIndices));
smallTableIndicesSize = smallTableIndices.length;
} else {
smallTableIndices = null;
LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices EMPTY");
smallTableIndicesSize = 0;
}
List<Integer> smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable);
LOG.info("Vectorizer isBigTableOnlyResults smallTableRetainList " + smallTableRetainList);
int smallTableRetainSize = smallTableRetainList.size();
if (smallTableIndicesSize > 0) {
// Small table indices has priority over retain.
for (int i = 0; i < smallTableIndicesSize; i++) {
if (smallTableIndices[i] < 0) {
// Negative numbers indicate a column to be deserialized and read from the small table's
// LazyBinary value row.
LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices[i] < 0 returning false");
return false;
}
}
} else if (smallTableRetainSize > 0) {
LOG.info("Vectorizer isBigTableOnlyResults smallTableRetainSize > 0 returning false");
return false;
}
LOG.info("Vectorizer isBigTableOnlyResults returning true");
return true;
}
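// Picks the specialized native VectorMapJoin* operator class along three axes:
// hash table implementation (FAST vs. OPTIMIZED), join type (inner, inner
// big-only, left semi, or outer), and key type (long family, string, or
// multi-key). The choices are recorded in the VectorMapJoinDesc.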
Operator<? extends OperatorDesc> specializeMapJoinOperator(Operator<? extends OperatorDesc> op,
VectorizationContext vContext, MapJoinDesc desc) throws HiveException {
Operator<? extends OperatorDesc> vectorOp = null;
Class<? extends Operator<?>> opClass = null;
boolean isOuterJoin = !desc.getNoOuterJoin();
VectorMapJoinDesc.HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE;
VectorMapJoinDesc.HashTableKind hashTableKind = HashTableKind.NONE;
VectorMapJoinDesc.HashTableKeyType hashTableKeyType = HashTableKeyType.NONE;
if (HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) {
hashTableImplementationType = HashTableImplementationType.FAST;
} else {
// Restrict to using BytesBytesMultiHashMap via MapJoinBytesTableContainer or
// HybridHashTableContainer.
hashTableImplementationType = HashTableImplementationType.OPTIMIZED;
}
int joinType = desc.getConds()[0].getType();
boolean isInnerBigOnly = false;
if (joinType == JoinDesc.INNER_JOIN && isBigTableOnlyResults(desc)) {
isInnerBigOnly = true;
}
// By default, we can always use the multi-key class.
hashTableKeyType = HashTableKeyType.MULTI_KEY;
if (!HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MULTIKEY_ONLY_ENABLED)) {
// Look for single column optimization.
byte posBigTable = (byte) desc.getPosBigTable();
Map<Byte, List<ExprNodeDesc>> keyExprs = desc.getKeys();
List<ExprNodeDesc> bigTableKeyExprs = keyExprs.get(posBigTable);
if (bigTableKeyExprs.size() == 1) {
String typeName = bigTableKeyExprs.get(0).getTypeString();
LOG.info("Vectorizer vectorizeOperator map join typeName " + typeName);
if (typeName.equals("boolean")) {
hashTableKeyType = HashTableKeyType.BOOLEAN;
} else if (typeName.equals("tinyint")) {
hashTableKeyType = HashTableKeyType.BYTE;
} else if (typeName.equals("smallint")) {
hashTableKeyType = HashTableKeyType.SHORT;
} else if (typeName.equals("int")) {
hashTableKeyType = HashTableKeyType.INT;
} else if (typeName.equals("bigint") || typeName.equals("long")) {
hashTableKeyType = HashTableKeyType.LONG;
} else if (VectorizationContext.isStringFamily(typeName)) {
hashTableKeyType = HashTableKeyType.STRING;
}
}
}
switch (joinType) {
case JoinDesc.INNER_JOIN:
if (!isInnerBigOnly) {
hashTableKind = HashTableKind.HASH_MAP;
} else {
hashTableKind = HashTableKind.HASH_MULTISET;
}
break;
case JoinDesc.LEFT_OUTER_JOIN:
case JoinDesc.RIGHT_OUTER_JOIN:
hashTableKind = HashTableKind.HASH_MAP;
break;
case JoinDesc.LEFT_SEMI_JOIN:
hashTableKind = HashTableKind.HASH_SET;
break;
default:
throw new HiveException("Unknown join type " + joinType);
}
LOG.info("Vectorizer vectorizeOperator map join hashTableKind " + hashTableKind.name() + " hashTableKeyType " + hashTableKeyType.name());
switch (hashTableKeyType) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
switch (joinType) {
case JoinDesc.INNER_JOIN:
if (!isInnerBigOnly) {
opClass = VectorMapJoinInnerLongOperator.class;
} else {
opClass = VectorMapJoinInnerBigOnlyLongOperator.class;
}
break;
case JoinDesc.LEFT_OUTER_JOIN:
case JoinDesc.RIGHT_OUTER_JOIN:
opClass = VectorMapJoinOuterLongOperator.class;
break;
case JoinDesc.LEFT_SEMI_JOIN:
opClass = VectorMapJoinLeftSemiLongOperator.class;
break;
default:
throw new HiveException("Unknown join type " + joinType);
}
break;
case STRING:
switch (joinType) {
case JoinDesc.INNER_JOIN:
if (!isInnerBigOnly) {
opClass = VectorMapJoinInnerStringOperator.class;
} else {
opClass = VectorMapJoinInnerBigOnlyStringOperator.class;
}
break;
case JoinDesc.LEFT_OUTER_JOIN:
case JoinDesc.RIGHT_OUTER_JOIN:
opClass = VectorMapJoinOuterStringOperator.class;
break;
case JoinDesc.LEFT_SEMI_JOIN:
opClass = VectorMapJoinLeftSemiStringOperator.class;
break;
default:
throw new HiveException("Unknown join type " + joinType);
}
break;
case MULTI_KEY:
switch (joinType) {
case JoinDesc.INNER_JOIN:
if (!isInnerBigOnly) {
opClass = VectorMapJoinInnerMultiKeyOperator.class;
} else {
opClass = VectorMapJoinInnerBigOnlyMultiKeyOperator.class;
}
break;
case JoinDesc.LEFT_OUTER_JOIN:
case JoinDesc.RIGHT_OUTER_JOIN:
opClass = VectorMapJoinOuterMultiKeyOperator.class;
break;
case JoinDesc.LEFT_SEMI_JOIN:
opClass = VectorMapJoinLeftSemiMultiKeyOperator.class;
break;
default:
throw new HiveException("Unknown join type " + joinType);
}
break;
}
vectorOp = OperatorFactory.getVectorOperator(opClass, op.getConf(), vContext);
LOG.info("Vectorizer vectorizeOperator map join class " + vectorOp.getClass().getSimpleName());
boolean minMaxEnabled = HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MINMAX_ENABLED);
VectorMapJoinDesc vectorDesc = desc.getVectorDesc();
vectorDesc.setHashTableImplementationType(hashTableImplementationType);
vectorDesc.setHashTableKind(hashTableKind);
vectorDesc.setHashTableKeyType(hashTableKeyType);
vectorDesc.setMinMaxEnabled(minMaxEnabled);
return vectorOp;
}
private boolean onExpressionHasNullSafes(MapJoinDesc desc) {
boolean[] nullSafes = desc.getNullSafes();
for (boolean nullSafe : nullSafes) {
if (nullSafe) {
return true;
}
}
return false;
}
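// Gatekeeper for the native vector map join: requires the native flag, a
// MapJoinOperator under Tez, a single (non-N-way) join condition, and no
// null-safe key comparisons; further restrictions depend on which hash table
// implementation is configured.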
private boolean canSpecializeMapJoin(Operator<? extends OperatorDesc> op, MapJoinDesc desc,
boolean isTez) {
boolean specialize = false;
if (op instanceof MapJoinOperator &&
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED)) {
// Currently, only under Tez and non-N-way joins.
if (isTez && desc.getConds().length == 1 && !onExpressionHasNullSafes(desc)) {
// Ok, all basic restrictions satisfied so far...
specialize = true;
if (!HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) {
// We are using the optimized hash table, so we have further
// restrictions (the optimized-table setting and the key types).
if (!HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE)) {
specialize = false;
} else {
byte posBigTable = (byte) desc.getPosBigTable();
Map<Byte, List<ExprNodeDesc>> keyExprs = desc.getKeys();
List<ExprNodeDesc> bigTableKeyExprs = keyExprs.get(posBigTable);
for (ExprNodeDesc exprNodeDesc : bigTableKeyExprs) {
String typeName = exprNodeDesc.getTypeString();
if (!MapJoinKey.isSupportedField(typeName)) {
specialize = false;
break;
}
}
}
} else {
// With the fast hash table implementation, we currently do not support
// Hybrid Grace Hash Join.
if (HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN)) {
specialize = false;
}
}
}
}
return specialize;
}
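// Replaces one operator with its vectorized counterpart (or returns the
// operator unchanged for types that pass through), re-linking parents and
// children whenever a new operator is produced.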
Operator<? extends OperatorDesc> vectorizeOperator(Operator<? extends OperatorDesc> op,
VectorizationContext vContext, boolean isTez) throws HiveException {
Operator<? extends OperatorDesc> vectorOp = null;
switch (op.getType()) {
case MAPJOIN:
{
MapJoinDesc desc = (MapJoinDesc) op.getConf();
boolean specialize = canSpecializeMapJoin(op, desc, isTez);
if (!specialize) {
Class<? extends Operator<?>> opClass = null;
if (op instanceof MapJoinOperator) {
// *NON-NATIVE* vector map differences for LEFT OUTER JOIN and Filtered...
List<ExprNodeDesc> bigTableFilters = desc.getFilters().get((byte) desc.getPosBigTable());
boolean isOuterAndFiltered = (!desc.isNoOuterJoin() && bigTableFilters.size() > 0);
if (!isOuterAndFiltered) {
opClass = VectorMapJoinOperator.class;
} else {
opClass = VectorMapJoinOuterFilteredOperator.class;
}
} else if (op instanceof SMBMapJoinOperator) {
opClass = VectorSMBMapJoinOperator.class;
}
vectorOp = OperatorFactory.getVectorOperator(opClass, op.getConf(), vContext);
} else {
// TEMPORARY Until Native Vector Map Join with Hybrid passes tests...
// HiveConf.setBoolVar(physicalContext.getConf(),
// HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN, false);
vectorOp = specializeMapJoinOperator(op, vContext, desc);
}
}
break;
case GROUPBY:
case FILTER:
case SELECT:
case FILESINK:
case REDUCESINK:
case LIMIT:
case EXTRACT:
case EVENT:
vectorOp = OperatorFactory.getVectorOperator(op.getConf(), vContext);
break;
default:
vectorOp = op;
break;
}
LOG.info("vectorizeOperator " + (vectorOp == null ? "NULL" : vectorOp.getClass().getName()));
LOG.info("vectorizeOperator " + (vectorOp == null || vectorOp.getConf() == null ? "NULL" : vectorOp.getConf().getClass().getName()));
if (vectorOp != op) {
fixupParentChildOperators(op, vectorOp);
((AbstractOperatorDesc) vectorOp.getConf()).setVectorMode(true);
}
return vectorOp;
}
private boolean isVirtualColumn(ColumnInfo column) {
// Not using method column.getIsVirtualCol() because partitioning columns are also
// treated as virtual columns in ColumnInfo.
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(column.getInternalName())) {
return true;
}
return false;
}
public void debugDisplayAllMaps(BaseWork work) {
Map<String, Integer> columnNameMap = work.getVectorColumnNameMap();
Map<Integer, String> columnTypeMap = work.getVectorColumnTypeMap();
Map<Integer, String> scratchColumnTypeMap = work.getVectorScratchColumnTypeMap();
LOG.debug("debugDisplayAllMaps columnNameMap " + columnNameMap.toString());
LOG.debug("debugDisplayAllMaps columnTypeMap " + columnTypeMap.toString());
LOG.debug("debugDisplayAllMaps scratchColumnTypeMap " + scratchColumnTypeMap.toString());
}
}