/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer;
import java.sql.Date;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
/** There is a set of queries which can be answered entirely from statistics stored in the
* metastore. Examples of such queries are count(*), count(a), max(a), min(b), etc. Hive
* already collects these basic statistics for query planning purposes, and the same
* statistics can also be used to answer the queries themselves.
*
* The optimizer looks at the query plan to determine whether it can answer the query using
* statistics, and then changes the plan to answer the query entirely from the statistics
* stored in the metastore.
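*
* For example, a query such as {@code SELECT count(1), max(a), min(b) FROM t} can be
* answered from the row count and the per-column min/max statistics in the metastore,
* without scanning the table; the plan is then replaced with a fetch task that returns the
* precomputed row.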
*/
public class StatsOptimizer extends Transform {
// TODO: [HIVE-6289] while getting stats from metastore, we currently only get one col at
// a time; this could be improved - get all necessary columns in advance, then use local.
// TODO: [HIVE-6292] aggregations could be done directly in metastore. Hive over MySQL!
private static final Logger Logger = LoggerFactory.getLogger(StatsOptimizer.class);
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery()
|| pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS()
|| pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()
// If getNameToSplitSample is not empty, at least one of the source
// tables is being sampled and we can not optimize.
|| !pctx.getNameToSplitSample().isEmpty()) {
return pctx;
}
String TS = TableScanOperator.getOperatorName() + "%";
String GBY = GroupByOperator.getOperatorName() + "%";
String RS = ReduceSinkOperator.getOperatorName() + "%";
String SEL = SelectOperator.getOperatorName() + "%";
String FS = FileSinkOperator.getOperatorName() + "%";
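// These rules match the only operator pipelines this optimizer rewrites: an exact
// TS-SEL-GBY-RS-GBY-SEL-FS chain (R1) and the same chain without the final SELECT (R2).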
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("R1", TS + SEL + GBY + RS + GBY + SEL + FS),
new MetaDataProcessor(pctx));
opRules.put(new RuleRegExp("R2", TS + SEL + GBY + RS + GBY + FS),
new MetaDataProcessor(pctx));
NodeProcessorCtx soProcCtx = new StatsOptimizerProcContext();
Dispatcher disp = new DefaultRuleDispatcher(null, opRules, soProcCtx);
GraphWalker ogw = new DefaultGraphWalker(disp);
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(pctx.getTopOps().values());
ogw.startWalking(topNodes, null);
return pctx;
}
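// Shared walk context: once one branch fails to qualify (e.g. it lacks column stats),
// stopProcess lets the remaining branches skip further processing.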
private static class StatsOptimizerProcContext implements NodeProcessorCtx {
boolean stopProcess = false;
}
private static class MetaDataProcessor implements NodeProcessor {
private final ParseContext pctx;
public MetaDataProcessor (ParseContext pctx) {
this.pctx = pctx;
}
enum StatType{
Integer,
Double,
String,
Boolean,
Binary,
Date,
Unsupported
}
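// Casts the long statistic stored in the metastore back to the column's integral type.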
enum LongSubType {
BIGINT { @Override
Object cast(long longValue) { return longValue; } },
INT { @Override
Object cast(long longValue) { return (int)longValue; } },
SMALLINT { @Override
Object cast(long longValue) { return (short)longValue; } },
TINYINT { @Override
Object cast(long longValue) { return (byte)longValue; } };
abstract Object cast(long longValue);
}
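// Casts the double statistic stored in the metastore back to the column's floating-point type.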
enum DoubleSubType {
DOUBLE { @Override
Object cast(double doubleValue) { return doubleValue; } },
FLOAT { @Override
Object cast(double doubleValue) { return (float) doubleValue; } };
abstract Object cast(double doubleValue);
}
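// Date statistics are stored as days since the epoch; DAYS converts them back to a
// java.sql.Date via DateWritable.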
enum DateSubType {
DAYS {@Override
Object cast(long longValue) { return (new DateWritable((int)longValue)).get();}
};
abstract Object cast(long longValue);
}
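// Classification of the group-by key: no key at all, all-constant keys, or anything else
// (which disqualifies the optimization).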
enum GbyKeyType {
NULL, CONSTANT, OTHER
}
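// Maps a Hive column type name to the statistics category handled by this optimizer.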
private StatType getType(String origType) {
if (serdeConstants.IntegralTypes.contains(origType)) {
return StatType.Integer;
} else if (origType.equals(serdeConstants.DOUBLE_TYPE_NAME) ||
origType.equals(serdeConstants.FLOAT_TYPE_NAME)) {
return StatType.Double;
} else if (origType.equals(serdeConstants.BINARY_TYPE_NAME)) {
return StatType.Binary;
} else if (origType.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
return StatType.Boolean;
} else if (origType.equals(serdeConstants.STRING_TYPE_NAME)) {
return StatType.String;
} else if (origType.equals(serdeConstants.DATE_TYPE_NAME)) {
return StatType.Date;
}
return StatType.Unsupported;
}
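// Returns the null count recorded in the column statistics, or null for unsupported types.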
private Long getNullcountFor(StatType type, ColumnStatisticsData statData) {
switch(type) {
case Integer :
return statData.getLongStats().getNumNulls();
case Double:
return statData.getDoubleStats().getNumNulls();
case String:
return statData.getStringStats().getNumNulls();
case Boolean:
return statData.getBooleanStats().getNumNulls();
case Binary:
return statData.getBinaryStats().getNumNulls();
case Date:
return statData.getDateStats().getNumNulls();
default:
return null;
}
}
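// Determines whether the group-by keys are absent, all constants, or something else.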
private GbyKeyType getGbyKeyType(GroupByOperator gbyOp) {
GroupByDesc gbyDesc = gbyOp.getConf();
int numCols = gbyDesc.getOutputColumnNames().size();
int aggCols = gbyDesc.getAggregators().size();
// If the Group by operator has null key
if (numCols == aggCols) {
return GbyKeyType.NULL;
}
// If the Gby key is a constant
List<String> dpCols = gbyOp.getSchema().getColumnNames().subList(0, numCols - aggCols);
for(String dpCol : dpCols) {
ExprNodeDesc end = ExprNodeDescUtils.findConstantExprOrigin(dpCol, gbyOp);
if (!(end instanceof ExprNodeConstantDesc)) {
return GbyKeyType.OTHER;
}
}
return GbyKeyType.CONSTANT;
}
@Override
public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
// 1. Do a few checks to determine whether the optimization is applicable.
// 2. Look at the ExprNodeGenericFuncDesc in the select list to see if it is min, max, count, etc.
//    If it is:
// 3. Connect to the metastore and get the stats.
// 4. Compose the result rows and add them to FetchWork.
// 5. Delete GBY - RS - GBY - SEL from the pipeline.
StatsOptimizerProcContext soProcCtx = (StatsOptimizerProcContext) procCtx;
// If the optimization has already been stopped, for example because a branch did not
// qualify or lacked stats data, we do not continue this process. For example, for the query
// select max(value) from src1 union all select max(value) from src2
// if it has been union-remove optimized, the operator tree becomes
// TS[0]->SEL[1]->GBY[2]->RS[3]->GBY[4]->FS[17]
// TS[6]->SEL[7]->GBY[8]->RS[9]->GBY[10]->FS[18]
// If the TS[0] branch for src1 is not optimized because src1 does not have column stats,
// there is no need to continue processing the TS[6] branch.
if (soProcCtx.stopProcess) {
return null;
}
boolean isOptimized = false;
try {
TableScanOperator tsOp = (TableScanOperator) stack.get(0);
if (tsOp.getNumParent() > 0) {
// looks like a subq plan.
return null;
}
if (tsOp.getConf().getRowLimit() != -1) {
// Table is sampled. In some situations we could still leverage the row
// limit, but to be safe we do not use it for now.
return null;
}
Table tbl = tsOp.getConf().getTableMetadata();
if (MetaStoreUtils.isExternalTable(tbl.getTTable())) {
Logger.info("Table " + tbl.getTableName() + " is external. Skip StatsOptimizer.");
return null;
}
if (AcidUtils.isTransactionalTable(tbl)) {
//todo: should this be OK for MM table?
Logger.info("Table " + tbl.getTableName() + " is ACID table. Skip StatsOptimizer.");
return null;
}
Long rowCnt = getRowCnt(pctx, tsOp, tbl);
// If we cannot get correct table stats, then neither the table stats nor the column stats are useful.
if (rowCnt == null) {
return null;
}
SelectOperator pselOp = (SelectOperator)stack.get(1);
for(ExprNodeDesc desc : pselOp.getConf().getColList()) {
if (!((desc instanceof ExprNodeColumnDesc) || (desc instanceof ExprNodeConstantDesc))) {
// Probably an expression, can't handle that
return null;
}
}
Map<String, ExprNodeDesc> exprMap = pselOp.getColumnExprMap();
// Since we have done an exact match on TS-SEL-GBY-RS-GBY-(SEL)-FS,
// we do not need any instanceof checks for the following.
GroupByOperator pgbyOp = (GroupByOperator)stack.get(2);
if (getGbyKeyType(pgbyOp) == GbyKeyType.OTHER) {
return null;
}
// We have already checked that rowCnt is not null; rowCnt == 0 means the table is
// empty.
else if (getGbyKeyType(pgbyOp) == GbyKeyType.CONSTANT && rowCnt == 0) {
return null;
}
ReduceSinkOperator rsOp = (ReduceSinkOperator)stack.get(3);
if (rsOp.getConf().getDistinctColumnIndices().size() > 0) {
// we can't handle distinct
return null;
}
GroupByOperator cgbyOp = (GroupByOperator)stack.get(4);
if (getGbyKeyType(cgbyOp) == GbyKeyType.OTHER) {
return null;
}
// We have already checked that rowCnt is not null; rowCnt == 0 means the table is
// empty.
else if (getGbyKeyType(cgbyOp) == GbyKeyType.CONSTANT && rowCnt == 0) {
return null;
}
Operator<?> last = (Operator<?>) stack.get(5);
SelectOperator cselOp = null;
Map<Integer, Object> posToConstant = new LinkedHashMap<>();
if (last instanceof SelectOperator) {
cselOp = (SelectOperator) last;
if (!cselOp.isIdentitySelect()) {
for (int pos = 0; pos < cselOp.getConf().getColList().size(); pos++) {
ExprNodeDesc desc = cselOp.getConf().getColList().get(pos);
if (desc instanceof ExprNodeConstantDesc) {
// Store the mapping from output position to constant value for later use.
posToConstant.put(pos, ((ExprNodeConstantDesc)desc).getValue());
} else {
if (!(desc instanceof ExprNodeColumnDesc)) {
// Probably an expression, can't handle that
return null;
}
}
}
}
last = (Operator<?>) stack.get(6);
} else {
// Add constants if there is no SELECT on top
GroupByDesc gbyDesc = cgbyOp.getConf();
int numCols = gbyDesc.getOutputColumnNames().size();
int aggCols = gbyDesc.getAggregators().size();
List<String> dpCols = cgbyOp.getSchema().getColumnNames().subList(0, numCols - aggCols);
for(int i = 0; i < dpCols.size(); i++) {
ExprNodeDesc end = ExprNodeDescUtils.findConstantExprOrigin(dpCols.get(i), cgbyOp);
assert end instanceof ExprNodeConstantDesc;
posToConstant.put(i, ((ExprNodeConstantDesc)end).getValue());
}
}
FileSinkOperator fsOp = (FileSinkOperator)last;
if (fsOp.getNumChild() > 0) {
// looks like a subq plan.
return null; // todo we can collapse this part of tree into single TS
}
List