// org.apache.hadoop.hive.ql.parse.ColumnStatsAutoGatherContext Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.parse;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
/**
* ColumnStatsAutoGatherContext: This is passed to the compiler when set
* hive.stats.autogather=true during the INSERT OVERWRITE command.
*
**/
public class ColumnStatsAutoGatherContext {
public AnalyzeRewriteContext analyzeRewrite;
private final List loadFileWork = new ArrayList<>();
private final SemanticAnalyzer sa;
private final HiveConf conf;
private final Operator extends OperatorDesc> op;
private final List columns;
private final List partitionColumns;
private boolean isInsertInto;
private Table tbl;
private Map partSpec;
private Context origCtx;
public ColumnStatsAutoGatherContext(SemanticAnalyzer sa, HiveConf conf,
Operator extends OperatorDesc> op, Table tbl, Map partSpec,
boolean isInsertInto, Context ctx) throws SemanticException {
super();
this.sa = sa;
this.conf = conf;
this.op = op;
this.tbl = tbl;
this.partSpec = partSpec;
this.isInsertInto = isInsertInto;
this.origCtx = ctx;
columns = tbl.getCols();
partitionColumns = tbl.getPartCols();
}
public List getLoadFileWork() {
return loadFileWork;
}
public AnalyzeRewriteContext getAnalyzeRewrite() {
return analyzeRewrite;
}
public void setAnalyzeRewrite(AnalyzeRewriteContext analyzeRewrite) {
this.analyzeRewrite = analyzeRewrite;
}
public void insertAnalyzePipeline() throws SemanticException{
// 1. Generate the statement of analyze table [tablename] compute statistics for columns
// In non-partitioned table case, it will generate TS-SEL-GBY-RS-GBY-SEL-FS operator
// In static-partitioned table case, it will generate TS-FIL(partitionKey)-SEL-GBY(partitionKey)-RS-GBY-SEL-FS operator
// In dynamic-partitioned table case, it will generate TS-SEL-GBY(partitionKey)-RS-GBY-SEL-FS operator
// However, we do not need to specify the partition-spec because (1) the data is going to be inserted to that specific partition
// (2) we can compose the static/dynamic partition using a select operator in replaceSelectOperatorProcess..
String analyzeCommand = "analyze table `" + tbl.getDbName() + "`.`" + tbl.getTableName() + "`"
+ " compute statistics for columns ";
// 2. Based on the statement, generate the selectOperator
Operator> selOp = null;
try {
selOp = genSelOpForAnalyze(analyzeCommand, origCtx);
} catch (IOException | ParseException e) {
throw new SemanticException(e);
}
// 3. attach this SEL to the operator right before FS
op.getChildOperators().add(selOp);
selOp.getParentOperators().clear();
selOp.getParentOperators().add(op);
// 4. address the colExp, colList, etc for the SEL
try {
replaceSelectOperatorProcess((SelectOperator)selOp, op);
} catch (HiveException e) {
throw new SemanticException(e);
}
}
@SuppressWarnings("rawtypes")
private Operator genSelOpForAnalyze(String analyzeCommand, Context origCtx) throws IOException, ParseException, SemanticException{
//0. initialization
Context ctx = new Context(conf);
ctx.setExplainConfig(origCtx.getExplainConfig());
ASTNode tree = ParseUtils.parse(analyzeCommand, ctx);
//1. get the ColumnStatsSemanticAnalyzer
BaseSemanticAnalyzer baseSem = SemanticAnalyzerFactory.get(new QueryState(conf), tree);
ColumnStatsSemanticAnalyzer colSem = (ColumnStatsSemanticAnalyzer) baseSem;
//2. get the rewritten AST
ASTNode ast = colSem.rewriteAST(tree, this);
baseSem = SemanticAnalyzerFactory.get(new QueryState(conf), ast);
SemanticAnalyzer sem = (SemanticAnalyzer) baseSem;
QB qb = new QB(null, null, false);
ASTNode child = ast;
ParseContext subPCtx = ((SemanticAnalyzer) sem).getParseContext();
subPCtx.setContext(ctx);
((SemanticAnalyzer) sem).initParseCtx(subPCtx);
sem.doPhase1(child, qb, sem.initPhase1Ctx(), null);
// This will trigger new calls to metastore to collect metadata
// TODO: cache the information from the metastore
sem.getMetaData(qb);
Operator> operator = sem.genPlan(qb);
//3. populate the load file work so that ColumnStatsTask can work
loadFileWork.addAll(sem.getLoadFileWork());
//4. because there is only one TS for analyze statement, we can get it.
if (sem.topOps.values().size() != 1) {
throw new SemanticException(
"ColumnStatsAutoGatherContext is expecting exactly one TS, but finds "
+ sem.topOps.values().size());
}
operator = sem.topOps.values().iterator().next();
//5. get the first SEL after TS
while(!(operator instanceof SelectOperator)){
operator = operator.getChildOperators().get(0);
}
return operator;
}
/**
* @param operator : the select operator in the analyze statement
* @param input : the operator right before FS in the insert overwrite statement
* @throws HiveException
*/
private void replaceSelectOperatorProcess(SelectOperator operator, Operator extends OperatorDesc> input)
throws HiveException {
RowSchema selRS = operator.getSchema();
ArrayList signature = new ArrayList<>();
OpParseContext inputCtx = sa.opParseCtx.get(input);
RowResolver inputRR = inputCtx.getRowResolver();
ArrayList columns = inputRR.getColumnInfos();
ArrayList colList = new ArrayList();
ArrayList columnNames = new ArrayList();
Map columnExprMap =
new HashMap();
// the column positions in the operator should be like this
// <----non-partition columns---->|<--static partition columns-->|<--dynamic partition columns-->
// ExprNodeColumnDesc | ExprNodeConstantDesc | ExprNodeColumnDesc
// from input | generate itself | from input
// |
// 1. deal with non-partition columns
for (int i = 0; i < this.columns.size(); i++) {
ColumnInfo col = columns.get(i);
ExprNodeDesc exprNodeDesc = new ExprNodeColumnDesc(col);
colList.add(exprNodeDesc);
String internalName = selRS.getColumnNames().get(i);
columnNames.add(internalName);
columnExprMap.put(internalName, exprNodeDesc);
signature.add(selRS.getSignature().get(i));
}
// if there is any partition column (in static partition or dynamic
// partition or mixed case)
int dynamicPartBegin = -1;
for (int i = 0; i < partitionColumns.size(); i++) {
ExprNodeDesc exprNodeDesc = null;
String partColName = partitionColumns.get(i).getName();
// 2. deal with static partition columns
if (partSpec != null && partSpec.containsKey(partColName)
&& partSpec.get(partColName) != null) {
if (dynamicPartBegin > 0) {
throw new SemanticException(
"Dynamic partition columns should not come before static partition columns.");
}
exprNodeDesc = new ExprNodeConstantDesc(partSpec.get(partColName));
TypeInfo srcType = exprNodeDesc.getTypeInfo();
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
if (!srcType.equals(destType)) {
// This may be possible when srcType is string but destType is integer
exprNodeDesc = ParseUtils
.createConversionCast(exprNodeDesc, (PrimitiveTypeInfo) destType);
}
}
// 3. dynamic partition columns
else {
dynamicPartBegin++;
ColumnInfo col = columns.get(this.columns.size() + dynamicPartBegin);
TypeInfo srcType = col.getType();
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
exprNodeDesc = new ExprNodeColumnDesc(col);
if (!srcType.equals(destType)) {
exprNodeDesc = ParseUtils
.createConversionCast(exprNodeDesc, (PrimitiveTypeInfo) destType);
}
}
colList.add(exprNodeDesc);
String internalName = selRS.getColumnNames().get(this.columns.size() + i);
columnNames.add(internalName);
columnExprMap.put(internalName, exprNodeDesc);
signature.add(selRS.getSignature().get(this.columns.size() + i));
}
operator.setConf(new SelectDesc(colList, columnNames));
operator.setColumnExprMap(columnExprMap);
selRS.setSignature(signature);
operator.setSchema(selRS);
}
public String getCompleteName() {
return tbl.getDbName() + "." + tbl.getTableName();
}
public boolean isInsertInto() {
return isInsertInto;
}
public static boolean canRunAutogatherStats(Operator curr) {
// check the ObjectInspector
for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
if (cinfo.getIsVirtualCol()) {
return false;
} else if (cinfo.getObjectInspector().getCategory() != ObjectInspector.Category.PRIMITIVE) {
return false;
} else {
switch (((PrimitiveTypeInfo) cinfo.getType()).getPrimitiveCategory()) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case TIMESTAMP:
case FLOAT:
case DOUBLE:
case STRING:
case CHAR:
case VARCHAR:
case BINARY:
case DECIMAL:
// TODO: Support case DATE:
break;
default:
return false;
}
}
}
return true;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy