org.apache.hadoop.hive.ql.parse.ColumnStatsAutoGatherContext Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-exec
There is a newer version: 4.0.1
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.parse;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

/**
 * ColumnStatsAutoGatherContext: This is passed to the compiler when set
 * hive.stats.autogather=true during the INSERT OVERWRITE command.
 *
 **/

public class ColumnStatsAutoGatherContext {

  public AnalyzeRewriteContext analyzeRewrite;
  private final List loadFileWork = new ArrayList<>();
  private final SemanticAnalyzer sa;
  private final HiveConf conf;
  private final Operator op;
  private final List columns;
  private final List partitionColumns;
  private boolean isInsertInto;
  private Table tbl;
  private Map partSpec;
  private Context origCtx;
  
  public ColumnStatsAutoGatherContext(SemanticAnalyzer sa, HiveConf conf,
      Operator op, Table tbl, Map partSpec,
      boolean isInsertInto, Context ctx) throws SemanticException {
    super();
    this.sa = sa;
    this.conf = conf;
    this.op = op;
    this.tbl = tbl;
    this.partSpec = partSpec;
    this.isInsertInto = isInsertInto;
    this.origCtx = ctx;
    columns = tbl.getCols();
    partitionColumns = tbl.getPartCols();
  }

  public List getLoadFileWork() {
    return loadFileWork;
  }

  public AnalyzeRewriteContext getAnalyzeRewrite() {
    return analyzeRewrite;
  }

  public void setAnalyzeRewrite(AnalyzeRewriteContext analyzeRewrite) {
    this.analyzeRewrite = analyzeRewrite;
  }

  public void insertAnalyzePipeline() throws SemanticException{
    // 1. Generate the statement of analyze table [tablename] compute statistics for columns
    // In non-partitioned table case, it will generate TS-SEL-GBY-RS-GBY-SEL-FS operator
    // In static-partitioned table case, it will generate TS-FIL(partitionKey)-SEL-GBY(partitionKey)-RS-GBY-SEL-FS operator
    // In dynamic-partitioned table case, it will generate TS-SEL-GBY(partitionKey)-RS-GBY-SEL-FS operator
    // However, we do not need to specify the partition-spec because (1) the data is going to be inserted to that specific partition
    // (2) we can compose the static/dynamic partition using a select operator in replaceSelectOperatorProcess..
    String analyzeCommand = "analyze table `" + tbl.getDbName() + "`.`" + tbl.getTableName() + "`"
        + " compute statistics for columns ";

    // 2. Based on the statement, generate the selectOperator
    Operator selOp = null;
    try {
      selOp = genSelOpForAnalyze(analyzeCommand, origCtx);
    } catch (IOException | ParseException e) {
      throw new SemanticException(e);
    }

    // 3. attach this SEL to the operator right before FS
    op.getChildOperators().add(selOp);
    selOp.getParentOperators().clear();
    selOp.getParentOperators().add(op);

    // 4. address the colExp, colList, etc for the SEL
    try {
      replaceSelectOperatorProcess((SelectOperator)selOp, op);
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  
  @SuppressWarnings("rawtypes")
  private Operator genSelOpForAnalyze(String analyzeCommand, Context origCtx) throws IOException, ParseException, SemanticException{
    //0. initialization
    Context ctx = new Context(conf);
    ctx.setExplainConfig(origCtx.getExplainConfig());
    ASTNode tree = ParseUtils.parse(analyzeCommand, ctx);

    //1. get the ColumnStatsSemanticAnalyzer
    BaseSemanticAnalyzer baseSem = SemanticAnalyzerFactory.get(new QueryState(conf), tree);
    ColumnStatsSemanticAnalyzer colSem = (ColumnStatsSemanticAnalyzer) baseSem;

    //2. get the rewritten AST
    ASTNode ast = colSem.rewriteAST(tree, this);
    baseSem = SemanticAnalyzerFactory.get(new QueryState(conf), ast);
    SemanticAnalyzer sem = (SemanticAnalyzer) baseSem;
    QB qb = new QB(null, null, false);
    ASTNode child = ast;
    ParseContext subPCtx = ((SemanticAnalyzer) sem).getParseContext();
    subPCtx.setContext(ctx);
    ((SemanticAnalyzer) sem).initParseCtx(subPCtx);
    sem.doPhase1(child, qb, sem.initPhase1Ctx(), null);
    // This will trigger new calls to metastore to collect metadata
    // TODO: cache the information from the metastore
    sem.getMetaData(qb);
    Operator operator = sem.genPlan(qb);

    //3. populate the load file work so that ColumnStatsTask can work
    loadFileWork.addAll(sem.getLoadFileWork());

    //4. because there is only one TS for analyze statement, we can get it.
    if (sem.topOps.values().size() != 1) {
      throw new SemanticException(
          "ColumnStatsAutoGatherContext is expecting exactly one TS, but finds "
              + sem.topOps.values().size());
    }
    operator = sem.topOps.values().iterator().next();

    //5. get the first SEL after TS
    while(!(operator instanceof SelectOperator)){
      operator = operator.getChildOperators().get(0);
    }
    return operator;
  }

  /**
   * @param operator : the select operator in the analyze statement
   * @param input : the operator right before FS in the insert overwrite statement
   * @throws HiveException 
   */
  private void replaceSelectOperatorProcess(SelectOperator operator, Operator input)
      throws HiveException {
    RowSchema selRS = operator.getSchema();
    ArrayList signature = new ArrayList<>();
    OpParseContext inputCtx = sa.opParseCtx.get(input);
    RowResolver inputRR = inputCtx.getRowResolver();
    ArrayList columns = inputRR.getColumnInfos();
    ArrayList colList = new ArrayList();
    ArrayList columnNames = new ArrayList();
    Map columnExprMap =
        new HashMap();
    // the column positions in the operator should be like this
    // <----non-partition columns---->|<--static partition columns-->|<--dynamic partition columns-->
    //        ExprNodeColumnDesc      |      ExprNodeConstantDesc    |     ExprNodeColumnDesc
    //           from input           |         generate itself      |        from input
    //                                |

    // 1. deal with non-partition columns
    for (int i = 0; i < this.columns.size(); i++) {
      ColumnInfo col = columns.get(i);
      ExprNodeDesc exprNodeDesc = new ExprNodeColumnDesc(col);
      colList.add(exprNodeDesc);
      String internalName = selRS.getColumnNames().get(i);
      columnNames.add(internalName);
      columnExprMap.put(internalName, exprNodeDesc);
      signature.add(selRS.getSignature().get(i));
    }
    // if there is any partition column (in static partition or dynamic
    // partition or mixed case)
    int dynamicPartBegin = -1;
    for (int i = 0; i < partitionColumns.size(); i++) {
      ExprNodeDesc exprNodeDesc = null;
      String partColName = partitionColumns.get(i).getName();
      // 2. deal with static partition columns
      if (partSpec != null && partSpec.containsKey(partColName)
          && partSpec.get(partColName) != null) {
        if (dynamicPartBegin > 0) {
          throw new SemanticException(
              "Dynamic partition columns should not come before static partition columns.");
        }
        exprNodeDesc = new ExprNodeConstantDesc(partSpec.get(partColName));
        TypeInfo srcType = exprNodeDesc.getTypeInfo();
        TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
        if (!srcType.equals(destType)) {
          // This may be possible when srcType is string but destType is integer
          exprNodeDesc = ParseUtils
              .createConversionCast(exprNodeDesc, (PrimitiveTypeInfo) destType);
        }
      }
      // 3. dynamic partition columns
      else {
        dynamicPartBegin++;
        ColumnInfo col = columns.get(this.columns.size() + dynamicPartBegin);
        TypeInfo srcType = col.getType();
        TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
        exprNodeDesc = new ExprNodeColumnDesc(col);
        if (!srcType.equals(destType)) {
          exprNodeDesc = ParseUtils
              .createConversionCast(exprNodeDesc, (PrimitiveTypeInfo) destType);
        }
      }
      colList.add(exprNodeDesc);
      String internalName = selRS.getColumnNames().get(this.columns.size() + i);
      columnNames.add(internalName);
      columnExprMap.put(internalName, exprNodeDesc);
      signature.add(selRS.getSignature().get(this.columns.size() + i));
    }
    operator.setConf(new SelectDesc(colList, columnNames));
    operator.setColumnExprMap(columnExprMap);
    selRS.setSignature(signature);
    operator.setSchema(selRS);
  }

  public String getCompleteName() {
    return tbl.getDbName() + "." + tbl.getTableName();
  }

  public boolean isInsertInto() {
    return isInsertInto;
  }

  public static boolean canRunAutogatherStats(Operator curr) {
    // check the ObjectInspector
    for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
      if (cinfo.getIsVirtualCol()) {
        return false;
      } else if (cinfo.getObjectInspector().getCategory() != ObjectInspector.Category.PRIMITIVE) {
        return false;
      } else {
        switch (((PrimitiveTypeInfo) cinfo.getType()).getPrimitiveCategory()) {
        case BOOLEAN:
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
        case TIMESTAMP:
        case FLOAT:
        case DOUBLE:
        case STRING:
        case CHAR:
        case VARCHAR:
        case BINARY:
        case DECIMAL:
          // TODO: Support case DATE:
          break;
        default:
          return false;
        }
      }
    }
    return true;
  }

}