org.apache.hadoop.hive.ql.optimizer.StatsOptimizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-exec
There is a newer version: 4.0.1
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;

import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;


/** There is a set of queries which can be answered entirely from statistics stored in metastore.
 * Examples of such queries are count(*), count(a), max(a), min(b) etc. Hive already collects
 * these basic statistics for query planning purposes. These same statistics can be used to
 * answer queries also.
 *
 * The optimizer looks at the query plan to determine whether the query can be answered using
 * statistics, and then changes the plan to answer it entirely from statistics stored in metastore.
 */
public class StatsOptimizer extends Transform {
  // TODO: [HIVE-6289] while getting stats from metastore, we currently only get one col at
  //       a time; this could be improved - get all necessary columns in advance, then use local.
  // TODO: [HIVE-6292] aggregations could be done directly in metastore. Hive over MySQL!

  private static final Logger Logger = LoggerFactory.getLogger(StatsOptimizer.class);

  /**
   * Walks the operator tree looking for the two pipeline shapes this optimizer handles
   * (TS-SEL-GBY-RS-GBY-SEL-FS and TS-SEL-GBY-RS-GBY-FS) and hands every match to a
   * {@link MetaDataProcessor}.
   *
   * <p>The context is returned untouched, without walking the plan, when any of the
   * precondition checks at the top of the method fails.
   *
   * @param pctx parse context of the query being compiled
   * @return the same {@code pctx} instance that was passed in
   * @throws SemanticException if walking the operator graph fails
   */
  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery()
        || pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS()
        || pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()
        // If getNameToSplitSample is not empty, at least one of the source
        // tables is being sampled and we can not optimize.
        || !pctx.getNameToSplitSample().isEmpty()) {
      return pctx;
    }

    // Operator-name fragments used to build the rule patterns below.
    String TS = TableScanOperator.getOperatorName() + "%";
    String GBY = GroupByOperator.getOperatorName() + "%";
    String RS = ReduceSinkOperator.getOperatorName() + "%";
    String SEL = SelectOperator.getOperatorName() + "%";
    String FS = FileSinkOperator.getOperatorName() + "%";

    // LinkedHashMap keeps the rules in registration order (R1 before R2).
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<>();
    opRules.put(new RuleRegExp("R1", TS + SEL + GBY + RS + GBY + SEL + FS),
        new MetaDataProcessor(pctx));
    opRules.put(new RuleRegExp("R2", TS + SEL + GBY + RS + GBY + FS),
        new MetaDataProcessor(pctx));

    // A single context instance is shared by every processor invocation of this walk.
    NodeProcessorCtx soProcCtx = new StatsOptimizerProcContext();
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, soProcCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    List<Node> topNodes = new ArrayList<>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
  }

  /**
   * Walker context handed to the node processor on every invocation. The processor reads
   * {@code stopProcess} first and returns immediately when it is set.
   */
  private static class StatsOptimizerProcContext implements NodeProcessorCtx {
    /** Starts out {@code false} (the field default). */
    boolean stopProcess;
  }

  private static class MetaDataProcessor implements NodeProcessor {

    private final ParseContext pctx;

    /**
     * Creates a processor that keeps a reference to the given parse context.
     *
     * @param pctx parse context stored in this processor
     */
    public MetaDataProcessor(final ParseContext pctx) {
      this.pctx = pctx;
    }

    /**
     * Column-statistics family a column type maps to; produced by {@code getType} and
     * consumed by {@code getNullcountFor}.
     */
    enum StatType{
      // (sic) spelling is referenced by other members of this class; selected for
      // any type contained in serdeConstants.IntegralTypes.
      Integeral,
      // Selected for both the double and the float type names.
      Double,
      String,
      Boolean,
      Binary,
      // Returned by getType when none of the type names above match.
      Unsupported
    }

    /**
     * Integral flavours a {@code long} value can be narrowed to. Each constant boxes the
     * value into the wrapper matching its width; narrowing uses a plain Java cast, so
     * out-of-range values are truncated rather than rejected.
     */
    enum LongSubType {
      BIGINT {
        @Override
        Object cast(long value) {
          return Long.valueOf(value);
        }
      },
      INT {
        @Override
        Object cast(long value) {
          return Integer.valueOf((int) value);
        }
      },
      SMALLINT {
        @Override
        Object cast(long value) {
          return Short.valueOf((short) value);
        }
      },
      TINYINT {
        @Override
        Object cast(long value) {
          return Byte.valueOf((byte) value);
        }
      };

      /**
       * Converts {@code longValue} to the boxed type of this constant.
       *
       * @param longValue value to convert
       * @return a {@code Long}, {@code Integer}, {@code Short} or {@code Byte}
       */
      abstract Object cast(long longValue);
    }

    /**
     * Floating-point flavours a {@code double} value can be converted to. Each constant
     * boxes the value into the wrapper matching its precision.
     */
    enum DoubleSubType {
      DOUBLE {
        @Override
        Object cast(double value) {
          return Double.valueOf(value);
        }
      },
      FLOAT {
        @Override
        Object cast(double value) {
          return Float.valueOf((float) value);
        }
      };

      /**
       * Converts {@code doubleValue} to the boxed type of this constant.
       *
       * @param doubleValue value to convert
       * @return a {@code Double} or a {@code Float}
       */
      abstract Object cast(double doubleValue);
    }

    /**
     * Maps a column type name to the statistics family used to read its stats.
     *
     * @param origType type name to classify
     * @return the matching {@link StatType}, or {@code StatType.Unsupported} when the
     *         name is none of the recognised ones
     */
    private StatType getType(String origType) {
      if (serdeConstants.IntegralTypes.contains(origType)) {
        return StatType.Integeral;
      }
      // double and float share the same statistics family
      boolean isFloatingPoint = origType.equals(serdeConstants.DOUBLE_TYPE_NAME)
          || origType.equals(serdeConstants.FLOAT_TYPE_NAME);
      if (isFloatingPoint) {
        return StatType.Double;
      }
      if (origType.equals(serdeConstants.BINARY_TYPE_NAME)) {
        return StatType.Binary;
      }
      if (origType.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
        return StatType.Boolean;
      }
      if (origType.equals(serdeConstants.STRING_TYPE_NAME)) {
        return StatType.String;
      }
      return StatType.Unsupported;
    }

    /**
     * Reads the null count from the statistics member that corresponds to {@code type}.
     *
     * @param type statistics family selecting which member of {@code statData} to read
     * @param statData column statistics to read from
     * @return the number of nulls, or {@code null} when {@code type} has no matching
     *         statistics member
     */
    private Long getNullcountFor(StatType type, ColumnStatisticsData statData) {
      final Long nullCount;
      switch (type) {
      case Integeral:
        nullCount = statData.getLongStats().getNumNulls();
        break;
      case Double:
        nullCount = statData.getDoubleStats().getNumNulls();
        break;
      case String:
        nullCount = statData.getStringStats().getNumNulls();
        break;
      case Boolean:
        nullCount = statData.getBooleanStats().getNumNulls();
        break;
      case Binary:
        nullCount = statData.getBinaryStats().getNumNulls();
        break;
      default:
        // no statistics member to consult for this family
        nullCount = null;
        break;
      }
      return nullCount;
    }

    /**
     * Tells whether the group-by operator groups on nothing at all or only on constants.
     *
     * <p>The operator has no key when its output column count equals its aggregator
     * count. Otherwise the leading (non-aggregate) schema columns are the keys, and each
     * must resolve to an {@link ExprNodeConstantDesc} through
     * {@code ExprNodeDescUtils.findConstantExprOrigin}.
     *
     * @param gbyOp group-by operator to inspect
     * @return {@code true} if there is no key or every key is a constant; {@code false}
     *         as soon as one key does not resolve to a constant
     */
    private boolean hasNullOrConstantGbyKey(GroupByOperator gbyOp) {
      GroupByDesc gbyDesc = gbyOp.getConf();
      int numCols = gbyDesc.getOutputColumnNames().size();
      int aggCols = gbyDesc.getAggregators().size();
      // If the Group by operator has null key
      if (numCols == aggCols) {
        return true;
      }
      // If the Gby key is a constant
      // Typed as List<String> so the for-each below compiles without an unchecked cast.
      List<String> dpCols = gbyOp.getSchema().getColumnNames().subList(0, numCols - aggCols);
      for (String dpCol : dpCols) {
        ExprNodeDesc end = ExprNodeDescUtils.findConstantExprOrigin(dpCol, gbyOp);
        if (!(end instanceof ExprNodeConstantDesc)) {
          return false;
        }
      }
      return true;
    }

    @Override
    public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // 1. Do few checks to determine eligibility of optimization
      // 2. look at ExprNodeFuncGenericDesc in select list to see if its min, max, count etc.
      //    If it is
      // 3. Connect to metastore and get the stats
      // 4. Compose rows and add it in FetchWork
      // 5. Delete GBY - RS - GBY - SEL from the pipeline.
      StatsOptimizerProcContext soProcCtx = (StatsOptimizerProcContext) procCtx;

      // If the optimization has been stopped for the reasons like being not qualified,
      // or lack of the stats data. we do not continue this process. For an example,
      // for a query select max(value) from src1 union all select max(value) from src2
      // if it has been union remove optimized, the AST tree will become
      // TS[0]->SEL[1]->GBY[2]-RS[3]->GBY[4]->FS[17]
      // TS[6]->SEL[7]->GBY[8]-RS[9]->GBY[10]->FS[18]
      // if TS[0] branch for src1 is not optimized because src1 does not have column stats
      // there is no need to continue processing TS[6] branch
      if (soProcCtx.stopProcess) {
        return null;
      }

      boolean isOptimized = false;
      try {
        TableScanOperator tsOp = (TableScanOperator) stack.get(0);
        if (tsOp.getNumParent() > 0) {
          // looks like a subq plan.
          return null;
        }
        if (tsOp.getConf().getRowLimit() != -1) {
          // table is sampled. In some situation, we really can leverage row
          // limit. In order to be safe, we do not use it now.
          return null;
        }
        SelectOperator pselOp = (SelectOperator)stack.get(1);
        for(ExprNodeDesc desc : pselOp.getConf().getColList()) {
          if (!((desc instanceof ExprNodeColumnDesc) || (desc instanceof ExprNodeConstantDesc))) {
            // Probably an expression, cant handle that
            return null;
          }
        }
        Map exprMap = pselOp.getColumnExprMap();
        // Since we have done an exact match on TS-SEL-GBY-RS-GBY-(SEL)-FS
        // we need not to do any instanceof checks for following.
        GroupByOperator pgbyOp = (GroupByOperator)stack.get(2);
        if (!hasNullOrConstantGbyKey(pgbyOp)) {
          return null;
        }
        ReduceSinkOperator rsOp = (ReduceSinkOperator)stack.get(3);
        if (rsOp.getConf().getDistinctColumnIndices().size() > 0) {
          // we can't handle distinct
          return null;
        }

        GroupByOperator cgbyOp = (GroupByOperator)stack.get(4);
        if (!hasNullOrConstantGbyKey(cgbyOp)) {
          return null;
        }
        Operator last = (Operator) stack.get(5);
        SelectOperator cselOp = null;
        Map posToConstant = new HashMap<>();
        if (last instanceof SelectOperator) {
          cselOp = (SelectOperator) last;
          if (!cselOp.isIdentitySelect()) {
            for (int pos = 0; pos < cselOp.getConf().getColList().size(); pos++) {
              ExprNodeDesc desc = cselOp.getConf().getColList().get(pos);
              if (desc instanceof ExprNodeConstantDesc) {
                //We store the position to the constant value for later use.
                posToConstant.put(pos, ((ExprNodeConstantDesc)desc).getValue());
              } else {
                if (!(desc instanceof ExprNodeColumnDesc)) {
                  // Probably an expression, cant handle that
                  return null;
                }
              }
            }
          }
          last = (Operator) stack.get(6);
        }
        FileSinkOperator fsOp = (FileSinkOperator)last;
        if (fsOp.getNumChild() > 0) {
          // looks like a subq plan.
          return null;  // todo we can collapse this part of tree into single TS
        }

        Table tbl = tsOp.getConf().getTableMetadata();
        if (AcidUtils.isAcidTable(tbl)) {
          Logger.info("Table " + tbl.getTableName() + " is ACID table. Skip StatsOptimizer.");
          return null;
        }
        List