org.apache.hadoop.hive.ql.optimizer.GlobalLimitOptimizer Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of hive-exec
There is a newer version: 4.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.util.Collection;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.plan.LimitDesc;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.GlobalLimitCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;

/**
 * This optimizer is used to reduce the input size for the query for queries which are
 * specifying a limit.
 * 
 * For example, for a query of the form:
 * <pre>
 * select expr from T where &lt;cond&gt; limit 100;
 * </pre>
 * Most probably, the whole table T need not be scanned.
 * Chances are that even if we scan the first file of T, we would get the 100 rows
 * needed by this query.
 * This optimizer step populates the GlobalLimitCtx which is used later on to prune the inputs.
 */
public class GlobalLimitOptimizer extends Transform {

  private final Logger LOG = LoggerFactory.getLogger(GlobalLimitOptimizer.class.getName());

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    Context ctx = pctx.getContext();
    Map topOps = pctx.getTopOps();
    GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx();
    Map nameToSplitSample = pctx.getNameToSplitSample();

    // determine the query qualifies reduce input size for LIMIT
    // The query only qualifies when there are only one top operator
    // and there is no transformer or UDTF and no block sampling
    // is used.
    if (topOps.size() == 1
        && !globalLimitCtx.ifHasTransformOrUDTF() &&
        nameToSplitSample.isEmpty()) {

      // Here we recursively check:
      // 1. whether there are exact one LIMIT in the query
      // 2. whether there is no aggregation, group-by, distinct, sort by,
      //    distributed by, or table sampling in any of the sub-query.
      // The query only qualifies if both conditions are satisfied.
      //
      // Example qualified queries:
      //    CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
      //    INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
      //                               FROM ... LIMIT...
      //    SELECT * FROM (SELECT col1 as col2 (SELECT * FROM ...) t1 LIMIT ...) t2);
      //
      TableScanOperator ts = topOps.values().iterator().next();
      LimitOperator tempGlobalLimit = checkQbpForGlobalLimit(ts);

      // query qualify for the optimization
      if (tempGlobalLimit != null) {
        LimitDesc tempGlobalLimitDesc = tempGlobalLimit.getConf();
        Table tab = ts.getConf().getTableMetadata();
        Set filterOps = OperatorUtils.findOperators(ts, FilterOperator.class);

        if (!tab.isPartitioned()) {
          if (filterOps.size() == 0) {
            Integer tempOffset = tempGlobalLimitDesc.getOffset();
            globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(),
                (tempOffset == null) ? 0 : tempOffset);
          }
        } else {
          // check if the pruner only contains partition columns
          if (onlyContainsPartnCols(tab, filterOps)) {

            String alias = (String) topOps.keySet().toArray()[0];
            PrunedPartitionList partsList = pctx.getPrunedPartitions(alias, ts);

            // If there is any unknown partition, create a map-reduce job for
            // the filter to prune correctly
            if (!partsList.hasUnknownPartitions()) {
              Integer tempOffset = tempGlobalLimitDesc.getOffset();
              globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(),
                  (tempOffset == null) ? 0 : tempOffset);
            }
          }
        }
        if (globalLimitCtx.isEnable()) {
          LOG.info("Qualify the optimize that reduces input size for 'offset' for offset "
              + globalLimitCtx.getGlobalOffset());
          LOG.info("Qualify the optimize that reduces input size for 'limit' for limit "
              + globalLimitCtx.getGlobalLimit());
        }
      }
    }
    return pctx;
  }

  private boolean onlyContainsPartnCols(Table table, Set filters) {
    for (FilterOperator filter : filters) {
      if (!PartitionPruner.onlyContainsPartnCols(table, filter.getConf().getPredicate())) {
        return false;
      }
    }
    return true;
  }

  /**
   * Check the limit number in all sub queries
   *
   * @return if there is one and only one limit for all subqueries, return the limit
   *         if there is no limit, return 0
   *         otherwise, return null
   */
  private static LimitOperator checkQbpForGlobalLimit(TableScanOperator ts) {
    Set>> searchedClasses =
          new ImmutableSet.Builder>>()
            .add(ReduceSinkOperator.class)
            .add(GroupByOperator.class)
            .add(FilterOperator.class)
            .add(LimitOperator.class)
            .build();
    Multimap>, Operator> ops =
            OperatorUtils.classifyOperators(ts, searchedClasses);
    // To apply this optimization, in the input query:
    // - There cannot exist any order by/sort by clause,
    // thus existsOrdering should be false.
    // - There cannot exist any distribute by clause, thus
    // existsPartitioning should be false.
    // - There cannot exist any cluster by clause, thus
    // existsOrdering AND existsPartitioning should be false.
    for (Operator op : ops.get(ReduceSinkOperator.class)) {
      ReduceSinkDesc reduceSinkConf = ((ReduceSinkOperator) op).getConf();
      if (reduceSinkConf.isOrdering() || reduceSinkConf.isPartitioning()) {
        return null;
      }
    }
    // - There cannot exist any (distinct) aggregate.
    for (Operator op : ops.get(GroupByOperator.class)) {
      GroupByDesc groupByConf = ((GroupByOperator) op).getConf();
      if (groupByConf.isAggregate() || groupByConf.isDistinct()) {
        return null;
      }
    }
    // - There cannot exist any sampling predicate.
    for (Operator op : ops.get(FilterOperator.class)) {
      FilterDesc filterConf = ((FilterOperator) op).getConf();
      if (filterConf.getIsSamplingPred()) {
        return null;
      }
    }
    // If there is one and only one limit starting at op, return the limit
    // If there is no limit, return 0
    // Otherwise, return null
    Collection> limitOps = ops.get(LimitOperator.class);
    if (limitOps.size() == 1) {
      return (LimitOperator) limitOps.iterator().next();
    }
    else if (limitOps.size() == 0) {
      return null;
    }
    return null;
  }
}