/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.calcite.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.ForwardWalker;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.TypeRule;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;

/**
 * This optimization will take a Filter expression, and if its predicate contains
 * an OR operator whose children are constant equality expressions, it will try
 * to generate an IN clause (which is more efficient). If the OR operator contains
 * AND operator children, the optimization might generate an IN clause that uses
 * structs.
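 *
 * For example, a predicate such as
 *   (a = 1 and b = 2) or (a = 3 and b = 4)
 * is rewritten into
 *   struct(a, b) IN (struct(1, 2), struct(3, 4))
 * while a flat disjunction such as (a = 1 or a = 3) becomes a IN (1, 3).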
 */
public class PointLookupOptimizer extends Transform {

  private static final Logger LOG = LoggerFactory.getLogger(PointLookupOptimizer.class);
  private static final String IN_UDF =
          GenericUDFIn.class.getAnnotation(Description.class).name();
  private static final String STRUCT_UDF =
          GenericUDFStruct.class.getAnnotation(Description.class).name();
  // these are closure-bound for all the walkers in context
  public final int minOrExpr;

  /*
   * Pass in the minimum number of OR'ed disjuncts required before the
   * rewrite is attempted
   */
  public PointLookupOptimizer(final int min) {
    this.minOrExpr = min;
  }

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    // 1. Trigger transformation
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", FilterOperator.getOperatorName() + "%"), new FilterTransformer());

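    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along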
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null);
    GraphWalker ogw = new ForwardWalker(disp);

    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
  }

  private class FilterTransformer implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FilterOperator filterOp = (FilterOperator) nd;
      ExprNodeDesc predicate = filterOp.getConf().getPredicate();

      // Try to rewrite the predicate into an equivalent IN clause
      ExprNodeDesc newPredicate = generateInClause(predicate);
      if (newPredicate != null) {
        // Replace the predicate of the current FilterOperator with the new one
        if (LOG.isDebugEnabled()) {
          LOG.debug("Generated new predicate with IN clause: " + newPredicate);
        }
        filterOp.getConf().setPredicate(newPredicate);
      }

      return null;
    }

    private ExprNodeDesc generateInClause(ExprNodeDesc predicate) throws SemanticException {
      Map<Rule, NodeProcessor> exprRules = new LinkedHashMap<Rule, NodeProcessor>();
      exprRules.put(new TypeRule(ExprNodeGenericFuncDesc.class), new OrExprProcessor());

      // The dispatcher fires the processor corresponding to the closest matching
      // rule and passes the context along
      Dispatcher disp = new DefaultRuleDispatcher(null, exprRules, null);
      GraphWalker egw = new PreOrderOnceWalker(disp);

      List<Node> startNodes = new ArrayList<Node>();
      startNodes.add(predicate);

      HashMap<Node, Object> outputMap = new HashMap<Node, Object>();
      egw.startWalking(startNodes, outputMap);
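      // The walk records each processor's return value in outputMap, so the
      // entry for the root predicate is the rewritten expression (or null if
      // the pattern did not match)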
      return (ExprNodeDesc) outputMap.get(predicate);
    }
  }

  private class OrExprProcessor implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) nd;

      // 1. If it is not an OR operator, we bail out.
      if (!FunctionRegistry.isOpOr(fd)) {
        return null;
      }

      // 2. It is an OR operator with enough children
      List<ExprNodeDesc> children = fd.getChildren();
      if (children.size() < minOrExpr) {
        return null;
      }
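
      // For each column (keyed by its string form), collect the (column
      // reference, constant) pair contributed by every disjunct; a valid
      // rewrite needs exactly one entry per column per disjunct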
      ListMultimap<String, Pair<ExprNodeColumnDesc, ExprNodeConstantDesc>> columnConstantsMap =
              ArrayListMultimap.create();
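      // modeAnd records the shape of the disjuncts: true when every disjunct
      // is an AND of equalities (struct-based IN), false when every disjunct
      // is a single equality; mixing the two shapes aborts the rewrite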
      boolean modeAnd = false;
      for (int i = 0; i < children.size(); i++) {
        ExprNodeDesc child = children.get(i);

        // - If the child is an AND operator, extract its children
        // - Otherwise, take the child itself
        final List<ExprNodeDesc> conjunctions;
        if (FunctionRegistry.isOpAnd(child)) {
          // If it is the first child, we set the mode variable value
          // Otherwise, if the mode we are working on is different, we
          // bail out
          if (i == 0) {
            modeAnd = true;
          } else {
            if (!modeAnd) {
              return null;
            }
          }

          // Multiple children
          conjunctions = child.getChildren();
        } else {
          // If it is the first child, we set the mode variable value
          // Otherwise, if the mode we are working on is different, we
          // bail out
          if (i == 0) {
            modeAnd = false;
          } else {
            if (modeAnd) {
              return null;
            }
          }

          // One child
          conjunctions = new ArrayList<ExprNodeDesc>(1);
          conjunctions.add(child);
        }

        // 3. We will extract the literals to introduce in the IN clause.
        //    If the patterns OR-AND-EqOp or OR-EqOp are not matched, we bail out
        for (ExprNodeDesc conjunction: conjunctions) {
          if (!(conjunction instanceof ExprNodeGenericFuncDesc)) {
            return null;
          }

          ExprNodeGenericFuncDesc conjCall = (ExprNodeGenericFuncDesc) conjunction;
          Class<? extends GenericUDF> genericUdfClass = conjCall.getGenericUDF().getClass();
          if (GenericUDFOPEqual.class == genericUdfClass) {
            if (conjCall.getChildren().get(0) instanceof ExprNodeColumnDesc &&
                    conjCall.getChildren().get(1) instanceof ExprNodeConstantDesc) {
              ExprNodeColumnDesc ref = (ExprNodeColumnDesc) conjCall.getChildren().get(0);
              String refString = ref.toString();
              columnConstantsMap.put(refString,
                      new Pair<ExprNodeColumnDesc, ExprNodeConstantDesc>(
                              ref, (ExprNodeConstantDesc) conjCall.getChildren().get(1)));
              if (columnConstantsMap.get(refString).size() != i+1) {
                // Bail out unless this column has contributed exactly one
                // constant to every disjunct seen so far
                return null;
              }
            } else if (conjCall.getChildren().get(1) instanceof ExprNodeColumnDesc &&
                    conjCall.getChildren().get(0) instanceof ExprNodeConstantDesc) {
              ExprNodeColumnDesc ref = (ExprNodeColumnDesc) conjCall.getChildren().get(1);
              String refString = ref.toString();
              columnConstantsMap.put(refString,
                      new Pair<ExprNodeColumnDesc, ExprNodeConstantDesc>(
                              ref, (ExprNodeConstantDesc) conjCall.getChildren().get(0)));
              if (columnConstantsMap.get(refString).size() != i+1) {
                // Bail out unless this column has contributed exactly one
                // constant to every disjunct seen so far
                return null;
              }
            } else {
              // We bail out
              return null;
            }
          } else {
            // We bail out
            return null;
          }
        }
      }

      // 4. We build the new predicate and return it
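      //    For instance, (a=1 and b=2) or (a=3 and b=4) produces the children
      //    [struct(a,b), struct(1,2), struct(3,4)], which are then wrapped in
      //    a single IN call; (a=1 or a=3) simply produces [a, 1, 3]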
      ExprNodeDesc newPredicate = null;
      List<ExprNodeDesc> newChildren = new ArrayList<ExprNodeDesc>(children.size());
      // 4.1 Create structs
      List<ExprNodeDesc> columns = new ArrayList<ExprNodeDesc>();
      List<String> names = new ArrayList<String>();
      List<TypeInfo> typeInfos = new ArrayList<TypeInfo>();
      for (int i = 0; i < children.size(); i++) {
        List<ExprNodeDesc> constantFields = new ArrayList<ExprNodeDesc>(children.size());

        for (String keyString : columnConstantsMap.keySet()) {
          Pair<ExprNodeColumnDesc, ExprNodeConstantDesc> columnConstant =
                  columnConstantsMap.get(keyString).get(i);
          if (i == 0) {
            columns.add(columnConstant.left);
            names.add(columnConstant.left.getColumn());
            typeInfos.add(columnConstant.left.getTypeInfo());
          }
          constantFields.add(columnConstant.right);
        }

        if (i == 0) {
          ExprNodeDesc columnsRefs;
          if (columns.size() == 1) {
            columnsRefs = columns.get(0);
          } else {
            columnsRefs = new ExprNodeGenericFuncDesc(
                    TypeInfoFactory.getStructTypeInfo(names, typeInfos),
                    FunctionRegistry.getFunctionInfo(STRUCT_UDF).getGenericUDF(),
                    columns);
          }
          newChildren.add(columnsRefs);
        }
        ExprNodeDesc values;
        if (constantFields.size() == 1) {
          values = constantFields.get(0);
        } else {
          values = new ExprNodeGenericFuncDesc(
                  TypeInfoFactory.getStructTypeInfo(names, typeInfos),
                  FunctionRegistry.getFunctionInfo(STRUCT_UDF).getGenericUDF(),
                  constantFields);
        }
        newChildren.add(values);
      }
      newPredicate = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
              FunctionRegistry.getFunctionInfo(IN_UDF).getGenericUDF(), newChildren);

      return newPredicate;
    }

  }

}
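
// Illustrative wiring (a sketch, not part of the original file): as a Transform,
// this optimizer is applied to a ParseContext during logical optimization. In
// Hive the threshold normally comes from configuration (believed to be
// hive.optimize.point.lookup.min) rather than a hard-coded literal:
//
//   Transform pointLookup = new PointLookupOptimizer(2); // rewrite ORs of >= 2 equalities
//   ParseContext optimized = pointLookup.transform(pctx);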