All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.ppd.SyntheticJoinPredicate Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.ppd;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.SemanticDispatcher;
import org.apache.hadoop.hive.ql.lib.SemanticGraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker;
import org.apache.hadoop.hive.ql.lib.SemanticRule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.optimizer.Transform;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

/**
 * creates synthetic predicates that represent "IN (keylist other table)"
 */
public class SyntheticJoinPredicate extends Transform {

  private static transient Logger LOG = LoggerFactory.getLogger(SyntheticJoinPredicate.class.getName());

  /**
   * Entry point of the transformation: walks the operator plan and, for every
   * TableScan-...-ReduceSink-Join path, lets {@link JoinSynthetic} insert synthetic
   * "IN (keylist other table)" filter predicates below the join. The optimization is
   * only active when running on Tez with dynamic partition pruning enabled; for any
   * other engine/configuration the plan is returned unchanged.
   *
   * @param pctx parse context carrying the configuration and the operator plan
   * @return the same parse context, possibly with synthetic filters inserted
   * @throws SemanticException if walking the plan fails
   */
  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    boolean enabled = false;
    String queryEngine = pctx.getConf().getVar(ConfVars.HIVE_EXECUTION_ENGINE);

    if (queryEngine.equals("tez")
        && pctx.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING)) {
      enabled = true;
    }

    if (!enabled) {
      return pctx;
    }

    Map<SemanticRule, SemanticNodeProcessor> opRules =
        new LinkedHashMap<SemanticRule, SemanticNodeProcessor>();
    opRules.put(new RuleRegExp("R1", "(" +
        TableScanOperator.getOperatorName() + "%" + ".*" +
        ReduceSinkOperator.getOperatorName() + "%" +
        JoinOperator.getOperatorName() + "%)"), new JoinSynthetic());

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    SyntheticContext context = new SyntheticContext(pctx);
    SemanticDispatcher disp = new DefaultRuleDispatcher(null, opRules, context);
    PreOrderOnceWalker ogw = new PreOrderOnceWalker(disp);
    // The PreOrderOnceWalker traversal tries to cover all possible paths from the root to every other node. A plan
    // graph with lateral view operators has a particular structure that makes the number of paths exponentially big
    // and the traversal of such graphs prohibitively expensive. For this reason, we exclude lateral view operators
    // from the traversal and essentially disable the synthetic predicate generation for such branches.
    ogw.excludeNode(LateralViewForwardOperator.class);
    // Create a list of top op nodes
    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pctx;
  }

  /**
   * Inserts a new synthetic-join-predicate FilterOperator between {@code parent} and
   * {@code target} (i.e. parent -> filter -> target), rewiring both operators' links.
   * The filter carries {@code parentRS}'s row schema and is flagged as synthetic so
   * later optimizer stages can recognize and, if needed, remove it.
   *
   * @param target     the child operator the filter is placed above
   * @param parent     the parent operator the filter is placed below
   * @param parentRS   row schema to copy onto the new filter
   * @param filterExpr the synthetic predicate expression
   * @return the newly created filter operator
   */
  private static Operator<FilterDesc> createFilter(Operator<?> target, Operator<?> parent,
      RowSchema parentRS, ExprNodeDesc filterExpr) {
    FilterDesc filterDesc = new FilterDesc(filterExpr, false);
    filterDesc.setSyntheticJoinPredicate(true);
    Operator<FilterDesc> filter = OperatorFactory.get(parent.getCompilationOpContext(),
        filterDesc, new RowSchema(parentRS.getSignature()));
    filter.getParentOperators().add(parent);
    filter.getChildOperators().add(target);
    parent.replaceChild(target, filter);
    target.replaceParent(parent, filter);
    return filter;
  }

  /**
   * Per-walk context handed to {@link JoinSynthetic}: exposes the parse context and
   * whether extended dynamic partition pruning (backtracking through upstream joins
   * to derive additional predicates) is enabled. Immutable once constructed.
   */
  private static class SyntheticContext implements NodeProcessorCtx {

    // made private final: the context is read-only after construction
    private final ParseContext parseContext;
    private final boolean extended;

    public SyntheticContext(ParseContext pCtx) {
      parseContext = pCtx;
      extended = parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING_EXTENDED);
    }

    public ParseContext getParseContext() {
      return parseContext;
    }

    public boolean isExtended() {
      return extended;
    }
  }

  private static class JoinSynthetic implements SemanticNodeProcessor {
    /**
     * Fired for each TS-...-RS-JOIN path matched by rule R1. For the reduce sink
     * {@code source} feeding the join, builds a synthetic predicate of the form
     * {@code sourceKey IN (dynamic key list of target RS)} for every other join input
     * that may propagate filters to it (per {@link #getTargets}), and inserts it as a
     * filter operator between the source RS and its parent. With extended pruning
     * enabled, additional predicates are derived by backtracking through upstream
     * joins. Residual non-equi join conditions (<, <=, >, >=) additionally yield
     * dynamic range predicates. Generation is skipped entirely for null-safe joins.
     *
     * @return always {@code null}; results are applied by mutating the plan
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      SyntheticContext sCtx = (SyntheticContext) procCtx;

      @SuppressWarnings("unchecked")
      CommonJoinOperator<JoinDesc> join = (CommonJoinOperator<JoinDesc>) nd;

      // the reduce sink directly below the join on the current walk path
      ReduceSinkOperator source = (ReduceSinkOperator) stack.get(stack.size() - 2);
      int srcPos = join.getParentOperators().indexOf(source);

      List<Operator<? extends OperatorDesc>> parents = join.getParentOperators();

      int[][] targets = getTargets(join);

      Operator<?> parent = source.getParentOperators().get(0);
      RowSchema parentRS = parent.getSchema();

      // don't generate for null-safes.
      if (join.getConf().getNullSafes() != null) {
        for (boolean b : join.getConf().getNullSafes()) {
          if (b) {
            return null;
          }
        }
      }

      for (int targetPos: targets[srcPos]) {
        if (srcPos == targetPos) {
          continue;
        }

        ReduceSinkOperator target = (ReduceSinkOperator) parents.get(targetPos);
        List<ExprNodeDesc> sourceKeys = source.getConf().getKeyCols();
        List<ExprNodeDesc> targetKeys = target.getConf().getKeyCols();

        // accumulated conjunction of all predicates generated for this target
        ExprNodeDesc syntheticExpr = null;

        if (sourceKeys.size() > 0) {
          for (int i = 0; i < sourceKeys.size(); ++i) {
            final ExprNodeDesc sourceKey = sourceKeys.get(i);

            List<ExprNodeDesc> inArgs = new ArrayList<>();
            inArgs.add(sourceKey);

            // placeholder resolved at runtime to the target RS's key values
            ExprNodeDynamicListDesc dynamicExpr =
              new ExprNodeDynamicListDesc(targetKeys.get(i).getTypeInfo(), target, i);

            inArgs.add(dynamicExpr);

            ExprNodeDesc syntheticInExpr =
              ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("in")
                .getGenericUDF(), inArgs);
            if (LOG.isDebugEnabled()) {
              LOG.debug("Synthetic predicate in " + join + ": " + srcPos + " --> " + targetPos + " (" + syntheticInExpr + ")");
            }

            List<ExprNodeDesc> andArgs = new ArrayList<>();
            if (syntheticExpr != null) {
              andArgs.add(syntheticExpr);
            }
            andArgs.add(syntheticInExpr);

            if (sCtx.isExtended()) {
              // Backtrack through upstream joins to derive additional IN predicates
              List<ExprNodeDesc> newExprs = createDerivatives(target.getParentOperators().get(0), targetKeys.get(i), sourceKey);
              if (!newExprs.isEmpty()) {
                if (LOG.isDebugEnabled()) {
                  for (ExprNodeDesc expr : newExprs) {
                    LOG.debug("Additional synthetic predicate in " + join + ": " + srcPos + " --> " + targetPos + " (" + expr + ")");
                  }
                }
                andArgs.addAll(newExprs);
              }
            }

            if (andArgs.size() < 2) {
              syntheticExpr = syntheticInExpr;
            } else {
              // Create AND expression
              syntheticExpr =
                ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("and")
                  .getGenericUDF(), andArgs);
            }
          }
        }

        // Handle non-equi joins like <, <=, >, and >=
        List<ExprNodeDesc> residualFilters = join.getConf().getResidualFilterExprs();
        if (residualFilters != null && residualFilters.size() != 0 &&
          !(srcPos > 1 || targetPos > 1)) { // Either srcPos or targetPos is larger than 1, making this filter a complex one.

          for (ExprNodeDesc filter : residualFilters) {
            if (!(filter instanceof ExprNodeGenericFuncDesc)) {
              continue;
            }

            ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) filter;
            // filter should be of type <, >, <= or >=
            if (getFuncText(funcDesc.getFuncText(), 1) == null) {
              // unsupported
              continue;
            }

            final ExprNodeDesc sourceChild = funcDesc.getChildren().get(srcPos);
            final ExprNodeDesc targetChild = funcDesc.getChildren().get(targetPos);
            if (!(sourceChild instanceof ExprNodeColumnDesc &&
              targetChild instanceof ExprNodeColumnDesc)) {
              continue;
            }
            // Create non-equi function.
            List<ExprNodeDesc> funcArgs = new ArrayList<>();
            ExprNodeDesc sourceKey = getRSColExprFromResidualFilter(sourceChild, join);
            funcArgs.add(sourceKey);
            final ExprNodeDynamicListDesc dynamicExpr =
              new ExprNodeDynamicListDesc(targetChild.getTypeInfo(), target, 0,
                getRSColExprFromResidualFilter(targetChild, join));
            funcArgs.add(dynamicExpr);
            // invert the comparison when the source sits on the right-hand side
            ExprNodeDesc funcExpr =
              ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo(getFuncText(funcDesc.getFuncText(), srcPos)).getGenericUDF(), funcArgs);

            // TODO : deduplicate the code below.
            LOG.debug(" Non-Equi Join Predicate {}", funcExpr);

            List<ExprNodeDesc> andArgs = new ArrayList<>();
            if (syntheticExpr != null) {
              andArgs.add(syntheticExpr);
            }
            andArgs.add(funcExpr);

            // TODO : HIVE-21098 : Support for extended predicates
            if (andArgs.size() < 2) {
              syntheticExpr = funcExpr;
            } else {
              syntheticExpr =
                ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("and").getGenericUDF(), andArgs);
            }
          }
        }

        if (syntheticExpr != null) {
          Operator<FilterDesc> newFilter = createFilter(source, parent, parentRS, syntheticExpr);
          parent = newFilter;
        }
      }

      return null;
    }

    /**
     * Maps a column reference appearing in a join residual filter back to the
     * expression in the parent reduce sink that produced it: resolves the join
     * output column to its producing input position and column expression, then
     * looks that column up in the corresponding parent RS's column-expression map.
     *
     * @param childExpr column-bearing child of the residual filter function
     * @param join      the join whose residual filter is being processed
     * @return the matching expression from the parent reduce sink
     */
    private ExprNodeDesc getRSColExprFromResidualFilter(ExprNodeDesc childExpr,
        CommonJoinOperator<JoinDesc> join) {
      ExprNodeColumnDesc colExpr = ExprNodeDescUtils.getColumnExpr(childExpr);

      final String joinColName = colExpr.getColumn();
      // use name to get the alias pos of parent and name in parent
      final int aliasPos = join.getConf().getReversedExprs().get(joinColName);
      final ExprNodeDesc rsColExpr = join.getColumnExprMap().get(joinColName);

      // Get the correct parent
      final ReduceSinkOperator parentRS = (ReduceSinkOperator) (join.getParentOperators().get(aliasPos));

      // Fetch the colExpr from parent
      return parentRS.getColumnExprMap().get(
        ExprNodeDescUtils.extractColName(rsColExpr));
    }

    /**
     * Returns the comparison function text as seen from the given source position.
     * For {@code srcPos == 0} the text is returned unchanged; otherwise the
     * comparison is inverted (e.g. "<" becomes ">"). A {@code null} result means
     * the function cannot be inverted and is therefore unsupported here.
     */
    String getFuncText(String funcText, final int srcPos) {
      return srcPos == 0 ? funcText : FunctionRegistry.invertFuncText(funcText);
    }


    /**
     * Calculates filter-propagation directions for each join input alias:
     * L&lt;-&gt;R for inner/semi joins, L&lt;-R for left outer (and anti) joins,
     * R&lt;-L for right outer joins, nothing for full outer joins. The per-alias
     * sets are closed transitively via {@link Vectors#traverse(int)}.
     *
     * @return for each alias position, the positions whose keys may filter it
     */
    private int[][] getTargets(CommonJoinOperator<JoinDesc> join) {
      JoinCondDesc[] conds = join.getConf().getConds();

      int aliases = conds.length + 1;
      Vectors vector = new Vectors(aliases);
      for (JoinCondDesc cond : conds) {
        int left = cond.getLeft();
        int right = cond.getRight();
        switch (cond.getType()) {
        case JoinDesc.INNER_JOIN:
        case JoinDesc.LEFT_SEMI_JOIN:
          vector.add(left, right);
          vector.add(right, left);
          break;
        case JoinDesc.LEFT_OUTER_JOIN:
        case JoinDesc.ANTI_JOIN:
        //TODO : In case of anti join, bloom filter can be created on left side also ("IN (keylist right table)").
        // But the filter should be "not-in" ("NOT IN (keylist right table)") as we want to select the records from
        // left side which are not present in the right side. But it may cause wrong result as
        // bloom filter may have false positive and thus simply adding not is not correct,
        // special handling is required for "NOT IN".
          vector.add(right, left);
          break;
        case JoinDesc.RIGHT_OUTER_JOIN:
          vector.add(left, right);
          break;
        case JoinDesc.FULL_OUTER_JOIN:
          break;
        }
      }
      int[][] result = new int[aliases][];
      for (int pos = 0 ; pos < aliases; pos++) {
        // find all targets recursively
        result[pos] = vector.traverse(pos);
      }
      return result;
    }

    /**
     * Derives additional synthetic IN predicates for {@code sourceKey} by backtracking
     * {@code currentNode} through the plan starting at {@code currentOp}. Returns the
     * collected predicates, or an empty list when derivation had to be aborted.
     */
    private List<ExprNodeDesc> createDerivatives(final Operator<?> currentOp,
        final ExprNodeDesc currentNode, final ExprNodeDesc sourceKey) throws SemanticException {
      List<ExprNodeDesc> resultExprs = new ArrayList<>();
      return createDerivatives(resultExprs, currentOp, currentNode, sourceKey) ? resultExprs : new ArrayList<>();
    }

    /**
     * Recursive worker for the extended-pruning derivation. Walks upstream from
     * {@code op} through Filter/Select/ReduceSink/GroupBy operators until a join is
     * found, backtracks {@code currentNode} to that join's output, and — when the
     * backtracked column is part of a reduce-sink key — appends synthetic
     * {@code sourceKey IN (...)} predicates for that RS and its sibling join inputs
     * to {@code resultExprs}, recursing further down each branch.
     *
     * @return {@code true} when traversal completed (possibly adding nothing);
     *         {@code false} on an inconsistency, telling the caller to discard
     *         everything collected so far
     */
    private boolean createDerivatives(final List<ExprNodeDesc> resultExprs, final Operator<?> op,
        final ExprNodeDesc currentNode, final ExprNodeDesc sourceKey) throws SemanticException {
      // 1. Obtain join operator upstream
      Operator<?> currentOp = op;
      while (!(currentOp instanceof CommonJoinOperator)) {
        if (currentOp.getParentOperators() == null || currentOp.getParentOperators().size() != 1) {
          // Cannot backtrack
          currentOp = null;
          break;
        }
        if (!(currentOp instanceof FilterOperator) &&
            !(currentOp instanceof SelectOperator) &&
            !(currentOp instanceof ReduceSinkOperator) &&
            !(currentOp instanceof GroupByOperator)) {
          // Operator not supported
          currentOp = null;
          break;
        }
        // Move the pointer
        currentOp = currentOp.getParentOperators().get(0);
      }
      if (currentOp == null) {
        // We did not find any join, we are done
        return true;
      }
      @SuppressWarnings("unchecked")
      CommonJoinOperator<JoinDesc> joinOp = (CommonJoinOperator<JoinDesc>) currentOp;

      // 2. Backtrack expression to join output
      ExprNodeDesc expr = currentNode;
      if (currentOp != op) {
        if (expr instanceof ExprNodeColumnDesc) {
          // Expression refers to output of current operator, but backtrack methods works
          // from the input columns, hence we need to make resolution for current operator
          // here. If the operator was already the join, there is nothing to do
          if (op.getColumnExprMap() != null) {
            expr = op.getColumnExprMap().get(((ExprNodeColumnDesc) expr).getColumn());
          }
        } else {
          // TODO: We can extend to other expression types
          // We are done
          return true;
        }
      }
      final ExprNodeDesc joinExprNode = ExprNodeDescUtils.backtrack(expr, op, joinOp);
      if (joinExprNode == null || !(joinExprNode instanceof ExprNodeColumnDesc)) {
        // TODO: We can extend to other expression types
        // We are done
        return true;
      }
      final String columnRefJoinInput = ((ExprNodeColumnDesc)joinExprNode).getColumn();

      // 3. Find input position in join for expression obtained
      String columnOutputName = null;
      for (Map.Entry<String, ExprNodeDesc> e : joinOp.getColumnExprMap().entrySet()) {
        if (e.getValue() == joinExprNode) {
          columnOutputName = e.getKey();
          break;
        }
      }
      if (columnOutputName == null) {
        // Maybe the join is pruning columns, though it should not.
        // In any case, we are done
        return true;
      }
      final int srcPos = joinOp.getConf().getReversedExprs().get(columnOutputName);
      final int[][] targets = getTargets(joinOp);
      final ReduceSinkOperator rsOp = (ReduceSinkOperator) joinOp.getParentOperators().get(srcPos);

      // 4. Find expression in input RS operator.
      final Operator<?> rsOpInput = rsOp.getParentOperators().get(0);
      final ExprNodeDesc rsOpInputExprNode = rsOp.getColumnExprMap().get(columnRefJoinInput);
      if (rsOpInputExprNode == null) {
        // Unexpected, we just bail out and we do not infer additional predicates
        return false;
      }
      int posInRSOpKeys = -1;
      for (int i = 0; i < rsOp.getConf().getKeyCols().size(); i++) {
        if (rsOpInputExprNode.isSame(rsOp.getConf().getKeyCols().get(i))) {
          posInRSOpKeys = i;
          break;
        }
      }

      // 5. If it is part of the key, we can create a new semijoin.
      // In addition, we can do the same for siblings
      if (posInRSOpKeys >= 0) {
        // We pass the tests, we add it to the args for the AND expression
        addParentReduceSink(resultExprs, rsOp, posInRSOpKeys, sourceKey);
        for (int targetPos: targets[srcPos]) {
          if (srcPos == targetPos) {
            continue;
          }
          final ReduceSinkOperator otherRsOp = (ReduceSinkOperator) joinOp.getParentOperators().get(targetPos);
          final Operator<?> otherRsOpInput = otherRsOp.getParentOperators().get(0);
          // We pass the tests, we add it to the args for the AND expression
          addParentReduceSink(resultExprs, otherRsOp, posInRSOpKeys, sourceKey);
          // We propagate to operator below
          boolean success = createDerivatives(
              resultExprs, otherRsOpInput, otherRsOp.getConf().getKeyCols().get(posInRSOpKeys), sourceKey);
          if (!success) {
            // Something went wrong, bail out
            return false;
          }
        }
      }

      // 6. Whether it was part of the key or of the value, if we reach here, we can at least
      // continue propagating to operators below
      boolean success = createDerivatives(
          resultExprs, rsOpInput, rsOpInputExprNode, sourceKey);
      if (!success) {
        // Something went wrong, bail out
        return false;
      }

      // 7. We are done, success
      return true;
    }

    /**
     * Appends to {@code andArgs} a synthetic {@code sourceKey IN (dynamic key list)}
     * predicate built from key {@code keyIndex} of the given reduce sink.
     *
     * @param andArgs   accumulator for predicates that will be AND-ed together
     * @param rsOp      reduce sink whose key values feed the dynamic list
     * @param keyIndex  index of the key column within the RS key list
     * @param sourceKey left-hand side of the generated IN expression
     */
    private void addParentReduceSink(final List<ExprNodeDesc> andArgs, final ReduceSinkOperator rsOp,
        final int keyIndex, final ExprNodeDesc sourceKey) throws SemanticException {
      ExprNodeDynamicListDesc dynamicExpr =
          new ExprNodeDynamicListDesc(rsOp.getConf().getKeyCols().get(keyIndex).getTypeInfo(), rsOp, keyIndex);
      // Create synthetic IN expression
      List<ExprNodeDesc> inArgs = new ArrayList<>();
      inArgs.add(sourceKey);
      inArgs.add(dynamicExpr);
      ExprNodeDesc newNode = ExprNodeGenericFuncDesc.newInstance(
          FunctionRegistry.getFunctionInfo("in").getGenericUDF(), inArgs);
      andArgs.add(newNode);
    }
  }

  private static class Vectors {

    private final Set[] vector;

    @SuppressWarnings("unchecked")
    public Vectors(int length) {
      vector = new Set[length];
    }

    public void add(int from, int to) {
      if (vector[from] == null) {
        vector[from] = new HashSet();
      }
      vector[from].add(to);
    }

    public int[] traverse(int pos) {
      Set targets = new HashSet();
      traverse(targets, pos);
      return toArray(targets);
    }

    private int[] toArray(Set values) {
      int index = 0;
      int[] result = new int[values.size()];
      for (int value : values) {
        result[index++] = value;
      }
      return result;
    }

    private void traverse(Set targets, int pos) {
      if (vector[pos] == null) {
        return;
      }
      for (int target : vector[pos]) {
        if (targets.add(target)) {
          traverse(targets, target);
        }
      }
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy