
org.apache.hadoop.hive.ql.optimizer.physical.SparkDynamicPartitionPruningResolver

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.physical;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.spark.SparkUtilities;
import com.facebook.presto.hive.$internal.org.slf4j.Logger;
import com.facebook.presto.hive.$internal.org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc.DPPTargetInfo;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapWork;

/**
 * A physical optimization that disables DPP if the source {@link MapWork} and target {@link MapWork} aren't in
 * dependent {@link SparkTask}s.
 *
 * <p>
 *   When DPP is run, the source {@link MapWork} produces a temp file that is read by the target {@link MapWork}. The
 *   source {@link MapWork} must be run before the target {@link MapWork} is run, otherwise the target {@link MapWork}
 *   will throw a {@link java.io.FileNotFoundException}. In order to guarantee this, the source {@link MapWork} must be
 *   inside a {@link SparkTask} that runs before the {@link SparkTask} containing the target {@link MapWork}.
 * </p>
 *
 * <p>
 *   This {@link PhysicalPlanResolver} works by walking through the {@link Task} DAG and iterating over all the
 *   {@link SparkPartitionPruningSinkOperator}s inside the {@link SparkTask}. For each sink operator, it takes the
 *   target {@link MapWork} and checks if it exists in any of the child {@link SparkTask}s. If the target
 *   {@link MapWork} is not in any child {@link SparkTask} then it removes the operator subtree that contains the
 *   {@link SparkPartitionPruningSinkOperator}.
 * </p>
 */
public class SparkDynamicPartitionPruningResolver implements PhysicalPlanResolver {

  private static final Logger LOG = LoggerFactory.getLogger(SparkDynamicPartitionPruningResolver.class.getName());

  @Override
  public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {

    // Walk through the Task Graph and invoke SparkDynamicPartitionPruningDispatcher
    TaskGraphWalker graphWalker = new TaskGraphWalker(new SparkDynamicPartitionPruningDispatcher());

    ArrayList<Node> rootTasks = new ArrayList<>();
    rootTasks.addAll(pctx.getRootTasks());
    graphWalker.startWalking(rootTasks, null);
    return pctx;
  }

  private class SparkDynamicPartitionPruningDispatcher implements Dispatcher {

    @Override
    public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
      Task<? extends Serializable> task = (Task<? extends Serializable>) nd;

      // If the given Task is a SparkTask then search its Work DAG for SparkPartitionPruningSinkOperator
      if (task instanceof SparkTask) {

        // Search for any SparkPartitionPruningSinkOperator in the SparkTask
        for (BaseWork baseWork : ((SparkTask) task).getWork().getAllWork()) {
          Set<Operator<?>> pruningSinkOps = OperatorUtils.getOp(baseWork, SparkPartitionPruningSinkOperator.class);

          // For each SparkPartitionPruningSinkOperator, take the target MapWork and see if it is in a dependent SparkTask
          for (Operator<?> op : pruningSinkOps) {
            SparkPartitionPruningSinkOperator pruningSinkOp = (SparkPartitionPruningSinkOperator) op;
            SparkPartitionPruningSinkDesc desc = pruningSinkOp.getConf();
            List<DPPTargetInfo> toRemove = new ArrayList<>();
            for (DPPTargetInfo targetInfo : desc.getTargetInfos()) {
              MapWork targetMapWork = targetInfo.work;

              // Check if the given SparkTask has a child SparkTask that contains the target MapWork.
              // If it does not, then remove the target from the DPP op.
              if (!taskContainsDependentMapWork(task, targetMapWork)) {
                toRemove.add(targetInfo);
                pruningSinkOp.removeFromSourceEvent(targetMapWork, targetInfo.partKey, targetInfo.columnName,
                    targetInfo.columnType);
                LOG.info("Removing target map work " + targetMapWork.getName() + " from " + baseWork.getName() +
                    " as no dependency exists between the two works.");
              }
            }
            desc.getTargetInfos().removeAll(toRemove);
            if (desc.getTargetInfos().isEmpty()) {
              // The DPP sink has no target, remove the subtree.
              OperatorUtils.removeBranch(pruningSinkOp);
            }
          }
        }
      }
      return null;
    }
  }

  /**
   * Recursively go through the children of the given {@link Task} and check if any child {@link SparkTask} contains
   * the specified {@link MapWork} object.
   */
  private boolean taskContainsDependentMapWork(Task<? extends Serializable> task,
                                               MapWork work) throws SemanticException {
    if (task == null || task.getChildTasks() == null) {
      return false;
    }
    for (Task<? extends Serializable> childTask : task.getChildTasks()) {
      if (childTask != null && childTask instanceof SparkTask && childTask.getMapWork().contains(work)) {
        return true;
      } else if (taskContainsDependentMapWork(childTask, work)) {
        return true;
      }
    }
    return false;
  }
}
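
For context, the following is a minimal, self-contained sketch of the dependency check the class javadoc describes: a DPP target is kept only if some descendant task contains the target work; otherwise the target, and eventually the whole sink branch, is dropped. ToyTask and ToyWork are hypothetical stand-ins for Hive's Task and MapWork and are not part of the Hive API; the sketch only illustrates the recursive traversal performed by taskContainsDependentMapWork, not the actual plan surgery.

// Illustrative only: ToyTask/ToyWork are hypothetical stand-ins, not Hive classes.
import java.util.ArrayList;
import java.util.List;

public class DependentWorkCheckSketch {

  static class ToyWork {
  }

  static class ToyTask {
    final List<ToyWork> works = new ArrayList<>();
    final List<ToyTask> children = new ArrayList<>();
  }

  // Returns true if any descendant task contains the given work object, mirroring the
  // recursion in taskContainsDependentMapWork(): a DPP target is only useful when the
  // target work runs in a task downstream of the task holding the pruning sink.
  static boolean taskContainsDependentWork(ToyTask task, ToyWork work) {
    if (task == null || task.children.isEmpty()) {
      return false;
    }
    for (ToyTask child : task.children) {
      if (child.works.contains(work) || taskContainsDependentWork(child, work)) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    ToyWork targetWork = new ToyWork();

    ToyTask sourceTask = new ToyTask();        // task containing the DPP sink
    ToyTask downstreamTask = new ToyTask();    // child task that reads the pruning output
    downstreamTask.works.add(targetWork);
    sourceTask.children.add(downstreamTask);

    // Dependency exists, so the resolver would keep this DPP target.
    System.out.println(taskContainsDependentWork(sourceTask, targetWork));   // true

    // No descendant contains the target work, so the resolver would drop the target
    // (and remove the sink's operator branch once no targets remain).
    ToyTask isolatedTask = new ToyTask();
    System.out.println(taskContainsDependentWork(isolatedTask, targetWork)); // false
  }
}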



