// Source listing: org.apache.hadoop.hive.ql.optimizer.QueryPlanPostProcessor (Maven / Gradle / Ivy)
//
// Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.repl.ReplStateLogWork;
import org.apache.hadoop.hive.ql.exec.repl.bootstrap.ReplLoadWork;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.parse.GenTezProcContext;
import org.apache.hadoop.hive.ql.parse.GenTezWork;
import org.apache.hadoop.hive.ql.parse.spark.GenSparkWork;
import org.apache.hadoop.hive.ql.plan.ArchiveWork;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.BasicStatsNoJobWork;
import org.apache.hadoop.hive.ql.plan.BasicStatsWork;
import org.apache.hadoop.hive.ql.plan.ColumnStatsUpdateWork;
import org.apache.hadoop.hive.ql.plan.ConditionalWork;
import org.apache.hadoop.hive.ql.plan.CopyWork;
import org.apache.hadoop.hive.ql.plan.DDLWork;
import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork;
import org.apache.hadoop.hive.ql.plan.ExplainSQRewriteWork;
import org.apache.hadoop.hive.ql.plan.ExplainWork;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.FunctionWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.TezWork;
import com.facebook.presto.hive.$internal.org.slf4j.Logger;
import com.facebook.presto.hive.$internal.org.slf4j.LoggerFactory;

import java.util.List;
import java.util.Set;

/**
 * Finds Acid FileSinkDesc objects which can be created in the physical (disconnected) plan, e.g.
 * {@link org.apache.hadoop.hive.ql.parse.GenTezUtils#removeUnionOperators(GenTezProcContext, BaseWork, int)}
 * so that statementId can be properly assigned to ensure unique ROW__IDs
 * {@link org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcFactory} is another example where
 * Union All optimizations create new FileSinkDescS
 */
public class QueryPlanPostProcessor {
  private static final Logger LOG = LoggerFactory.getLogger(QueryPlanPostProcessor.class);

  public QueryPlanPostProcessor(List> rootTasks, Set acidSinks, String executionId) {
    for(Task t : rootTasks) {
      //Work
      Object work = t.getWork();
      if(work instanceof TezWork) {
        for(BaseWork bw : ((TezWork)work).getAllWorkUnsorted()) {
          collectFileSinkDescs(bw.getAllLeafOperators(), acidSinks);
        }
      }
      else if(work instanceof BaseWork) {
        collectFileSinkDescs(((BaseWork)work).getAllLeafOperators(), acidSinks);
      }
      else if(work instanceof MapredWork) {
        MapredWork w = (MapredWork)work;
        if(w.getMapWork() != null) {
          collectFileSinkDescs(w.getMapWork().getAllLeafOperators(), acidSinks);
        }
        if(w.getReduceWork() != null) {
          collectFileSinkDescs(w.getReduceWork().getAllLeafOperators(), acidSinks);
        }
      }
      else if(work instanceof SparkWork) {
        for(BaseWork bw : ((SparkWork)work).getRoots()) {
          collectFileSinkDescs(bw.getAllLeafOperators(), acidSinks);
        }
      }
      else if(work instanceof MapredLocalWork) {
        //I don't think this can have any FileSinkOperatorS - more future proofing
        Set fileSinkOperatorSet = OperatorUtils.findOperators(((MapredLocalWork) work).getAliasToWork().values(), FileSinkOperator.class);
        for(FileSinkOperator fsop : fileSinkOperatorSet) {
          collectFileSinkDescs(fsop, acidSinks);
        }
      }
      else if(work instanceof ExplainWork) {
        new QueryPlanPostProcessor(((ExplainWork)work).getRootTasks(), acidSinks, executionId);
      }
      else if(work instanceof ReplLoadWork ||
        work instanceof ReplStateLogWork ||
        work instanceof GenTezWork ||
        work instanceof GenSparkWork ||
        work instanceof ArchiveWork ||
        work instanceof ColumnStatsUpdateWork ||
        work instanceof BasicStatsWork ||
        work instanceof ConditionalWork ||
        work instanceof CopyWork ||
        work instanceof DDLWork ||
        work instanceof DependencyCollectionWork ||
        work instanceof ExplainSQRewriteWork ||
        work instanceof FetchWork ||
        work instanceof FunctionWork ||
        work instanceof MoveWork ||
        work instanceof BasicStatsNoJobWork ||
        work instanceof StatsWork) {
        LOG.debug("Found " + work.getClass().getName() + " - no FileSinkOperation can be present.  executionId=" + executionId);
      }
      else {
        //if here, someone must have added new Work object - should it be walked to find FileSinks?
        throw new IllegalArgumentException("Unexpected Work object: " + work.getClass() + " executionId=" + executionId);
      }
    }
  }
  private void collectFileSinkDescs(Operator leaf, Set acidSinks) {
    if(leaf instanceof FileSinkOperator) {
      FileSinkDesc fsd = ((FileSinkOperator) leaf).getConf();
      if(fsd.getWriteType() != AcidUtils.Operation.NOT_ACID) {
        if(acidSinks.add(fsd)) {
          if(LOG.isDebugEnabled()) {
            LOG.debug("Found Acid Sink: " + fsd.getDirName());
          }
        }
      }
    }
  }
  private void collectFileSinkDescs(Set> leaves, Set acidSinks) {
    for(Operator leaf : leaves) {
      collectFileSinkDescs(leaf, acidSinks);
    }
  }
}