// org.apache.hadoop.hive.ql.optimizer.SamplePruner
// (captured from a Maven / Gradle / Ivy artifact listing for hive-exec)
//
// Note: the listing reports that a newer version of hive-exec exists: 4.0.0
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;

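/**
 * The transformation step that does sample pruning. It walks the operator
 * tree and, for a filter that is the first or second operator after a table
 * scan, records the filter's sample descriptor against that table scan when
 * the descriptor allows input pruning. The class also hosts static helpers
 * that pick the files to read for a TABLESAMPLE clause ({@link #prune}) and
 * for a size-bounded scan ({@link #limitPrune}).
 */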
public class SamplePruner implements Transform {

  /**
   * Walker context used during sample pruning. It holds the map into which
   * {@link FilterPPR} stores a sample descriptor keyed by the table scan
   * operator it applies to.
   */
  public static class SamplePrunerCtx implements NodeProcessorCtx {
    // Map shared with whoever created this context; entries are added by the
    // filter processor while the operator tree is walked.
    HashMap opToSamplePruner;

    /**
     * @param opToSamplePruner
     *          the map this context should expose; the reference is kept as is
     */
    public SamplePrunerCtx(HashMap opToSamplePruner) {
      this.opToSamplePruner = opToSamplePruner;
    }

    /**
     * @param opToSamplePruner
     *          the map that replaces the one currently held
     */
    public void setOpToSamplePruner(HashMap opToSamplePruner) {
      this.opToSamplePruner = opToSamplePruner;
    }

    /**
     * @return the map currently held by this context
     */
    public HashMap getOpToSamplePruner() {
      return this.opToSamplePruner;
    }
  }

  // Logger shared by all static helpers and processors in this class. It is
  // registered under the explicit name "hive.ql.optimizer.SamplePruner"
  // rather than under the class object.
  private static final Log LOG = LogFactory
      .getLog("hive.ql.optimizer.SamplePruner");

  /**
   * Walks the operator tree from every top operator and lets
   * {@link FilterPPR} record sample descriptors for table scans.
   *
   * A single rule is registered. It matches a table scan followed by either
   * one filter or two consecutive filters; every other node is handled by the
   * no-op default processor.
   *
   * @param pctx
   *          the parse context whose top operators are walked
   * @return the same {@code pctx} instance that was passed in
   * @throws SemanticException
   *           if the graph walk fails
   * @see org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop.hive.ql.parse.ParseContext)
   */
  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    // Create the context for walking operators. It holds the very map object
    // returned by the parse context, so entries recorded during the walk are
    // visible through that map afterwards.
    SamplePrunerCtx samplePrunerCtx = new SamplePrunerCtx(pctx
        .getOpToSamplePruner());

    // Rule pattern fragments: each operator name is terminated by '%'.
    String tableScan = TableScanOperator.getOperatorName() + "%";
    String filter = FilterOperator.getOperatorName() + "%";

    // LinkedHashMap keeps the rules in registration order.
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1",
        "(" + tableScan + filter + filter + "|" + tableScan + filter + ")"),
        getFilterProc());

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
        samplePrunerCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Start the walk from all top operators; no per-node output map is needed.
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
  }

  /**
   * Processor fired for a filter that is the first or second operator after a
   * table scan. When the filter carries a sample descriptor that permits input
   * pruning, the descriptor is registered against the table scan in the walker
   * context; otherwise nothing is recorded.
   */
  public static class FilterPPR implements NodeProcessor {

    /**
     * @param nd
     *          the filter operator being visited
     * @param stack
     *          the walker stack; its bottom element is the table scan
     * @param procCtx
     *          the {@link SamplePrunerCtx} that receives the mapping
     * @param nodeOutputs
     *          unused
     * @return always {@code null}
     */
    @Override
    public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      sampleDesc sampling = ((FilterOperator) nd).getConf().getSampleDescr();
      boolean prunable = sampling != null && sampling.getInputPruning();

      if (prunable) {
        // Expected stack shapes: [scan, filter] or [scan, filter, filter].
        assert stack.size() == 2
            || (stack.size() == 3 && stack.get(1) instanceof FilterOperator);

        TableScanOperator scan = (TableScanOperator) stack.get(0);
        SamplePrunerCtx prunerCtx = (SamplePrunerCtx) procCtx;
        prunerCtx.getOpToSamplePruner().put(scan, sampling);
      }
      return null;
    }
  }

  /**
   * @return a new {@link FilterPPR} instance on every call
   */
  public static NodeProcessor getFilterProc() {
    NodeProcessor filterProcessor = new FilterPPR();
    return filterProcessor;
  }

  /**
   * Fallback processor used for every node that no rule matches. It performs
   * no work and produces no output.
   */
  public static class DefaultPPR implements NodeProcessor {

    /**
     * @return always {@code null}; all arguments are ignored
     */
    @Override
    public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Intentionally empty: unmatched nodes need no handling.
      return null;
    }
  }

  /**
   * @return a new {@link DefaultPPR} instance on every call
   */
  public static NodeProcessor getDefaultProc() {
    NodeProcessor defaultProcessor = new DefaultPPR();
    return defaultProcessor;
  }

  /**
   * Prunes to get all the files in the partition that satisfy the TABLESAMPLE
   * clause.
   *
   * When the sample descriptor allows input pruning, the bucket files are
   * chosen from the numerator {@code num}, the denominator {@code den} and the
   * partition's bucket count:
   * <ul>
   * <li>bucket count equal to {@code den}: the single bucket
   * {@code num - 1};</li>
   * <li>bucket count a larger multiple of {@code den}: every
   * {@code den}-th bucket starting at {@code num - 1};</li>
   * <li>{@code den} a larger multiple of the bucket count: the single bucket
   * {@code (num - 1) % bucketCount}.</li>
   * </ul>
   * In every other case, including a non-positive bucket count or
   * denominator, a warning is logged and all paths of the partition are
   * returned.
   *
   * @param part
   *          The partition to prune
   * @param sampleDescr
   *          The sample descriptor supplying numerator, denominator and the
   *          input pruning flag
   * @return the paths to read for the sample
   * @throws SemanticException
   */
  @SuppressWarnings("nls")
  public static Path[] prune(Partition part, sampleDesc sampleDescr)
      throws SemanticException {
    int num = sampleDescr.getNumerator();
    int den = sampleDescr.getDenominator();
    int bucketCount = part.getBucketCount();
    String fullScanMsg;

    // check if input pruning is possible
    if (sampleDescr.getInputPruning()) {
      LOG.trace("numerator = " + num);
      LOG.trace("denominator = " + den);
      LOG.trace("bucket count = " + bucketCount);

      // Non-positive values would make the modulo operations below throw
      // ArithmeticException or produce a meaningless bucket index, so they
      // are routed to the full scan instead.
      if (bucketCount > 0 && den > 0) {
        if (bucketCount == den) {
          return new Path[] {part.getBucketPath(num - 1)};
        }
        if (bucketCount > den && bucketCount % den == 0) {
          int numPathsInSample = bucketCount / den;
          Path[] ret = new Path[numPathsInSample];
          for (int i = 0; i < numPathsInSample; i++) {
            ret[i] = part.getBucketPath(i * den + num - 1);
          }
          return ret;
        }
        if (bucketCount < den && den % bucketCount == 0) {
          return new Path[] {part.getBucketPath((num - 1) % bucketCount)};
        }
      }

      // need to do full scan
      fullScanMsg = "Tablesample denominator " + den
          + " is not multiple/divisor of bucket count " + bucketCount
          + " of table " + part.getTable().getTableName();
    } else {
      // need to do full scan
      fullScanMsg = "Tablesample not on clustered columns";
    }
    LOG.warn(fullScanMsg + ", using full table scan");
    return part.getPath();
  }

  /**
   * Result holder returned by addPath(): what was found while collecting
   * files and how much size is still missing.
   */
  public static class AddPathReturnStatus {
    // true when at least one regular file was encountered
    public boolean hasFile;
    // true when the traversal was never cut short, i.e. no remaining entry was
    // skipped because the requested size had already been reached
    public boolean allFile;
    // size still needed after the collected files are taken into account;
    // zero or negative once the requested size has been reached
    public long sizeLeft;

    public AddPathReturnStatus(boolean hasFile, boolean allFile, long sizeLeft) {
      this.sizeLeft = sizeLeft;
      this.allFile = allFile;
      this.hasFile = hasFile;
    }
  }


  /**
   * Try to recursively add files in sub-directories into retPathList until
   * reaching the sizeLeft.
   *
   * Entries matching the pattern are visited in sorted order. Directories are
   * descended into with the pattern {@code <dir>/*}; regular files are
   * appended to {@code retPathList} and their length is subtracted from the
   * size still needed. The visit stops as soon as the size still needed drops
   * to zero or below.
   *
   * @param fs
   *          file system used to expand the pattern
   * @param pathPattern
   *          glob pattern of the entries to inspect
   * @param sizeLeft
   *          size that still has to be covered by the collected files
   * @param fileLimit
   *          number of collected paths at which the attempt is abandoned if
   *          the size has not been reached yet
   * @param retPathList
   *          receives the path of every file that is collected
   * @return status of the recursive call, or {@code null} when the file limit
   *         was hit before the size was reached
   * @throws IOException
   */
  public static AddPathReturnStatus addPath(FileSystem fs, String pathPattern, long sizeLeft, int fileLimit,
      Collection retPathList)
      throws IOException {
    LOG.info("Path pattern = " + pathPattern);
    FileStatus srcs[] = fs.globStatus(new Path(pathPattern));
    if (srcs == null) {
      // globStatus can return null instead of an empty array (for example
      // for a non-glob path that does not exist); handle it as "no matches"
      // rather than failing in Arrays.sort.
      srcs = new FileStatus[0];
    }
    Arrays.sort(srcs);

    boolean hasFile = false, allFile = true;

    for (FileStatus src : srcs) {
      if (sizeLeft <= 0) {
        // Enough data collected while entries remain: not all files needed.
        allFile = false;
        break;
      }
      if (src.isDir()) {
        LOG.info("Got directory: " + src.getPath());
        AddPathReturnStatus ret = addPath(fs, src.getPath().toString() + "/*", sizeLeft,
            fileLimit, retPathList);
        if (ret == null) {
          // not qualify this optimization
          return null;
        }
        // Fold the sub-directory result into the running state.
        sizeLeft = ret.sizeLeft;
        hasFile |= ret.hasFile;
        allFile &= ret.allFile;
      } else {
        LOG.info("Got file: " + src.getPath());
        hasFile = true;
        retPathList.add(src.getPath());
        sizeLeft -= src.getLen();
        // Too many files collected and still short of the requested size.
        if (retPathList.size() >= fileLimit && sizeLeft > 0) {
          return null;
        }
      }
    }
    return new AddPathReturnStatus(hasFile, allFile, sizeLeft);
  }

  /**
   * Outcome of limitPrune() for one partition.
   */
  public enum LimitPruneRetStatus {
    // no files in the partition
    NoFile,
    // the size limit was reached and no file of the partition was skipped,
    // i.e. every file found was added to the returned list
    NeedAllFiles,
    // a subset of files for the partition are sufficient for the optimization
    NeedSomeFiles,
    // the partition doesn't qualify the global limit optimization: either the
    // file limit was hit before the size limit was reached, or all files
    // together are smaller than the size limit
    NotQualify
  }


  /**
   * Attempts to pick a subset of the partition's files whose total size
   * reaches {@code sizeLimit} while using fewer than {@code fileLimit} files.
   * Files are collected from the pattern {@code <data location>/*}.
   *
   * @param part
   *          partition whose data location is scanned
   * @param sizeLimit
   *          total size the collected files should reach
   * @param fileLimit
   *          number of collected files at which the attempt is abandoned if
   *          the size has not been reached
   * @param retPathList
   *          list of Paths returned
   * @return the result of the attempt
   * @throws SemanticException
   *           declared by the signature; any failure inside the method is
   *           rethrown as a RuntimeException carrying the original cause
   */
  public static LimitPruneRetStatus limitPrune(Partition part, long sizeLimit, int fileLimit,
      Collection retPathList)
      throws SemanticException {

    try {
      FileSystem fileSystem = part.getDataLocation().getFileSystem(Hive.get().getConf());
      String globPattern = part.getDataLocation().toString() + "/*";
      AddPathReturnStatus status =
          addPath(fileSystem, globPattern, sizeLimit, fileLimit, retPathList);

      // File limit exceeded before the size was reached.
      if (status == null) {
        return LimitPruneRetStatus.NotQualify;
      }
      if (!status.hasFile) {
        return LimitPruneRetStatus.NoFile;
      }
      // All files together are still short of the requested size.
      if (status.sizeLeft > 0) {
        return LimitPruneRetStatus.NotQualify;
      }
      return status.allFile
          ? LimitPruneRetStatus.NeedAllFiles
          : LimitPruneRetStatus.NeedSomeFiles;
    } catch (Exception e) {
      throw new RuntimeException("Cannot get path", e);
    }

  }

}