// Source listing of org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx
// from the hive-exec artifact (Maven / Gradle / Ivy).
// Note: a newer version of hive-exec is available: 4.0.0.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.physical;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

/**
 * This class contains the bucketing sorting context that is passed
 * while walking the operator tree in inferring bucket/sort columns. The context
 * contains the mappings from operators and files to the columns their output is
 * bucketed/sorted on.
 */
public class BucketingSortingCtx implements NodeProcessorCtx {

  private boolean disableBucketing;

  // A mapping from an operator to the columns by which it's output is bucketed
  private Map, List> bucketedColsByOp;
  // A mapping from a directory which a FileSinkOperator writes into to the columns by which that
  // output is bucketed
  private Map> bucketedColsByDirectory;

  // A mapping from an operator to the columns by which it's output is sorted
  private Map, List> sortedColsByOp;
  // A mapping from a directory which a FileSinkOperator writes into to the columns by which that
  // output is sorted
  private Map> sortedColsByDirectory;

  public BucketingSortingCtx(boolean disableBucketing) {
    this.disableBucketing = disableBucketing;
    this.bucketedColsByOp = new HashMap, List>();
    this.bucketedColsByDirectory = new HashMap>();
    this.sortedColsByOp = new HashMap, List>();
    this.sortedColsByDirectory = new HashMap>();
  }


  public List getBucketedCols(Operator op) {
    return disableBucketing ? null : bucketedColsByOp.get(op);
  }


  public void setBucketedCols(Operator op, List bucketCols) {
    if (!disableBucketing) {
      bucketedColsByOp.put(op, bucketCols);
    }
  }

  public Map> getBucketedColsByDirectory() {
    return disableBucketing ? null : bucketedColsByDirectory;
  }


  public void setBucketedColsByDirectory(Map> bucketedColsByDirectory) {
    if (!disableBucketing) {
      this.bucketedColsByDirectory = bucketedColsByDirectory;
    }
  }


  public List getSortedCols(Operator op) {
    return sortedColsByOp.get(op);
  }


  public void setSortedCols(Operator op, List sortedCols) {
    this.sortedColsByOp.put(op, sortedCols);
  }

  public Map> getSortedColsByDirectory() {
    return sortedColsByDirectory;
  }


  public void setSortedColsByDirectory(Map> sortedColsByDirectory) {
    this.sortedColsByDirectory = sortedColsByDirectory;
  }

  /**
   *
   * BucketSortCol.
   *
   * Classes that implement this interface provide a way to store information about equivalent
   * columns as their names and indexes in the schema change going into and out of operators.  The
   * definition of equivalent columns is up to the class which uses these classes, e.g.
   * BucketingSortingOpProcFactory.  For example, two columns are equivalent if they
   * contain exactly the same data.  Though, it's possible that two columns contain exactly the
   * same data and are not known to be equivalent.
   *
   * E.g. SELECT key a, key b FROM (SELECT key, count(*) c FROM src GROUP BY key) s;
   * In this case, assuming this is done in a single map reduce job with the group by operator
   * processed in the reducer, the data coming out of the group by operator will be bucketed
   * by key, which would be at index 0 in the schema, after the outer select operator, the output
   * can be viewed as bucketed by either the column with alias a or the column with alias b.  To
   * represent this, there could be a single BucketSortCol implementation instance whose names
   * include both a and b, and whose indexes include both 0 and 1.
   *
   * Implementations of this interface should maintain the restriction that the alias
   * getNames().get(i) should have index getIndexes().get(i) in the schema.
   */
  public static interface BucketSortCol {
    // Get a list of aliases for the same column
    public List getNames();

    // Get a list of indexes for which the columns in the schema are the same
    public List getIndexes();

    // Add an alternative alias for the column this instance represents, and its index in the
    // schema.
    public void addAlias(String name, Integer index);
  }

  /**
   *
   * BucketCol.
   *
   * An implementation of BucketSortCol which contains known aliases/indexes of equivalent columns
   * which data is determined to be bucketed on.
   */
  public static final class BucketCol implements BucketSortCol, Serializable {
    private static final long serialVersionUID = 1L;
    // Equivalent aliases for the column
    private final List names = new ArrayList();
    // Indexes of those equivalent columns
    private final List indexes = new ArrayList();

    public BucketCol(String name, int index) {
      addAlias(name, index);
    }

    public BucketCol() {

    }

    @Override
    public List getNames() {
      return names;
    }

    @Override
    public List getIndexes() {
      return indexes;
    }

    @Override
    public void addAlias(String name, Integer index) {
      names.add(name);
      indexes.add(index);
    }

    @Override
    // Chooses a representative alias and index to use as the String, the first is used because
    // it is set in the constructor
    public String toString() {
      return "name: " + names.get(0) + " index: " + indexes.get(0);
    }
  }

  /**
   *
   * SortCol.
   *
   * An implementation of BucketSortCol which contains known aliases/indexes of equivalent columns
   * which data is determined to be sorted on.  Unlike aliases, and indexes the sort order is known
   * to be constant for all equivalent columns.
   */
  public static final class SortCol implements BucketSortCol, Serializable {

    public SortCol() {
      super();
    }

    private static final long serialVersionUID = 1L;
    // Equivalent aliases for the column
    private List names = new ArrayList();
    // Indexes of those equivalent columns
    private List indexes = new ArrayList();
    // Sort order (+|-)
    private char sortOrder;

    public SortCol(String name, int index, char sortOrder) {
      this(sortOrder);
      addAlias(name, index);
    }

    public SortCol(char sortOrder) {
      this.sortOrder = sortOrder;
    }


    @Override
    public List getNames() {
      return names;
    }

    @Override
    public List getIndexes() {
      return indexes;
    }

    @Override
    public void addAlias(String name, Integer index) {
      names.add(name);
      indexes.add(index);
    }

    public char getSortOrder() {
      return sortOrder;
    }

    @Override
    // Chooses a representative alias, index, and order to use as the String, the first is used
    // because it is set in the constructor
    public String toString() {
      return "name: " + names.get(0) + " index: " + indexes.get(0) + " order: " + sortOrder;
    }
  }
}