// org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx Maven / Gradle / Ivy
// The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.physical;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
/**
* This class contains the bucketing sorting context that is passed
* while walking the operator tree in inferring bucket/sort columns. The context
* contains the mappings from operators and files to the columns their output is
* bucketed/sorted on.
*/
public class BucketingSortingCtx implements NodeProcessorCtx {
private boolean disableBucketing;
// A mapping from an operator to the columns by which it's output is bucketed
private Map, List> bucketedColsByOp;
// A mapping from a directory which a FileSinkOperator writes into to the columns by which that
// output is bucketed
private Map> bucketedColsByDirectory;
// A mapping from an operator to the columns by which it's output is sorted
private Map, List> sortedColsByOp;
// A mapping from a directory which a FileSinkOperator writes into to the columns by which that
// output is sorted
private Map> sortedColsByDirectory;
public BucketingSortingCtx(boolean disableBucketing) {
this.disableBucketing = disableBucketing;
this.bucketedColsByOp = new HashMap, List>();
this.bucketedColsByDirectory = new HashMap>();
this.sortedColsByOp = new HashMap, List>();
this.sortedColsByDirectory = new HashMap>();
}
public List getBucketedCols(Operator op) {
return disableBucketing ? null : bucketedColsByOp.get(op);
}
public void setBucketedCols(Operator op, List bucketCols) {
if (!disableBucketing) {
bucketedColsByOp.put(op, bucketCols);
}
}
public Map> getBucketedColsByDirectory() {
return disableBucketing ? null : bucketedColsByDirectory;
}
public void setBucketedColsByDirectory(Map> bucketedColsByDirectory) {
if (!disableBucketing) {
this.bucketedColsByDirectory = bucketedColsByDirectory;
}
}
public List getSortedCols(Operator op) {
return sortedColsByOp.get(op);
}
public void setSortedCols(Operator op, List sortedCols) {
this.sortedColsByOp.put(op, sortedCols);
}
public Map> getSortedColsByDirectory() {
return sortedColsByDirectory;
}
public void setSortedColsByDirectory(Map> sortedColsByDirectory) {
this.sortedColsByDirectory = sortedColsByDirectory;
}
/**
*
* BucketSortCol.
*
* Classes that implement this interface provide a way to store information about equivalent
* columns as their names and indexes in the schema change going into and out of operators. The
* definition of equivalent columns is up to the class which uses these classes, e.g.
* BucketingSortingOpProcFactory. For example, two columns are equivalent if they
* contain exactly the same data. Though, it's possible that two columns contain exactly the
* same data and are not known to be equivalent.
*
* E.g. SELECT key a, key b FROM (SELECT key, count(*) c FROM src GROUP BY key) s;
* In this case, assuming this is done in a single map reduce job with the group by operator
* processed in the reducer, the data coming out of the group by operator will be bucketed
* by key, which would be at index 0 in the schema, after the outer select operator, the output
* can be viewed as bucketed by either the column with alias a or the column with alias b. To
* represent this, there could be a single BucketSortCol implementation instance whose names
* include both a and b, and whose indexes include both 0 and 1.
*
* Implementations of this interface should maintain the restriction that the alias
* getNames().get(i) should have index getIndexes().get(i) in the schema.
*/
public static interface BucketSortCol {
// Get a list of aliases for the same column
public List getNames();
// Get a list of indexes for which the columns in the schema are the same
public List getIndexes();
// Add an alternative alias for the column this instance represents, and its index in the
// schema.
public void addAlias(String name, Integer index);
}
/**
*
* BucketCol.
*
* An implementation of BucketSortCol which contains known aliases/indexes of equivalent columns
* which data is determined to be bucketed on.
*/
public static final class BucketCol implements BucketSortCol, Serializable {
private static final long serialVersionUID = 1L;
// Equivalent aliases for the column
private final List names = new ArrayList();
// Indexes of those equivalent columns
private final List indexes = new ArrayList();
public BucketCol(String name, int index) {
addAlias(name, index);
}
public BucketCol() {
}
@Override
public List getNames() {
return names;
}
@Override
public List getIndexes() {
return indexes;
}
@Override
public void addAlias(String name, Integer index) {
names.add(name);
indexes.add(index);
}
@Override
// Chooses a representative alias and index to use as the String, the first is used because
// it is set in the constructor
public String toString() {
return "name: " + names.get(0) + " index: " + indexes.get(0);
}
}
/**
*
* SortCol.
*
* An implementation of BucketSortCol which contains known aliases/indexes of equivalent columns
* which data is determined to be sorted on. Unlike aliases, and indexes the sort order is known
* to be constant for all equivalent columns.
*/
public static final class SortCol implements BucketSortCol, Serializable {
public SortCol() {
super();
}
private static final long serialVersionUID = 1L;
// Equivalent aliases for the column
private List names = new ArrayList();
// Indexes of those equivalent columns
private List indexes = new ArrayList();
// Sort order (+|-)
private char sortOrder;
public SortCol(String name, int index, char sortOrder) {
this(sortOrder);
addAlias(name, index);
}
public SortCol(char sortOrder) {
this.sortOrder = sortOrder;
}
@Override
public List getNames() {
return names;
}
@Override
public List getIndexes() {
return indexes;
}
@Override
public void addAlias(String name, Integer index) {
names.add(name);
indexes.add(index);
}
public char getSortOrder() {
return sortOrder;
}
@Override
// Chooses a representative alias, index, and order to use as the String, the first is used
// because it is set in the constructor
public String toString() {
return "name: " + names.get(0) + " index: " + indexes.get(0) + " order: " + sortOrder;
}
}
}