/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.index;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.index.AggregateIndexHandler;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.IndexUtils;
import org.apache.hadoop.hive.ql.optimizer.Transform;
import org.apache.hadoop.hive.ql.parse.OpParseContext;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
/**
* RewriteGBUsingIndex is implemented as one of the Rule-based Optimizations.
* Implements optimizations for GroupBy clause rewrite using aggregate index.
* This optimization rewrites a GroupBy query over the base table to a query over a simple
* table-scan of the index table, if there is an index on the group by key(s) or the distinct column(s).
* E.g.
*
* select count(key)
* from table
* group by key;
*
* to
*
* select sum(_count_of_key)
* from idx_table
* group by key;
*
*
* The rewrite supports the following queries:
* <ul>
*   <li> Queries having only those col refs that are in the index key.
*   <li> Queries that have index key col refs
*   <ul>
*     <li> in SELECT
*     <li> in WHERE
*     <li> in GROUP BY
*   </ul>
*   <li> Queries with agg func COUNT(index key col ref) in SELECT
*   <li> Queries with SELECT DISTINCT index_key_col_refs
*   <li> Queries having a subquery satisfying above condition (only the subquery is rewritten)
* </ul>
*
* @see AggregateIndexHandler
* @see IndexUtils
* @see RewriteCanApplyCtx
* @see RewriteCanApplyProcFactory
* @see RewriteParseContextGenerator
* @see RewriteQueryUsingAggregateIndexCtx
* @see RewriteQueryUsingAggregateIndex
* For test cases, @see ql_rewrite_gbtoidx.q
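*
* A sketch of the setup this optimizer targets (illustrative only): the handler class name
* below is real, but the table, column, and index names are made up, and the property used
* to enable this rewrite is assumed here to be hive.optimize.index.groupby.
*
*   CREATE INDEX idx_tbl_key ON TABLE tbl(key)
*   AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler'
*   WITH DEFERRED REBUILD;
*   ALTER INDEX idx_tbl_key ON tbl REBUILD;
*   SET hive.optimize.index.groupby=true;
*   SELECT count(key) FROM tbl GROUP BY key;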
*/
public class RewriteGBUsingIndex implements Transform {
private ParseContext parseContext;
private Hive hiveDb;
private HiveConf hiveConf;
private static final Log LOG = LogFactory.getLog(RewriteGBUsingIndex.class.getName());
/*
* Stores the list of top TableScanOperator names for which the rewrite
* can be applied and the action that needs to be performed for operator tree
* starting from this TableScanOperator
*/
private final Map<String, RewriteCanApplyCtx> tsOpToProcess =
new LinkedHashMap<String, RewriteCanApplyCtx>();
//Name of the current table on which rewrite is being performed
private String baseTableName = null;
//Name of the current index which is used for rewrite
private String indexTableName = null;
//Index Validation Variables
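// These column names are checked against the index table schema in getIndexToKeysMap();
// _bucketname is assumed to hold the base-table file name and _offsets the row offsets
// within that file, as produced by the compact/aggregate index handlers.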
private static final String IDX_BUCKET_COL = "_bucketname";
private static final String IDX_OFFSETS_ARRAY_COL = "_offsets";
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
parseContext = pctx;
hiveConf = parseContext.getConf();
try {
hiveDb = Hive.get(hiveConf);
} catch (HiveException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
// Don't try to index optimize the query to build the index
HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTINDEXFILTER, false);
/* Check if the input query passes all the tests to be eligible for a rewrite
* If yes, rewrite original query; else, return the current parseContext
*/
if(shouldApplyOptimization()){
LOG.info("Rewriting Original Query using " + getName() + " optimization.");
rewriteOriginalQuery();
}
return parseContext;
}
private String getName() {
return "RewriteGBUsingIndex";
}
/**
* We traverse the current operator tree to check for conditions in which the
* optimization cannot be applied.
*
* At the end, we check if all conditions have passed for rewrite. If yes, we
* determine if the index is usable for rewrite. Else, we log the condition which
* did not meet the rewrite criterion.
*
* @return true if the rewrite can be applied, false otherwise
* @throws SemanticException
*/
boolean shouldApplyOptimization() throws SemanticException{
boolean canApply = false;
if(ifQueryHasMultipleTables()){
//We do not apply this optimization for this case as of now.
return false;
}else{
/*
* This code iterates over each TableScanOperator from the topOps map from ParseContext.
* For each operator tree originating from this top TableScanOperator, we determine
* if the optimization can be applied. If yes, we add the name of the top table to
* the tsOpToProcess to apply rewrite later on.
* */
Map<TableScanOperator, Table> topToTable = parseContext.getTopToTable();
Iterator<TableScanOperator> topOpItr = topToTable.keySet().iterator();
while(topOpItr.hasNext()){
TableScanOperator topOp = topOpItr.next();
Table table = topToTable.get(topOp);
baseTableName = table.getTableName();
Map<Table, List<Index>> indexes = getIndexesForRewrite();
if(indexes == null){
LOG.debug("Error getting valid indexes for rewrite, " +
"skipping " + getName() + " optimization");
return false;
}
if(indexes.size() == 0){
LOG.debug("No Valid Index Found to apply Rewrite, " +
"skipping " + getName() + " optimization");
return false;
}else{
//we need to check if the base table has confirmed or unknown partitions
if(parseContext.getOpToPartList() != null && parseContext.getOpToPartList().size() > 0){
//if base table has partitions, we need to check if index is built for
//all partitions. If not, then we do not apply the optimization
if(checkIfIndexBuiltOnAllTablePartitions(topOp, indexes)){
//check if rewrite can be applied for operator tree
//if partitions condition returns true
canApply = checkIfRewriteCanBeApplied(topOp, table, indexes);
}else{
LOG.debug("Index is not built for all table partitions, " +
"skipping " + getName() + " optimization");
return false;
}
}else{
//check if rewrite can be applied for operator tree
//if there are no partitions on base table
canApply = checkIfRewriteCanBeApplied(topOp, table, indexes);
}
}
}
}
return canApply;
}
/**
* This methods checks if rewrite can be applied using the index and also
* verifies all conditions of the operator tree.
*
* @param topOp - TableScanOperator for a single operator tree branch
* @param indexes - Map of a table and list of indexes on it
* @return - true if rewrite can be applied on the current branch; false otherwise
* @throws SemanticException
*/
private boolean checkIfRewriteCanBeApplied(TableScanOperator topOp, Table baseTable,
Map<Table, List<Index>> indexes) throws SemanticException{
boolean canApply = false;
//Context for checking if this optimization can be applied to the input query
RewriteCanApplyCtx canApplyCtx = RewriteCanApplyCtx.getInstance(parseContext);
Map<String, Operator<? extends OperatorDesc>> topOps = parseContext.getTopOps();
canApplyCtx.setBaseTableName(baseTableName);
canApplyCtx.populateRewriteVars(topOp);
Map<Index, Set<String>> indexTableMap = getIndexToKeysMap(indexes.get(baseTable));
Iterator<Index> indexMapItr = indexTableMap.keySet().iterator();
Index index = null;
while(indexMapItr.hasNext()){
//we rewrite the original query using the first valid index encountered
//this can be changed if we have a better mechanism to
//decide which index will produce a better rewrite
index = indexMapItr.next();
canApply = canApplyCtx.isIndexUsableForQueryBranchRewrite(index,
indexTableMap.get(index));
if(canApply){
canApply = checkIfAllRewriteCriteriaIsMet(canApplyCtx);
//break here if any valid index is found to apply rewrite
if(canApply){
//check if aggregation function is set.
//If not, set it using the only indexed column
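//e.g. with an index key column "key", the index table is expected to expose a
//pre-aggregated column "_count_of_key" (naming taken from the class javadoc example)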
if(canApplyCtx.getAggFunction() == null){
//strip off the start and end brackets [...]
String aggregationFunction = indexTableMap.get(index).toString();
aggregationFunction = aggregationFunction.substring(1,
aggregationFunction.length() - 1);
canApplyCtx.setAggFunction("_count_of_" + aggregationFunction);
}
}
break;
}
}
indexTableName = index.getIndexTableName();
if(canApply && topOps.containsValue(topOp)) {
Iterator<String> topOpNamesItr = topOps.keySet().iterator();
while(topOpNamesItr.hasNext()){
String topOpName = topOpNamesItr.next();
if(topOps.get(topOpName).equals(topOp)){
tsOpToProcess.put(topOpName, canApplyCtx);
}
}
}
if(tsOpToProcess.size() == 0){
canApply = false;
}else{
canApply = true;
}
return canApply;
}
/**
* This block of code iterates over the topToTable map from ParseContext
* to determine if the query has a scan over multiple tables.
* @return
*/
boolean ifQueryHasMultipleTables(){
Map<TableScanOperator, Table> topToTable = parseContext.getTopToTable();
Iterator<Table> valuesItr = topToTable.values().iterator();
Set<String> tableNameSet = new HashSet<String>();
while(valuesItr.hasNext()){
Table table = valuesItr.next();
tableNameSet.add(table.getTableName());
}
if(tableNameSet.size() > 1){
LOG.debug("Query has more than one table " +
"that is not supported with " + getName() + " optimization.");
return true;
}
return false;
}
/**
* Get a list of indexes which can be used for rewrite.
* @return
* @throws SemanticException
*/
private Map<Table, List<Index>> getIndexesForRewrite() throws SemanticException{
List<String> supportedIndexes = new ArrayList<String>();
supportedIndexes.add(AggregateIndexHandler.class.getName());
// query the metastore to know what columns we have indexed
Collection<Table> topTables = parseContext.getTopToTable().values();
Map<Table, List<Index>> indexes = new HashMap<Table, List<Index>>();
for (Table tbl : topTables){
List<Index> tblIndexes = IndexUtils.getIndexes(tbl, supportedIndexes);
if (tblIndexes.size() > 0) {
indexes.put(tbl, tblIndexes);
}
}
return indexes;
}
/**
* This method checks if the index is built on all partitions of the base
* table. If not, then the method returns false as we do not apply optimization
* for this case.
* @param tableScan
* @param indexes
* @return
* @throws SemanticException
*/
private boolean checkIfIndexBuiltOnAllTablePartitions(TableScanOperator tableScan,
Map<Table, List<Index>> indexes) throws SemanticException{
// check if we have indexes on all partitions in this table scan
Set<Partition> queryPartitions;
try {
queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(tableScan, parseContext, indexes);
if (queryPartitions == null) { // partitions not covered
return false;
}
} catch (HiveException e) {
LOG.error("Fatal Error: problem accessing metastore", e);
throw new SemanticException(e);
}
if(queryPartitions.size() != 0){
return true;
}
return false;
}
/**
* This code block iterates over indexes on the table and populates the indexToKeys map
* for all the indexes that satisfy the rewrite criteria.
* @param indexTables
* @return
* @throws SemanticException
*/
Map<Index, Set<String>> getIndexToKeysMap(List<Index> indexTables) throws SemanticException{
Index index = null;
Hive hiveInstance = hiveDb;
Map<Index, Set<String>> indexToKeysMap = new LinkedHashMap<Index, Set<String>>();
for (int idxCtr = 0; idxCtr < indexTables.size(); idxCtr++) {
final Set<String> indexKeyNames = new LinkedHashSet<String>();
index = indexTables.get(idxCtr);
//Getting index key columns
StorageDescriptor sd = index.getSd();
List<FieldSchema> idxColList = sd.getCols();
for (FieldSchema fieldSchema : idxColList) {
indexKeyNames.add(fieldSchema.getName());
}
assert indexKeyNames.size()==1;
// Check that the index schema is as expected. This code block should
// catch problems of this rewrite breaking when the AggregateIndexHandler
// index is changed.
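// Illustrative expected layout for an aggregate index on column "key" (assumed from the
// checks below): key, _bucketname, _offsets, plus the pre-aggregated _count_of_key column.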
List<String> idxTblColNames = new ArrayList<String>();
try {
Table idxTbl = hiveInstance.getTable(index.getDbName(),
index.getIndexTableName());
for (FieldSchema idxTblCol : idxTbl.getCols()) {
idxTblColNames.add(idxTblCol.getName());
}
} catch (HiveException e) {
LOG.error("Got exception while locating index table, " +
"skipping " + getName() + " optimization");
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
assert(idxTblColNames.contains(IDX_BUCKET_COL));
assert(idxTblColNames.contains(IDX_OFFSETS_ARRAY_COL));
// we add all index tables which can be used for rewrite
// and defer the decision of using a particular index for later
// this is to allow choosing an index if a better mechanism is
// designed later to choose a better rewrite
indexToKeysMap.put(index, indexKeyNames);
}
return indexToKeysMap;
}
/**
* Method to rewrite the input query if all optimization criteria is passed.
* The method iterates over the tsOpToProcess {@link Map} to apply the rewrites
* @throws SemanticException
*
*/
@SuppressWarnings("unchecked")
private void rewriteOriginalQuery() throws SemanticException {
Map<String, Operator<? extends OperatorDesc>> topOpMap =
(HashMap<String, Operator<? extends OperatorDesc>>) parseContext.getTopOps().clone();
Iterator<String> tsOpItr = tsOpToProcess.keySet().iterator();
while(tsOpItr.hasNext()){
baseTableName = tsOpItr.next();
RewriteCanApplyCtx canApplyCtx = tsOpToProcess.get(baseTableName);
TableScanOperator topOp = (TableScanOperator) topOpMap.get(baseTableName);
RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx =
RewriteQueryUsingAggregateIndexCtx.getInstance(parseContext, hiveDb,
indexTableName, baseTableName, canApplyCtx.getAggFunction());
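// Rewrites the operator tree rooted at this TableScanOperator to scan the aggregate
// index table instead of the base table, replacing the original count aggregation with
// a sum over the pre-aggregated index column (see the class javadoc example).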
rewriteQueryCtx.invokeRewriteQueryProc(topOp);
parseContext = rewriteQueryCtx.getParseContext();
parseContext.setOpParseCtx((LinkedHashMap<Operator<? extends OperatorDesc>,
OpParseContext>) rewriteQueryCtx.getOpc());
}
LOG.info("Finished Rewriting query");
}
/**
* This method checks the rewrite criteria and logs the reason if the rewrite
* optimization cannot be applied.
* @return true if all rewrite criteria are met; false otherwise
*/
boolean checkIfAllRewriteCriteriaIsMet(RewriteCanApplyCtx canApplyCtx){
if (canApplyCtx.getAggFuncCnt() > 1){
LOG.debug("More than 1 agg funcs: " +
"Not supported by " + getName() + " optimization.");
return false;
}
if (canApplyCtx.isAggFuncIsNotCount()){
LOG.debug("Agg func other than count is " +
"not supported by " + getName() + " optimization.");
return false;
}
if (canApplyCtx.isCountOnAllCols()){
LOG.debug("Currently count function needs group by on key columns. This is a count(*) case.,"
+ "Cannot apply this " + getName() + " optimization.");
return false;
}
if (canApplyCtx.isCountOfOne()){
LOG.debug("Currently count function needs group by on key columns. This is a count(1) case.,"
+ "Cannot apply this " + getName() + " optimization.");
return false;
}
if (canApplyCtx.isAggFuncColsFetchException()){
LOG.debug("Got exception while locating child col refs " +
"of agg func, skipping " + getName() + " optimization.");
return false;
}
if (canApplyCtx.isWhrClauseColsFetchException()){
LOG.debug("Got exception while locating child col refs for where clause, "
+ "skipping " + getName() + " optimization.");
return false;
}
if (canApplyCtx.isSelClauseColsFetchException()){
LOG.debug("Got exception while locating child col refs for select list, "
+ "skipping " + getName() + " optimization.");
return false;
}
if (canApplyCtx.isGbyKeysFetchException()){
LOG.debug("Got exception while locating child col refs for GroupBy key, "
+ "skipping " + getName() + " optimization.");
return false;
}
return true;
}
}