/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.index;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.AggregateIndexHandler;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.IndexUtils;
import org.apache.hadoop.hive.ql.optimizer.Transform;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;


/**
 * RewriteGBUsingIndex is implemented as one of the rule-based optimizations.
 * It implements the GROUP BY clause rewrite using an aggregate index.
 * This optimization rewrites a GROUP BY query over a base table into a query over a simple
 * table scan of the index table, if there is an index on the group-by key(s) or the
 * distinct column(s).
 * E.g.
 * <pre>
 *   select count(key)
 *   from table
 *   group by key;
 * </pre>
 * to
 * <pre>
 *   select sum(_count_of_key)
 *   from idx_table
 *   group by key;
 * </pre>
 *
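 *  Like other rule-based transforms, this class is intended to be driven by the Hive
 *  optimizer, which hands each registered {@link Transform} the current
 *  {@link ParseContext}. A minimal usage sketch (illustrative wiring, assumed here
 *  rather than taken from this file):
 *  <pre>
 *   Transform rewrite = new RewriteGBUsingIndex();
 *   ParseContext optimized = rewrite.transform(pctx); // pctx produced by semantic analysis
 *  </pre>
 *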
 *  The rewrite supports the following queries:
 *  <ul>
 *  <li> Queries having only those col refs that are in the index key.
 *  <li> Queries that have index key col refs
 *  <ul>
 *  <li> in SELECT
 *  <li> in WHERE
 *  <li> in GROUP BY
 *  </ul>
 *  <li> Queries with agg func COUNT(index key col ref) in SELECT
 *  <li> Queries with SELECT DISTINCT index_key_col_refs
 *  <li> Queries having a subquery satisfying above condition (only the subquery is rewritten)
 *  </ul>
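 *
 *  For the rewrite to fire, the base table needs an aggregate index built by
 *  {@link AggregateIndexHandler}. A sketch of such DDL, patterned on the
 *  ql_rewrite_gbtoidx.q test (table, index, and column names are illustrative):
 *  <pre>
 *   CREATE INDEX idx_key ON TABLE base_table (key)
 *   AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler'
 *   WITH DEFERRED REBUILD
 *   IDXPROPERTIES ("AGGREGATES"="count(key)");
 *   ALTER INDEX idx_key ON base_table REBUILD;
 *  </pre>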
 *
 * @see AggregateIndexHandler
 * @see IndexUtils
 * @see RewriteCanApplyCtx
 * @see RewriteCanApplyProcFactory
 * @see RewriteParseContextGenerator
 * @see RewriteQueryUsingAggregateIndexCtx
 * @see RewriteQueryUsingAggregateIndex
 *
 * For test cases, @see ql_rewrite_gbtoidx.q
 */
public class RewriteGBUsingIndex implements Transform {
  private ParseContext parseContext;
  private Hive hiveDb;
  private HiveConf hiveConf;
  private static final Log LOG = LogFactory.getLog(RewriteGBUsingIndex.class.getName());

  /*
   * Stores the list of top TableScanOperator names for which the rewrite
   * can be applied and the action that needs to be performed for operator tree
   * starting from this TableScanOperator
   */
  private final Map<String, RewriteCanApplyCtx> tsOpToProcess =
      new LinkedHashMap<String, RewriteCanApplyCtx>();

  //Index Validation Variables
  private static final String IDX_BUCKET_COL = "_bucketname";
  private static final String IDX_OFFSETS_ARRAY_COL = "_offsets";

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    parseContext = pctx;
    hiveConf = parseContext.getConf();
    try {
      hiveDb = Hive.get(hiveConf);
    } catch (HiveException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }

    // Don't try to index optimize the query to build the index
    HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTINDEXFILTER, false);

    /* Check if the input query passes all the tests to be eligible for a rewrite.
     * If yes, rewrite the original query; else, return the current parseContext.
     */
    if (shouldApplyOptimization()) {
      LOG.info("Rewriting Original Query using " + getName() + " optimization.");
      rewriteOriginalQuery();
    }
    return parseContext;
  }

  private String getName() {
    return "RewriteGBUsingIndex";
  }

  /**
   * We traverse the current operator tree to check for conditions in which the
   * optimization cannot be applied.
   *
   * At the end, we check if all conditions have passed for rewrite. If yes, we
   * determine if the index is usable for rewrite. Else, we log the condition which
   * did not meet the rewrite criterion.
   *
   * @return true if the rewrite can be applied to at least one operator tree
   * @throws SemanticException
   */
  boolean shouldApplyOptimization() throws SemanticException {
    Map<Table, List<Index>> tableToIndex = getIndexesForRewrite();
    if (tableToIndex.isEmpty()) {
      LOG.debug("No Valid Index Found to apply Rewrite, " +
          "skipping " + getName() + " optimization");
      return false;
    }
    /*
     * This code iterates over each TableScanOperator from the topOps map from ParseContext.
     * For each operator tree originating from this top TableScanOperator, we determine
     * if the optimization can be applied. If yes, we add the name of the top table to
     * the tsOpToProcess to apply rewrite later on.
     */
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry :
        parseContext.getTopOps().entrySet()) {
      String alias = entry.getKey();
      TableScanOperator topOp = (TableScanOperator) entry.getValue();
      Table table = topOp.getConf().getTableMetadata();
      List<Index> indexes = tableToIndex.get(table);
      if (indexes == null || indexes.isEmpty()) {
        // no supported aggregate index on this table
        continue;
      }
      if (table.isPartitioned()) {
        //if the base table has partitions, we need to check if the index is built for
        //all partitions. If not, then we do not apply the optimization
        if (!checkIfIndexBuiltOnAllTablePartitions(topOp, indexes)) {
          LOG.debug("Index is not built for all table partitions, " +
              "skipping " + getName() + " optimization");
          continue;
        }
      }
      //check if rewrite can be applied for operator tree
      //if there are no partitions on base table
      checkIfRewriteCanBeApplied(alias, topOp, table, indexes);
    }
    return !tsOpToProcess.isEmpty();
  }

  /**
   * This method checks if rewrite can be applied using the index and also
   * verifies all conditions of the operator tree.
   *
   * @param alias - alias of the base table in the query
   * @param topOp - TableScanOperator for a single operator tree branch
   * @param baseTable - metadata of the base table
   * @param indexes - list of indexes on the base table
   * @return - true if rewrite can be applied on the current branch; false otherwise
   * @throws SemanticException
   */
  private boolean checkIfRewriteCanBeApplied(String alias, TableScanOperator topOp,
      Table baseTable, List<Index> indexes) throws SemanticException {
    //Context for checking if this optimization can be applied to the input query
    RewriteCanApplyCtx canApplyCtx = RewriteCanApplyCtx.getInstance(parseContext);
    canApplyCtx.setAlias(alias);
    canApplyCtx.setBaseTableName(baseTable.getTableName());
    canApplyCtx.populateRewriteVars(topOp);

    Map<Index, String> indexTableMap = getIndexToKeysMap(indexes);
    for (Map.Entry<Index, String> entry : indexTableMap.entrySet()) {
      //we rewrite the original query using the first valid index encountered
      //this can be changed if we have a better mechanism to
      //decide which index will produce a better rewrite
      Index index = entry.getKey();
      String indexKeyName = entry.getValue();
      //break here if any valid index is found to apply rewrite
      if (canApplyCtx.getIndexKey() != null && canApplyCtx.getIndexKey().equals(indexKeyName)
          && checkIfAllRewriteCriteriaIsMet(canApplyCtx)) {
        canApplyCtx.setAggFunction("_count_of_" + indexKeyName);
        canApplyCtx.addTable(canApplyCtx.getBaseTableName(), index.getIndexTableName());
        canApplyCtx.setIndexTableName(index.getIndexTableName());
        tsOpToProcess.put(alias, canApplyCtx);
        return true;
      }
    }
    return false;
  }

  /**
   * Get a list of indexes which can be used for rewrite.
   * @return map of each base table to the list of aggregate indexes on it
   * @throws SemanticException
   */
  private Map<Table, List<Index>> getIndexesForRewrite() throws SemanticException {
    List<String> supportedIndexes = new ArrayList<String>();
    supportedIndexes.add(AggregateIndexHandler.class.getName());

    // query the metastore to know what columns we have indexed
    Collection<Operator<? extends OperatorDesc>> topTables = parseContext.getTopOps().values();
    Map<Table, List<Index>> indexes = new HashMap<Table, List<Index>>();
    for (Operator<? extends OperatorDesc> op : topTables) {
      if (op instanceof TableScanOperator) {
        TableScanOperator tsOP = (TableScanOperator) op;
        List<Index> tblIndexes = IndexUtils.getIndexes(tsOP.getConf().getTableMetadata(),
            supportedIndexes);
        if (tblIndexes.size() > 0) {
          indexes.put(tsOP.getConf().getTableMetadata(), tblIndexes);
        }
      }
    }
    return indexes;
  }

  /**
   * This method checks if the index is built on all partitions of the base
   * table. If not, then the method returns false as we do not apply the optimization
   * for this case.
   * @param tableScan
   * @param indexes
   * @return true if the index covers all partitions touched by the query
   * @throws SemanticException
   */
  private boolean checkIfIndexBuiltOnAllTablePartitions(TableScanOperator tableScan,
      List<Index> indexes) throws SemanticException {
    // check if we have indexes on all partitions in this table scan
    Set<Partition> queryPartitions;
    try {
      queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(tableScan, parseContext,
          indexes);
      if (queryPartitions == null) { // partitions not covered
        return false;
      }
    } catch (HiveException e) {
      LOG.error("Fatal Error: problem accessing metastore", e);
      throw new SemanticException(e);
    }
    return queryPartitions.size() != 0;
  }

  /**
   * This method iterates over indexes on the table and populates the indexToKeys map
   * for all the indexes that satisfy the rewrite criteria.
   * @param indexTables
   * @return map of each usable index to its key column name
   * @throws SemanticException
   */
  Map<Index, String> getIndexToKeysMap(List<Index> indexTables) throws SemanticException {
    Hive hiveInstance = hiveDb;
    Map<Index, String> indexToKeysMap = new LinkedHashMap<Index, String>();
    for (int idxCtr = 0; idxCtr < indexTables.size(); idxCtr++) {
      Index index = indexTables.get(idxCtr);

      //Getting index key columns
      StorageDescriptor sd = index.getSd();
      List<FieldSchema> idxColList = sd.getCols();
      assert idxColList.size() == 1;
      String indexKeyName = idxColList.get(0).getName();

      // Check that the index schema is as expected. This code block should
      // catch problems of this rewrite breaking when the AggregateIndexHandler
      // index is changed.
      List<String> idxTblColNames = new ArrayList<String>();
      try {
        String[] qualified = Utilities.getDbTableName(index.getDbName(),
            index.getIndexTableName());
        Table idxTbl = hiveInstance.getTable(qualified[0], qualified[1]);
        for (FieldSchema idxTblCol : idxTbl.getCols()) {
          idxTblColNames.add(idxTblCol.getName());
        }
      } catch (HiveException e) {
        LOG.error("Got exception while locating index table, " +
            "skipping " + getName() + " optimization");
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
      }
      assert(idxTblColNames.contains(IDX_BUCKET_COL));
      assert(idxTblColNames.contains(IDX_OFFSETS_ARRAY_COL));

      // we add all index tables which can be used for rewrite
      // and defer the decision of using a particular index for later;
      // this is to allow choosing an index if a better mechanism is
      // designed later to choose a better rewrite
      indexToKeysMap.put(index, indexKeyName);
    }
    return indexToKeysMap;
  }

  /**
   * Method to rewrite the input query if all optimization criteria are met.
   * The method iterates over the tsOpToProcess map to apply the rewrites.
   * @throws SemanticException
   */
  private void rewriteOriginalQuery() throws SemanticException {
    for (RewriteCanApplyCtx canApplyCtx : tsOpToProcess.values()) {
      RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx =
          RewriteQueryUsingAggregateIndexCtx.getInstance(parseContext, hiveDb, canApplyCtx);
      rewriteQueryCtx.invokeRewriteQueryProc();
      parseContext = rewriteQueryCtx.getParseContext();
    }
    LOG.info("Finished Rewriting query");
  }

  /**
   * This method checks whether all criteria for the rewrite are met and logs
   * the reason when the optimization cannot be applied.
   * @return true if all rewrite criteria are met; false otherwise
   */
  boolean checkIfAllRewriteCriteriaIsMet(RewriteCanApplyCtx canApplyCtx) {
    if (canApplyCtx.isSelClauseColsFetchException()) {
      LOG.debug("Got exception while locating child col refs for select list, " +
          "skipping " + getName() + " optimization.");
      return false;
    }
    if (canApplyCtx.isAggFuncIsNotCount()) {
      LOG.debug("Agg func other than count is " +
          "not supported by " + getName() + " optimization.");
      return false;
    }
    if (canApplyCtx.isAggParameterException()) {
      LOG.debug("Got exception while locating parameter refs for aggregation, " +
          "skipping " + getName() + " optimization.");
      return false;
    }
    return true;
  }
}
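
/*
 * Schema note (inferred from the validation constants and the agg function used
 * above; an assumption, not an authoritative spec): an aggregate index table
 * built on key column `key` is expected to expose at least the columns
 *
 *   key, _bucketname, _offsets, _count_of_key
 *
 * which is why the rewrite can replace count(key) over the base table with
 * sum(_count_of_key) over the index table.
 */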