All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPruner Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.listbucketingpruner;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.optimizer.PrunerUtils;
import org.apache.hadoop.hive.ql.optimizer.Transform;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;

/**
 * The transformation step that does list bucketing pruning.
 *
 */
public class ListBucketingPruner implements Transform {
  static final Log LOG = LogFactory.getLog(ListBucketingPruner.class.getName());

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop.hive.ql.parse.
   * ParseContext)
   */
  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    // create a the context for walking operators
    NodeProcessorCtx opPartWalkerCtx = new LBOpPartitionWalkerCtx(pctx);

    // Retrieve all partitions generated from partition pruner and partition column pruner
    PrunerUtils.walkOperatorTree(pctx, opPartWalkerCtx, LBPartitionProcFactory.getFilterProc(),
        LBPartitionProcFactory.getDefaultProc());

    PrunedPartitionList partsList = ((LBOpPartitionWalkerCtx) opPartWalkerCtx).getPartitions();
    if (partsList != null) {
      Set parts = partsList.getPartitions();
      if ((parts != null) && (parts.size() > 0)) {
        for (Partition part : parts) {
          // only process partition which is skewed and list bucketed
          if (ListBucketingPrunerUtils.isListBucketingPart(part)) {
            // create a the context for walking operators
            NodeProcessorCtx opWalkerCtx = new LBOpWalkerCtx(pctx.getOpToPartToSkewedPruner(),
                part);

            // walk operator tree to create expression tree for list bucketing
            PrunerUtils.walkOperatorTree(pctx, opWalkerCtx, LBProcFactory.getFilterProc(),
                LBProcFactory.getDefaultProc());
          }
        }
      }
    }

    return pctx;
  }

  /**
   * Prunes to the directories which match the skewed keys in where clause.
   *
   *
   * Algorithm
   *
   * =========
   *
   * For each possible skewed element combination:
   * 1. walk through ExprNode tree
   * 2. decide Boolean (True/False/unknown(null))
   *
   * Go through each skewed element combination again:
   * 1. if it is skewed value, skip the directory only if it is false, otherwise keep it
   * 2. skip the default directory only if all skewed elements,non-skewed value, are false.
   *
   * Example
   * =======
   * For example:
   * 1. skewed column (list): C1, C2
   * 2. skewed value (list of list): (1,a), (2,b), (1,c)
   *
   * Unique skewed elements for each skewed column (list of list):
   * (1,2,other), (a,b,c,other)
   *
   * Index: (0,1,2) (0,1,2,3)
   * Output matches order of skewed column. Output can be read as:
   *
   * C1 has unique element list (1,2,other)
   * C2 has unique element list (a,b,c,other)
   *
   * C1\C2 | a | b | c |Other
   * 1 | (1,a) | X | (1,c) |X
   * 2 | X |(2,b) | X |X
   * other | X | X | X |X
   *
   * Complete dynamic-multi-dimension collection
   *
   * (0,0) (1,a) * -> T
   * (0,1) (1,b) -> T
   * (0,2) (1,c) *-> F
   * (0,3) (1,other)-> F
   * (1,0) (2,a)-> F
   * (1,1) (2,b) * -> T
   * (1,2) (2,c)-> F
   * (1,3) (2,other)-> F
   * (2,0) (other,a) -> T
   * (2,1) (other,b) -> T
   * (2,2) (other,c) -> T
   * (2,3) (other,other) -> T
   * * is skewed value entry
   *
   * Expression Tree : ((c1=1) and (c2=a)) or ( (c1=3) or (c2=b))
   *
   * or
   * / \
   * and or
   * / \ / \
   * c1=1 c2=a c1=3 c2=b
   *
   *
   * For each entry in dynamic-multi-dimension container
   *
   * 1. walk through the tree to decide value (please see map's value above)
   * 2. if it is skewed value
   * 2.1 remove the entry from the map
   * 2.2 add directory to path unless value is false
   * 3. otherwise, add value to map
   *
   * Once it is done, go through the rest entries in map to decide default directory
   * 1. we know all is not skewed value
   * 2. we skip default directory only if all value is false
   *
   * What we choose at the end?
   *
   * 1. directory for (1,a) because it 's skewed value and match returns true
   * 2. directory for (2,b) because it 's skewed value and match returns true
   * 3. default directory because not all non-skewed value returns false
   *
   * we skip directory for (1,c) since match returns false
   *
   * Note: unknown is marked in {@link #transform(ParseContext)} 
*
   * newcd = new ExprNodeConstantDesc(cd.getTypeInfo(), null)
   * 
* *
can be checked via
* *
   *     child_nd instanceof ExprNodeConstantDesc
   *               && ((ExprNodeConstantDesc) child_nd).getValue() == null)
   * 
* *
* * @param ctx * parse context * @param part * partition * @param pruner * expression node tree * @return */ public static Path[] prune(ParseContext ctx, Partition part, ExprNodeDesc pruner) { Path[] finalPaths = null; try { finalPaths = execute(ctx, part, pruner); } catch (SemanticException e) { // Use full partition path for error case. LOG.warn("Using full partition scan :" + Arrays.toString(part.getPath()) + ".", e); finalPaths = part.getPath(); } return finalPaths; } /** * Main skeleton for list bucketing pruning. * * @param ctx * @param part * @param pruner * @return * @throws SemanticException */ private static Path[] execute(ParseContext ctx, Partition part, ExprNodeDesc pruner) throws SemanticException { Path[] finalPaths; List selectedPaths = new ArrayList(); if (ListBucketingPrunerUtils.isUnknownState(pruner)) { // Use full partition path for error case. LOG.warn("List bucketing pruner is either null or in unknown state " + " so that it uses full partition scan :" + Arrays.toString(part.getPath())); finalPaths = part.getPath(); } else { // Retrieve skewed columns. List> sVals = part.getSkewedColValues(); assert ((sVals != null) && (sVals.size() > 0)) : part.getName() + " skewed metadata is corrupted. No skewed value information."; // Calculate collection. List> indexCollection = DynamicMultiDimensionalCollection .generateCollection(sVals); assert (indexCollection != null) : "Collection is null."; // Calculate unique skewed elements for each skewed column. List> uniqSkewValues = DynamicMultiDimensionalCollection.uniqueSkewedValueList( sVals); // Decide skewed value directory selection. List nonSkewedValueMatchResult = decideSkewedValueDirSelection(part, pruner, selectedPaths, indexCollection, uniqSkewValues); // Decide default directory selection. decideDefaultDirSelection(part, selectedPaths, nonSkewedValueMatchResult); // Finalize paths. finalPaths = generateFinalPath(part, selectedPaths); } return finalPaths; } /** * Walk through every entry in complete collection * 1. calculate if it matches expression tree * 2. decide if select skewed value directory * 3. store match result for non-skewed value for later handle on default directory * C1\C2 | a | b | c |Other * 1 | (1,a) | X | (1,c) |X * 2 | X |(2,b) | X |X * other | X | X | X |X * Final result * Complete dynamic-multi-dimension collection * (0,0) (1,a) * -> T * (0,1) (1,b) -> T * (0,2) (1,c) *-> F * (0,3) (1,other)-> F * (1,0) (2,a)-> F * (1,1) (2,b) * -> T * (1,2) (2,c)-> F * (1,3) (2,other)-> F * (2,0) (other,a) -> T * (2,1) (other,b) -> T * (2,2) (other,c) -> T * (2,3) (other,other) -> T * * * is skewed value entry * * 1. directory for (1,a) is chosen because it 's skewed value and match returns true * 2. directory for (2,b) is chosen because it 's skewed value and match returns true * * @param part * @param pruner * @param selectedPaths * @param collections * @param uniqSkewedValues * @return * @throws SemanticException */ private static List decideSkewedValueDirSelection(Partition part, ExprNodeDesc pruner, List selectedPaths, List> collections, List> uniqSkewedValues) throws SemanticException { // For each entry in dynamic-multi-dimension collection. List skewedCols = part.getSkewedColNames(); // Retrieve skewed column. Map, String> mappings = part.getSkewedColValueLocationMaps(); // Retrieve skewed // map. assert ListBucketingPrunerUtils.isListBucketingPart(part) : part.getName() + " skewed metadata is corrupted. No skewed column and/or location mappings information."; List> skewedValues = part.getSkewedColValues(); List nonSkewedValueMatchResult = new ArrayList(); for (List cell : collections) { // Walk through the tree to decide value. // Example: skewed column: C1, C2 ; // index: (1,a) ; // expression tree: ((c1=1) and (c2=a)) or ((c1=3) or (c2=b)) Boolean matchResult = ListBucketingPrunerUtils.evaluateExprOnCell(skewedCols, cell, pruner, uniqSkewedValues); // Handle skewed value. if (skewedValues.contains(cell)) { // if it is skewed value if ((matchResult == null) || matchResult) { // add directory to path unless value is false /* It's valid case if a partition: */ /* 1. is defined with skewed columns and skewed values in metadata */ /* 2. doesn't have all skewed values within its data */ if (mappings.get(cell) != null) { selectedPaths.add(new Path(mappings.get(cell))); } } } else { // Non-skewed value, add it to list for later handle on default directory. nonSkewedValueMatchResult.add(matchResult); } } return nonSkewedValueMatchResult; } /** * Decide whether should select the default directory. * * @param part * @param selectedPaths * @param nonSkewedValueMatchResult */ private static void decideDefaultDirSelection(Partition part, List selectedPaths, List nonSkewedValueMatchResult) { boolean skipDefDir = true; for (Boolean v : nonSkewedValueMatchResult) { if ((v == null) || v) { skipDefDir = false; // we skip default directory only if all value is false break; } } if (!skipDefDir) { StringBuilder builder = new StringBuilder(); builder.append(part.getLocation()); builder.append(Path.SEPARATOR); builder .append((FileUtils.makeDefaultListBucketingDirName( part.getSkewedColNames(), ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME))); selectedPaths.add(new Path(builder.toString())); } } /** * Decide the final path. * * @param part * @param selectedPaths * @return */ private static Path[] generateFinalPath(Partition part, List selectedPaths) { Path[] finalPaths; if (selectedPaths.size() == 0) { LOG.warn("Using full partition scan :" + Arrays.toString(part.getPath()) + "."); finalPaths = part.getPath(); } else { finalPaths = selectedPaths.toArray(new Path[0]); } return finalPaths; } /** * Note: this class is not designed to be used in general but for list bucketing pruner only. * The structure addresses the following requirements: * 1. multiple dimension collection * 2. length of each dimension is dynamic. It's decided at runtime. * The first user is list bucketing pruner and used in pruning phase: * 1. Each skewed column has a batch of skewed elements. * 2. One skewed column represents one dimension. * 3. Length of dimension is size of skewed elements. * 4. no. of skewed columns and length of dimension are dynamic and configured by user. * use case: * ======== * Use case #1: * Multiple dimension collection represents if to select a directory representing by the cell. * skewed column: C1, C2, C3 * skewed value: (1,a,x), (2,b,x), (1,c,x), (2,a,y) * Other: represent value for the column which is not part of skewed value. * C3 = x * C1\C2 | a | b | c |Other * 1 | Boolean(1,a,x) | X | Boolean(1,c,x) |X * 2 | X |Boolean(2,b,x) | X |X * other | X | X | X |X * C3 = y * C1\C2 | a | b | c |Other * 1 | X | X | X |X * 2 | Boolean(2,a,y) | X | X |X * other | X | X | X |X * Boolean is cell type which can be False/True/Null(Unknown). * (1,a,x) is just for information purpose to explain which skewed value it represents. * 1. value of Boolean(1,a,x) represents if we select the directory for list bucketing * 2. value of Boolean(2,b,x) represents if we select the directory for list bucketing * ... * 3. All the rest, marked as "X", will decide if to pickup the default directory. * 4. Not only "other" columns/rows but also the rest as long as it doesn't represent skewed * value. * For cell representing skewed value: * 1. False, skip the directory * 2. True/Unknown, select the directory * For cells representing default directory: * 1. only if all cells are false, skip the directory * 2. all other cases, select the directory * Use case #2: * Multiple dimension collection represents skewed elements so that walk through tree one by one. * Cell is a List representing the value mapping from index path and skewed value. * skewed column: C1, C2, C3 * skewed value: (1,a,x), (2,b,x), (1,c,x), (2,a,y) * Other: represent value for the column which is not part of skewed value. * C3 = x * C1\C2 | a | b | c |Other * 1 | (1,a,x) | X | (1,c,x) |X * 2 | X |(2,b,x) | X |X * other | X | X | X |X * C3 = y * C1\C2 | a | b | c |Other * 1 | X | X | X |X * 2 | (2,a,y) | X | X |X * other | X | X | X |X * Implementation: * ============== * please see another example in {@link ListBucketingPruner#prune} * We will use a HasMap to represent the Dynamic-Multiple-Dimension collection: * 1. Key is List representing the index path to the cell * 2. value represents the cell (Boolean for use case #1, List for case #2) * For example: * 1. skewed column (list): C1, C2, C3 * 2. skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y) * From skewed value, we calculate the unique skewed element for each skewed column: * C1: (1,2) * C2: (a,b,c) * C3: (x,y) * We store them in list of list. We don't need to store skewed column name since we use order to * match: * 1. Skewed column (list): C1, C2, C3 * 2. Unique skewed elements for each skewed column (list of list): * (1,2,other), (a,b,c,other), (x,y,other) * 3. index (0,1,2) (0,1,2,3) (0,1,2) * * We use the index,starting at 0. to construct hashmap representing dynamic-multi-dimension * collection: * key (what skewed value key represents) -> value (Boolean for use case #1, List for case * #2). * (0,0,0) (1,a,x) * (0,0,1) (1,a,y) * (0,1,0) (1,b,x) * (0,1,1) (1,b,y) * (0,2,0) (1,c,x) * (0,2,1) (1,c,y) * (1,0,0) (2,a,x) * (1,0,1) (2,a,y) * (1,1,0) (2,b,x) * (1,1,1) (2,b,y) * (1,2,0) (2,c,x) * (1,2,1) (2,c,y) * ... */ public static class DynamicMultiDimensionalCollection { /** * Find out complete skewed-element collection * For example: * 1. skewed column (list): C1, C2 * 2. skewed value (list of list): (1,a), (2,b), (1,c) * It returns the complete collection * (1,a) , (1,b) , (1,c) , (1,other), (2,a), (2,b) , (2,c), (2,other), (other,a), (other,b), * (other,c), (other,other) * @throws SemanticException */ public static List> generateCollection(List> values) throws SemanticException { // Calculate unique skewed elements for each skewed column. List> uniqSkewedElements = DynamicMultiDimensionalCollection.uniqueElementsList( values, ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_KEY); // Calculate complete dynamic-multi-dimension collection. return DynamicMultiDimensionalCollection.flat(uniqSkewedElements); } /** * Convert value to unique element list. This is specific for skew value use case: * For example: * 1. skewed column (list): C1, C2, C3 * 2. skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y) * Input: skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y) * Output: Unique skewed elements for each skewed column (list of list): * (1,2,other), (a,b,c,other), (x,y,other) * Output matches order of skewed column. Output can be read as: * C1 has unique element list (1,2,other) * C2 has unique element list (a,b,c,other) * C3 has unique element list (x,y,other) * Other represents any value which is not part skewed-value combination. * @param values * skewed value list * @return a list of unique element lists */ public static List> uniqueElementsList(List> values, String defaultDirName) { // Get unique skewed value list. List> result = uniqueSkewedValueList(values); // Add default dir at the end of each list for (List list : result) { list.add(defaultDirName); } return result; } /** * Convert value to unique skewed value list. It is used in * {@link ListBucketingPrunerUtils#evaluateExprOnCell} * * For example: * * 1. skewed column (list): C1, C2, C3 * 2. skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y) * * Input: skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y) * Output: Unique skewed value for each skewed column (list of list): * (1,2), (a,b,c), (x,y) * * Output matches order of skewed column. Output can be read as: * C1 has unique skewed value list (1,2,) * C2 has unique skewed value list (a,b,c) * C3 has unique skewed value list (x,y) * * @param values * skewed value list * @return a list of unique skewed value lists */ public static List> uniqueSkewedValueList(List> values) { if ((values == null) || (values.size() == 0)) { return null; } // skewed value has the same length. List> result = new ArrayList>(); for (int i = 0; i < values.get(0).size(); i++) { result.add(new ArrayList()); } // add unique element to list per occurrence order in skewed value. // occurrence order in skewed value doesn't matter. // as long as we add them to a list, order is preserved from now on. for (List value : values) { for (int i = 0; i < value.size(); i++) { if (!result.get(i).contains(value.get(i))) { result.get(i).add(value.get(i)); } } } return result; } /** * Flat a dynamic-multi-dimension collection. * * For example: * 1. skewed column (list): C1, C2, C3 * 2. skewed value (list of list): (1,a,x), (2,b,x), (1,c,x), (2,a,y) * * Unique skewed elements for each skewed column (list of list): * (1,2,other), (a,b,c,other) * Index: (0,1,2) (0,1,2,3) * * Complete dynamic-multi-dimension collection * (0,0) (1,a) * -> T * (0,1) (1,b) -> T * (0,2) (1,c) *-> F * (0,3) (1,other)-> F * (1,0) (2,a)-> F * (1,1) (2,b) * -> T * (1,2) (2,c)-> F * (1,3) (2,other)-> F * (2,0) (other,a) -> T * (2,1) (other,b) -> T * (2,2) (other,c) -> T * (2,3) (other,other) -> T * * is skewed value entry * * @param uniqSkewedElements * * @return */ public static List> flat(List> uniqSkewedElements) throws SemanticException { if (uniqSkewedElements == null) { return null; } List> collection = new ArrayList>(); walker(collection, uniqSkewedElements, new ArrayList(), 0); return collection; } /** * Flat the collection recursively. * * @param finalResult * @param input * @param listSoFar * @param level * @throws SemanticException */ private static void walker(List> finalResult, final List> input, List listSoFar, final int level) throws SemanticException { // Base case. if (level == (input.size() - 1)) { assert (input.get(level) != null) : "Unique skewed element list has null list in " + level + "th position."; for (String v : input.get(level)) { List oneCompleteIndex = new ArrayList(listSoFar); oneCompleteIndex.add(v); finalResult.add(oneCompleteIndex); } return; } // Recursive. for (String v : input.get(level)) { List clonedListSoFar = new ArrayList(listSoFar); clonedListSoFar.add(v); int nextLevel = level + 1; walker(finalResult, input, clonedListSoFar, nextLevel); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy