/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

import static org.apache.hadoop.hive.ql.optimizer.FieldNode.mergeFieldNodes;

/**
 * This class implements the processor context for Column Pruner.
 */
public class ColumnPrunerProcCtx implements NodeProcessorCtx {
  private final ParseContext pctx;

  /**
   * A mapping from operators to nested column paths being used in them.
   * Note: paths use the format "s.a.b", meaning that field "b" of
   *   struct "a" is being used, where "a" itself is a field of struct "s".
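   * For example, a query that reads only field "b" of struct "a" inside
   *   column "s" would record the single path "s.a.b" for that operator.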
   */
  private final Map<Operator<? extends OperatorDesc>, List<FieldNode>> prunedColLists;
  private final Map<CommonJoinOperator, Map<Byte, List<FieldNode>>> joinPrunedColLists;

  public ColumnPrunerProcCtx(ParseContext pctx) {
    this.pctx = pctx;
    prunedColLists = new HashMap<>();
    joinPrunedColLists = new HashMap<>();
  }

  public ParseContext getParseContext() {
    return pctx;
  }

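  /**
   * Returns, for each join operator, the pruned column lists keyed by the
   * join input tag (the position of the parent operator feeding that input).
   */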
  public Map<CommonJoinOperator, Map<Byte, List<FieldNode>>> getJoinPrunedColLists() {
    return joinPrunedColLists;
  }

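  /**
   * Returns the pruned column list recorded for the given operator, or null
   * if no list has been computed for it yet.
   */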
  public List<FieldNode> getPrunedColList(Operator<? extends OperatorDesc> op) {
    return prunedColLists.get(op);
  }

  public Map<Operator<? extends OperatorDesc>, List<FieldNode>> getPrunedColLists() {
    return prunedColLists;
  }

  /**
   * Creates the list of internal column names (represented by field nodes,
   * these names are used in the RowResolver and are different from the
   * external column names) that are needed in the subtree. These columns
   * eventually have to be selected from the table scan.
   *
   * @param curOp The root of the operator subtree.
   * @return a list of field nodes representing the internal column names.
   */
  public List<FieldNode> genColLists(Operator<? extends OperatorDesc> curOp)
      throws SemanticException {
    if (curOp.getChildOperators() == null) {
      return null;
    }
    List<FieldNode> colList = null;
    for (Operator<? extends OperatorDesc> child : curOp.getChildOperators()) {
      List<FieldNode> prunList = null;
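      // For a join child, pruned columns are tracked per input branch; the tag
      // is this operator's position among the join's parents.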
      if (child instanceof CommonJoinOperator) {
        int tag = child.getParentOperators().indexOf(curOp);
        prunList = joinPrunedColLists.get(child).get((byte) tag);
      } else if (child instanceof FileSinkOperator) {
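        // A FileSink writes out every column, so all columns in the current
        // operator's schema are considered needed.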
        prunList = new ArrayList<>();
        RowSchema oldRS = curOp.getSchema();
        for (ColumnInfo colInfo : oldRS.getSignature()) {
          prunList.add(new FieldNode(colInfo.getInternalName()));
        }
      } else {
        prunList = prunedColLists.get(child);
      }
      if (prunList == null) {
        continue;
      }
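      // Union the requirements of all children, merging duplicate and
      // overlapping nested paths.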
      if (colList == null) {
        colList = new ArrayList<>(prunList);
      } else {
        colList = mergeFieldNodes(colList, prunList);
      }
    }
    return colList;
  }

  /**
   * Creates the list of internal column names (represented by field nodes,
   * these names are used in the RowResolver and are different from the
   * external column names) that are needed in the subtree. These columns
   * eventually have to be selected from the table scan.
   *
   * @param curOp The root of the operator subtree.
   * @param child The consumer.
   * @return a list of field nodes representing the internal column names.
   */
  public List<FieldNode> genColLists(Operator<? extends OperatorDesc> curOp,
          Operator<? extends OperatorDesc> child)
      throws SemanticException {
    if (curOp.getChildOperators() == null) {
      return null;
    }
    if (child instanceof CommonJoinOperator) {
      int tag = child.getParentOperators().indexOf(curOp);
      return joinPrunedColLists.get(child).get((byte) tag);
    } else {
      return prunedColLists.get(child);
    }
  }

  /**
   * Creates the list of internal column names (represented by field nodes)
   * from select expressions in a select operator. This function is used for the
   * select operator instead of the genColLists function (which is used by
   * the rest of the operators).
   *
   * @param op The select operator.
   * @return a list of field nodes representing the internal column names.
   */
  public List<FieldNode> getColsFromSelectExpr(SelectOperator op) {
    List<FieldNode> cols = new ArrayList<>();
    SelectDesc conf = op.getConf();
    if (conf.isSelStarNoCompute()) {
      for (ColumnInfo colInfo : op.getSchema().getSignature()) {
        cols.add(new FieldNode(colInfo.getInternalName()));
      }
    } else {
      List<ExprNodeDesc> exprList = conf.getColList();
      for (ExprNodeDesc expr : exprList) {
        cols = mergeFieldNodesWithDesc(cols, expr);
      }
    }
    return cols;
  }

  /**
   * Creates the list of internal column names for select * expressions.
   *
   * @param op The select operator.
   * @param colList The list of internal column names (represented by field nodes)
   *                returned by the children of the select operator.
   * @return a list of field nodes representing the internal column names.
   */
  public List<FieldNode> getSelectColsFromChildren(SelectOperator op,
      List<FieldNode> colList) {
    List<FieldNode> cols = new ArrayList<>();
    SelectDesc conf = op.getConf();

    if (colList != null && conf.isSelStarNoCompute()) {
      cols.addAll(colList);
      return cols;
    }

    List<ExprNodeDesc> selectExprs = conf.getColList();

    // The colList holds the output columns used by the child operators; these
    // are different from the input columns of the current operator. We need
    // to find out which input columns are used.
    List<String> outputColumnNames = conf.getOutputColumnNames();
    for (int i = 0; i < outputColumnNames.size(); i++) {
      if (colList == null) {
        cols = mergeFieldNodesWithDesc(cols, selectExprs.get(i));
      } else {
        FieldNode childFn = lookupColumn(colList, outputColumnNames.get(i));
        if (childFn != null) {
          // In SemanticAnalyzer we inject SEL op before aggregation. The columns
          // in this SEL are derived from the table schema, and do not reflect the
          // actual columns being selected in the current query.
          // In this case, we skip the merge and just use the path from the child ops.
          ExprNodeDesc desc = selectExprs.get(i);
          if (desc instanceof ExprNodeColumnDesc && ((ExprNodeColumnDesc) desc).getIsGenerated()) {
            FieldNode fn = new FieldNode(((ExprNodeColumnDesc) desc).getColumn());
            fn.setNodes(childFn.getNodes());
            cols = mergeFieldNodes(cols, fn);
          } else {
            cols = mergeFieldNodesWithDesc(cols, selectExprs.get(i));
          }
        }
      }
    }

    return cols;
  }

  /**
   * Given the 'desc', construct a list of field nodes representing the
   * nested columns paths referenced by this 'desc'.
   * @param desc the node descriptor
   * @return a list of nested column paths referenced in the 'desc'
   */
  private static List<FieldNode> getNestedColPathByDesc(ExprNodeDesc desc) {
    List<FieldNode> res = new ArrayList<>();
    getNestedColsFromExprNodeDesc(desc, null, res);
    return mergeFieldNodes(new ArrayList<FieldNode>(), res);
  }

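  /**
   * Recursively collects in 'paths' the nested column paths referenced by 'desc'.
   * 'pathToRoot' accumulates the field accesses applied on top of 'desc', so that
   * when a column reference is reached its full nested path can be recorded.
   */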
  private static void getNestedColsFromExprNodeDesc(
    ExprNodeDesc desc,
    FieldNode pathToRoot,
    List<FieldNode> paths) {
    if (desc instanceof ExprNodeColumnDesc) {
      ExprNodeColumnDesc columnDesc = (ExprNodeColumnDesc) desc;
      FieldNode p = new FieldNode(columnDesc.getColumn());
      checkListAndMap(columnDesc, pathToRoot, p);
      paths.add(p);
    } else if (desc instanceof ExprNodeFieldDesc) {
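      // A struct field access (e.g. s.a): record the field name, then descend
      // into the expression that produces the struct being accessed.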
      ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) desc;
      ExprNodeDesc childDesc = fieldDesc.getDesc();
      FieldNode p = new FieldNode(fieldDesc.getFieldName());
      checkListAndMap(fieldDesc, pathToRoot, p);
      getNestedColsFromExprNodeDesc(childDesc, p, paths);
    } else {
      List<ExprNodeDesc> children = desc.getChildren();
      if (children != null) {
        for (ExprNodeDesc c : children) {
          getNestedColsFromExprNodeDesc(c, pathToRoot, paths);
        }
      }
    }
  }

  private static void checkListAndMap(ExprNodeDesc desc, FieldNode pathToRoot, FieldNode fn) {
    TypeInfo ti = desc.getTypeInfo();

    // Check cases for arr[i].f and map[key].v
    // For these we should not generate paths like arr.f or map.v
    // Otherwise we would have a mismatch between type info and path
    if (ti.getCategory() != ObjectInspector.Category.LIST
        && ti.getCategory() != ObjectInspector.Category.MAP) {
      fn.addFieldNodes(pathToRoot);
    }
  }

  /**
   * Creates the list of internal column names for the select tag of a
   * lateral view join.
   */
  public List<FieldNode> getSelectColsFromLVJoin(RowSchema rs,
      List<FieldNode> colList) throws SemanticException {
    List<FieldNode> columns = new ArrayList<>();
    for (FieldNode col : colList) {
      if (rs.getColumnInfo(col.getFieldName()) != null) {
        columns.add(col);
      }
    }
    return columns;
  }

  /**
 * If the input filter operator has direct children that are union operators,
 * and the filter's columns are not the same as the union's, create a select
 * operator between them. The select operator has the same number of columns
 * as the pruned child operator.
   *
   * @param curOp
   *          The filter operator which need to handle children.
   * @throws SemanticException
   */
  public void handleFilterUnionChildren(Operator<? extends OperatorDesc> curOp)
      throws SemanticException {
    if (curOp.getChildOperators() == null || !(curOp instanceof FilterOperator)) {
      return;
    }
    List<FieldNode> parentPrunList = prunedColLists.get(curOp);
    if (parentPrunList == null || parentPrunList.size() == 0) {
      return;
    }
    List<FieldNode> prunList = null;

    for (Operator<? extends OperatorDesc> child : curOp.getChildOperators()) {
      if (child instanceof UnionOperator) {
        prunList = genColLists(child);
        if (prunList == null || prunList.size() == 0 || parentPrunList.size() == prunList.size()) {
          continue;
        }

        ArrayList<ExprNodeDesc> exprs = new ArrayList<>();
        ArrayList<String> outputColNames = new ArrayList<>();
        Map<String, ExprNodeDesc> colExprMap = new HashMap<>();
        ArrayList<ColumnInfo> outputRS = new ArrayList<>();
        for (ColumnInfo colInfo : child.getSchema().getSignature()) {
          if (lookupColumn(prunList, colInfo.getInternalName()) == null) {
            continue;
          }
          ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(),
              colInfo.getInternalName(), colInfo.getTabAlias(), colInfo.getIsVirtualCol());
          exprs.add(colDesc);
          outputColNames.add(colInfo.getInternalName());
          ColumnInfo newCol = new ColumnInfo(colInfo.getInternalName(), colInfo.getType(),
                  colInfo.getTabAlias(), colInfo.getIsVirtualCol(), colInfo.isHiddenVirtualCol());
          newCol.setAlias(colInfo.getAlias());
          outputRS.add(newCol);
          colExprMap.put(colInfo.getInternalName(), colDesc);
        }
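        // Splice in a SELECT that projects only the pruned columns between the
        // filter and the union, so the union sees the reduced schema.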
        SelectDesc select = new SelectDesc(exprs, outputColNames, false);
        curOp.removeChild(child);
        SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(
            select, new RowSchema(outputRS), curOp);
        OperatorFactory.makeChild(sel, child);
        sel.setColumnExprMap(colExprMap);
      }
    }
  }

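  /**
   * Extracts the top-level column names from a list of field nodes, dropping
   * any nested path information.
   */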
  static ArrayList<String> toColumnNames(List<FieldNode> columns) {
    ArrayList<String> names = new ArrayList<>();
    for (FieldNode fn : columns) {
      names.add(fn.getFieldName());
    }
    return names;
  }

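  /**
   * Wraps plain column names into field nodes that carry no nested path
   * information.
   */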
  static List<FieldNode> fromColumnNames(List<String> columnNames) {
    List<FieldNode> fieldNodes = new ArrayList<>();
    for (String cn : columnNames) {
      fieldNodes.add(new FieldNode(cn));
    }
    return fieldNodes;
  }

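  /**
   * Returns the field node in 'columns' whose column name equals 'colName',
   * or null if there is none.
   */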
  static FieldNode lookupColumn(Collection<FieldNode> columns, String colName) {
    for (FieldNode fn : columns) {
      if (fn.getFieldName() != null && fn.getFieldName().equals(colName)) {
        return fn;
      }
    }
    return null;
  }

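  /**
   * Merges into 'left' the nested column paths referenced by 'desc'.
   */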
  static List<FieldNode> mergeFieldNodesWithDesc(List<FieldNode> left, ExprNodeDesc desc) {
    return FieldNode.mergeFieldNodes(left, getNestedColPathByDesc(desc));
  }
}



