shark.execution.optimization.ColumnPruner.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of shark_2.10 Show documentation
shark
The newest version!
/*
 * Copyright (C) 2012 The Regents of The University California.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package shark.execution.optimization

import java.util.BitSet
import java.util.{List => JList}

import scala.collection.JavaConversions.{asScalaBuffer, collectionAsScalaIterable}
import scala.collection.mutable.{Set, HashSet}

import org.apache.hadoop.hive.ql.exec.GroupByPreShuffleOperator
import org.apache.hadoop.hive.ql.metadata.Table
import org.apache.hadoop.hive.ql.plan._

import shark.execution._


/**
 * Works out which columns of `tbl` are referenced by the operator tree rooted at `op`,
 * so that unreferenced columns can be skipped.
 *
 * The operator tree is walked once, while the instance is being constructed; the outcome
 * is exposed as [[columnsUsed]]. Both constructor arguments are marked `@transient`.
 *
 * @param op  root of the operator tree to inspect
 * @param tbl table whose column list defines the bit positions of [[columnsUsed]]
 */
class ColumnPruner(@transient op: TopOperator[_], @transient tbl: Table) extends Serializable {

  /**
   * Bit `i` is set when the `i`-th column returned by `tbl.getCols()` is referenced by the
   * query. Every bit in `[0, number of columns)` is set when the whole row is needed.
   */
  val columnsUsed: BitSet = {
    val colsToKeep = computeColumnsToKeep()
    val allColumns = tbl.getCols().map(x => x.getName())

    if (colsToKeep.contains(ColumnPruner.AllColumns)) {
      // The query needs every column (e.g. SELECT *), so mark them all as used.
      val b = new BitSet(allColumns.size)
      b.set(0, allColumns.size, true)
      b
    } else {
      // No need to prune partition columns - Hive does that for us.
      val b = new BitSet
      for ((name, i) <- allColumns.zipWithIndex if colsToKeep.contains(name)) {
        b.set(i, true)
      }
      b
    }
  }

  /**
   * Collects the names of all columns referenced by the operator tree rooted at `op`.
   *
   * @return the referenced column names; contains `"*"` when every column must be kept
   */
  private def computeColumnsToKeep(): Set[String] = {
    val cols = HashSet[String]()
    collectColumns(op, cols)
    cols
  }

  /** Converts a possibly-null Java list into a Scala sequence (empty when `s` is null). */
  private def nullGuard[T](s: JList[T]): Seq[T] = {
    if (s == null) Seq[T]() else s
  }

  /**
   * Returns the column names referenced by the given expressions. Null expressions and
   * expressions whose `getCols` is null contribute nothing.
   */
  private def referencedColumns(exprs: Iterable[ExprNodeDesc]): Iterable[String] = {
    exprs.filter(_ != null).flatMap(e => nullGuard(e.getCols))
  }

  /**
   * Adds to `cols` the column names referenced by `node` and, recursively, by its children.
   *
   * Once `cols` contains `"*"` the result is already decided (all columns are kept), so the
   * traversal stops descending at that point.
   *
   * @param node     operator currently being visited
   * @param cols     accumulator for referenced column names; mutated in place
   * @param parentOp operator from which `node` was reached, or null for the root
   */
  private def collectColumns(
      node: Operator[_],
      cols: HashSet[String],
      parentOp: Operator[_] = null): Unit = {

    node match {
      case selOp: SelectOperator =>
        // The select descriptor holds the list of expressions evaluated by the select op.
        val cnf: SelectDesc = selOp.getConf
        if (cnf != null) {
          if (cnf.isSelStarNoCompute) {
            // For star there is no point doing any further pruning: keep everything.
            cols.clear()
            cols += ColumnPruner.AllColumns
          } else {
            cols ++= referencedColumns(nullGuard(cnf.getColList))
          }
        }

      case filterOp: FilterOperator =>
        // The filter descriptor's predicate names the columns used in the filter condition.
        val cnf: FilterDesc = filterOp.getConf
        if (cnf != null && cnf.getPredicate != null) {
          cols ++= nullGuard(cnf.getPredicate.getCols)
        }

      case _: JoinOperator =>
        // The input to a regular join is described by the key and value columns of the
        // reduce sink operator that precedes it.
        parentOp match {
          case rsOp: ReduceSinkOperator =>
            val cnf: ReduceSinkDesc = rsOp.getConf
            if (cnf != null) {
              cols ++= referencedColumns(nullGuard(cnf.getKeyCols) ++ nullGuard(cnf.getValueCols))
            }
          case _ =>
            // No reduce sink parent (null or another operator type): the join inputs cannot
            // be determined here, so conservatively keep every column instead of failing.
            cols += ColumnPruner.AllColumns
        }

      case joinOp: MapJoinOperator =>
        val cnf: MapJoinDesc = joinOp.getConf
        if (cnf != null) {
          val keyEvals = cnf.getKeys.values
          val valEvals = cnf.getExprs.values
          val evals = HashSet() ++ keyEvals ++ valEvals
          cols ++= referencedColumns(evals.flatMap(x => nullGuard(x)))
        }

      case groupBy: GroupByPreShuffleOperator =>
        // Only the grouping keys are inspected here.
        val cnf: GroupByDesc = groupBy.getConf
        if (cnf != null) {
          cols ++= referencedColumns(nullGuard(cnf.getKeys))
        }

      case _ =>
    }

    // Recurse on the subtree, unless we already know that every column is needed.
    if (!cols.contains(ColumnPruner.AllColumns)) {
      val children = node.childOperators
      node match {
        case _: TableScanOperator
            if children.exists(c => c.isInstanceOf[LateralViewForwardOperator]) =>
          // The query has a LATERAL VIEW command and its operator tree includes an LVJ Op.
          // See LateralViewJoinOperator.scala for documentation on execution details.
          // There is an implied SELECT * projection on a table's rows when we evaluate the
          // LVF Op from the UDTF Op branch, so short-circuit the pruning here.
          // Note that the actual Select Op in that branch only contains the Array evaluators,
          // so we can't column prune based on it.
          cols += ColumnPruner.AllColumns
        case _ =>
          children.foreach(child => collectColumns(child, cols, node))
      }
    }
  }
}

object ColumnPruner {
  /** Marker stored in the set of referenced columns meaning "keep every column". */
  private val AllColumns = "*"
}