/*
* Copyright (C) 2012 The Regents of The University of California.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec
// This file lives in Hive's exec package so it can access package-private fields and methods.
import java.util.{ArrayList => JArrayList, HashMap => JHashMap}
import scala.collection.immutable.BitSet.BitSet1
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.BeanProperty
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.ql.exec.{GroupByOperator => HiveGroupByOperator}
import org.apache.hadoop.hive.ql.plan.AggregationDesc
import org.apache.hadoop.hive.ql.plan.{ExprNodeConstantDesc, ExprNodeDesc}
import org.apache.hadoop.hive.ql.plan.GroupByDesc
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorFactory,
ObjectInspectorUtils, StructObjectInspector}
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption
import shark.execution.UnaryOperator
/**
 * The pre-shuffle group-by operator, responsible for map-side partial aggregation
 * before rows are shuffled to the reducers.
 */
class GroupByPreShuffleOperator extends UnaryOperator[GroupByDesc] {
@BeanProperty var conf: GroupByDesc = _
@BeanProperty var minReductionHashAggr: Float = _
@BeanProperty var numRowsCompareHashAggr: Int = _
@transient var keyFactory: KeyWrapperFactory = _
@transient var rowInspector: ObjectInspector = _
// The aggregation functions.
@transient var aggregationEvals: Array[GenericUDAFEvaluator] = _
@transient var aggregationObjectInspectors: Array[ObjectInspector] = _
// Key fields to be grouped.
@transient var keyFields: Array[ExprNodeEvaluator] = _
// A struct object inspector composing of all the fields.
@transient var keyObjectInspector: StructObjectInspector = _
@transient var aggregationParameterFields: Array[Array[ExprNodeEvaluator]] = _
@transient var aggregationParameterObjectInspectors: Array[Array[ObjectInspector]] = _
@transient var aggregationParameterStandardObjectInspectors: Array[Array[ObjectInspector]] = _
@transient var aggregationIsDistinct: Array[Boolean] = _
@transient var currentKeyObjectInspectors: Array[ObjectInspector] = _
// Grouping set related properties.
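// When grouping sets are present, each input row is emitted once per grouping set,
// with the key columns outside that set nulled out:
// - groupingSets: the grouping-set bitmasks from the query plan.
// - newKeysGroupingSets: the constant grouping-set ID appended to each key array.
// - groupingSetsBitSet: per-set bitmasks of which key positions are retained.
// - cloneNewKeysArray: a scratch copy of the original keys, consulted between sets.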
@transient var groupingSetsPresent: Boolean = _
@transient var groupingSets: java.util.List[java.lang.Integer] = _
@transient var groupingSetsPosition: Int = _
@transient var newKeysGroupingSets: Array[Object] = _
@transient var groupingSetsBitSet: Array[BitSet1] = _
@transient var cloneNewKeysArray: Array[Object] = _
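/**
 * Builds the transient, non-serializable state: expression evaluators, aggregation
 * evaluators, and their object inspectors. Called on both the master and the slaves,
 * since every member initialized here is @transient.
 */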
def createLocals() {
aggregationEvals = conf.getAggregators.map(_.getGenericUDAFEvaluator).toArray
aggregationIsDistinct = conf.getAggregators.map(_.getDistinct).toArray
rowInspector = objectInspector.asInstanceOf[StructObjectInspector]
keyFields = conf.getKeys().map(k => ExprNodeEvaluatorFactory.get(k)).toArray
val keyObjectInspectors: Array[ObjectInspector] = keyFields.map(k => k.initialize(rowInspector))
currentKeyObjectInspectors = keyObjectInspectors.map { k =>
ObjectInspectorUtils.getStandardObjectInspector(k, ObjectInspectorCopyOption.WRITABLE)
}
aggregationParameterFields = conf.getAggregators.toArray.map { aggr =>
aggr.asInstanceOf[AggregationDesc].getParameters.toArray.map { param =>
ExprNodeEvaluatorFactory.get(param.asInstanceOf[ExprNodeDesc])
}
}
aggregationParameterObjectInspectors = aggregationParameterFields.map { aggr =>
aggr.map { param => param.initialize(rowInspector) }
}
aggregationParameterStandardObjectInspectors = aggregationParameterObjectInspectors.map { ois =>
ois.map { oi =>
ObjectInspectorUtils.getStandardObjectInspector(oi, ObjectInspectorCopyOption.WRITABLE)
}
}
// Initialize each aggregation evaluator exactly once; init() returns the object
// inspector for that aggregation's partial output.
aggregationObjectInspectors =
Array.tabulate[ObjectInspector](aggregationEvals.length) { i =>
val mode = conf.getAggregators()(i).getMode()
aggregationEvals(i).init(mode, aggregationParameterObjectInspectors(i))
}
val keyFieldNames = conf.getOutputColumnNames.slice(0, keyFields.length)
val totalFields = keyFields.length + aggregationEvals.length
val keyois = new JArrayList[ObjectInspector](totalFields)
keyObjectInspectors.foreach(keyois.add(_))
keyObjectInspector = ObjectInspectorFactory.
getStandardStructObjectInspector(keyFieldNames, keyois)
keyFactory = new KeyWrapperFactory(keyFields, keyObjectInspectors, currentKeyObjectInspectors)
// Initializations for grouping set.
groupingSetsPresent = conf.isGroupingSetsPresent()
if (groupingSetsPresent) {
groupingSets = conf.getListGroupingSets()
groupingSetsPosition = conf.getGroupingSetPosition()
newKeysGroupingSets = new Array[Object](groupingSets.size)
groupingSetsBitSet = new Array[BitSet1](groupingSets.size)
cloneNewKeysArray = new Array[Object](groupingSets.size)
groupingSets.zipWithIndex.foreach { case (groupingSet, i) =>
val groupingSetValueEvaluator: ExprNodeEvaluator =
ExprNodeEvaluatorFactory.get(new ExprNodeConstantDesc(String.valueOf(groupingSet)))
newKeysGroupingSets(i) = groupingSetValueEvaluator.evaluate(null)
groupingSetsBitSet(i) = new BitSet1(groupingSet.longValue())
}
}
}
protected final def getNewKeysIterator(newKeysArray: Array[Object]): Iterator[Unit] = {
// This iterator abstracts the operation of producing the group-by key array for each
// grouping set in turn, so the logic can be reused in several places.
//
// Initially, newKeysArray holds the group-by keys for the superset of all grouping
// sets. Each next() call rewrites newKeysArray in place to hold the keys for the
// next grouping set.
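// Illustrative example (not from a real plan): with GROUP BY a, b and grouping sets
// {(a, b), (a)}, the key array is [a, b, id]. The bitmask for (a, b) retains both key
// positions, so one next() call yields [a, b, id0]; the bitmask for (a) retains only
// position 0, so the following call yields [a, null, id1].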
new Iterator[Unit]() {
Array.copy(newKeysArray, 0, cloneNewKeysArray, 0, groupingSetsPosition)
var groupingSetIndex = 0
override def hasNext: Boolean = groupingSetIndex < groupingSets.size
// Update newKeys according to the current grouping set.
override def next(): Unit = {
for (i <- 0 until groupingSetsPosition) {
newKeysArray(i) = null
}
groupingSetsBitSet(groupingSetIndex).foreach { keyIndex =>
newKeysArray(keyIndex) = cloneNewKeysArray(keyIndex)
}
newKeysArray(groupingSetsPosition) = newKeysGroupingSets(groupingSetIndex)
groupingSetIndex += 1
}
}
}
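/**
 * Reads the master-side, serialized state from the operator descriptor and the Hive
 * configuration; minReductionHashAggr and numRowsCompareHashAggr correspond to the
 * hive.map.aggr.hash.min.reduction and hive.groupby.mapaggr.checkinterval settings.
 */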
def createRemotes() {
conf = desc
minReductionHashAggr = hconf.get(HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION.varname).toFloat
numRowsCompareHashAggr = hconf.get(HiveConf.ConfVars.HIVEGROUPBYMAPINTERVAL.varname).toInt
}
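// The master builds both the serialized descriptor state and the transient local state;
// slaves only rebuild the transient state, since conf and the two threshold fields are
// serialized with the operator.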
override def initializeOnMaster() {
super.initializeOnMaster()
createRemotes()
createLocals()
}
override def initializeOnSlave() {
super.initializeOnSlave()
createLocals()
}
// Copied from org.apache.hadoop.hive.ql.exec.GroupByOperator.
override def outputObjectInspector() = {
val totalFields = keyFields.length + aggregationEvals.length
val ois = new ArrayBuffer[ObjectInspector](totalFields)
ois ++= currentKeyObjectInspectors
ois ++= aggregationObjectInspectors
val fieldNames = conf.getOutputColumnNames()
ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, ois.toList)
}
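/**
 * Runs map-side hash aggregation over one partition. Rows are aggregated into an
 * in-memory hash table; after numRowsCompareHashAggr input rows, if the table has not
 * shrunk the input by at least the minReductionHashAggr factor, hash aggregation is
 * abandoned and every remaining row is emitted as a singleton partial aggregate.
 */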
override def processPartition(split: Int, iter: Iterator[_]) = {
logDebug("Running Pre-Shuffle Group-By")
var numRowsInput = 0
var numRowsHashTbl = 0
var useHashAggr = true
// Do aggregation on map side using hashAggregations hash table.
val hashAggregations = new JHashMap[KeyWrapper, Array[AggregationBuffer]]()
val newKeys: KeyWrapper = keyFactory.getKeyWrapper()
while (iter.hasNext && useHashAggr) {
val row = iter.next().asInstanceOf[AnyRef]
numRowsInput += 1
newKeys.getNewKey(row, rowInspector)
val newKeysIter =
if (groupingSetsPresent) getNewKeysIterator(newKeys.getKeyArray) else null
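// With grouping sets, each input row is aggregated once per grouping set; the iterator
// rewrites newKeys in place before each pass. Without them, the loop body runs once.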
do {
if (groupingSetsPresent) {
newKeysIter.next()
}
newKeys.setHashKey()
var aggs = hashAggregations.get(newKeys)
var isNewKey = false
if (aggs == null) {
isNewKey = true
val newKeyProber = newKeys.copyKey()
aggs = newAggregations()
hashAggregations.put(newKeyProber, aggs)
numRowsHashTbl += 1
}
if (isNewKey) {
aggregateNewKey(row, aggs)
} else {
aggregateExistingKey(row, aggs)
}
} while (groupingSetsPresent && newKeysIter.hasNext)
// Disable partial hash-based aggregation if desired minimum reduction is
// not observed after initial interval.
if (numRowsInput == numRowsCompareHashAggr) {
if (numRowsHashTbl > numRowsInput * minReductionHashAggr) {
useHashAggr = false
logInfo("Mapside hash aggregation disabled")
} else {
logInfo("Mapside hash aggregation enabled")
}
logInfo("#hash table=" + numRowsHashTbl + " #rows=" +
numRowsInput + " reduction=" + numRowsHashTbl.toFloat/numRowsInput +
" minReduction=" + minReductionHashAggr)
}
}
// Generate an iterator for the aggregation output from hashAggregations.
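// Note: outputCache is a single shared buffer that is overwritten for every output row,
// so downstream consumers must use or copy each row before advancing the iterator.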
val outputCache = new Array[Object](keyFields.length + aggregationEvals.length)
hashAggregations.toIterator.map { case (key, aggrs) =>
val keyArr = key.getKeyArray()
var i = 0
while (i < keyArr.length) {
outputCache(i) = keyArr(i)
i += 1
}
i = 0
while (i < aggrs.length) {
outputCache(i + keyArr.length) = aggregationEvals(i).evaluate(aggrs(i))
i += 1
}
outputCache
} ++ {
// Concatenate with iterator for remaining rows not in hashAggregations.
val newIter = iter.map { case row: AnyRef =>
newKeys.getNewKey(row, rowInspector)
val newAggrKey = newKeys.copyKey()
val aggrs = newAggregations()
aggregateNewKey(row, aggrs)
val keyArr = newAggrKey.getKeyArray()
var i = 0
while (i < keyArr.length) {
outputCache(i) = keyArr(i)
i += 1
}
i = 0
while (i < aggrs.length) {
outputCache(i + keyArr.length) = aggregationEvals(i).evaluate(aggrs(i))
i += 1
}
outputCache
}
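// When grouping sets are present, every streamed row must still be expanded into one
// output row per grouping set, cloning the shared buffer for each expansion.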
if (groupingSetsPresent) {
val outputBuffer = new Array[Array[Object]](groupingSets.size)
newIter.flatMap { row: Array[Object] =>
val newKeysIter = getNewKeysIterator(row)
var i = 0
while (newKeysIter.hasNext) {
newKeysIter.next()
outputBuffer(i) = row.clone()
i += 1
}
outputBuffer
}
} else {
newIter
}
}
}
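// The two helpers below mirror Hive's map-side GroupByOperator: when a key repeats,
// DISTINCT aggregations are skipped, because the distinct expressions are part of the
// group-by key and re-aggregating a repeated key would double-count the same value.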
@inline protected final
def aggregateNewKey(row: AnyRef, aggregations: Array[AggregationBuffer]) {
var i = 0
while (i < aggregations.length) {
aggregationEvals(i).aggregate(
aggregations(i), aggregationParameterFields(i).map(_.evaluate(row)))
i += 1
}
}
@inline protected final
def aggregateExistingKey(row: AnyRef, aggregations: Array[AggregationBuffer]) {
var i = 0
while (i < aggregations.length) {
if (!aggregationIsDistinct(i)) {
aggregationEvals(i).aggregate(
aggregations(i), aggregationParameterFields(i).map(_.evaluate(row)))
}
i += 1
}
}
protected def newAggregations(): Array[AggregationBuffer] = {
aggregationEvals.map(eval => eval.getNewAggregationBuffer)
}
}