shark.execution.SparkTask.scala

/*
 * Copyright (C) 2012 The Regents of The University of California.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package shark.execution

import java.util.{HashMap => JHashMap, List => JavaList}

import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.hadoop.hive.ql.{Context, DriverContext}
import org.apache.hadoop.hive.ql.exec.{TableScanOperator => HiveTableScanOperator, Utilities}
import org.apache.hadoop.hive.ql.exec.{Task => HiveTask}
import org.apache.hadoop.hive.ql.metadata.{Partition, Table}
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner
import org.apache.hadoop.hive.ql.parse._
import org.apache.hadoop.hive.ql.plan.{PlanUtils, PartitionDesc}
import org.apache.hadoop.hive.ql.plan.api.StageType
import org.apache.hadoop.hive.ql.session.SessionState

import org.apache.spark.rdd.RDD

import shark.api.TableRDD
import shark.{LogHelper, SharkEnv}


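/**
 * Serializable description of a Shark query's work: the Hive parse context, the
 * terminal (sink) operator of the Shark operator tree, and the result schema.
 */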
class SparkWork(
  val pctx: ParseContext,
  val terminalOperator: TerminalOperator,
  val resultSchema: JavaList[FieldSchema])
extends java.io.Serializable


/**
 * SparkTask executes a query plan composed of RDD operators.
 */
private[shark]
class SparkTask extends HiveTask[SparkWork] with Serializable with LogHelper {

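  // Holds the query result as a TableRDD; set by execute() only when the query's
  // sink is a TableRddSinkOperator.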
  private var _tableRdd: Option[TableRDD] = None

  def tableRdd: Option[TableRDD] = _tableRdd

  override def execute(driverContext: DriverContext): Int = {
    logDebug("Executing " + this.getClass.getName)

    val ctx = driverContext.getCtx()

    // Add any session resource files to the SparkContext, skipping ones already added.
    val files = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE)
    files.split(",").filterNot(x => x.isEmpty || SharkEnv.addedFiles.contains(x)).foreach { x =>
      logInfo("Adding file " + x)
      SharkEnv.addedFiles.add(x)
      SharkEnv.sc.addFile(x)
    }

    // Add any session resource jars to the SparkContext, skipping ones already added.
    val jars = Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR)
    jars.split(",").filterNot(x => x.isEmpty || SharkEnv.addedJars.contains(x)).foreach { x =>
      logInfo("Adding jar " + x)
      SharkEnv.addedJars.add(x)
      SharkEnv.sc.addJar(x)
    }

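    // Make this task's Hive configuration visible to the Shark operator tree.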
    Operator.hconf = conf

    // Replace Hive physical plan with Shark plan.
    val terminalOp = work.terminalOperator
    val tableScanOps = terminalOp.returnTopOperators().asInstanceOf[Seq[TableScanOperator]]

    //ExplainTaskHelper.outputPlan(terminalOp, Console.out, true, 2)
    //ExplainTaskHelper.outputPlan(hiveTopOps.head, Console.out, true, 2)

    initializeTableScanTableDesc(tableScanOps)

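    // Run master-side initialization on every operator in the tree before execution.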
    terminalOp.initializeMasterOnAll()

    // Set Spark's job description to be this query.
    SharkEnv.sc.setJobDescription(work.pctx.getContext.getCmd)

    // Set the fair scheduler's pool using mapred.fairscheduler.pool if it is defined.
    Option(conf.get("mapred.fairscheduler.pool")).foreach { pool =>
      SharkEnv.sc.setLocalProperty("spark.scheduler.pool", pool)
    }

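    // Execute the Shark operator tree; the terminal operator yields the sink RDD.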
    val sinkRdd = terminalOp.execute().asInstanceOf[RDD[Any]]

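    // If the operator feeding the sink is a LIMIT, capture its row limit for the
    // result TableRDD; -1 means no limit.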
    val limit = terminalOp.parentOperators.head match {
      case op: LimitOperator => op.limit
      case _ => -1
    }

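    // Expose the result as a TableRDD only when the query sinks into a TableRddSinkOperator.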
    if (terminalOp.isInstanceOf[TableRddSinkOperator]) {
      _tableRdd = Some(new TableRDD(sinkRdd, work.resultSchema, terminalOp.objectInspector, limit))
    }

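    // 0 signals success to the Hive driver.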
    0
  }

  def initializeTableScanTableDesc(topOps: Seq[TableScanOperator]) {
    // topToTable maps Hive's TableScanOperator to the Table object.
    val topToTable: JHashMap[HiveTableScanOperator, Table] = work.pctx.getTopToTable()

    val emptyPartnArray = new Array[Partition](0)
    // Add table metadata to TableScanOperators
    topOps.foreach { op =>
      op.table = topToTable.get(op.hiveOp)
      op.tableDesc = Utilities.getTableDesc(op.table)
      PlanUtils.configureInputJobPropertiesForStorageHandler(op.tableDesc)
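      // For partitioned tables, run Hive's partition pruner and attach the surviving
      // (confirmed and unknown) partitions, plus a first PartitionDesc to fall back on.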
      if (op.table.isPartitioned) {
        val ppl = PartitionPruner.prune(
          op.table,
          work.pctx.getOpToPartPruner().get(op.hiveOp),
          work.pctx.getConf(), "",
          work.pctx.getPrunedPartitions())
        op.parts = ppl.getConfirmedPartns.toArray(emptyPartnArray) ++
          ppl.getUnknownPartns.toArray(emptyPartnArray)
        val allParts = op.parts ++ ppl.getDeniedPartns.toArray
        if (allParts.size == 0) {
          op.firstConfPartDesc = new PartitionDesc(op.tableDesc, null)
        } else {
          op.firstConfPartDesc = Utilities.getPartitionDesc(allParts(0).asInstanceOf[Partition])
        }
      }
    }
  }

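  // To Hive, this task is reported as a MAPRED-type stage named "MAPRED-SPARK".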
  override def getType = StageType.MAPRED

  override def getName = "MAPRED-SPARK"

  // Nothing to localize: this stage produces no MapReduce temporary files.
  override def localizeMRTmpFilesImpl(ctx: Context): Unit = ()

}