shark.execution.MemoryStoreSinkOperator.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of shark_2.10 Show documentation
shark
The newest version!
/*
 * Copyright (C) 2012 The Regents of The University California.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package shark.execution

import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer
import scala.reflect.BeanProperty

import org.apache.hadoop.io.Writable

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.storage.StorageLevel

import shark.{SharkConfVars, SharkEnv}
import shark.execution.serialization.{OperatorSerializationWrapper, JavaSerializer}
import shark.memstore2._


/**
 * Terminal operator that converts the query output into an RDD of `TablePartition`s, places it
 * in the configured store (Spark's block manager or off-heap storage), and force-evaluates it
 * so that the store is actually filled. Per-partition statistics are gathered through an
 * accumulator while the job runs and are registered with the table's metadata afterwards.
 */
class MemoryStoreSinkOperator extends TerminalOperator {

  // The initial capacity for ArrayLists used to construct the columnar storage. If -1,
  // the ColumnarSerde will obtain the partition size from a Configuration during execution
  // initialization (see ColumnarSerde#initialize()).
  @BeanProperty var partitionSize: Int = _

  // If true, columnar storage will use compression.
  @BeanProperty var shouldCompress: Boolean = _

  // For CTAS, this is the name of the table that is created. For INSERTS, this is the name of
  // the table that is modified.
  @BeanProperty var tableName: String = _

  // The Hive metastore DB that the `tableName` table belongs to.
  @BeanProperty var databaseName: String = _

  // Used only for commands that target Hive partitions. The partition key is a set of unique values
  // for the table's partitioning columns and identifies the partition (represented by an RDD)
  // that will be created or modified by the INSERT command being handled.
  @BeanProperty var hivePartitionKeyOpt: Option[String] = _

  // The memory storage used to store the output RDD - e.g., CacheType.HEAP refers to Spark's
  // block manager.
  @transient var cacheMode: CacheType.CacheType = _

  // Whether to compose a UnionRDD from the output RDD and a previous RDD. For example, for an
  // INSERT INTO command, the previous RDD will contain the contents of the 'tableName'.
  @transient var isInsertInto: Boolean = _

  // The number of columns in the schema for the table corresponding to 'tableName'. Used only
  // to create an OffHeapTableWriter, if off-heap storage is used.
  @transient var numColumns: Int = _

  /**
   * Reads the columnar-builder settings from the local Hive configuration into bean properties,
   * so that they are available again in `initializeOnSlave()`.
   */
  override def initializeOnMaster(): Unit = {
    super.initializeOnMaster()
    partitionSize = SharkConfVars.getIntVar(localHconf, SharkConfVars.COLUMN_BUILDER_PARTITION_SIZE)
    shouldCompress = SharkConfVars.getBoolVar(localHconf, SharkConfVars.COLUMNAR_COMPRESSION)
  }

  /**
   * Writes the settings captured in `initializeOnMaster()` back into the local Hive
   * configuration, which is later handed to `ColumnarSerDe#initialize`.
   */
  override def initializeOnSlave(): Unit = {
    super.initializeOnSlave()
    localHconf.setInt(SharkConfVars.COLUMN_BUILDER_PARTITION_SIZE.varname, partitionSize)
    localHconf.setBoolean(SharkConfVars.COLUMNAR_COMPRESSION.varname, shouldCompress)
  }

  /**
   * Builds the columnar output RDD from the single parent operator, stores it according to
   * `cacheMode`, runs a job to force materialization, and records the collected partition
   * statistics in the table metadata (or in the off-heap writer).
   *
   * @return the RDD of `TablePartition`s that was materialized.
   * @throws IllegalStateException if this operator does not have exactly one parent operator.
   */
  override def execute(): RDD[_] = {
    // Fail fast, before any existing off-heap data is dropped below. Previously a parent count
    // other than one produced a null input RDD and an unhelpful NullPointerException later on.
    if (parentOperators.size != 1) {
      throw new IllegalStateException(
        "MemoryStoreSinkOperator expects exactly one parent operator, but found " +
          parentOperators.size)
    }
    val inputRdd: RDD[_] = executeParents().head._2

    val statsAcc = SharkEnv.sc.accumulableCollection(ArrayBuffer[(Int, TablePartitionStats)]())
    val op = OperatorSerializationWrapper(this)
    val tableKey = MemoryMetadataManager.makeTableKey(databaseName, tableName)

    // Non-null only when the output goes to off-heap storage.
    val offHeapWriter: OffHeapTableWriter =
      if (cacheMode == CacheType.OFFHEAP) {
        val offHeapClient = OffHeapStorageClient.client
        if (!isInsertInto && offHeapClient.tablePartitionExists(tableKey, hivePartitionKeyOpt)) {
          // For INSERT OVERWRITE, delete the old table or Hive partition directory, if it exists.
          offHeapClient.dropTablePartition(tableKey, hivePartitionKeyOpt)
        }
        // Reserve one extra column (numColumns + 1) to store metadata (e.g. number of rows in
        // each partition).
        offHeapClient.createTablePartitionWriter(tableKey, hivePartitionKeyOpt, numColumns + 1)
      } else {
        null
      }

    // Put all rows of the table into a set of TablePartition's. Each partition contains
    // only one TablePartition object.
    val columnarRdd: RDD[TablePartition] = inputRdd.mapPartitionsWithIndex { case (part, iter) =>
      op.initializeOnSlave()
      val serde = new ColumnarSerDe
      serde.initialize(op.localHconf, op.localHiveOp.getConf.getTableInfo.getProperties)

      // Serialize each row into the builder object.
      // ColumnarSerDe will return a TablePartitionBuilder.
      var builder: Writable = null
      iter.foreach { row =>
        builder = serde.serialize(row.asInstanceOf[AnyRef], op.objectInspector)
      }

      if (builder == null) {
        // Empty partition.
        statsAcc += Tuple2(part, new TablePartitionStats(Array(), 0))
        Iterator(new TablePartition(0, Array()))
      } else {
        val partitionBuilder = builder.asInstanceOf[TablePartitionBuilder]
        statsAcc += Tuple2(part, partitionBuilder.stats)
        Iterator(partitionBuilder.build)
      }
    }

    val outputRDD: RDD[TablePartition] =
      if (offHeapWriter != null) {
        // Put the table in off-heap storage.
        op.logInfo("Putting RDD for %s.%s in off-heap storage".format(databaseName, tableName))
        offHeapWriter.createTable()
        columnarRdd.mapPartitionsWithIndex { case (part, iter) =>
          // Each partition holds exactly one TablePartition (see above).
          val partition = iter.next()
          partition.toOffHeap.zipWithIndex.foreach { case (buf, column) =>
            offHeapWriter.writeColumnPartition(column, part, buf)
          }
          Iterator(partition)
        }
      } else {
        // Mark the RDD for persistence according to the cache mode; other modes are left
        // unpersisted.
        if (cacheMode == CacheType.MEMORY) {
          columnarRdd.persist(StorageLevel.MEMORY_AND_DISK)
        } else if (cacheMode == CacheType.MEMORY_ONLY) {
          columnarRdd.persist(StorageLevel.MEMORY_ONLY)
        }
        columnarRdd
      }

    // Run a job on the RDD that contains the query output to force the data into the memory
    // store or off-heap storage. The statistics will also be collected by 'statsAcc' during
    // job execution.
    outputRDD.context.runJob(
      outputRDD, (iter: Iterator[TablePartition]) => iter.foreach(_ => ()))

    // Put the table in Spark block manager or off-heap storage.
    op.logInfo("Putting %sRDD for %s.%s in %s store".format(
      if (isInsertInto) "Union" else "",
      databaseName,
      tableName,
      if (cacheMode == CacheType.NONE) "disk" else cacheMode.toString))

    val tableStats =
      if (cacheMode == CacheType.OFFHEAP) {
        val stats = statsAcc.value.toMap
        offHeapWriter.setStats(stats)
        stats
      } else {
        val isHivePartitioned = SharkEnv.memoryMetadataManager.isHivePartitioned(
          databaseName, tableName)
        if (isHivePartitioned) {
          val partitionedTable = SharkEnv.memoryMetadataManager.getPartitionedTable(
            databaseName, tableName).get
          val hivePartitionKey = hivePartitionKeyOpt.get
          outputRDD.setName("%s.%s(%s)".format(databaseName, tableName, hivePartitionKey))
          if (isInsertInto) {
            // An RDD for the Hive partition already exists, so update its metadata entry in
            // 'partitionedTable'.
            partitionedTable.updatePartition(hivePartitionKey, outputRDD, statsAcc.value)
          } else {
            // This is a new Hive-partition. Add a new metadata entry in 'partitionedTable'.
            partitionedTable.putPartition(hivePartitionKey, outputRDD, statsAcc.value.toMap)
          }
          // Stats should be updated at this point.
          partitionedTable.getStats(hivePartitionKey).get
        } else {
          outputRDD.setName(tableName)
          // Create a new MemoryTable entry if one doesn't exist (i.e., this operator is for a CTAS).
          val memoryTable = SharkEnv.memoryMetadataManager.getMemoryTable(databaseName, tableName)
            .getOrElse(SharkEnv.memoryMetadataManager.createMemoryTable(
              databaseName, tableName, cacheMode))
          if (isInsertInto) {
            // Off-heap tables never reach this branch: their stats are set on the off-heap
            // writer above rather than being merged into the MemoryTable here.
            memoryTable.update(outputRDD, statsAcc.value)
          } else {
            memoryTable.put(outputRDD, statsAcc.value.toMap)
          }
          memoryTable.getStats.get
        }
      }

    if (SharkConfVars.getBoolVar(localHconf, SharkConfVars.MAP_PRUNING_PRINT_DEBUG)) {
      tableStats.foreach { case (index, tablePartitionStats) =>
        println("Partition " + index + " " + tablePartitionStats.toString)
      }
    }

    outputRDD
  }

  /**
   * Not supported: this operator does all of its work in `execute()`.
   *
   * @throws UnsupportedOperationException always.
   */
  override def processPartition(split: Int, iter: Iterator[_]): Iterator[_] =
    throw new UnsupportedOperationException("MemoryStoreSinkOperator.processPartition()")
}