shark.api.RDDTableFunctions.scala
/*
 * Copyright (C) 2012 The Regents of The University of California.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package shark.api

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.hadoop.hive.ql.metadata.Hive

import org.apache.spark.rdd.RDD

import shark.{SharkContext, SharkEnv}
import shark.memstore2.{CacheType, TablePartitionStats, TablePartition, TablePartitionBuilder}
import shark.util.HiveUtils


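/**
 * Adds table-export functionality to an RDD of rows, where each row is a Seq[_] and
 * `classTags` describes the runtime type of each column. The number and order of the
 * class tags must match the column names passed to saveAsTable().
 */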
class RDDTableFunctions(self: RDD[Seq[_]], classTags: Seq[ClassTag[_]]) {

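  /**
   * Saves this RDD as an in-memory Shark table named `tableName` with `fields` as its
   * column names. The table is created through Hive DDL, the RDD is materialized into
   * TablePartitions by a Spark job, and the result is registered with the
   * MemoryMetadataManager. Returns true if every step succeeds; on failure the Hive
   * table and the in-memory metadata entry are cleaned up and false is returned.
   */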
  def saveAsTable(tableName: String, fields: Seq[String]): Boolean = {
    require(fields.size == this.classTags.size,
      "Number of column names != number of fields in the RDD.")

    // Get a local copy of the classTags so we don't need to serialize this object.
    val classTags = this.classTags

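    // Accumulator used to collect per-partition TablePartitionStats back on the driver.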
    val statsAcc = SharkEnv.sc.accumulableCollection(ArrayBuffer[(Int, TablePartitionStats)]())

    // Create the RDD object.
    val rdd = self.mapPartitionsWithIndex { case(partitionIndex, iter) =>
      val ois = classTags.map(HiveUtils.getJavaPrimitiveObjectInspector)
      val builder = new TablePartitionBuilder(
        HiveUtils.makeStandardStructObjectInspector(fields, ois),
        1000000,
        shouldCompress = false)

      for (p <- iter) {
        builder.incrementRowCount()
        // TODO: this is not the most efficient code to do the insertion ...
        p.zipWithIndex.foreach { case (v, i) =>
          builder.append(i, v.asInstanceOf[Object], ois(i))
        }
      }

      statsAcc += Tuple2(partitionIndex, builder.stats)
      Iterator(builder.build())
    }.persist()

    var isSuccessfulCreateTable = HiveUtils.createTableInHive(
      tableName, fields, classTags, Hive.get().getConf())

    // Put the table in the metastore. Only proceed if the DDL statement is executed successfully.
    val databaseName = Hive.get(SharkContext.hiveconf).getCurrentDatabase()
    if (isSuccessfulCreateTable) {
      // Create an entry in the MemoryMetadataManager.
      val newTable = SharkEnv.memoryMetadataManager.createMemoryTable(
        databaseName, tableName, CacheType.MEMORY)
      try {
        // Force evaluate to put the data in memory.
        rdd.context.runJob(rdd, (iter: Iterator[TablePartition]) => iter.foreach(_ => ()))
      } catch {
        case _: Exception => {
          // Intercept the exception thrown by SparkContext#runJob() and handle it silently. The
          // exception message should already be printed to the console by DDLTask#execute().
          HiveUtils.dropTableInHive(tableName)
          // Drop the table entry from MemoryMetadataManager.
          SharkEnv.memoryMetadataManager.removeTable(databaseName, tableName)
          isSuccessfulCreateTable = false
        }
      }
      if (isSuccessfulCreateTable) {
        // Only register the RDD and its partition stats if the materializing job succeeded.
        newTable.put(rdd, statsAcc.value.toMap)
      }
    }
    isSuccessfulCreateTable
  }
}
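
// ---------------------------------------------------------------------------
// Minimal usage sketch, assuming an already-initialized SharkContext is
// available as SharkEnv.sc. The table name, column names, and sample rows
// below are illustrative only and are not part of the Shark API.
// ---------------------------------------------------------------------------
object RDDTableFunctionsExample {
  import scala.reflect.classTag

  def saveUsersTable(): Boolean = {
    // Two rows, each with an Int column and a String column.
    val rows: RDD[Seq[Any]] = SharkEnv.sc.parallelize(Seq(Seq(1, "alice"), Seq(2, "bob")))
    val columnTypes: Seq[ClassTag[_]] = Seq(classTag[Int], classTag[String])
    // Materializes the RDD as an in-memory Shark table "users(id, name)".
    new RDDTableFunctions(rows, columnTypes).saveAsTable("users", Seq("id", "name"))
  }
}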
