package io.eels.component.hive
import com.sksamuel.exts.Logging
import io.eels.Constants
import io.eels.component.hive.dialect.ParquetHiveDialect
import io.eels.component.hive.partition.PartitionMetaData
import io.eels.schema._
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.metastore.api.{Database, FieldSchema, NoSuchObjectException, SerDeInfo, StorageDescriptor, Table, Partition => HivePartition}
import org.apache.hadoop.hive.metastore.{IMetaStoreClient, TableType}
import scala.collection.JavaConverters._
// client for operating at a low level on the metastore
// methods in this class accept/return eel classes, and convert
// the operations into Hive-specific ones
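//
// A minimal construction sketch (assumes a metastore reachable over thrift;
// HiveConf and HiveMetaStoreClient are the standard Hive client APIs, and the
// URI here is illustrative):
//
//   val conf = new org.apache.hadoop.hive.conf.HiveConf()
//   conf.set("hive.metastore.uris", "thrift://localhost:9083")
//   val ops = new HiveOps(new org.apache.hadoop.hive.metastore.HiveMetaStoreClient(conf))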
class HiveOps(val client: IMetaStoreClient) extends Logging {
/**
* Returns a map of all partition keys to the distinct values.
* This operation is optimized, in that it does not need to scan files, but can retrieve the information
* directly from the hive metastore.
*/
def partitionMap(dbName: String, tableName: String): Map[String, Seq[String]] = {
partitions(dbName, tableName)
.flatMap(_.entries)
.groupBy(_.key)
.map { case (key, entries) => key -> entries.map(_.value).distinct }
}
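// Illustrative output: for a table partitioned by (year, month) holding the
// partitions year=2020/month=1 and year=2020/month=2, this returns
//   Map("year" -> Seq("2020"), "month" -> Seq("1", "2"))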
/**
* Returns all partition values for a given partition key.
* This operation is optimized, in that it does not need to scan files, but can retrieve the information
* directly from the hive metastore.
*/
def partitionValues(dbName: String, tableName: String, key: String): Seq[String] = {
partitions(dbName, tableName).flatMap(_.entries).filter(_.key == key).map(_.value).distinct
}
// returns the eel field instances which correspond to the partition keys for this table
def partitionFields(dbName: String, tableName: String): List[Field] = client.synchronized {
val keys = client.getTable(dbName, tableName).getPartitionKeys.asScala
keys.map { schema =>
HiveSchemaFns.fromHiveField(schema).withNullable(false).withPartition(true)
}.toList
}
// returns the eel partitions for this hive table
def partitions(dbName: String, tableName: String): List[Partition] = client.synchronized {
val fields = partitionFields(dbName, tableName)
client.listPartitions(dbName, tableName, Short.MaxValue).asScala.map { it =>
val entries = it.getValues.asScala.zipWithIndex.map { case (value, index) =>
PartitionEntry(fields(index).name, value)
}
Partition(entries.toList)
}.toList
}
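// Illustrative output: the two partitions from the example above come back as
//   List(
//     Partition(List(PartitionEntry("year", "2020"), PartitionEntry("month", "1"))),
//     Partition(List(PartitionEntry("year", "2020"), PartitionEntry("month", "2")))
//   )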
/**
* Creates a new partition in Hive in the given database.table. The location of the partition must be
* specified. The values for the serialization formats are taken from the values for the table.
*/
def createPartition(dbName: String, tableName: String, location: Path, partition: Partition): Unit = client.synchronized {
logger.info(s"Creating partition ${partition.values.mkString(",")} on $dbName.$tableName at location=$location")
// we fetch the table so we can copy the serde/format values from the table. It makes no sense
// to store a partition with serialization formats different from the other partitions.
val table = client.getTable(dbName, tableName)
val sd = new StorageDescriptor(table.getSd)
sd.setLocation(location.toString)
// the hive partition requires the values of the entries
val hivePartition = new HivePartition(
partition.values.asJava,
dbName,
tableName,
createTimeAsInt(),
0,
sd,
new java.util.HashMap()
)
client.add_partition(hivePartition)
}
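// Usage sketch (database, table and location are illustrative):
//   ops.createPartition("mydb", "mytable",
//     new Path("hdfs://namenode/data/mytable/year=2020/month=3"),
//     Partition(List(PartitionEntry("year", "2020"), PartitionEntry("month", "3"))))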
/**
* Returns hive API partitions for the given dbName:tableName
*
* In Hive, a partition is a set of values for the partition keys together with a
* storage descriptor that locates the partition's data.
*/
def hivePartitions(dbName: String, tableName: String): List[HivePartition] = client.synchronized {
client.listPartitions(dbName, tableName, Short.MaxValue).asScala.toList
}
// returns the partition metadata for this table
def partitionsMetaData(dbName: String, tableName: String): Seq[PartitionMetaData] = client.synchronized {
val keys = client.getTable(dbName, tableName).getPartitionKeys.asScala
client.listPartitions(dbName, tableName, Short.MaxValue).asScala.map { it =>
val partition = Partition(keys.zip(it.getValues.asScala).map { case (key, value) => PartitionEntry(key.getName, value) })
PartitionMetaData(
new Path(it.getSd.getLocation),
it.getSd.getLocation,
it.getSd.getInputFormat,
it.getSd.getOutputFormat,
it.getCreateTime,
it.getLastAccessTime,
partition
)
}
}
def partitionMetaData(dbName: String, tableName: String, partition: Partition): Option[PartitionMetaData] = {
partitionsMetaData(dbName, tableName).find(_.partition == partition)
}
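// Usage sketch: look up a single partition's metadata; yields Some(...) when the
// partition exists, None otherwise:
//   ops.partitionMetaData("mydb", "mytable",
//     Partition(List(PartitionEntry("year", "2020"), PartitionEntry("month", "3"))))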
def createTimeAsInt(): Int = (System.currentTimeMillis() / 1000).toInt
/**
* Returns the hive FieldSchemas for the partition columns.
* Hive calls these "partition keys".
*/
def partitionFieldSchemas(dbName: String, tableName: String): List[FieldSchema] = client.synchronized {
client.getTable(dbName, tableName).getPartitionKeys.asScala.toList
}
def partitionKeys(dbName: String, tableName: String): List[String] = partitionFieldSchemas(dbName, tableName).map(_.getName)
def tableExists(databaseName: String, tableName: String): Boolean = client.synchronized {
client.tableExists(databaseName, tableName)
}
def tableFormat(dbName: String, tableName: String): String = client.synchronized {
client.getTable(dbName, tableName).getSd.getInputFormat
}
def location(dbName: String, tableName: String): String = client.synchronized {
client.getTable(dbName, tableName).getSd.getLocation
}
def tablePath(dbName: String, tableName: String): Path = new Path(location(dbName, tableName))
// Returns the eel schema for the hive dbName:tableName
def schema(dbName: String, tableName: String): StructType = client.synchronized {
val table = client.getTable(dbName, tableName)
// hive columns are always nullable, and hive partitions are never nullable so we can set
// the nullable fields appropriately
val columns = table.getSd.getCols.asScala.map { it => HiveSchemaFns.fromHiveField(it) }
val partitions = table.getPartitionKeys.asScala
.map { it => HiveSchemaFns.fromHiveField(it).withNullable(false) }
.map(_.withPartition(true))
val fields = columns ++ partitions
StructType(fields.toList)
}
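// Illustrative output shape (field parameters are indicative only): a table with a
// regular column `name` partitioned by `year` yields
//   StructType(List(
//     Field("name", nullable = true),                    // hive columns are nullable
//     Field("year", nullable = false /* partition */)    // partition keys are not
//   ))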
/**
* Adds this column to the hive schema. This is schema evolution.
* The column must be marked as nullable and cannot have the same name as an existing column.
*/
def addColumn(dbName: String, tableName: String, field: Field): Unit = client.synchronized {
val table = client.getTable(dbName, tableName)
val sd = table.getSd
sd.addToCols(HiveSchemaFns.toHiveField(field))
client.alter_table(dbName, tableName, table)
}
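// Usage sketch (field name/type are illustrative; the new column must be nullable):
//   ops.addColumn("mydb", "mytable", Field("notes", StringType, nullable = true))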
// returns true if the given partition exists for the given database.table
def partitionExists(dbName: String,
tableName: String,
partition: Partition): Boolean = client.synchronized {
logger.debug(s"Checking if partition exists '${partition.entries.mkString(",")}'")
try {
// when checking if a partition exists, remember the partition path might not actually be
// the standard hive partition string, so instead pass in the values, which will always work
client.getPartition(dbName, tableName, partition.values.asJava) != null
} catch {
case _: Throwable => false
}
}
// creates (if not existing) the given partition
def createPartitionIfNotExists(dbName: String,
tableName: String,
partition: Partition,
path: Path): Boolean = client.synchronized {
logger.debug(s"Ensuring partition exists '$partition'")
try {
// getPartition throws if the partition doesn't exist; catching the exception is the
// quickest way to find out whether it does
client.getPartition(dbName, tableName, partition.values.asJava)
true
} catch {
case _: Throwable =>
createPartition(dbName, tableName, path, partition)
false
}
}
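// Usage sketch: returns true if the partition already existed, false if it had to
// be created:
//   ops.createPartitionIfNotExists("mydb", "mytable",
//     Partition(List(PartitionEntry("year", "2021"))),
//     new Path("hdfs://namenode/data/mytable/year=2021"))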
def createTable(databaseName: String,
tableName: String,
schema: StructType,
partitionKeys: Seq[String],
dialect: HiveDialect = ParquetHiveDialect(),
props: Map[String, String] = Map.empty,
tableType: TableType = TableType.MANAGED_TABLE,
location: String = null,
overwrite: Boolean = false): Boolean = client.synchronized {
for (partitionKey <- partitionKeys) {
if (!schema.contains(partitionKey)) {
throw new IllegalArgumentException(s"Schema must define all partition keys but it does not define $partitionKey")
}
}
if (overwrite) {
logger.debug(s"Removing table $databaseName.$tableName if exists (overwrite mode = true)")
if (tableExists(databaseName, tableName)) {
logger.debug(s"Table $databaseName.$tableName exists, it will be dropped")
client.dropTable(databaseName, tableName, true, true, true)
} else {
logger.debug(s"Table $databaseName.$tableName does not exist")
}
}
if (!tableExists(databaseName, tableName)) {
logger.info(s"Creating table $databaseName.$tableName; partitionKeys=${if (partitionKeys.isEmpty) "None" else partitionKeys.mkString(",")}")
// we normalize all our columns to lower case, which is how hive treats them
val lowerPartitionKeys = partitionKeys.map(_.toLowerCase)
val lowerColumns = schema.fields.map(_.toLowerCase)
val sd = new StorageDescriptor()
// hive expects that the table fields will not contain the partition keys
sd.setCols(lowerColumns.filterNot { it => lowerPartitionKeys.contains(it.name) }.map(HiveSchemaFns.toHiveField).asJava)
sd.setSerdeInfo(new SerDeInfo(
null,
dialect.serde,
Map("serialization.format" -> "1").asJava
))
sd.setInputFormat(dialect.inputFormat)
sd.setOutputFormat(dialect.outputFormat)
sd.setLocation(location)
val table = new Table()
table.setDbName(databaseName)
table.setTableName(tableName)
table.setCreateTime(createTimeAsInt())
table.setSd(sd)
// todo support non string partitions
table.setPartitionKeys(lowerPartitionKeys.map { it => new FieldSchema(it, "string", null) }.asJava)
table.setTableType(tableType.name)
table.putToParameters("generated_by", "eel_" + Constants.Version)
if (tableType == TableType.EXTERNAL_TABLE)
table.putToParameters("EXTERNAL", "TRUE")
props.foreach { case (key, value) => table.putToParameters(key, value) }
client.createTable(table)
logger.info(s"Table created $databaseName.$tableName")
true
} else {
false
}
}
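// Usage sketch (names and schema are illustrative; the dialect defaults to Parquet):
//   val tableSchema = StructType(List(
//     Field("id", StringType),
//     Field("year", StringType)
//   ))
//   ops.createTable("mydb", "mytable", tableSchema, partitionKeys = Seq("year"))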
def createDatabase(name: String, description: String = null, overwrite: Boolean = false): Unit = client.synchronized {
val exists = databaseExists(name)
if (exists && overwrite) {
logger.info(s"Database exists, overwrite=true; dropping database $name")
client.dropDatabase(name)
}
if (overwrite || !exists) {
val database = new Database(name, description, null, null)
logger.info(s"Creating database $name")
client.createDatabase(database)
}
}
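// Usage sketch (name and description are illustrative):
//   ops.createDatabase("mydb", description = "created by eel", overwrite = false)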
def databaseExists(name: String): Boolean = client.synchronized {
try {
client.getDatabase(name) != null
} catch {
case _: NoSuchObjectException => false
}
}
}