package io.eels.component.hive

import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.atomic.AtomicInteger

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.collection.BlockingQueueConcurrentIterator
import io.eels.component.hdfs.{AclSpec, HdfsSource}
import io.eels.component.hive.dialect.ParquetHiveDialect
import io.eels.component.hive.partition.PartitionMetaData
import io.eels.datastream.{Subscriber, Subscription}
import io.eels.schema.{Partition, PartitionConstraint, StringType, StructType}
import io.eels.util.HdfsIterator
import io.eels.{FilePattern, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.{IMetaStoreClient, TableType}
import org.apache.hadoop.security.UserGroupInformation

import scala.collection.JavaConverters._
import scala.math.BigDecimal.RoundingMode
import scala.util.matching.Regex

case class HiveTable(dbName: String,
tableName: String)
(implicit fs: FileSystem,
conf: Configuration,
client: IMetaStoreClient) extends Logging {
lazy val ops = new HiveOps(client)
/**
* Returns all the partitions used by this hive table.
*/
def partitions(): Seq[Partition] = ops.partitions(dbName, tableName)
/**
* Returns all the partitions along with extra metadata per partition, e.g. location and creation time.
*/
def partitionMetaData(): Seq[PartitionMetaData] = ops.partitionsMetaData(dbName, tableName)
def partitionMetaData(partition: Partition): Option[PartitionMetaData] = partitionMetaData().find(_.partition == partition)
// returns partition metadata for every partition that satisfies all of the given constraints
def partitionMetaData(constraints: Seq[PartitionConstraint]): Seq[PartitionMetaData] =
partitionMetaData().filter { meta => constraints.forall(_.eval(meta.partition)) }
def hasPartitions: Boolean = partitions().nonEmpty
/**
* Returns the distinct values of the given partition key across all partitions of this table.
*/
def partitionValues(key: String): Set[String] = partitions().map(_.get(key)).collect {
case Some(entry) => entry.value
}.toSet
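// Illustrative usage sketch (not part of the API): "mydb", "mytable" and the "dt" key below are
// hypothetical, and an implicit FileSystem, Configuration and IMetaStoreClient are assumed in scope.
//
//   val table = HiveTable("mydb", "mytable")
//   if (table.hasPartitions) {
//     table.partitions().foreach(p => println(p.unquoted))
//     val dates: Set[String] = table.partitionValues("dt")
//   }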
// deletes all data files in the given partition; the partition itself remains registered in the metastore
def truncatePartition(partition: Partition): Unit = {
logger.info(s"Truncating partition $partition")
val meta = partitionMetaData(partition).getOrError(s"Unknown partition $partition")
new HivePartitionScanner().scan(Seq(meta), Nil).foreach { case (_, files) =>
logger.debug(s"Deleting partition files ${files.map(_.getPath).mkString(",")}")
files.map(_.getPath).foreach(fs.delete(_, false))
}
}
// returns the schema of this table as reported by the metastore
def schema: StructType = {
ops.schema(dbName, tableName)
}
/**
* Creates this table in the metastore if it does not already exist, using the given schema,
* partition fields, table type, dialect and table properties. Does nothing if the table already exists.
*/
def create(schema: StructType,
partitionFields: Seq[String] = Nil,
tableType: TableType = TableType.MANAGED_TABLE,
dialect: HiveDialect = ParquetHiveDialect(),
props: Map[String, String] = Map.empty): Unit = {
if (!ops.tableExists(dbName, tableName)) {
ops.createTable(dbName,
tableName,
schema,
partitionKeys = schema.partitions.map(_.name.toLowerCase) ++ partitionFields,
dialect = dialect,
props = props,
tableType = tableType
)
}
}
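// Sketch of creating a partitioned, parquet-backed table via create(). Field is assumed to come from
// io.eels.schema with a (name, dataType) apply; the database, table and field names are hypothetical.
//
//   import io.eels.schema.Field
//   val schema = StructType(Field("name", StringType), Field("country", StringType))
//   HiveTable("mydb", "people").create(schema, partitionFields = Seq("country"))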
/**
* Returns a list of all files used by this hive table.
*
* @param includePartitionDirs if true then the partition directories will be included
* @param includeTableDir if true then the main table directory will be included
* @return paths of all files and directories
*/
def paths(includePartitionDirs: Boolean, includeTableDir: Boolean): Seq[Path] = {
val _location = location
val partitions = partitionMetaData
val files = if (partitions.isEmpty) {
HiveFileScanner(_location, false).map(_.getPath)
} else {
partitions.flatMap { partition =>
val files = FilePattern(s"${partition.location}/*").toPaths()
if (includePartitionDirs) {
files :+ partition.location
} else {
files
}
}
}
if (includeTableDir) {
files :+ _location
} else {
files
}
}
/**
* Returns a list of all files used by this hive table that match the given regex.
* The full path of the file will be used when matching against the regex.
*
* @param includePartitionDirs if true then the partition directories will be included
* @param includeTableDir if true then the main table directory will be included
* @return paths of all files and directories
*/
def paths(includePartitionDirs: Boolean, includeTableDir: Boolean, regex: Regex): Seq[Path] = {
paths(includePartitionDirs, includeTableDir).filter { path => regex.pattern.matcher(path.toString).matches }
}
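// For example, to list only the data files whose names end in ".parquet", skipping the partition and
// table directories (the regex is illustrative):
//
//   val parquetFiles = paths(includePartitionDirs = false, includeTableDir = false, ".*\\.parquet".r)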
/**
* Returns all the files used by this table. The result is a mapping of partition path to the files contained
* in that partition.
*/
def files(): Map[Path, Seq[Path]] = {
ops.hivePartitions(dbName, tableName).map { p =>
val location = new Path(p.getSd.getLocation)
val paths = HdfsIterator.remote(fs.listFiles(location, false)).map(_.getPath).toList
location -> paths
}.toMap
}
// applies the given permission to every file of this table, and optionally to the partition and table directories
def setPermissions(permission: FsPermission,
includePartitionDirs: Boolean = false,
includeTableDir: Boolean = false): Unit = {
paths(includePartitionDirs, includeTableDir).foreach(fs.setPermission(_, permission))
}
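// For example, to restrict every data file of the table to rwxr-x--- (FsPermission accepts an octal
// string; "750" below is just an illustration):
//
//   setPermissions(new FsPermission("750"))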
// generates the CREATE TABLE (DDL) statement for this table
def showDdl(ifNotExists: Boolean = true): String = {
val _spec = spec()
val partitions = ops.partitionKeys(dbName, tableName)
HiveDDL.showDDL(
tableName,
schema.fields,
tableType = _spec.tableType,
location = _spec.location.some,
serde = _spec.serde,
partitions = partitions.map(PartitionColumn(_, StringType)),
outputFormat = _spec.outputFormat,
inputFormat = _spec.inputFormat,
ifNotExists = ifNotExists)
}
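// For example, to print a CREATE TABLE statement that can be replayed on another metastore:
//
//   println(showDdl(ifNotExists = true))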
/**
* Sets the ACL for all files of this hive table.
* Even if the files are not located inside the table directory, this function will find them
* and apply the ACL to each one.
*
* @param acl the acl values to set
*/
def setAcl(acl: AclSpec,
includePartitionDirs: Boolean = false,
includeTableDir: Boolean = false): Unit = {
paths(includePartitionDirs, includeTableDir).foreach { path =>
HdfsSource(path).setAcl(acl)
}
}
// returns the permission of the table location path
def tablePermission(): FsPermission = {
val location = ops.location(dbName, tableName)
fs.getFileStatus(new Path(location)).getPermission
}
/**
* Returns a TableSpec which contains details of the underlying table.
* Similar to the Table class in the Hive API but using Scala-friendly types.
*/
def spec(): TableSpec = client.synchronized {
val table = client.getTable(dbName, tableName)
val tableType = TableType.values().find(_.name.toLowerCase == table.getTableType.toLowerCase)
.getOrError("Hive table type is not supported by this version of hive")
val params = table.getParameters.asScala.toMap ++ table.getSd.getParameters.asScala.toMap
TableSpec(
tableName,
tableType,
table.getSd.getLocation,
table.getSd.getCols.asScala,
table.getSd.getNumBuckets,
table.getSd.getBucketCols.asScala.toList,
params,
table.getSd.getInputFormat,
table.getSd.getOutputFormat,
table.getSd.getSerdeInfo.getName,
table.getRetention,
table.getCreateTime,
table.getLastAccessTime,
table.getOwner
)
}
// resolves the HiveDialect for this table's storage format
def dialect = client.synchronized {
io.eels.component.hive.HiveDialect(client.getTable(dbName, tableName))
}
def exists(): Boolean = ops.tableExists(dbName, tableName)
// todo use dialect to return correct stats
def stats(): HiveStats = new ParquetHiveStats(dbName, tableName, this)
// compacts all the files in each partition into a single file
def compact(finalFilename: String = "eel_compacted_" + System.nanoTime): Unit = {
val _schema = schema
val _dialect = dialect
HiveTableFilesFn(dbName, tableName, location, Nil).filter(_._2.nonEmpty).foreach { case (partition, files) =>
logger.info(s"Starting compact for $partition")
val queue = new LinkedBlockingQueue[Seq[Row]]
val done = new AtomicInteger(0)
files.foreach { file =>
_dialect.input(file.getPath, _schema, _schema, None).subscribe(new Subscriber[Seq[Row]] {
override def next(t: Seq[Row]): Unit = queue.put(t)
override def completed(): Unit = if (done.incrementAndGet == files.size) {
queue.put(Row.Sentinel)
}
override def error(t: Throwable): Unit = {
logger.error(s"Error compacting $partition", t)
queue.put(Row.Sentinel)
}
override def subscribed(c: Subscription): Unit = ()
})
}
val output = _dialect.output(_schema, new Path(files.head.getPath.getParent, finalFilename), None, RoundingMode.UNNECESSARY, Map.empty)
BlockingQueueConcurrentIterator(queue, Row.Sentinel).foreach { rows =>
rows.foreach(output.write)
}
output.close()
logger.info(s"Finished compact for $partition")
}
}
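// Compaction usage sketch: rewrite each partition's files into a single file (the filename below is
// hypothetical; by default a timestamped "eel_compacted_" name is used):
//
//   compact("eel_compacted_backfill")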
// returns the location of this table as a hadoop Path
def location(): Path = new Path(spec().location)
def deletePartition(partition: Partition, deleteData: Boolean): Unit = client.synchronized {
logger.debug(s"Deleting partition ${partition.unquoted}")
client.dropPartition(dbName, tableName, partition.values.asJava, deleteData)
}
// drops this table from the metastore; if deleteData is true the table's data is deleted as well
def drop(deleteData: Boolean = true): Unit = client.synchronized {
logger.debug(s"Dropping table $dbName:$tableName")
client.dropTable(dbName, tableName, deleteData, true)
}
// removes all data from this table; if removePartitions is true the partitions themselves are dropped,
// otherwise only the files inside each partition's location are deleted
def truncate(removePartitions: Boolean): Unit = client.synchronized {
logger.debug(s"Truncating table $dbName:$tableName")
if (removePartitions)
new HiveOps(client).partitions(dbName, tableName).foreach(deletePartition(_, true))
else {
files().values.foreach(_.foreach(path => fs.delete(path, false)))
}
}
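// For example, emptying the table while keeping its partitions registered, versus dropping it entirely:
//
//   truncate(removePartitions = false)   // delete the data files only
//   drop(deleteData = true)              // remove the table and (for managed tables) its data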
// performs a kerberos login from the given keytab, for use against secured clusters
def login(principal: String, keytabPath: java.nio.file.Path): Unit = {
UserGroupInformation.loginUserFromKeytab(principal, keytabPath.toString)
}
// exposes the files of this table as an HdfsSource
def toHdfsSource = HdfsSource(FilePattern(location.toString + "/*"))
// returns a HiveSource for reading rows from this table
def source = HiveSource(dbName, tableName)
// returns a HiveSink for writing rows to this table
def sink = HiveSink(dbName, tableName)
}
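// End-to-end usage sketch. HiveConf, HiveMetaStoreClient and FileSystem.get are standard Hive/Hadoop
// APIs; the database and table names are hypothetical.
//
//   val hiveConf = new org.apache.hadoop.hive.conf.HiveConf()
//   implicit val conf: Configuration = hiveConf
//   implicit val fs: FileSystem = FileSystem.get(conf)
//   implicit val client: IMetaStoreClient = new org.apache.hadoop.hive.metastore.HiveMetaStoreClient(hiveConf)
//   val table = HiveTable("mydb", "mytable")
//   println(table.schema)
//   println(table.showDdl())
//   val source = table.source   // HiveSource for reading; table.sink returns a HiveSink for writing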