All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.eels.component.hive.HiveSource.scala Maven / Gradle / Ivy

The newest version!
package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.io.Using
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.datastream.Publisher
import io.eels.schema.{PartitionConstraint, StructType}
import io.eels.{Predicate, Row, Source}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.hadoop.security.UserGroupInformation

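/**
  * A source that reads rows from a hive table.
  *
  * @param dbName               the hive database that contains the table
  * @param tableName            the hive table to read from
  * @param projection           sets which fields are required by the caller; empty means all fields
  * @param predicate            optional predicate which will filter rows at the read level
  * @param partitionConstraints optional constraints on the partition data to narrow which partitions are read
  * @param principal            optional principal used, together with keytabPath, to login from a keytab
  * @param keytabPath           optional path to the keytab file used, together with principal, to login
  */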
case class HiveSource(dbName: String,
                      tableName: String,
                      projection: List[String] = Nil,
                      predicate: Option[Predicate] = None,
                      partitionConstraints: Seq[PartitionConstraint] = Nil,
                      principal: Option[String] = None,
                      keytabPath: Option[java.nio.file.Path] = None)
                     (implicit fs: FileSystem,
                      client: IMetaStoreClient) extends Source with Logging with Using {
  // NOTE(review): presumably quietens parquet's own logging (inferred from the name) - confirm
  ParquetLogMute()

  // the hadoop configuration is taken from the supplied file system
  implicit private val conf: Configuration = fs.getConf
  private val ops = new HiveOps(client)

  /**
    * Returns a copy of this source that only reads the given columns, in the given order.
    */
  def withProjection(first: String, rest: String*): HiveSource = withProjection(first +: rest)

  /**
    * Returns a copy of this source that only reads the given columns, in the given order.
    *
    * @throws IllegalArgumentException if no columns are supplied
    */
  def withProjection(columns: Seq[String]): HiveSource = {
    require(columns.nonEmpty, "A projection must contain at least one column")
    copy(projection = columns.toList)
  }

  /**
    * Returns a copy of this source that filters rows using the given predicate.
    */
  def withPredicate(predicate: Predicate): HiveSource = copy(predicate = predicate.some)

  /**
    * Returns a copy of this source with the partition constraints replaced by the given ones.
    */
  def withPartitionConstraints(constraints: Seq[PartitionConstraint]): HiveSource =
    copy(partitionConstraints = constraints)

  /**
    * Returns a copy of this source with the given constraint appended to the existing ones.
    */
  def addPartitionConstraint(constraint: PartitionConstraint): HiveSource =
    copy(partitionConstraints = partitionConstraints :+ constraint)

  /**
    * Returns a copy of this source that authenticates with the given principal and keytab,
    * and performs the keytab login immediately using those credentials.
    */
  def withKeytabFile(principal: String, keytabPath: java.nio.file.Path): HiveSource = {
    // login() reads the fields of the instance it is invoked on, so it must be called on the
    // copy that carries the new credentials; calling it on `this` would use the old ones
    val authenticated = copy(principal = principal.some, keytabPath = keytabPath.option)
    authenticated.login()
    authenticated
  }

  // performs a keytab login, but only when both a principal and a keytab path have been set
  private def login(): Unit = {
    for (user <- principal; path <- keytabPath) {
      UserGroupInformation.loginUserFromKeytab(user, path.toString)
    }
  }

  /**
    * The hive table this source reads from.
    */
  def table = HiveTable(dbName, tableName)

  /**
    * The returned schema should take into account:
    *
    * 1) Any projection. If a projection is set, then it should return the schema in the same order
    * as the projection. If no projection is set then the schema should be driven from the hive metastore.
    *
    * If the projection requests a field that does not exist, then this method will throw an exception.
    *
    * 2) Any partitions set. These should be included in the schema columns.
    */
  override def schema: StructType = {
    login()
    // if no field names were specified, then we will return the schema as is from the hive database,
    // otherwise we will keep only the requested fields
    if (projection.isEmpty) metastoreSchema
    else {
      // remember hive is always lower case, so when comparing requested field names with
      // hive fields we need to use lower case everything. And we need to return the schema
      // in the same order as the requested projection
      val columns = projection.map { fieldName =>
        metastoreSchema.fields
          .find(_.name == fieldName.toLowerCase)
          .getOrElse(sys.error(s"Requested field $fieldName does not exist in the hive schema"))
      }
      StructType(columns)
    }
  }

  // returns the full underlying schema from the metastore including partition keys;
  // being a lazy val, it is fetched at most once per instance
  private lazy val metastoreSchema: StructType = {
    login()
    ops.schema(dbName, tableName)
  }

  /**
    * Returns true if the currently set projection is for partition only fields.
    * An empty projection is never considered partition only.
    */
  def isPartitionOnlyProjection: Boolean = {
    val partitionKeyNames = ops.partitionKeys(dbName, tableName)
    // projected names are lower cased before being compared with the partition key names
    projection.nonEmpty && projection.forall(name => partitionKeyNames.contains(name.toLowerCase))
  }

  /**
    * Creates the publishers used to read this table: a single metastore backed publisher when
    * only partition columns were requested, otherwise one publisher per visible hive file.
    *
    * Fails (via sys.error) if the predicate refers to a partition field.
    */
  override def parts(): Seq[Publisher[Seq[Row]]] = {
    login()

    val dialect = table.dialect
    val partitionKeys = ops.partitionKeys(dbName, tableName)

    // a predicate cannot operate on partitions, as a predicate is pushed down into the files
    // but partition data is not always written out to the file.
    // The field name is lower cased first, in the same way as projected names are, so that a
    // predicate on a differently cased partition field is still rejected
    val predicateFields = predicate.map(_.fields).getOrElse(Nil)
    if (predicateFields.exists(field => partitionKeys.contains(field.toLowerCase)))
      sys.error("A predicate cannot operate on partition fields; use a partition constraint")

    // if we requested only partition columns, then we can get this information by scanning the metastore
    // to see which partitions have been created.
    if (isPartitionOnlyProjection) {
      logger.debug("The requested projection only uses partitions; reading directly from metastore")
      // we pass in the schema so we can order the results to keep them aligned with the given projection
      List(new HivePartitionPublisher(dbName, tableName, schema, partitionKeys, dialect))
    } else {

      val filesandpartitions = HiveTableFilesFn(dbName, tableName, table.location(), partitionConstraints)
      logger.debug(s"Found ${filesandpartitions.size} visible hive files from all locations for $dbName:$tableName")

      // for each separate hive file part we must pass in the metastore schema
      filesandpartitions.flatMap { case (partition, files) =>
        files.map { file =>
          new HiveFilePublisher(dialect, file, metastoreSchema, schema, predicate, partition)
        }
      }.toSeq
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy