package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.typesafe.config.{Config, ConfigFactory}
import io.eels.component.hive.dialect.ParquetHiveDialect
import io.eels.component.hive.partition.{DynamicPartitionStrategy, PartitionStrategy}
import io.eels.schema.StructType
import io.eels.{Sink, SinkWriter}
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.hive.metastore.{IMetaStoreClient, TableType}
import org.apache.hadoop.security.UserGroupInformation

import scala.math.BigDecimal.RoundingMode
import scala.math.BigDecimal.RoundingMode.RoundingMode

object HiveSink {
  private val config: Config = ConfigFactory.load()
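  // The action taken when the incoming schema contains upper case field names.
  // Read via Typesafe config (ConfigFactory.load(): application.conf / reference.conf
  // on the classpath). A sketch of the relevant entry, using the values handled in
  // upperCaseCheck below ("error", "warn", or anything else to disable the check):
  //
  //   eel.hive.sink.upper-case-action = "warn"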
  private val upperCaseAction = config.getString("eel.hive.sink.upper-case-action")
}

case class HiveSink(dbName: String,
                    tableName: String,
                    permission: Option[FsPermission] = None,
                    inheritPermissions: Option[Boolean] = None,
                    principal: Option[String] = None,
                    partitionFields: Seq[String] = Nil,
                    partitionStrategy: PartitionStrategy = new DynamicPartitionStrategy,
                    filenameStrategy: FilenameStrategy = DefaultFilenameStrategy,
                    stagingStrategy: StagingStrategy = DefaultStagingStrategy,
                    metastoreSchemaHandler: MetastoreSchemaHandler = RequireCompatibilityMetastoreSchemaHandler,
                    alignStrategy: AlignmentStrategy = RowPaddingAlignmentStrategy,
                    outputSchemaStrategy: OutputSchemaStrategy = SkipPartitionsOutputSchemaStrategy,
                    keytabPath: Option[java.nio.file.Path] = None,
                    fileListener: FileListener = FileListener.noop,
                    createTable: Boolean = false,
                    // dialect used when creating a new table; for existing tables the dialect is always detected
                    dialect: Option[HiveDialect] = None,
                    callbacks: Seq[CommitCallback] = Nil,
                    roundingMode: RoundingMode = RoundingMode.UNNECESSARY,
                    metadata: Map[String, String] = Map.empty)
                   (implicit fs: FileSystem, client: IMetaStoreClient) extends Sink with Logging {

  import HiveSink._

  implicit private val conf = fs.getConf
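  // wraps the metastore client with the table-level operations used below
  // (table format, existence checks, creation, schema and partition keys)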
  private val ops = new HiveOps(client)

  def withCreateTable(createTable: Boolean,
                      partitionFields: Seq[String] = Nil,
                      dialect: HiveDialect = ParquetHiveDialect()): HiveSink =
    copy(createTable = createTable, partitionFields = partitionFields, dialect = dialect.some)

  def withPermission(permission: FsPermission): HiveSink = copy(permission = Option(permission))
  def withInheritPermission(inheritPermissions: Boolean): HiveSink = copy(inheritPermissions = Option(inheritPermissions))
  def withPartitionFields(first: String, rest: String*): HiveSink = copy(partitionFields = first +: rest)
  def withPartitionFields(partitionFields: Seq[String]): HiveSink = copy(partitionFields = partitionFields)
  def withFileListener(listener: FileListener): HiveSink = copy(fileListener = listener)
  def withFilenameStrategy(filenameStrategy: FilenameStrategy): HiveSink = copy(filenameStrategy = filenameStrategy)
  def withPartitionStrategy(strategy: PartitionStrategy): HiveSink = copy(partitionStrategy = strategy)
  def withMetaData(map: Map[String, String]): HiveSink = copy(metadata = map)
  def withRoundingMode(mode: RoundingMode): HiveSink = copy(roundingMode = mode)
  def withStagingStrategy(strategy: StagingStrategy): HiveSink = copy(stagingStrategy = strategy)
  def withMetastoreSchemaHandler(strategy: MetastoreSchemaHandler): HiveSink = copy(metastoreSchemaHandler = strategy)
  def withAlignmentStrategy(strategy: AlignmentStrategy): HiveSink = copy(alignStrategy = strategy)
  def withOutputSchemaStrategy(strategy: OutputSchemaStrategy): HiveSink = copy(outputSchemaStrategy = strategy)

  /**
    * Adds a callback that will be invoked when commit operations take place.
    */
  def addCommitCallback(callback: CommitCallback): HiveSink = copy(callbacks = callbacks :+ callback)

  def withKeytabFile(principal: String, keytabPath: java.nio.file.Path): HiveSink = {
    login()
    copy(principal = principal.some, keytabPath = keytabPath.option)
  }
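  // Example usage of withKeytabFile (hypothetical principal and keytab path), to
  // authenticate against a kerberized cluster before any writers are opened:
  //
  //   sink.withKeytabFile("etl@EXAMPLE.COM", java.nio.file.Paths.get("/etc/security/keytabs/etl.keytab"))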

  private def detectDialect(): HiveDialect = {
    login()
    val format = ops.tableFormat(dbName, tableName)
    logger.debug(s"Table format is $format; detecting dialect...")
    io.eels.component.hive.HiveDialect(format)
  }

  private def login(): Unit = {
    for (user <- principal; path <- keytabPath) {
      UserGroupInformation.loginUserFromKeytab(user, path.toString)
    }
  }

  def containsUpperCase(schema: StructType): Boolean = schema.fieldNames().exists(name => name.exists(Character.isUpperCase))

  override def open(schema: StructType): SinkWriter = open(schema, 1).head

  // the hive metastore handles concurrency poorly in versions < 2.0, so synchronize access on the client
  override def open(schema: StructType, n: Int): Seq[SinkWriter] = client.synchronized {
    login()
    upperCaseCheck(schema)

    if (createTable) {
      if (!ops.tableExists(dbName, tableName)) {
        ops.createTable(dbName,
          tableName,
          schema,
          partitionKeys = schema.partitions.map(_.name.toLowerCase) ++ partitionFields,
          dialect = dialect.getOrElse(ParquetHiveDialect()),
          props = Map.empty,
          tableType = TableType.MANAGED_TABLE
        )
      }
    }

    val metastoreSchema = ops.schema(dbName, tableName)
    logger.trace(s"Retrieved metastore schema: $metastoreSchema")

    // use the metastore handler to allow custom logic for handling schema differences between metastore and input
    logger.debug("Invoking evolution strategy to align metastore schema")
    metastoreSchemaHandler.evolve(dbName, tableName, metastoreSchema, schema, client)

    val detectedDialect = detectDialect()
    val partitionKeyNames = ops.partitionKeys(dbName, tableName)

    List.tabulate(n) { k =>
      new HiveSinkWriter(
        schema,
        metastoreSchema,
        dbName,
        tableName,
        partitionKeyNames,
        Some(k.toString),
        detectedDialect,
        partitionStrategy,
        filenameStrategy,
        stagingStrategy,
        metastoreSchemaHandler,
        alignStrategy,
        outputSchemaStrategy,
        inheritPermissions,
        permission,
        fileListener,
        callbacks,
        roundingMode,
        metadata
      )
    }
  }

  private def upperCaseCheck(schema: StructType): Unit = {
    if (containsUpperCase(schema)) {
      upperCaseAction match {
        case "error" =>
          sys.error("Writing to Hive with a schema that contains upper case characters is discouraged because Hive will lowercase the fields, which could lead to subtle case sensitivity bugs. " +
            "It is recommended that you lowercase the schema before writing (e.g. datastream.withLowerCaseSchema). " +
            "To disable this exception, set eel.hive.sink.upper-case-action=warn or eel.hive.sink.upper-case-action=none")
        case "warn" =>
          logger.warn("Writing to Hive with a schema that contains upper case characters is discouraged because Hive will lowercase the fields, which could lead to subtle case sensitivity bugs. " +
            "It is recommended that you lowercase the schema before writing (e.g. datastream.withLowerCaseSchema). " +
            "To disable this warning, set eel.hive.sink.upper-case-action=none")
        case _ =>
      }
    }
  }
}
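
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original eel source): a minimal example of
// wiring a HiveSink into an eel DataStream. The database and table names, the
// schema and the DataStream.fromValues / to calls are illustrative assumptions
// about the surrounding eel-sdk API; a real deployment would obtain the
// implicit FileSystem and IMetaStoreClient from its own Hadoop/Hive setup.
// ---------------------------------------------------------------------------
object HiveSinkUsageSketch {

  import io.eels.datastream.DataStream
  import io.eels.schema.{Field, StringType}
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.hive.conf.HiveConf
  import org.apache.hadoop.hive.metastore.HiveMetaStoreClient

  def main(args: Array[String]): Unit = {
    implicit val fs: FileSystem = FileSystem.get(new Configuration())
    implicit val client: IMetaStoreClient = new HiveMetaStoreClient(new HiveConf())

    val schema = StructType(Field("id", StringType), Field("name", StringType))

    // create the table if it does not exist, partitioned by the "name" field
    val sink = HiveSink("mydb", "mytable")
      .withCreateTable(createTable = true, partitionFields = Seq("name"))

    // write two rows through the sink
    DataStream.fromValues(schema, Seq(Seq("1", "sam"), Seq("2", "bob"))).to(sink)
  }
}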



