All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.eels.component.hive.MetastoreSchemaHandler.scala Maven / Gradle / Ivy

The newest version!
package io.eels.component.hive

import com.sksamuel.exts.Logging
import io.eels.schema.StructType
import org.apache.hadoop.hive.metastore.IMetaStoreClient

/**
  * Strategy hook for reconciling the schema of the incoming data with the
  * schema that already exists in the Hive metastore.
  *
  * Each implementation decides what a difference between the two means. One
  * implementation may evolve the metastore schema so that it gains the missing
  * fields, while another implementation may throw if the schemas are not aligned.
  */
trait MetastoreSchemaHandler {

  /**
    * Invoked with both schemas so the implementation can compare them and act.
    *
    * @param dbName          name of the Hive database containing the table
    * @param tableName       name of the Hive table
    * @param metastoreSchema schema that currently exists in the metastore
    * @param targetSchema    schema of the source (input) data
    * @param client          metastore client made available to the implementation
    */
  def evolve(
    dbName: String,
    tableName: String,
    metastoreSchema: StructType,
    targetSchema: StructType,
    client: IMetaStoreClient
  ): Unit
}

/**
  * A MetastoreSchemaHandler that performs no checks and makes no changes.
  *
  * Because nothing is validated here, a mismatch may only surface later as a
  * downstream error - for example when the input schema does not include all
  * columns and defaults cannot be applied.
  */
object NoopMetastoreSchemaHandler extends MetastoreSchemaHandler {
  override def evolve(
    dbName: String,
    tableName: String,
    metastoreSchema: StructType,
    targetSchema: StructType,
    client: IMetaStoreClient
  ): Unit = {
    // deliberately empty: both schemas are accepted exactly as they are
  }
}

/**
  * An implementation of MetastoreSchemaHandler that requires the input
  * schema to be compatible with the metastore schema. Compatibility is
  * achieved when every field in the input schema is already defined
  * in the metastore under the same name and with an equal data type.
  *
  * An input field that is unknown to the metastore therefore fails the check.
  * Fields that exist only in the metastore are not examined by this handler.
  *
  * If the schemas are not compatible then an AssertionError is thrown.
  */
object RequireCompatibilityMetastoreSchemaHandler extends MetastoreSchemaHandler {

  /**
    * Verifies that each input field has a same-named, same-typed counterpart
    * in the metastore schema. The metastore itself is never modified.
    *
    * @param dbName          name of the Hive database (not used by this handler)
    * @param tableName       name of the Hive table (not used by this handler)
    * @param metastoreSchema schema that currently exists in the metastore
    * @param targetSchema    schema of the input data
    * @param client          metastore client (not used by this handler)
    * @throws java.lang.AssertionError if any input field is missing from the
    *                                  metastore schema or has a different data type
    */
  override def evolve(dbName: String,
                      tableName: String,
                      metastoreSchema: StructType,
                      targetSchema: StructType,
                      client: IMetaStoreClient): Unit = {
    val compatible = targetSchema.fields.forall { inputField =>
      metastoreSchema.field(inputField.name).exists(_.dataType == inputField.dataType)
    }
    // Thrown explicitly instead of via `assert`: Predef.assert is elidable, so the
    // validation would silently vanish when compiled with -Xdisable-assertions.
    // The error type and the "assertion failed: " prefix match what assert produced.
    if (!compatible) {
      val message = s"Input schema $targetSchema is not compatible with the metastore schema $metastoreSchema. If you wish eel-sdk to automatically evolve the target schema (where possible) then set metastoreSchemaHandler=EvolutionMetastoreSchemaHandler on the HiveSink. Other handlers are also availble, see docs or source"
      throw new AssertionError("assertion failed: " + message)
    }
  }
}

/**
  * An implementation of MetastoreSchemaHandler that requires the input
  * schema to be equal to the metastore schema. Equality is defined
  * as having the same field names with the same types (order is irrelevant).
  *
  * Any missing fields or additional fields not present will cause the
  * check to fail.
  *
  * If the schemas are not equal then an AssertionError is thrown.
  */
object StrictMetastoreSchemaHandler extends MetastoreSchemaHandler {

  /**
    * Compares the (name, data type) pairs of both schemas, ignoring field order.
    * The metastore itself is never modified.
    *
    * @param dbName          name of the Hive database (not used by this handler)
    * @param tableName       name of the Hive table (not used by this handler)
    * @param metastoreSchema schema that currently exists in the metastore
    * @param targetSchema    schema of the input data
    * @param client          metastore client (not used by this handler)
    * @throws java.lang.AssertionError if the two schemas do not contain exactly
    *                                  the same (name, data type) pairs
    */
  override def evolve(dbName: String,
                      tableName: String,
                      metastoreSchema: StructType,
                      targetSchema: StructType,
                      client: IMetaStoreClient): Unit = {
    val metastoreFields = metastoreSchema.fields.map(field => field.name -> field.dataType)
    val inputFields = targetSchema.fields.map(field => field.name -> field.dataType)
    // Compared as sets so that ordering does not matter, as documented. The size
    // check stops repeated pairs from collapsing and hiding a real difference.
    val equal = metastoreFields.size == inputFields.size && metastoreFields.toSet == inputFields.toSet
    // Thrown explicitly instead of via `assert`: Predef.assert is elidable, so the
    // validation would silently vanish when compiled with -Xdisable-assertions.
    // The error type and the "assertion failed: " prefix match what assert produced.
    if (!equal) {
      val message = s"Input schema $targetSchema is not equal to the metastore schema $metastoreSchema. If you wish eel-sdk to automatically evolve the target schema (where possible) then set metastoreSchemaHandler=EvolutionMetastoreSchemaHandler on the HiveSink. Other handlers are also availble, see docs or source."
      throw new AssertionError("assertion failed: " + message)
    }
  }
}

/**
  * An implementation of MetastoreSchemaHandler that will evolve the metastore
  * schema where possible to match the incoming data.
  *
  * It will do this by adding missing fields to the end of the current schema.
  * The new fields cannot be added as partition fields as the table will already have been created.
  *
  * Fields are matched by name only: an input field whose name already exists in
  * the metastore schema is left untouched, whatever its data type.
  */
object EvolutionMetastoreSchemaHandler extends MetastoreSchemaHandler with Logging {

  /**
    * Appends every input field whose name is absent from the metastore schema
    * to the table's column list and submits the altered table to the metastore.
    * When nothing is missing the metastore is not contacted at all.
    *
    * The whole operation runs while holding the monitor of `client`.
    *
    * @param dbName          name of the Hive database containing the table
    * @param tableName       name of the Hive table to alter
    * @param metastoreSchema schema that currently exists in the metastore
    * @param targetSchema    schema of the input data
    * @param client          metastore client used to fetch and alter the table
    */
  override def evolve(dbName: String,
                      tableName: String,
                      metastoreSchema: StructType,
                      targetSchema: StructType,
                      client: IMetaStoreClient): Unit = client.synchronized {
    // Resolved once up front instead of once per input field.
    val knownNames = metastoreSchema.fieldNames().toSet
    val missing = targetSchema.fields.filterNot(field => knownNames.contains(field.name))
    if (missing.nonEmpty) {
      logger.debug("Hive metastore is missing the following fields: " + missing.mkString(", "))
      val table = client.getTable(dbName, tableName)
      val cols = table.getSd.getCols
      missing.foreach { field =>
        logger.info(s"Adding new column to hive table [$field]")
        cols.add(HiveSchemaFns.toHiveField(field))
      }
      table.getSd.setCols(cols)
      client.alter_table(dbName, tableName, table)
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy