All downloads are free. The search and download features use the official Maven repository.

io.eels.component.hive.OutputSchemaStrategy.scala Maven / Gradle / Ivy

The newest version!
package io.eels.component.hive

import io.eels.schema.StructType
import org.apache.hadoop.hive.metastore.IMetaStoreClient

/**
  * Strategy for deciding which part of a metastore schema is physically written to disk.
  *
  * Given the schema held in the metastore, an implementation returns the schema that should
  * actually be persisted. This makes it possible to leave some data out of the files; for
  * example, with parquet files it is common not to write partition data, because that data
  * is already present in the metastore.
  */
trait OutputSchemaStrategy {

  /**
    * Resolves the schema that should be persisted.
    *
    * @param schema        the metastore schema
    * @param partitionKeys names of the partition keys
    * @param client        Hive metastore client made available to the implementation
    * @return the schema to be written to disk
    */
  def resolve(
    schema: StructType,
    partitionKeys: Seq[String],
    client: IMetaStoreClient
  ): StructType
}

/**
  * An [[OutputSchemaStrategy]] that removes the partition columns from the schema,
  * so that they are not written out to the files.
  */
object SkipPartitionsOutputSchemaStrategy extends OutputSchemaStrategy {

  /**
    * Returns `schema` with every field named in `partitionKeys` removed.
    * Field names are matched without regard to case.
    *
    * @param schema        the metastore schema
    * @param partitionKeys names of the partition columns to drop; when empty the
    *                      schema is returned as is
    * @param client        not used by this strategy
    * @return the schema without its partition columns
    */
  def resolve(schema: StructType, partitionKeys: Seq[String], client: IMetaStoreClient): StructType =
    // Folding over an empty key list yields the seed schema itself, so no separate guard is needed.
    partitionKeys.foldLeft(schema) { (remaining, partitionKey) =>
      remaining.removeField(partitionKey, caseSensitive = false)
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy