com.dimajix.flowman.spec.relation.DeltaTableRelation.scala Maven / Gradle / Ivy
* Copyright 2021 Kaya Kupferschmidt
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package com.dimajix.flowman.spec.relation
import com.fasterxml.jackson.annotation.JsonProperty
import io.delta.tables.DeltaTable
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.delta.catalog.DeltaTableV2
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.types.StructType
import org.slf4j.LoggerFactory
import com.dimajix.common.No
import com.dimajix.common.Trilean
import com.dimajix.common.Yes
import com.dimajix.flowman.catalog.PartitionSpec
import com.dimajix.flowman.catalog.TableChange
import com.dimajix.flowman.execution.Context
import com.dimajix.flowman.execution.Execution
import com.dimajix.flowman.execution.MigrationFailedException
import com.dimajix.flowman.execution.MigrationPolicy
import com.dimajix.flowman.execution.MigrationStrategy
import com.dimajix.flowman.execution.OutputMode
import com.dimajix.flowman.execution.UnspecifiedSchemaException
import com.dimajix.flowman.jdbc.HiveDialect
import com.dimajix.flowman.model.PartitionField
import com.dimajix.flowman.model.PartitionSchema
import com.dimajix.flowman.model.Relation
import com.dimajix.flowman.model.ResourceIdentifier
import com.dimajix.flowman.model.Schema
import com.dimajix.flowman.spec.annotation.RelationType
import com.dimajix.flowman.types.FieldValue
import com.dimajix.flowman.types.SingleValue
import com.dimajix.spark.sql.SchemaUtils
/**
 * Relation backed by a Delta table which is addressed in the catalog via `database` and `table`.
 *
 * @param instanceProperties generic properties of this relation instance
 * @param schema optional schema of the relation; its primary key is the fallback merge key for updates
 * @param partitions partition columns of the relation
 * @param database name of the database containing the table
 * @param table name of the table
 * @param location NOTE(review): not referenced in the visible part of this file - presumably the storage
 *                 location of the table; confirm
 * @param options options which are passed on to the [[DeltaRelation]] base class
 * @param properties NOTE(review): not referenced in the visible part of this file - presumably table
 *                   properties applied on creation; confirm
 * @param mergeKey columns identifying a record within a partition for updates; when empty, the primary key
 *                 of the schema is used instead
 */
case class DeltaTableRelation(
override val instanceProperties:Relation.Properties,
override val schema:Option[Schema] = None,
override val partitions: Seq[PartitionField] = Seq(),
database: String,
table: String,
location: Option[Path] = None,
options: Map[String,String] = Map(),
properties: Map[String, String] = Map(),
mergeKey: Seq[String] = Seq()
) extends DeltaRelation(options) {
private val logger = LoggerFactory.getLogger(classOf[DeltaTableRelation])
/** Catalog identifier of the Delta table, built from `database` and `table` on first access. */
private lazy val tableIdentifier: TableIdentifier = TableIdentifier(table, Some(database))
/**
 * Returns the list of all resources which will be created by this relation.
 * @return the set of resources provided by this relation
 */
override def provides: Set[ResourceIdentifier] = {
    // The only resource provided by this relation is the Hive table itself
    val hiveTable: ResourceIdentifier = ResourceIdentifier.ofHiveTable(table, Some(database))
    Set(hiveTable)
}
/**
 * Returns the list of all resources which will be required by this relation for creation.
 * @return the set of resources required by this relation
 */
// NOTE(review): the body of this method is not visible in this excerpt - confirm which resources it returns.
override def requires: Set[ResourceIdentifier] = {
/**
 * Returns the list of all resources which are managed by this relation for reading or writing a specific
 * partition. The list will be specifically created for a specific partition, or for the full relation (when the
 * partition is empty).
 * @param partition - the partition values to create the resource list for
 * @return the set of Hive partition resources for the given partition values
 */
override def resources(partition: Map[String, FieldValue]): Set[ResourceIdentifier] = {
    // Validate the method argument. The previous check tested the `partitions` member of the relation
    // instead, so a null argument was not rejected here.
    require(partition != null)

    // Interpolate the given partition values against the partition columns of this relation and
    // create one Hive partition resource per resulting partition
    val allPartitions = PartitionSchema(this.partitions).interpolate(partition)
    allPartitions.map(p => ResourceIdentifier.ofHivePartition(table, Some(database), p.toMap)).toSet
}
/**
 * Reads data from the relation, possibly from specific partitions.
 * @param execution
 * @param partitions - List of partitions. If none are specified, all the data will be read
 * @return
 */
override def read(execution: Execution, partitions: Map[String, FieldValue]): DataFrame = {
logger.info(s"Reading Delta relation '$identifier' from table $tableIdentifier using partition values $partitions")
// NOTE(review): the reader chain appears to be cut off after `.read` in this excerpt - confirm how the
// table is actually loaded
val tableDf = execution.spark.read
// Presumably restricts the result to the requested partition values - verify against filterPartition
filterPartition(tableDf, partitions)
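/**
 * Writes data into the relation, possibly into a specific partition.
 * @param execution
 * @param df - dataframe to write
 * @param partition - destination partition
 * @param mode - output mode; 'overwrite_dynamic' is not supported and 'update' performs an upsert
 */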
override def write(execution: Execution, df: DataFrame, partition: Map[String, SingleValue], mode: OutputMode): Unit = {
    // Check the partition keys against the explicit values and the columns of the data frame
    requireAllPartitionKeys(partition, df.columns)

    val partitionSpec = PartitionSchema(partitions).spec(partition)
    logger.info(s"Writing Delta relation '$identifier' to table $tableIdentifier partition ${HiveDialect.expr.partition(partitionSpec)} with mode '$mode'")

    // Add the partition values to the data frame and conform it to the output schema before dispatching
    val withPartition = addPartition(df, partition)
    val conformed = SchemaUtils.applySchema(withPartition, outputSchema(execution))

    mode match {
        case OutputMode.OVERWRITE_DYNAMIC =>
            throw new IllegalArgumentException(s"Output mode 'overwrite_dynamic' not supported by Delta table relation '$identifier'")
        case OutputMode.UPDATE =>
            doUpdate(conformed, partitionSpec)
        case _ =>
            doWrite(conformed, partitionSpec, mode)
    }
}
// Writes `df` into the table using the given output mode.
// NOTE(review): parts of this method (the writer expressions and the final write call) are not visible in
// this excerpt - confirm against the complete source.
private def doWrite(df: DataFrame, partitionSpec: PartitionSpec, mode: OutputMode) : Unit = {
val writer =
if (partitionSpec.nonEmpty) {
// With a non-empty partition specification, the write is restricted to the rows matching the partition
// predicate via Delta's "replaceWhere" option
.option("replaceWhere", partitionSpec.predicate)
else {
/**
  * Updates the Delta table with the records of `df` by delegating to [[DeltaUtils.upsert]]. Records are matched
  * on all partition columns not fixed by `partitionSpec` plus the merge key of the relation.
  * @param df - records to upsert
  * @param partitionSpec - the partition values the update is performed for
  */
private def doUpdate(df: DataFrame, partitionSpec: PartitionSpec) : Unit = {
    // An explicitly configured merge key takes precedence over the primary key of the schema
    val rowKey = if (mergeKey.isEmpty) schema.map(_.primaryKey).getOrElse(Seq()) else mergeKey

    // Partition columns already fixed by the partition specification do not need to be matched again
    val openPartitionColumns = partitions.map(_.name).toSet -- partitionSpec.keys
    val matchColumns = openPartitionColumns ++ rowKey

    val deltaTable = DeltaTable.forName(df.sparkSession, tableIdentifier.quotedString)
    DeltaUtils.upsert(deltaTable, df, matchColumns, partitionSpec)
}
/**
 * Reads data from a streaming source.
 * @param execution
 * @return
 */
override def readStream(execution: Execution): DataFrame = {
    logger.info(s"Streaming from Delta table relation '$identifier' at $tableIdentifier")

    // Look up the location of the table and stream from there
    readStreamFrom(execution, DeltaUtils.getLocation(execution, tableIdentifier))
}
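/**
 * Writes data to a streaming sink.
 * @param execution
 * @param df
 * @param mode - output mode of the streaming query
 * @param trigger - trigger of the streaming query
 * @param checkpointLocation - checkpoint location of the streaming query
 * @return
 */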
override def writeStream(execution: Execution, df: DataFrame, mode: OutputMode, trigger: Trigger, checkpointLocation: Path): StreamingQuery = {
    logger.info(s"Streaming to Delta table relation '$identifier' $tableIdentifier")

    // Look up the location of the table and stream into it
    val sinkLocation = DeltaUtils.getLocation(execution, tableIdentifier)
    writeStreamTo(execution, df, sinkLocation, mode, trigger, checkpointLocation)
}
/**
 * Returns true if the relation already exists, otherwise it needs to be created prior to usage. This refers to
 * the relation itself, not to the data or a specific partition. [[exists]] should return [[Yes]] after
 * [[create]] has been called, and it should return [[No]] after [[destroy]] has been called.
 * @param execution
 * @return
 */
// NOTE(review): the body of this method is not visible in this excerpt - confirm how existence is determined.
override def exists(execution: Execution): Trilean = {
/**
 * Returns true if the relation exists and has the correct schema. If the method returns false, but the
 * relation exists, then a call to [[migrate]] should result in a conforming relation.
 * @param execution
 * @param migrationPolicy - policy used for deciding if a schema difference requires a migration
 * @return
 */
override def conforms(execution: Execution, migrationPolicy: MigrationPolicy): Trilean = {
val catalog = execution.catalog
if (catalog.tableExists(tableIdentifier)) {
val table = catalog.getTable(tableIdentifier)
// A catalog entry of type VIEW is handled separately from a real table.
// NOTE(review): the results of this branch and of both `else` branches below are not visible in this excerpt
if (table.tableType == CatalogTableType.VIEW) {
else if (schema.nonEmpty) {
// Compare the schema currently stored in the Delta table with the desired schema of this relation
val table = loadDeltaTable(execution)
val sourceSchema = com.dimajix.flowman.types.StructType.of(table.schema())
val targetSchema = com.dimajix.flowman.types.SchemaUtils.replaceCharVarchar(fullSchema.get)
// The relation conforms if no migration is required under the given policy
!TableChange.requiresMigration(sourceSchema, targetSchema, migrationPolicy)
else {
else {
/**
 * Returns true if the target partition exists and contains valid data. Absence of a partition indicates that a
 * [[write]] is required for getting up-to-date contents. A [[write]] with output mode
 * [[OutputMode.ERROR_IF_EXISTS]] then should not throw an error but create the corresponding partition.
 * @param execution
 * @param partition
 * @return
 */
override def loaded(execution: Execution, partition: Map[String, SingleValue]): Trilean = {
require(execution != null)
require(partition != null)
val catalog = execution.catalog
// NOTE(review): the result for a missing table is not visible in this excerpt
if (!catalog.tableExists(tableIdentifier)) {
else if (partitions.nonEmpty) {
// Partitioned relation: check the specific partition of the table
val partitionSpec = PartitionSchema(partitions).spec(partition)
DeltaUtils.isLoaded(execution, tableIdentifier, partitionSpec)
else {
// Unpartitioned relation: check the table location as a whole
val location = catalog.getTableLocation(tableIdentifier)
DeltaUtils.isLoaded(execution, location)
/**
 * This method will physically create the corresponding relation. This might be a Hive table or a directory. The
 * relation will not contain any data, but all metadata will be processed.
 * @param execution
 * @param ifNotExists - if true, nothing is done when the table already exists
 */
override def create(execution: Execution, ifNotExists: Boolean): Unit = {
val tableExists = exists(execution) == Yes
// Nothing is done when the table already exists and the caller asked for "if not exists" semantics
if (!ifNotExists || !tableExists) {
val sparkSchema = HiveTableRelation.cleanupSchema(StructType(fields.map(_.catalogField)))
logger.info(s"Creating Delta table relation '$identifier' with table $tableIdentifier and schema\n${sparkSchema.treeString}")
// The relation cannot be created without an explicit schema
if (schema.isEmpty) {
throw new UnspecifiedSchemaException(identifier)
// An already existing table is an error at this point
if (tableExists)
throw new TableAlreadyExistsException(database, table)
// NOTE(review): the statements which actually create the table are not visible in this excerpt
/**
 * Removes one or more partitions. If no partitions are specified, the whole table is truncated.
 * @param execution
 * @param partitions
 */
override def truncate(execution: Execution, partitions: Map[String, FieldValue]): Unit = {
// The parameter `partitions` shadows the member of the same name, which is accessed via `this.partitions`
if (partitions.nonEmpty) {
val deltaTable = DeltaTable.forName(execution.spark, tableIdentifier.quotedString)
// NOTE(review): the per-partition statements inside this loop are not visible in this excerpt
PartitionSchema(this.partitions).interpolate(partitions).foreach { p =>
else {
logger.info(s"Truncating Delta table relation '$identifier' by truncating table $tableIdentifier")
// NOTE(review): the statement operating on `deltaTable` is not visible in this excerpt
val deltaTable = DeltaTable.forName(execution.spark, tableIdentifier.quotedString)
/**
 * This will delete any physical representation of the relation. Depending on the type only some metadata like
 * a Hive table might be dropped or also the physical files might be deleted.
 * @param execution
 * @param ifExists - if true, nothing is done when the table does not exist
 */
override def destroy(execution: Execution, ifExists: Boolean): Unit = {
require(execution != null)
val catalog = execution.catalog
// With `ifExists` set, a missing table is silently skipped
if (!ifExists || catalog.tableExists(tableIdentifier)) {
logger.info(s"Destroying Delta table relation '$identifier' by dropping table $tableIdentifier")
// NOTE(review): the statement which actually drops the table is not visible in this excerpt
/**
 * This will update any existing relation to the specified metadata.
 * @param execution
 */
override def migrate(execution: Execution, migrationPolicy: MigrationPolicy, migrationStrategy: MigrationStrategy): Unit = {
    require(execution != null)

    // Nothing is done when the table does not exist
    val catalog = execution.catalog
    if (catalog.tableExists(tableIdentifier)) {
        val table = catalog.getTable(tableIdentifier)
        if (table.tableType == CatalogTableType.VIEW) {
            // The name is currently occupied by a view, which is dropped and replaced by a newly created
            // table if the migration strategy permits changes. The log messages name this relation type
            // (they previously said "HiveTable").
            migrationStrategy match {
                case MigrationStrategy.NEVER =>
                    logger.warn(s"Migration required for Delta table relation '$identifier' from VIEW to a TABLE $tableIdentifier, but migrations are disabled.")
                case MigrationStrategy.FAIL =>
                    logger.error(s"Cannot migrate Delta table relation '$identifier' from VIEW to a TABLE $tableIdentifier, since migrations are disabled.")
                    throw new MigrationFailedException(identifier)
                case MigrationStrategy.ALTER|MigrationStrategy.ALTER_REPLACE|MigrationStrategy.REPLACE =>
                    logger.warn(s"TABLE target $tableIdentifier is currently a VIEW, dropping...")
                    catalog.dropView(tableIdentifier, false)
                    create(execution, ifNotExists = false)
            }
        }
        else if (schema.nonEmpty) {
            // A real table is only migrated when a schema is specified
            migrateInternal(execution, migrationPolicy, migrationStrategy)
        }
    }
}
// Looks up the catalog entry of the table and builds the DeltaTableV2 from its location.
override protected def loadDeltaTable(execution: Execution): DeltaTableV2 = {
val catalog = execution.catalog
val table = catalog.getTable(tableIdentifier)
// NOTE(review): the call these arguments belong to is not visible in this excerpt - presumably the
// DeltaTableV2 constructor; confirm
new Path(table.location),
catalogTable = Some(table),
tableIdentifier = Some(tableIdentifier.toString())
/**
 * Specification of a [[DeltaTableRelation]]. The members are mapped to the properties named in their
 * `@JsonProperty` annotations; only `table` is marked as required.
 */
class DeltaTableRelationSpec extends RelationSpec with SchemaRelationSpec with PartitionedRelationSpec {
// The database falls back to "default" when it is not specified
@JsonProperty(value = "database", required = false) private var database: String = "default"
@JsonProperty(value = "table", required = true) private var table: String = ""
@JsonProperty(value = "location", required = false) private var location: Option[String] = None
@JsonProperty(value = "options", required=false) private var options:Map[String,String] = Map()
@JsonProperty(value = "properties", required = false) private var properties: Map[String, String] = Map()
@JsonProperty(value = "mergeKey", required = false) private var mergeKey: Seq[String] = Seq()
// Creates the relation instance from this specification.
// NOTE(review): most of the constructor call is not visible in this excerpt
override def instantiate(context: Context): DeltaTableRelation = {
// The location is evaluated within the context before it is converted into a Hadoop path
context.evaluate(location).map(p => new Path(p)),
© 2015 - 2025 Weber Informatics LLC | Privacy Policy