All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.sql.delta.DeltaOperations.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (2020) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta

import org.apache.spark.sql.delta.actions.{ Metadata, Protocol }
import org.apache.spark.sql.delta.util.JsonUtils

import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{ StructField, StructType }

/**
  * Exhaustive list of operations that can be performed on a Delta table. These operations are
  * tracked as the first line in delta logs, and power `DESCRIBE HISTORY` for Delta tables.
  */
object DeltaOperations {

  /**
    * An operation that can be performed on a Delta table.
    * @param name The name of the operation.
    */
  sealed abstract class Operation(val name: String) {
    /** Parameters recorded for this operation in the commit info. */
    val parameters: Map[String, Any]

    /**
      * JSON-serialized form of [[parameters]].
      * The trailing `.toMap` forces the lazy `mapValues` view so that each value is
      * serialized exactly once and the result is a strict, serializable map
      * (without it every lookup would re-run `JsonUtils.toJson`).
      */
    lazy val jsonEncodedValues: Map[String, String] =
      parameters.mapValues(JsonUtils.toJson(_)).toMap

    /** Names of the metrics this operation may report; used to filter raw SQL metrics. */
    val operationMetrics: Set[String] = Set()

    /** Keeps only the metrics relevant to this operation and renders their values as strings. */
    def transformMetrics(metrics: Map[String, SQLMetric]): Map[String, String] = {
      metrics.filterKeys(s => operationMetrics.contains(s)).transform((_, v) => v.value.toString)
    }
  }

  /** Recorded during batch inserts. Predicates can be provided for overwrites. */
  case class Write(mode: SaveMode, partitionBy: Option[Seq[String]] = None, predicate: Option[String] = None)
      extends Operation("WRITE") {
    // Optional entries are appended only when they were supplied by the caller.
    override val parameters: Map[String, Any] = {
      val base: Map[String, Any] = Map("mode" -> mode.name())
      val withPartitions = partitionBy.fold(base)(cols => base + ("partitionBy" -> JsonUtils.toJson(cols)))
      predicate.fold(withPartitions)(p => withPartitions + ("predicate" -> p))
    }

    override val operationMetrics: Set[String] = DeltaOperationMetrics.WRITE
  }

  /** Recorded during streaming inserts. */
  case class StreamingUpdate(outputMode: OutputMode, queryId: String, epochId: Long)
      extends Operation("STREAMING UPDATE") {
    // All three identifiers are logged as strings.
    override val parameters: Map[String, Any] = Seq(
      "outputMode" -> outputMode.toString,
      "queryId"    -> queryId,
      "epochId"    -> epochId.toString
    ).toMap
    override val operationMetrics: Set[String] = DeltaOperationMetrics.STREAMING_UPDATE
  }

  /** Recorded while deleting certain partitions. */
  case class Delete(predicate: Seq[String]) extends Operation("DELETE") {
    override val parameters: Map[String, Any]  = Map("predicate" -> JsonUtils.toJson(predicate))
    override val operationMetrics: Set[String] = DeltaOperationMetrics.DELETE

    override def transformMetrics(metrics: Map[String, SQLMetric]): Map[String, String] = {
      var strMetrics = super.transformMetrics(metrics)
      // The write path reports copied rows as "numOutputRows"; surface them under the
      // DELETE-specific key.
      if (metrics.contains("numOutputRows")) {
        strMetrics += "numCopiedRows" -> metrics("numOutputRows").value.toString
      }
      // Find the case where deletedRows are not captured. Use `get` instead of `apply`
      // so that a missing metric cannot throw NoSuchElementException here.
      val rowMetricsUnavailable =
        strMetrics.get("numDeletedRows").contains("0") &&
          strMetrics.get("numRemovedFiles").exists(_ != "0")
      if (rowMetricsUnavailable) {
        // Row level metrics are unavailable when the entire table or partition is
        // deleted; drop the misleading zero-valued entries.
        strMetrics -= "numDeletedRows"
        strMetrics -= "numCopiedRows"
        strMetrics -= "numAddedFiles"
      }
      strMetrics
    }
  }

  /** Recorded when truncating the table. */
  case class Truncate() extends Operation("TRUNCATE") {
    // TRUNCATE takes no arguments, so nothing is recorded.
    override val parameters: Map[String, Any] = Map.empty[String, Any]
    override val operationMetrics: Set[String] = DeltaOperationMetrics.TRUNCATE
  }

  /** Recorded when fscking the table. */
  case class Fsck(numRemovedFiles: Long) extends Operation("FSCK") {
    // Only the count of removed file entries is recorded.
    override val parameters: Map[String, Any] = Map("numRemovedFiles" -> numRemovedFiles)
    override val operationMetrics: Set[String] = DeltaOperationMetrics.FSCK
  }

  /** Recorded when converting a table into a Delta table. */
  case class Convert(numFiles: Long, partitionBy: Seq[String], collectStats: Boolean, catalogTable: Option[String])
      extends Operation("CONVERT") {
    // The catalog table name is only present when the conversion targeted one.
    override val parameters: Map[String, Any] = {
      val base: Map[String, Any] = Map(
        "numFiles"      -> numFiles,
        "partitionedBy" -> JsonUtils.toJson(partitionBy),
        "collectStats"  -> collectStats
      )
      catalogTable.fold(base)(table => base + ("catalogTable" -> table))
    }
    override val operationMetrics: Set[String] = DeltaOperationMetrics.CONVERT
  }

  /** Recorded when optimizing the table. */
  case class Optimize(predicate: Seq[String], zOrderBy: Seq[String], batchId: Int, auto: Boolean)
      extends Operation("OPTIMIZE") {
    // Sequences and the batch id are JSON-encoded; the auto flag is stored as-is.
    override val parameters: Map[String, Any] = Seq[(String, Any)](
      "predicate" -> JsonUtils.toJson(predicate),
      "zOrderBy"  -> JsonUtils.toJson(zOrderBy),
      "batchId"   -> JsonUtils.toJson(batchId),
      "auto"      -> auto
    ).toMap
    override val operationMetrics: Set[String] = DeltaOperationMetrics.OPTIMIZE
  }

  /** Recorded when a merge operation is committed to the table. */
  case class Merge(
    predicate: Option[String],
    updatePredicate: Option[String],
    deletePredicate: Option[String],
    insertPredicate: Option[String]
  ) extends Operation("MERGE") {
    // Every predicate is optional; absent ones simply do not appear in the log.
    override val parameters: Map[String, Any] = Seq(
      predicate.map("predicate" -> _),
      updatePredicate.map("updatePredicate" -> _),
      deletePredicate.map("deletePredicate" -> _),
      insertPredicate.map("insertPredicate" -> _)
    ).flatten.toMap
    override val operationMetrics: Set[String] = DeltaOperationMetrics.MERGE
  }

  /** Recorded when an update operation is committed to the table. */
  case class Update(predicate: Option[String]) extends Operation("UPDATE") {
    override val parameters: Map[String, Any]  = predicate.map("predicate" -> _).toMap
    override val operationMetrics: Set[String] = DeltaOperationMetrics.UPDATE

    override def transformMetrics(metrics: Map[String, SQLMetric]): Map[String, String] = {
      var strMetrics = super.transformMetrics(metrics)
      // Use `get` rather than `apply` so that a missing metric cannot throw
      // NoSuchElementException; without both values nothing can be derived.
      val numOutputRows  = metrics.get("numOutputRows").map(_.value)
      val numUpdatedRows = metrics.get("numUpdatedRows").map(_.value)
      (numOutputRows, numUpdatedRows) match {
        case (Some(outputRows), Some(updatedRows)) =>
          if (updatedRows == 0 && outputRows != 0) {
            // numUpdatedRows was not captured by the UpdateCommand implementation
            // (the entire partition was re-written); siphon the value out of the
            // BasicWriteStatsTracker metric instead.
            strMetrics += "numUpdatedRows" -> outputRows.toString
            strMetrics += "numCopiedRows"  -> "0"
          } else {
            strMetrics += "numCopiedRows" -> (outputRows - updatedRows).toString
          }
        case _ => // row level metrics unavailable; report only the filtered metrics
      }
      strMetrics
    }
  }

  /** Recorded when the table is created. */
  case class CreateTable(metadata: Metadata, isManaged: Boolean, asSelect: Boolean = false)
      extends Operation("CREATE TABLE" + s"${if (asSelect) " AS SELECT" else ""}") {
    // The description is wrapped in Option; presumably it may be null — TODO confirm.
    override val parameters: Map[String, Any] = Seq[(String, Any)](
      "isManaged"   -> isManaged.toString,
      "description" -> Option(metadata.description),
      "partitionBy" -> JsonUtils.toJson(metadata.partitionColumns),
      "properties"  -> JsonUtils.toJson(metadata.configuration)
    ).toMap
    // Only the AS SELECT variant writes data and therefore reports write metrics.
    override val operationMetrics: Set[String] =
      if (asSelect) DeltaOperationMetrics.WRITE else Set()
  }

  /** Recorded when the table is replaced. */
  case class ReplaceTable(metadata: Metadata, isManaged: Boolean, orCreate: Boolean, asSelect: Boolean = false)
      extends Operation(
        s"${if (orCreate) "CREATE OR " else ""}REPLACE TABLE" +
          s"${if (asSelect) " AS SELECT" else ""}"
      ) {
    // The description is wrapped in Option; presumably it may be null — TODO confirm.
    override val parameters: Map[String, Any] = Seq[(String, Any)](
      "isManaged"   -> isManaged.toString,
      "description" -> Option(metadata.description),
      "partitionBy" -> JsonUtils.toJson(metadata.partitionColumns),
      "properties"  -> JsonUtils.toJson(metadata.configuration)
    ).toMap
    // Only the AS SELECT variant writes data and therefore reports write metrics.
    override val operationMetrics: Set[String] =
      if (asSelect) DeltaOperationMetrics.WRITE else Set()
  }

  /** Recorded when the table properties are set. */
  case class SetTableProperties(properties: Map[String, String]) extends Operation("SET TBLPROPERTIES") {
    // The full property map is JSON-encoded into a single parameter value.
    override val parameters: Map[String, Any] = {
      val encodedProperties = JsonUtils.toJson(properties)
      Map("properties" -> encodedProperties)
    }
  }

  /** Recorded when the table properties are unset. */
  case class UnsetTableProperties(propKeys: Seq[String], ifExists: Boolean) extends Operation("UNSET TBLPROPERTIES") {
    // The removed keys are JSON-encoded; the ifExists flag is stored as-is.
    override val parameters: Map[String, Any] = {
      val encodedKeys = JsonUtils.toJson(propKeys)
      Map("properties" -> encodedKeys, "ifExists" -> ifExists)
    }
  }

  /** Recorded when columns are added. */
  case class AddColumns(colsToAdd: Seq[QualifiedColTypeWithPositionForLog]) extends Operation("ADD COLUMNS") {

    // Each added column is logged with its qualified description plus, when given,
    // the requested position (rendered as a string).
    override val parameters: Map[String, Any] = {
      val columnEntries = colsToAdd.map { col =>
        Map("column" -> structFieldToMap(col.columnPath, col.column)) ++
          col.colPosition.map("position" -> _.toString)
      }
      Map("columns" -> JsonUtils.toJson(columnEntries))
    }
  }

  /** Recorded when columns are changed. */
  case class ChangeColumn(
    columnPath: Seq[String],
    columnName: String,
    newColumn: StructField,
    colPosition: Option[String]
  ) extends Operation("CHANGE COLUMN") {

    // The new column definition is JSON-encoded; the position is added when present.
    override val parameters: Map[String, Any] = {
      val base: Map[String, Any] = Map("column" -> JsonUtils.toJson(structFieldToMap(columnPath, newColumn)))
      colPosition.fold(base)(pos => base + ("position" -> pos))
    }
  }

  /** Recorded when columns are replaced. */
  case class ReplaceColumns(columns: Seq[StructField]) extends Operation("REPLACE COLUMNS") {

    // Every replacement column is serialized with an empty path prefix (top level).
    override val parameters: Map[String, Any] = {
      val serializedColumns = columns.map(structFieldToMap(Seq.empty, _))
      Map("columns" -> JsonUtils.toJson(serializedColumns))
    }
  }

  /** Recorded when the protocol version of the table is upgraded. */
  case class UpgradeProtocol(newProtocol: Protocol) extends Operation("UPGRADE PROTOCOL") {
    // Both version numbers are embedded in a single JSON document.
    override val parameters: Map[String, Any] = {
      val versions = Map(
        "minReaderVersion" -> newProtocol.minReaderVersion,
        "minWriterVersion" -> newProtocol.minWriterVersion
      )
      Map("newProtocol" -> JsonUtils.toJson(versions))
    }
  }

  /** Recorded for manual updates of the table; no parameters are logged. */
  object ManualUpdate extends Operation("Manual Update") {
    override val parameters: Map[String, Any] = Map.empty[String, Any]
  }

  /** Recorded for file notification retention commits; no parameters are logged. */
  object FileNotificationRetention extends Operation("FILE NOTIFICATION RETENTION") {
    override val parameters: Map[String, Any] = Map.empty[String, Any]
  }

  /** Records an operation (named by `operationName`) that updates column metadata. */
  case class UpdateColumnMetadata(operationName: String, columns: Seq[(Seq[String], StructField)])
      extends Operation(operationName) {
    // Each (path, field) pair is serialized into the shared column description form.
    override val parameters: Map[String, Any] = {
      val serializedColumns = columns.map { pathAndField =>
        structFieldToMap(pathAndField._1, pathAndField._2)
      }
      Map("columns" -> JsonUtils.toJson(serializedColumns))
    }
  }

  /** Recorded when recomputing stats on the table. */
  case class ComputeStats(predicate: Seq[String]) extends Operation("COMPUTE STATS") {
    // Only the predicate limiting the recomputation is recorded.
    override val parameters: Map[String, Any] = {
      val encodedPredicate = JsonUtils.toJson(predicate)
      Map("predicate" -> encodedPredicate)
    }
  }

  /** Recorded when manually re-/un-/setting ZCube Information for existing files. */
  case class ResetZCubeInfo(predicate: Seq[String], zOrderBy: Seq[String]) extends Operation("RESET ZCUBE INFO") {

    // Both the selection predicate and the z-order columns are JSON-encoded.
    override val parameters: Map[String, Any] = Seq(
      "predicate" -> JsonUtils.toJson(predicate),
      "zOrderBy"  -> JsonUtils.toJson(zOrderBy)
    ).toMap
  }

  /** Recorded when the table schema is updated. */
  case class UpdateSchema(oldSchema: StructType, newSchema: StructType) extends Operation("UPDATE SCHEMA") {
    // Both the previous and the new schema are kept so history shows the transition.
    override val parameters: Map[String, Any] = Seq(
      "oldSchema" -> JsonUtils.toJson(oldSchema),
      "newSchema" -> JsonUtils.toJson(newSchema)
    ).toMap
  }

  /**
    * Serializes a column into the map form used inside operation parameters.
    * @param colPath path of the parent columns (empty for top-level columns)
    * @param field   the column definition to serialize
    */
  private def structFieldToMap(colPath: Seq[String], field: StructField): Map[String, Any] = {
    val qualifiedName = UnresolvedAttribute(colPath :+ field.name).name
    val fieldMetadata = JsonUtils.mapper.readValue[Map[String, Any]](field.metadata.json)
    Map(
      "name"     -> qualifiedName,
      "type"     -> field.dataType.typeName,
      "nullable" -> field.nullable,
      "metadata" -> fieldMetadata
    )
  }

  /**
    * Qualified column type with position. We define a copy of the type here to avoid depending on
    * the parser output classes in our logging.
    *
    * @param columnPath  path of the parent columns leading to `column` (empty for top level)
    * @param column      the column definition being logged
    * @param colPosition optional position specification; logged via `toString` when present
    */
  case class QualifiedColTypeWithPositionForLog(
    columnPath: Seq[String],
    column: StructField,
    colPosition: Option[String]
  )
}

/** Metric-name sets surfaced per operation type, used to filter raw SQL metrics. */
private[delta] object DeltaOperationMetrics {

  /** Metrics for batch writes. */
  val WRITE: Set[String] = Set(
    "numFiles",       // count of files written
    "numOutputBytes", // total size in bytes of the written contents
    "numOutputRows"   // count of rows written
  )

  /** Metrics for streaming updates. */
  val STREAMING_UPDATE: Set[String] = Set(
    "numAddedFiles",   // count of files added
    "numRemovedFiles", // count of files removed
    "numOutputRows",   // count of rows written
    "numOutputBytes"   // total size of output writes
  )

  /** Metrics for row-level deletes. */
  val DELETE: Set[String] = Set(
    "numAddedFiles",   // count of files added
    "numRemovedFiles", // count of files removed
    "numDeletedRows",  // count of rows removed
    "numCopiedRows"    // count of rows copied while rewriting files
  )

  /** Deleting the entire table or partition would prevent row level metrics from being recorded */
  val DELETE_PARTITIONS: Set[String] = Set(
    "numRemovedFiles" // count of files removed
  )

  /** Metrics for TRUNCATE. */
  val TRUNCATE: Set[String] = Set(
    "numRemovedFiles" // count of files removed
  )

  /** Metrics for FSCK. */
  val FSCK: Set[String] = Set(
    "numRemovedFiles" // count of files removed
  )

  /** Metrics for CONVERT. */
  val CONVERT: Set[String] = Set(
    "numConvertedFiles" // count of parquet files that have been converted
  )

  /** Metrics for OPTIMIZE, including the resulting file-size distribution. */
  val OPTIMIZE: Set[String] = Set(
    "numAddedFiles",   // count of files added
    "numRemovedFiles", // count of files removed
    "numAddedBytes",   // bytes added by optimize
    "numRemovedBytes", // bytes removed by optimize
    "minFileSize",     // size of the smallest file
    "p25FileSize",     // size of the 25th percentile file
    "p50FileSize",     // the median file size
    "p75FileSize",     // the 75th percentile of the file sizes
    "maxFileSize"      // size of the largest file
  )

  /** Metrics for MERGE. */
  val MERGE: Set[String] = Set(
    "numSourceRows",         // rows in the source dataframe
    "numTargetRowsInserted", // rows inserted into the target table
    "numTargetRowsUpdated",  // rows updated in the target table
    "numTargetRowsDeleted",  // rows deleted in the target table
    "numTargetRowsCopied",   // target rows copied over unchanged
    "numOutputRows",         // total rows written out
    "numTargetFilesAdded",   // files added to the sink (target)
    "numTargetFilesRemoved"  // files removed from the sink (target)
  )

  /** Metrics for UPDATE. */
  val UPDATE: Set[String] = Set(
    "numAddedFiles",   // count of files added
    "numRemovedFiles", // count of files removed
    "numUpdatedRows",  // count of rows updated
    "numCopiedRows"    // count of rows copied over while rewriting files
  )
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy