org.apache.spark.sql.delta.Snapshot.scala

/*
 * Copyright (2020) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta

// scalastyle:off import.ordering.noEmptyLine
import java.net.URI

import org.apache.spark.sql.delta.actions._
import org.apache.spark.sql.delta.actions.Action.logSchema
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.sources.DeltaSQLConf
import org.apache.spark.sql.delta.util.StateCache
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }

import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.execution.datasources.{ HadoopFsRelation, LogicalRelation }
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.{ SerializableConfiguration, Utils }

/**
  * An immutable snapshot of the state of the log at some delta version. Internally
  * this class manages the replay of actions stored in checkpoint or delta files.
  *
  * After resolving any new actions, it caches the result and collects the
  * following basic information to the driver:
  *  - Protocol Version
  *  - Metadata
  *  - Transaction state
  *
  * @param timestamp The timestamp of the latest commit in milliseconds. Can also be set to -1
  *                  if the timestamp of the commit is unknown or the table has not been
  *                  initialized, i.e. `version = -1`.
  */
class Snapshot(
  val path: Path,
  val version: Long,
  val logSegment: LogSegment,
  val minFileRetentionTimestamp: Long,
  val deltaLog: DeltaLog,
  val timestamp: Long,
  val checksumOpt: Option[VersionChecksum]
) extends StateCache
    with PartitionFiltering
    with DeltaFileFormat
    with DeltaLogging {

  import Snapshot._
  // For implicits which re-use Encoder:
  import SingleAction._

  protected def spark = SparkSession.active

  protected def getNumPartitions: Int = {
    spark.sessionState.conf
      .getConf(DeltaSQLConf.DELTA_SNAPSHOT_PARTITIONS)
      .getOrElse(Snapshot.defaultNumSnapshotPartitions)
  }
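
  // Illustrative tuning sketch: for very large tables the number of state partitions can be
  // raised through the SQL conf backing DELTA_SNAPSHOT_PARTITIONS (assumed here to be keyed as
  // "spark.databricks.delta.snapshotPartitions"), e.g.
  //   spark.conf.set("spark.databricks.delta.snapshotPartitions", 150)
  // When the conf is unset, Snapshot.defaultNumSnapshotPartitions (50) is used.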

  /** Performs validations during initialization */
  protected def init(): Unit = {
    deltaLog.protocolRead(protocol)
  }

  // Reconstruct the state by applying the delta files, in order, on top of the checkpoint.
  // We partition by path since the bulk of the data is likely add/remove actions;
  // non-path-based actions are colocated in a single partition.
  private def stateReconstruction: Dataset[SingleAction] = {
    val implicits = spark.implicits
    import implicits._

    val time = minFileRetentionTimestamp
    val hadoopConf = spark.sparkContext.broadcast(
      new SerializableConfiguration(spark.sessionState.newHadoopConf()))
    val logPath = path.toUri // for serializability

    loadActions.mapPartitions { actions =>
      val hdpConf = hadoopConf.value.value
      actions.flatMap {
        _.unwrap match {
          case add: AddFile           => Some(add.copy(path = canonicalizePath(add.path, hdpConf)).wrap)
          case rm: RemoveFile         => Some(rm.copy(path = canonicalizePath(rm.path, hdpConf)).wrap)
          case other if other == null => None
          case other                  => Some(other.wrap)
        }
      }
    }
      .withColumn("file", assertLogBelongsToTable(logPath)(input_file_name()))
      .repartition(getNumPartitions, coalesce($"add.path", $"remove.path"))
      .sortWithinPartitions("file")
      .as[SingleAction]
      .mapPartitions { iter =>
        val state = new InMemoryLogReplay(time)
        state.append(0, iter.map(_.unwrap))
        state.checkpoint.map(_.wrap)
      }
  }
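
  // Sketch of the replay semantics above: the raw action stream is collapsed into the table's
  // current state. For an illustrative path, an AddFile for "part-00000.parquet" followed by a
  // RemoveFile for the same path leaves only the RemoveFile tombstone (dropped once it falls
  // behind minFileRetentionTimestamp), while the latest Metadata and Protocol actions win over
  // any earlier ones.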

  def redactedPath: String =
    Utils.redact(spark.sessionState.conf.stringRedactionPattern, path.toUri.toString)

  private lazy val cachedState =
    cacheDS(stateReconstruction, s"Delta Table State #$version - $redactedPath")

  /** The current set of actions in this [[Snapshot]]. */
  def state: Dataset[SingleAction] = cachedState.getDS

  /**
    * Computes some statistics over the transaction log, i.e. over the actions made on this
    * Delta table.
    */
  protected lazy val computedState: State = {
    val implicits = spark.implicits
    import implicits._
    state
      .select(
        coalesce(last($"protocol", ignoreNulls = true), defaultProtocol()) as "protocol",
        coalesce(last($"metaData", ignoreNulls = true), emptyMetadata()) as "metadata",
        collect_set($"txn") as "setTransactions",
        // sum may return null for empty data set.
        coalesce(sum($"add.size"), lit(0L)) as "sizeInBytes",
        count($"add") as "numOfFiles",
        count($"metaData") as "numOfMetadata",
        count($"protocol") as "numOfProtocol",
        count($"remove") as "numOfRemoves",
        count($"txn") as "numOfSetTransactions"
      )
      .as[State](stateEncoder)
      .first()
  }
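
  // Worked example (illustrative): a fresh table whose only commit contains one Protocol, one
  // Metadata and two AddFile actions of 100 bytes each surfaces here as numOfProtocol = 1,
  // numOfMetadata = 1, numOfFiles = 2, numOfRemoves = 0 and sizeInBytes = 200, with an empty
  // setTransactions list.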

  def protocol: Protocol                   = computedState.protocol
  def metadata: Metadata                   = computedState.metadata
  def setTransactions: Seq[SetTransaction] = computedState.setTransactions
  def sizeInBytes: Long                    = computedState.sizeInBytes
  def numOfFiles: Long                     = computedState.numOfFiles
  def numOfMetadata: Long                  = computedState.numOfMetadata
  def numOfProtocol: Long                  = computedState.numOfProtocol
  def numOfRemoves: Long                   = computedState.numOfRemoves
  def numOfSetTransactions: Long           = computedState.numOfSetTransactions

  /** A map to look up transaction version by appId. */
  lazy val transactions: Map[String, Long] = setTransactions.map(t => t.appId -> t.version).toMap
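
  // Usage sketch (hypothetical appId): a writer that previously committed
  // SetTransaction(appId = "my-stream", version = 42, ...) can recover that watermark with
  //   snapshot.transactions.get("my-stream")   // => Some(42L)
  // and skip batches it has already written, which is how idempotent writes build on this map.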

  // Here we need to bypass the ACL checks for SELECT anonymous function permissions.
  /** All of the files present in this [[Snapshot]]. */
  def allFiles: Dataset[AddFile] = {
    val implicits = spark.implicits
    import implicits._
    state.where("add IS NOT NULL").select($"add".as[AddFile])
  }

  /** All unexpired tombstones. */
  def tombstones: Dataset[RemoveFile] = {
    val implicits = spark.implicits
    import implicits._
    state.where("remove IS NOT NULL").select($"remove".as[RemoveFile])
  }

  /** Returns the schema of the table. */
  def schema: StructType = metadata.schema

  /** Returns the data schema of the table, the schema of the columns written out to file. */
  def dataSchema: StructType = metadata.dataSchema

  /** Number of columns to collect stats on for data skipping */
  lazy val numIndexedCols: Int = DeltaConfigs.DATA_SKIPPING_NUM_INDEXED_COLS.fromMetaData(metadata)

  // Given the list of files from `LogSegment`, create respective file indices to help create
  // a DataFrame and short-circuit the many file existence and partition schema inference checks
  // that exist in DataSource.resolveRelation().
  protected lazy val deltaFileIndexOpt: Option[DeltaLogFileIndex] = {
    DeltaLogFileIndex(DeltaLogFileIndex.COMMIT_FILE_FORMAT, logSegment.deltas)
  }

  protected lazy val checkpointFileIndexOpt: Option[DeltaLogFileIndex] = {
    DeltaLogFileIndex(DeltaLogFileIndex.CHECKPOINT_FILE_FORMAT, logSegment.checkpoint)
  }

  protected lazy val fileIndices: Seq[DeltaLogFileIndex] = {
    checkpointFileIndexOpt.toSeq ++ deltaFileIndexOpt.toSeq
  }

  /**
    * Loads the file indices into a Dataset that can be used for LogReplay.
    */
  private def loadActions: Dataset[SingleAction] = {
    val dfs = fileIndices.map { index: DeltaLogFileIndex =>
      val fsRelation = HadoopFsRelation(
        index, index.partitionSchema, logSchema, None, index.format,
        Map.empty[String, String])(spark)
      Dataset[SingleAction](spark, LogicalRelation(fsRelation))
    }

    dfs.reduceOption(_.union(_)).getOrElse(emptyActions)
  }
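
  // For example, a LogSegment with a checkpoint at version 10 and delta files for versions 11
  // and 12 produces one relation over the (Parquet) checkpoint files and one over the (JSON)
  // delta files, both read with the shared action logSchema and unioned into a single
  // Dataset[SingleAction]; a table with no log files yet falls back to emptyActions.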

  protected def emptyActions: Dataset[SingleAction] =
    spark.createDataFrame(spark.sparkContext.emptyRDD[Row], logSchema).as[SingleAction]

  override def logInfo(msg: => String): Unit = {
    super.logInfo(s"[tableId=${deltaLog.tableId}] " + msg)
  }

  override def logWarning(msg: => String): Unit = {
    super.logWarning(s"[tableId=${deltaLog.tableId}] " + msg)
  }

  override def logWarning(msg: => String, throwable: Throwable): Unit = {
    super.logWarning(s"[tableId=${deltaLog.tableId}] " + msg, throwable)
  }

  override def logError(msg: => String): Unit = {
    super.logError(s"[tableId=${deltaLog.tableId}] " + msg)
  }

  override def logError(msg: => String, throwable: Throwable): Unit = {
    super.logError(s"[tableId=${deltaLog.tableId}] " + msg, throwable)
  }

  override def toString: String =
    s"${getClass.getSimpleName}(path=$path, version=$version, metadata=$metadata, " +
      s"logSegment=$logSegment, checksumOpt=$checksumOpt)"

  logInfo(s"Created snapshot $this")
  init()
}

object Snapshot extends DeltaLogging {

  private val defaultNumSnapshotPartitions: Int = 50

  /** Canonicalize the paths for Actions */
  private def canonicalizePath(path: String, hadoopConf: Configuration): String = {
    val hadoopPath = new Path(new URI(path))
    if (hadoopPath.isAbsoluteAndSchemeAuthorityNull) {
      val fs = FileSystem.get(hadoopConf)
      fs.makeQualified(hadoopPath).toUri.toString
    } else {
      // return untouched if it is a relative path or is already fully qualified
      hadoopPath.toUri.toString
    }
  }
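
  // Illustrative behaviour, assuming the default filesystem resolves to the local file system:
  //   canonicalizePath("part-00000.parquet", conf)       // relative: returned untouched
  //   canonicalizePath("/data/part-00000.parquet", conf) // qualified to "file:/data/part-00000.parquet"
  //   canonicalizePath("s3a://bucket/p.parquet", conf)   // already qualified: returned untouched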

  /**
    * Make sure that the delta file we're reading belongs to this table. Cached snapshots from
    * the previous states will contain empty strings as the file name.
    */
  private def assertLogBelongsToTable(logBasePath: URI): UserDefinedFunction = {
    udf((filePath: String) => {
      if (filePath.isEmpty || new Path(new URI(filePath)).getParent == new Path(logBasePath)) {
        filePath
      } else {
        // scalastyle:off throwerror
        throw new AssertionError(
          s"File ($filePath) doesn't belong in the " +
            s"transaction log at $logBasePath. Please contact Databricks Support."
        )
        // scalastyle:on throwerror
      }
    })
  }
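
  // For a table whose log lives at s3a://bucket/events/_delta_log, the UDF above accepts files
  // such as s3a://bucket/events/_delta_log/00000000000000000010.json (and the empty file names
  // produced by cached state), and throws the AssertionError for a file from any other directory.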

  /**
    * Metrics and metadata computed around the Delta table
    * @param protocol The protocol version of the Delta table
    * @param metadata The metadata of the table
    * @param setTransactions The streaming queries writing to this table
    * @param sizeInBytes The total size of the table (of active files, not including tombstones)
    * @param numOfFiles The number of files in this table
    * @param numOfMetadata The number of metadata actions in the state. Should be 1
    * @param numOfProtocol The number of protocol actions in the state. Should be 1
    * @param numOfRemoves The number of tombstones in the state
    * @param numOfSetTransactions Number of streams writing to this table
    */
  case class State(
    protocol: Protocol,
    metadata: Metadata,
    setTransactions: Seq[SetTransaction],
    sizeInBytes: Long,
    numOfFiles: Long,
    numOfMetadata: Long,
    numOfProtocol: Long,
    numOfRemoves: Long,
    numOfSetTransactions: Long
  )

  private[this] lazy val _stateEncoder: ExpressionEncoder[State] =
    try {
      ExpressionEncoder[State]()
    } catch {
      case e: Throwable =>
        logError(e.getMessage, e)
        throw e
    }

  private lazy val emptyMetadata   = udf(() => Metadata())
  private lazy val defaultProtocol = udf(() => Protocol())
  implicit private def stateEncoder: Encoder[State] = {
    _stateEncoder.copy()
  }
}

/**
  * An initial snapshot with only metadata specified. Useful for creating a DataFrame from an
  * existing parquet table during its conversion to delta.
  *
  * @param logPath the path to transaction log
  * @param deltaLog the delta log object
  * @param metadata the metadata of the table
  */
class InitialSnapshot(val logPath: Path, override val deltaLog: DeltaLog, override val metadata: Metadata)
    extends Snapshot(logPath, -1, LogSegment.empty, -1, deltaLog, -1, None) {

  override def state: Dataset[SingleAction] = emptyActions
  override protected lazy val computedState: Snapshot.State = {
    Snapshot.State(Protocol(), metadata, Nil, 0L, 0L, 1L, 1L, 0L, 0L)
  }
}
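
// Usage sketch: a Snapshot is normally obtained from a DeltaLog rather than constructed
// directly. Assuming the standard DeltaLog.forTable entry point and an illustrative table path:
//
//   val deltaLog = DeltaLog.forTable(spark, "/tmp/delta/events")
//   val snapshot = deltaLog.snapshot
//   snapshot.version          // latest version covered by this snapshot
//   snapshot.schema           // table schema taken from the Metadata action
//   snapshot.allFiles.count() // number of active AddFile entries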



