All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.delta.standalone.internal.DeltaHistoryManager.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (2020-present) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.delta.standalone.internal

import java.sql.Timestamp

import scala.collection.JavaConverters._

import io.delta.storage.LogStore
import org.apache.hadoop.fs.Path

import io.delta.standalone.internal.actions.{Action, CommitInfo, CommitMarker}
import io.delta.standalone.internal.exception.DeltaErrors
import io.delta.standalone.internal.logging.Logging
import io.delta.standalone.internal.util.FileNames

/**
 * This class keeps track of the version of commits and their timestamps for a Delta table to
 * help with operations like describing the history of a table.
 *
 * @param deltaLog the transaction log of this table
 */
private[internal] case class DeltaHistoryManager(deltaLog: DeltaLogImpl) extends Logging {

  /**
   * Get the persisted commit info for the given delta file.
   *
   * Reads the delta file of `version`, parses every line as an action and keeps the first
   * [[CommitInfo]] found. When the file carries no commit info an empty one is used instead. In
   * both cases the returned value is stamped with `version`.
   *
   * @param version the commit version whose delta file is read
   */
  def getCommitInfo(version: Long): CommitInfo = {
    // NOTE(review): presumably supplies the `toArray` extension used on the reader below.
    import io.delta.standalone.internal.util.Implicits._

    deltaLog.store
      .read(FileNames.deltaFile(deltaLog.logPath, version), deltaLog.hadoopConf)
      .toArray
      .map(Action.fromJson)
      .collectFirst { case c: CommitInfo => c }
      .map(_.copy(version = Some(version)))
      .getOrElse(CommitInfo.empty(Some(version)))
  }

  /**
   * Check whether the given version can be recreated by replaying the DeltaLog.
   *
   * The valid range is `[earliest reproducible commit, latest version after update()]`.
   *
   * @param version the version to validate
   * @throws IllegalArgumentException if version is outside range of available versions
   */
  def checkVersionExists(version: Long): Unit = {
    val earliestVersion = getEarliestReproducibleCommitVersion
    val latestVersion = deltaLog.update().version
    if (version < earliestVersion || version > latestVersion) {
      throw DeltaErrors.versionNotExistException(version, earliestVersion, latestVersion)
    }
  }

  /**
   * Returns the latest commit that happened at or before `time`.
   *
   * If the given timestamp is outside the range of [earliestCommit, latestCommit] then use params
   * `canReturnLastCommit` and `canReturnEarliestCommit` to control whether an exception is thrown
   * or the corresponding earliest/latest commit is returned. See param docs below.
   *
   * If no delta file can be listed in the searched version range at all (for instance because
   * the files disappeared between the listing calls), the error built by
   * `DeltaErrors.noHistoryFound` is thrown.
   *
   * @param timestamp the timestamp to search for
   * @param canReturnLastCommit Whether we can return the latest version of the table if the
   *                            provided timestamp is after the latest commit
   * @param mustBeRecreatable Whether the state at the given commit should be recreatable
   * @param canReturnEarliestCommit Whether we can return the earliest version of the table if the
   *                                provided timestamp is before the earliest commit
   * @throws RuntimeException if the state at the given commit in not recreatable and
   *                          mustBeRecreatable is true
   * @throws IllegalArgumentException if the provided timestamp is before the earliest commit and
   *                                  canReturnEarliestCommit is false
   * @throws IllegalArgumentException if the provided timestamp is after the latest commit and
   *                                  canReturnLastCommit is false
   */
  def getActiveCommitAtTime(
      timestamp: Timestamp,
      canReturnLastCommit: Boolean = false,
      mustBeRecreatable: Boolean = true,
      canReturnEarliestCommit: Boolean = false): Commit = {
    val time = timestamp.getTime
    val earliestVersion = if (mustBeRecreatable) {
      getEarliestReproducibleCommitVersion
    } else {
      getEarliestDeltaFile(deltaLog)
    }
    val latestVersion = deltaLog.update().version

    // Search for the commit; `end` is exclusive, hence `latestVersion + 1`.
    val commits = getCommits(deltaLog.store, deltaLog.logPath, earliestVersion, latestVersion + 1)

    // When every commit is newer than `time` we fall back to the earliest listed commit, which
    // then fails below with `timestampEarlierThanTableFirstCommit` unless the caller allows it.
    // When nothing was listed at all there is no commit to fall back to, so report that
    // explicitly instead of failing with an opaque NoSuchElementException from `head`.
    val commit = lastCommitBeforeTimestamp(commits, time)
      .orElse(commits.headOption)
      .getOrElse(throw DeltaErrors.noHistoryFound(deltaLog.logPath))

    // Error handling
    val commitTs = new Timestamp(commit.timestamp)
    if (commit.timestamp > time && !canReturnEarliestCommit) {
      throw DeltaErrors.timestampEarlierThanTableFirstCommit(timestamp, commitTs)
    } else if (commit.timestamp < time && commit.version == latestVersion && !canReturnLastCommit) {
      throw DeltaErrors.timestampLaterThanTableLastCommit(timestamp, commitTs)
    }

    commit
  }

  /**
   * Get the earliest commit available for this table. Note that this version isn't guaranteed to
   * exist when performing an action as a concurrent operation can delete the file during cleanup.
   * This value must be used as a lower bound.
   *
   * Fails with the error built by `DeltaErrors.noHistoryFound` when the listing contains no
   * delta file.
   *
   * @param deltaLog the transaction log to inspect (shadows the constructor parameter)
   * @return the version of the first delta file found when listing from version 0
   */
  def getEarliestDeltaFile(deltaLog: DeltaLogImpl): Long = {
    val version0 = FileNames.deltaFile(deltaLog.logPath, 0)
    deltaLog.store.listFrom(version0, deltaLog.hadoopConf)
      .asScala
      // Stop at the first delta file; nothing after it needs to be listed.
      .find(f => FileNames.isDeltaFile(f.getPath))
      .map(f => FileNames.deltaVersion(f.getPath))
      .getOrElse(throw DeltaErrors.noHistoryFound(deltaLog.logPath))
  }

  /**
   * Get the earliest commit, which we can recreate. Note that this version isn't guaranteed to
   * exist when performing an action as a concurrent operation can delete the file during cleanup.
   * This value must be used as a lower bound.
   *
   * We search for the earliest checkpoint we have, or whether we have the 0th delta file, because
   * that way we can reconstruct the entire history of the table. This method assumes that the
   * commits are contiguous.
   *
   * Fails with `DeltaErrors.noReproducibleHistoryFound` when delta files exist but no usable
   * complete checkpoint covers them, and with `DeltaErrors.noHistoryFound` when no delta file
   * was listed at all.
   */
  private def getEarliestReproducibleCommitVersion: Long = {
    val files = deltaLog.store
      .listFrom(FileNames.deltaFile(deltaLog.logPath, 0), deltaLog.hadoopConf)
      .asScala
      .filter(f => FileNames.isDeltaFile(f.getPath) || FileNames.isCheckpointFile(f.getPath))

    // A map of checkpoint version and number of parts, to number of parts observed
    val checkpointMap = new scala.collection.mutable.HashMap[(Long, Int), Int]()
    var smallestDeltaVersion = Long.MaxValue
    var lastCompleteCheckpoint: Option[Long] = None

    // Iterate through the log files - this will be in order starting from the lowest version.
    // Checkpoint files come before deltas, so when we see a checkpoint, we remember it and
    // return it once we detect that we've seen a smaller or equal delta version.
    while (files.hasNext) {
      val nextFilePath = files.next().getPath
      if (FileNames.isDeltaFile(nextFilePath)) {
        val version = FileNames.deltaVersion(nextFilePath)
        // Version 0 is always reproducible: the whole history can be replayed from it.
        if (version == 0L) return version
        smallestDeltaVersion = math.min(version, smallestDeltaVersion)

        // Note that we also check this condition at the end of the function - we check it
        // here too to try and avoid more file listing when it's unnecessary.
        lastCompleteCheckpoint match {
          case Some(checkpoint) if checkpoint >= smallestDeltaVersion => return checkpoint
          case _ => // keep scanning
        }
      } else if (FileNames.isCheckpointFile(nextFilePath)) {
        val checkpointVersion = FileNames.checkpointVersion(nextFilePath)
        FileNames.numCheckpointParts(nextFilePath) match {
          case None =>
            // Single-file checkpoint: complete as soon as it is seen.
            lastCompleteCheckpoint = Some(checkpointVersion)
          case Some(numParts) =>
            // if we have a multi-part checkpoint, we need to check that all parts exist
            val preCount = checkpointMap.getOrElse(checkpointVersion -> numParts, 0)
            if (numParts == preCount + 1) {
              lastCompleteCheckpoint = Some(checkpointVersion)
            }
            checkpointMap.put(checkpointVersion -> numParts, preCount + 1)
        }
      }
    }

    lastCompleteCheckpoint match {
      case Some(checkpoint) if checkpoint >= smallestDeltaVersion =>
        checkpoint
      case _ if smallestDeltaVersion < Long.MaxValue =>
        // Delta files were seen, but no complete checkpoint at or after the smallest of them.
        throw DeltaErrors.noReproducibleHistoryFound(deltaLog.logPath)
      case _ =>
        // No delta file was seen at all.
        throw DeltaErrors.noHistoryFound(deltaLog.logPath)
    }
  }

  /**
   * Returns the commit version and timestamps of all commits in `[start, end)`. Will guarantee
   * that the commits returned will have both monotonically increasing versions as well as
   * timestamps.
   *
   * @param logStore the store used to list the log directory
   * @param logPath the path of the log directory
   * @param start the first version to include
   * @param end the first version to exclude
   */
  private def getCommits(
      logStore: LogStore,
      logPath: Path,
      start: Long,
      end: Long): Array[Commit] = {
    val commits = logStore.listFrom(FileNames.deltaFile(logPath, start), deltaLog.hadoopConf)
      .asScala
      .filter(f => FileNames.isDeltaFile(f.getPath))
      .map { fileStatus =>
        // The file modification time serves as the commit timestamp.
        Commit(FileNames.deltaVersion(fileStatus.getPath), fileStatus.getModificationTime)
      }
      .takeWhile(_.version < end)

    monotonizeCommitTimestamps(commits.toArray)
  }

  /**
   * Makes sure that the commit timestamps are monotonically increasing with respect to commit
   * versions. Requires the input commits to be sorted by the commit version.
   *
   * The array is adjusted in place and returned: whenever a commit's timestamp is not strictly
   * greater than its predecessor's, it is replaced by the predecessor's timestamp plus one.
   */
  private def monotonizeCommitTimestamps[T <: CommitMarker](commits: Array[T]): Array[T] = {
    var i = 0
    val length = commits.length
    while (i < length - 1) {
      // `commits(i)` may already have been adjusted by the previous iteration.
      val prevTimestamp = commits(i).getTimestamp
      assert(commits(i).getVersion < commits(i + 1).getVersion, "Unordered commits provided.")
      if (prevTimestamp >= commits(i + 1).getTimestamp) {
        logWarning(s"Found Delta commit ${commits(i).getVersion} with a timestamp $prevTimestamp " +
          s"which is greater than the next commit timestamp ${commits(i + 1).getTimestamp}.")
        commits(i + 1) = commits(i + 1).withTimestamp(prevTimestamp + 1).asInstanceOf[T]
      }
      i += 1
    }
    commits
  }

  /**
   * Returns the latest commit that happened at or before `time`, or `None` when every commit in
   * `commits` is newer than `time` (or `commits` is empty).
   */
  private def lastCommitBeforeTimestamp(commits: Seq[Commit], time: Long): Option[Commit] = {
    val i = commits.lastIndexWhere(_.timestamp <= time)
    if (i < 0) None else Some(commits(i))
  }

  /** A helper class to represent the timestamp and version of a commit. */
  case class Commit(version: Long, timestamp: Long) extends CommitMarker {
    override def withTimestamp(timestamp: Long): Commit = this.copy(timestamp = timestamp)

    override def getTimestamp: Long = timestamp

    override def getVersion: Long = version
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy