/*
* Copyright (2020) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.delta

import java.io.FileNotFoundException
import java.util.UUID

import scala.util.control.NonFatal

import org.apache.spark.sql.delta.actions.{Action, Metadata, SingleAction}
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.storage.LogStore
import org.apache.spark.sql.delta.util.DeltaFileOperations
import org.apache.spark.sql.delta.util.FileNames._
import org.apache.spark.sql.delta.util.JsonUtils
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{JobConf, TaskAttemptContextImpl, TaskAttemptID}
import org.apache.hadoop.mapreduce.{Job, TaskType}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.util.SerializableConfiguration

/**
* Records information about a checkpoint.
*
* @param version the version of this checkpoint
* @param size the number of actions in the checkpoint
* @param parts the number of parts when the checkpoint has multiple parts. None if this is a
* singular checkpoint
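*
* This metadata is serialized as JSON into the `_last_checkpoint` file; with illustrative
* values, a singular checkpoint would look something like `{"version":10,"size":100}`,
* with a `"parts"` field added for a multi-part checkpoint.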
*/
case class CheckpointMetaData(version: Long, size: Long, parts: Option[Int])
/**
* A class to help compare checkpoints with each other, since concurrent writers may
* produce checkpoints with a different number of parts.
*/
case class CheckpointInstance(version: Long, numParts: Option[Int])
  extends Ordered[CheckpointInstance] {
/**
* Due to lexicographic sorting, a version with more parts will appear after a version with
* fewer parts during file listing. We use that same logic here.
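*
* For example, `CheckpointInstance(10, None)` is earlier than
* `CheckpointInstance(10, Some(2))`, which in turn is earlier than
* `CheckpointInstance(11, None)`.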
*/
def isEarlierThan(other: CheckpointInstance): Boolean = {
if (other == CheckpointInstance.MaxValue) return true
version < other.version ||
(version == other.version && numParts.forall(_ < other.numParts.getOrElse(1)))
}
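/**
* Whether this checkpoint's version is no later than `other`'s. Unlike `isEarlierThan`,
* the number of parts is not taken into account.
*/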
def isNotLaterThan(other: CheckpointInstance): Boolean = {
if (other == CheckpointInstance.MaxValue) return true
version <= other.version
}
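/**
* Returns the paths of the files that make up this checkpoint: a single file for a
* singular checkpoint, or one file per part for a multi-part checkpoint.
*/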
def getCorrespondingFiles(path: Path): Seq[Path] = {
assert(this != CheckpointInstance.MaxValue, "Can't get files for CheckpointInstance.MaxValue.")
numParts match {
case None => checkpointFileSingular(path, version) :: Nil
case Some(parts) => checkpointFileWithParts(path, version, parts)
}
}
override def compare(that: CheckpointInstance): Int = {
if (version == that.version) {
numParts.getOrElse(1) - that.numParts.getOrElse(1)
} else {
// We need to guard against overflow; we can't simply return (version - that.version).toInt.
if (version - that.version < 0) -1 else 1
}
}
}
object CheckpointInstance {
def apply(path: Path): CheckpointInstance = {
CheckpointInstance(checkpointVersion(path), numCheckpointParts(path))
}
def apply(metadata: CheckpointMetaData): CheckpointInstance = {
CheckpointInstance(metadata.version, metadata.parts)
}
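// A sentinel used as an upper bound when searching for checkpoints. The comparison
// methods in CheckpointInstance special-case it rather than relying on its version of -1.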
val MaxValue: CheckpointInstance = CheckpointInstance(-1, None)
}
trait Checkpoints extends DeltaLogging {
self: DeltaLog =>
def logPath: Path
def dataPath: Path
def snapshot: Snapshot
protected def store: LogStore
protected def metadata: Metadata
/** Used to clean up stale log files. */
protected def doLogCleanup(): Unit
/** The path to the file that holds metadata about the most recent checkpoint. */
val LAST_CHECKPOINT = new Path(logPath, "_last_checkpoint")
/** Creates a checkpoint at the current log version. */
def checkpoint(): Unit = recordDeltaOperation(this, "delta.checkpoint") {
val checkpointMetaData = checkpoint(snapshot)
val json = JsonUtils.toJson(checkpointMetaData)
store.write(LAST_CHECKPOINT, Iterator(json), overwrite = true)
doLogCleanup()
}
protected def checkpoint(snapshotToCheckpoint: Snapshot): CheckpointMetaData = {
Checkpoints.writeCheckpoint(spark, this, snapshotToCheckpoint)
}
/** Returns information about the most recent checkpoint. */
private[delta] def lastCheckpoint: Option[CheckpointMetaData] = {
loadMetadataFromFile(0)
}
/** Loads the checkpoint metadata from the _last_checkpoint file. */
private def loadMetadataFromFile(tries: Int): Option[CheckpointMetaData] = {
try {
val checkpointMetadataJson = store.read(LAST_CHECKPOINT)
val checkpointMetadata =
JsonUtils.mapper.readValue[CheckpointMetaData](checkpointMetadataJson.head)
Some(checkpointMetadata)
} catch {
case _: FileNotFoundException =>
None
case NonFatal(e) if tries < 3 =>
logWarning(
s"Failed to parse $LAST_CHECKPOINT. This may happen if there was an error during the " +
"read operation, or because the file appears to be partial. Sleeping and trying again.",
e
)
Thread.sleep(1000)
loadMetadataFromFile(tries + 1)
case NonFatal(e) =>
logWarning(s"$LAST_CHECKPOINT is corrupted. Will search the checkpoint files directly", e)
// Hit a partial file. This could happen on Azure, as overwriting the _last_checkpoint
// file is not atomic there. We will list all the files to find the latest complete
// checkpoint and restore the CheckpointMetaData from it.
val verifiedCheckpoint = findLastCompleteCheckpoint(CheckpointInstance(-1L, None))
verifiedCheckpoint.map(manuallyLoadCheckpoint)
}
}
/** Loads the given checkpoint manually to come up with the CheckpointMetaData */
protected def manuallyLoadCheckpoint(cv: CheckpointInstance): CheckpointMetaData = {
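// The size is unknown (-1) because only the version and part count can be recovered
// from the checkpoint file names without reading the files themselves.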
CheckpointMetaData(cv.version, -1L, cv.numParts)
}
/**
* Finds the most recent verified, complete checkpoint before the given instance, walking
* backwards through the log in windows of 1000 versions at a time.
*
* @param cv the CheckpointInstance to compare against
*/
protected def findLastCompleteCheckpoint(cv: CheckpointInstance): Option[CheckpointInstance] = {
var cur = math.max(cv.version, 0L)
while (cur >= 0) {
val checkpoints = store
.listFrom(checkpointPrefix(logPath, math.max(0, cur - 1000)))
.map(_.getPath)
.filter(isCheckpointFile)
.map(CheckpointInstance(_))
.takeWhile(tv => (cur == 0 || tv.version <= cur) && tv.isEarlierThan(cv))
.toArray
val lastCheckpoint = getLatestCompleteCheckpointFromList(checkpoints, cv)
if (lastCheckpoint.isDefined) {
return lastCheckpoint
} else {
cur -= 1000
}
}
None
}
/**
* Given a list of checkpoint files, pick the latest complete checkpoint instance which is not
* later than `notLaterThan`.
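*
* A multi-part checkpoint counts as complete only if all of its parts are present; for
* example, a 3-part checkpoint at version 10 is considered only when all 3 of its part
* files appear in `instances`.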
*/
protected def getLatestCompleteCheckpointFromList(
instances: Array[CheckpointInstance],
notLaterThan: CheckpointInstance
): Option[CheckpointInstance] = {
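// Group identical (version, numParts) instances together: a singular checkpoint must
// appear exactly once, and a multi-part checkpoint must have exactly as many files as
// its declared part count.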
val complete = instances.filter(_.isNotLaterThan(notLaterThan)).groupBy(identity).filter {
case (CheckpointInstance(_, None), inst) => inst.length == 1
case (CheckpointInstance(_, Some(parts)), inst) => inst.length == parts
}
complete.keys.toArray.sorted.lastOption
}
}
object Checkpoints {
/**
* Writes out the contents of a [[Snapshot]] into a checkpoint file that
* can be used to short-circuit future replays of the log.
*
* Returns the checkpoint metadata to be committed to the _last_checkpoint file. We will
* use the value in this file as the source of truth of the last valid checkpoint.
*/
private[delta] def writeCheckpoint(
spark: SparkSession,
deltaLog: DeltaLog,
snapshot: Snapshot
): CheckpointMetaData = {
import SingleAction._
val (factory, serConf) = {
val format = new ParquetFileFormat()
val job = Job.getInstance()
(
format.prepareWrite(spark, job, Map.empty, Action.logSchema),
new SerializableConfiguration(job.getConfiguration)
)
}
// The writing of checkpoints doesn't go through the log store, so we need to ask the
// log store whether partial writes are visible here and decide whether to use rename.
val useRename = deltaLog.store.isPartialWriteVisible(deltaLog.logPath)
val checkpointSize = spark.sparkContext.longAccumulator("checkpointSize")
val numOfFiles = spark.sparkContext.longAccumulator("numOfFiles")
// Use the string in the closure as Path is not Serializable.
val path = checkpointFileSingular(snapshot.path, snapshot.version).toString
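// Repartition to a single partition so that exactly one task writes the checkpoint as a
// single file, and `collect().head` below returns that task's output path.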
val writtenPath = snapshot.state
.repartition(1)
.map { action =>
if (action.add != null) {
numOfFiles.add(1)
}
action
}
.queryExecution // This is a hack to get Spark to write directly to a file.
.executedPlan
.execute()
.mapPartitions { iter =>
val writtenPath =
if (useRename) {
val p = new Path(path)
// Two instances of the same task may run at the same time in some cases (e.g.,
// speculation, stage retry), so generate the temp path here to avoid two tasks
// using the same path.
val tempPath = new Path(p.getParent, s".${p.getName}.${UUID.randomUUID}.tmp")
DeltaFileOperations.registerTempFileDeletionTaskFailureListener(serConf.value, tempPath)
tempPath.toString
} else {
path
}
try {
val writer = factory.newInstance(
writtenPath,
Action.logSchema,
new TaskAttemptContextImpl(
  new JobConf(serConf.value),
  new TaskAttemptID("", 0, TaskType.REDUCE, 0, 0))
)
iter.foreach { row =>
checkpointSize.add(1)
writer.write(row)
}
writer.close()
} catch {
case e: org.apache.hadoop.fs.FileAlreadyExistsException if !useRename =>
val p = new Path(writtenPath)
if (p.getFileSystem(serConf.value).exists(p)) {
// The file has been written by a zombie task. We can just use this checkpoint file
// rather than failing a Delta commit.
} else {
throw e
}
}
Iterator(writtenPath)
}
.collect()
.head
if (useRename) {
val src = new Path(writtenPath)
val dest = new Path(path)
val fs = dest.getFileSystem(spark.sessionState.newHadoopConf())
var renameDone = false
try {
if (fs.rename(src, dest)) {
renameDone = true
} else {
// There should be only one writer writing the checkpoint file, so there must be
// something wrong here.
throw new IllegalStateException(s"Cannot rename $src to $dest")
}
} finally {
if (!renameDone) {
fs.delete(src, false)
}
}
}
if (numOfFiles.value != snapshot.numOfFiles) {
throw new IllegalStateException("State of the checkpoint doesn't match that of the snapshot.")
}
CheckpointMetaData(snapshot.version, checkpointSize.value, None)
}
}
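
// A minimal usage sketch (hypothetical table path; `DeltaLog` mixes in the `Checkpoints`
// trait through its self-type):
//
//   val deltaLog = DeltaLog.forTable(spark, "/tmp/delta/events")
//   deltaLog.checkpoint() // writes the checkpoint file for the current snapshot version
//                         // and overwrites _last_checkpoint with its CheckpointMetaData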