All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.sql.delta.util.FileNames.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (2020) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.util

import org.apache.spark.sql.delta.DeltaErrors
import org.apache.hadoop.fs.{ FileStatus, Path }

/** Helper for creating file names for specific commits / checkpoints. */
object FileNames {

  val deltaFilePattern      = "\\d+\\.json".r.pattern
  val checksumFilePattern   = "\\d+\\.crc".r.pattern
  val checkpointFilePattern = "\\d+\\.checkpoint(\\.\\d+\\.\\d+)?\\.parquet".r.pattern

  /** Returns the path for a given delta file. */
  def deltaFile(path: Path, version: Long): Path = new Path(path, f"$version%020d.json")

  /** Returns the path for a given sample file */
  def sampleFile(path: Path, version: Long): Path = new Path(path, f"$version%020d")

  /** Returns the path to the checksum file for the given version. */
  def checksumFile(path: Path, version: Long): Path = new Path(path, f"$version%020d.crc")

  /** Returns the version for the given delta path. */
  def deltaVersion(path: Path): Long = path.getName.stripSuffix(".json").toLong

  /** Returns the version for the given checksum file. */
  def checksumVersion(path: Path): Long = path.getName.stripSuffix(".crc").toLong

  /**
    * Returns the prefix of all checkpoint files for the given version.
    *
    * Intended for use with listFrom to get all files from this version onwards. The returned Path
    * will not exist as a file.
    */
  def checkpointPrefix(path: Path, version: Long): Path =
    new Path(path, f"$version%020d.checkpoint")

  /**
    * Returns the path for a singular checkpoint up to the given version.
    *
    * In a future protocol version this path will stop being written.
    */
  def checkpointFileSingular(path: Path, version: Long): Path =
    new Path(path, f"$version%020d.checkpoint.parquet")

  /**
    * Returns the paths for all parts of the checkpoint up to the given version.
    *
    * In a future protocol version we will write this path instead of checkpointFileSingular.
    *
    * Example of the format: 00000000000000004915.checkpoint.0000000020.0000000060.parquet is
    * checkpoint part 20 out of 60 for the snapshot at version 4915. Zero padding is for
    * lexicographic sorting.
    */
  def checkpointFileWithParts(path: Path, version: Long, numParts: Int): Seq[Path] = {
    Range(1, numParts + 1)
      .map(i => new Path(path, f"$version%020d.checkpoint.$i%010d.$numParts%010d.parquet"))
  }

  def numCheckpointParts(path: Path): Option[Int] = {
    val segments = path.getName.split("\\.")

    if (segments.size != 5) None else Some(segments(3).toInt)
  }

  def isCheckpointFile(path: Path): Boolean = checkpointFilePattern.matcher(path.getName).matches()

  def isDeltaFile(path: Path): Boolean = deltaFilePattern.matcher(path.getName).matches()

  def isChecksumFile(path: Path): Boolean = checksumFilePattern.matcher(path.getName).matches()

  def checkpointVersion(path: Path): Long = path.getName.split("\\.")(0).toLong

  /**
    * Get the version of the checkpoint, checksum or delta file. Throws an error if the log file
    * path is unexpected. Must be used after whitelisting the list of files during a transaction
    * log listing for forwards compatibility.
    */
  def getFileVersion(path: Path): Long = {
    if (isCheckpointFile(path)) {
      checkpointVersion(path)
    } else if (isDeltaFile(path)) {
      deltaVersion(path)
    } else if (isChecksumFile(path)) {
      checksumVersion(path)
    } else {
      // scalastyle:off throwerror
      throw new AssertionError(
        "The file listing needs to be whitelisted to know files before this method can be called."
      )
      // scalastyle:on throwerror
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy