bio.ferlab.datalake.spark3.utils.CsvUtils.scala Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of datalake-spark3_2.12 Show documentation

Library built on top of Apache Spark to speed up data lake development.

There is a newer version: 14.8.0

package bio.ferlab.datalake.spark3.utils

import bio.ferlab.datalake.commons.config.Format.CSV
import bio.ferlab.datalake.commons.config.{Coalesce, Configuration, DatasetConf, FixedRepartition}
import bio.ferlab.datalake.commons.file.FileSystemResolver
import bio.ferlab.datalake.spark3.implicits.DatasetConfImplicits.DatasetConfOperations
import org.apache.spark.sql.{DataFrame, SparkSession}

object CsvUtils {

  /**
   * Renames the CSV output file if the destination format is CSV and data is repartitioned into a single file.
   *
   * When writing to CSV format, Spark adds the partition information to the filename. This function replaces the
   * partition info with the table name. It also deletes the unnecessary `_SUCCESS` file created by Spark.
   *
   * Files are only touched when the destination format is CSV and its repartition is exactly `FixedRepartition(1)`
   * or `Coalesce(1)`. Within that case, each file operation is skipped when its target is absent from the
   * destination folder: the rename is skipped if no file starting with `part-` is listed, and the deletion is
   * skipped if no `_SUCCESS` file is listed. This makes the function safe to call more than once on the same
   * destination.
   *
   * @param mainDestination The mainDestination [[DatasetConf]] of the ETL
   * @param suffix          Optional, adds a suffix to the file name, before the extension
   * @param spark           An instance of [[SparkSession]]
   * @param conf            The ETL [[Configuration]]
   * @return The destination loaded as a Dataframe (read after the rename, when one took place)
   * @example
   * This function would rename this CSV file :
   * {{{
   *   published/nom_du_projet/nom_de_la_table/part-00000-3afd3298-a186-4289-8ba3-3bf55d27953f-c000.csv
   * }}}
   * to :
   * {{{
   *   published/nom_du_projet/nom_de_la_table/nom_de_la_table_suffix.csv
   * }}}
   * where suffix could be : `v1_0_0`, `2020_01_01`, etc.
   */
  def renameCsvFile(mainDestination: DatasetConf, suffix: Option[String] = None)
                   (implicit spark: SparkSession, conf: Configuration): DataFrame = {
    // Only a CSV destination explicitly written as a single partition is eligible for renaming.
    val isSingleFileCsv = mainDestination.format == CSV &&
      mainDestination.repartition.exists(r => r == FixedRepartition(1) || r == Coalesce(1))

    if (isSingleFileCsv) {
      val fs = FileSystemResolver.resolve(conf.getStorage(mainDestination.storageid).filesystem)
      val files = fs.list(mainDestination.location, recursive = false)

      // The new file name is the last segment of the dataset path, plus the optional "_suffix".
      val tableName = mainDestination.path.split("/").last
      val suffixPart = suffix.fold("")(s => s"_$s")
      val newPath = s"${mainDestination.location}/$tableName$suffixPart.csv"

      // `find` + `foreach` instead of `filter` + `head`: a missing file is skipped rather than
      // raising NoSuchElementException (e.g. the file was already renamed by a previous call).
      files
        .find(_.name.startsWith("part-"))
        .foreach(partFile => fs.move(partFile.path, newPath, overwrite = true))

      // The `_SUCCESS` marker is removed after the move, and only if it is actually present.
      files
        .find(_.name == "_SUCCESS")
        .foreach(successFile => fs.remove(successFile.path))
    }
    mainDestination.read
  }

}