bio.ferlab.datalake.spark3.loader.JdbcLoader.scala
Library built on top of Apache Spark to speed up data lake development.
package bio.ferlab.datalake.spark3.loader
import org.apache.spark.sql.functions.coalesce
import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession}
import java.time.LocalDate
object JdbcLoader extends Loader {
/**
* Default read logic for a loader
*
* @param location absolute path of where the data is
* @param format string representing the Spark data source format (e.g. "jdbc")
* @param readOptions read options
* @param databaseName Optional database name
* @param tableName Optional table name
* @param spark spark session
* @return the data as a dataframe
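* @example Usage sketch, assuming an implicit SparkSession is in scope; the URL, credentials, driver and table
*          below are placeholder values, and `location` is not referenced by this implementation:
* {{{
* val df = JdbcLoader.read(
*   location = "",                                    // ignored by the JDBC loader
*   format = "jdbc",
*   readOptions = Map(
*     "url"      -> "jdbc:postgresql://host:5432/db", // placeholder connection string
*     "user"     -> "user",                           // placeholder credentials
*     "password" -> "password",
*     "driver"   -> "org.postgresql.Driver",
*     "dbtable"  -> "public.my_table"                 // or a "query" instead of "dbtable"
*   ),
*   databaseName = Some("public"),
*   tableName = Some("my_table")
* )
* }}}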
*/
override def read(location: String,
format: String,
readOptions: Map[String, String],
databaseName: Option[String],
tableName: Option[String])(implicit spark: SparkSession): DataFrame = {
require(readOptions.isDefinedAt("url"), "Expecting [url] to be defined in readOptions.")
require(readOptions.isDefinedAt("user"), "Expecting [user] to be defined in readOptions.")
require(readOptions.isDefinedAt("password"), "Expecting [password] to be defined in readOptions.")
require(readOptions.isDefinedAt("driver"), "Expecting [driver] to be defined in readOptions.")
require(readOptions.isDefinedAt("query") || readOptions.isDefinedAt("dbtable"),
"Expecting either [query] or [dbtable] to be defined in readOptions.")
spark
.read
.format(format)
.options(readOptions)
.load()
}
/**
* @param location where to write the data
* @param databaseName database name
* @param tableName table name
* @param df new data to write into the table
* @param partitioning how the data is partitioned
* @param format Spark data source format
* @param options write options
* @param spark a spark session
* @return updated data
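* @example Usage sketch (placeholder schema, table and JDBC options; implicit SparkSession in scope);
*          the table `databaseName.tableName` is fully overwritten:
* {{{
* JdbcLoader.writeOnce(
*   location = "",               // ignored by the JDBC loader
*   databaseName = "public",     // placeholder schema
*   tableName = "my_table",      // placeholder table
*   df = someDataFrame,          // hypothetical DataFrame to persist
*   partitioning = List.empty,
*   format = "jdbc",
*   options = Map(
*     "url"      -> "jdbc:postgresql://host:5432/db",
*     "user"     -> "user",
*     "password" -> "password",
*     "driver"   -> "org.postgresql.Driver"
*   )
* )
* }}}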
*/
override def writeOnce(location: String,
databaseName: String,
tableName: String,
df: DataFrame,
partitioning: List[String],
format: String,
options: Map[String, String])(implicit spark: SparkSession): DataFrame = {
df
.write
.format(format)
.mode(SaveMode.Overwrite)
.options(options + ("dbtable" -> s"$databaseName.$tableName"))
.save()
df
}
/**
* Insert or append data into a table.
* Does not resolve duplicates.
*
* @param location full path of where the data will be located
* @param databaseName database name
* @param tableName the name of the updated/created table
* @param updates new data to be appended to the existing data
* @param partitioning how the data is partitioned
* @param format Spark data source format
* @param options write options
* @param spark a valid spark session
* @return the data as a dataframe
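* @example Usage sketch (same placeholder JDBC options as in `writeOnce`; implicit SparkSession in scope);
*          rows are appended to `databaseName.tableName` without any de-duplication:
* {{{
* JdbcLoader.insert(
*   location = "",             // ignored by the JDBC loader
*   databaseName = "public",   // placeholder schema
*   tableName = "my_table",    // placeholder table
*   updates = newRowsDf,       // hypothetical DataFrame of rows to append
*   partitioning = List.empty,
*   format = "jdbc",
*   options = jdbcOptions      // hypothetical Map with url/user/password/driver
* )
* }}}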
*/
override def insert(location: String,
databaseName: String,
tableName: String,
updates: DataFrame,
partitioning: List[String],
format: String,
options: Map[String, String])(implicit spark: SparkSession): DataFrame = {
updates
.write
.format(format)
.mode(SaveMode.Append)
.options(options + ("dbtable" -> s"$databaseName.$tableName"))
.save()
updates
}
/**
* Update or insert data into a table.
* Resolves duplicates by using the list of primary keys passed as argument.
*
* @param location full path of where the data will be located
* @param databaseName database name
* @param tableName the name of the updated/created table
* @param updates new data to be merged with existing data
* @param primaryKeys name of the columns holding the unique id
* @param partitioning how the data is partitioned
* @param format Spark data source format
* @param options write options
* @param spark a valid spark session
* @return the data as a dataframe
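* @example Usage sketch (placeholder schema, table, key column and JDBC options; implicit SparkSession in scope);
*          rows whose primary keys match an existing row are merged, the others are inserted:
* {{{
* JdbcLoader.upsert(
*   location = "",             // ignored by the JDBC loader
*   databaseName = "public",   // placeholder schema
*   tableName = "my_table",    // placeholder table
*   updates = changedRowsDf,   // hypothetical DataFrame containing new and changed rows
*   primaryKeys = Seq("id"),   // hypothetical primary key column
*   partitioning = List.empty,
*   format = "jdbc",
*   options = jdbcOptions      // hypothetical Map with url/user/password/driver
* )
* }}}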
*/
override def upsert(location: String,
databaseName: String,
tableName: String,
updates: DataFrame,
primaryKeys: Seq[String],
partitioning: List[String],
format: String,
options: Map[String, String])(implicit spark: SparkSession): DataFrame = {
import spark.implicits._
require(primaryKeys.nonEmpty, "Primary keys are required for an Upsert write.")
require(primaryKeys.forall(updates.columns.contains), s"Columns [${primaryKeys.mkString(", ")}] are required in the DataFrame.")
val readOptions = options + ("dbtable" -> s"$databaseName.$tableName")
val existingDf = read(location, format, readOptions, Some(databaseName), Some(tableName))
.persist() // Make sure the existing table is read only once, before it gets overwritten below
val updatedDf = if (existingDf.isEmpty) updates
else {
// JDBC sources provide no native upsert operation:
// do the merge in Spark, then overwrite the table with the result
val keysAreIdentical: Column = primaryKeys.map(col => $"new.$col" <=> $"existing.$col").reduce(_ && _) // null-safe equality on every primary key
val updatedColumns: Seq[Column] = updates.columns.map(col => coalesce($"new.$col", $"existing.$col") as col) // prefer the new value, fall back to the existing one
updates.as("new")
.join(existingDf.as("existing"), keysAreIdentical, "full")
.select(updatedColumns: _*)
}
writeOnce(location, databaseName, tableName, updatedDf, partitioning, format, options)
}
/**
* Update the data only if the data has changed.
* Insert new data.
* Maintains updatedOn and createdOn timestamps for each record.
* Usually used for dimension tables for which keeping the full history is not required.
*
* @param location full path of where the data will be located
* @param databaseName database name
* @param tableName the name of the updated/created table
* @param updates new data to be merged with existing data
* @param primaryKeys name of the columns holding the unique id
* @param oidName name of the column holding the hash of the columns that can change over time (or version number)
* @param createdOnName name of the column holding the creation timestamp
* @param updatedOnName name of the column holding the last update timestamp
* @param partitioning how the data is partitioned
* @param format Spark data source format
* @param options write options
* @param spark a valid spark session
* @return the data as a dataframe
*/
override def scd1(location: String,
databaseName: String,
tableName: String,
updates: DataFrame,
primaryKeys: Seq[String],
oidName: String,
createdOnName: String,
updatedOnName: String,
partitioning: List[String],
format: String,
options: Map[String, String])(implicit spark: SparkSession): DataFrame = ???
/**
* Keep the full history of changes (slowly changing dimension, type 2).
* A changed record closes the current version and inserts a new one; a new record is inserted as the current version.
* Usually used for dimension tables for which keeping the full history is required.
*
* @param location full path of where the data will be located
* @param databaseName database name
* @param tableName the name of the updated/created table
* @param updates new data to be merged with existing data
* @param primaryKeys name of the columns holding the unique id
* @param buidName name of the column holding the business unique id of a record
* @param oidName name of the column holding the hash of the columns that can change over time (or version number)
* @param isCurrentName name of the column flagging the current version of a record
* @param partitioning how the data is partitioned
* @param format Spark data source format
* @param validFromName name of the column holding the start date of a version's validity
* @param validToName name of the column holding the end date of a version's validity
* @param options write options
* @param minValidFromDate minimum date used for the validFrom column
* @param maxValidToDate maximum date used for the validTo column
* @param spark a valid spark session
* @return the data as a dataframe
*/
override def scd2(location: String,
databaseName: String,
tableName: String,
updates: DataFrame,
primaryKeys: Seq[String],
buidName: String,
oidName: String,
isCurrentName: String,
partitioning: List[String],
format: String,
validFromName: String,
validToName: String,
options: Map[String, String],
minValidFromDate: LocalDate,
maxValidToDate: LocalDate)(implicit spark: SparkSession): DataFrame = ???
/**
* Keeps existing partitions and overwrites the partitions present in the new data.
*
* @param location where to write the data
* @param databaseName database name
* @param tableName table name
* @param df new data to write into the table
* @param partitioning how the data is partitioned
* @param format Spark data source format
* @param options write options
* @param spark a spark session
* @return updated data
*/
override def overwritePartition(location: String,
databaseName: String,
tableName: String,
df: DataFrame,
partitioning: List[String],
format: String,
options: Map[String, String])(implicit spark: SparkSession): DataFrame = ???
}