bio.ferlab.datalake.spark3.loader.ElasticsearchLoader.scala

package bio.ferlab.datalake.spark3.loader

import bio.ferlab.datalake.spark3.elasticsearch.{ElasticSearchClient, EsWriteOptions}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.elasticsearch.spark.sql._

import java.time.LocalDate

object ElasticsearchLoader extends Loader {

  /**
   * Default read logic for a loader
   *
   * @param location     absolute path of where the data is
   * @param format       string representing the format
   * @param readOptions  read options
   * @param databaseName Optional database name
   * @param tableName    Optional table name
   * @param spark        spark session
   * @return the data as a dataframe
   */
  override def read(location: String,
                    format: String,
                    readOptions: Map[String, String],
                    databaseName: Option[String],
                    tableName: Option[String])(implicit spark: SparkSession): DataFrame = {
    spark.sqlContext.read.format("es").load(location)
  }
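
  // A minimal usage sketch (the index name is illustrative; connection settings such as "es.nodes"
  // are assumed to be configured on the Spark session). Note that `format`, `readOptions`,
  // `databaseName` and `tableName` are accepted but not used above: the index is always read
  // through the "es" data source at `location`.
  //
  //   implicit val spark: SparkSession = SparkSession.builder().getOrCreate()
  //   val df: DataFrame = ElasticsearchLoader.read("my_index", "es", Map.empty, None, None)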

  /**
   * Keeps old partitions and overwrites new partitions.
   *
   * @param location     where to write the data
   * @param databaseName database name
   * @param tableName    table name
   * @param df           new data to write into the table
   * @param partitioning how the data is partitioned
   * @param format       format
   * @param options      write options
   * @param spark        a spark session
   * @return updated data
   */
  override def overwritePartition(location: String,
                                  databaseName: String,
                                  tableName: String,
                                  df: DataFrame,
                                  partitioning: List[String],
                                  format: String,
                                  options: Map[String, String])(implicit spark: SparkSession): DataFrame = {

    val es_url: String = options(EsWriteOptions.ES_URL)
    val es_username: Option[String] = options.get(EsWriteOptions.ES_USERNAME)
    val es_password: Option[String] = options.get(EsWriteOptions.ES_PASSWORD)
    implicit val esClient: ElasticSearchClient = new ElasticSearchClient(es_url, es_username, es_password)
    val ES_config = Map("es.write.operation" -> "index")

    options.get(EsWriteOptions.ES_INDEX_TEMPLATE_PATH).foreach(path => setupIndex(tableName, path))

    df.saveToEs(s"$location/_doc", ES_config)
    publish(tableName, location)
    df
  }
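
  // A hedged usage sketch, assuming an implicit SparkSession and a DataFrame `df` are in scope
  // (the URL, credentials, index name and template path below are illustrative):
  //
  //   val writeOptions = Map(
  //     EsWriteOptions.ES_URL                 -> "http://localhost:9200",
  //     EsWriteOptions.ES_USERNAME            -> "elastic",
  //     EsWriteOptions.ES_PASSWORD            -> "changeme",
  //     EsWriteOptions.ES_INDEX_TEMPLATE_PATH -> "templates/my_index_template.json"
  //   )
  //   // Writes `df` into the "my_index_v2" index, then points the "my_index" alias at it.
  //   ElasticsearchLoader.overwritePartition("my_index_v2", "database", "my_index", df, Nil, "es", writeOptions)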

  /**
   * Overwrites the data located in output/tableName
   * usually used for small/test tables.
   *
   * @param location     where to write the data
   * @param databaseName database name
   * @param tableName    table name
   * @param df           new data to write into the table
   * @param partitioning how the data is partitioned
   * @param format       format
   * @param options      write options
   * @param spark        a spark session
   * @return updated data
   */
  override def writeOnce(location: String,
                         databaseName: String,
                         tableName: String,
                         df: DataFrame,
                         partitioning: List[String],
                         format: String,
                         options: Map[String, String])(implicit spark: SparkSession): DataFrame = ???

  /**
   * Insert or append data into a table
   * Does not resolve duplicates
   *
   * @param location     full path of where the data will be located
   * @param databaseName database name
   * @param tableName    the name of the updated/created table
   * @param updates      new data to be merged with existing data
   * @param partitioning how the data should be partitioned
   * @param format       spark format
   * @param options      write options
   * @param spark        a valid spark session
   * @return the data as a dataframe
   */
  override def insert(location: String,
                      databaseName: String,
                      tableName: String,
                      updates: DataFrame,
                      partitioning: List[String],
                      format: String,
                      options: Map[String, String])(implicit spark: SparkSession): DataFrame = ???

  /**
   * Update or insert data into a table
   * Resolves duplicates by using the list of primary keys passed as argument
   *
   * @param location    full path of where the data will be located
   * @param tableName   the name of the updated/created table
   * @param updates     new data to be merged with existing data
   * @param primaryKeys name of the columns holding the unique id
   * @param spark       a valid spark session
   * @return the data as a dataframe
   */
  override def upsert(location: String,
                      databaseName: String,
                      tableName: String,
                      updates: DataFrame,
                      primaryKeys: Seq[String],
                      partitioning: List[String],
                      format: String,
                      options: Map[String, String])(implicit spark: SparkSession): DataFrame = ???

  /**
   * Update the data only if the data has changed
   * Insert new data
   * maintains updatedOn and createdOn timestamps for each record
   * usually used for dimension tables for which keeping the full history is not required.
   *
   * @param location      full path of where the data will be located
   * @param tableName     the name of the updated/created table
   * @param updates       new data to be merged with existing data
   * @param primaryKeys   name of the columns holding the unique id
   * @param oidName       name of the column holding the hash of the columns that can change over time (or version number)
   * @param createdOnName name of the column holding the creation timestamp
   * @param updatedOnName name of the column holding the last update timestamp
   * @param spark         a valid spark session
   * @return the data as a dataframe
   */
  override def scd1(location: String,
                    databaseName: String,
                    tableName: String,
                    updates: DataFrame,
                    primaryKeys: Seq[String],
                    oidName: String,
                    createdOnName: String,
                    updatedOnName: String,
                    partitioning: List[String],
                    format: String,
                    options: Map[String, String])(implicit spark: SparkSession): DataFrame = ???

  /**
   * Update the data only if the data has changed
   * Insert new data
   * maintains a validity period (validFrom/validTo) and a current flag for each record
   * usually used for dimension tables for which keeping the full history is required.
   *
   * @param location      full path of where the data will be located
   * @param tableName     the name of the updated/created table
   * @param updates       new data to be merged with existing data
   * @param primaryKeys   name of the columns holding the unique id
   * @param oidName       name of the column holding the hash of the columns that can change over time (or version number)
   * @param isCurrentName name of the column flagging the current version of each record
   * @param validFromName name of the column holding the start of the record's validity period
   * @param validToName   name of the column holding the end of the record's validity period
   * @param spark         a valid spark session
   * @return the data as a dataframe
   */
  override def scd2(location: String,
                    databaseName: String,
                    tableName: String,
                    updates: DataFrame,
                    primaryKeys: Seq[String],
                    buidName: String,
                    oidName: String,
                    isCurrentName: String,
                    partitioning: List[String],
                    format: String,
                    validFromName: String,
                    validToName: String,
                    options: Map[String, String],
                    minValidFromDate: LocalDate,
                    maxValidToDate: LocalDate)(implicit spark: SparkSession): DataFrame = ???

  def publish(alias: String,
              currentIndex: String,
              previousIndex: Option[String] = None)(implicit esClient: ElasticSearchClient): Unit = {
    esClient.setAlias(add = List(currentIndex), remove = List(), alias)
    esClient.setAlias(add = List(), remove = previousIndex.toList, alias)
  }
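
  // For instance, promoting a freshly written index while retiring the previous one
  // (index and alias names are illustrative):
  //
  //   publish(alias = "my_index", currentIndex = "my_index_v2", previousIndex = Some("my_index_v1"))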


  /**
   * Sets up an index: checks that the ES nodes are up, removes the old index and sets the template for this index.
   *
   * @param indexName    full index name
   * @param templatePath path of the template.json, expected to be available in the resources folder or readable by Spark
   * @param esClient     an instance of [[ElasticSearchClient]]
   */
  def setupIndex(indexName: String, templatePath: String)
                (implicit spark: SparkSession, esClient: ElasticSearchClient): Unit = {

    log.info(s"ElasticSearch 'isRunning' status: [${esClient.isRunning}]")
    log.info(s"ElasticSearch 'checkNodes' status: [${esClient.checkNodeRoles}]")
    esClient.deleteIndex(indexName)
    esClient.setTemplate(templatePath)

  }
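
  // For instance, as called from `overwritePartition` above (index name and template path are illustrative):
  //
  //   setupIndex("my_index_v2", "templates/my_index_template.json")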
}