package com.acxiom.pipeline.connectors

import com.acxiom.pipeline.Constants
import com.acxiom.pipeline.steps.{DataFrameReaderOptions, DataFrameWriterOptions, StreamingTriggerOptions}
import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter, OutputMode}
import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Dataset, SparkSession}

import java.util.Date

object DataConnectorUtilities {
  /**
    * Builds a DataFrameReader configured with the format, options, and optional schema from the
    * provided DataFrameReaderOptions.
    *
    * @param sparkSession The current spark session to use.
    * @param options      A DataFrameReaderOptions object for configuring the reader.
    * @return A DataFrameReader based on the provided options.
    */
  def buildDataFrameReader(sparkSession: SparkSession, options: DataFrameReaderOptions): DataFrameReader = {
    val reader = sparkSession.read
      .format(options.format)
      .options(options.options.getOrElse(Map[String, String]()))

    options.schema.map(s => reader.schema(s.toStructType())).getOrElse(reader)
  }
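
  /* Usage sketch (illustrative only, not part of the original source): assuming DataFrameReaderOptions
   * is a case class exposing the `format` and `options` members used above, and that the file path is
   * a placeholder, building a CSV reader might look like:
   *
   * {{{
   * val readerOptions = DataFrameReaderOptions(format = "csv",
   *   options = Some(Map("header" -> "true", "inferSchema" -> "true")))
   * // The returned DataFrameReader is fully configured; load() produces the DataFrame.
   * val df = DataConnectorUtilities.buildDataFrameReader(sparkSession, readerOptions)
   *   .load("/tmp/input/customers.csv")
   * }}}
   */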

  /**
    * Builds a DataFrameWriter configured with the format, save mode, options, and any bucketing,
    * partitioning, or sorting settings from the provided DataFrameWriterOptions.
    *
    * @param dataFrame A DataFrame to write.
    * @param options   A DataFrameWriterOptions object for configuring the writer.
    * @return A DataFrameWriter[T] based on the provided options.
    */
  def buildDataFrameWriter[T](dataFrame: Dataset[T], options: DataFrameWriterOptions): DataFrameWriter[T] = {
    val writer = dataFrame.write.format(options.format)
      .mode(options.saveMode)
      .options(options.options.getOrElse(Map[String, String]()))

    val w1 = if (options.bucketingOptions.isDefined && options.bucketingOptions.get.columns.nonEmpty) {
      val bucketingOptions = options.bucketingOptions.get
      writer.bucketBy(bucketingOptions.numBuckets, bucketingOptions.columns.head, bucketingOptions.columns.drop(1): _*)
    } else {
      writer
    }
    val w2 = if (options.partitionBy.isDefined && options.partitionBy.get.nonEmpty) {
      w1.partitionBy(options.partitionBy.get: _*)
    } else {
      w1
    }
    if (options.sortBy.isDefined && options.sortBy.get.nonEmpty) {
      val sortBy = options.sortBy.get
      w2.sortBy(sortBy.head, sortBy.drop(1): _*)
    } else {
      w2
    }
  }
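
  /* Usage sketch (illustrative only): assuming DataFrameWriterOptions is a case class exposing the
   * `format`, `saveMode`, and `partitionBy` members used above (with partitionBy as an optional list
   * of column names), writing a partitioned parquet output might look like:
   *
   * {{{
   * val writerOptions = DataFrameWriterOptions(format = "parquet", saveMode = "Overwrite",
   *   partitionBy = Some(List("region")))
   * DataConnectorUtilities.buildDataFrameWriter(df, writerOptions).save("/tmp/output/customers")
   * }}}
   */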

  /**
   * Builds a DataStreamReader configured with the format, options, and optional schema from the
   * provided DataFrameReaderOptions.
   *
   * @param sparkSession The current spark session to use.
   * @param options      A DataFrameReaderOptions object for configuring the reader.
   * @return A DataStreamReader based on the provided options.
   */
  def buildDataStreamReader(sparkSession: SparkSession, options: DataFrameReaderOptions): DataStreamReader = {
    val reader = sparkSession.readStream
      .format(options.format)
      .options(options.options.getOrElse(Map[String, String]()))
    options.schema.map(s => reader.schema(s.toStructType())).getOrElse(reader)
  }

  /**
    * Build a DataStreamWriter that automatically adds the checkpointLocation if not provided and applies partition
    * information.
    *
    * @param dataFrame    A DataFrame to write.
    * @param writeOptions A DataFrameWriterOptions object for configuring the writer.
    * @param path         The path to write the data.
    * @return A DataStreamWriter[T] based on the provided options.
    */
  def buildDataStreamWriter[T](dataFrame: Dataset[T], writeOptions: DataFrameWriterOptions, path: String): DataStreamWriter[T] = {
    val options = writeOptions.options.getOrElse(Map[String, String]())
    val finalOptions = if (!options.contains("checkpointLocation")) {
      options + ("checkpointLocation" ->
        s"${path.substring(0, path.lastIndexOf("/"))}/streaming_checkpoints_${Constants.FILE_APPEND_DATE_FORMAT.format(new Date())}")
    } else {
      options
    }
    val mode = writeOptions.saveMode.toLowerCase() match {
      case "overwrite" | "complete" => OutputMode.Complete()
      case "update" => OutputMode.Update()
      case _ => OutputMode.Append()
    }
    val writer = dataFrame.writeStream
      .format(writeOptions.format)
      .outputMode(mode)
      .option("path", path).options(finalOptions)
      .trigger(writeOptions.triggerOptions.getOrElse(StreamingTriggerOptions()).getTrigger)
    addPartitionInformation(writer, writeOptions)
  }
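
  /* Usage sketch (illustrative only): pairing buildDataStreamReader with buildDataStreamWriter. The
   * checkpointLocation is derived from the supplied path unless one is already present in the options,
   * and the save mode is translated to a streaming OutputMode. The constructor arguments, the "rate"
   * source, and the output path below are placeholders, not values taken from the original source.
   *
   * {{{
   * val streamDf = DataConnectorUtilities
   *   .buildDataStreamReader(sparkSession, DataFrameReaderOptions(format = "rate"))
   *   .load()
   * val query = DataConnectorUtilities
   *   .buildDataStreamWriter(streamDf,
   *     DataFrameWriterOptions(format = "parquet", saveMode = "Append"), "/tmp/output/stream")
   *   .start()
   * }}}
   */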

  /**
    * Adds partition information to the provided DataStreamWriter when partition columns are specified.
    *
    * @param writer       The DataStreamWriter[T] to configure.
    * @param writeOptions A DataFrameWriterOptions object for configuring the writer.
    * @return A DataStreamWriter[T] configured with partitioning if applicable.
    */
  def addPartitionInformation[T](writer: DataStreamWriter[T], writeOptions: DataFrameWriterOptions): DataStreamWriter[T] = {
    if (writeOptions.partitionBy.isDefined && writeOptions.partitionBy.get.nonEmpty) {
      writer.partitionBy(writeOptions.partitionBy.get: _*)
    } else {
      writer
    }
  }
}



