// All downloads are free. Search and download functionality uses the official Maven repository.

// za.co.absa.pramen.extras.infofile.InfoFileGeneration.scala Maven / Gradle / Ivy

/*
 * Copyright 2022 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.pramen.extras.infofile

import com.typesafe.config.Config
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.LoggerFactory
import za.co.absa.pramen.extras.utils.ConfigUtils

import java.time.format.DateTimeFormatter
import java.time.{Instant, LocalDate, ZoneId, ZonedDateTime}

/**
  * Renders a JSON "info file" describing a processed partition (metadata plus a list of
  * checkpoints with record counts) and writes it as `_INFO` into the partition directory.
  *
  * Values taken from the configuration and from the arguments are inserted into the JSON
  * document verbatim (they are not JSON-escaped), so they must not contain characters such as
  * double quotes or backslashes.
  */
object InfoFileGeneration {
  /** Not read in this object; presumably used by callers to toggle info file generation. */
  val PLUGINS_GENERATE_INFO_FILE = "info.file.generate"
  /** Config key of the value rendered as `metadata.sourceApplication`. */
  val SOURCE_APPLICATION_KEY = "info.file.source.application"
  /** Config key of the value rendered as `metadata.country`. */
  val COUNTRY_KEY = "info.file.country"
  /** Config key of the value rendered as `metadata.historyType`. */
  val HISTORY_TYPE_KEY = "info.file.history.type"
  /** Config key of the `DateTimeFormatter` pattern used for checkpoint start/end times. */
  val TIMESTAMP_FORMAT_KEY = "info.file.timestamp.format"
  /** Config key of the `DateTimeFormatter` pattern used for `metadata.informationDate`. */
  val DATE_FORMAT_KEY = "info.file.date.format"

  private val log = LoggerFactory.getLogger(this.getClass)

  /**
    * Renders the info file (using the current time as the job finish time) and writes it to
    * `_INFO` under `outputPartitionPath`, then logs the path and the contents.
    *
    * @param pramenVersion       the version string rendered into each checkpoint.
    * @param timezoneId          the time zone used to format all instants.
    * @param sourceCount         the record count reported by the "Source" checkpoint.
    * @param rawCount            the record count reported by the "Raw" checkpoint.
    * @param publishCount        the record count reported by the publish checkpoints; must be
    *                            defined when `publishStart` is defined.
    * @param outputPartitionPath the directory in which the `_INFO` file is created.
    * @param infoDate            the information date rendered into the metadata section.
    * @param sourceStart         the start time of the "Source" checkpoint.
    * @param rawStart            the end time of "Source" and the start time of "Raw".
    * @param publishStart        the start time of the publish checkpoints; when empty, these
    *                            checkpoints are omitted.
    */
  def generateInfoFile(pramenVersion: String,
                       timezoneId: ZoneId,
                       sourceCount: Long,
                       rawCount: Long,
                       publishCount: Option[Long],
                       outputPartitionPath: Path,
                       infoDate: LocalDate,
                       sourceStart: Instant,
                       rawStart: Instant,
                       publishStart: Option[Instant])
                      (implicit spark: SparkSession, conf: Config): Unit = {
    val fs = outputPartitionPath.getFileSystem(spark.sparkContext.hadoopConfiguration)

    val infoFileContents = renderInfoFile(pramenVersion, timezoneId, sourceCount, rawCount, publishCount, infoDate, sourceStart, rawStart, Instant.now(), publishStart)

    val infoFilePath = new Path(outputPartitionPath, "_INFO")

    val out = fs.create(infoFilePath)
    try {
      // An explicit charset keeps the file contents independent of the JVM default encoding.
      out.write(infoFileContents.getBytes(java.nio.charset.StandardCharsets.UTF_8))
    } finally {
      // Always release the stream, even when the write fails.
      out.close()
    }

    log.info(s"The info file is saved to $infoFilePath:\n$infoFileContents")
  }

  /**
    * Renders the contents of the info file as a JSON string.
    *
    * The document always contains the "Source" and "Raw" checkpoints. When `publishStartOpt`
    * is defined, the "Standardization" and "Conformance" checkpoints are appended; both use
    * `publishCount` as the record count and `jobFinish` as the end time.
    *
    * The configuration keys listed in this object (except [[PLUGINS_GENERATE_INFO_FILE]]) are
    * passed to `ConfigUtils.validatePathsExistence` before being read (presumably this fails
    * when a key is missing - confirm against `ConfigUtils`).
    *
    * @param pramenVersion   the version string rendered into each checkpoint.
    * @param timezoneId      the time zone used to format all instants.
    * @param sourceCount     the record count reported by the "Source" checkpoint.
    * @param rawCount        the record count reported by the "Raw" checkpoint.
    * @param publishCount    the record count reported by the publish checkpoints.
    * @param infoDate        the information date rendered into the metadata section.
    * @param sourceStart     the start time of the "Source" checkpoint.
    * @param rawStart        the end time of "Source" and the start time of "Raw".
    * @param jobFinish       the end time of every checkpoint except "Source".
    * @param publishStartOpt the start time of the publish checkpoints, if publishing happened.
    * @return the rendered JSON document, terminated by a newline.
    * @throws NoSuchElementException if `publishStartOpt` is defined but `publishCount` is not.
    */
  def renderInfoFile(pramenVersion: String,
                     timezoneId: ZoneId,
                     sourceCount: Long,
                     rawCount: Long,
                     publishCount: Option[Long],
                     infoDate: LocalDate,
                     sourceStart: Instant,
                     rawStart: Instant,
                     jobFinish: Instant,
                     publishStartOpt: Option[Instant])
                    (implicit conf: Config): String = {
    ConfigUtils.validatePathsExistence(conf, "", Seq(SOURCE_APPLICATION_KEY, COUNTRY_KEY, HISTORY_TYPE_KEY, TIMESTAMP_FORMAT_KEY, DATE_FORMAT_KEY))

    val sourceApplication = conf.getString(SOURCE_APPLICATION_KEY)
    val country = conf.getString(COUNTRY_KEY)
    val historyType = conf.getString(HISTORY_TYPE_KEY)
    val timestampFormat = conf.getString(TIMESTAMP_FORMAT_KEY)
    val dateFormat = conf.getString(DATE_FORMAT_KEY)

    val fmtTimestamp = DateTimeFormatter.ofPattern(timestampFormat)
    val fmtDate = DateTimeFormatter.ofPattern(dateFormat)

    def formatInstant(instant: Instant): String =
      ZonedDateTime.ofInstant(instant, timezoneId).format(fmtTimestamp)

    val infoDateStr = fmtDate.format(infoDate)

    val sourceStarted = formatInstant(sourceStart)
    val rawStarted = formatInstant(rawStart)
    val jobFinished = formatInstant(jobFinish)

    val landingToRaw = Seq(
      renderCheckpoint("Source", "Source", 1, pramenVersion, sourceStarted, rawStarted, sourceCount),
      renderCheckpoint("Raw", "Source", 2, pramenVersion, rawStarted, jobFinished, rawCount)
    )

    val rawToPublish = publishStartOpt match {
      case Some(publishStart) =>
        val publishStarted = formatInstant(publishStart)
        // Same exception type as the bare Option.get it replaces, but with an actionable message.
        val publishedCount = publishCount.getOrElse(
          throw new NoSuchElementException("'publishCount' must be defined when the publish start time is defined.")
        )
        Seq(
          renderCheckpoint("Standardization", "Standardization", 1, pramenVersion, publishStarted, jobFinished, publishedCount),
          renderCheckpoint("Conformance", "Standardization", 2, pramenVersion, publishStarted, jobFinished, publishedCount)
        )
      case None =>
        Seq.empty[String]
    }

    val checkpoints = (landingToRaw ++ rawToPublish).mkString(", ")

    s"""{
       |  "metadata" : {
       |    "sourceApplication" : "$sourceApplication",
       |    "country" : "$country",
       |    "historyType" : "$historyType",
       |    "dataFilename" : "JDBC",
       |    "sourceType" : "Source",
       |    "version" : 1,
       |    "informationDate" : "$infoDateStr",
       |    "additionalInfo" : { }
       |  },
       |  "checkpoints" : [ $checkpoints ]
       |}
       |""".stripMargin

  }

  /** Renders a single checkpoint JSON object with one `recordCount` control. */
  private def renderCheckpoint(name: String,
                               workflowName: String,
                               order: Int,
                               version: String,
                               processStartTime: String,
                               processEndTime: String,
                               recordCount: Long): String = {
    s"""{
       |    "name" : "$name",
       |    "software" : "pramen",
       |    "version" : "$version",
       |    "processStartTime" : "$processStartTime",
       |    "processEndTime" : "$processEndTime",
       |    "workflowName" : "$workflowName",
       |    "order" : $order,
       |    "controls" : [ {
       |      "controlName" : "recordCount",
       |      "controlType" : "count",
       |      "controlCol" : "*",
       |      "controlValue" : "$recordCount"
       |    } ]
       |  }""".stripMargin
  }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy