za.co.absa.enceladus.common.CommonJobExecution.scala

/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.enceladus.common

import java.text.MessageFormat
import java.time.Instant
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.functions.{lit, to_date}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.SPARK_VERSION
import org.slf4j.{Logger, LoggerFactory}
import za.co.absa.atum.AtumImplicits._
import za.co.absa.atum.core.{Atum, ControlType}
import za.co.absa.enceladus.common.Constants.{InfoDateColumn, InfoDateColumnString, InfoVersionColumn, ReportDateFormat}
import za.co.absa.enceladus.common.config.{CommonConfConstants, JobConfigParser, PathConfig}
import za.co.absa.enceladus.common.plugin.PostProcessingService
import za.co.absa.enceladus.common.plugin.enceladus.{EnceladusAtumPlugin, EnceladusRunUrl}
import za.co.absa.spark.commons.SparkVersionGuard
import za.co.absa.enceladus.dao.EnceladusDAO
import za.co.absa.enceladus.dao.OptionallyRetryableException._
import za.co.absa.enceladus.model.Dataset
import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams
import za.co.absa.enceladus.utils.general.ProjectMetadata
import za.co.absa.enceladus.utils.config.{ConfigReader, PathWithFs, SecureConfig, UrisConnectionStringParser}
import za.co.absa.enceladus.utils.fs.{FileSystemUtils, HadoopFsUtils}
import za.co.absa.enceladus.utils.modules.SourcePhase
import za.co.absa.enceladus.utils.modules.SourcePhase.Standardization
import za.co.absa.enceladus.common.performance.PerformanceMeasurer
import za.co.absa.enceladus.utils.time.TimeZoneNormalizer
import za.co.absa.enceladus.utils.validation.ValidationLevel
import za.co.absa.standardization.RecordIdGeneration

import scala.util.control.NonFatal
import scala.util.{Failure, Success, Try}

trait CommonJobExecution extends ProjectMetadata {
  protected case class PreparationResult(dataset: Dataset,
                                         reportVersion: Int,
                                         pathCfg: PathConfig,
                                         performance: PerformanceMeasurer)

  TimeZoneNormalizer.normalizeJVMTimeZone()
  SparkVersionGuard.fromDefaultSparkCompatibilitySettings.ensureSparkVersionCompatibility(SPARK_VERSION)

  protected val log: Logger = LoggerFactory.getLogger(this.getClass)
  protected val configReader: ConfigReader = new ConfigReader()
  protected val recordIdStrategy = RecordIdGeneration.getRecordIdGenerationType(
    configReader.getString("enceladus.recordId.generation.strategy")
  )
  protected val restApiBaseUrls: List[String] = UrisConnectionStringParser.parse(configReader.getString("enceladus.rest.uri"))
  protected val restApiUrlsRetryCount: Option[Int] = configReader.getIntOption("enceladus.rest.retryCount")
  protected val restApiAvailabilitySetup: String = configReader.getString("enceladus.rest.availability.setup")
  protected val restApiOptionallyRetryableExceptions: Set[OptRetryableExceptions] =
    configReader
      .getIntListOption("enceladus.rest.optionallyRetryableExceptions")
      .getOrElse(Set.empty)
      .toSet
      .map(getOptionallyRetryableException)
  protected var secureConfig: Map[String, String] = Map.empty

  private val menasBaseUris: List[String] = UrisConnectionStringParser
    .parse(configReader.getString("enceladus.menas.uri"))

  protected def obtainSparkSession[T](jobName: String)(implicit cmd: JobConfigParser[T]): SparkSession = {
    val enceladusVersion = projectVersion
    log.info(s"Enceladus version $enceladusVersion")
    val reportVersion = cmd.reportVersion.map(_.toString).getOrElse("")

    // SSL config with paths stripped to the current directory (expecting files distributed via spark-submit's "--files")
    val executorSecConfig = SecureConfig.getSslProperties(configReader.config, useCurrentDirectoryPaths = true)

    val spark = SparkSession.builder()
      .appName(s"$jobName $enceladusVersion ${cmd.datasetName} ${cmd.datasetVersion} ${cmd.reportDate} $reportVersion")
      .config("spark.executor.extraJavaOptions", SecureConfig.javaOptsStringFromConfigMap(executorSecConfig)) // system properties on executors
      .config("spark.sql.legacy.timeParserPolicy","LEGACY") // otherwise timestamp parsing migh cause issues
      .getOrCreate()
    TimeZoneNormalizer.normalizeSessionTimeZone(spark)
    spark
  }
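
  // Worked example (hedged sketch; all values below are illustrative assumptions, not from the original source):
  // for jobName = "Standardization", projectVersion = "3.0.0", datasetName = "MyDataset", datasetVersion = 7,
  // reportDate = "2020-07-23" and reportVersion = Some(2), the session is created with
  //   appName = "Standardization 3.0.0 MyDataset 7 2020-07-23 2"
  // and the stripped SSL properties are passed to executors as -D options via spark.executor.extraJavaOptions.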

  protected def initialValidation(): Unit = {
    // This should be the first thing the app does to make secure Kafka work with our CA.
    // After Spring activates JavaX, it will be too late.
    val secConf = SecureConfig.getSslProperties(configReader.config)
    SecureConfig.setSystemProperties(secConf)
  }

  protected def prepareJob[T]()
                             (implicit dao: EnceladusDAO,
                              cmd: JobConfigParser[T],
                              spark: SparkSession): PreparationResult = {
    configReader.logEffectiveConfigProps(Constants.ConfigKeysToRedact)
    dao.authenticate()

    implicit val hadoopConf: Configuration = spark.sparkContext.hadoopConfiguration

    val dataset = dao.getDataset(cmd.datasetName, cmd.datasetVersion, ValidationLevel.ForRun)
    dataset.propertiesValidation match {
      case Some(validation) if !validation.isValid =>
        throw new IllegalStateException("Dataset validation failed, errors found in fields:\n" +
          validation.errors.map { case (field, errMsg) => s" - '$field': $errMsg" }.mkString("\n")
        )
      case Some(validation) if validation.nonEmpty =>
        val warning = validation.warnings.map { case (field, warnMsg) =>
          val header = s" - '$field': "
          s"$header${warnMsg.mkString(s"\n$header")}"
        }.mkString("\n")
        log.warn("Dataset validation had some warnings:\n" + warning)
      case None => throw new IllegalStateException("Dataset validation was not retrieved correctly")
      case _ => // no problems found
    }

    val minPartition = configReader.getLongOption(CommonConfConstants.minPartitionSizeKey)
    val maxPartition = configReader.getLongOption(CommonConfConstants.maxPartitionSizeKey)

    (minPartition, maxPartition) match {
      case (Some(min), Some(max)) if min >= max => throw new IllegalStateException(
          s"${CommonConfConstants.minPartitionSizeKey} has to be smaller than ${CommonConfConstants.maxPartitionSizeKey}"
      )
      case _ => // validation passed
    }

    val reportVersion = getReportVersion(cmd, dataset)
    val pathCfg: PathConfig = getPathConfig(cmd, dataset, reportVersion)

    validatePaths(pathCfg)

    // Enable non-default persistence storage level if provided in the command line
    cmd.persistStorageLevel.foreach(Atum.setCachingStorageLevel)

    PreparationResult(dataset, reportVersion, pathCfg, new PerformanceMeasurer(spark.sparkContext.appName))
  }
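
  // Typical driver flow (hedged sketch; only the methods of this trait are real, the surrounding driver
  // code and the parsed `cmd`/`dao` values are assumptions):
  //   initialValidation()                               // must run first: sets the SSL system properties
  //   implicit val spark: SparkSession = obtainSparkSession("Standardization")
  //   val preparationResult = prepareJob()
  //   // ... read, standardize/conform and write the data, taking Atum checkpoints along the way ...
  //   runPostProcessing(SourcePhase.Standardization, preparationResult, cmd)
  //   finishJob(cmd)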

  protected def validatePaths(pathConfig: PathConfig): Unit

  protected def validateIfOutputPathAlreadyExists(entry: PathWithFs): Unit = {
    val fsUtils = HadoopFsUtils.getOrCreate(entry.fileSystem)
    if (fsUtils.exists(entry.path)) {
      throw new IllegalStateException(
        s"Path ${entry.path} already exists. Increment the run version, or delete ${entry.path}"
      )
    }
  }

  protected def validateInputPath(entry: PathWithFs): Unit = {
    val fsUtils = HadoopFsUtils.getOrCreate(entry.fileSystem)
    if (!fsUtils.exists(entry.path)) {
      throw new IllegalStateException(
        s"Input path ${entry.path} does not exist"
      )
    }
  }
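
  // A minimal validatePaths implementation could combine the two helpers above (hedged sketch;
  // the `raw` field name on PathConfig is an assumption inferred from PathConfig.fromPaths):
  //   override protected def validatePaths(pathConfig: PathConfig): Unit = {
  //     validateInputPath(pathConfig.raw)
  //     validateIfOutputPathAlreadyExists(pathConfig.standardization)
  //     validateIfOutputPathAlreadyExists(pathConfig.publish)
  //   }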

  /**
   * Post-processing re-reads the data from a path on the file system (chosen based on `sourcePhase`).
   */
  protected def runPostProcessing[T](sourcePhase: SourcePhase, preparationResult: PreparationResult, jobCmdConfig: JobConfigParser[T])
                                    (implicit spark: SparkSession): Unit = {
    val outputPath = sourcePhase match {
      case Standardization => preparationResult.pathCfg.standardization.path
      case _ => preparationResult.pathCfg.publish.path
    }

    log.info(s"rereading outputPath $outputPath to run postProcessing")
    val df = spark.read.parquet(outputPath)
    val runId = EnceladusAtumPlugin.runNumber

    // Reporting the UI URL(s); if there is more than one, they are comma-separated
    val runUrl: Option[String] = runId.map { runNumber =>
      menasBaseUris.map { baseUrl =>
        EnceladusRunUrl.getMenasUiRunUrl(baseUrl, jobCmdConfig.datasetName, jobCmdConfig.datasetVersion, runNumber)
      }.mkString(",")
    }

    val sourceSystem = Atum.getControlMeasure.metadata.sourceApplication
    val uniqueRunId = Atum.getControlMeasure.runUniqueId

    val params = ErrorSenderPluginParams(jobCmdConfig.datasetName,
      jobCmdConfig.datasetVersion, jobCmdConfig.reportDate, preparationResult.reportVersion, outputPath,
      sourcePhase, sourceSystem, runUrl, runId, uniqueRunId, Instant.now)
    val postProcessingService = PostProcessingService(configReader.config, params)
    postProcessingService.onSaveOutput(df)

    if (runId.isEmpty) {
      log.warn("No run number found, the Run URL cannot be properly reported!")
    }
  }

  protected def finishJob[T](jobConfig: JobConfigParser[T])(implicit spark: SparkSession): Unit = {
    // Atum framework initialization is done as part of 'prepareStandardization'
    spark.disableControlMeasuresTracking()

    val name = jobConfig.datasetName
    val version = jobConfig.datasetVersion
    EnceladusAtumPlugin.runNumber.foreach(runNumber => {
      restApiBaseUrls.foreach { baseUrl =>
        val apiUrl = EnceladusRunUrl.getApiRunUrl(baseUrl, name, version, runNumber)
        log.info(s"API Run URL: $apiUrl")
      }
      menasBaseUris.foreach { baseUrl =>
        val uiUrl = EnceladusRunUrl.getMenasUiRunUrl(baseUrl, name, version, runNumber)
        log.info(s"Menas UI Run URL: $uiUrl")
      }
    })
  }

  protected def getPathConfig[T](cmd: JobConfigParser[T], dataset: Dataset, reportVersion: Int)
                                (implicit hadoopConf: Configuration): PathConfig = {

    val rawPath = buildRawPath(cmd, dataset, reportVersion)
    val publishPath = buildPublishPath(cmd, dataset, reportVersion)
    val standardizationPath = getStandardizationPath(cmd, reportVersion)

    PathConfig.fromPaths(rawPath, publishPath, standardizationPath)
  }

  private def buildPublishPath[T](cmd: JobConfigParser[T], ds: Dataset, reportVersion: Int): String = {
    val infoDateCol: String = InfoDateColumn
    val infoVersionCol: String = InfoVersionColumn

    cmd.folderPrefix match {
      case None => s"${ds.hdfsPublishPath}/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion"
      case Some(folderPrefix) =>
        s"${ds.hdfsPublishPath}/$folderPrefix/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion"
    }
  }
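
  // Worked example (illustrative values): with ds.hdfsPublishPath = "/publish/mydataset",
  // reportDate = "2020-07-23", reportVersion = 2 and no folderPrefix, this yields
  //   /publish/mydataset/<InfoDateColumn>=2020-07-23/<InfoVersionColumn>=2
  // where InfoDateColumn and InfoVersionColumn are the constants imported from Constants.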

  private def buildRawPath[T](cmd: JobConfigParser[T], dataset: Dataset, reportVersion: Int): String = {
    val dateTokens = cmd.reportDate.split("-")
    val folderSuffix = s"/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v$reportVersion"
    cmd.folderPrefix match {
      case None => s"${dataset.hdfsPath}$folderSuffix"
      case Some(folderPrefix) => s"${dataset.hdfsPath}/$folderPrefix$folderSuffix"
    }
  }
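
  // Worked example (illustrative values): with dataset.hdfsPath = "/raw/mydataset",
  // reportDate = "2020-07-23", reportVersion = 2 and folderPrefix = Some("landing"), this yields
  //   /raw/mydataset/landing/2020/07/23/v2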

  private def getStandardizationPath[T](jobConfig: JobConfigParser[T], reportVersion: Int): String = {
    MessageFormat.format(configReader.getString("standardized.hdfs.path"),
      jobConfig.datasetName,
      jobConfig.datasetVersion.toString,
      jobConfig.reportDate,
      reportVersion.toString)
  }
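
  // Worked example (the pattern below is an assumption, not the project default): with
  //   standardized.hdfs.path = "/std/{0}/{1}/{2}/v{3}"
  // a call for dataset "MyDataset" version 7, reportDate "2020-07-23" and reportVersion 2 yields
  //   /std/MyDataset/7/2020-07-23/v2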

  protected def handleControlInfoValidation(): Unit = {
    ControlInfoValidation.addRawAndSourceRecordCountsToMetadata() match {
      case Failure(ex: za.co.absa.enceladus.utils.validation.ValidationException) =>
        val confEntry = "control.info.validation"
        configReader.getString(confEntry) match {
          case "strict" => throw ex
          case "warning" => log.warn(ex.msg)
          case "none" =>
          case _ => throw new RuntimeException(s"Invalid $confEntry value")
        }
      case Failure(ex) => throw ex
      case Success(_) =>
    }
  }
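
  // Supported values of the "control.info.validation" configuration key, as handled above:
  //   "strict"  - rethrow the validation exception and fail the job
  //   "warning" - log the validation message as a warning and continue
  //   "none"    - ignore the validation failure
  // Any other value results in a RuntimeException.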

  protected def writePerformanceMetrics[T](performance: PerformanceMeasurer, jobCmdConfig: JobConfigParser[T]): Unit = {
    jobCmdConfig.performanceMetricsFile.foreach(fileName => try {
      performance.writeMetricsToFile(fileName)
    } catch {
      case NonFatal(e) => log.error(s"Unable to write performance metrics to file '$fileName': ${e.getMessage}")
    })
  }

  protected def addCustomDataToInfoFile(conf: ConfigReader, data: Map[String, String]): Unit = {
    val keyPrefix = Try {
      conf.getString("control.info.dataset.properties.prefix")
    }.toOption.getOrElse("")

    log.debug(s"Writing custom data to info file (with prefix '$keyPrefix'): $data")
    data.foreach { case (key, value) =>
      Atum.setAdditionalInfo((s"$keyPrefix$key", value))
    }
  }
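
  // Worked example (illustrative values): with control.info.dataset.properties.prefix = "conform_prop_"
  // and data = Map("country" -> "ZA"), the Atum additional info gains the entry
  //   "conform_prop_country" -> "ZA"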

  protected def handleEmptyOutput(job: SourcePhase)(implicit spark: SparkSession): Unit = {

    val areCountMeasurementsAllZero = Atum.getControlMeasure.checkpoints
      .flatMap(checkpoint =>
        checkpoint.controls.filter(control =>
          ControlType.isControlMeasureTypeEqual(control.controlType, ControlType.Count.value)
        )
      )
      .forall(m => Try(m.controlValue.toString.toDouble).toOption.contains(0D))

    if (areCountMeasurementsAllZero) {
      log.warn(s"Empty output after running $job. Previous checkpoints show this is correct.")
    } else {
      val errMsg = s"Empty output after running $job, while previous checkpoints show non zero record count"
      spark.setControlMeasurementError(job.toString, errMsg, "")
      throw new IllegalStateException(errMsg)
    }
  }

  protected def addInfoColumns(intoDf: DataFrame, reportDate: String, reportVersion: Int): DataFrame = {
    import za.co.absa.enceladus.utils.schema.SparkUtils.DataFrameWithEnhancements
    intoDf
      .withColumnOverwriteIfExists(InfoDateColumn, to_date(lit(reportDate), ReportDateFormat))
      .withColumnOverwriteIfExists(InfoDateColumnString, lit(reportDate))
      .withColumnOverwriteIfExists(InfoVersionColumn, lit(reportVersion))
  }
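
  // Worked example (illustrative values): addInfoColumns(df, "2020-07-23", 2) adds (or overwrites)
  //   InfoDateColumn       -> to_date(lit("2020-07-23"), ReportDateFormat)   (date)
  //   InfoDateColumnString -> lit("2020-07-23")                              (string)
  //   InfoVersionColumn    -> lit(2)                                         (integer)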

  private def getReportVersion[T](jobConfig: JobConfigParser[T], dataset: Dataset)(implicit hadoopConf: Configuration): Int = {
    jobConfig.reportVersion match {
      case Some(version) => version
      case None =>

        // Since `pathConfig.publish.fileSystem` is not available at this point yet, a temporary publish FS is created & used here instead
        val tempPublishFs: FileSystem = FileSystemUtils.getFileSystemFromPath(dataset.hdfsPublishPath)
        val fsUtils = HadoopFsUtils.getOrCreate(tempPublishFs)
        val newVersion = fsUtils.getLatestVersion(dataset.hdfsPublishPath, jobConfig.reportDate) + 1
        log.warn(s"Report version not provided, inferred report version: $newVersion")
        log.warn("This is an EXPERIMENTAL feature.")
        log.warn(" -> It can lead to issues when running multiple jobs on a dataset concurrently.")
        log.warn(" -> It may not work as desired when there are gaps in the versions of the data being landed.")
        newVersion
    }
  }
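
  // Worked example (illustrative values): if the latest published version found under
  // dataset.hdfsPublishPath for reportDate = "2020-07-23" is 2, the inferred report version is 3.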
}