/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.enceladus.standardization

import java.util.UUID
import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
import za.co.absa.atum.AtumImplicits._
import za.co.absa.atum.core.Atum
import za.co.absa.enceladus.utils.schema.SchemaUtils
import za.co.absa.enceladus.common.config.{JobConfigParser, PathConfig}
import za.co.absa.enceladus.common.plugin.enceladus.EnceladusAtumPlugin
import za.co.absa.enceladus.common.{CommonJobExecution, Constants, Repartitioner}
import za.co.absa.enceladus.dao.EnceladusDAO
import za.co.absa.enceladus.dao.auth.RestApiCredentials
import za.co.absa.enceladus.model.Dataset
import za.co.absa.enceladus.standardization.config.{StandardizationConfig, StandardizationConfigParser}
import za.co.absa.enceladus.utils.config.{ConfigReader, PathWithFs}
import za.co.absa.enceladus.utils.fs.{DistributedFsUtils, HadoopFsUtils}
import za.co.absa.enceladus.utils.modules.SourcePhase
import za.co.absa.enceladus.common.performance.PerformanceMetricTools
import za.co.absa.enceladus.utils.schema.{MetadataKeys, SparkUtils}
import za.co.absa.standardization.Standardization
import za.co.absa.standardization.stages.PlainSchemaGenerator
import za.co.absa.enceladus.utils.validation.{ValidationException => EnceladusValidationException}
import za.co.absa.standardization.{ValidationException => StandardizationValidationException}
import za.co.absa.standardization.config.{BasicMetadataColumnsConfig, BasicStandardizationConfig, StandardizationConfig => StandardizationLibraryConfig}
import za.co.absa.standardization.types.TypeDefaults

import java.io.{PrintWriter, StringWriter}
import scala.util.control.NonFatal


trait StandardizationExecution extends CommonJobExecution {
  private val sourceId = SourcePhase.Standardization

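  /**
    * Builds the configuration for the standardization library: metadata columns get the "enceladus" prefix
    * and the record-id strategy resolved by the job, and the timezone is overridden when a "timezone" key
    * is present in the application configuration.
    */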
  protected def prepareStandardizationConfig(): BasicStandardizationConfig = {
    val metadataColumns = BasicMetadataColumnsConfig
      .fromDefault()
      .copy(prefix = "enceladus", recordIdStrategy = recordIdStrategy)

    val standardizationConfigWithoutTZ = BasicStandardizationConfig
      .fromDefault()
      .copy(metadataColumns = metadataColumns)

    configReader.getStringOption("timezone") match {
      case Some(tz) => standardizationConfigWithoutTZ.copy(timezone = tz)
      case None => standardizationConfigWithoutTZ
    }
  }

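  /**
    * Prepares a Standardization run: starts performance measurement on the raw directory, enables Atum
    * control-measure tracking against the _INFO file in the raw path, registers the Enceladus Atum plugin,
    * records job metadata (report date and version, raw format, default time zones, dataset properties
    * marked for the _INFO file) and returns the dataset schema fetched via the DAO.
    */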
  protected def prepareStandardization[T](args: Array[String],
                                          restApiCredentials: RestApiCredentials,
                                          preparationResult: PreparationResult)
                                         (implicit dao: EnceladusDAO,
                                          cmd: StandardizationConfigParser[T],
                                          spark: SparkSession,
                                          defaults: TypeDefaults): StructType = {
    val rawFs = preparationResult.pathCfg.raw.fileSystem
    val rawFsUtils = HadoopFsUtils.getOrCreate(rawFs)

    val stdDirSize = rawFsUtils.getDirectorySize(preparationResult.pathCfg.raw.path)
    preparationResult.performance.startMeasurement(stdDirSize)

    // Enable Control Framework
    spark.enableControlMeasuresTracking(Option(s"${preparationResult.pathCfg.raw.path}/_INFO"), None)
      .setControlMeasuresWorkflow(sourceId.toString)

    // Enable control framework performance optimization for pipeline-like jobs
    Atum.setAllowUnpersistOldDatasets(true)

    // Enable Enceladus plugin for Control Framework
    EnceladusAtumPlugin.enableEnceladusAtumPlugin(
      configReader.config,
      cmd.datasetName,
      cmd.datasetVersion,
      cmd.reportDate,
      preparationResult.reportVersion)

    // Add report date and version (aka Enceladus info date and version) to Atum's metadata
    Atum.setAdditionalInfo(Constants.InfoDateColumn -> cmd.reportDate)
    Atum.setAdditionalInfo(Constants.InfoVersionColumn -> preparationResult.reportVersion.toString)

    // Add the raw format of the input file(s) to Atum's metadata
    Atum.setAdditionalInfo("raw_format" -> cmd.rawFormat)
    Atum.setAdditionalInfo(Constants.EnceladusRunNumber -> EnceladusAtumPlugin.runNumber.fold("")(_.toString))

    val defaultTimeZoneForTimestamp = defaults.defaultTimestampTimeZone.getOrElse(spark.conf.get("spark.sql.session.timeZone"))
    Atum.setAdditionalInfo("default_time_zone_for_timestamps"-> defaultTimeZoneForTimestamp)
    val defaultTimeZoneForDate = defaults.defaultDateTimeZone.getOrElse(spark.conf.get("spark.sql.session.timeZone"))
    Atum.setAdditionalInfo("default_time_zone_for_dates"-> defaultTimeZoneForDate)

    // Add Dataset properties marked with putIntoInfoFile=true
    val dataForInfoFile: Map[String, String] = dao.getDatasetPropertiesForInfoFile(cmd.datasetName, cmd.datasetVersion)
    addCustomDataToInfoFile(configReader, dataForInfoFile)

    PerformanceMetricTools.addJobInfoToAtumMetadata("std",
      preparationResult.pathCfg.raw,
      preparationResult.pathCfg.standardization.path,
      restApiCredentials.username, args.mkString(" "))

    dao.getSchema(preparationResult.dataset.schemaName, preparationResult.dataset.schemaVersion)
  }

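  /**
    * Resolves the standard path configuration and, if a raw path override was passed on the command line,
    * replaces the raw input path with it.
    */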
  override def getPathConfig[T](cmd: JobConfigParser[T], dataset: Dataset, reportVersion: Int)
                               (implicit hadoopConf: Configuration): PathConfig = {
    val initialConfig = super.getPathConfig(cmd, dataset, reportVersion)
    cmd.asInstanceOf[StandardizationConfig].rawPathOverride match {
      case None => initialConfig
      case Some(providedRawPath) => initialConfig.copy(raw = PathWithFs.fromPath(providedRawPath))
    }
  }

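  /** Logs and validates the raw input and standardization output paths. */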
  override def validatePaths(pathConfig: PathConfig): Unit = {
    log.info(s"raw path: ${pathConfig.raw.path}")
    log.info(s"standardization path: ${pathConfig.standardization.path}")
    validateInputPath(pathConfig.raw)
    validateIfOutputPathAlreadyExists(pathConfig.standardization)
  }

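  /**
    * Reads the raw input data. For self-describing formats (Parquet, COBOL) the configured reader is used
    * as-is; for all other formats a plain input schema (optionally including a corrupt-record column) is
    * generated from the dataset schema and applied to the reader. The loaded data is converted to a
    * splittable format if necessary.
    */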
  protected def readStandardizationInputData[T](schema: StructType,
                                                cmd: StandardizationConfigParser[T],
                                                rawInput: PathWithFs,
                                                dataset: Dataset)
                                               (implicit spark: SparkSession,
                                                dao: EnceladusDAO): DataFrame = {
    val numberOfColumns = schema.fields.length
    val standardizationReader = new StandardizationPropertiesProvider()
    val dfReaderConfigured = standardizationReader.getFormatSpecificReader(cmd, dataset, numberOfColumns)
    val readerWithOptSchema = cmd.rawFormat.toLowerCase() match {
      case "parquet" | "cobol" => dfReaderConfigured
      case _ =>
        val optColumnNameOfCorruptRecord = getColumnNameOfCorruptRecord(schema, cmd)
        val inputSchema = PlainSchemaGenerator.generateInputSchema(schema, optColumnNameOfCorruptRecord)
        dfReaderConfigured.schema(inputSchema)
    }
    val dfWithSchema = readerWithOptSchema.load(s"${rawInput.path}/*")

    ensureSplittable(dfWithSchema, rawInput, schema)
  }

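  /**
    * Returns the column name to use for corrupt records, or None when the raw format is fixed-width or the
    * job is configured to fail on input that does not conform to the schema.
    */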
  private def getColumnNameOfCorruptRecord[R](schema: StructType, cmd: StandardizationConfigParser[R])
                                             (implicit spark: SparkSession): Option[String] = {
    // SparkUtils.setUniqueColumnNameOfCorruptRecord is called even if the result is not used, to avoid a column-name conflict
    val columnNameOfCorruptRecord = SparkUtils.setUniqueColumnNameOfCorruptRecord(spark, schema)
    if (cmd.rawFormat.equalsIgnoreCase("fixed-width") || cmd.failOnInputNotPerSchema) {
      None
    } else {
      Option(columnNameOfCorruptRecord)
    }
  }

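  /**
    * Runs the standardization library on the input data. Schema validation failures and any other
    * non-fatal errors are recorded as Atum control-measurement errors before being rethrown.
    */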
  protected def standardize(inputData: DataFrame, schema: StructType, standardizationConfig: StandardizationLibraryConfig)
                              (implicit spark: SparkSession): DataFrame = {
    try {
      handleControlInfoValidation()
      Standardization.standardize(inputData, schema, standardizationConfig)
    } catch {
      case e@StandardizationValidationException(msg, errors) =>
        val errorDescription = s"$msg\nDetails: ${errors.mkString("\n")}"
        spark.setControlMeasurementError("Schema Validation", errorDescription, "")
        throw e
      case NonFatal(e) if !e.isInstanceOf[EnceladusValidationException] && !e.isInstanceOf[StandardizationValidationException] =>
        val sw = new StringWriter
        e.printStackTrace(new PrintWriter(sw))
        spark.setControlMeasurementError(sourceId.toString, e.getMessage, sw.toString)
        throw e
    }
  }

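  /**
    * Post-processes the standardized DataFrame: registers column renames with Atum, sets the end checkpoint,
    * handles empty output, optionally repartitions, writes the result as Parquet to the standardization path,
    * and stores performance metrics and the _INFO file.
    */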
  //scalastyle:off parameter.number
  protected def processStandardizationResult[T](args: Array[String],
                                                standardizedDF: DataFrame,
                                                preparationResult: PreparationResult,
                                                schema: StructType,
                                                cmd: StandardizationConfigParser[T],
                                                restApiCredentials: RestApiCredentials)
                                               (implicit spark: SparkSession, configReader: ConfigReader): DataFrame = {
    val rawFs = preparationResult.pathCfg.raw.fileSystem
    val stdFs = preparationResult.pathCfg.standardization.fileSystem

    val fieldRenames = SchemaUtils.getRenamesInSchema(schema)
    fieldRenames.foreach {
      case (destinationName, sourceName) => standardizedDF.registerColumnRename(sourceName, destinationName)(rawFs)
    }

    standardizedDF.setCheckpoint(s"$sourceId - End", persistInDatabase = false)(rawFs)

    val recordCount = standardizedDF.lastCheckpointRowCount match {
      case None => standardizedDF.count
      case Some(p) => p
    }

    if (recordCount == 0) {
      handleEmptyOutput(sourceId)
    }

    log.info(s"Writing into standardized path ${preparationResult.pathCfg.standardization.path}")

    val withRepartitioning = if (cmd.isInstanceOf[StandardizationConfig]) {
      val repartitioner = new Repartitioner(configReader, log)
      repartitioner.repartition(standardizedDF)
    } else {
      standardizedDF
    }
    withRepartitioning.write.parquet(preparationResult.pathCfg.standardization.path)

    // Store performance metrics
    // (record count, directory sizes, elapsed time, etc. to _INFO file metadata and performance file)
    val stdDirSize = HadoopFsUtils.getOrCreate(stdFs).getDirectorySize(preparationResult.pathCfg.standardization.path)
    preparationResult.performance.finishMeasurement(stdDirSize, recordCount)

    PerformanceMetricTools.addPerformanceMetricsToAtumMetadata(
      spark,
      "std",
      preparationResult.pathCfg.raw,
      preparationResult.pathCfg.standardization,
      restApiCredentials.username,
      args.mkString(" ")
    )

    cmd.rowTag.foreach(rowTag => Atum.setAdditionalInfo("xml_row_tag" -> rowTag))
    cmd.csvDelimiter.foreach(delimiter => Atum.setAdditionalInfo("csv_delimiter" -> delimiter))

    log.info(s"infoFilePath = ${preparationResult.pathCfg.standardization.path}/_INFO")
    withRepartitioning.writeInfoFile(preparationResult.pathCfg.standardization.path)(stdFs)
    writePerformanceMetrics(preparationResult.performance, cmd)
    log.info(s"$sourceId finished successfully")
    withRepartitioning
  }

  //scalastyle:on parameter.number

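  /**
    * Returns the DataFrame unchanged when the raw input is stored in a splittable format; otherwise converts
    * it via a temporary Parquet copy.
    */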
  private def ensureSplittable(df: DataFrame, input: PathWithFs, schema: StructType)
                              (implicit spark: SparkSession): DataFrame = {
    val fsUtils = HadoopFsUtils.getOrCreate(input.fileSystem)
    if (fsUtils.isNonSplittable(input.path)) {
      convertToSplittable(df, schema, fsUtils)
    } else {
      df
    }
  }

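  /**
    * Writes the DataFrame to a temporary Parquet directory (renaming columns whose source names contain
    * problematic characters such as spaces) and reads it back so that downstream processing works on a
    * splittable source. Only supported for HadoopFsUtils; other DistributedFsUtils implementations are
    * left as-is.
    */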
  private def convertToSplittable(df: DataFrame, schema: StructType, fsUtils: DistributedFsUtils)
                                 (implicit spark: SparkSession): DataFrame = {
    log.warn("Dataset is stored in a non-splittable format. This can have a severe performance impact.")
    fsUtils match {
      case utils: HadoopFsUtils =>
        val tempParquetDir = s"/tmp/nonsplittable-to-parquet-${UUID.randomUUID()}"
        log.warn(s"Converting to Parquet in temporary dir: $tempParquetDir")

        // Rename source columns whose names (e.g. containing spaces) would otherwise break the Parquet write
        df.select(schema.fields.map { field: StructField =>
          renameSourceColumn(df, field)
        }: _*).write.parquet(tempParquetDir)

        utils.deleteOnExit(tempParquetDir)
        // Reload from temp parquet and reverse column renaming above
        val dfTmp = spark.read.parquet(tempParquetDir)
        dfTmp.select(schema.fields.map { field: StructField =>
          reverseRenameSourceColumn(dfTmp, field)
        }: _*)

      case utils =>
        log.warn(s"Splittability conversion only available for 'HadoopFsUtils', leaving as is for ${utils.getClass.getName}")
        df
    }

  }

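  /**
    * Selects the column under its source name (taken from the field's metadata) aliased to the schema
    * field name; falls back to the field name when no source column is defined.
    */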
  private def renameSourceColumn(df: DataFrame, field: StructField): Column = {
    if (field.metadata.contains(MetadataKeys.SourceColumn)) {
      val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn)
      log.info(s"schema field : ${field.name} : rename : $sourceColumnName")
      df.col(sourceColumnName).as(field.name, field.metadata)
    } else {
      df.col(field.name)
    }
  }

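  /**
    * Reverses [[renameSourceColumn]]: selects the column under the schema field name and aliases it back
    * to the original source column name from the field's metadata.
    */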
  private def reverseRenameSourceColumn(df: DataFrame, field: StructField): Column = {
    if (field.metadata.contains(MetadataKeys.SourceColumn)) {
      val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn)
      log.info(s"schema field : $sourceColumnName : reverse rename : ${field.name}")
      df.col(field.name).as(sourceColumnName)
    } else {
      df.col(field.name)
    }
  }

}