Maven / Gradle / Ivy
The newest version!
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, size, sum}
import org.slf4j.{Logger, LoggerFactory}
/**
  * Helpers that record general job information and performance metrics as Atum additional info.
  */
object PerformanceMetricTools extends ProjectMetadata {
// SLF4J logger bound to this object's class.
private val logger: Logger = LoggerFactory.getLogger(this.getClass)
/**
  * Adds general job information to Atum's metadata that will end up in an info file.
  *
  * The method should be run at the beginning of a job so that this general information is available for debugging.
  *
  * @param optionPrefix  A prefix for all performance metrics options, e.g. 'std' for Standardization and 'conform'
  *                      for Conformance
  * @param input         A path+fs to an input directory of the job
  * @param outputPath    A path to an output directory of the job
  * @param loginUserName A login user name who performed the job
  * @param cmdLineArgs   The command line arguments of the job, recorded as passed in
  * @param spark         A Spark Session
  */
def addJobInfoToAtumMetadata(optionPrefix: String,
                             input: PathWithFs,
                             outputPath: String,
                             loginUserName: String,
                             cmdLineArgs: String)
                            (implicit spark: SparkSession): Unit = {
  // Spark job configuration
  val sc = spark.sparkContext

  // The number of executors minus the driver
  val numberOfExecutors = sc.getExecutorMemoryStatus.keys.size - 1

  val inputFsUtils = HadoopFsUtils.getOrCreate(input.fileSystem)

  // Input directory sizes: the whole directory, and (presumably) the directory without hidden files
  val inputDirSize = inputFsUtils.getDirectorySize(input.path)
  val inputDataSize = inputFsUtils.getDirectorySizeNoHidden(input.path)

  // Spark configuration key -> Atum metadata key suffix; keys missing from the configuration are skipped
  Seq(
    "spark.driver.memory" -> "driver_memory",
    "spark.driver.cores" -> "driver_cores",
    "spark.driver.memoryOverhead" -> "driver_memory_overhead",
    "spark.executor.memory" -> "executor_memory",
    "spark.executor.cores" -> "executor_cores",
    "spark.executor.memoryOverhead" -> "executor_memory_overhead",
    "spark.submit.deployMode" -> "yarn_deploy_mode",
    "spark.master" -> "spark_master"
  ).foreach { case (sparkKey, atumKey) => addSparkConfig(optionPrefix, sparkKey, atumKey) }

  // The attempt id is optional, so it is recorded only when the SparkContext provides one
  sc.applicationAttemptId
    .foreach(attemptId => Atum.setAdditionalInfo(s"${optionPrefix}_spark_attempt_id" -> attemptId))

  Atum.setAdditionalInfo(s"${optionPrefix}_cmd_line_args" -> cmdLineArgs)
  Atum.setAdditionalInfo(s"${optionPrefix}_input_dir" -> input.path)
  Atum.setAdditionalInfo(s"${optionPrefix}_output_dir" -> outputPath)
  Atum.setAdditionalInfo(s"${optionPrefix}_input_dir_size" -> inputDirSize.toString)
  Atum.setAdditionalInfo(s"${optionPrefix}_input_data_size" -> inputDataSize.toString)
  Atum.setAdditionalInfo(s"${optionPrefix}_enceladus_version" -> projectVersion)
  Atum.setAdditionalInfo(s"${optionPrefix}_application_id" -> sc.applicationId)
  Atum.setAdditionalInfo(s"${optionPrefix}_username" -> loginUserName)
  Atum.setAdditionalInfo(s"${optionPrefix}_executors_num" -> s"$numberOfExecutors")
}
/**
  * Adds performance metrics to the Spark job metadata. Atum is used to set these metrics.
  *
  * Two size ratios are recorded when they can be calculated: `_size_ratio` compares the full output and input
  * directory sizes, `_data_size_ratio` compares the sizes reported by `getDirectorySizeNoHidden`. Output sizes,
  * record counts and the total error count are always recorded.
  *
  * @param spark         A Spark Session
  * @param optionPrefix  A prefix for all performance metrics options, e.g. 'std' for Standardization and 'conform'
  *                      for Conformance
  * @param input         A path+fs to an input directory of the job
  * @param output        A path+fs to an output directory of the job
  * @param loginUserName A login user name who performed the job (not used by this method)
  * @param cmdLineArgs   The command line arguments of the job (not used by this method)
  */
def addPerformanceMetricsToAtumMetadata(spark: SparkSession,
                                        optionPrefix: String,
                                        input: PathWithFs,
                                        output: PathWithFs,
                                        loginUserName: String,
                                        cmdLineArgs: String
                                       ): Unit = {
  val inputFsUtils = HadoopFsUtils.getOrCreate(input.fileSystem)
  val outputFsUtils = HadoopFsUtils.getOrCreate(output.fileSystem)

  // Directory sizes and size ratio
  val inputDirSize = inputFsUtils.getDirectorySize(input.path)
  val inputDataSize = inputFsUtils.getDirectorySizeNoHidden(input.path)
  val outputDirSize = outputFsUtils.getDirectorySize(output.path)
  val outputDataSize = outputFsUtils.getDirectorySizeNoHidden(output.path)

  val (numRecordsFailed, numRecordsSuccessful, numOfErrors) = getNumberOfErrors(spark, output.path)
  val numRecords = numRecordsFailed + numRecordsSuccessful

  // Each ratio compares like with like, so the two metrics no longer carry the same value
  calculateSizeRatio(inputDirSize, outputDirSize, numRecords)
    .foreach(ratio => Atum.setAdditionalInfo(s"${optionPrefix}_size_ratio" -> prettyPercent(ratio * 100)))

  calculateSizeRatio(inputDataSize, outputDataSize, numRecords)
    .foreach(ratio => Atum.setAdditionalInfo(s"${optionPrefix}_data_size_ratio" -> prettyPercent(ratio * 100)))

  Atum.setAdditionalInfo(s"${optionPrefix}_output_dir_size" -> outputDirSize.toString)
  Atum.setAdditionalInfo(s"${optionPrefix}_output_data_size" -> outputDataSize.toString)
  Atum.setAdditionalInfo(s"${optionPrefix}_record_count" -> numRecords.toString)
  Atum.setAdditionalInfo(s"${optionPrefix}_records_succeeded" -> numRecordsSuccessful.toString)
  Atum.setAdditionalInfo(s"${optionPrefix}_records_failed" -> numRecordsFailed.toString)
  Atum.setAdditionalInfo(s"${optionPrefix}_errors_count" -> numOfErrors.toString)

  if (numRecordsSuccessful == 0) {
    logger.error("No successful records after running the Spark Application. Possibly the schema is incorrectly " +
      "defined for the dataset.")
  }
}
/**
  * Renders a percentage with two decimal places followed by a space and a percent sign.
  *
  * The number is formatted with the JVM's default locale.
  *
  * @param percent A percentage value.
  * @return The formatted percentage value.
  */
private def prettyPercent(percent: Double): String = "%3.2f %%".format(percent)
/**
  * Adds a Spark config key-value to Atum metadata if such key is present in the configuration of the session's
  * Spark context. Nothing is recorded when the key is absent.
  *
  * @param optionPrefix A prefix for a job (e.g. "std", "conf", etc.)
  * @param sparkKey     A Spark configuration key
  * @param atumKey      An Atum metadata key; it is stored as `optionPrefix` + "_" + `atumKey`
  * @param spark        A Spark Session
  */
private def addSparkConfig(optionPrefix: String, sparkKey: String, atumKey: String)
                          (implicit spark: SparkSession): Unit = {
  spark.sparkContext.getConf.getOption(sparkKey).foreach { sparkVal =>
    Atum.setAdditionalInfo(s"${optionPrefix}_$atumKey" -> sparkVal)
  }
}
/**
  * Calculates ratio between input and output directory sizes if it makes sense.
  *
  * @param inputDirSize  An input directory size in bytes
  * @param outputDirSize An output directory size in bytes
  * @param numRecords    Number of records in an output dataset
  * @return The output size divided by the input size if it makes sense, None otherwise
  */
private def calculateSizeRatio(inputDirSize: Long,
                               outputDirSize: Long,
                               numRecords: Long): Option[Double] = {
  if (doesSizeRatioMakesSense(inputDirSize, numRecords)) {
    // The guard ensures the divisor is positive
    Some(outputDirSize.toDouble / inputDirSize.toDouble)
  } else {
    None
  }
}
/**
  * Returns if output to input size ratio makes sense. The input dir size should be bigger than zero and the output
  * dataset should contain at least one record.
  *
  * @param inputDirSize An input directory size in bytes
  * @param numRecords   Number of records in an output dataset
  * @return true if it makes sense to calculate input to output size ratio.
  */
private def doesSizeRatioMakesSense(inputDirSize: Long, numRecords: Long): Boolean =
  inputDirSize > 0 && numRecords > 0
/**
  * Returns the number of records failed, the number of records succeeded and the total number of errors encountered
  * when running a Standardization or a Dynamic Conformance job.
  *
  * A record counts as failed when its error column (named by `ErrorMessage.errorColumnName`) has at least one
  * entry, and as succeeded when that column is empty.
  *
  * @param spark      A Spark Session
  * @param outputPath A path to an output directory of the job
  * @return A tuple of (failed record count, successful record count, total error count)
  */
private def getNumberOfErrors(spark: SparkSession, outputPath: String): (Long, Long, Long) = {
// NOTE(review): the initializer of `df` is not visible here; presumably the job output is loaded from
// `outputPath` via `spark` - confirm.
val df =
// Name of the temporary per-record error count column; presumably chosen so that it does not clash with an
// existing column of `df` - TODO confirm against getClosestUniqueName.
val errorCountColumn = df.schema.getClosestUniqueName("enceladus_error_count")
val errCol = col(ErrorMessage.errorColumnName)
// Records with at least one entry in the error column
val numRecordsFailed = df.filter(size(errCol) > 0).count
// Records whose error column is empty
val numRecordsSuccessful = df.filter(size(errCol) === 0).count
val numOfErrors = if (numRecordsFailed + numRecordsSuccessful > 0) {
// Sums the error column sizes over all records.
// NOTE(review): the step that turns this aggregate into a Long is not visible here - confirm.
df.withColumn(errorCountColumn, size(errCol)).agg(sum(col(errorCountColumn)))
} else {
// There are 0 errors in the error column if the output dataframe is empty
// NOTE(review): the value of this branch and the closing braces are not visible here - confirm.
(numRecordsFailed, numRecordsSuccessful, numOfErrors)