/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.enceladus.conformance.interpreter

import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.execution.ExtendedMode
import org.apache.spark.sql.execution.command.ExplainCommand
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.LoggerFactory
import za.co.absa.atum.AtumImplicits._
import za.co.absa.enceladus.conformance.config.ConformanceConfigParser
import za.co.absa.enceladus.conformance.datasource.PartitioningUtils
import za.co.absa.enceladus.conformance.interpreter.rules._
import za.co.absa.enceladus.conformance.interpreter.rules.custom.CustomConformanceRule
import za.co.absa.enceladus.conformance.interpreter.rules.mapping.{MappingRuleInterpreter, MappingRuleInterpreterBroadcast,
MappingRuleInterpreterGroupExplode}
import za.co.absa.enceladus.dao.EnceladusDAO
import za.co.absa.enceladus.model.conformanceRule._
import za.co.absa.enceladus.model.{Dataset => ConfDataset}
import za.co.absa.enceladus.utils.config.PathWithFs
import za.co.absa.enceladus.utils.error.ErrorMessage
import za.co.absa.enceladus.utils.fs.HadoopFsUtils
import za.co.absa.enceladus.utils.udf.ConformanceUDFLibrary
import za.co.absa.spark.commons.utils.explode.ExplosionContext
import za.co.absa.spark.commons.implicits.StructTypeImplicits.StructTypeEnhancementsArrays
import za.co.absa.spark.commons.implicits.DataFrameImplicits.DataFrameEnhancements
import za.co.absa.commons.lang.extensions.SeqExtension._
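
/**
* Dynamically interprets and applies the conformance rules defined in a dataset to an input DataFrame.
*
* @param inputFs The Hadoop filesystem of the input data.
*/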
case class DynamicInterpreter()(implicit inputFs: FileSystem) {
private val log = LoggerFactory.getLogger(this.getClass)
/**
* Interprets the conformance rules defined in a dataset.
*
* @param conformance The dataset object - this represents a data conformance workflow.
* @param inputDf The dataset to be conformed.
* @param jobShortName A job name used for checkpoints.
* @return The conformed DataFrame.
*/
def interpret[T](conformance: ConfDataset, inputDf: Dataset[Row], jobShortName: String = "Conformance")
(implicit spark: SparkSession,
dao: EnceladusDAO,
progArgs: ConformanceConfigParser[T],
featureSwitches: FeatureSwitches): DataFrame = {
implicit val interpreterContext: InterpreterContext = InterpreterContext(inputDf.schema, conformance,
featureSwitches, jobShortName, spark, dao, InterpreterContextArgs.fromConformanceConfig(progArgs))
applyCheckpoint(inputDf, "Start")
val conformedDf = applyConformanceRules(ensureErrorColumnExists(inputDf))
applyCheckpoint(conformedDf, "End")
logExecutionPlan(conformedDf)
// explicitly set errCol (non)nullable, see issue #1818
ensureErrorColumnNullability(conformedDf, featureSwitches.errColNullability)
}
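
// Usage sketch (illustrative only): assuming a SparkSession, an EnceladusDAO, a
// ConformanceConfigParser and FeatureSwitches are already available as implicits,
// a caller could invoke the interpreter along these lines (the value names below
// are hypothetical):
//
//   implicit val inputFs: FileSystem = FileSystem.get(spark.sparkContext.hadoopConfiguration)
//   val conformedDf = DynamicInterpreter().interpret(conformanceDataset, inputDf)

/**
* Finds the rules whose output columns already exist in the original schema,
* i.e. rules that would modify the original data and thus violate its immutability.
*/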
private def findOriginalColumnsModificationRules(steps: List[ConformanceRule],
schema: StructType): Seq[ConformanceRule] = {
steps.filter(rule => schema.fieldExists(rule.outputColumn))
}
/**
* Applies the conformance rules, working around a Catalyst optimizer bug when necessary.
*
* @param inputDf The dataset to be conformed.
* @return The conformed DataFrame.
*/
private def applyConformanceRules(inputDf: DataFrame)
(implicit ictx: InterpreterContext): DataFrame = {
implicit val spark: SparkSession = ictx.spark
implicit val dao: EnceladusDAO = ictx.dao
implicit val progArgs: InterpreterContextArgs = ictx.progArgs
implicit val udfLib: ConformanceUDFLibrary = new ConformanceUDFLibrary
implicit val explosionState: ExplosionState = new ExplosionState()
val steps = getConformanceSteps
checkMutabilityNotViolated(inputDf.schema, steps)
val interpreters = getInterpreters(steps, inputDf.schema)
val optimizerTimeTracker = new OptimizerTimeTracker(inputDf, ictx.featureSwitches.catalystWorkaroundEnabled)
val dfInputWithIdForWorkaround = optimizerTimeTracker.getWorkaroundDataframe
// Fold left on rules
var rulesApplied = 0
val conformedDf = interpreters.foldLeft(dfInputWithIdForWorkaround)({
case (df, interpreter) =>
val explosionStateCopy = new ExplosionState(explosionState.explodeContext)
val ruleAppliedDf = interpreter.conform(df)(spark, explosionStateCopy, dao, progArgs)
val conformedDf = if (explosionState.isNoExplosionsApplied &&
optimizerTimeTracker.isCatalystWorkaroundRequired(ruleAppliedDf, rulesApplied)) {
// Apply a workaround BEFORE applying the rule so that the execution plan generation still runs fast
val workAroundDf = interpreter.conform(optimizerTimeTracker.applyCatalystWorkaround(df))
optimizerTimeTracker.recordExecutionPlanOptimizationTime(workAroundDf)
workAroundDf
} else {
explosionState.explodeContext = explosionStateCopy.explodeContext
ruleAppliedDf
}
rulesApplied += 1
interpreter.conformanceRule match {
case Some(rule) => applyRuleCheckpoint(rule, conformedDf, progArgs.persistStorageLevel, explosionState.explodeContext)
case None => conformedDf
}
})
optimizerTimeTracker.cleanupWorkaroundDf(conformedDf)
}
private def checkMutabilityNotViolated(schema: StructType, steps: List[ConformanceRule])
(implicit ictx: InterpreterContext): Unit = {
val rulesInViolation = findOriginalColumnsModificationRules(steps, schema)
if (rulesInViolation.nonEmpty) {
val violationsString = rulesInViolation.map(rule =>
s"Rule number ${rule.order} - ${rule.getClass.getSimpleName}"
).mkString("\n")
if (ictx.featureSwitches.allowOriginalColumnsMutability) {
log.warn(
s"""Mutability of original Data Allowed and there are some rules in violation of immutability pattern.
|These are:
|$violationsString""".stripMargin)
} else {
throw new IllegalStateException(
s"""There are some rules in violation of immutability pattern. These are:
|$violationsString""".stripMargin)
}
}
}
/**
* Transforms a list of conformance rules to a list of conformance rule interpreters.
* For most conformance rules there is only one interpreter to apply, but the mapping rule
* has several strategies. The optimizer chooses which strategy to use and provides the
* corresponding interpreter.
*
* @param rules A list of conformance rules.
* @param schema A schema of a DataFrame to be conformed.
* @return A list of conformance rule interpreters.
*/
def getInterpreters(rules: List[ConformanceRule], schema: StructType)
(implicit ictx: InterpreterContext): List[RuleInterpreter] = {
val groupedRules = groupMappingRules(rules, schema)
getOptimizedInterpreters(groupedRules, schema)
}
/**
* Optimizes a list of groups of conformance rules and returns a list of interpreters as the result.
*
* The input conformance rules are expected to be grouped. Only mapping rules are grouped; each group of
* mapping rules has its output fields in the same array. This makes it possible to explode the array
* only once per group. The optimization is done by inserting an `ArrayExplodeInterpreter` and an
* `ArrayCollapseInterpreter` around each group of mapping rules.
*
* @param ruleGroups Conformance rules grouped by output field being in the same array.
* @param schema A schema of a DataFrame to be conformed.
* @return A list of conformance rule interpreters.
*/
private def getOptimizedInterpreters(ruleGroups: List[List[ConformanceRule]],
schema: StructType)
(implicit ictx: InterpreterContext): List[RuleInterpreter] = {
ruleGroups.flatMap(rules => {
val interpreters = rules.map(rule => getInterpreter(rule))
if (isGroupExplosionUsable(rules) &&
ictx.featureSwitches.experimentalMappingRuleEnabled) {
// Inserting an explosion and a collapse between a group of mapping rules operating on a common array
val optArray = schema.getDeepestArrayPath(rules.head.outputColumn)
optArray match {
case Some(arrayColumn) =>
new ArrayExplodeInterpreter(arrayColumn) :: (interpreters :+ new ArrayCollapseInterpreter())
case None =>
throw new IllegalStateException("Unable to find a common array between fields: " +
rules.map(_.outputColumn).mkString(", "))
}
} else {
interpreters
}
})
}
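// Illustrative example (hypothetical rules): two non-broadcast mapping rules M1 and M2
// that both output into fields of the same array "legs" would, with the experimental
// mapping rule feature enabled, yield the interpreter sequence
//   ArrayExplodeInterpreter("legs") :: interpreter(M1) :: interpreter(M2) :: ArrayCollapseInterpreter() :: Nil
// so the array is exploded and collapsed only once for the whole group.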
/**
* Returns an interpreter for a conformance rule. Most conformance rules correspond to one interpreter.
* The exception is the mapping rule for which there are several interpreters based on the strategy used.
*
* @param rule A conformance rule.
* @return A conformance rule interpreter.
*/
private def getInterpreter(rule: ConformanceRule)
(implicit ictx: InterpreterContext): RuleInterpreter = {
rule match {
case r: DropConformanceRule => DropRuleInterpreter(r)
case r: ConcatenationConformanceRule => ConcatenationRuleInterpreter(r)
case r: LiteralConformanceRule => LiteralRuleInterpreter(r)
case r: SingleColumnConformanceRule => SingleColumnRuleInterpreter(r)
case r: SparkSessionConfConformanceRule => SparkSessionConfRuleInterpreter(r)
case r: UppercaseConformanceRule => UppercaseRuleInterpreter(r)
case r: CastingConformanceRule => CastingRuleInterpreter(r)
case r: NegationConformanceRule => NegationRuleInterpreter(r)
case r: MappingConformanceRule => getMappingRuleInterpreter(r)
case r: FillNullsConformanceRule => FillNullsRuleInterpreter(r)
case r: CoalesceConformanceRule => CoalesceRuleInterpreter(r)
case r: CustomConformanceRule => r.getInterpreter()
case r => throw new IllegalStateException(s"Unrecognized rule class: ${r.getClass.getName}")
}
}
/**
* Returns an interpreter for a mapping rule based on which strategy is applicable.
*
* @param rule A conformance rule.
* @return A mapping rule interpreter.
*/
private def getMappingRuleInterpreter(rule: MappingConformanceRule)
(implicit ictx: InterpreterContext): RuleInterpreter = {
if (canMappingRuleBroadcast(rule)) {
log.info("Broadcast strategy for mapping rules is used")
MappingRuleInterpreterBroadcast(rule, ictx.conformance)
} else {
// Only MappingRuleInterpreterBroadcast and MappingRuleInterpreterGroupExplode support multiple outputs
if (ictx.featureSwitches.experimentalMappingRuleEnabled || rule.definedAdditionalColumns().nonEmpty) {
log.info("Group explode strategy for mapping rules used")
MappingRuleInterpreterGroupExplode(rule, ictx.conformance)
} else {
log.info("Explode strategy for mapping rules used")
MappingRuleInterpreter(rule, ictx.conformance)
}
}
}
/**
* A group explosion is needed for a group of mapping rules if more than one rule in the group
* cannot use the broadcasting strategy.
*
* @param rules A list of conformance rules grouped by output field being in the same array.
* @return true if the group explosion optimization can be used.
*/
private def isGroupExplosionUsable(rules: List[ConformanceRule])
(implicit ictx: InterpreterContext): Boolean = {
val eligibleRulesCount = rules.map {
case rule: MappingConformanceRule => if (canMappingRuleBroadcast(rule)) 0 else 1
case _ => 0
}.sum
eligibleRulesCount > 1
}
/**
* Returns true if the broadcasting strategy is applicable for the specified mapping rule.
*
* @param rule A mapping conformance rule.
* @return true if the broadcasting mapping rule strategy can be used.
*/
private def canMappingRuleBroadcast(rule: MappingConformanceRule)
(implicit ictx: InterpreterContext): Boolean = {
// The broadcasting approach assumes there are at most 10 join conditions
if (rule.attributeMappings.size <= 10) {
ictx.featureSwitches.broadcastStrategyMode match {
case Always => true
case Never => false
case Auto => isMappingTableSmallEnough(rule)
}
} else {
false
}
}
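// Illustrative decision outcomes for a hypothetical rule with 3 attribute mappings:
//   broadcastStrategyMode = Always -> the broadcast strategy is used unconditionally
//   broadcastStrategyMode = Never  -> the broadcast strategy is never used
//   broadcastStrategyMode = Auto   -> the broadcast strategy is used only if the mapping
//                                     table size is at most broadcastMaxSizeMb
// A rule with more than 10 attribute mappings never uses broadcast, regardless of the mode.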
/**
* Returns true if the mapping table size is small enough for the broadcasting strategy to be used.
*
* @param rule A mapping conformance rule.
* @return true if the mapping table size is small enough.
*/
private def isMappingTableSmallEnough(rule: MappingConformanceRule)
(implicit ictx: InterpreterContext): Boolean = {
val maxBroadcastSizeMb = ictx.featureSwitches.broadcastMaxSizeMb
val mappingTableSize = getMappingTableSizeMb(rule)
log.info(s"Mapping table (${rule.mappingTable}) size = $mappingTableSize MB (threshold = $maxBroadcastSizeMb MB)")
mappingTableSize <= maxBroadcastSizeMb
}
/**
* Returns the size of the mapping table in megabytes.
*
* @param rule A mapping conformance rule.
* @return The size of the mapping table in megabytes.
*/
private def getMappingTableSizeMb(rule: MappingConformanceRule)
(implicit ictx: InterpreterContext): Int = {
val mappingTableDef = ictx.dao.getMappingTable(rule.mappingTable, rule.mappingTableVersion)
val mappingTablePath = PartitioningUtils.getPartitionedPathName(mappingTableDef.hdfsPath,
ictx.progArgs.reportDate)
// accommodate a different filesystem or bucket for the mapping table
val mappingTableFs = PathWithFs.fromPath(mappingTablePath)(ictx.spark.sparkContext.hadoopConfiguration)
val mappingTableSize = HadoopFsUtils.getOrCreate(mappingTableFs.fileSystem).getDirectorySizeNoHidden(mappingTableFs.path)
val mb = (mappingTableSize / (1024 * 1024)).toInt
log.debug(s"$mappingTablePath size: ${mb}MB")
mb
}
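// Note: the size is computed with integer division, so a mapping table directory smaller
// than 1 MB reports as 0 MB and therefore always passes the broadcast size threshold.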
/**
* Gets the list of conformance rules from the context, sorted by order.
*
* @return A list of conformance rules.
*/
def getConformanceSteps(implicit ictx: InterpreterContext): List[ConformanceRule] = {
ictx.conformance.conformance.sortBy(_.order)
}
/**
* Applies a control framework checkpoint for a given stage of the conformance.
*
* @param df Dataframe to apply the checkpoint on
* @param jobStage Specifies a job stage that will be added to the checkpoint name
*/
private def applyCheckpoint(df: Dataset[Row], jobStage: String)(implicit ictx: InterpreterContext): Unit = {
if (ictx.featureSwitches.controlFrameworkEnabled) {
df.setCheckpoint(s"${ictx.jobShortName} - $jobStage")
}
}
/**
* Creates a new Control Framework checkpoint for the specified conformance rule (after the rule is applied).
*
* @param rule The conformance rule
* @param df Dataframe to apply the checkpoint on
* @param persistStorageLevel A storage level for caching/persisting the df, if set.
* @param explodeContext The explosion context to be taken into account if a checkpoint is created for an
* exploded dataframe.
* @return A cached dataframe if a checkpoint is calculated, otherwise the original dataframe.
*/
private def applyRuleCheckpoint(rule: ConformanceRule,
df: Dataset[Row],
persistStorageLevel: Option[StorageLevel],
explodeContext: ExplosionContext)
(implicit ictx: InterpreterContext): Dataset[Row] = {
if (ictx.featureSwitches.controlFrameworkEnabled && rule.controlCheckpoint) {
val explodeFilter = explodeContext.getControlFrameworkFilter
// Cache the data first since Atum will execute an action for each control metric
val cachedDf = persistStorageLevel match {
case Some(level) => df.persist(level)
case None => df.cacheIfNotCachedYet()
}
cachedDf.filter(explodeFilter)
.setCheckpoint(s"${ictx.jobShortName} (${rule.order}) - ${rule.outputColumn}")
cachedDf
}
else {
df
}
}
private def logExecutionPlan(df: DataFrame)(implicit spark: SparkSession): Unit = {
// Need to check this explicitly since the execution plan generation can take a significant amount of time
if (log.isDebugEnabled) {
val explain = ExplainCommand(df.queryExecution.logical, mode = ExtendedMode)
spark.sessionState.executePlan(explain).executedPlan.executeCollect().foreach {
r => log.debug("Output Dataset plan: \n" + r.getString(0))
}
}
}
/**
* Ensures the existence of the error column
*
* @param inputDf The input dataframe.
* @return A dataframe that has an error column
*/
private def ensureErrorColumnExists(inputDf: Dataset[Row]): DataFrame = {
if (inputDf.columns.contains(ErrorMessage.errorColumnName)) {
inputDf
} else {
inputDf.withColumn(ErrorMessage.errorColumnName, typedLit(List[ErrorMessage]()))
}
}
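// For example, an input with columns (id, name) comes out as (id, name, errCol), where
// errCol starts as an empty array of ErrorMessage structs; an input that already contains
// the error column passes through unchanged.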
private def ensureErrorColumnNullability(inputDf: DataFrame, nullable: Boolean): DataFrame = {
import za.co.absa.enceladus.utils.implicits.EnceladusDataFrameImplicits.EnceladusDataframeEnhancements
inputDf.withNullableColumnState(ErrorMessage.errorColumnName, nullable)
}
/**
* Groups mapping rules if their output columns are inside the same array
*
* @param rules a list of conformance rules
* @param schema a schema of a dataset
* @return A list of conformance rule groups, where each group is a list of rules.
*/
private def groupMappingRules(rules: List[ConformanceRule], schema: StructType): List[List[ConformanceRule]] = {
rules.groupConsecutiveByOption[String] {
case m: MappingConformanceRule => schema.getDeepestArrayPath(m.outputColumn)
case _ => None
}.map(_.toList).toList
}
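// Illustrative grouping (hypothetical rules): for the ordered rules
//   [Literal L1, Mapping M1 -> a.x, Mapping M2 -> a.y, Casting C1, Mapping M3 -> b.z]
// where a.x and a.y share the deepest array "a" and b.z lies in a different array "b",
// the result is [[L1], [M1, M2], [C1], [M3]]: only consecutive mapping rules whose
// outputs share the same deepest array end up in the same group.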
}