Maven / Gradle / Ivy
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.functions.{col, lit}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.LoggerFactory
class OptimizerTimeTracker(inputDf: DataFrame, isWorkaroundEnabled: Boolean)(implicit spark: SparkSession) {
import spark.implicits._
// SLF4J logger named after this class.
private val log = LoggerFactory.getLogger(this.getClass)
// Additional optimization time (in ms) tolerated for every applied rule; it is multiplied by the
// number of applied rules when deciding whether plan generation takes too long.
private val maxToleratedPlanGenerationPerRuleMs = 100L
// Starting value (in ms) of the optimization time baseline.
private val initialElapsedTimeBaselineMs = 250L
// Current baseline (in ms); it only ever grows, being raised to the maximum of itself and an accepted measurement.
private var baselineTimeMs = initialElapsedTimeBaselineMs
// Most recently accepted optimization time (in ms); 0 means that nothing has been recorded yet.
private var lastExecutionPlanOptimizationTime = 0L
// Name of the temporary id column, derived from "tmpId" and the input schema via SchemaUtils.getUniqueName.
// NOTE(review): presumably the helper guarantees the name does not clash with an existing column - confirm.
private val idField1 = SchemaUtils.getUniqueName("tmpId", Option(inputDf.schema))
// Name of the second temporary id column: the first name with a "_2" suffix.
private val idField2 = s"${idField1}_2"
// The input dataframe extended with the temporary id column holding the literal 1 in every row.
private val dfWithId = inputDf.withColumn(idField1, lit(1))
// A cached dataframe with a single row and a single column (named idField2) holding the value 1.
private val dfJustId = Seq(1).toDF(idField2).cache()
/**
  * Returns a dataframe prepared to apply the Catalyst workaround.
  *
  * When the workaround is enabled this is the input dataframe extended with the temporary id column;
  * otherwise it is the input dataframe itself.
  *
  * @return The dataframe that further processing should start from
  */
def getWorkaroundDataframe: DataFrame = {
  // NOTE(review): the results of both branches were missing from this method. They are restored from
  // the two dataframes the constructor prepares (`dfWithId` and `inputDf`) - confirm against version control.
  if (isWorkaroundEnabled) {
    dfWithId
  } else {
    inputDf
  }
}
/**
  * Cleans up the additional field used for applying the Catalyst workaround.
  *
  * @param df A dataframe from which the temporary id column should be removed
  * @return The dataframe with the temporary id column dropped
  */
def cleanupWorkaroundDf(df: DataFrame): DataFrame = {
  val workaroundIdColumn = col(idField1)
  df.drop(workaroundIdColumn)
}
/**
  * Returns true if a dataframe might require a Catalyst issue workaround.
  *
  * A workaround is required if the execution plan optimization step takes too long. When the workaround is
  * disabled nothing is measured and the method returns false.
  *
  * As a side effect, a measurement that is not considered too long becomes the new reference: it raises the
  * baseline and is remembered as the last execution plan optimization time.
  *
  * @param df           A dataframe that might require a Catalyst workaround
  * @param rulesApplied The number of conformance rules applied to the dataframe so far
  * @return true if a workaround is required
  */
def isCatalystWorkaroundRequired(df: DataFrame, rulesApplied: Int): Boolean = {
  if (isWorkaroundEnabled) {
    val currentExecutionPlanOptimizationTime = getExecutionPlanGenerationTimeMs(df)

    // The algorithm for determining when to apply a workaround is based on 2 triggers.
    // 1. If it takes more than 4 times longer to optimize an execution plan now in comparison to the execution
    //    plan optimization time before the last conformance rule was applied, then it is too much.
    //    The check multiplies instead of dividing: a Long division truncates, so a slowdown between 4x and 5x
    //    would go unnoticed. Without a previous measurement (0) this trigger never fires.
    val tooBigTimeDifference = lastExecutionPlanOptimizationTime > 0 &&
      currentExecutionPlanOptimizationTime > 4 * lastExecutionPlanOptimizationTime

    // 2. If the time it takes to optimize an execution plan grows faster than a linear function of the number of
    //    applied rules, it is also considered too long. Consider a case when after each application of a
    //    conformance rule it takes twice as much time. The n.1 condition won't be triggered and the execution
    //    time will grow exponentially. This is a protection against such a case.
    val tooLong = tooBigTimeDifference ||
      currentExecutionPlanOptimizationTime > baselineTimeMs + maxToleratedPlanGenerationPerRuleMs * rulesApplied

    val msg = s"Execution optimization time for $rulesApplied rules: $currentExecutionPlanOptimizationTime ms"
    if (tooLong) {
      log.warn(s"$msg (Too long!)")
    } else {
      // The last 'approved' execution plan generation time is remembered as a baseline so that the condition n.2
      // is not triggered based solely on a long execution plan generation time. It should be triggered on a big
      // increase of execution plan generation time instead.
      baselineTimeMs = Math.max(baselineTimeMs, currentExecutionPlanOptimizationTime)
      lastExecutionPlanOptimizationTime = currentExecutionPlanOptimizationTime
      // NOTE(review): this statement was fused with the assignment above and only its message survived;
      // the log level follows the other log calls of this class - confirm against version control.
      log.warn(s"New baseline execution plan optimization time: $baselineTimeMs ms")
    }
    tooLong
  } else {
    // The workaround is switched off, so it is never requested.
    false
  }
}
/**
  * Measures the execution plan optimization time of a dataframe and stores it as the new reference.
  *
  * This is used after a workaround and a conformance rule are applied to a dataframe. The measured time
  * becomes the last known optimization time, and the baseline is raised to it if it exceeds the baseline.
  *
  * @param df A dataframe for measuring execution plan optimization time
  */
def recordExecutionPlanOptimizationTime(df: DataFrame): Unit = {
  val measuredMs = getExecutionPlanGenerationTimeMs(df)
  lastExecutionPlanOptimizationTime = measuredMs
  // The baseline never decreases.
  if (measuredMs > baselineTimeMs) {
    baselineTimeMs = measuredMs
  }
}
/**
  * Returns elapsed time of execution plan optimization
  *
  * @param df A dataframe to calculate execution plan optimization time
  * @return Elapsed time in milliseconds
  */
// Returns the wall-clock time in milliseconds that elapses between the two System.currentTimeMillis()
// readings taken around the streaming / non-streaming branches below.
def getExecutionPlanGenerationTimeMs(df: DataFrame): Long = {
// Start of the measured interval.
val t0 = System.currentTimeMillis()
// NOTE(review): the statements that should run inside both branches (presumably the ones forcing the
// execution plan to be generated), as well as the closing braces, are missing from this copy of the
// file - restore them from version control before relying on the measured value.
if (df.isStreaming) {
// Ensures the execution plan won't be taken from cache
} else {
// End of the measured interval; the difference to the start is the result of the method.
val t1 = System.currentTimeMillis()
t1 - t0
/**
  * Applies a Catalyst workaround by joining a dataframe with the dataframe containing only unique ids.
  *
  * @param dfWithId A dataframe containing a unique id field for which execution plan generation takes too long
  * @return A new dataframe with Catalyst optimizer issue workaround applied
  */
def applyCatalystWorkaround(dfWithId: DataFrame): DataFrame = {
if (isWorkaroundEnabled) {
log.warn("A Catalyst optimizer issue workaround is applied.")
} else {