/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.tool.qualification

import java.util.concurrent.{ConcurrentHashMap, TimeUnit}

import scala.collection.JavaConverters._

import com.nvidia.spark.rapids.tool.{EventLogInfo, FailedEventLog, ToolBase}
import com.nvidia.spark.rapids.tool.qualification.QualOutputWriter.DEFAULT_JOB_FREQUENCY
import com.nvidia.spark.rapids.tool.tuning.TunerContext
import com.nvidia.spark.rapids.tool.views.QualRawReportGenerator
import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.rapids.tool.FailureApp
import org.apache.spark.sql.rapids.tool.qualification._
import org.apache.spark.sql.rapids.tool.ui.ConsoleProgressBar
import org.apache.spark.sql.rapids.tool.util._
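
/**
 * Driver for the qualification tool. Each event log in the input is analyzed
 * on the shared thread pool, the per-application summaries are accumulated in
 * a concurrent map, and the combined text/CSV reports are written under
 * `outputDir` once all logs finish (or the timeout expires).
 */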
class Qualification(outputPath: String, numRows: Int, hadoopConf: Configuration,
    timeout: Option[Long], nThreads: Int, order: String,
    pluginTypeChecker: PluginTypeChecker, reportReadSchema: Boolean,
    printStdout: Boolean, enablePB: Boolean,
    reportSqlLevel: Boolean, maxSQLDescLength: Int, mlOpsEnabled: Boolean,
    penalizeTransitions: Boolean, tunerContext: Option[TunerContext],
    clusterReport: Boolean) extends ToolBase(timeout) {

  override val simpleName: String = "qualTool"
  override val outputDir = s"$outputPath/rapids_4_spark_qualification_output"
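
  // Keyed by appId. When event logs for multiple attempts of the same
  // application are processed, the summary for the latest valid attempt
  // replaces any older one (see the multiple-attempt handling in qualifyApp).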
  private val allApps = new ConcurrentHashMap[String, QualificationSummaryInfo]()

  override def getNumThreads: Int = nThreads

  private class QualifyThread(path: EventLogInfo) extends Runnable {
    def run(): Unit = qualifyApp(path, hadoopConf)
  }
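
  /**
   * Processes all of the given event logs and returns the per-application
   * summaries, sorted for the detailed report. Submission failures are logged
   * and skipped; logs still being processed when the timeout expires are
   * abandoned via shutdownNow().
   */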
  def qualifyApps(allPaths: Seq[EventLogInfo]): Seq[QualificationSummaryInfo] = {
    if (enablePB && allPaths.nonEmpty) { // total count to start the PB cannot be 0
      progressBar = Some(new ConsoleProgressBar("Qual Tool", allPaths.length))
    }
    // generate metadata
    generateRuntimeReport()

    allPaths.foreach { path =>
      try {
        threadPool.submit(new QualifyThread(path))
      } catch {
        case e: Exception =>
          logError(s"Unexpected exception submitting log ${path.eventLog.toString}, skipping!", e)
      }
    }
    // wait for the threads to finish processing the files
    threadPool.shutdown()
    if (!threadPool.awaitTermination(waitTimeInSec, TimeUnit.SECONDS)) {
      logError(s"Processing log files took longer than $waitTimeInSec seconds," +
        " stopping processing any more event logs")
      threadPool.shutdownNow()
    }
    progressBar.foreach(_.finishAll())

    val allAppsSum = estimateAppFrequency(allApps.asScala.values.toSeq)
    // The sort order and limit only apply to the report summary text file;
    // the csv file contains the entire data set in descending order.
    val sortedDescDetailed = sortDescForDetailedReport(allAppsSum)
    generateQualificationReport(allAppsSum, sortedDescDetailed)
    sortedDescDetailed
  }

  private def sortDescForDetailedReport(
      allAppsSum: Seq[QualificationSummaryInfo]): Seq[QualificationSummaryInfo] = {
    // Default sorting of the csv files. Use the endTime to break ties.
    allAppsSum.sortBy(sum => {
      (sum.estimatedInfo.recommendation, sum.estimatedInfo.estimatedGpuSpeedup,
        sum.estimatedInfo.estimatedGpuTimeSaved, sum.startTime + sum.estimatedInfo.appDur)
    }).reverse
  }

  // Sorting for the pretty-printed executive summary.
  // The summary elements arrive in descending order, so we only need to
  // reverse them when ascending order is requested.
  private def sortForExecutiveSummary(appsSumDesc: Seq[QualificationSummaryInfo],
      order: String): Seq[EstimatedSummaryInfo] = {
    if (QualificationArgs.isOrderAsc(order)) {
      appsSumDesc.reverse.map(sum =>
        EstimatedSummaryInfo(
          sum.estimatedInfo, sum.estimatedFrequency.getOrElse(DEFAULT_JOB_FREQUENCY)))
    } else {
      appsSumDesc.map(sum => EstimatedSummaryInfo(
        sum.estimatedInfo, sum.estimatedFrequency.getOrElse(DEFAULT_JOB_FREQUENCY)))
    }
  }

  // Estimate each app's frequency based on all applications in this run,
  // in units of jobs per month.
  private def estimateAppFrequency(
      appsSum: Seq[QualificationSummaryInfo]): Seq[QualificationSummaryInfo] = {
    val appFrequency = scala.collection.mutable.Map[String, Double]()
    var windowStart: Long = Long.MaxValue
    var windowEnd: Long = Long.MinValue
    appsSum.foreach { sum =>
      appFrequency += (sum.appName -> (1.0 + appFrequency.getOrElse(sum.appName, 0.0)))
      windowStart = Math.min(sum.startTime, windowStart)
      windowEnd = Math.max(windowEnd, sum.startTime + sum.estimatedInfo.appDur)
    }
    val windowInMonths =
      if (windowEnd > windowStart) ((windowEnd - windowStart) / (1000.0*60*60*24*30)) else 1.0
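    // 1000.0*60*60*24*30 is the number of milliseconds in a 30-day month, so
    // windowInMonths is the logging window expressed in months. For example,
    // a 45-day window gives 1.5 months, and 3 runs of one app over that window
    // yield (3 / 1.5).round = 2 jobs per month.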
    // Scale frequency to per-month, assuming a uniform distribution over the
    // logging window rather than each individual application's window.
    // Single-run jobs are given a default frequency.
    val monthlyFrequency = appFrequency.map { case (appName, numApps) => (appName ->
      (if (numApps <= 1) DEFAULT_JOB_FREQUENCY else (numApps / windowInMonths).round))
    }
    appsSum.map { app =>
      val frequency = monthlyFrequency.getOrElse(app.appName, DEFAULT_JOB_FREQUENCY)
      // Ensure jobs have a valid frequency, rounding up to 1 (monthly)
      app.copy(estimatedFrequency =
        Option(if (frequency <= 0) 1 else frequency))
    }
  }
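
  /**
   * Analyzes a single event log and records the outcome in appStatusReporter.
   * Failed logs are reported immediately; otherwise a QualificationAppInfo is
   * built, raw metrics are written, the autotuner is invoked when enabled, and
   * the aggregated summary for the latest valid attempt is stored in allApps.
   */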
  private def qualifyApp(
      path: EventLogInfo,
      hadoopConf: Configuration): Unit = {
    val pathStr = path.eventLog.toString
    try {
      // Early handling of failed event logs
      path match {
        case failedEventLog: FailedEventLog =>
          handleFailedEventLogs(failedEventLog)
          return
        case _ => // No action needed for other cases
      }
      val startTime = System.currentTimeMillis()
      val appResult = QualificationAppInfo.createApp(path, hadoopConf, pluginTypeChecker,
        reportSqlLevel, mlOpsEnabled, penalizeTransitions)
      val qualAppResult = appResult match {
        case Left(FailureApp("skipped", errorMessage)) =>
          // Case to be skipped, e.g. we encountered a Databricks Photon event log
          progressBar.foreach(_.reportSkippedProcess())
          SkippedAppResult(pathStr, errorMessage)
        case Left(FailureApp(_, errorMessage)) =>
          // Case when some other error occurred during QualificationAppInfo creation
          progressBar.foreach(_.reportUnkownStatusProcess())
          UnknownAppResult(pathStr, "", errorMessage)
        case Right(app: QualificationAppInfo) =>
          // Case with successful creation of QualificationAppInfo
          // First, generate the Raw metrics view
          val appIndex = 1
          // It is a bit ugly to overload writing out the report with returning
          // the DataSource information, but this encapsulates the analyzer to
          // keep the memory usage smaller.
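          // withSafeValidAttempt only runs the closure when (appId, attemptId)
          // is the newest attempt seen so far and returns None otherwise, which
          // is why stale attempts fall through to the "skipped" branch below.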
          val dsInfo =
            AppSubscriber.withSafeValidAttempt(app.appId, app.attemptId) { () =>
              QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo(
                outputDir, app, appIndex)
            }.getOrElse(Seq.empty)
          val qualSumInfo = app.aggregateStats()
          AppSubscriber.withSafeValidAttempt(app.appId, app.attemptId) { () =>
            tunerContext.foreach { tuner =>
              // Run the autotuner if it is enabled.
              // Note that we call the autotuner anyway, without checking the
              // aggregate results, because the Autotuner can still make some
              // recommendations based on the information enclosed by the
              // QualificationInfo object.
              tuner.tuneApplication(app, qualSumInfo, appIndex, dsInfo)
            }
          }
          if (qualSumInfo.isDefined) {
            // add the recommended cluster info into the summary
            val tempSummary = qualSumInfo.get
            val newClusterSummary = tempSummary.clusterSummary.copy(
              recommendedClusterInfo = pluginTypeChecker.platform.recommendedClusterInfo)
            AppSubscriber.withSafeValidAttempt(app.appId, app.attemptId) { () =>
              val newQualSummary = tempSummary.copy(clusterSummary = newClusterSummary)
              // check if the app is already in the map
              if (allApps.containsKey(app.appId)) {
                // fix the progress bar counts
                progressBar.foreach(_.adjustCounterForMultipleAttempts())
                logInfo(s"Removing older app summary for app: ${app.appId} " +
                  s"before adding the new one with attempt: ${app.attemptId}")
              }
              progressBar.foreach(_.reportSuccessfulProcess())
              allApps.put(app.appId, newQualSummary)
              val endTime = System.currentTimeMillis()
              SuccessAppResult(pathStr, app.appId, app.attemptId,
                s"Took ${endTime - startTime}ms to process")
            } match {
              case Some(successfulResult) => successfulResult
              case _ =>
                // If the attemptId is an older attemptId, skip this attempt.
                // This can happen when the user has provided event logs for
                // multiple attempts of the same application.
                progressBar.foreach(_.reportSkippedProcess())
                SkippedAppResult.fromAppAttempt(pathStr, app.appId, app.attemptId)
            }
          } else {
            progressBar.foreach(_.reportUnkownStatusProcess())
            UnknownAppResult(pathStr, app.appId,
              "No aggregated stats for event log")
          }
      }
      // Log the information to the console
      qualAppResult.logMessage()
      // Update the appStatusReporter with the result of QualificationAppInfo processing
      appStatusReporter.put(pathStr, qualAppResult)
    } catch {
      case oom: OutOfMemoryError =>
        logError(s"OOM error while processing large file: $pathStr. " +
          "Increase heap size.", oom)
        System.exit(1)
      case o: Error =>
        logError(s"Error occurred while processing file: $pathStr", o)
        System.exit(1)
      case e: Exception =>
        progressBar.foreach(_.reportFailedProcess())
        val failureAppResult = FailureAppResult(pathStr,
          "Unexpected exception processing log, skipping!")
        failureAppResult.logMessage(Some(e))
        appStatusReporter.put(pathStr, failureAppResult)
    }
  }

  /**
   * The output path of the current instance of the provider.
   */
  def getReportOutputPath: String = {
    s"$outputPath/rapids_4_spark_qualification_output"
  }

  /**
   * Generates a qualification report based on the provided summary information.
   */
  private def generateQualificationReport(allAppsSum: Seq[QualificationSummaryInfo],
      sortedDescDetailed: Seq[QualificationSummaryInfo]): Unit = {
    val qWriter = new QualOutputWriter(outputDir, reportReadSchema, printStdout,
      order)
    qWriter.writeTextReport(allAppsSum,
      sortForExecutiveSummary(sortedDescDetailed, order), numRows)
    qWriter.writeDetailedCSVReport(sortedDescDetailed)
    if (reportSqlLevel) {
      qWriter.writePerSqlTextReport(allAppsSum, numRows, maxSQLDescLength)
      qWriter.writePerSqlCSVReport(allAppsSum, maxSQLDescLength)
    }
    qWriter.writeExecReport(allAppsSum, order)
    qWriter.writeStageReport(allAppsSum, order)
    qWriter.writeUnsupportedOpsSummaryCSVReport(allAppsSum)
    val appStatusResult = generateStatusResults(appStatusReporter.asScala.values.toSeq)
    qWriter.writeStatusReport(appStatusResult, order)
    if (mlOpsEnabled) {
      if (allAppsSum.exists(x => x.mlFunctions.nonEmpty)) {
        qWriter.writeMlFuncsReports(allAppsSum, order)
        qWriter.writeMlFuncsTotalDurationReports(allAppsSum)
      } else {
        logWarning("Event logs don't contain any ML functions")
      }
    }
    if (clusterReport) {
      qWriter.writeClusterReport(allAppsSum)
      qWriter.writeClusterReportCsv(allAppsSum)
    }
  }
}
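
// A minimal usage sketch (not part of the original file). The argument values
// below are illustrative assumptions, the PluginTypeChecker construction is a
// guess at its signature, and building the EventLogInfo sequence is left to
// the caller (the CLI normally derives it from the command-line arguments).
//
//   val hadoopConf = new org.apache.hadoop.conf.Configuration()
//   val qual = new Qualification(
//     outputPath = "/tmp/qual-out", numRows = 1000, hadoopConf = hadoopConf,
//     timeout = Some(600L), nThreads = 4, order = "desc",
//     pluginTypeChecker = new PluginTypeChecker(), reportReadSchema = false,
//     printStdout = false, enablePB = true, reportSqlLevel = false,
//     maxSQLDescLength = 100, mlOpsEnabled = false, penalizeTransitions = true,
//     tunerContext = None, clusterReport = true)
//   val summaries = qual.qualifyApps(eventLogInfos)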