/*
 * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nvidia.spark.rapids.tool.qualification

import java.util.concurrent.atomic.AtomicLong

import scala.collection.mutable
import scala.collection.mutable.{Buffer, LinkedHashMap, ListBuffer}

import com.nvidia.spark.rapids.tool.ToolTextFileWriter
import com.nvidia.spark.rapids.tool.planparser.{DatabricksParseHelper, ExecInfo, PlanInfo, UnsupportedExecSummary}
import com.nvidia.spark.rapids.tool.profiling.AppStatusResult
import com.nvidia.spark.rapids.tool.profiling.ProfileUtils.replaceDelimiter
import com.nvidia.spark.rapids.tool.qualification.QualOutputWriter.{CLUSTER_ID, CLUSTER_ID_STR_SIZE, JOB_ID, JOB_ID_STR_SIZE, RUN_NAME, RUN_NAME_STR_SIZE, TEXT_DELIMITER}
import org.apache.hadoop.conf.Configuration
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization

import org.apache.spark.sql.rapids.tool.ToolUtils
import org.apache.spark.sql.rapids.tool.qualification.{EstimatedPerSQLSummaryInfo, EstimatedSummaryInfo, QualificationSummaryInfo}
import org.apache.spark.sql.rapids.tool.util._

/**
 * This class handles the output files for qualification.
 * It can write both a raw CSV file and a text summary report.
 *
 * @param outputDir The directory to output the files to
 * @param reportReadSchema Whether to include the read data source schema in CSV output
 * @param printStdout Indicates if the summary report should be printed to stdout as well
 * @param prettyPrintOrder The order in which to print the text output
 * @param hadoopConf Optional Hadoop Configuration to use
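 *
 * A minimal usage sketch (hypothetical output directory and argument values; the
 * summary sequences are assumed to come from the qualification analysis):
 * {{{
 *   val writer = new QualOutputWriter("/tmp/qual_output", reportReadSchema = false,
 *     printStdout = false, prettyPrintOrder = "desc")
 *   writer.writeDetailedCSVReport(summaries)
 *   writer.writeTextReport(summaries, estSummaries, numOutputRows = 20)
 * }}}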
 */
class QualOutputWriter(outputDir: String, reportReadSchema: Boolean,
    printStdout: Boolean, prettyPrintOrder: String,
    hadoopConf: Option[Configuration] = None) {

  implicit val formats: DefaultFormats.type = DefaultFormats

  def writeDetailedCSVReport(sums: Seq[QualificationSummaryInfo]): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}.csv", "CSV", hadoopConf)
    try {
      writeDetailedCSVReport(csvFileWriter, sums)
    } finally {
      csvFileWriter.close()
    }
  }

  protected def writeDetailedCSVReport(csvFileWriter: ToolTextFileWriter,
      sums: Seq[QualificationSummaryInfo]): Unit = {
    val headersAndSizes = QualOutputWriter.getDetailedHeaderStringsAndSizes(sums,
      reportReadSchema)
    csvFileWriter.write(QualOutputWriter.constructDetailedHeader(headersAndSizes,
      QualOutputWriter.CSV_DELIMITER, prettyPrint = false))
    sums.foreach { sum =>
      csvFileWriter.write(QualOutputWriter.constructAppDetailedInfo(sum, headersAndSizes,
        QualOutputWriter.CSV_DELIMITER, prettyPrint = false, reportReadSchema = reportReadSchema))
    }
  }

  // write the text summary report
  def writeTextReport(sums: Seq[QualificationSummaryInfo], estSums: Seq[EstimatedSummaryInfo],
      numOutputRows: Int): Unit = {
    val textFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}.log", "Summary Report", hadoopConf)
    try {
      writeTextReport(textFileWriter, sums, estSums, numOutputRows)
    } finally {
      textFileWriter.close()
    }
  }

  protected def writeTextReport(writer: ToolTextFileWriter,
      sums: Seq[QualificationSummaryInfo], estSums: Seq[EstimatedSummaryInfo],
      numOutputRows: Int): Unit = {
    val appIdMaxSize = QualOutputWriter.getAppIdSize(sums)
    val unSupExecMaxSize = QualOutputWriter.getUnsupportedMaxSize(
      sums.map(_.unSupportedExecs.size),
      QualOutputWriter.UNSUPPORTED_EXECS_MAX_SIZE,
      QualOutputWriter.UNSUPPORTED_EXECS.size)
    val unSupExprMaxSize = QualOutputWriter.getUnsupportedMaxSize(
      sums.map(_.unSupportedExprs.size),
      QualOutputWriter.UNSUPPORTED_EXPRS_MAX_SIZE,
      QualOutputWriter.UNSUPPORTED_EXPRS.size)
    val estimatedFrequencyMaxSize = QualOutputWriter.ESTIMATED_FREQUENCY_MAX_SIZE
    val appNameMaxSize = QualOutputWriter.getAppNameSize(sums)
    val hasClusterTags = sums.exists(_.clusterTags.nonEmpty)
    val (clusterIdMaxSize, jobIdMaxSize, runNameMaxSize) = if (hasClusterTags) {
      (QualOutputWriter.getMaxSizeForHeader(sums.map(_.allClusterTagsMap.getOrElse(
        CLUSTER_ID, "").size), QualOutputWriter.CLUSTER_ID),
        QualOutputWriter.getMaxSizeForHeader(sums.map(_.allClusterTagsMap.getOrElse(
          JOB_ID, "").size), QualOutputWriter.JOB_ID),
        QualOutputWriter.getMaxSizeForHeader(sums.map(_.allClusterTagsMap.getOrElse(
          RUN_NAME, "").size), QualOutputWriter.RUN_NAME))
    } else {
      (CLUSTER_ID_STR_SIZE, JOB_ID_STR_SIZE, RUN_NAME_STR_SIZE)
    }
    val headersAndSizes = QualOutputWriter.getSummaryHeaderStringsAndSizes(
      appNameMaxSize, appIdMaxSize, unSupExecMaxSize, unSupExprMaxSize, estimatedFrequencyMaxSize,
      hasClusterTags, clusterIdMaxSize, jobIdMaxSize, runNameMaxSize)
    val entireHeader = QualOutputWriter.constructOutputRowFromMap(headersAndSizes,
      TEXT_DELIMITER, prettyPrint = true)
    val sep = "=" * (entireHeader.size - 1)
    writer.write(s"$sep\n")
    writer.write(entireHeader)
    writer.write(s"$sep\n")
    // write to stdout as well
    if (printStdout) {
      print("APPLICATION SUMMARY:\n")
      print(s"$sep\n")
      print(entireHeader)
      print(s"$sep\n")
    }
    val finalSums = estSums.take(numOutputRows)
    finalSums.foreach { sumInfo =>
      val wStr = QualOutputWriter.constructAppSummaryInfo(sumInfo, headersAndSizes,
        appIdMaxSize, unSupExecMaxSize, unSupExprMaxSize, estimatedFrequencyMaxSize, hasClusterTags,
        clusterIdMaxSize, jobIdMaxSize, runNameMaxSize, TEXT_DELIMITER, true)
      writer.write(wStr)
      if (printStdout) print(wStr)
    }
    writer.write(s"$sep\n")
    if (printStdout) print(s"$sep\n")
  }

  def writeStageReport(sums: Seq[QualificationSummaryInfo], order: String): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_stages.csv",
      "Stage Exec Info", hadoopConf)
    try {
      val headersAndSizes = QualOutputWriter.getDetailedStagesHeaderStrings
      csvFileWriter.write(QualOutputWriter.constructDetailedHeader(headersAndSizes, ",", false))
      sums.foreach { sumInfo =>
        val rows = QualOutputWriter.constructStagesInfo(sumInfo, headersAndSizes, ",", false)
        rows.foreach(csvFileWriter.write(_))
      }
    } finally {
      csvFileWriter.close()
    }
  }

  def writeUnsupportedOpsSummaryCSVReport(
      sums: Seq[QualificationSummaryInfo]): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_unsupportedOperators.csv",
      "Unsupported Operators Detailed Stage Duration CSV Report", hadoopConf)
    try {
      val headersAndSizes = QualOutputWriter.getUnsupportedOperatorsHeaderStrings
      csvFileWriter.write(QualOutputWriter.constructOutputRowFromMap(headersAndSizes,
        QualOutputWriter.CSV_DELIMITER))
      sums.foreach { sum =>
        QualOutputWriter.constructUnsupportedDetailedStagesDurationInfo(csvFileWriter,
          sum, headersAndSizes,
          QualOutputWriter.CSV_DELIMITER, false)
      }
    } finally {
      csvFileWriter.close()
    }
  }

  def writePerSqlCSVReport(sums: Seq[QualificationSummaryInfo], maxSQLDescLength: Int): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_persql.csv",
      "Per SQL CSV Report", hadoopConf)
    try {
      val appNameSize = QualOutputWriter.getAppNameSize(sums)
      val appIdSize = QualOutputWriter.getAppIdSize(sums)
      val sqlDescSize =
        QualOutputWriter.getSqlDescSize(sums, maxSQLDescLength, QualOutputWriter.CSV_DELIMITER)
      val headersAndSizes =
        QualOutputWriter.getDetailedPerSqlHeaderStringsAndSizes(appNameSize, appIdSize, sqlDescSize)
      csvFileWriter.write(QualOutputWriter.constructDetailedHeader(headersAndSizes,
        QualOutputWriter.CSV_DELIMITER, false))
      val sortedInfo = sortPerSqlInfo(sums)
      sortedInfo.foreach { sumInfo =>
        val row = QualOutputWriter.constructPerSqlSummaryInfo(sumInfo, headersAndSizes,
          appIdSize, QualOutputWriter.CSV_DELIMITER, false, maxSQLDescLength)
        csvFileWriter.write(row)
      }
    } finally {
      csvFileWriter.close()
    }
  }

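  // Sorts the per-SQL estimates ascending by (recommendation, estimated GPU speedup,
  // estimated GPU time saved, app duration, app ID), then reverses the result when a
  // descending order was requested.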
  private def sortPerSqlInfo(
      sums: Seq[QualificationSummaryInfo]): Seq[EstimatedPerSQLSummaryInfo] = {
    val estSumPerSql = sums.flatMap(_.perSQLEstimatedInfo).flatten
    val sortedAsc = estSumPerSql.sortBy(sum => {
      (sum.info.recommendation, sum.info.estimatedGpuSpeedup,
        sum.info.estimatedGpuTimeSaved, sum.info.appDur, sum.info.appId)
    })
    if (QualificationArgs.isOrderAsc(prettyPrintOrder)) {
      sortedAsc
    } else {
      sortedAsc.reverse
    }
  }
  private def writePerSqlTextSummary(writer: ToolTextFileWriter,
      sums: Seq[QualificationSummaryInfo],
      numOutputRows: Int, maxSQLDescLength: Int): Unit = {
    val appNameSize = QualOutputWriter.getAppNameSize(sums)
    val appIdSize = QualOutputWriter.getAppIdSize(sums)
    val sqlDescSize =
      QualOutputWriter.getSqlDescSize(sums, maxSQLDescLength, QualOutputWriter.TEXT_DELIMITER)
    val headersAndSizes =
      QualOutputWriter.getDetailedPerSqlHeaderStringsAndSizes(appNameSize, appIdSize, sqlDescSize)
    val entireHeader = QualOutputWriter.constructOutputRowFromMap(headersAndSizes,
      TEXT_DELIMITER, true)
    val sep = "=" * (entireHeader.size - 1)
    writer.write(s"$sep\n")
    writer.write(entireHeader)
    writer.write(s"$sep\n")
    // write to stdout as well
    if (printStdout) {
      print("PER SQL SUMMARY:\n")
      print(s"$sep\n")
      print(entireHeader)
      print(s"$sep\n")
    }
    val sortedInfo = sortPerSqlInfo(sums)
    val finalSums = sortedInfo.take(numOutputRows)
    finalSums.foreach { estInfo =>
      val wStr = QualOutputWriter.constructPerSqlSummaryInfo(estInfo, headersAndSizes,
        appIdSize, TEXT_DELIMITER, true, maxSQLDescLength, false)
      writer.write(wStr)
      if (printStdout) print(wStr)
    }
    writer.write(s"$sep\n")
    if (printStdout) print(s"$sep\n")
  }

  def writePerSqlTextReport(sums: Seq[QualificationSummaryInfo], numOutputRows: Int,
      maxSQLDescLength: Int): Unit = {
    val textFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_persql.log",
      "Per SQL Summary Report", hadoopConf)
    try {
      writePerSqlTextSummary(textFileWriter, sums, numOutputRows, maxSQLDescLength)
    } finally {
      textFileWriter.close()
    }
  }

  def writeExecReport(sums: Seq[QualificationSummaryInfo], order: String): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_execs.csv",
      "Plan Exec Info", hadoopConf)
    try {
      val headersAndSizes = QualOutputWriter.getDetailedExecsHeaderStrings
      csvFileWriter.write(QualOutputWriter.constructDetailedHeader(headersAndSizes, ",", false))
      sums.foreach { sumInfo =>
        val appRows = QualOutputWriter.constructExecsInfo(sumInfo, headersAndSizes, ",", false)
        appRows.foreach(csvFileWriter.write(_))
      }
    } finally {
      csvFileWriter.close()
    }
  }

  def writeClusterReport(sums: Seq[QualificationSummaryInfo]): Unit = {
    val jsonFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_cluster_information.json",
      "Cluster Information", hadoopConf)
    try {
      // Append new line at end of JSON string
      jsonFileWriter.write(Serialization.writePretty(sums.map(_.clusterSummary)) + "\n")
    } finally {
      jsonFileWriter.close()
    }
  }

  def writeClusterReportCsv(sums: Seq[QualificationSummaryInfo]): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_cluster_information.csv",
      "Cluster Information", hadoopConf)
    try {
      val headersAndSizes = QualOutputWriter.getClusterInfoHeaderStrings
      csvFileWriter.write(QualOutputWriter.constructDetailedHeader(
        headersAndSizes, ",", prettyPrint = false))
      sums.foreach { sumInfo =>
        val appRows = QualOutputWriter.constructClusterInfo(sumInfo, headersAndSizes, ",",
          prettyPrint = false)
        appRows.foreach(csvFileWriter.write)
      }
    } finally {
      csvFileWriter.close()
    }
  }

  def writeMlFuncsReports(sums: Seq[QualificationSummaryInfo], order: String): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_mlfunctions.csv",
      "", hadoopConf)
    try {
      val headersAndSizes = QualOutputWriter.getDetailedMlFuncsHeaderStringsAndSizes(sums)
      csvFileWriter.write(QualOutputWriter.constructDetailedHeader(headersAndSizes, ",", false))
      sums.foreach { sumInfo =>
        val rows = QualOutputWriter.constructMlFuncsInfo(sumInfo, headersAndSizes, ",", false)
        rows.foreach(csvFileWriter.write(_))
      }
    } finally {
      csvFileWriter.close()
    }
  }

  def writeMlFuncsTotalDurationReports(sums: Seq[QualificationSummaryInfo]): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_mlfunctions_totalduration.csv",
      "", hadoopConf)
    try {
      // Keep only the apps that have supported ML functions (i.e. with stage durations)
      val supportedMlFuns = sums.filter(x => x.mlFunctionsStageDurations.nonEmpty)
      val headersAndSizes =
        QualOutputWriter.getDetailedMlFuncsTotalDurationHeaderStringsAndSizes(supportedMlFuns)
      csvFileWriter.write(QualOutputWriter.constructDetailedHeader(headersAndSizes, ",", false))
      supportedMlFuns.foreach { sumInfo =>
        val rows = QualOutputWriter.constructMlFuncsTotalDurationInfo(
          sumInfo, headersAndSizes, ",", false, true)
        rows.foreach(csvFileWriter.write(_))
      }
    } finally {
      csvFileWriter.close()
    }
  }

  def writeStatusReport(statusReports: Seq[AppStatusResult], order: String): Unit = {
    val csvFileWriter = new ToolTextFileWriter(outputDir,
      s"${QualOutputWriter.LOGFILE_NAME}_status.csv",
      "Status Report Info", hadoopConf)
    try {
      val headersAndSizes = QualOutputWriter
        .getDetailedStatusHeaderStringsAndSizes(statusReports)
      csvFileWriter.write(
        QualOutputWriter.constructDetailedHeader(headersAndSizes, ",", prettyPrint = false))
      statusReports.foreach { statusReport =>
        val rows = QualOutputWriter.constructStatusReportInfo(
          statusReport, headersAndSizes, ",", prettyPrint = false)
        rows.foreach(csvFileWriter.write)
      }
    } finally {
      csvFileWriter.close()
    }
  }
}

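/**
 * A flattened, pre-formatted view of [[QualificationSummaryInfo]] with durations and
 * percentages already truncated and collection fields rendered as delimited strings,
 * ready to be written out as a CSV or text row.
 */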
case class FormattedQualificationSummaryInfo(
    appName: String,
    appId: String,
    estimatedGpuDur: Double,
    estimatedGpuTimeSaved: Double,
    sqlDataframeDuration: Long,
    sqlDataframeTaskDuration: Long,
    appDuration: Long,
    gpuOpportunity: Long,
    executorCpuTimePercent: Double,
    failedSQLIds: String,
    readFileFormatAndTypesNotSupported: String,
    readFileFormats: String,
    writeDataFormat: String,
    complexTypes: String,
    nestedComplexTypes: String,
    potentialProblems: String,
    longestSqlDuration: Long,
    sqlStageDurationsSum: Long,
    nonSqlTaskDurationAndOverhead: Long,
    unsupportedSQLTaskDuration: Long,
    supportedSQLTaskDuration: Long,
    endDurationEstimated: Boolean,
    unSupportedExecs: String,
    unSupportedExprs: String,
    clusterTags: Map[String, String],
    estimatedFrequency: Long,
    totalCoreSec: Long)

object QualOutputWriter {
  val NON_SQL_TASK_DURATION_STR = "NonSQL Task Duration"
  val SQL_ID_STR = "SQL ID"
  val ROOT_SQL_ID_STR = "Root SQL ID"
  val SQL_DESC_STR = "SQL Description"
  val STAGE_ID_STR = "Stage ID"
  val APP_ID_STR = "App ID"
  val APP_NAME_STR = "App Name"
  val APP_DUR_STR = "App Duration"
  val SQL_DUR_STR = "SQL DF Duration"
  val TASK_DUR_STR = "SQL Dataframe Task Duration"
  val STAGE_DUR_STR = "Stage Task Duration"
  val STAGE_WALLCLOCK_DUR_STR = "Stage Duration"
  val SQL_STAGE_DUR_SUM_STR = "SQL Stage Durations Sum"
  val POT_PROBLEM_STR = "Potential Problems"
  val EXEC_CPU_PERCENT_STR = "Executor CPU Time Percent"
  val APP_DUR_ESTIMATED_STR = "App Duration Estimated"
  val SQL_IDS_FAILURES_STR = "SQL Ids with Failures"
  val READ_FILE_FORMAT_TYPES_STR = "Unsupported Read File Formats and Types"
  val WRITE_DATA_FORMAT_STR = "Unsupported Write Data Format"
  val COMPLEX_TYPES_STR = "Complex Types"
  val NESTED_TYPES_STR = "Nested Complex Types"
  val READ_SCHEMA_STR = "Read Schema"
  val NONSQL_DUR_STR = "NONSQL Task Duration Plus Overhead"
  val UNSUPPORTED_TASK_DURATION_STR = "Unsupported Task Duration"
  val SUPPORTED_SQL_TASK_DURATION_STR = "Supported SQL DF Task Duration"
  val LONGEST_SQL_DURATION_STR = "Longest SQL Duration"
  val EXEC_STR = "Exec Name"
  val EXPR_STR = "Expression Name"
  val EXEC_DURATION = "Exec Duration"
  val EXEC_NODEID = "SQL Node Id"
  val EXEC_IS_SUPPORTED = "Exec Is Supported"
  val EXEC_STAGES = "Exec Stages"
  val EXEC_SHOULD_REMOVE = "Exec Should Remove"
  val EXEC_SHOULD_IGNORE = "Exec Should Ignore"
  val EXEC_CHILDREN = "Exec Children"
  val EXEC_CHILDREN_NODE_IDS = "Exec Children Node Ids"
  val GPU_OPPORTUNITY_STR = "GPU Opportunity"
  val ESTIMATED_GPU_DURATION = "Estimated GPU Duration"
  val ESTIMATED_GPU_TIMESAVED = "Estimated GPU Time Saved"
  val STAGE_ESTIMATED_STR = "Stage Estimated"
  val NUM_TRANSITIONS = "Number of transitions from or to GPU"
  val UNSUPPORTED_EXECS = "Unsupported Execs"
  val UNSUPPORTED_EXPRS = "Unsupported Expressions"
  val UNSUPPORTED_OPERATOR = "Unsupported Operator"
  val CLUSTER_TAGS = "Cluster Tags"
  val CLUSTER_ID = DatabricksParseHelper.SUB_PROP_CLUSTER_ID
  val JOB_ID = DatabricksParseHelper.SUB_PROP_JOB_ID
  val UNSUPPORTED_TYPE = "Unsupported Type"
  val EXEC_ID = "ExecId"
  val DETAILS = "Details"
  private val EXEC_ACTION = "Action"
  val RUN_NAME = DatabricksParseHelper.SUB_PROP_RUN_NAME
  val ESTIMATED_FREQUENCY = "Estimated Job Frequency (monthly)"
  val ML_FUNCTIONS = "ML Functions"
  val ML_FUNCTION_NAME = "ML Function Name"
  val ML_TOTAL_STAGE_DURATION = "Total Duration"
  val ML_STAGE_IDS = "Stage Ids"
  val STATUS_REPORT_PATH_STR = "Event Log"
  val STATUS_REPORT_STATUS_STR = "Status"
  val STATUS_REPORT_APP_ID = "AppID"
  val STATUS_REPORT_DESC_STR = "Description"
  val VENDOR = "Vendor"
  val DRIVER_HOST = "Driver Host"
  val CLUSTER_ID_STR = "Cluster Id" // Different from ClusterId used for Databricks Tags
  val CLUSTER_NAME = "Cluster Name"
  val RECOMMENDED_NUM_GPUS = "Recommended Num GPUs Per Node"
  val RECOMMENDED_GPU_DEVICE = "Recommended GPU Device"
  val NUM_EXECS_PER_NODE = "Num Executors Per Node"
  val RECOMMENDED_NUM_EXECS = "Recommended Num Executors"
  val NUM_WORKER_NODES = "Num Worker Nodes"
  val RECOMMENDED_NUM_WORKER_NODES = "Recommended Num Worker Nodes"
  val CORES_PER_EXEC = "Cores Per Executor"
  val RECOMMENDED_CORES_PER_EXEC = "Recommended Cores Per Executor"
  val WORKER_NODE_TYPE = "Worker Node Type"
  val RECOMMENDED_WORKER_NODE_TYPE = "Recommended Worker Node Type"
  val DRIVER_NODE_TYPE = "Driver Node Type"
  val TOTAL_CORE_SEC = "Total Core Seconds"
  // Default frequency for jobs with a single instance is 30 times every month (30 days)
  val DEFAULT_JOB_FREQUENCY = 30L
  val APP_DUR_STR_SIZE: Int = APP_DUR_STR.size
  val SQL_DUR_STR_SIZE: Int = SQL_DUR_STR.size
  val LONGEST_SQL_DURATION_STR_SIZE: Int = LONGEST_SQL_DURATION_STR.size
  val GPU_OPPORTUNITY_STR_SIZE: Int = GPU_OPPORTUNITY_STR.size
  val UNSUPPORTED_EXECS_MAX_SIZE: Int = 25
  val UNSUPPORTED_EXPRS_MAX_SIZE: Int = 25
  val ESTIMATED_FREQUENCY_MAX_SIZE: Int = 33
  val CLUSTER_ID_STR_SIZE: Int = CLUSTER_ID.size
  val JOB_ID_STR_SIZE: Int = JOB_ID.size
  val RUN_NAME_STR_SIZE: Int = RUN_NAME.size
  val CSV_DELIMITER = ","
  val TEXT_DELIMITER = "|"

  // a file extension will be added to this later
  val LOGFILE_NAME = "rapids_4_spark_qualification_output"

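  // Returns the formatter applied to each CSV field: either CSV re-escaping via
  // StringUtils.reformatCSVString, or a pass-through that only quotes empty strings.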
  private def getReformatCSVFunc(reformatCSV: Boolean): String => String = {
    if (reformatCSV) str => StringUtils.reformatCSVString(str) else str => stringIfEmpty(str)
  }

  def getAppIdSize(sums: Seq[QualificationSummaryInfo]): Int = {
    val sizes = sums.map(_.appId.size)
    getMaxSizeForHeader(sizes, QualOutputWriter.APP_ID_STR)
  }

  def getAppNameSize(sums: Seq[QualificationSummaryInfo]): Int = {
    getMaxSizeForHeader(sums.map(_.appName.size), APP_NAME_STR)
  }

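  // Picks a column width for the unsupported execs/exprs column. For example, with
  // maxStringSize = 25 and headerSize = 17: sizes Seq(10, 40) -> 25 (capped),
  // Seq(10, 20) -> 20 (longest value), and Seq() -> 17 (header width).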
  def getUnsupportedMaxSize(unSupExecs: Seq[Int], maxStringSize: Int, headerSize: Int): Int = {
    val unSupportedExecsSize = unSupExecs.size
    val unSupportedExecsMax = if (unSupExecs.nonEmpty) {
      unSupExecs.max
    } else {
      0
    }
    // return maxStringSize if the longest unsupported string exceeds it
    if (unSupportedExecsSize > 0 && unSupportedExecsMax > maxStringSize) {
      maxStringSize
    } else if (unSupportedExecsSize > 0 && unSupportedExecsMax < maxStringSize
      && unSupportedExecsMax >= headerSize) {
      unSupportedExecsMax
    } else {
      headerSize
    }
  }

  def getSqlDescSize(sums: Seq[QualificationSummaryInfo], maxSQLDescLength: Int,
      delimiter: String): Int = {
    val sizes = sums.flatMap(_.perSQLEstimatedInfo).flatten.map { info =>
      formatSQLDescription(info.sqlDesc, maxSQLDescLength, delimiter).size
    }
    val maxSizeOfDesc = getMaxSizeForHeader(sizes, QualOutputWriter.SQL_DESC_STR)
    Math.min(maxSQLDescLength, maxSizeOfDesc)
  }

  def getMaxSizeForHeader(sizes: Seq[Int], headerTxtStr: String): Int = {
    if (sizes.nonEmpty && sizes.max > headerTxtStr.size) {
      sizes.max
    } else {
      headerTxtStr.size
    }
  }

  def getMaxSizeForHeader(sizes: Seq[Long], headerTxtStr: String): Long = {
    if (sizes.nonEmpty && sizes.max > headerTxtStr.size) {
      sizes.max
    } else {
      headerTxtStr.size
    }
  }

  // the ordered map pairs each header string with the column width to use
  def constructOutputRowFromMap(
      strAndSizes: LinkedHashMap[String, Int],
      delimiter: String = TEXT_DELIMITER,
      prettyPrint: Boolean = false): String = {
    constructOutputRow(strAndSizes.toBuffer, delimiter, prettyPrint)
  }

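  // For illustration with hypothetical values: constructOutputRow(
  //   Buffer("App Name" -> 10, "App ID" -> 8), "|", prettyPrint = true)
  // yields "|  App Name|  App ID|\n" (fixed-width cells), while the same entries with
  // delimiter "," and prettyPrint = false yield the plain CSV header "App Name,App ID\n".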
  private def constructOutputRow(
      strAndSizes: Buffer[(String, Int)],
      delimiter: String,
      prettyPrint: Boolean): String = {
    val entireHeader = new StringBuilder
    if (prettyPrint) {
      entireHeader.append(delimiter)
    }
    val lastEntry = strAndSizes.last
    strAndSizes.dropRight(1).foreach { case (str, strSize) =>
      if (prettyPrint) {
        val updatedString = stringLengthExceedsMax(str, strSize, delimiter)
        entireHeader.append(updatedString)
      } else {
        entireHeader.append(s"${str}${delimiter}")
      }
    }
    // for the last element we don't want to print the delimiter at the end unless
    // pretty printing
    if (prettyPrint) {
      val updatedString = stringLengthExceedsMax(lastEntry._1, lastEntry._2, delimiter)
      entireHeader.append(updatedString)
    } else {
      entireHeader.append(s"${lastEntry._1}")
    }
    entireHeader.append("\n")
    entireHeader.toString
  }

  private def stringIfEmpty(str: String): String = {
    if (str.isEmpty) "\"\"" else str
  }

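  // Right-justifies str into a strSize-wide cell followed by the delimiter; strings
  // longer than strSize are truncated with a "..." suffix. E.g. with strSize = 8 and
  // delimiter "|", "SQLMetadata" (11 chars) becomes "SQLMe...|".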
  private def stringLengthExceedsMax(str: String, strSize: Int, delimiter: String): String = {
    val prettyPrintValue = if (str.size > strSize) {
      val newStrSize = strSize - 3 // suffixing ... at the end
      s"%${newStrSize}.${newStrSize}s...${delimiter}".format(str)
    } else {
      s"%${strSize}.${strSize}s${delimiter}".format(str)
    }
    prettyPrintValue
  }

  private def getUnsupportedOperatorsHeaderStrings: LinkedHashMap[String, Int] = {
    val detailedHeaderAndFields = LinkedHashMap[String, Int](
      APP_ID_STR -> APP_ID_STR.size,
      SQL_ID_STR -> SQL_ID_STR.size,
      STAGE_ID_STR -> STAGE_ID_STR.size,
      EXEC_ID -> EXEC_ID.size,
      UNSUPPORTED_TYPE -> UNSUPPORTED_TYPE.size,
      UNSUPPORTED_OPERATOR -> UNSUPPORTED_OPERATOR.size,
      DETAILS -> DETAILS.size,
      STAGE_WALLCLOCK_DUR_STR -> STAGE_WALLCLOCK_DUR_STR.size,
      APP_DUR_STR -> APP_DUR_STR.size,
      EXEC_ACTION -> EXEC_ACTION.size
    )
    detailedHeaderAndFields
  }

  def getDetailedHeaderStringsAndSizes(appInfos: Seq[QualificationSummaryInfo],
      reportReadSchema: Boolean): LinkedHashMap[String, Int] = {
    val detailedHeadersAndFields = LinkedHashMap[String, Int](
      APP_NAME_STR -> getMaxSizeForHeader(appInfos.map(_.appName.size), APP_NAME_STR),
      APP_ID_STR -> QualOutputWriter.getAppIdSize(appInfos),
      ESTIMATED_GPU_DURATION -> ESTIMATED_GPU_DURATION.size,
      ESTIMATED_GPU_TIMESAVED -> ESTIMATED_GPU_TIMESAVED.size,
      SQL_DUR_STR -> SQL_DUR_STR_SIZE,
      TASK_DUR_STR -> TASK_DUR_STR.size,
      APP_DUR_STR -> APP_DUR_STR_SIZE,
      GPU_OPPORTUNITY_STR -> GPU_OPPORTUNITY_STR_SIZE,
      EXEC_CPU_PERCENT_STR -> EXEC_CPU_PERCENT_STR.size,
      SQL_IDS_FAILURES_STR -> getMaxSizeForHeader(appInfos.map(_.failedSQLIds.size),
        SQL_IDS_FAILURES_STR),
      READ_FILE_FORMAT_TYPES_STR ->
        getMaxSizeForHeader(appInfos.map(_.readFileFormats.map(_.length).sum),
          READ_FILE_FORMAT_TYPES_STR),
      WRITE_DATA_FORMAT_STR ->
        getMaxSizeForHeader(appInfos.map(_.writeDataFormat.map(_.length).sum),
          WRITE_DATA_FORMAT_STR),
      COMPLEX_TYPES_STR ->
        getMaxSizeForHeader(appInfos.map(_.complexTypes.size), COMPLEX_TYPES_STR),
      NESTED_TYPES_STR -> getMaxSizeForHeader(appInfos.map(_.nestedComplexTypes.size),
        NESTED_TYPES_STR),
      POT_PROBLEM_STR ->
        getMaxSizeForHeader(appInfos.map(_.potentialProblems.size), POT_PROBLEM_STR),
      LONGEST_SQL_DURATION_STR -> LONGEST_SQL_DURATION_STR_SIZE,
      SQL_STAGE_DUR_SUM_STR -> SQL_STAGE_DUR_SUM_STR.size,
      NONSQL_DUR_STR -> NONSQL_DUR_STR.size,
      UNSUPPORTED_TASK_DURATION_STR -> UNSUPPORTED_TASK_DURATION_STR.size,
      SUPPORTED_SQL_TASK_DURATION_STR -> SUPPORTED_SQL_TASK_DURATION_STR.size,
      APP_DUR_ESTIMATED_STR -> APP_DUR_ESTIMATED_STR.size,
      UNSUPPORTED_EXECS -> UNSUPPORTED_EXECS.size,
      UNSUPPORTED_EXPRS -> UNSUPPORTED_EXPRS.size,
      ESTIMATED_FREQUENCY -> ESTIMATED_FREQUENCY.size,
      TOTAL_CORE_SEC -> TOTAL_CORE_SEC.size
    )
    if (appInfos.exists(_.clusterTags.nonEmpty)) {
      detailedHeadersAndFields += (CLUSTER_TAGS -> getMaxSizeForHeader(
        appInfos.map(_.clusterTags.length), CLUSTER_TAGS))
    }
    if (reportReadSchema) {
      detailedHeadersAndFields +=
        (READ_SCHEMA_STR ->
          getMaxSizeForHeader(appInfos.map(_.readFileFormats.map(_.length).sum), READ_SCHEMA_STR))
    }
    detailedHeadersAndFields
  }

  private[qualification] def getSummaryHeaderStringsAndSizes(
      appNameMaxSize: Int,
      appIdMaxSize: Int,
      unSupExecMaxSize: Int = UNSUPPORTED_EXECS_MAX_SIZE,
      unSupExprMaxSize: Int = UNSUPPORTED_EXPRS_MAX_SIZE,
      estimatedFrequencyMaxSize: Int = ESTIMATED_FREQUENCY_MAX_SIZE,
      hasClusterTags: Boolean = false,
      clusterIdMaxSize: Int = CLUSTER_ID_STR_SIZE,
      jobIdMaxSize: Int = JOB_ID_STR_SIZE,
      runNameMaxSize: Int = RUN_NAME_STR_SIZE): LinkedHashMap[String, Int] = {
    val data = LinkedHashMap[String, Int](
      APP_NAME_STR -> appNameMaxSize,
      APP_ID_STR -> appIdMaxSize,
      APP_DUR_STR -> APP_DUR_STR_SIZE,
      SQL_DUR_STR -> SQL_DUR_STR_SIZE,
      GPU_OPPORTUNITY_STR -> GPU_OPPORTUNITY_STR_SIZE,
      ESTIMATED_GPU_DURATION -> ESTIMATED_GPU_DURATION.size,
      ESTIMATED_GPU_TIMESAVED -> ESTIMATED_GPU_TIMESAVED.size,
      UNSUPPORTED_EXECS -> unSupExecMaxSize,
      UNSUPPORTED_EXPRS -> unSupExprMaxSize,
      ESTIMATED_FREQUENCY -> estimatedFrequencyMaxSize
    )
    if (hasClusterTags) {
      data += (CLUSTER_ID -> clusterIdMaxSize)
      data += (JOB_ID -> jobIdMaxSize)
      data += (RUN_NAME -> runNameMaxSize)
    }
    data
  }

  def constructAppSummaryInfo(
      sumInfo: EstimatedSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      appIdMaxSize: Int,
      unSupExecMaxSize: Int,
      unSupExprMaxSize: Int,
      estimatedFrequencyMaxSize: Int,
      hasClusterTags: Boolean,
      clusterIdMaxSize: Int,
      jobIdMaxSize: Int,
      runNameMaxSize: Int,
      delimiter: String,
      prettyPrint: Boolean): String = {
    val data = ListBuffer[(String, Int)](
      sumInfo.estimatedInfo.appName -> headersAndSizes(APP_NAME_STR),
      sumInfo.estimatedInfo.appId -> appIdMaxSize,
      sumInfo.estimatedInfo.appDur.toString -> APP_DUR_STR_SIZE,
      sumInfo.estimatedInfo.sqlDfDuration.toString -> SQL_DUR_STR_SIZE,
      sumInfo.estimatedInfo.gpuOpportunity.toString -> GPU_OPPORTUNITY_STR_SIZE,
      ToolUtils.formatDoublePrecision(sumInfo.estimatedInfo.estimatedGpuDur) ->
        ESTIMATED_GPU_DURATION.size,
      ToolUtils.formatDoublePrecision(sumInfo.estimatedInfo.estimatedGpuTimeSaved) ->
        ESTIMATED_GPU_TIMESAVED.size,
      sumInfo.estimatedInfo.unsupportedExecs -> unSupExecMaxSize,
      sumInfo.estimatedInfo.unsupportedExprs -> unSupExprMaxSize,
      sumInfo.estimatedFrequency.toString -> estimatedFrequencyMaxSize
    )
    if (hasClusterTags) {
      data += (sumInfo.estimatedInfo.allTagsMap.getOrElse(CLUSTER_ID, "") -> clusterIdMaxSize)
      data += (sumInfo.estimatedInfo.allTagsMap.getOrElse(JOB_ID, "") -> jobIdMaxSize)
      data += (sumInfo.estimatedInfo.allTagsMap.getOrElse(RUN_NAME, "") -> runNameMaxSize)
    }
    constructOutputRow(data, delimiter, prettyPrint)
  }

  def constructDetailedHeader(
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String,
      prettyPrint: Boolean): String = {
    QualOutputWriter.constructOutputRowFromMap(headersAndSizes, delimiter, prettyPrint)
  }

  def getDetailedPerSqlHeaderStringsAndSizes(
      appMaxNameSize: Int,
      appMaxIdSize: Int,
      sqlDescLength: Int): LinkedHashMap[String, Int] = {
    val detailedHeadersAndFields = LinkedHashMap[String, Int](
      APP_NAME_STR -> appMaxNameSize,
      APP_ID_STR -> appMaxIdSize,
      ROOT_SQL_ID_STR -> ROOT_SQL_ID_STR.size,
      SQL_ID_STR -> SQL_ID_STR.size,
      SQL_DESC_STR -> sqlDescLength,
      SQL_DUR_STR -> SQL_DUR_STR_SIZE,
      GPU_OPPORTUNITY_STR -> GPU_OPPORTUNITY_STR_SIZE,
      ESTIMATED_GPU_DURATION -> ESTIMATED_GPU_DURATION.size,
      ESTIMATED_GPU_TIMESAVED -> ESTIMATED_GPU_TIMESAVED.size
    )
    detailedHeadersAndFields
  }

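  // Escapes meta characters (e.g. newlines) in the SQL description, truncates it to
  // maxSQLDescLength, and replaces any occurrence of the output delimiter so the
  // description cannot break the row format.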
  private def formatSQLDescription(sqlDesc: String, maxSQLDescLength: Int,
      delimiter: String): String = {
    val escapedMetaStr =
      StringUtils.renderStr(sqlDesc, doEscapeMetaCharacters = true, maxLength = maxSQLDescLength)
    // this is a one-for-one replacement, so the length is not affected
    replaceDelimiter(escapedMetaStr, delimiter)
  }

  def constructPerSqlSummaryInfo(
      sumInfo: EstimatedPerSQLSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      appIdMaxSize: Int,
      delimiter: String,
      prettyPrint: Boolean,
      maxSQLDescLength: Int,
      reformatCSV: Boolean = true): String = {
    val reformatCSVFunc : String => String =
      if (reformatCSV) str => StringUtils.reformatCSVString(str) else str => str
    val data = ListBuffer[(String, Int)](
      reformatCSVFunc(sumInfo.info.appName) -> headersAndSizes(APP_NAME_STR),
      reformatCSVFunc(sumInfo.info.appId) -> appIdMaxSize,
      reformatCSVFunc(sumInfo.rootExecutionID.getOrElse("").toString) -> ROOT_SQL_ID_STR.size,
      sumInfo.sqlID.toString -> SQL_ID_STR.size,
      reformatCSVFunc(formatSQLDescription(sumInfo.sqlDesc, maxSQLDescLength, delimiter)) ->
        headersAndSizes(SQL_DESC_STR),
      sumInfo.info.sqlDfDuration.toString -> SQL_DUR_STR_SIZE,
      sumInfo.info.gpuOpportunity.toString -> GPU_OPPORTUNITY_STR_SIZE,
      ToolUtils.formatDoublePrecision(sumInfo.info.estimatedGpuDur) -> ESTIMATED_GPU_DURATION.size,
      ToolUtils.formatDoublePrecision(sumInfo.info.estimatedGpuTimeSaved) ->
        ESTIMATED_GPU_TIMESAVED.size
    )
    constructOutputRow(data, delimiter, prettyPrint)
  }

  private def getDetailedExecsHeaderStrings: LinkedHashMap[String, Int] = {
    val detailedHeadersAndFields = LinkedHashMap[String, Int](
      APP_ID_STR -> APP_ID_STR.size,
      SQL_ID_STR -> SQL_ID_STR.size,
      EXEC_STR -> EXEC_STR.size,
      EXPR_STR -> EXPR_STR.size,
      EXEC_DURATION -> EXEC_DURATION.size,
      EXEC_NODEID -> EXEC_NODEID.size,
      EXEC_IS_SUPPORTED -> EXEC_IS_SUPPORTED.size,
      EXEC_STAGES -> EXEC_STAGES.size,
      EXEC_CHILDREN -> EXEC_CHILDREN.size,
      EXEC_CHILDREN_NODE_IDS -> EXEC_CHILDREN_NODE_IDS.size,
      EXEC_SHOULD_REMOVE -> EXEC_SHOULD_REMOVE.size,
      EXEC_SHOULD_IGNORE -> EXEC_SHOULD_IGNORE.size,
      EXEC_ACTION -> EXEC_ACTION.size)
    detailedHeadersAndFields
  }

  private def getClusterInfoHeaderStrings: mutable.LinkedHashMap[String, Int] = {
    val headersAndFields = Seq(
      APP_ID_STR, APP_NAME_STR, VENDOR, DRIVER_HOST, CLUSTER_ID_STR, CLUSTER_NAME,
      WORKER_NODE_TYPE, DRIVER_NODE_TYPE, NUM_WORKER_NODES, NUM_EXECS_PER_NODE, CORES_PER_EXEC,
      RECOMMENDED_WORKER_NODE_TYPE, RECOMMENDED_NUM_EXECS, RECOMMENDED_NUM_WORKER_NODES,
      RECOMMENDED_CORES_PER_EXEC, RECOMMENDED_GPU_DEVICE, RECOMMENDED_NUM_GPUS).map {
      key => (key, key.length)
    }
    mutable.LinkedHashMap(headersAndFields: _*)
  }

  def constructClusterInfo(
      sumInfo: QualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String = TEXT_DELIMITER,
      prettyPrint: Boolean,
      reformatCSV: Boolean = true): Seq[String] = {
    val reformatCSVFunc = getReformatCSVFunc(reformatCSV)
    val clusterInfo = sumInfo.clusterSummary.clusterInfo
    val recClusterInfo = sumInfo.clusterSummary.recommendedClusterInfo

    // Wrapper function around reformatCSVFunc() to handle optional fields and
    // reduce redundancy
    def reformatCSVFuncWithOption(field: Option[String], headerConst: String): (String, Int) =
      reformatCSVFunc(field.getOrElse("")) -> headersAndSizes(headerConst)

    val data = ListBuffer(
      reformatCSVFuncWithOption(Some(sumInfo.clusterSummary.appId), APP_ID_STR),
      reformatCSVFuncWithOption(Some(sumInfo.clusterSummary.appName), APP_NAME_STR),
      reformatCSVFuncWithOption(clusterInfo.map(_.vendor), VENDOR),
      reformatCSVFuncWithOption(clusterInfo.flatMap(_.driverHost), DRIVER_HOST),
      reformatCSVFuncWithOption(clusterInfo.flatMap(_.clusterId), CLUSTER_ID_STR),
      reformatCSVFuncWithOption(clusterInfo.flatMap(_.clusterName), CLUSTER_NAME),
      reformatCSVFuncWithOption(clusterInfo.flatMap(_.workerNodeType), WORKER_NODE_TYPE),
      reformatCSVFuncWithOption(clusterInfo.flatMap(_.driverNodeType), DRIVER_NODE_TYPE),
      reformatCSVFuncWithOption(clusterInfo.map(_.numWorkerNodes.toString), NUM_WORKER_NODES),
      reformatCSVFuncWithOption(clusterInfo.map(_.numExecsPerNode.toString), NUM_EXECS_PER_NODE),
      reformatCSVFuncWithOption(clusterInfo.map(_.coresPerExecutor.toString), CORES_PER_EXEC),
      reformatCSVFuncWithOption(recClusterInfo.flatMap(_.workerNodeType),
        RECOMMENDED_WORKER_NODE_TYPE),
      reformatCSVFuncWithOption(recClusterInfo.map(_.numExecutors.toString),
        RECOMMENDED_NUM_EXECS),
      reformatCSVFuncWithOption(recClusterInfo.map(_.numWorkerNodes.toString),
        RECOMMENDED_NUM_WORKER_NODES),
      reformatCSVFuncWithOption(recClusterInfo.map(_.coresPerExecutor.toString),
        RECOMMENDED_CORES_PER_EXEC),
      reformatCSVFuncWithOption(recClusterInfo.map(_.gpuDevice), RECOMMENDED_GPU_DEVICE),
      reformatCSVFuncWithOption(recClusterInfo.map(_.numGpus.toString), RECOMMENDED_NUM_GPUS)
    )
    constructOutputRow(data, delimiter, prettyPrint) :: Nil
  }

  def constructMlFuncsInfo(
      sumInfo: QualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String = TEXT_DELIMITER,
      prettyPrint: Boolean,
      reformatCSV: Boolean = true): Seq[String] = {
    val reformatCSVFunc = getReformatCSVFunc(reformatCSV)
    val appId = sumInfo.appId
    sumInfo.mlFunctions.get.map { info =>
      val data = ListBuffer[(String, Int)](
        reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR),
        info.stageId.toString -> headersAndSizes(STAGE_ID_STR),
        reformatCSVFunc(ToolUtils.renderTextField(info.mlOps, ";", delimiter)) ->
          headersAndSizes(ML_FUNCTIONS),
        info.duration.toString -> headersAndSizes(STAGE_DUR_STR))
      constructOutputRow(data, delimiter, prettyPrint)
    }
  }

  def constructMlFuncsTotalDurationInfo(
      sumInfo: QualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String = TEXT_DELIMITER,
      prettyPrint: Boolean,
      reformatCSV: Boolean = true): Seq[String] = {
    val reformatCSVFunc = getReformatCSVFunc(reformatCSV)
    val appId = sumInfo.appId
    sumInfo.mlFunctionsStageDurations.get.map { info =>
      val data = ListBuffer[(String, Int)](
        reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR),
        reformatCSVFunc(ToolUtils.renderTextField(info.stageIds, ";", delimiter)) ->
          headersAndSizes(ML_STAGE_IDS),
        reformatCSVFunc(info.mlFuncName) -> headersAndSizes(ML_FUNCTION_NAME),
        info.duration.toString -> headersAndSizes(ML_TOTAL_STAGE_DURATION))
      constructOutputRow(data, delimiter, prettyPrint)
    }
  }

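  // For illustration: a WholeStageCodegen exec wrapping (HashAggregate, Project) flattens
  // to just its two children, while any other exec contributes its children plus itself.
  // The wrapper is dropped since it is not an actual exec to collect timings for.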
  def flattenedExecs(execs: Seq[ExecInfo]): Seq[ExecInfo] = {
    // need to remove the WholeStageCodegen wrappers since they aren't actual
    // execs that we want to get timings of
    execs.flatMap { e =>
      if (e.exec.contains("WholeStageCodegen")) {
        e.children.getOrElse(Seq.empty)
      } else {
        e.children.getOrElse(Seq.empty) :+ e
      }
    }
  }

  def getDetailedMlFuncsHeaderStringsAndSizes(
      appInfos: Seq[QualificationSummaryInfo]): LinkedHashMap[String, Int] = {
    val detailedHeadersAndFields = LinkedHashMap[String, Int](
      APP_ID_STR -> QualOutputWriter.getAppIdSize(appInfos),
      STAGE_ID_STR -> STAGE_ID_STR.size,
      ML_FUNCTIONS -> getMaxSizeForHeader(
        appInfos.map(_.mlFunctions.get.map(
          mlFuns => mlFuns.mlOps.map(funcName => funcName.length).sum).sum), ML_FUNCTIONS),
      STAGE_DUR_STR -> STAGE_DUR_STR.size)
    detailedHeadersAndFields
  }

  def getDetailedMlFuncsTotalDurationHeaderStringsAndSizes(
      appInfos: Seq[QualificationSummaryInfo]): LinkedHashMap[String, Int] = {
    val detailedHeadersAndFields = LinkedHashMap[String, Int](
      APP_ID_STR -> QualOutputWriter.getAppIdSize(appInfos),
      ML_STAGE_IDS -> getMaxSizeForHeader(
        appInfos.map(_.mlFunctionsStageDurations.get.map(
          mlFuncs => mlFuncs.stageIds).size), ML_STAGE_IDS),
      ML_FUNCTION_NAME -> getMaxSizeForHeader(
        appInfos.map(_.mlFunctionsStageDurations.get.map(
          mlFuncs => mlFuncs.mlFuncName.length).sum), ML_FUNCTION_NAME),
      ML_TOTAL_STAGE_DURATION -> ML_TOTAL_STAGE_DURATION.size)
    detailedHeadersAndFields
  }

  private def constructExecInfoBuffer(
      info: ExecInfo,
      appId: String,
      delimiter: String,
      prettyPrint: Boolean,
      headersAndSizes: LinkedHashMap[String, Int],
      reformatCSV: Boolean = true): String = {
    val reformatCSVFunc = getReformatCSVFunc(reformatCSV)
    val data = ListBuffer[(String, Int)](
      reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR),
      info.sqlID.toString -> headersAndSizes(SQL_ID_STR),
      reformatCSVFunc(info.exec) -> headersAndSizes(EXEC_STR),
      reformatCSVFunc(info.expr) -> headersAndSizes(EXPR_STR),
      info.duration.getOrElse(0).toString -> headersAndSizes(EXEC_DURATION),
      info.nodeId.toString -> headersAndSizes(EXEC_NODEID),
      info.isSupported.toString -> headersAndSizes(EXEC_IS_SUPPORTED),
      reformatCSVFunc(info.stages.mkString(":")) -> headersAndSizes(EXEC_STAGES),
      reformatCSVFunc(info.children.getOrElse(Seq.empty).map(_.exec).mkString(":")) ->
        headersAndSizes(EXEC_CHILDREN),
      reformatCSVFunc(info.children.getOrElse(Seq.empty).map(_.nodeId).mkString(":")) ->
        headersAndSizes(EXEC_CHILDREN_NODE_IDS),
      info.shouldRemove.toString -> headersAndSizes(EXEC_SHOULD_REMOVE),
      info.shouldIgnore.toString -> headersAndSizes(EXEC_SHOULD_IGNORE),
      reformatCSVFunc(info.getOpAction.toString) -> headersAndSizes(EXEC_ACTION)
    )
    constructOutputRow(data, delimiter, prettyPrint)
  }

  private def getDetailedStagesHeaderStrings: LinkedHashMap[String, Int] = {
    val detailedHeadersAndFields = LinkedHashMap[String, Int](
      APP_ID_STR -> APP_ID_STR.size,
      STAGE_ID_STR -> STAGE_ID_STR.size,
      STAGE_DUR_STR -> STAGE_DUR_STR.size,
      UNSUPPORTED_TASK_DURATION_STR -> UNSUPPORTED_TASK_DURATION_STR.size,
      STAGE_ESTIMATED_STR -> STAGE_ESTIMATED_STR.size,
      NUM_TRANSITIONS -> NUM_TRANSITIONS.size
    )
    detailedHeadersAndFields
  }

  def constructStagesInfo(
      sumInfo: QualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String = TEXT_DELIMITER,
      prettyPrint: Boolean,
      reformatCSV: Boolean = true): Seq[String] = {
    val reformatCSVFunc: String => String =
      if (reformatCSV) str => StringUtils.reformatCSVString(str) else str => stringIfEmpty(str)
    sumInfo.stageInfo.map { info =>
      val data = ListBuffer[(String, Int)](
        reformatCSVFunc(sumInfo.appId) -> headersAndSizes(APP_ID_STR),
        info.stageId.toString -> headersAndSizes(STAGE_ID_STR),
        info.stageTaskTime.toString -> headersAndSizes(STAGE_DUR_STR),
        info.unsupportedTaskDur.toString -> headersAndSizes(UNSUPPORTED_TASK_DURATION_STR),
        info.estimated.toString -> headersAndSizes(STAGE_ESTIMATED_STR),
        info.numTransitions.toString -> headersAndSizes(NUM_TRANSITIONS))
      constructOutputRow(data, delimiter, prettyPrint)
    }
  }

  private def getUnsupportedExecsPerStage(
      sumInfo: QualificationSummaryInfo,
      stageID: Int): Set[ExecInfo] = {
    sumInfo.planInfo.map(_.execInfo).flatMap { execInfos =>
      val allExecs = flattenedExecs(execInfos)
      allExecs.filter(exec => !exec.isSupported && exec.stages.contains(stageID))
    }.toSet
  }

  private def getUnsupportedExecsWithNoStage(
      sumInfo: QualificationSummaryInfo): Set[ExecInfo] = {
    sumInfo.planInfo.map(_.execInfo).flatMap { execInfos =>
      val allExecs = flattenedExecs(execInfos)
      allExecs.filter(exec => exec.stages.isEmpty && !exec.isSupported)
    }.toSet
  }

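  // Writes one CSV row per unsupported operator per stage, including the stage wallclock
  // duration and the app duration. Unsupported execs that are not attached to any stage
  // are emitted with a placeholder stage ID of -1 and a duration of 0.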
  private def constructUnsupportedDetailedStagesDurationInfo(
      csvWriter: ToolTextFileWriter,
      sumInfo: QualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String,
      prettyPrint: Boolean,
      reformatCSV: Boolean = true): Unit = {
    val reformatCSVFunc = getReformatCSVFunc(reformatCSV)
    val appId = sumInfo.appId
    val appDuration = sumInfo.estimatedInfo.appDur
    val dummyStageID = -1
    val dummyStageDur = 0
    val execIdGenerator = new AtomicLong(0)

    def constructDetailedUnsupportedRow(unSupExecInfo: UnsupportedExecSummary,
        stageId: Int, stageAppDuration: Long): String = {
      val data = ListBuffer[(String, Int)](
        reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR),
        unSupExecInfo.sqlId.toString -> headersAndSizes(SQL_ID_STR),
        stageId.toString -> headersAndSizes(STAGE_ID_STR),
        reformatCSVFunc(unSupExecInfo.execId.toString) -> headersAndSizes(EXEC_ID),
        reformatCSVFunc(unSupExecInfo.finalOpType) -> headersAndSizes(UNSUPPORTED_TYPE),
        reformatCSVFunc(unSupExecInfo.unsupportedOperator) -> headersAndSizes(UNSUPPORTED_OPERATOR),
        reformatCSVFunc(unSupExecInfo.details) -> headersAndSizes(DETAILS),
        stageAppDuration.toString -> headersAndSizes(STAGE_WALLCLOCK_DUR_STR),
        appDuration.toString -> headersAndSizes(APP_DUR_STR),
        reformatCSVFunc(unSupExecInfo.opAction.toString) -> headersAndSizes(EXEC_ACTION)
      )
      constructOutputRow(data, delimiter, prettyPrint)
    }

    def getUnsupportedRows(execI: ExecInfo, stageId: Int, stageDur: Long): String = {
      val results = execI.getUnsupportedExecSummaryRecord(execIdGenerator.getAndIncrement())
      results.map { unsupportedExecSummary =>
        constructDetailedUnsupportedRow(unsupportedExecSummary, stageId, stageDur)
      }.mkString
    }

    csvWriter.write(sumInfo.origPlanStageInfo.flatMap { sInfo =>
      getUnsupportedExecsPerStage(sumInfo, sInfo.stageId).map { execInfo =>
        getUnsupportedRows(execInfo, sInfo.stageId, sInfo.stageWallclockDuration)
      }
    }.mkString)

    // write out the execs that are not attached to any stage
    csvWriter.write(getUnsupportedExecsWithNoStage(sumInfo).map { eInfo =>
      getUnsupportedRows(eInfo, dummyStageID, dummyStageDur)
    }.mkString)
  }

  def getAllExecsFromPlan(plans: Seq[PlanInfo]): Set[ExecInfo] = {
    val topExecInfo = plans.flatMap(_.execInfo)
    topExecInfo.flatMap { e =>
      e.children.getOrElse(Seq.empty) :+ e
    }.toSet
  }

  private def constructExecsInfo(
      sumInfo: QualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String,
      prettyPrint: Boolean): Set[String] = {
    // No need to visit the execInfo children because the result returned from
    // "getAllExecsFromPlan" is already flattened
    getAllExecsFromPlan(sumInfo.planInfo).map { info =>
      constructExecInfoBuffer(info, sumInfo.appId, delimiter, prettyPrint, headersAndSizes)
    }
  }

  def createFormattedQualSummaryInfo(
      appInfo: QualificationSummaryInfo,
      delimiter: String = TEXT_DELIMITER): FormattedQualificationSummaryInfo = {
    FormattedQualificationSummaryInfo(
      appInfo.appName,
      appInfo.appId,
      ToolUtils.truncateDoubleToTwoDecimal(appInfo.estimatedInfo.estimatedGpuDur),
      ToolUtils.truncateDoubleToTwoDecimal(appInfo.estimatedInfo.estimatedGpuTimeSaved),
      appInfo.estimatedInfo.sqlDfDuration,
      appInfo.sqlDataframeTaskDuration,
      appInfo.estimatedInfo.appDur,
      appInfo.estimatedInfo.gpuOpportunity,
      ToolUtils.truncateDoubleToTwoDecimal(appInfo.executorCpuTimePercent),
      ToolUtils.renderTextField(appInfo.failedSQLIds, ",", delimiter),
      ToolUtils.renderTextField(appInfo.readFileFormatAndTypesNotSupported, ";", delimiter),
      ToolUtils.renderTextField(appInfo.readFileFormats, ":", delimiter),
      ToolUtils.renderTextField(appInfo.writeDataFormat, ";", delimiter).toUpperCase,
      ToolUtils.formatComplexTypes(appInfo.complexTypes, delimiter),
      ToolUtils.formatComplexTypes(appInfo.nestedComplexTypes, delimiter),
      ToolUtils.formatPotentialProblems(appInfo.potentialProblems, delimiter),
      appInfo.longestSqlDuration,
      appInfo.stageInfo.map(_.stageWallclockDuration).sum,
      appInfo.nonSqlTaskDurationAndOverhead,
      appInfo.unsupportedSQLTaskDuration,
      appInfo.supportedSQLTaskDuration,
      appInfo.endDurationEstimated,
      appInfo.unSupportedExecs,
      appInfo.unSupportedExprs,
      appInfo.allClusterTagsMap,
      appInfo.estimatedFrequency.getOrElse(DEFAULT_JOB_FREQUENCY),
      appInfo.totalCoreSec
    )
  }

  private def constructDetailedAppInfoCSVRow(
      appInfo: FormattedQualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      reportReadSchema: Boolean,
      reformatCSV: Boolean = true): ListBuffer[(String, Int)] = {
    val reformatCSVFunc = getReformatCSVFunc(reformatCSV)
    val data = ListBuffer[(String, Int)](
      reformatCSVFunc(appInfo.appName) -> headersAndSizes(APP_NAME_STR),
      reformatCSVFunc(appInfo.appId) -> headersAndSizes(APP_ID_STR),
      appInfo.estimatedGpuDur.toString -> ESTIMATED_GPU_DURATION.size,
      appInfo.estimatedGpuTimeSaved.toString -> ESTIMATED_GPU_TIMESAVED.size,
      appInfo.sqlDataframeDuration.toString -> headersAndSizes(SQL_DUR_STR),
      appInfo.sqlDataframeTaskDuration.toString -> headersAndSizes(TASK_DUR_STR),
      appInfo.appDuration.toString -> headersAndSizes(APP_DUR_STR),
      appInfo.gpuOpportunity.toString -> GPU_OPPORTUNITY_STR_SIZE,
      appInfo.executorCpuTimePercent.toString -> headersAndSizes(EXEC_CPU_PERCENT_STR),
      reformatCSVFunc(appInfo.failedSQLIds) -> headersAndSizes(SQL_IDS_FAILURES_STR),
      reformatCSVFunc(appInfo.readFileFormatAndTypesNotSupported) ->
        headersAndSizes(READ_FILE_FORMAT_TYPES_STR),
      reformatCSVFunc(appInfo.writeDataFormat) -> headersAndSizes(WRITE_DATA_FORMAT_STR),
      reformatCSVFunc(appInfo.complexTypes) -> headersAndSizes(COMPLEX_TYPES_STR),
      reformatCSVFunc(appInfo.nestedComplexTypes) -> headersAndSizes(NESTED_TYPES_STR),
      reformatCSVFunc(appInfo.potentialProblems) -> headersAndSizes(POT_PROBLEM_STR),
      appInfo.longestSqlDuration.toString -> headersAndSizes(LONGEST_SQL_DURATION_STR),
      appInfo.sqlStageDurationsSum.toString ->
        headersAndSizes(SQL_STAGE_DUR_SUM_STR),
      appInfo.nonSqlTaskDurationAndOverhead.toString -> headersAndSizes(NONSQL_DUR_STR),
      appInfo.unsupportedSQLTaskDuration.toString -> headersAndSizes(UNSUPPORTED_TASK_DURATION_STR),
      appInfo.supportedSQLTaskDuration.toString -> headersAndSizes(SUPPORTED_SQL_TASK_DURATION_STR),
      appInfo.endDurationEstimated.toString -> headersAndSizes(APP_DUR_ESTIMATED_STR),
      reformatCSVFunc(appInfo.unSupportedExecs) -> headersAndSizes(UNSUPPORTED_EXECS),
      reformatCSVFunc(appInfo.unSupportedExprs) -> headersAndSizes(UNSUPPORTED_EXPRS),
      appInfo.estimatedFrequency.toString -> headersAndSizes(ESTIMATED_FREQUENCY),
      appInfo.totalCoreSec.toString -> headersAndSizes(TOTAL_CORE_SEC)
    )

    if (appInfo.clusterTags.nonEmpty) {
      data += reformatCSVFunc(appInfo.clusterTags.mkString(";")) -> headersAndSizes(CLUSTER_TAGS)
    }
    if (reportReadSchema) {
      data += reformatCSVFunc(appInfo.readFileFormats) -> headersAndSizes(READ_SCHEMA_STR)
    }
    data
  }

  def constructAppDetailedInfo(
      summaryAppInfo: QualificationSummaryInfo,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String,
      prettyPrint: Boolean,
      reportReadSchema: Boolean): String = {
    val formattedAppInfo = createFormattedQualSummaryInfo(summaryAppInfo, delimiter)
    val data = constructDetailedAppInfoCSVRow(formattedAppInfo, headersAndSizes, reportReadSchema)
    constructOutputRow(data, delimiter, prettyPrint)
  }

  private def getDetailedStatusHeaderStringsAndSizes(
      statusInfos: Seq[AppStatusResult]): LinkedHashMap[String, Int] = {
    val descLengthList = statusInfos.map { statusInfo =>
      statusInfo.appId.length + statusInfo.message.length + 1
    }
    val detailedHeadersAndFields = LinkedHashMap[String, Int](
      STATUS_REPORT_PATH_STR ->
        getMaxSizeForHeader(statusInfos.map(_.path.length), STATUS_REPORT_PATH_STR),
      STATUS_REPORT_STATUS_STR ->
        getMaxSizeForHeader(statusInfos.map(_.status.length), STATUS_REPORT_STATUS_STR),
      STATUS_REPORT_APP_ID ->
        getMaxSizeForHeader(statusInfos.map(_.appId.length), STATUS_REPORT_APP_ID),
      STATUS_REPORT_DESC_STR ->
        getMaxSizeForHeader(descLengthList, STATUS_REPORT_DESC_STR)
    )
    detailedHeadersAndFields
  }

  private def constructStatusReportInfo(
      statusInfo: AppStatusResult,
      headersAndSizes: LinkedHashMap[String, Int],
      delimiter: String,
      prettyPrint: Boolean,
      reformatCSV: Boolean = true): Seq[String] = {
    val reformatCSVFunc = getReformatCSVFunc(reformatCSV)
    val data = ListBuffer[(String, Int)](
      reformatCSVFunc(statusInfo.path) -> headersAndSizes(STATUS_REPORT_PATH_STR),
      reformatCSVFunc(statusInfo.status) -> headersAndSizes(STATUS_REPORT_STATUS_STR),
      reformatCSVFunc(statusInfo.appId) -> headersAndSizes(STATUS_REPORT_APP_ID),
      reformatCSVFunc(statusInfo.message) -> headersAndSizes(STATUS_REPORT_DESC_STR))
    Seq(constructOutputRow(data, delimiter, prettyPrint))
  }
}