com.nvidia.spark.rapids.tool.profiling.ProfileOutputWriter.scala
RAPIDS Accelerator for Apache Spark tools (rapids-4-spark-tools_2.12)
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.tool.profiling

import com.nvidia.spark.rapids.SparkRapidsBuildInfoEvent
import com.nvidia.spark.rapids.tool.ToolTextFileWriter

import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization
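
/**
 * Writes the profiling tool's report output. Text tables are appended to a single
 * "<filePrefix>.log" summary file under outputDir; when outputCSV is enabled, each
 * table is additionally written to its own CSV file.
 *
 * @param outputDir     directory in which the output files are created
 * @param filePrefix    prefix used for the text summary file name
 * @param numOutputRows row limit applied when formatting text tables
 * @param outputCSV     whether to also emit one CSV file per table
 */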
class ProfileOutputWriter(outputDir: String, filePrefix: String, numOutputRows: Int,
outputCSV: Boolean = false) {
implicit val formats: DefaultFormats.type = DefaultFormats
private val textFileWriter = new ToolTextFileWriter(outputDir,
s"$filePrefix.log", "Profile summary")
def writeText(strToWrite: String): Unit = {
textFileWriter.write(strToWrite)
}
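  /**
   * Render the given rows as a fixed-width text table under a section header in the
   * profile summary log. When the rows are empty, a "No ... Found!" message is
   * written instead.
   */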
private def writeTextTable(messageHeader: String, outRows: Seq[ProfileResult],
emptyText: Option[String], tableDesc: Option[String]): Unit = {
val headerText = tableDesc match {
case Some(desc) => s"$messageHeader: $desc"
case None => s"$messageHeader:"
}
textFileWriter.write(s"\n$headerText\n")
if (outRows.nonEmpty) {
val outStr = ProfileOutputWriter.makeFormattedString(numOutputRows, 0,
outRows.head.outputHeaders, outRows.map(_.convertToSeq))
textFileWriter.write(outStr)
} else {
val finalEmptyText = emptyText match {
case Some(text) => text
case None => messageHeader
}
textFileWriter.write(s"No $finalEmptyText Found!\n")
}
}
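  /**
   * Write the spark-rapids build info events as pretty-printed JSON to a file named
   * after the header text (spaces replaced with underscores, lowercased).
   */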
def writeSparkRapidsBuildInfo(headerText: String,
sparkRapidsBuildInfo: Seq[SparkRapidsBuildInfoEvent]): Unit = {
val fileName = headerText.replace(" ", "_").toLowerCase
val jsonWriter = new ToolTextFileWriter(outputDir, s"${fileName}.json", s"$headerText JSON:")
try {
jsonWriter.write(Serialization.writePretty(sparkRapidsBuildInfo) + "\n")
} finally {
jsonWriter.close()
}
}
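  /**
   * Write one report table: it is always appended to the text summary log, and is
   * also emitted as a standalone CSV file when outputCSV is enabled.
   */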
def write(headerText: String, outRows: Seq[ProfileResult],
emptyTableText: Option[String] = None, tableDesc: Option[String] = None): Unit = {
writeTextTable(headerText, outRows, emptyTableText, tableDesc)
if (outputCSV) {
ProfileOutputWriter.writeCSVTable(headerText, outRows, outputDir)
}
}
def close(): Unit = {
textFileWriter.close()
}
}
object ProfileOutputWriter {
val CSVDelimiter = ","
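  // Represent null or empty cells as a quoted empty string ("") in the CSV output.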
private def stringIfempty(str: String): String = {
if (str == null || str.isEmpty) "\"\"" else str
}
  /**
   * Write a CSV file given the input header and data.
   */
def writeCSVTable(header: String, outRows: Seq[ProfileResult], outputDir: String): Unit = {
if (outRows.nonEmpty) {
      // Each table goes into its own CSV file; the header text, with spaces
      // replaced by '_' and lowercased, is used as the file name.
val suffix = header.replace(" ", "_").toLowerCase
val csvWriter = new ToolTextFileWriter(outputDir, s"${suffix}.csv", s"$header CSV:")
try {
val headerString = outRows.head.outputHeaders.mkString(CSVDelimiter)
csvWriter.write(headerString + "\n")
val rows = outRows.map(_.convertToCSVSeq)
rows.foreach { row =>
val formattedRow = row.map(stringIfempty(_))
val outStr = formattedRow.mkString(CSVDelimiter)
csvWriter.write(outStr + "\n")
}
} finally {
csvWriter.close()
}
}
}
  /**
   * Regular expression matching full-width characters.
   *
   * Derived by rendering all Unicode characters in the range 0x0000-0xFFFF in a
   * terminal (Xshell), identifying the full-width ones, and collecting their ranges
   * into the character class below.
   */
private val fullWidthRegex = ("""[""" +
// scalastyle:off nonascii
"\u1100-\u115F" +
"\u2E80-\uA4CF" +
"\uAC00-\uD7A3" +
"\uF900-\uFAFF" +
"\uFE10-\uFE19" +
"\uFE30-\uFE6F" +
"\uFF00-\uFF60" +
"\uFFE0-\uFFE6" +
// scalastyle:on nonascii
"""]""").r
/**
   * Return the number of half-widths in a given string. Note that a full-width
   * character occupies two half-widths.
   *
   * For a string of one million characters, this method takes about 50 ms.
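   *
   * For example, stringHalfWidth("spark") returns 5, while a single full-width
   * character such as '\uFF21' (fullwidth 'A') counts as 2 half-widths.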
*/
def stringHalfWidth(str: String): Int = {
if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size
}
  // Originally copied from Spark's Dataset.showString and modified.
def makeFormattedString(
_numRows: Int,
truncate: Int = 20,
schema: Seq[String],
rows: Seq[Seq[String]]): String = {
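    // Clamp the requested row count; 2147483632 is Int.MaxValue - 15, the same value
    // as Spark's ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH.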
val numRows = _numRows.max(0).min(2147483632 - 1)
val hasMoreData = rows.length - 1 > numRows
val sb = new StringBuilder
val numCols = schema.length
    // We set a minimum column width of 3
val minimumColWidth = 3
// Initialise the width of each column to a minimum value
val colWidths = Array.fill(numCols)(minimumColWidth)
if (rows.nonEmpty && schema.size != rows.head.size) {
throw new IllegalArgumentException("schema must be same size as data!")
}
val escapedSchema = schema.map(
org.apache.spark.sql.rapids.tool.util.StringUtils.renderStr(_, doEscapeMetaCharacters = true,
maxLength = 0))
val schemaAndData = escapedSchema +: rows.map { row =>
row.map { cell =>
cell match {
case null => "null"
case str: String =>
            // Escape meta-characters so they do not break the `showString` format
org.apache.spark.sql.rapids.tool.util.StringUtils.renderStr(
str, doEscapeMetaCharacters = true, maxLength = truncate, showEllipses = true)
}
}: Seq[String]
}
// Compute the width of each column
for (row <- schemaAndData) {
for ((cell, i) <- row.zipWithIndex) {
colWidths(i) = math.max(colWidths(i), stringHalfWidth(cell))
}
}
val paddedRows = schemaAndData.map { row =>
row.zipWithIndex.map { case (cell, i) =>
if (truncate > 0) {
StringUtils.leftPad(cell, colWidths(i) - stringHalfWidth(cell) + cell.length)
} else {
StringUtils.rightPad(cell, colWidths(i) - stringHalfWidth(cell) + cell.length)
}
}
}
    // Create the separator line
val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
// column names
paddedRows.head.addString(sb, "|", "|", "|\n")
sb.append(sep)
// data
paddedRows.tail.foreach(_.addString(sb, "|", "|", "|\n"))
sb.append(sep)
// Print a footer
if (hasMoreData) {
      // For data with more than numRows records
val rowsString = if (numRows == 1) "row" else "rows"
sb.append(s"only showing top $numRows $rowsString\n")
}
sb.toString()
}
}
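
// Illustrative usage sketch (not part of the original source). makeFormattedString
// works on plain string sequences, so it can be exercised directly; the row data
// below is made up for demonstration:
//
//   val schema = Seq("appIndex", "sqlID", "duration")
//   val rows   = Seq(Seq("1", "0", "1200"), Seq("1", "1", "350"))
//   val table  = ProfileOutputWriter.makeFormattedString(1000, 0, schema, rows)
//   println(table)
//
// A ProfileOutputWriter instance is used similarly: write(header, profileResults)
// appends the table to "<filePrefix>.log" and, with outputCSV = true, also creates
// "<header_with_underscores>.csv" in the output directory.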