com.nvidia.spark.rapids.tool.planparser.DataWritingCommandExecParser.scala
rapids-4-spark-tools_2.12 (RAPIDS Accelerator for Apache Spark tools)
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.tool.planparser

import com.nvidia.spark.rapids.tool.qualification.PluginTypeChecker

import org.apache.spark.sql.execution.ui.SparkPlanGraphNode

case class DataWritingCommandExecParser(
    node: SparkPlanGraphNode,
    checker: PluginTypeChecker,
    sqlID: Long) extends ExecParser {

  // hardcode because InsertIntoHadoopFsRelationCommand uses this same exec
  // and InsertIntoHadoopFsRelationCommand doesn't have an entry in the
  // supported execs file
  val fullExecName: String = DataWritingCommandExecParser.dataWriteCMD

  override def parse: ExecInfo = {
    // At this point we are sure that the wrapper is defined
    val wStub = DataWritingCommandExecParser.getWriteCMDWrapper(node).get
    val writeSupported = checker.isWriteFormatSupported(wStub.dataFormat)
    val duration = None
    val speedupFactor = checker.getSpeedupFactor(wStub.mappedExec)
    val finalSpeedup = if (writeSupported) speedupFactor else 1
    // TODO - add in parsing expressions - average speedup across?
    // We do not want to parse the node description to avoid mistakenly marking the node as RDD/UDF
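    // e.g. writing ORC through "Execute InsertIntoHadoopFsRelationCommand" yields an exec entry
    // named "Execute InsertIntoHadoopFsRelationCommand orc" with the expression "Format: orc"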
    ExecInfo.createExecNoNode(sqlID, s"${node.name.trim} ${wStub.dataFormat.toLowerCase.trim}",
      s"Format: ${wStub.dataFormat.toLowerCase.trim}",
      finalSpeedup, duration, node.id, opType = OpTypes.WriteExec, writeSupported, None)
  }
}

// A case class used to hold information of the parsed Sql-node.
// The purpose is to be able to reuse the code between different components.
case class DataWritingCmdWrapper(
    execName: String,
    mappedExec: String,
    dataFormat: String)

object DataWritingCommandExecParser {
  val dataWriteCMD = "DataWritingCommandExec"
  private val defaultPhysicalCMD = dataWriteCMD
  // made public for testing
  val insertIntoHadoopCMD = "InsertIntoHadoopFsRelationCommand"
  val appendDataExecV1 = "AppendDataExecV1"
  val overwriteByExprExecV1 = "OverwriteByExpressionExecV1"
  val atomicReplaceTableExec = "AtomicReplaceTableAsSelect"
  val atomicCreateTableExec = "AtomicCreateTableAsSelect"

  // Note: List of writeExecs that represent a physical command.
  //       hardcode because InsertIntoHadoopFsRelationCommand uses this same exec
  //       and InsertIntoHadoopFsRelationCommand doesn't have an entry in the
  //       supported execs file
  private val physicalWriteCommands = Set(defaultPhysicalCMD)

  // A set of the logical commands that will be mapped to the physical write command
  // which has an entry in the speedupSheet
  // - InsertIntoHadoopFsRelationCommand is a logical command that does not have an entry in the
  //   supported execs file
  // - SaveIntoDataSourceCommand is generated by DeltaLake Table writes
  // - InsertIntoHiveTable is generated by Hive catalogue implementation
  // - OverwriteByExprExecV1 is a spark write Op but it has special handling for DeltaLake
  // - AppendDataExecV1 is a spark write Op but it has special handling for DeltaLake
  // - AtomicReplaceTableAsSelectExec and AtomicCreateTableAsSelectExec are spark write Ops but
  //   they have special handling for DeltaLake
  private val logicalWriteCommands = Set(
    dataWriteCMD,
    insertIntoHadoopCMD,
    HiveParseHelper.INSERT_INTO_HIVE_LABEL,
    appendDataExecV1,
    overwriteByExprExecV1,
    atomicReplaceTableExec,
    atomicCreateTableExec
  )

  // Note: Defines a list of the execs that include formatted data.
  //       This will be used to extract the format and then check whether the
  //       format is supported or not.
  private val formattedWriteCommands = Set(dataWriteCMD, insertIntoHadoopCMD)

  // For now, we map the SaveIntoDataSourceCommand to defaultPhysicalCMD because we do not
  // have a speedup entry for the DeltaLake write operation
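  // e.g. an InsertIntoHadoopFsRelationCommand is scored against the default
  // DataWritingCommandExec entry, while AppendDataExecV1 keeps its own name for the lookup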
  private val logicalToPhysicalCmdMap = Map(
    insertIntoHadoopCMD -> defaultPhysicalCMD,
    HiveParseHelper.INSERT_INTO_HIVE_LABEL -> defaultPhysicalCMD,
    appendDataExecV1 -> appendDataExecV1,
    overwriteByExprExecV1 -> overwriteByExprExecV1,
    atomicReplaceTableExec -> "AtomicReplaceTableAsSelectExec",
    atomicCreateTableExec -> "AtomicCreateTableAsSelectExec"
  )

  // Map to hold the relation between writeExecCmd and the format.
  // This is used for expressions that do not show the format as part of the description.
  private val specialWriteFormatMap = Map[String, String](
    // if appendDataExecV1 is not deltaLakeProvider, then we want to mark it as unsupported
    appendDataExecV1 -> "unknown",
    // if overwriteByExprExecV1 is not deltaLakeProvider, then we want to mark it as unsupported
    overwriteByExprExecV1 -> "unknown",
    // if atomicReplaceTableExec is not deltaLakeProvider, then we want to mark it as unsupported
    atomicReplaceTableExec -> "unknown",
    // if atomicCreateTableExec is not deltaLakeProvider, then we want to mark it as unsupported
    atomicCreateTableExec -> "unknown"
  )

  // Checks whether a node is a write CMD Exec
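  // e.g. "Execute InsertIntoHadoopFsRelationCommand" matches insertIntoHadoopCMD and is treated
  // as a write command; DeltaLake-specific write nodes are accepted through DeltaLakeHelper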
  def isWritingCmdExec(nodeName: String): Boolean = {
    logicalWriteCommands.exists(nodeName.contains(_)) || DeltaLakeHelper.accepts(nodeName)
  }

  def getPhysicalExecName(opName: String): String = {
    logicalToPhysicalCmdMap.getOrElse(opName, defaultPhysicalCMD)
  }

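  // Wraps a write node with its logical command name, the physical command used for the speedup
  // lookup, and the extracted data format; e.g. an "Execute InsertIntoHadoopFsRelationCommand"
  // node writing ORC yields DataWritingCmdWrapper(InsertIntoHadoopFsRelationCommand,
  // DataWritingCommandExec, ORC). Returns None when the node is not a recognized write command.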
  def getWriteCMDWrapper(node: SparkPlanGraphNode): Option[DataWritingCmdWrapper] = {
    val processedNodeName = node.name.trim
    logicalWriteCommands.find(processedNodeName.contains(_)) match {
      case None => None
      case Some(wCmd) =>
        // Get the data format from the map if it exists.
        // Otherwise, fall back to the string parser.
        val dataFormat = specialWriteFormatMap.get(wCmd) match {
          case Some(f) => f
          case None =>
            if (HiveParseHelper.isHiveTableInsertNode(node.name)) {
              // Use Hive Utils to extract the format from insertIntoHiveTable based on the
              // SerDe class.
              HiveParseHelper.getWriteFormat(node)
            } else {
              // Use the default parser to extract the write-format
              getWriteFormatString(node.desc)
            }
        }
        val physicalCmd = getPhysicalExecName(wCmd)
        Some(DataWritingCmdWrapper(wCmd, physicalCmd, dataFormat))
    }
  }

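  // Entry point for parsing a write node: DeltaLake write ops are delegated to DeltaLakeHelper,
  // all other supported write commands go through DataWritingCommandExecParser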
  def parseNode(node: SparkPlanGraphNode,
      checker: PluginTypeChecker,
      sqlID: Long): ExecInfo = {
    if (DeltaLakeHelper.acceptsWriteOp(node)) {
      DeltaLakeHelper.parseNode(node, checker, sqlID)
    } else {
      DataWritingCommandExecParser(node, checker, sqlID).parse
    }
  }

  // Gets the data format by parsing the description of the node
  def getWriteFormatString(node: String): String = {
    // We need to parse the input string to get the write format. The write format is either the
    // third or the fourth parameter in the input string. If the partition columns are provided,
    // then the write format will be the fourth parameter.
    // Example string in the eventlog:
    // Execute InsertIntoHadoopFsRelationCommand
    // gs://08f3844/, false, [op_cmpny_cd#25, clnt_rq_sent_dt#26], ORC, Map(path -> gs://08f3844)
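    // For this example the optional partition-column list is present, so the returned format is
    // the fourth parameter: "ORC"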
    val parsedString = node.split(",", 3).last.trim // remove first 2 parameters from the string
    if (parsedString.startsWith("[")) {
      // The optional partition-column list is present in the eventlog. Get the fourth parameter
      // by skipping the optional parameter string.
      parsedString.split("(?<=],)").map(_.trim).slice(1, 2)(0).split(",")(0)
    } else {
      parsedString.split(",")(0) // return third parameter from the input string
    }
  }
}
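
// A minimal usage sketch: it only exercises the pure string parser above on a sample event-log
// description. The object name and the sample description are illustrative assumptions, not part
// of the tool itself.
object GetWriteFormatStringExample extends App {
  // Description of an InsertIntoHadoopFsRelationCommand node that includes partition columns,
  // so the write format is the fourth comma-separated parameter.
  val sampleDesc = "Execute InsertIntoHadoopFsRelationCommand gs://08f3844/, false, " +
    "[op_cmpny_cd#25, clnt_rq_sent_dt#26], ORC, Map(path -> gs://08f3844)"
  // Prints "ORC"
  println(DataWritingCommandExecParser.getWriteFormatString(sampleDesc))
}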