org.apache.spark.sql.execution.QueryExecution.scala Maven / Gradle / Ivy
package org.apache.spark.sql.execution
import java.nio.charset.StandardCharsets
import java.sql.Timestamp
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{AnalysisException, Row, SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.command.{DescribeTableCommand, ExecutedCommandExec}
import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{BinaryType, DateType, DecimalType, TimestampType, _}
/**
 * The primary workflow for executing relational queries using Spark. Designed to allow easy
 * access to the intermediate phases of query execution for developers.
 *
 * While this is not a public class, we should avoid changing the function names for the sake of
 * changing them, because a lot of developers use the feature for debugging.
 */
class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) {
// TODO: Move the planner and optimizer into here from SessionState.
/** The planner held by this session's state; looked up again on every call. */
protected def planner = {
  val state = sparkSession.sessionState
  state.planner
}
/**
 * Runs the session analyzer's `checkAnalysis` over `analyzed`. When that raises an
 * [[AnalysisException]], a new one is built from the original's message, line and start
 * position, with `analyzed` attached as the plan, and that new exception is thrown.
 */
def assertAnalyzed(): Unit = {
try sparkSession.sessionState.analyzer.checkAnalysis(analyzed) catch {
case e: AnalysisException =>
// Rebuild the exception so that it carries the analyzed plan (fourth constructor argument).
val ae = new AnalysisException(e.message, e.line, e.startPosition, Some(analyzed))
throw ae
/**
 * Check that only runs when `SQLConf.UNSUPPORTED_OPERATION_CHECK_ENABLED` is true in the
 * session configuration; otherwise this method does nothing.
 * NOTE(review): presumably delegates to the imported UnsupportedOperationChecker -- confirm.
 */
def assertSupported(): Unit = {
// The configuration flag gates the whole check.
if (sparkSession.sessionState.conf.getConf(SQLConf.UNSUPPORTED_OPERATION_CHECK_ENABLED)) {
/**
 * The analyzed form of the logical plan, computed lazily on first access. It is what
 * `assertAnalyzed()` validates, and its `output` attributes supply the column names and data
 * types used by `hiveResultString()` and `toString`.
 */
lazy val analyzed: LogicalPlan = {
/**
 * Lazily computed logical plan that is handed to the optimizer by `optimizedPlan`.
 * NOTE(review): the name suggests plan fragments are replaced by cached data here -- confirm.
 */
lazy val withCachedData: LogicalPlan = {
/** Result of running the session optimizer over `withCachedData`; computed once, on first use. */
lazy val optimizedPlan: LogicalPlan = {
  val optimizer = sparkSession.sessionState.optimizer
  optimizer.execute(withCachedData)
}
/**
 * Lazily computed physical plan. It is the input that `executedPlan` passes to
 * `prepareForExecution`.
 */
lazy val sparkPlan: SparkPlan = {
// The physical plan after all preparation rules have been applied, computed once on first use.
// Use it for execution only; it must not be used to initialize any other SparkPlan.
lazy val executedPlan: SparkPlan = {
  val planned = sparkPlan
  prepareForExecution(planned)
}
/**
 * Internal version of the RDD, obtained by executing `executedPlan`. Its rows are
 * [[InternalRow]]s: this avoids copies and carries no schema.
 */
lazy val toRdd: RDD[InternalRow] = {
  val physical = executedPlan
  physical.execute()
}
/**
 * Prepares a planned [[SparkPlan]] for execution by inserting shuffle operations and internal
 * row format conversions as needed.
 */
protected def prepareForExecution(plan: SparkPlan): SparkPlan = {
// Thread the plan through every rule in `preparations`, in sequence order: each rule receives
// the plan produced by the rule before it, and the last rule's output is the result.
preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
/**
 * A sequence of rules that will be applied in order to the physical plan before execution.
 * `prepareForExecution` folds over this sequence from left to right, so the position of a
 * rule in the sequence determines when it runs.
 */
protected def preparations: Seq[Rule[SparkPlan]] = Seq(
/**
 * Evaluates the by-name argument and returns its `toString`. If the evaluation or the
 * `toString` call throws, the thrown object's own `toString` is returned instead, so the
 * caller always gets a string back.
 *
 * @param f expression to render; it is evaluated inside the guarded region
 * @return the rendered value, or the textual form of whatever was thrown
 */
protected def stringOrError[A](f: => A): String = {
  try {
    val value = f
    value.toString
  } catch {
    // Deliberately broad: every Throwable is turned into text rather than propagated.
    case failure: Throwable => failure.toString
  }
}
/**
 * Returns the result as a Hive-compatible sequence of strings. For native commands, the
 * execution is simply passed back to Hive.
 */
// Dispatches on the shape of the executed physical plan: a DESCRIBE TABLE command, any other
// executed command, or an ordinary query plan.
def hiveResultString(): Seq[String] = executedPlan match {
case ExecutedCommandExec(desc: DescribeTableCommand) =>
// If it is a describe command for a Hive table, we want the output format
// to be similar to Hive's.
desc.run(sparkSession).map {
// Each row of the describe output is (column name, data type, comment).
case Row(name: String, dataType: String, comment) =>
Seq(name, dataType,
// "%-20s" left-justifies each field and pads it to a minimum width of 20 characters.
.map(s => String.format(s"%-20s", s))
// Any other executed command.
case command: ExecutedCommandExec =>
// Any other plan: collect all rows, keeping each row as a plain sequence of values.
case other =>
val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq
// We need the types so we can output struct field names
val types = analyzed.output.map(_.dataType)
// Reformat to match hive tab delimited output.
/**
 * Formats a datum (based on the given data type) and returns the string representation.
 *
 * @param a pair of (value, data type of that value)
 */
private def toHiveString(a: (Any, DataType)): String = {
// Types whose values fall through to a plain `toString` in the catch-all cases below.
val primitiveTypes = Seq(StringType, IntegerType, LongType, DoubleType, FloatType,
BooleanType, ByteType, ShortType, DateType, TimestampType, BinaryType)
/** Implementation following Hive's TimestampWritable.toString */
def formatTimestamp(timestamp: Timestamp): String = {
// java.sql.Timestamp.toString yields "yyyy-mm-dd hh:mm:ss.fffffffff": the first 19 characters
// are the date and time, everything after them is the fractional-seconds part.
val timestampString = timestamp.toString
if (timestampString.length() > 19) {
// Exactly 21 characters means a single fractional digit; a ".0" tail is a zero fraction.
if (timestampString.length() == 21) {
if (timestampString.substring(19).compareTo(".0") == 0) {
return DateTimeUtils.threadLocalTimestampFormat.get().format(timestamp)
return DateTimeUtils.threadLocalTimestampFormat.get().format(timestamp) +
return DateTimeUtils.threadLocalTimestampFormat.get().format(timestamp)
def formatDecimal(d: java.math.BigDecimal): String = {
// compareTo (unlike equals) ignores scale, so 0, 0.0 and 0.00 all take the first branch.
if (d.compareTo(java.math.BigDecimal.ZERO) == 0) {
} else {
/** Hive outputs fields of structs slightly differently than top level attributes. */
def toHiveStructString(a: (Any, DataType)): String = a match {
// Struct: rendered as {"field":value,...} with each value formatted recursively.
case (struct: Row, StructType(fields)) =>
struct.toSeq.zip(fields).map {
case (v, t) => s""""${t.name}":${toHiveStructString(v, t.dataType)}"""
}.mkString("{", ",", "}")
// Array: rendered as [v1,v2,...].
case (seq: Seq[_], ArrayType(typ, _)) =>
seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
// Map: each entry becomes "key:value"; the rendered entries are sorted as strings so the
// output order does not depend on the map's iteration order.
case (map: Map[_, _], MapType(kType, vType, _)) =>
map.map {
case (key, value) =>
toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
}.toSeq.sorted.mkString("{", ",", "}")
// Nested nulls are lower-case, unlike the top-level "NULL" below.
case (null, _) => "null"
// Nested strings are wrapped in double quotes; the content is not escaped.
case (s: String, StringType) => "\"" + s + "\""
// Nested decimals use plain toString rather than formatDecimal.
case (decimal, DecimalType()) => decimal.toString
case (other, tpe) if primitiveTypes contains tpe => other.toString
// Top-level formatting. The struct, array and map cases repeat the nested rendering above and
// delegate their elements to toHiveStructString.
a match {
case (struct: Row, StructType(fields)) =>
struct.toSeq.zip(fields).map {
case (v, t) => s""""${t.name}":${toHiveStructString(v, t.dataType)}"""
}.mkString("{", ",", "}")
case (seq: Seq[_], ArrayType(typ, _)) =>
seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
case (map: Map[_, _], MapType(kType, vType, _)) =>
map.map {
case (key, value) =>
toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
}.toSeq.sorted.mkString("{", ",", "}")
case (null, _) => "NULL"
// NOTE(review): java.util.Date.toString prints "dow mon dd hh:mm:ss zzz yyyy" in the JVM's
// default time zone, which is presumably not the date form Hive prints -- confirm.
case (d: Int, DateType) => new java.util.Date(DateTimeUtils.daysToMillis(d)).toString
case (t: Timestamp, TimestampType) => formatTimestamp(t)
// Binary values are decoded as UTF-8 text.
case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8)
case (decimal: java.math.BigDecimal, DecimalType()) => formatDecimal(decimal)
case (other, tpe) if primitiveTypes.contains(tpe) => other.toString
/** Short textual form of this query execution; the text opens with the physical plan heading. */
def simpleString: String = {
s"""== Physical Plan ==
/**
 * Multi-section textual form of this query execution, with one heading each for the parsed,
 * analyzed, optimized and physical plans.
 */
override def toString: String = {
// Schema line: "name: type" for every output attribute of the analyzed plan, comma separated.
// Declared as a def so it is evaluated inside stringOrError, which turns a failure into text.
def output =
analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}").mkString(", ")
// Schema line followed by the analyzed plan, one per line; empty pieces are dropped.
val analyzedPlan =
Seq(stringOrError(output), stringOrError(analyzed)).filter(_.nonEmpty).mkString("\n")
s"""== Parsed Logical Plan ==
|== Analyzed Logical Plan ==
|== Optimized Logical Plan ==
|== Physical Plan ==
/** A special namespace for commands that can be used to debug query execution. */
// scalastyle:off
object debug {
// scalastyle:on
/**
 * Prints to stdout all the generated code found in this plan (i.e. the output of each
 * WholeStageCodegen subtree).
 */
def codegen(): Unit = {
// The scalastyle println rule is disabled only for the statements between the two markers
// below and is re-enabled immediately afterwards.
// scalastyle:off println
// scalastyle:on println
