org.apache.spark.sql.execution.CollectMetricsExec.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-sql_2.12 Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.execution

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
import org.apache.spark.sql.types.StructType

/**
 * Physical operator that gathers the (named) metrics described by `metricExpressions` over the
 * rows produced by `child`, while handing those rows on unchanged.
 *
 * @param name identifier paired with the collected metrics row
 * @param metricExpressions named expressions that define the metrics to aggregate
 * @param child plan whose output rows are fed into the metrics accumulator
 */
case class CollectMetricsExec(
    name: String,
    metricExpressions: Seq[NamedExpression],
    child: SparkPlan)
  extends UnaryExecNode {

  // Built and registered with the SparkContext on first access.
  private lazy val accumulator: AggregatingAccumulator = {
    val registered = AggregatingAccumulator(metricExpressions, child.output)
    registered.register(sparkContext, Option("Collected metrics"))
    registered
  }

  /** Schema of the metrics row, derived from the attributes of the metric expressions. */
  val metricsSchema: StructType = {
    val metricAttributes = metricExpressions.map(_.toAttribute)
    DataTypeUtils.fromAttributes(metricAttributes)
  }

  // Invoked rarely (once per query), so code generation would not pay off for this conversion.
  private lazy val toRowConverter: InternalRow => Row = {
    val scalaConverter = CatalystTypeConverters.createToScalaConverter(metricsSchema)
    scalaConverter.asInstanceOf[InternalRow => Row]
  }

  /** The accumulator's current value, converted from an [[InternalRow]] to a [[Row]]. */
  def collectedMetrics: Row = toRowConverter(accumulator.value)

  // Output attributes, partitioning and ordering are all taken directly from the child.
  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def outputOrdering: Seq[SortOrder] = child.outputOrdering

  /**
   * Resets the accumulator, then executes the child and feeds every row of every partition into
   * a per-task copy of the accumulator. The rows themselves are returned as-is.
   */
  override protected def doExecute(): RDD[InternalRow] = {
    // Local alias of the accumulator; this is the reference used inside the partition function.
    val metricsAcc = accumulator
    metricsAcc.reset()
    val childRows = child.execute()
    childRows.mapPartitions { partition =>
      // The accumulator's value is published only once the task has completed: rows are added
      // to a task local copy ('taskLocalAcc'), and that copy is folded into the actual
      // accumulator from the task completion listener. Doing so avoids two problems during the
      // heartbeat:
      // - Correctness issues caused by partially completed/visible updates.
      // - Performance issues caused by excessive serialization.
      val taskLocalAcc = metricsAcc.copyAndReset()
      TaskContext.get().addTaskCompletionListener[Unit] { _ =>
        if (!metricsAcc.isZero) {
          metricsAcc.merge(taskLocalAcc)
        } else {
          // Nothing accumulated yet: adopt the task local state instead of merging into it.
          metricsAcc.setState(taskLocalAcc)
        }
      }

      partition.map { row =>
        taskLocalAcc.add(row)
        row
      }
    }
  }

  override protected def withNewChildInternal(newChild: SparkPlan): CollectMetricsExec =
    copy(child = newChild)
}

object CollectMetricsExec extends AdaptiveSparkPlanHelper {
  /**
   * Recursively gathers the metrics of every [[CollectMetricsExec]] found in a query tree.
   *
   * Traversal goes through `collectWithSubqueries`; for an [[InMemoryTableScanExec]] the cached
   * plan of its relation is searched as well.
   *
   * @param plan root of the physical plan to search
   * @return the collected metrics rows keyed by collector name; empty when nothing matched.
   *         When the same name occurs more than once, the entry merged last wins.
   */
  def collect(plan: SparkPlan): Map[String, Row] = {
    val perNodeMetrics = collectWithSubqueries(plan) {
      case metricsNode: CollectMetricsExec =>
        Map(metricsNode.name -> metricsNode.collectedMetrics)
      case cachedScan: InMemoryTableScanExec =>
        CollectMetricsExec.collect(cachedScan.relation.cachedPlan)
    }
    // Merge left to right, starting from an empty map so that no match yields an empty result.
    perNodeMetrics.foldLeft(Map.empty[String, Row])(_ ++ _)
  }
}