com.nvidia.spark.rapids.basicPhysicalOperators.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rapids-4-spark_2.12 Show documentation
Show all versions of rapids-4-spark_2.12 Show documentation
Creates the distribution package of the RAPIDS plugin for Apache Spark
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package com.nvidia.spark.rapids
import scala.annotation.tailrec
import scala.collection.mutable.ArrayBuffer
import ai.rapids.cudf
import ai.rapids.cudf._
import com.nvidia.spark.Retryable
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import com.nvidia.spark.rapids.GpuMetric._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRestoreOnRetry, withRetry, withRetryNoSplit}
import com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM
import com.nvidia.spark.rapids.shims._
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, RangePartitioning, SinglePartition, UnknownPartitioning}
import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SampleExec, SparkPlan}
import org.apache.spark.sql.rapids.{GpuPartitionwiseSampledRDD, GpuPoissonSampler}
import org.apache.spark.sql.rapids.execution.TrampolineUtil
import org.apache.spark.sql.types.{DataType, LongType}
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
import org.apache.spark.util.random.BernoulliCellSampler
// Planning-time wrapper around a CPU ProjectExec. convertToGpu() returns GpuProjectAstExec when
// AST projection is enabled and every child expression reports canThisBeAst; GpuProjectExec is
// the other result visible here.
// NOTE(review): this listing has lost lines (closing braces and parts of some expressions), so
// the block does not parse as shown. Recover the complete text before changing any code here.
class GpuProjectExecMeta(
proj: ProjectExec,
conf: RapidsConf,
p: Option[RapidsMeta[_, _, _]],
r: DataFromReplacementRule) extends SparkPlanMeta[ProjectExec](proj, conf, p, r)
with Logging {
override def convertToGpu(): GpuExec = {
// Force list to avoid recursive Java serialization of lazy list Seq implementation
// NOTE(review): the right-hand side of the next assignment is truncated in this listing.
val gpuExprs =[NamedExpression]).toList
val gpuChild = childPlans.head.convertIfNeeded()
if (conf.isProjectAstEnabled) {
if (childExprs.forall(_.canThisBeAst)) {
return GpuProjectAstExec(gpuExprs, gpuChild)
// explain AST because this is optional and it is sometimes hard to debug
if (conf.shouldExplain) {
// NOTE(review): the expression that produces `explain` is missing from this listing.
val explain =
if (explain.nonEmpty) {
logWarning(s"AST PROJECT\n$explain")
GpuProjectExec(gpuExprs, gpuChild)
// Helpers for evaluating a list of bound expressions against a ColumnarBatch, with and without
// retry support.
// NOTE(review): this listing has lost lines (closing braces, some bodies and parts of some
// expressions); comments below describe only what is still visible.
object GpuProjectExec {
// Runs project() inside an NVTX range that is tied to the opTime metric.
// NOTE(review): the body of the `finally` clause is missing from this listing.
def projectAndClose[A <: Expression](cb: ColumnarBatch, boundExprs: Seq[A],
opTime: GpuMetric): ColumnarBatch = {
val nvtxRange = new NvtxWithMetrics("ProjectExec", NvtxColor.CYAN, opTime)
try {
project(cb, boundExprs)
} finally {
// Resolves an expression to the ordinal of the single bound reference it stands for:
// aliases are unwrapped recursively, a GpuBoundReference yields its ordinal, and anything
// else yields None.
private def extractSingleBoundIndex(expr: Expression): Option[Int] = expr match {
case ga: GpuAlias => extractSingleBoundIndex(ga.child)
case br: GpuBoundReference => Some(br.ordinal)
case _ => None
// Sequence overload.
// NOTE(review): the body is missing from this listing; presumably it applies the
// single-expression version to each element - confirm against the full source.
def extractSingleBoundIndex(boundExprs: Seq[Expression]): Seq[Option[Int]] =
// A projection is a no-op when it has exactly one expression per input column and the i-th
// expression resolves to bound reference i.
// NOTE(review): the value of the `else` branch is missing from this listing.
def isNoopProject(cb: ColumnarBatch, boundExprs: Seq[Expression]): Boolean = {
if (boundExprs.length == cb.numCols()) {
extractSingleBoundIndex(boundExprs).zip(0 until cb.numCols()).forall {
case (Some(foundIndex), expectedIndex) => foundIndex == expectedIndex
case _ => false
} else {
// Evaluates every bound expression against `cb` and wraps the resulting columns in a new
// batch with the same row count. The no-op case is handled separately.
// NOTE(review): the statement executed for the no-op case is missing from this listing.
def project(cb: ColumnarBatch, boundExprs: Seq[Expression]): ColumnarBatch = {
if (isNoopProject(cb, boundExprs)) {
// This can help avoid contiguous splits in some cases when the input data is also contiguous
} else {
val newColumns = boundExprs.safeMap(_.columnarEval(cb)).toArray[ColumnVector]
new ColumnarBatch(newColumns, cb.numRows())
* Similar to project, but it will try and retry the operations if it can. It also will close
* the input SpillableColumnarBatch.
* @param sb the input batch
* @param boundExprs the expressions to run
* @return the resulting batch
def projectAndCloseWithRetrySingleBatch(sb: SpillableColumnarBatch,
boundExprs: Seq[Expression]): ColumnarBatch = {
// withResource closes `sb` once the projection has finished.
withResource(sb) { _ =>
projectWithRetrySingleBatch(sb, boundExprs)
* Similar to project, but it will try and retry the operations if it can. The caller is
* responsible for closing the input batch.
* @param sb the input batch
* @param boundExprs the expressions to run
* @return the resulting batch
def projectWithRetrySingleBatch(sb: SpillableColumnarBatch,
boundExprs: Seq[Expression]): ColumnarBatch = {
// First we find and run all of the expressions that are not retryable, because they
// cannot be re-run inside the retry block below.
// NOTE(review): the predicate passed to `partition` is truncated in this listing.
val (retryableExprs, notRetryableExprs) = boundExprs.partition(
val retryables = GpuExpressionsUtils.collectRetryables(retryableExprs)
// `snd` holds the results of the non-retryable expressions as a spillable batch, when
// there are any.
val snd = if (notRetryableExprs.nonEmpty) {
withResource(sb.getColumnarBatch()) { cb =>
Some(SpillableColumnarBatch(project(cb, notRetryableExprs),
} else {
withResource(snd) { snd =>
RmmRapidsRetryIterator.withRetryNoSplit {
val deterministicResults = withResource(sb.getColumnarBatch()) { cb =>
withRestoreOnRetry(retryables) {
// For now we are just going to run all of these and deal with losing work...
project(cb, retryableExprs)
if (snd.isEmpty) {
// We are done and the order should be the same so we don't need to do anything...
} else {
// There was a mix of deterministic and non-deterministic...
// Interleave the two result batches back into the order of `boundExprs`, taking the next
// column from whichever batch matches each expression's `deterministic` flag.
withResource(deterministicResults) { _ =>
withResource(snd.get.getColumnarBatch()) { nd =>
var ndAt = 0
var detAt = 0
val outputColumns = ArrayBuffer[ColumnVector]()
boundExprs.foreach { expr =>
if (expr.deterministic) {
outputColumns += deterministicResults.column(detAt)
detAt += 1
} else {
outputColumns += nd.column(ndAt)
ndAt += 1
GpuColumnVector.incRefCounts(new ColumnarBatch(outputColumns.toArray, sb.numRows()))
// Extractor that lets any GpuProjectExecLike plan be pattern matched as
// (projectList, child); every other plan yields None.
// NOTE(review): closing braces are missing from this listing.
object GpuProjectExecLike {
def unapply(plan: SparkPlan): Option[(Seq[Expression], SparkPlan)] = plan match {
case gpuProjectLike: GpuProjectExecLike =>
Some((gpuProjectLike.projectList, gpuProjectLike.child))
case _ => None
// Common behaviour of the GPU project operators: ordering, partitioning and output batching are
// taken from the child, and row-based execution is rejected.
// NOTE(review): closing braces are missing from this listing.
trait GpuProjectExecLike extends ShimUnaryExecNode with GpuExec {
// The expressions this operator evaluates.
def projectList: Seq[Expression]
// NOTE(review): the entries of this metrics map are missing from this listing.
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
override def outputOrdering: Seq[SortOrder] = child.outputOrdering
override def outputPartitioning: Partitioning = child.outputPartitioning
// Always throws: only the columnar path is supported.
override def doExecute(): RDD[InternalRow] =
throw new IllegalStateException(s"Row-based execution should not occur for $this")
// The same as what feeds us
override def outputBatching: CoalesceGoal = GpuExec.outputBatching(child)
* An iterator that is intended to split the input to or output of a project on rows.
* In practice this is only used for splitting the input prior to a project in some
* very special cases, namely when the projected size of the output is so large that we would
* risk not being able to split it later on if we ran into trouble.
* @param iter the input iterator of columnar batches.
* @param schema the schema of that input so we can make things spillable if needed
* @param opTime metric for how long this took
* @param numSplitsMetric metric for the number of splits that happened.
// Iterator that may split each incoming batch by rows into several batches. Subclasses decide
// the number of splits through calcNumSplits; pieces that are not returned immediately wait in
// `pending` as spillable batches.
// NOTE(review): this listing has lost lines (closing braces and several statements), so the
// block does not parse as shown; comments describe only what is still visible.
abstract class AbstractProjectSplitIterator(iter: Iterator[ColumnarBatch],
schema: Array[DataType],
opTime: GpuMetric,
numSplitsMetric: GpuMetric) extends Iterator[ColumnarBatch] {
// Split pieces that have been produced but not yet handed to the caller.
private[this] val pending = new scala.collection.mutable.Queue[SpillableColumnarBatch]()
override def hasNext: Boolean = pending.nonEmpty || iter.hasNext
// Number of pieces the given batch should be split into; values <= 1 mean "do not split".
protected def calcNumSplits(cb: ColumnarBatch): Int
override def next(): ColumnarBatch = {
if (!hasNext) {
throw new NoSuchElementException()
} else if (pending.nonEmpty) {
// Queued pieces are served before any new input is pulled.
opTime.ns {
withRetryNoSplit(pending.dequeue()) { sb =>
} else {
// NOTE(review): the expression that produces `cb` is missing from this listing.
val cb =
opTime.ns {
val numSplits = closeOnExcept(cb) { cb =>
if (numSplits <= 1) {
} else {
// this should never happen but it is here just in case
require(cb.numCols() > 0,
"About to perform cuDF table operations with a rows-only batch.")
// Only the additional pieces are counted, hence the "- 1".
numSplitsMetric += numSplits - 1
val sb = SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
val tables = withRetryNoSplit(sb) { sb =>
withResource(sb.getColumnarBatch()) { cb =>
withResource(GpuColumnVector.from(cb)) { table =>
// Split points are every rowsPerSplit rows, where rowsPerSplit is rows / numSplits
// rounded up.
val rows = table.getRowCount.toInt
val rowsPerSplit = math.ceil(rows.toDouble / numSplits).toInt
val splitIndexes = rowsPerSplit until rows by rowsPerSplit
table.contiguousSplit(splitIndexes: _*)
withResource(tables) { tables =>
(1 until tables.length).foreach { ix =>
val tbl = tables(ix)
tables(ix) = null // everything but the head table will be nulled, queued
// as spillable batches in `pending`
SpillableColumnarBatch(tbl, schema, SpillPriorities.ACTIVE_BATCHING_PRIORITY))
// The head table is converted directly into the batch built here.
GpuColumnVector.from(tables.head.getTable, schema)
// Companion helper that estimates a lower bound for the size of a projection's output.
// NOTE(review): this listing has lost lines (closing braces, the collection being iterated and
// the body of the pass-through case), so the block does not parse as shown.
object PreProjectSplitIterator {
// Per output column: fixed-width types use GpuBatchUtils.minGpuMemory for numRows rows; for
// other types the pass-through input index is looked up first, with minGpuMemory as the
// fallback when the column is not a pass-through.
def calcMinOutputSize(cb: ColumnarBatch, boundExprs: GpuTieredProject): Long = {
// NOTE(review): the next line is truncated; the expression iterated over is missing.
val numRows = cb.numRows() {
case (dataType, index) =>
if (GpuBatchUtils.isFixedWidth(dataType)) {
GpuBatchUtils.minGpuMemory(dataType, true, numRows)
} else {
boundExprs.getPassThroughIndex(index).map { inputIndex =>
}.getOrElse {
GpuBatchUtils.minGpuMemory(dataType, true, numRows)
* An iterator that can be used to split the input of a project before it happens to prevent
* situations where the output could not be split later on. In testing we tried to see what
* would happen if we split it to the target batch size, but there was a very significant
* performance degradation when that happened. For now this is only used in a few specific
* places and not everywhere. In the future this could be extended, but if we do that there
* are some places where we don't want a split, like a project before a window operation.
* @param iter the input iterator of columnar batches.
* @param schema the schema of that input so we can make things spillable if needed
* @param boundExprs the bound project so we can get a good idea of the output size.
* @param opTime metric for how long this took
* @param numSplits the number of splits that happened.
// Split iterator that sizes its splits from the estimated output of `boundExprs`.
// All constructor arguments except `boundExprs` are forwarded to AbstractProjectSplitIterator.
class PreProjectSplitIterator(
iter: Iterator[ColumnarBatch],
schema: Array[DataType],
boundExprs: GpuTieredProject,
opTime: GpuMetric,
numSplits: GpuMetric) extends AbstractProjectSplitIterator(iter, schema, opTime, numSplits) {
// We memoize this parameter here as the value doesn't change during the execution
// of a SQL query. This is the highest level we can cache at without getting it
// passed in from the Exec that instantiates this split iterator.
// NOTE: this is overwritten by tests to trigger various corner cases
// Kept as a Double so the size ratio computed in calcNumSplits is a floating-point division.
private lazy val splitUntilSize: Double = GpuDeviceManager.getSplitUntilSize.toDouble
* calcNumSplits returns the number of splits that we need for the input, in the case
* that we can detect that a projection using `boundExprs` would expand the output above
* `GpuDeviceManager.getSplitUntilSize`.
* @note In the corner case that `cb` is rows-only (no columns), this function returns 0 and the
* caller must be prepared to handle that case.
// Returns 0 for a rows-only batch; otherwise the estimated minimum output size divided by
// splitUntilSize, rounded up, and never less than 1.
// NOTE(review): closing braces are missing from this listing.
override def calcNumSplits(cb: ColumnarBatch): Int = {
if (cb.numCols() == 0) {
0 // rows-only batches should not be split
} else {
val minOutputSize = PreProjectSplitIterator.calcMinOutputSize(cb, boundExprs)
// If the minimum size is too large we will split before doing the project, to help avoid
// extreme cases where the output size is so large that we cannot split it afterwards.
math.max(1, math.ceil(minOutputSize / splitUntilSize).toInt)
// GPU project operator that evaluates `projectList` through a tiered, bound projection and
// updates the output row and batch metrics for every batch it produces.
// NOTE(review): this listing has lost lines (closing braces, the bodies of `output` and
// `additionalMetrics`, the remaining bind arguments and the call that consumes `sb`), so the
// block does not parse as shown.
case class GpuProjectExec(
// NOTE for Scala 2.12.x and below we enforce usage of (eager) List to prevent running
// into a deep recursion during serde of lazy lists. See
// Whereas a similar issue is resolved
// using an Array, we opt in for List because it implements Seq while having non-recursive
// serde:
// immutable/List.scala#L516
projectList: List[NamedExpression],
child: SparkPlan) extends GpuProjectExecLike {
override def output: Seq[Attribute] =
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
override def internalDoExecuteColumnar() : RDD[ColumnarBatch] = {
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
// Bind the projection against the child's output in tiers.
val boundProjectList = GpuBindReferences.bindGpuReferencesTiered(projectList, child.output,
// NOTE(review): the per-batch transformation applied to the child RDD is truncated here.
val rdd = child.executeColumnar() { cb =>
val ret = withResource(new NvtxWithMetrics("ProjectExec", NvtxColor.CYAN, opTime)) { _ =>
// The input is made spillable before the projection runs.
val sb = SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
//Note if this ever changes to include splitting the output we need to have an option to not
// do this for window to work properly.
numOutputBatches += 1
numOutputRows += ret.numRows()
/**
 * Use cudf AST expressions to project columnar batches.
 *
 * NOTE(review): this listing has lost lines (closing braces and several expressions), so the
 * block does not parse as shown; comments describe only what is still visible.
 */
case class GpuProjectAstExec(
// NOTE for Scala 2.12.x and below we enforce usage of (eager) List to prevent running
// into a deep recursion during serde of lazy lists. See
// Whereas a similar issue is resolved
// using an Array, we opt in for List because it implements Seq while having non-recursive
// serde:
// immutable/List.scala#L516
projectList: List[Expression],
child: SparkPlan
) extends GpuProjectExecLike {
// Only the named expressions of the project list contribute output attributes.
override def output: Seq[Attribute] = {
projectList.collect { case ne: NamedExpression => ne.toAttribute }
override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
// Builds the per-partition iterator that compiles the bound expressions once and then
// projects each input batch under retry.
def buildRetryableAstIterator(
input: Iterator[ColumnarBatch]): GpuColumnarBatchIterator = {
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
val boundProjectList = GpuBindReferences.bindGpuReferences(projectList, child.output)
// NOTE(review): the expression that produces `outputTypes` is missing from this listing.
val outputTypes =
new GpuColumnarBatchIterator(true) {
// Output batches of the most recent input batch; it can hold more than one batch because
// the retry below is allowed to split the input in half by rows.
private[this] var maybeSplittedItr: Iterator[ColumnarBatch] = Iterator.empty
// A `var` so that doClose() can drop the reference to the compiled expressions.
private[this] var compiledAstExprs =
withResource(new NvtxWithMetrics("Compile ASTs", NvtxColor.ORANGE, opTime)) { _ =>
boundProjectList.safeMap { expr =>
// Use intmax for the left table column count since there's only one input table here.
override def hasNext: Boolean = maybeSplittedItr.hasNext || {
if (input.hasNext) {
} else {
override def next(): ColumnarBatch = {
if (!maybeSplittedItr.hasNext) {
// NOTE(review): the first argument of the next call is missing from this listing.
val spillable = SpillableColumnarBatch(, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
// AST currently doesn't support non-deterministic expressions so it's not needed
// to check whether compiled expressions are retryable.
maybeSplittedItr = withRetry(spillable, splitSpillableInHalfByRows) { spillable =>
withResource(new NvtxWithMetrics("Project AST", NvtxColor.CYAN, opTime)) { _ =>
withResource(spillable.getColumnarBatch()) { cb =>
val projectedTable = withResource(tableFromBatch(cb)) { table =>
compiledAstExprs.safeMap(_.computeColumn(table))) { projectedColumns =>
new Table(projectedColumns: _*)
withResource(projectedTable) { _ =>
GpuColumnVector.from(projectedTable, outputTypes)
// NOTE(review): the expression that produces `ret` is missing from this listing.
val ret =
numOutputBatches += 1
numOutputRows += ret.numRows()
override def doClose(): Unit = {
compiledAstExprs = Nil
// Produces the cudf Table handed to the compiled AST expressions.
// NOTE(review): the branch taken for batches that have columns is missing from this listing.
private def tableFromBatch(cb: ColumnarBatch): Table = {
if (cb.numCols != 0) {
} else {
// Count-only batch but cudf Table cannot be created with no columns.
// Create the cheapest table we can to evaluate the AST expression.
withResource(Scalar.fromBool(false)) { falseScalar =>
withResource(cudf.ColumnVector.fromScalar(falseScalar, cb.numRows())) { falseColumn =>
new Table(falseColumn)
* Do projections in a tiered fashion, where earlier tiers contain sub-expressions that are
* referenced in later tiers. Each tier adds columns to the original batch corresponding
* to the output of the sub-expressions. It also removes columns that are no longer needed,
* based on inputAttrTiers for the current tier and the next tier.
* Example of how this is processed:
* Original projection expressions:
* (((a + b) + c) * e), (((a + b) + d) * f), (a + e), (c + f)
* Input columns for tier 1: a, b, c, d, e, f (original projection inputs)
* Tier 1: (a + b) as ref1
* Input columns for tier 2: a, c, d, e, f, ref1
* Tier 2: (ref1 + c) as ref2, (ref1 + d) as ref3
* Input columns for tier 3: a, c, e, f, ref2, ref3
* Tier 3: (ref2 * e), (ref3 * f), (a + e), (c + f)
// A projection organised as an ordered sequence of expression tiers; each tier is evaluated on
// the output of the previous one.
// NOTE(review): this listing has lost lines (closing braces and several expressions), so the
// block does not parse as shown; comments describe only what is still visible.
case class GpuTieredProject(exprTiers: Seq[Seq[GpuExpression]]) {
* Is everything retryable. This can help with reliability in the common case.
// NOTE(review): the per-expression predicate is missing from this listing.
lazy val areAllRetryable = !exprTiers.exists { tier =>
tier.exists { expr =>
// Every Retryable collected from all tiers.
lazy val retryables: Seq[Retryable] = exprTiers.flatMap(GpuExpressionsUtils.collectRetryables)
// NOTE(review): the expression that produces `outputTypes` is missing from this listing.
lazy val outputTypes =
// Walks from the given tier back towards the input tier: aliases are unwrapped, and a bound
// reference in a later tier is followed to the expression it points at in the tier before.
// NOTE(review): the results for the input-tier and the fallback cases are missing here.
private[this] def getPassThroughIndex(tierIndex: Int,
expr: Expression,
exprIndex: Int): Option[Int] = expr match {
case GpuAlias(child, _) =>
getPassThroughIndex(tierIndex, child, exprIndex)
case GpuBoundReference(index, _, _) =>
if (tierIndex <= 0) {
// We are at the input tier so the bound attribute is good!!!
} else {
// Not at the input yet
val newTier = tierIndex - 1
val newExpr = exprTiers(newTier)(index)
getPassThroughIndex(newTier, newExpr, index)
case _ =>
* Given an output index check to see if this is just going to be a pass through to a
* specific input column index.
* @param index the output column index to check
* @return the index of the input column that it passes through to or else None
def getPassThroughIndex(index: Int): Option[Int] = {
// The search starts from the last tier, whose expressions are the outputs.
val startTier = exprTiers.length - 1
getPassThroughIndex(startTier, exprTiers.last(index), index)
// Shared implementation of the two public retry entry points below; `closeInputBatch` says
// whether `sb` is closed here or left to the caller.
private [this] def projectWithRetrySingleBatchInternal(sb: SpillableColumnarBatch,
closeInputBatch: Boolean): ColumnarBatch = {
if (areAllRetryable) {
// If all of the expressions are retryable we can just run everything and retry it
// at the top level. If some things are not retryable we need to split them up and
// do the processing in a way that makes it so retries are more likely to succeed.
// NOTE(review): both branch values of `sbToClose` are missing from this listing.
val sbToClose = if (closeInputBatch) {
} else {
withResource(sbToClose) { _ =>
RmmRapidsRetryIterator.withRetryNoSplit {
withResource(sb.getColumnarBatch()) { cb =>
withRestoreOnRetry(retryables) {
} else {
// Tier-by-tier path: each tier is projected with the GpuProjectExec retry helpers and its
// result is made spillable before it is fed to the next tier.
def recurse(boundExprs: Seq[Seq[GpuExpression]],
sb: SpillableColumnarBatch,
recurseCloseInputBatch: Boolean): SpillableColumnarBatch = boundExprs match {
case Nil => sb
case exprSet :: tail =>
val projectSb = withResource(new NvtxRange("project tier", NvtxColor.ORANGE)) { _ =>
val projectResult = if (recurseCloseInputBatch) {
GpuProjectExec.projectAndCloseWithRetrySingleBatch(sb, exprSet)
} else {
GpuProjectExec.projectWithRetrySingleBatch(sb, exprSet)
SpillableColumnarBatch(projectResult, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
// We always want to close the temp batches that we make...
recurse(tail, projectSb, recurseCloseInputBatch = true)
// Process tiers sequentially. The input batch is closed by recurse if requested.
withResource(recurse(exprTiers, sb, closeInputBatch)) { ret =>
* Do a project with retry and close the input batch when done.
def projectAndCloseWithRetrySingleBatch(sb: SpillableColumnarBatch): ColumnarBatch =
projectWithRetrySingleBatchInternal(sb, closeInputBatch = true)
* Do a project with retry, but don't close the input batch.
def projectWithRetrySingleBatch(sb: SpillableColumnarBatch): ColumnarBatch =
projectWithRetrySingleBatchInternal(sb, closeInputBatch = false)
// Projection without retry: tiers run in order through GpuProjectExec.project.
def project(batch: ColumnarBatch): ColumnarBatch = {
// `isFirst` is true only for the first tier, whose input is the caller's batch; it is passed
// as false for every later tier.
def recurse(boundExprs: Seq[Seq[GpuExpression]],
cb: ColumnarBatch,
isFirst: Boolean): ColumnarBatch = {
boundExprs match {
case Nil => cb
case exprSet :: tail =>
val projectCb = try {
withResource(new NvtxRange("project tier", NvtxColor.ORANGE)) { _ =>
GpuProjectExec.project(cb, exprSet)
} finally {
// Close intermediate batches
// NOTE(review): the statement guarded by the next condition is missing from this listing.
if (!isFirst) {
recurse(tail, projectCb, false)
// Process tiers sequentially
recurse(exprTiers, batch, true)
* Run a filter on a batch. The batch will be consumed.
// Filtering helpers: evaluate a bound condition to a boolean mask, skip the filter when the
// mask keeps every row, and otherwise apply the mask to the batch.
// NOTE(review): this listing has lost lines (closing braces, several return values and parts
// of some expressions), so the block does not parse as shown.
object GpuFilter {
// Filters `batch` with `boundCondition` inside an NVTX range tied to `filterTime` and adds the
// result to the output metrics.
def apply(
batch: ColumnarBatch,
boundCondition: Expression,
numOutputRows: GpuMetric,
numOutputBatches: GpuMetric,
filterTime: GpuMetric): ColumnarBatch = {
withResource(new NvtxWithMetrics("filter batch", NvtxColor.YELLOW, filterTime)) { _ =>
val filteredBatch = GpuFilter(batch, boundCondition)
numOutputBatches += 1
numOutputRows += filteredBatch.numRows()
// Entry point for a tiered condition: the retrying path is used only when every expression
// in the condition is retryable.
def filterAndClose(batch: ColumnarBatch,
boundCondition: GpuTieredProject,
numOutputRows: GpuMetric,
numOutputBatches: GpuMetric,
filterTime: GpuMetric): Iterator[ColumnarBatch] = {
if (boundCondition.areAllRetryable) {
val sb = SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
filterAndCloseWithRetry(sb, boundCondition, numOutputRows, numOutputBatches, filterTime)
} else {
filterAndCloseNoRetry(batch, boundCondition, numOutputRows, numOutputBatches,
// Single-attempt path; withResource closes the input batch after filtering.
private def filterAndCloseNoRetry(batch: ColumnarBatch,
boundCondition: GpuTieredProject,
numOutputRows: GpuMetric,
numOutputBatches: GpuMetric,
filterTime: GpuMetric): Iterator[ColumnarBatch] = {
withResource(new NvtxWithMetrics("filter batch", NvtxColor.YELLOW, filterTime)) { _ =>
val filteredBatch = withResource(batch) { batch =>
GpuFilter(batch, boundCondition)
numOutputBatches += 1
numOutputRows += filteredBatch.numRows()
// Retrying path: the spillable input may be split in half by rows on retry, so several
// output batches are possible; metrics are updated for each batch.
private def filterAndCloseWithRetry(input: SpillableColumnarBatch,
boundCondition: GpuTieredProject,
numOutputRows: GpuMetric,
numOutputBatches: GpuMetric,
opTime: GpuMetric): Iterator[ColumnarBatch] = {
val ret = withRetry(input, splitSpillableInHalfByRows) { sb =>
withResource(sb.getColumnarBatch()) { cb =>
withRestoreOnRetry(boundCondition.retryables) {
withResource(new NvtxWithMetrics("filter batch", NvtxColor.YELLOW, opTime)) { _ =>
GpuFilter(cb, boundCondition)
} { cb =>
numOutputRows += cb.numRows()
numOutputBatches += 1
// Checks whether the mask would keep every row; masks containing nulls are handled separately
// from the reduction over the values.
// NOTE(review): both branch results are missing from this listing.
private def allEntriesAreTrue(mask: GpuColumnVector): Boolean = {
if (mask.hasNull) {
} else {
withResource(mask.getBase.all()) { all =>
// Applies an optional mask to `cb`. With no columns only a row count is produced, taken from
// an INT32 sum over the mask; otherwise the table is filtered and converted back using the
// original column types.
// NOTE(review): the signature's last line is truncated, and the row-count branches and the
// no-mask result are missing from this listing.
private def doFilter(checkedFilterMask: Option[cudf.ColumnVector],
cb: ColumnarBatch): ColumnarBatch = { { checkedFilterMask =>
withResource(checkedFilterMask) { checkedFilterMask =>
if (cb.numCols() <= 0) {
val rowCount = withResource(checkedFilterMask.sum(DType.INT32)) { sum =>
if (sum.isValid) {
} else {
new ColumnarBatch(Array(), rowCount)
} else {
val colTypes = GpuColumnVector.extractTypes(cb)
withResource(GpuColumnVector.from(cb)) { tbl =>
withResource(tbl.filter(checkedFilterMask)) { filteredData =>
GpuColumnVector.from(filteredData, colTypes)
}.getOrElse {
// Nothing to filter so it is a NOOP
// Evaluates a single bound expression into a mask.
// NOTE(review): both branch results are missing from this listing.
private def computeCheckedFilterMask(boundCondition: Expression,
cb: ColumnarBatch): Option[cudf.ColumnVector] = {
withResource(boundCondition.columnarEval(cb)) { filterMask =>
// If filter is a noop then return a None for the mask
if (allEntriesAreTrue(filterMask)) {
} else {
// Tiered variant: the mask is column 0 of the projected batch.
// NOTE(review): both branch results are missing from this listing.
private def computeCheckedFilterMask(boundCondition: GpuTieredProject,
cb: ColumnarBatch): Option[cudf.ColumnVector] = {
withResource(boundCondition.project(cb)) { filterBatch =>
val filterMask = filterBatch.column(0).asInstanceOf[GpuColumnVector]
// If filter is a noop then return a None for the mask
if (allEntriesAreTrue(filterMask)) {
} else {
// Computes the mask for a single bound expression and applies it.
private[rapids] def apply(batch: ColumnarBatch,
boundCondition: Expression) : ColumnarBatch = {
val checkedFilterMask = computeCheckedFilterMask(boundCondition, batch)
doFilter(checkedFilterMask, batch)
// Computes the mask for a tiered condition and applies it.
def apply(
batch: ColumnarBatch,
boundCondition: GpuTieredProject): ColumnarBatch = {
val checkedFilterMask = computeCheckedFilterMask(boundCondition, batch)
doFilter(checkedFilterMask, batch)
// Planning-time wrapper around a CPU FilterExec.
// NOTE(review): the body of convertToGpu() and the closing braces are missing from this
// listing.
case class GpuFilterExecMeta(
filter: FilterExec,
override val conf: RapidsConf,
parentMetaOpt: Option[RapidsMeta[_, _, _]],
rule: DataFromReplacementRule
) extends SparkPlanMeta[FilterExec](filter, conf, parentMetaOpt, rule) {
override def convertToGpu(): GpuExec = {
// GPU filter operator. `coalesceAfter` sits in a second parameter list and defaults to true.
// NOTE(review): the bodies of `otherCopyArgs` and `additionalMetrics` and the closing braces
// are missing from this listing.
case class GpuFilterExec(
condition: Expression,
child: SparkPlan)(
override val coalesceAfter: Boolean = true)
extends ShimUnaryExecNode with ShimPredicateHelper with GpuExec {
override def otherCopyArgs: Seq[AnyRef] =
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
// Split out all the IsNotNulls from condition.
// A conjunct is kept only when it is a GpuIsNotNull over a null-intolerant expression whose
// references all come from the child's output.
private val (notNullPreds, _) = splitConjunctivePredicates(condition).partition {
case GpuIsNotNull(a) => isNullIntolerant(a) && a.references.subsetOf(child.outputSet)
case _ => false
// The columns that will be filtered out by `IsNotNull` can be considered not nullable.
// Only the (de-duplicated) expression IDs are kept, because `output` tests membership with
// `notNullAttributes.contains(a.exprId)`. A collection of attributes can never contain an
// ExprId, so with attributes stored here no column would ever be reported as non-nullable.
private val notNullAttributes = notNullPreds.flatMap(_.references).distinct.map(_.exprId)
// Output attributes of the filter. An attribute is treated specially when it is nullable and
// its exprId is found in notNullAttributes; that membership test is only meaningful if
// notNullAttributes holds expression IDs.
// NOTE(review): the first line is truncated and both branch results are missing from this
// listing.
override def output: Seq[Attribute] = { { a =>
if (a.nullable && notNullAttributes.contains(a.exprId)) {
} else {
override def outputOrdering: Seq[SortOrder] = child.outputOrdering
override def outputPartitioning: Partitioning = child.outputPartitioning
// Always throws: only the columnar path is supported.
override def doExecute(): RDD[InternalRow] =
throw new IllegalStateException(s"Row-based execution should not occur for $this")
override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL
override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL
// Binds the condition as a tiered projection against the child's output and filters every
// child batch through GpuFilter.filterAndClose; flatMap is used because one input batch can
// yield several output batches.
// NOTE(review): the remaining bind arguments are missing from this listing.
override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
val rdd = child.executeColumnar()
val boundCondition = GpuBindReferences.bindGpuReferencesTiered(Seq(condition), child.output,
rdd.flatMap { batch =>
GpuFilter.filterAndClose(batch, boundCondition, numOutputRows,
numOutputBatches, opTime)
// Planning-time wrapper around a CPU SampleExec. convertToGpu() picks GpuFastSampleExec when
// fast sampling is enabled in the configuration and GpuSampleExec otherwise, passing the same
// bounds, replacement flag and seed to either one.
// NOTE(review): closing braces are missing from this listing.
class GpuSampleExecMeta(
sample: SampleExec,
conf: RapidsConf,
p: Option[RapidsMeta[_, _, _]],
r: DataFromReplacementRule) extends SparkPlanMeta[SampleExec](sample, conf, p, r)
with Logging {
override def convertToGpu(): GpuExec = {
val gpuChild = childPlans.head.convertIfNeeded()
if (conf.isFastSampleEnabled) {
// Use GPU sample JNI, this is faster, but the output is not the same as CPU produces
GpuFastSampleExec(sample.lowerBound, sample.upperBound, sample.withReplacement,
sample.seed, gpuChild)
} else {
// The output is the same as CPU produces
// First generate the row indexes with the CPU sampler, then gather the rows on the GPU
GpuSampleExec(sample.lowerBound, sample.upperBound, sample.withReplacement,
sample.seed, gpuChild)
// Sampling operator that picks row indexes with the CPU samplers and gathers the selected rows
// afterwards, so that the result matches CPU execution.
// NOTE(review): this listing has lost lines (closing braces, the bodies of `additionalMetrics`
// and `output`, and parts of the RDD construction), so the block does not parse as shown.
case class GpuSampleExec(
lowerBound: Double,
upperBound: Double,
withReplacement: Boolean,
seed: Long, child: SparkPlan) extends ShimUnaryExecNode with GpuExec {
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
override def output: Seq[Attribute] = {
// add one coalesce exec to avoid empty batch and small batch,
// because sample will shrink the batch
override val coalesceAfter: Boolean = true
override def outputOrdering: Seq[SortOrder] = child.outputOrdering
override def outputPartitioning: Partitioning = child.outputPartitioning
// Always throws: only the columnar path is supported.
override def doExecute(): RDD[InternalRow] =
throw new IllegalStateException(s"Row-based execution should not occur for $this")
override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL
override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL
override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
val rdd = child.executeColumnar()
// CPU consistent, first generates sample row indexes by CPU, then gathers by GPU
if (withReplacement) {
// With replacement: a Poisson sampler with rate (upperBound - lowerBound).
new GpuPartitionwiseSampledRDD(
new GpuPoissonSampler(upperBound - lowerBound, useGapSamplingIfPossible = false,
numOutputRows, numOutputBatches, opTime),
preservesPartitioning = true,
} else {
// Without replacement: a Bernoulli cell sampler, seeded per partition with seed + index.
// NOTE(review): the call that receives this function is missing from this listing.
(index, iterator) => {
// use CPU sampler generate row indexes
val sampler = new BernoulliCellSampler(lowerBound, upperBound)
sampler.setSeed(seed + index)[ColumnarBatch] { columnarBatch =>
// collect sampled row idx
// samples idx in batch one by one, so it's same as CPU execution
withResource(new NvtxWithMetrics("Sample Exec", NvtxColor.YELLOW, opTime)) { _ =>
withResource(columnarBatch) { cb =>
// generate sampled row indexes by CPU
// One sampler draw per row; a row is kept when the draw is positive.
val sampledRows = new ArrayBuffer[Int]
var rowIndex = 0
while (rowIndex < cb.numRows()) {
if (sampler.sample() > 0) {
sampledRows += rowIndex
rowIndex += 1
numOutputBatches += 1
numOutputRows += sampledRows.length
// gather by row indexes
GatherUtils.gather(cb, sampledRows)
, preservesPartitioning = true
// Sampling operator that samples with the cudf table `sample` call; per the comments below its
// output is not consistent with the CPU and does not preserve ordering.
// NOTE(review): this listing has lost lines (closing braces, the bodies of `additionalMetrics`
// and `output`, some branch results and parts of the RDD construction), so the block does not
// parse as shown.
case class GpuFastSampleExec(
lowerBound: Double,
upperBound: Double,
withReplacement: Boolean,
seed: Long,
child: SparkPlan) extends ShimUnaryExecNode with GpuExec {
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
override def output: Seq[Attribute] = {
// add one coalesce exec to avoid empty batch and small batch,
// because sample will shrink the batch
override val coalesceAfter: Boolean = true
// Note GPU sample does not preserve the ordering
override def outputOrdering: Seq[SortOrder] = Nil
override def outputPartitioning: Partitioning = child.outputPartitioning
// Always throws: only the columnar path is supported.
override def doExecute(): RDD[InternalRow] =
throw new IllegalStateException(s"Row-based execution should not occur for $this")
override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL
override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL
override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
val rdd = child.executeColumnar()
// CPU inconsistent, uses GPU sample JNI
// NOTE(review): the next line is truncated; the call that receives this function is missing.
(index, iterator) => {[ColumnarBatch] { columnarBatch =>
withResource(new NvtxWithMetrics("Fast Sample Exec", NvtxColor.YELLOW, opTime)) { _ =>
withResource(columnarBatch) { cb =>
numOutputBatches += 1
// Target sample size: the batch row count scaled by (upperBound - lowerBound), truncated
// to a Long.
val numSampleRows = (cb.numRows() * (upperBound - lowerBound)).toLong
val colTypes = GpuColumnVector.extractTypes(cb)
if (numSampleRows == 0L) {
} else if (cb.numCols() == 0) {
// for count agg, num of cols is 0
val c = GpuColumnVector.emptyBatchFromTypes(colTypes)
} else {
withResource(GpuColumnVector.from(cb)) { table =>
// GPU sample
// The seed is offset by the partition index.
withResource(table.sample(numSampleRows, withReplacement, seed + index)) {
sampled =>
val cb = GpuColumnVector.from(sampled, colTypes)
numOutputRows += cb.numRows()
, preservesPartitioning = true
// Iterator that produces the values of one range partition as single-column LongType batches,
// shrinking the batch size through the retry framework when needed.
// NOTE(review): this listing has lost lines (closing braces, the bodies of getSafeMargin's
// branches and parts of some expressions), so the block does not parse as shown.
private[rapids] class GpuRangeIterator(
partitionStart: BigInt,
partitionEnd: BigInt,
step: Long,
maxRowCountPerBatch: Long,
taskContext: TaskContext,
opTime: GpuMetric) extends Iterator[ColumnarBatch] with Logging {
// This iterator is designed for GpuRangeExec, so it has the requirement for the inputs.
assert((partitionEnd - partitionStart) % step == 0)
// Converts a BigInt bound to a Long, with separate handling for values that fit in a Long,
// positive values that do not, and the remaining values.
// NOTE(review): the three branch results are missing; presumably out-of-range values are
// clamped to the Long limits - confirm against the full source.
private def getSafeMargin(bi: BigInt): Long = {
if (bi.isValidLong) {
} else if (bi > 0) {
} else {
private val safePartitionStart = getSafeMargin(partitionStart) // inclusive
private val safePartitionEnd = getSafeMargin(partitionEnd) // exclusive, unless start == this
private[this] var currentPosition: Long = safePartitionStart
// Set once an overflow of currentPosition has been detected in next().
private[this] var done: Boolean = false
// The direction of the comparison follows the sign of `step`.
override def hasNext: Boolean = {
if (!done) {
if (step > 0) {
currentPosition < safePartitionEnd
} else {
currentPosition > safePartitionEnd
} else false
override def next(): ColumnarBatch = {
if (!hasNext) {
throw new NoSuchElementException()
withResource(new NvtxWithMetrics("GpuRange", NvtxColor.DARK_GREEN, opTime)) { _ =>
val start = currentPosition
val remainingRows = (safePartitionEnd - start) / step
// Start is inclusive so we need to produce at least one row
val rowsExpected = Math.max(1, Math.min(remainingRows, maxRowCountPerBatch))
// On a split-and-retry the requested row count is halved by reduceRowsNumberByHalf.
val iter = withRetry(AutoCloseableLong(rowsExpected), reduceRowsNumberByHalf) { rows =>
withResource(Scalar.fromLong(start)) { startScalar =>
withResource(Scalar.fromLong(step)) { stepScalar =>
cudf.ColumnVector.sequence(startScalar, stepScalar, rows.value.toInt)) { vec =>
withResource(new Table(vec)) { tab =>
GpuColumnVector.from(tab, Array[DataType](LongType))
// NOTE(review): the argument of the next call is truncated in this listing.
closeOnExcept( { batch =>
// This "iter" returned from the "withRetry" block above has only one batch,
// because the split function "reduceRowsNumberByHalf" returns a Seq with a single
// element inside.
// By doing this, we can pull out this single batch directly without maintaining
// this extra `iter` for the next loop.
// Advance by the number of rows actually produced, which can be fewer than rowsExpected.
val endInclusive = start + ((batch.numRows() - 1) * step)
currentPosition = endInclusive + step
if (currentPosition < endInclusive ^ step < 0) {
// we have Long.MaxValue + Long.MaxValue < Long.MaxValue
// and Long.MinValue + Long.MinValue > Long.MinValue, so iff the step causes a
// step back, we are pretty sure that we have an overflow.
done = true
if (batch.numRows() < rowsExpected) {
logDebug(s"Retried with ${batch.numRows()} rows when expected $rowsExpected rows")
/**
 * Reduces the input number of rows by half, and returns a Seq with only one element,
 * which is the halved value.
 * This is used with the split_retry block to generate a single batch with a smaller
 * size when getting relevant OOMs.
 */
// Split function for the retry block: takes the current row count, closes its wrapper and
// answers with a one-element Seq holding half of that count. A count below 10 is treated as
// too small to split and raises GpuSplitAndRetryOOM instead.
private def reduceRowsNumberByHalf: AutoCloseableLong => Seq[AutoCloseableLong] = { rowsNumber =>
  withResource(rowsNumber) { _ =>
    val tooSmallToSplit = rowsNumber.value < 10
    if (tooSmallToSplit) {
      throw new GpuSplitAndRetryOOM(s"GPU OutOfMemory: the number of rows generated is" +
        s" too small to be split ${rowsNumber.value}!")
    }
    val halved = AutoCloseableLong(rowsNumber.value / 2)
    Seq(halved)
  }
}
/**
 * Wraps a plain Long in an AutoCloseable so that the value can be passed to the retry
 * helpers, which expect a closeable input.
 *
 * @param value the wrapped number
 */
case class AutoCloseableLong(value: Long) extends AutoCloseable {
  // No resource is held by this wrapper, so closing does nothing.
  override def close(): Unit = ()
}
/** Physical plan for range (generating a range of 64-bit numbers). */
// Leaf GPU plan node that generates the Long values from `start` towards `end` in increments
// of `step`, spread over `numSlices` partitions and emitted as columnar batches.
case class GpuRangeExec(
start: Long,
end: Long,
step: Long,
numSlices: Int,
output: Seq[Attribute],
targetSizeBytes: Long)
extends ShimLeafExecNode with GpuExec {
// Number of values in the range. BigInt is used so that `end - start` cannot overflow.
// The plain quotient is taken when the span is an exact multiple of the step, or when the
// direction from start to end disagrees with the sign of the step; otherwise one more
// element is added for the trailing partial step.
val numElements: BigInt = {
val safeStart = BigInt(start)
val safeEnd = BigInt(end)
if ((safeEnd - safeStart) % step == 0 || (safeEnd > safeStart) != (step > 0)) {
(safeEnd - safeStart) / step
} else {
// the remainder has the same sign with range, could add 1 more
(safeEnd - safeStart) / step + 1
// The range is empty when both bounds coincide, or when exactly one of "start < end" and
// "step > 0" holds, i.e. the step does not lead from start towards end.
val isEmptyRange: Boolean = start == end || (start < end ^ 0 < step)
override protected val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL
override protected val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL
// NOTE(review): the entries of this map are not present in this text; OP_TIME is read in
// internalDoExecuteColumnar, so it is presumably registered here - confirm.
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
// The sort direction depends on the sign of the step.
// NOTE(review): the two direction values and the collection being mapped are not present in
// this text - confirm against the upstream file.
override def outputOrdering: Seq[SortOrder] = {
val order = if (step > 0) {
} else {
} => SortOrder(a, order))
// With at least one element and more than one slice the output is range partitioned on the
// output ordering. NOTE(review): the results for the single-slice case and for the case
// without elements are not present in this text - confirm.
override def outputPartitioning: Partitioning = {
if (numElements > 0) {
if (numSlices == 1) {
} else {
RangePartitioning(outputOrdering, numSlices)
} else {
override def outputBatching: CoalesceGoal = TargetSize(targetSizeBytes)
// Builds the RDD of batches: one GpuRangeIterator per slice, with row/batch metrics updated
// for every batch that passes through.
protected override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
// Rows per batch derived from the target size (the divisor 8 is presumably the byte width of
// one Long value - confirm), capped at Int.MaxValue because the iterator converts the row
// count to an Int.
val maxRowCountPerBatch = Math.min(targetSizeBytes/8, Int.MaxValue)
// NOTE(review): the result for an empty range and the receiver of `.parallelize` are not
// present in this text - confirm.
if (isEmptyRange) {
} else {
.parallelize(0 until numSlices, numSlices)
.mapPartitionsWithIndex { (i, _) =>
// Slice i covers the element indices from i * numElements / numSlices (inclusive) to
// (i + 1) * numElements / numSlices (exclusive); multiplying by step and adding start turns
// the indices into range values. numElements is a BigInt, so both bounds are BigInt too.
val partitionStart = (i * numElements) / numSlices * step + start
val partitionEnd = (((i + 1) * numElements) / numSlices) * step + start
val taskContext = TaskContext.get()
val inputMetrics = taskContext.taskMetrics().inputMetrics
val rangeIter = new GpuRangeIterator(partitionStart, partitionEnd, step,
maxRowCountPerBatch, taskContext, opTime).map { batch =>
// Account for every generated batch in the output metrics and the task input metrics.
numOutputRows += batch.numRows()
TrampolineUtil.incInputRecordsRows(inputMetrics, batch.numRows())
numOutputBatches += 1
new InterruptibleIterator(taskContext, rangeIter)
override def simpleString(maxFields: Int): String = {
s"GpuRange ($start, $end, step=$step, splits=$numSlices)"
// Row-based execution is rejected for this node; it always throws.
override protected def doExecute(): RDD[InternalRow] =
throw new IllegalStateException(s"Row-based execution should not occur for $this")
// GPU plan node that combines the columnar output of all of its children.
case class GpuUnionExec(children: Seq[SparkPlan]) extends ShimSparkPlan with GpuExec {
// updating nullability to make all the children consistent
// NOTE(review): the expression that produces each `attrs` group is not present in this text;
// presumably the child outputs are grouped by column position - confirm.
override def output: Seq[Attribute] = { { attrs =>
val firstAttr = attrs.head
// The resulting attribute is nullable as soon as any attribute of the group is nullable.
val nullable = attrs.exists(_.nullable)
// NOTE(review): the computation of newDt and the result of the equal-type branch are not
// present in this text - confirm.
val newDt =
if (firstAttr.dataType == newDt) {
} else {
// A fresh reference that reuses the metadata, exprId and qualifier of the first attribute.
AttributeReference(, newDt, nullable, firstAttr.metadata)(
firstAttr.exprId, firstAttr.qualifier)
// The smallest of our children
override def outputBatching: CoalesceGoal =
// Row-based execution is rejected for this node; it always throws.
override def doExecute(): RDD[InternalRow] =
throw new IllegalStateException(s"Row-based execution should not occur for $this")
// Unions the RDDs and counts the batches and rows that pass through.
// NOTE(review): the argument of sparkContext.union is not present in this text - confirm.
override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
sparkContext.union( { batch =>
numOutputBatches += 1
numOutputRows += batch.numRows
/**
 * GPU plan node that coalesces the columnar output of its child by calling
 * `coalesce(numPartitions, shuffle = false)` on the child's RDD.
 *
 * @param numPartitions the partition count passed to `coalesce`
 * @param child the plan whose columnar output is coalesced
 */
case class GpuCoalesceExec(numPartitions: Int, child: SparkPlan)
    extends ShimUnaryExecNode with GpuExec {

  // No metrics are recorded by this operator
  override lazy val allMetrics: Map[String, GpuMetric] = Map.empty

  override def output: Seq[Attribute] = child.output

  // A single requested partition is reported as SinglePartition, anything else as unknown.
  override def outputPartitioning: Partitioning = numPartitions match {
    case 1 => SinglePartition
    case n => UnknownPartitioning(n)
  }

  // Batching is taken over unchanged from the child
  override def outputBatching: CoalesceGoal = GpuExec.outputBatching(child)

  // Row-based execution is not available for this node; it always throws.
  protected override def doExecute(): RDD[InternalRow] = {
    val message = s"${getClass.getCanonicalName} does not support row-based execution"
    throw new UnsupportedOperationException(message)
  }

  override protected def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
    val childRdd = child.executeColumnar()
    // The child's partition count is only inspected when a single partition was requested.
    val canCoalesceChild = numPartitions != 1 || childRdd.getNumPartitions >= 1
    if (canCoalesceChild) {
      childRdd.coalesce(numPartitions, shuffle = false)
    } else {
      // A `SinglePartition` is claimed, so an RDD with 0 partitions must not be returned;
      // hand out an empty RDD that still has the requested number of partitions.
      new GpuCoalesceExec.EmptyRDDWithPartitions(sparkContext, numPartitions)
    }
  }

  override val coalesceAfter: Boolean = true
}
// Companion that holds the placeholder RDD and partition type which GpuCoalesceExec uses when
// a single partition is requested but the child RDD has no partitions.
object GpuCoalesceExec {
/** A simple RDD with no data, but with the given number of partitions. */
class EmptyRDDWithPartitions(
@transient private val sc: SparkContext,
numPartitions: Int) extends RDD[ColumnarBatch](sc, Nil) {
// Creates one EmptyPartition for each index from 0 until numPartitions.
override def getPartitions: Array[Partition] =
Array.tabulate(numPartitions)(i => EmptyPartition(i))
// NOTE(review): the body of compute is not present in this text; given the class
// description it presumably returns an empty iterator - confirm.
override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
// Partition that carries nothing but its index.
case class EmptyPartition(index: Int) extends Partition