/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids
import java.util.{Comparator, LinkedList, PriorityQueue}
import scala.collection.mutable.ArrayBuffer
import ai.rapids.cudf.{ColumnVector, ContiguousTable, NvtxColor, NvtxRange, Table}
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import com.nvidia.spark.rapids.GpuMetric._
import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingSeq
import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRetry, withRetryNoSplit}
import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
import com.nvidia.spark.rapids.shims.ShimUnaryExecNode
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering
import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.{SortExec, SparkPlan}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.rapids.execution.TrampolineUtil
import org.apache.spark.sql.vectorized.ColumnarBatch
sealed trait SortExecType extends Serializable
object OutOfCoreSort extends SortExecType
object FullSortSingleBatch extends SortExecType
object SortEachBatch extends SortExecType
class GpuSortMeta(
sort: SortExec,
conf: RapidsConf,
parent: Option[RapidsMeta[_, _, _]],
rule: DataFromReplacementRule)
extends SparkPlanMeta[SortExec](sort, conf, parent, rule) {
// Uses the output attributes of the child plan because SortExec will not change the
// attributes, and we need to propagate possible type conversions on the output attributes
// of GpuSortAggregateExec.
override protected val useOutputAttributesOfChild: Boolean = true
// For a transparent plan like ShuffleExchange, the availability of the runtime data
// transition depends on the next non-transparent plan, so we need to trace back through
// the child plans.
override val availableRuntimeDataTransition: Boolean =
childPlans.head.availableRuntimeDataTransition
override def convertToGpu(): GpuExec = {
GpuSortExec(childExprs.map(_.convertToGpu()).asInstanceOf[Seq[SortOrder]],
sort.global,
childPlans.head.convertIfNeeded(),
if (conf.stableSort) FullSortSingleBatch else OutOfCoreSort
)(sort.sortOrder)
}
}
object GpuSortExec {
def targetSize(sqlConf: SQLConf): Long = {
val batchSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(sqlConf)
targetSize(batchSize)
}
def targetSize(batchSize: Long): Long = {
// To avoid divide-by-zero errors, along with underflow and overflow issues in tests
// that want the targetSize to be 0, we clamp it to something more reasonable.
math.max(16 * 1024, batchSize)
}
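// A quick illustration of the clamping (hypothetical values, not from the tests):
//   targetSize(0L)       == 16 * 1024  // tiny test sizes are clamped up to the 16 KiB floor
//   targetSize(1L << 30) == 1L << 30   // realistic batch sizes pass through unchanged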
}
case class GpuSortExec(
gpuSortOrder: Seq[SortOrder],
global: Boolean,
child: SparkPlan,
sortType: SortExecType)(cpuSortOrder: Seq[SortOrder])
extends ShimUnaryExecNode with GpuExec {
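// cpuSortOrder lives in a second (curried) parameter list, so it is not part of the case
// class equality or the automatic copy; otherCopyArgs passes it through when Spark copies
// the node via TreeNode.makeCopy.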
override def otherCopyArgs: Seq[AnyRef] = cpuSortOrder :: Nil
override def childrenCoalesceGoal: Seq[CoalesceGoal] = sortType match {
case FullSortSingleBatch => Seq(RequireSingleBatch)
case OutOfCoreSort | SortEachBatch => Seq(null)
case t => throw new IllegalArgumentException(s"Unexpected Sort Type $t")
}
override def output: Seq[Attribute] = child.output
override def outputOrdering: Seq[SortOrder] = cpuSortOrder
// The sort performed is local within a given partition, so it will retain the
// child operator's partitioning.
override def outputPartitioning: Partitioning = child.outputPartitioning
override def requiredChildDistribution: Seq[Distribution] =
if (global) OrderedDistribution(cpuSortOrder) :: Nil else UnspecifiedDistribution :: Nil
override def outputBatching: CoalesceGoal = sortType match {
// We produce a single batch if we know that our input will be a single batch
case FullSortSingleBatch => RequireSingleBatch
case _ => null
}
override def doExecute(): RDD[InternalRow] =
throw new IllegalStateException(s"Row-based execution should not occur for $this")
override lazy val additionalMetrics: Map[String, GpuMetric] =
Map(
OP_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_OP_TIME),
SORT_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_SORT_TIME))
private lazy val targetSize = GpuSortExec.targetSize(conf)
override def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
val sorter = new GpuSorter(gpuSortOrder, output)
val sortTime = gpuLongMetric(SORT_TIME)
val opTime = gpuLongMetric(OP_TIME)
val outputBatch = gpuLongMetric(NUM_OUTPUT_BATCHES)
val outputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val outOfCore = sortType == OutOfCoreSort
val singleBatch = sortType == FullSortSingleBatch
child.executeColumnar().mapPartitions { cbIter =>
if (outOfCore) {
val iter = GpuOutOfCoreSortIterator(cbIter, sorter,
targetSize, opTime, sortTime, outputBatch, outputRows)
onTaskCompletion(iter.close())
iter
} else {
GpuSortEachBatchIterator(cbIter, sorter, singleBatch,
opTime, sortTime, outputBatch, outputRows)
}
}
}
}
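/**
 * An iterator that fully sorts each incoming batch on the GPU as it arrives. When
 * `singleBatch` is true the result is tagged as the final batch (the upstream coalesce
 * goal is expected to have delivered a single input batch).
 */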
case class GpuSortEachBatchIterator(
iter: Iterator[ColumnarBatch],
sorter: GpuSorter,
singleBatch: Boolean,
opTime: GpuMetric = NoopMetric,
sortTime: GpuMetric = NoopMetric,
outputBatches: GpuMetric = NoopMetric,
outputRows: GpuMetric = NoopMetric) extends Iterator[ColumnarBatch] {
override def hasNext: Boolean = iter.hasNext
override def next(): ColumnarBatch = {
if (!hasNext) {
throw new NoSuchElementException()
}
val scb = closeOnExcept(iter.next()) { cb =>
SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
}
val ret = sorter.fullySortBatchAndCloseWithRetry(scb, sortTime, opTime)
opTime.ns {
outputBatches += 1
outputRows += ret.numRows()
if (singleBatch) {
GpuColumnVector.tagAsFinalBatch(ret)
} else {
ret
}
}
}
}
/**
* Create an iterator that will sort each batch as it comes in. It keeps any projected
* columns in place after doing the sort, on the assumption that you may want to combine
* them in some way afterwards.
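*
* A minimal usage sketch (hypothetical names, mirroring how it is used below):
* {{{
*   val sorter = new GpuSorter(sortOrder, childOutput)
*   val sortedSpillable: Iterator[SpillableColumnarBatch] =
*     GpuSpillableProjectedSortEachBatchIterator(batchIter, sorter)
*   // Each resulting batch still carries the projected sort columns.
* }}}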
*/
object GpuSpillableProjectedSortEachBatchIterator {
def apply(
iter: Iterator[ColumnarBatch],
sorter: GpuSorter,
opTime: GpuMetric = NoopMetric,
sortTime: GpuMetric = NoopMetric): Iterator[SpillableColumnarBatch] = {
val spillableIter = iter.flatMap { cb =>
// Filter out empty batches and make them spillable
if (cb.numRows() > 0) {
Some(SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY))
} else {
cb.close()
None
}
}
val sortedBatchIter = spillableIter.flatMap { scb =>
withRetry(scb, splitSpillableInHalfByRows) { attemptScb =>
opTime.ns {
val sortedTbl = withResource(attemptScb.getColumnarBatch()) { attemptCb =>
sorter.appendProjectedAndSort(attemptCb, sortTime)
}
withResource(sortedTbl) { _ =>
closeOnExcept(GpuColumnVector.from(sortedTbl, sorter.projectedBatchTypes)) { cb =>
SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
}
}
}
}
}
sortedBatchIter
}
}
/**
* Holds data for the out of core sort. It includes the batch of data and the first row in that
* batch so we can sort the batches.
*/
case class OutOfCoreBatch(buffer: SpillableColumnarBatch,
firstRow: UnsafeRow) extends AutoCloseable {
override def close(): Unit = buffer.close()
}
/**
* Data that the out of core sort algorithm has not finished sorting. This acts as a priority
* queue with each batch sorted by the first row in that batch.
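*
* An illustrative sketch of the ordering (batchWithFirstRow is a hypothetical stand-in
* for building an OutOfCoreBatch with the given first-row key):
* {{{
*   pending.add(batchWithFirstRow(30))
*   pending.add(batchWithFirstRow(10))
*   pending.add(batchWithFirstRow(20))
*   pending.poll() // returns the batch whose first row is 10
* }}}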
*/
class Pending(cpuOrd: LazilyGeneratedOrdering) extends AutoCloseable {
private val pending = new PriorityQueue[OutOfCoreBatch](new Comparator[OutOfCoreBatch]() {
override def compare(a: OutOfCoreBatch, b: OutOfCoreBatch): Int =
cpuOrd.compare(a.firstRow, b.firstRow)
})
private var pendingSize = 0L
def add(batch: OutOfCoreBatch): Unit = {
pendingSize += batch.buffer.sizeInBytes
pending.add(batch)
}
def storedSize: Long = pendingSize
def size(): Int = pending.size()
def poll(): OutOfCoreBatch = {
val ret = pending.poll()
if (ret != null) {
pendingSize -= ret.buffer.sizeInBytes
}
ret
}
def peek(): OutOfCoreBatch = pending.peek()
def isEmpty: Boolean = pending.isEmpty
override def close(): Unit = pending.forEach(_.close())
}
/**
* Sorts incoming batches of data spilling if needed.
*
* The algorithm for this is a modified version of an external merge sort with multiple passes for
* large data.
* https://en.wikipedia.org/wiki/External_sorting#External_merge_sort
*
* The main difference is that we cannot stream the data when doing a merge sort. So, we instead
* divide the data into batches that are small enough that we can do a merge sort on N batches
* and still fit the output within the target batch size. When merging batches instead of
* individual rows, we cannot assume that all of the resulting data is globally sorted.
* Hopefully most of it is, but we have to use the first row from the next pending batch to
* determine the cutoff point between globally sorted data and data that still needs to be merged
* with other batches. The globally sorted portion is put into a sorted queue while the rest of
* the merged data is split and put back into a pending queue. The process repeats until we have
* enough data to output.
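*
* In outline, each call to `next()` behaves roughly like the following simplified sketch
* of the implementation below:
* {{{
*   // 1. First pass: split each incoming pre-sorted batch into small spillable chunks
*   //    and queue them in `pending`, ordered by each chunk's first row.
*   // 2. Merge pass: pop the "smallest" chunks from `pending` up to ~targetSize bytes,
*   //    merge sort them, and use the first row of the next pending chunk as the cutoff.
*   // 3. Rows below the cutoff are globally sorted and move to the `sorted` queue;
*   //    the rest are re-split and pushed back onto `pending`.
*   // 4. Once `sorted` holds ~targetSize bytes, concatenate it into one output batch.
* }}}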
*/
case class GpuOutOfCoreSortIterator(
iter: Iterator[ColumnarBatch],
sorter: GpuSorter,
targetSize: Long,
opTime: GpuMetric,
sortTime: GpuMetric,
outputBatches: GpuMetric,
outputRows: GpuMetric) extends Iterator[ColumnarBatch]
with AutoCloseable {
/**
* This iterator has already sorted the data, and the batches still have the projected
* columns in them, which need to be removed before the data is returned.
*/
val alreadySortedIter = GpuSpillableProjectedSortEachBatchIterator(iter, sorter, opTime, sortTime)
private val cpuOrd = new LazilyGeneratedOrdering(sorter.cpuOrdering)
// A priority queue of data that is not merged yet.
private val pending = new Pending(cpuOrd)
// Data that has been determined to be globally sorted and is waiting to be output.
private val sorted = new LinkedList[SpillableColumnarBatch]()
// How much data, in bytes, is stored in `sorted`
private var sortedSize = 0L
override def hasNext: Boolean = !sorted.isEmpty || !pending.isEmpty || alreadySortedIter.hasNext
// Use types for the UnsafeProjection, otherwise we would need CPU BoundAttributeReferences
// for converting between columnar data and rows (to get the first row in each batch).
private lazy val unsafeProjection = UnsafeProjection.create(sorter.projectedBatchTypes)
// Used for converting between rows and columns when we have to put a cutoff on the GPU
// to know how much of the data after a merge sort is fully sorted.
private lazy val converters = new GpuRowToColumnConverter(
TrampolineUtil.fromAttributes(sorter.projectedBatchSchema))
/**
* Convert the boundaries (first rows for each batch) into unsafe rows for use later on.
*/
private def convertBoundaries(tab: Table): Array[UnsafeRow] = {
import scala.collection.JavaConverters._
val cb = withResource(new NvtxRange("COPY BOUNDARIES", NvtxColor.PURPLE)) { _ =>
new ColumnarBatch(
GpuColumnVector.extractColumns(tab, sorter.projectedBatchTypes).map(_.copyToHost()),
tab.getRowCount.toInt)
}
withResource(cb) { cb =>
withResource(new NvtxRange("TO UNSAFE ROW", NvtxColor.RED)) { _ =>
cb.rowIterator().asScala.map(unsafeProjection).map(_.copy().asInstanceOf[UnsafeRow]).toArray
}
}
}
/**
* A rather complex function. It will take a sorted table (either the output of a regular sort or
* a merge sort), split it up, and place the split portions into the proper queues. If
* sortedOffset >= 0 then everything below that offset is considered to be fully sorted and is
* returned as an Option of "SpillableColumnarBatch". Everything else is split into smaller
* batches as determined by this function and returned as a Seq of "OutOfCoreBatch".
* Call `saveSplitResult` to place them into the cache accordingly.
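*
* For example (illustrative numbers only): with 100 rows, a sortedOffset of 40, and a
* target row count of 30, rows [0, 40) become the fully sorted batch while rows
* [40, 70) and [70, 100) become two pending "OutOfCoreBatch" chunks.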
*/
private final def splitAfterSort(sortedSpill: SpillableColumnarBatch,
sortedOffset: Int = -1): (Option[SpillableColumnarBatch], Seq[OutOfCoreBatch]) = {
withResource(sortedSpill.getColumnarBatch()) { cb =>
withResource(GpuColumnVector.from(cb)) { table =>
splitTableAfterSort(table, sortedOffset)
}
}
}
private final def splitTableAfterSort(
sortedTbl: Table,
sortedOffset: Int): (Option[SpillableColumnarBatch], Seq[OutOfCoreBatch]) = {
// We need to figure out how to split up the data into reasonable batches. We could try to do
// something really complicated and figure out how much data we get per batch, but in practice
// we really only expect to see one or two batches' worth of data come in, so let's optimize
// for that case and set the targetBatchSize to always be 1/8th of the targetSize.
val targetBatchSize = targetSize / 8
val rows = sortedTbl.getRowCount.toInt
val memSize = GpuColumnVector.getTotalDeviceMemoryUsed(sortedTbl)
val averageRowSize = memSize.toDouble/rows
// Protect ourselves from large rows when there are small targetSizes
val targetRowCount = Math.max((targetBatchSize/averageRowSize).toInt, 1024)
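// For example (hypothetical numbers): with a 1 GiB targetSize the targetBatchSize is
// 128 MiB, and at an average row size of 100 bytes the targetRowCount comes out to
// roughly 1.3 million rows, well above the 1024-row floor.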
var sortedCb: Option[SpillableColumnarBatch] = None
val pendingObs: ArrayBuffer[OutOfCoreBatch] = ArrayBuffer.empty
if (sortedOffset == rows) {
// The entire thing is sorted
val batch = GpuColumnVector.from(sortedTbl, sorter.projectedBatchTypes)
val sp = SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
sortedCb = Some(sp)
} else {
val hasFullySortedData = sortedOffset > 0
val splitIndexes = if (hasFullySortedData) {
sortedOffset until rows by targetRowCount
} else {
targetRowCount until rows by targetRowCount
}
// Get back the first row of each split so we can order the batches in the pending queue
val lowerGatherIndexes = if (hasFullySortedData) {
// The first batch is sorted so don't gather a row for it
splitIndexes
} else {
Seq(0) ++ splitIndexes
}
val lowerBoundaries =
withResource(new NvtxRange("lower boundaries", NvtxColor.ORANGE)) { _ =>
withResource(ColumnVector.fromInts(lowerGatherIndexes: _*)) { gatherMap =>
withResource(sortedTbl.gather(gatherMap)) { boundariesTab =>
convertBoundaries(boundariesTab)
}
}
}
withResource(sortedTbl.contiguousSplit(splitIndexes: _*)) { splits =>
var currentSplit = 0
val stillPending = if (hasFullySortedData) {
val ct = splits(currentSplit)
splits(currentSplit) = null
val sp = SpillableColumnarBatch(ct,
sorter.projectedBatchTypes,
SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
currentSplit += 1
sortedCb = Some(sp)
splits.slice(1, splits.length)
} else {
splits
}
closeOnExcept(sortedCb) { _ =>
assert(lowerBoundaries.length == stillPending.length)
closeOnExcept(pendingObs) { _ =>
stillPending.zip(lowerBoundaries).foreach {
case (ct: ContiguousTable, lower: UnsafeRow) =>
splits(currentSplit) = null
currentSplit += 1
if (ct.getRowCount > 0) {
val sp = SpillableColumnarBatch(ct, sorter.projectedBatchTypes,
SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
pendingObs += OutOfCoreBatch(sp, lower)
} else {
ct.close()
}
}
}
}
}
}
(sortedCb, pendingObs.toSeq)
}
/** Save the splitting result returned from `splitAfterSort` into the cache */
private final def saveSplitResult(
result: (Option[SpillableColumnarBatch], Seq[OutOfCoreBatch])): Unit = {
val (sortedSp, pendingObs) = result
closeOnExcept(pendingObs) { _ =>
sortedSp.foreach { sp =>
sortedSize += sp.sizeInBytes
sorted.add(sp)
}
}
pendingObs.foreach(pending.add)
}
/**
* Take a single sorted batch from the `alreadySortedIter`, split it up and store them for
* merging.
*/
private final def splitOneSortedBatch(scb: SpillableColumnarBatch): Unit = {
withResource(new NvtxWithMetrics("split input batch", NvtxColor.CYAN, opTime)) { _ =>
val ret = withRetryNoSplit(scb) { attempt =>
onFirstPassSplit()
splitAfterSort(attempt)
}
saveSplitResult(ret)
}
}
/**
* First pass through the data. Conceptually we are going to read in all of the batches that
* are already sorted and split them up into smaller chunks for later merge sorting. But we are
* only going to do that if we have more than one batch to sort.
*/
private final def firstPassReadBatches(scb: SpillableColumnarBatch): Unit = {
splitOneSortedBatch(scb)
while (alreadySortedIter.hasNext) {
splitOneSortedBatch(alreadySortedIter.next())
}
}
/**
* Merge sort enough data that we can output a batch and put it in the sorted queue.
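*
* For example (illustrative sizes): with a 1 GiB targetSize and 300 MiB pending chunks,
* each pass merge sorts about three chunks, keeps the globally sorted prefix, and
* re-queues the remainder until enough fully sorted data is queued.
*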
* @return optionally a final batch, if we can determine that it can be returned directly
* without putting it in the sorted queue.
*/
private final def mergeSortEnoughToOutput(): Option[ColumnarBatch] = {
// Now get enough sorted data to return
while (!pending.isEmpty && sortedSize < targetSize) {
// Keep going until we have enough data to return
var bytesLeftToFetch = targetSize
val pendingSort = new RapidsStack[SpillableColumnarBatch]()
closeOnExcept(pendingSort.toSeq) { _ =>
while (!pending.isEmpty &&
(bytesLeftToFetch - pending.peek().buffer.sizeInBytes >= 0 || pendingSort.isEmpty)) {
val buffer = pending.poll().buffer
pendingSort.push(buffer)
bytesLeftToFetch -= buffer.sizeInBytes
}
}
val mergedSpillBatch = sorter.mergeSortAndCloseWithRetry(pendingSort, sortTime)
val (retBatch, sortedOffset) = closeOnExcept(mergedSpillBatch) { _ =>
// First we want to figure out what is fully sorted and what is not
val sortSplitOffset = if (pending.isEmpty) {
// No need to split it
mergedSpillBatch.numRows()
} else {
// The data is only fully sorted if there is nothing pending that is smaller than it,
// so get the next "smallest" row that is pending.
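// For example (illustrative values): if the merged sort keys are [1, 3, 5, 7] and the
// smallest pending first row is 6, the upper bound is 3, so rows 1, 3, and 5 are
// globally sorted while row 7 still needs to be merged with the pending data.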
val cutoff = pending.peek().firstRow
val result = RmmRapidsRetryIterator.withRetryNoSplit[ColumnVector] {
withResource(converters.convertBatch(Array(cutoff),
TrampolineUtil.fromAttributes(sorter.projectedBatchSchema))) { cutoffCb =>
withResource(mergedSpillBatch.getColumnarBatch()) { mergedBatch =>
sorter.upperBound(mergedBatch, cutoffCb)
}
}
}
withResource(result) { _ =>
withResource(result.copyToHost()) { hostResult =>
assert(hostResult.getRowCount == 1)
hostResult.getInt(0)
}
}
}
if (sortSplitOffset == mergedSpillBatch.numRows() && sorted.isEmpty &&
(mergedSpillBatch.sizeInBytes >= targetSize || pending.isEmpty)) {
// This is a special case where we already have everything we need to output, so why
// bother with another contiguous split just to put it into the queue.
val projectedBatch = RmmRapidsRetryIterator.withRetryNoSplit[ColumnarBatch] {
withResource(mergedSpillBatch.getColumnarBatch()) { mergedBatch =>
withResource(GpuColumnVector.from(mergedBatch)) { mergedTbl =>
sorter.removeProjectedColumns(mergedTbl)
}
}
}
(Some(projectedBatch), sortSplitOffset)
} else {
(None, sortSplitOffset)
}
}
if (retBatch.nonEmpty) {
mergedSpillBatch.close()
return retBatch
} else {
val splitResult = withRetryNoSplit(mergedSpillBatch) { attempt =>
onMergeSortSplit()
splitAfterSort(attempt, sortedOffset)
}
saveSplitResult(splitResult)
}
}
None
}
/**
* Take data from the sorted queue and return a final batch that can be returned
* @return a batch that can be returned.
*/
private final def concatOutput(): ColumnarBatch = {
// combine all the sorted data into a single batch
val spillCbs = ArrayBuffer[SpillableColumnarBatch]()
var totalBytes = 0L
closeOnExcept(spillCbs) { _ =>
while (!sorted.isEmpty && (spillCbs.isEmpty ||
(totalBytes + sorted.peek().sizeInBytes) < targetSize)) {
val tmp = sorted.pop()
sortedSize -= tmp.sizeInBytes
totalBytes += tmp.sizeInBytes
spillCbs += tmp
}
}
if (spillCbs.length == 1) {
// We cannot concatenate a single table, so just strip the projected columns directly
withRetryNoSplit(spillCbs.head) { attemptSp =>
onConcatOutput()
withResource(attemptSp.getColumnarBatch()) { attemptCb =>
withResource(GpuColumnVector.from(attemptCb)) { attemptTbl =>
sorter.removeProjectedColumns(attemptTbl)
}
}
}
} else {
// withRetryNoSplit will take over the batches.
withRetryNoSplit(spillCbs.toSeq) { attempt =>
onConcatOutput()
val tables = attempt.safeMap { sp =>
withResource(sp.getColumnarBatch())(GpuColumnVector.from)
}
withResource(tables) { _ =>
withResource(Table.concatenate(tables: _*)) { combined =>
// Ignore the size of the output of removing the columns: it is just dropping columns,
// so the result will be smaller than this with no added memory.
sorter.removeProjectedColumns(combined)
}
}
}
}
}
override def next(): ColumnarBatch = {
if (sorter.projectedBatchSchema.isEmpty) {
// Special case: no columns, just rows
withRetryNoSplit(alreadySortedIter.next()) { scb =>
// This should have no columns so no need to remove anything from the projected data
scb.getColumnarBatch()
}
} else {
if (pending.isEmpty && sorted.isEmpty) {
closeOnExcept(alreadySortedIter.next()) { scb =>
if (!alreadySortedIter.hasNext) {
sorted.add(scb)
} else {
firstPassReadBatches(scb)
}
}
}
withResource(new NvtxWithMetrics("Sort next output batch", NvtxColor.CYAN, opTime)) { _ =>
val ret = mergeSortEnoughToOutput().getOrElse(concatOutput())
outputBatches += 1
outputRows += ret.numRows()
// We already read in all of the data so calling hasNext is cheap
if (hasNext) {
ret
} else {
GpuColumnVector.tagAsFinalBatch(ret)
}
}
}
}
override def close(): Unit = {
sorted.forEach(_.close())
pending.close()
}
/** Callbacks designed for unit tests only. Don't do any heavy work inside. */
protected def onFirstPassSplit(): Unit = {}
protected def onMergeSortSplit(): Unit = {}
protected def onConcatOutput(): Unit = {}
}