org.apache.spark.sql.rapids.aggregate.aggregateFunctions.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rapids-4-spark_2.13 Show documentation
Show all versions of rapids-4-spark_2.13 Show documentation
Creates the distribution package of the RAPIDS plugin for Apache Spark
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.rapids.aggregate
import ai.rapids.cudf
import ai.rapids.cudf.{Aggregation128Utils, BinaryOp, ColumnVector, DType, GroupByAggregation, GroupByScanAggregation, NaNEquality, NullEquality, NullPolicy, NvtxColor, NvtxRange, ReductionAggregation, ReplacePolicy, RollingAggregation, RollingAggregationOnColumn, Scalar, ScanAggregation}
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.Arm.withResource
import com.nvidia.spark.rapids.RapidsPluginImplicits.ReallyAGpuExpression
import com.nvidia.spark.rapids.shims.{GpuDeterministicFirstLastCollectShim, ShimExpression, TypeUtilsShims}
import com.nvidia.spark.rapids.window._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckSuccess
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ImplicitCastInputTypes, Literal, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, TypeUtils}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.rapids._
import org.apache.spark.sql.types._
import org.apache.spark.sql.vectorized.ColumnarBatch
/**
 * cuDF-backed COUNT aggregate that only counts non-null rows.
 *
 * @param dataType the Spark type reported for the result of this aggregate
 */
class CudfCount(override val dataType: DataType) extends CudfAggregate {
  // Reduction: non-null rows = total rows - null rows, narrowed to a 32-bit scalar.
  override val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val nonNullRows = input.getRowCount - input.getNullCount
    cudf.Scalar.fromInt(nonNullRows.toInt)
  }

  // Group-by: nulls are excluded, mirroring the reduction above.
  override lazy val groupByAggregate: GroupByAggregation = {
    val nullHandling = NullPolicy.EXCLUDE
    GroupByAggregation.count(nullHandling)
  }

  override val name: String = "CudfCount"
}
/**
 * cuDF-backed SUM aggregate.
 *
 * @param dataType the Spark result type; the reduction requests the matching cuDF type
 *                 explicitly (see the note on `rapidsSumType`)
 */
class CudfSum(override val dataType: DataType) extends CudfAggregate {
  // Why the reduction passes its output type explicitly:
  //
  // Up to 3.1.1 the analyzed plan widened the input column before aggregating, so integer
  // overflow was avoided even without an explicit output column type:
  //
  // == Analyzed Logical Plan ==
  // sum(shorts): bigint
  // Aggregate [sum(cast(shorts#77 as bigint)) AS sum(shorts)#94L]
  //
  // In Spark's main branch (3.2.0-SNAPSHOT as of this comment) the analyzed logical plan no
  // longer casts the input column, so the output column type must be handed to the
  // aggregation explicitly:
  //
  // == Analyzed Logical Plan ==
  // sum(shorts): bigint
  // Aggregate [sum(shorts#33) AS sum(shorts)#50L]
  //
  @transient lazy val rapidsSumType: DType = GpuColumnVector.getNonNestedRapidsType(dataType)

  override val reductionAggregate: cudf.ColumnVector => cudf.Scalar =
    _.sum(rapidsSumType)

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.sum()

  override val name: String = "CudfSum"
}
/**
 * cuDF-backed MAX aggregate.
 *
 * @param dataType the Spark type reported for the result of this aggregate
 */
class CudfMax(override val dataType: DataType) extends CudfAggregate {
  // Whole-column reduction: delegate to the column's own max().
  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    input.max()
  }

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.max()

  override val name: String = "CudfMax"
}
/**
* Checks whether a boolean column contains at least one `true` value.
*
* The cuDF `any` aggregation does not work for reductions or group by aggregations,
* so a max over the boolean column is used as a workaround.
*/
object CudfAny {
// Returns a CudfMax typed as BooleanType; the caller is expected to feed it a boolean column.
def apply(): CudfAggregate = new CudfMax(BooleanType)
}
/**
 * cuDF-backed MIN aggregate.
 *
 * @param dataType the Spark type reported for the result of this aggregate
 */
class CudfMin(override val dataType: DataType) extends CudfAggregate {
  // Whole-column reduction: delegate to the column's own min().
  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    input.min()
  }

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.min()

  override val name: String = "CudfMin"
}
/**
* Checks whether every value in a boolean column is `true`.
*
* The cuDF `all` aggregation does not work for reductions or group by aggregations,
* so a min over the boolean column is used as a workaround.
*/
object CudfAll {
// Returns a CudfMin typed as BooleanType; the caller is expected to feed it a boolean column.
def apply(): CudfAggregate = new CudfMin(BooleanType)
}
/**
 * cuDF-backed aggregate that collects the input values into a list.
 *
 * @param dataType the Spark type reported for the result of this aggregate
 */
class CudfCollectList(override val dataType: DataType) extends CudfAggregate {
  // A fresh ReductionAggregation is built on every invocation and the result is a LIST scalar.
  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val collect = ReductionAggregation.collectList()
    input.reduce(collect, DType.LIST)
  }

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.collectList()

  override val name: String = "CudfCollectList"
}
/**
 * cuDF-backed aggregate that merges a column of lists into a single list.
 *
 * @param dataType the Spark type reported for the result of this aggregate
 */
class CudfMergeLists(override val dataType: DataType) extends CudfAggregate {
  // A fresh ReductionAggregation is built on every invocation and the result is a LIST scalar.
  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val merge = ReductionAggregation.mergeLists()
    input.reduce(merge, DType.LIST)
  }

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.mergeLists()

  override val name: String = "CudfMergeLists"
}
/**
 * cuDF-backed `collect_set` aggregate.
 *
 * Spark handles NaN equality differently for non-nested float/double values and for
 * float/double values inside nested types. With the non-nested versions of floats and
 * doubles, NaN values are considered unequal; when sets of nested versions are collected,
 * NaNs are considered equal on the CPU. The NaNEquality is therefore picked dynamically,
 * both here and in CudfMergeSets.
 *
 * @param dataType the result type; note that this is ArrayType(child.dataType)
 */
class CudfCollectSet(override val dataType: DataType) extends CudfAggregate {
  // Arrays whose elements are plain floats/doubles keep NaNs distinct; everything else
  // treats all NaNs as equal. Evaluated on each use, never cached.
  private def nanEquality: NaNEquality = dataType match {
    case ArrayType(FloatType | DoubleType, _) => NaNEquality.UNEQUAL
    case _: DataType => NaNEquality.ALL_EQUAL
  }

  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val collectSet =
      ReductionAggregation.collectSet(NullPolicy.EXCLUDE, NullEquality.EQUAL, nanEquality)
    input.reduce(collectSet, DType.LIST)
  }

  override lazy val groupByAggregate: GroupByAggregation =
    GroupByAggregation.collectSet(NullPolicy.EXCLUDE, NullEquality.EQUAL, nanEquality)

  override val name: String = "CudfCollectSet"
}
/**
 * cuDF-backed aggregate that merges a column of sets into a single set.
 *
 * NaN equality is chosen from `dataType`: arrays whose element type is FloatType or
 * DoubleType treat NaNs as unequal, every other type treats all NaNs as equal.
 *
 * @param dataType the Spark type reported for the result of this aggregate
 */
class CudfMergeSets(override val dataType: DataType) extends CudfAggregate {
  // Evaluated on each use, never cached.
  private def nanEquality: NaNEquality = dataType match {
    case ArrayType(FloatType | DoubleType, _) => NaNEquality.UNEQUAL
    case _: DataType => NaNEquality.ALL_EQUAL
  }

  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val mergeSets = ReductionAggregation.mergeSets(NullEquality.EQUAL, nanEquality)
    input.reduce(mergeSets, DType.LIST)
  }

  override lazy val groupByAggregate: GroupByAggregation =
    GroupByAggregation.mergeSets(NullEquality.EQUAL, nanEquality)

  override val name: String = "CudfMergeSets"
}
/**
 * cuDF-backed aggregate that picks the element at a fixed offset using cuDF's `nth`
 * aggregation.
 *
 * @param opName       prefix of the aggregate's name; the null policy is appended to it
 * @param dataType     the Spark type reported for the result of this aggregate
 * @param offset       the offset handed to cuDF's `nth`
 * @param includeNulls whether nulls take part when locating the element
 */
class CudfNthLikeAggregate(opName: String, override val dataType: DataType, offset: Int,
    includeNulls: NullPolicy) extends CudfAggregate {

  // The name encodes the null policy so the two variants of one operation stay distinguishable.
  override val name = {
    val policySuffix = includeNulls match {
      case NullPolicy.INCLUDE => "IncludeNulls"
      case NullPolicy.EXCLUDE => "ExcludeNulls"
    }
    opName + policySuffix
  }

  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val nth = ReductionAggregation.nth(offset, includeNulls)
    input.reduce(nth)
  }

  override lazy val groupByAggregate: GroupByAggregation =
    GroupByAggregation.nth(offset, includeNulls)
}
/**
 * Factory methods for the first/last flavours of [[CudfNthLikeAggregate]].
 * "First" uses offset 0 and "last" uses offset -1.
 */
object CudfNthLikeAggregate {
  // Builds a "CudfFirst" aggregate with the given null policy.
  private def first(dataType: DataType, nulls: NullPolicy): CudfAggregate =
    new CudfNthLikeAggregate("CudfFirst", dataType, 0, nulls)

  // Builds a "CudfLast" aggregate with the given null policy.
  private def last(dataType: DataType, nulls: NullPolicy): CudfAggregate =
    new CudfNthLikeAggregate("CudfLast", dataType, -1, nulls)

  def newFirstExcludeNulls(dataType: DataType): CudfAggregate =
    first(dataType, NullPolicy.EXCLUDE)

  def newFirstIncludeNulls(dataType: DataType): CudfAggregate =
    first(dataType, NullPolicy.INCLUDE)

  def newLastExcludeNulls(dataType: DataType): CudfAggregate =
    last(dataType, NullPolicy.EXCLUDE)

  def newLastIncludeNulls(dataType: DataType): CudfAggregate =
    last(dataType, NullPolicy.INCLUDE)
}
/**
 * Mean aggregate used only by the M2 family of aggregates; not to be confused with
 * GpuAverage. In the future this class should be removed and the mean values should come
 * from the output of libcudf's M2 aggregate instead.
 */
class CudfMean extends CudfAggregate {
  // Reduction: 0.0 when there are no non-null rows, otherwise sum / non-null row count,
  // with the sum computed as FLOAT64.
  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val validRows = input.getRowCount - input.getNullCount
    if (validRows == 0) {
      Scalar.fromDouble(0.0)
    } else {
      withResource(input.sum(DType.FLOAT64)) { total =>
        val mean = total.getDouble / validRows.toDouble
        Scalar.fromDouble(mean)
      }
    }
  }

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.mean()

  override val name: String = "CudfMeanForM2"

  override def dataType: DataType = DoubleType
}
/**
 * M2 aggregate (sum of squared differences from the mean).
 *
 * The reduction computes it as `sumOfSquares - mean * mean * count` over the non-null rows
 * and yields 0.0 when there are none.
 */
class CudfM2 extends CudfAggregate {
  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { input =>
    val validRows = input.getRowCount - input.getNullCount
    if (validRows == 0) {
      Scalar.fromDouble(0.0)
    } else {
      // The plain sum is taken first, then the sum of squares; both as FLOAT64.
      withResource(input.sum(DType.FLOAT64)) { total =>
        val mean = total.getDouble / validRows.toDouble
        val sumOfSquares = ReductionAggregation.sumOfSquares()
        withResource(input.reduce(sumOfSquares, DType.FLOAT64)) { squares =>
          Scalar.fromDouble(squares.getDouble - mean * mean * validRows.toDouble)
        }
      }
    }
  }

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.M2()

  override val name: String = "CudfM2"

  override def dataType: DataType = DoubleType
}
/**
 * Merges partial (n, avg, m2) results into a single (n, avg, m2) struct.
 *
 * The reduction copies the struct column to the host and folds the rows one at a time,
 * skipping rows whose count is not positive.
 */
class CudfMergeM2 extends CudfAggregate {
  override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar = { partials =>
    withResource(new NvtxRange("reduction-merge-m2", NvtxColor.ORANGE)) { _ =>
      withResource(partials.copyToHost()) { hostPartials =>
        // Child 0 = count, child 1 = mean, child 2 = m2.
        withResource(hostPartials.getChildColumnView(0)) { counts =>
          withResource(hostPartials.getChildColumnView(1)) { means =>
            withResource(hostPartials.getChildColumnView(2)) { m2s =>
              var totalN: Integer = 0
              var totalMean: Double = 0.0
              var totalM2: Double = 0.0

              (0 until counts.getRowCount.toInt).foreach { row =>
                val n = counts.getInt(row)
                if (n > 0) {
                  val mean = means.getDouble(row)
                  val m2 = m2s.getDouble(row)
                  val delta = mean - totalMean
                  val combinedN = n + totalN
                  // m2 is updated before the mean because it needs the previous mean and count.
                  totalM2 += m2 + delta * delta * n.toDouble * totalN.toDouble / combinedN.toDouble
                  totalMean =
                    (totalMean * totalN.toDouble + mean * n.toDouble) / combinedN.toDouble
                  totalN = combinedN
                }
              }

              // Wrap each merged value in a one-row column and build the struct scalar.
              withResource(ColumnVector.fromInts(totalN)) { nCol =>
                withResource(ColumnVector.fromDoubles(totalMean)) { meanCol =>
                  withResource(ColumnVector.fromDoubles(totalM2)) { m2Col =>
                    Scalar.structFromColumnViews(nCol, meanCol, m2Col)
                  }
                }
              }
            }
          }
        }
      }
    }
  }

  override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.mergeM2()

  override val name: String = "CudfMergeM2"

  override val dataType: DataType = StructType(Seq(
    StructField("n", IntegerType, nullable = false),
    StructField("avg", DoubleType, nullable = true),
    StructField("m2", DoubleType, nullable = true)))
}
/** Factory that picks the min implementation matching the child's data type. */
object GpuMin {
  /**
   * @param child the expression to aggregate
   * @return a `GpuFloatMin` for FloatType/DoubleType children, a `GpuBasicMin` otherwise
   */
  def apply(child: Expression): GpuMin = {
    val isFloatingPoint = child.dataType match {
      case FloatType | DoubleType => true
      case _ => false
    }
    if (isFloatingPoint) GpuFloatMin(child) else GpuBasicMin(child)
  }
}
/**
 * Common base of the GPU min aggregations, usable as a group-by/reduction aggregate and as
 * a window function (general, running, and unbounded-to-unbounded).
 *
 * @param child the expression whose minimum is computed
 */
abstract class GpuMin(child: Expression) extends GpuAggregateFunction
    with GpuBatchedRunningWindowWithFixer
    with GpuUnboundToUnboundWindowWithFixer
    with GpuAggregateWindowFunction
    with GpuRunningWindowFunction
    with Serializable {

  // EXPRESSION BASICS (copied from Min)
  override def children: Seq[Expression] = Seq(child)

  override def dataType: DataType = child.dataType

  override def nullable: Boolean = true

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function gpu min")

  // AGGREGATION
  private lazy val cudfMin = AttributeReference("min", child.dataType)()

  override lazy val initialValues: Seq[GpuLiteral] = Seq(GpuLiteral(null, child.dataType))

  override lazy val inputProjection: Seq[Expression] = Seq(child)

  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(new CudfMin(child.dataType))

  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(new CudfMin(child.dataType))

  override lazy val aggBufferAttributes: Seq[AttributeReference] = Seq(cudfMin)

  override lazy val evaluateExpression: Expression = cudfMin

  // GENERAL WINDOW FUNCTION
  override lazy val windowInputProjection: Seq[Expression] = inputProjection

  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = {
    val rollingMin = RollingAggregation.min()
    // Only the column index of the first input is needed.
    rollingMin.onColumn(inputs.head._2)
  }

  // RUNNING WINDOW
  override def newFixer(): BatchedRunningWindowFixer =
    new BatchedRunningWindowBinaryFixer(BinaryOp.NULL_MIN, "min")

  // UNBOUNDED TO UNBOUNDED WINDOW
  override def newUnboundedToUnboundedFixer: BatchedUnboundedToUnboundedWindowFixer =
    new BatchedUnboundedToUnboundedBinaryFixer(BinaryOp.NULL_MIN, dataType)

  // GROUP BY SCAN
  override def groupByScanInputProjection(isRunningBatched: Boolean): Seq[Expression] =
    inputProjection

  override def groupByScanAggregation(
      isRunningBatched: Boolean): Seq[AggAndReplace[GroupByScanAggregation]] = {
    val scanMin = GroupByScanAggregation.min()
    Seq(AggAndReplace(scanMin, Some(ReplacePolicy.PRECEDING)))
  }

  // Strings, timestamps and dates are reported as unsupported for group-by scans.
  override def isGroupByScanSupported: Boolean = child.dataType match {
    case StringType | TimestampType | DateType => false
    case _ => true
  }

  // SCAN
  override def scanInputProjection(isRunningBatched: Boolean): Seq[Expression] = inputProjection

  override def scanAggregation(isRunningBatched: Boolean): Seq[AggAndReplace[ScanAggregation]] = {
    val scanMin = ScanAggregation.min()
    Seq(AggAndReplace(scanMin, Some(ReplacePolicy.PRECEDING)))
  }

  // Timestamps and dates are reported as unsupported for scans.
  override def isScanSupported: Boolean = child.dataType match {
    case TimestampType | DateType => false
    case _ => true
  }
}
/**
* Min aggregation without `NaN` handling; it adds nothing on top of the behaviour
* inherited from GpuMin.
*/
case class GpuBasicMin(child: Expression) extends GpuMin(child)
/** GpuMin for FloatType and DoubleType inputs, with explicit `NaN` handling.
 *
 * In Spark, `NaN` is the max float value, however in cuDF the result of a calculation
 * involving `NaN` is undefined. The workaround used to match Spark's behaviour is:
 *
 *   if the column contains only `NaN`s or `null`s
 *   then
 *     if the column contains `NaN`
 *     then return `NaN`
 *     else return null
 *   else
 *     replace all `NaN`s with nulls;
 *     use the cuDF kernel to find the min value
 */
case class GpuFloatMin(child: Expression) extends GpuMin(child)
    with GpuReplaceWindowFunction {

  override val dataType: DataType = child.dataType match {
    case floating @ (FloatType | DoubleType) => floating
    case t => throw new IllegalStateException(s"child type $t is not FloatType or DoubleType")
  }

  // The NaN literal value matching the child's floating point type.
  protected val nan: Any = child.dataType match {
    case FloatType => Float.NaN
    case DoubleType => Double.NaN
    case t => throw new IllegalStateException(s"child type $t is not FloatType or DoubleType")
  }

  protected lazy val updateAllNansOrNulls = CudfAll()
  protected lazy val updateHasNan = CudfAny()
  protected lazy val updateMinVal = new CudfMin(dataType)
  protected lazy val mergeAllNansOrNulls = CudfAll()
  protected lazy val mergeHasNan = CudfAny()
  protected lazy val mergeMinVal = new CudfMin(dataType)

  // Builds the 3 columns consumed by the aggregates:
  // 1. A boolean column indicating whether the values of `source` are `NaN`s or `null`s
  // 2. A boolean column indicating whether the values of `source` are `NaN`s
  // 3. `source` with all `NaN`s replaced by `null`s
  private def nanAwareProjection(source: Expression): Seq[Expression] = Seq(
    GpuOr(GpuIsNan(source), GpuIsNull(source)),
    GpuIsNan(source),
    // All NaNs must be eliminated before calling the cuDF min kernel.
    // This column is only used when `allNansOrNulls` = false, and `NaN` is the max value
    // in Spark, so the elimination does not affect the final result.
    GpuNansToNulls(source)
  )

  // If the column only contains `NaN`s or `null`s
  // Then
  //   if the column contains `NaN`
  //   then return `NaN`
  //   else return `null`
  // Else return the min value
  private def selectResult(
      allNansOrNulls: Expression,
      hasNan: Expression,
      minWithoutNans: Expression): Expression = {
    GpuIf(
      allNansOrNulls,
      GpuIf(hasNan, GpuLiteral(nan, dataType), GpuLiteral(null, dataType)),
      minWithoutNans)
  }

  override lazy val inputProjection: Seq[Expression] = nanAwareProjection(child)

  // 1. Check if all values in the `child` are `NaN`s or `null`s
  // 2. Check if `child` contains `NaN`
  // 3. Calculate the min value of `child` after all `NaN`s have been replaced.
  override lazy val updateAggregates: Seq[CudfAggregate] =
    Seq(updateAllNansOrNulls, updateHasNan, updateMinVal)

  override lazy val postUpdate: Seq[Expression] =
    Seq(selectResult(updateAllNansOrNulls.attr, updateHasNan.attr, updateMinVal.attr))

  // Same logic as the `inputProjection` stage, applied to the aggregation buffer.
  override lazy val preMerge: Seq[Expression] = nanAwareProjection(evaluateExpression)

  // Same logic as the `updateAggregates` stage.
  override lazy val mergeAggregates: Seq[CudfAggregate] =
    Seq(mergeAllNansOrNulls, mergeHasNan, mergeMinVal)

  // Same logic as the `postUpdate` stage.
  override lazy val postMerge: Seq[Expression] =
    Seq(selectResult(mergeAllNansOrNulls.attr, mergeHasNan.attr, mergeMinVal.attr))

  // We should always override the windowing expression to handle `NaN`.
  override def shouldReplaceWindow(spec: GpuWindowSpecDefinition): Boolean = true

  override def windowReplacement(spec: GpuWindowSpecDefinition): Expression = {
    // The `GpuBasicMin` here has the same functionality as `CudfAll`,
    // as `true > false` in cuDF.
    val allNansOrNullsOverWindow = GpuWindowExpression(
      GpuBasicMin(GpuOr(GpuIsNan(child), GpuIsNull(child))), spec)
    val hasNanOverWindow = GpuWindowExpression(GpuBasicMax(GpuIsNan(child)), spec)
    // `GpuBasicMin` is used instead of `GpuMin` to avoid self recursion.
    val minOverWindow = GpuWindowExpression(GpuBasicMin(GpuNansToNulls(child)), spec)
    selectResult(allNansOrNullsOverWindow, hasNanOverWindow, minOverWindow)
  }
}
/** Factory that picks the max implementation matching the child's data type. */
object GpuMax {
  /**
   * @param child the expression to aggregate
   * @return a `GpuFloatMax` for FloatType/DoubleType children, a `GpuBasicMax` otherwise
   */
  def apply(child: Expression): GpuMax = {
    val isFloatingPoint = child.dataType match {
      case FloatType | DoubleType => true
      case _ => false
    }
    if (isFloatingPoint) GpuFloatMax(child) else GpuBasicMax(child)
  }
}
/**
 * Common base of the GPU max aggregations, usable as a group-by/reduction aggregate and as
 * a window function (general, running, and unbounded-to-unbounded).
 *
 * @param child the expression whose maximum is computed
 */
abstract class GpuMax(child: Expression) extends GpuAggregateFunction
    with GpuBatchedRunningWindowWithFixer
    with GpuUnboundToUnboundWindowWithFixer
    with GpuAggregateWindowFunction
    with GpuRunningWindowFunction
    with Serializable {

  // EXPRESSION BASICS (copied from Max)
  override def children: Seq[Expression] = Seq(child)

  override def dataType: DataType = child.dataType

  override def nullable: Boolean = true

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function gpu max")

  // AGGREGATION
  private lazy val cudfMax = AttributeReference("max", child.dataType)()

  override lazy val initialValues: Seq[GpuLiteral] = Seq(GpuLiteral(null, child.dataType))

  override lazy val inputProjection: Seq[Expression] = Seq(child)

  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(new CudfMax(dataType))

  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(new CudfMax(dataType))

  override lazy val aggBufferAttributes: Seq[AttributeReference] = Seq(cudfMax)

  override lazy val evaluateExpression: Expression = cudfMax

  // GENERAL WINDOW FUNCTION
  override lazy val windowInputProjection: Seq[Expression] = inputProjection

  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = {
    val rollingMax = RollingAggregation.max()
    // Only the column index of the first input is needed.
    rollingMax.onColumn(inputs.head._2)
  }

  // RUNNING WINDOW
  override def newFixer(): BatchedRunningWindowFixer =
    new BatchedRunningWindowBinaryFixer(BinaryOp.NULL_MAX, "max")

  // UNBOUNDED TO UNBOUNDED WINDOW
  override def newUnboundedToUnboundedFixer: BatchedUnboundedToUnboundedWindowFixer =
    new BatchedUnboundedToUnboundedBinaryFixer(BinaryOp.NULL_MAX, dataType)

  // GROUP BY SCAN
  override def groupByScanInputProjection(isRunningBatched: Boolean): Seq[Expression] =
    inputProjection

  override def groupByScanAggregation(
      isRunningBatched: Boolean): Seq[AggAndReplace[GroupByScanAggregation]] = {
    val scanMax = GroupByScanAggregation.max()
    Seq(AggAndReplace(scanMax, Some(ReplacePolicy.PRECEDING)))
  }

  // Strings, timestamps and dates are reported as unsupported for group-by scans.
  override def isGroupByScanSupported: Boolean = child.dataType match {
    case StringType | TimestampType | DateType => false
    case _ => true
  }

  // SCAN
  override def scanInputProjection(isRunningBatched: Boolean): Seq[Expression] = inputProjection

  override def scanAggregation(isRunningBatched: Boolean): Seq[AggAndReplace[ScanAggregation]] = {
    val scanMax = ScanAggregation.max()
    Seq(AggAndReplace(scanMax, Some(ReplacePolicy.PRECEDING)))
  }

  // Timestamps and dates are reported as unsupported for scans.
  override def isScanSupported: Boolean = child.dataType match {
    case TimestampType | DateType => false
    case _ => true
  }
}
/**
* Max aggregation without `NaN` handling; it adds nothing on top of the behaviour
* inherited from GpuMax.
*/
case class GpuBasicMax(child: Expression) extends GpuMax(child)
/** Max aggregation for FloatType and DoubleType inputs, with explicit `NaN` handling.
 *
 * In Spark, `NaN` is the max float value, however in cuDF the result of a calculation
 * involving `NaN` is undefined. To match Spark's behaviour, the projection stage creates an
 * additional `isNan` column. If any value in that column is true the result is `NaN`,
 * otherwise it is whatever `GpuBasicMax` returns.
 */
case class GpuFloatMax(child: Expression) extends GpuMax(child)
    with GpuReplaceWindowFunction {

  override val dataType: DataType = child.dataType match {
    case floating @ (FloatType | DoubleType) => floating
    case t => throw new IllegalStateException(s"child type $t is not FloatType or DoubleType")
  }

  // The NaN literal value matching the child's floating point type.
  protected val nan: Any = child.dataType match {
    case FloatType => Float.NaN
    case DoubleType => Double.NaN
    case t => throw new IllegalStateException(s"child type $t is not FloatType or DoubleType")
  }

  protected lazy val updateIsNan = CudfAny()
  protected lazy val updateMaxVal = new CudfMax(dataType)
  protected lazy val mergeIsNan = CudfAny()
  protected lazy val mergeMaxVal = new CudfMax(dataType)

  // If there is a `NaN` value in the target column, return `NaN`,
  // else return what the max aggregation produced.
  private def nanOrMax(sawNan: Expression, maxVal: Expression): Expression =
    GpuIf(sawNan, GpuLiteral(nan, dataType), maxVal)

  // Project 2 columns. The first one is the target column, the second one is a
  // Boolean column indicating whether the values in the target column are `NaN`s.
  override lazy val inputProjection: Seq[Expression] = {
    val isNanFlag = GpuIsNan(child)
    Seq(child, isNanFlag)
  }

  // Execute `CudfMax` on the target column and, at the same time,
  // `CudfAny` on the `isNan` column.
  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(updateMaxVal, updateIsNan)

  override lazy val postUpdate: Seq[Expression] =
    Seq(nanOrMax(updateIsNan.attr, updateMaxVal.attr))

  // Same logic as the `inputProjection` stage, applied to the aggregation buffer.
  override lazy val preMerge: Seq[Expression] =
    Seq(evaluateExpression, GpuIsNan(evaluateExpression))

  // Same logic as the `updateAggregates` stage.
  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(mergeMaxVal, mergeIsNan)

  // Same logic as the `postUpdate` stage.
  override lazy val postMerge: Seq[Expression] =
    Seq(nanOrMax(mergeIsNan.attr, mergeMaxVal.attr))

  // We should always override the windowing expression to handle `NaN`.
  override def shouldReplaceWindow(spec: GpuWindowSpecDefinition): Boolean = true

  override def windowReplacement(spec: GpuWindowSpecDefinition): Expression = {
    // The `GpuBasicMax` here has the same functionality as `CudfAny`,
    // as `true > false` in cuDF.
    val sawNanOverWindow = GpuWindowExpression(GpuBasicMax(GpuIsNan(child)), spec)
    // `GpuBasicMax` is used instead of `GpuMax` to avoid self recursion.
    val maxOverWindow = GpuWindowExpression(GpuBasicMax(child), spec)
    nanOrMax(sawNanOverWindow, maxOverWindow)
  }
}
/**
 * Extracts a 32-bit chunk from a 128-bit value.
 *
 * Chunks 0-2 are produced as unsigned 32-bit integers and any higher index as a signed
 * 32-bit integer.
 *
 * @param data expression producing 128-bit values
 * @param chunkIdx index of chunk to extract (0-3)
 * @param replaceNullsWithZero whether to replace nulls with zero
 */
case class GpuExtractChunk32(
    data: Expression,
    chunkIdx: Int,
    replaceNullsWithZero: Boolean) extends GpuExpression with ShimExpression {

  // True for the chunks that are exposed as unsigned values.
  private def isUnsignedChunk: Boolean = chunkIdx < 3

  override def children: Seq[Expression] = Seq(data)

  override def nullable: Boolean = true

  override def dataType: DataType =
    if (isUnsignedChunk) GpuUnsignedIntegerType else IntegerType

  override def sql: String = data.sql

  override def columnarEval(batch: ColumnarBatch): GpuColumnVector = {
    withResource(data.columnarEval(batch)) { source =>
      val chunkDType = if (isUnsignedChunk) DType.UINT32 else DType.INT32
      val extracted =
        Aggregation128Utils.extractInt32Chunk(source.getBase, chunkDType, chunkIdx)
      val result = if (!replaceNullsWithZero) {
        extracted
      } else {
        // The extracted column is closed here; the null-free copy takes its place.
        withResource(extracted) { withNulls =>
          val zeroOfChunkType = chunkDType match {
            case DType.INT32 => Scalar.fromInt(0)
            case DType.UINT32 => Scalar.fromUnsignedInt(0)
          }
          withResource(zeroOfChunkType) { fill =>
            withNulls.replaceNulls(fill)
          }
        }
      }
      GpuColumnVector.from(result, dataType)
    }
  }
}
/**
 * Reassembles a 128-bit value from four separate 64-bit sum results.
 *
 * Rows flagged as overflowed by the combine step are replaced with null.
 *
 * @param chunkAttrs attributes for the four 64-bit sum chunks ordered from least significant to
 *                   most significant
 * @param dataType output type of the reconstructed 128-bit value
 * @param nullOnOverflow whether to produce null on overflows
 */
case class GpuAssembleSumChunks(
    chunkAttrs: Seq[AttributeReference],
    dataType: DecimalType,
    nullOnOverflow: Boolean) extends GpuExpression with ShimExpression {

  override def children: Seq[Expression] = chunkAttrs

  override def nullable: Boolean = true

  override def columnarEval(batch: ColumnarBatch): GpuColumnVector = {
    val decimal128Type = DecimalUtil.createCudfDecimal(dataType)
    // Project the chunk columns out of the batch and let cuDF stitch them back together.
    val combined = withResource(GpuProjectExec.project(batch, chunkAttrs)) { projected =>
      withResource(GpuColumnVector.from(projected)) { chunks =>
        Aggregation128Utils.combineInt64SumChunks(chunks, decimal128Type)
      }
    }
    withResource(combined) { _ =>
      // Column 0 is the overflow flag, column 1 the reassembled decimal.
      assert(combined.getNumberOfColumns == 2)
      val overflowFlags = combined.getColumn(0)
      val sums = combined.getColumn(1)
      assert(overflowFlags.getType == DType.BOOL8)
      assert(sums.getType.getTypeId == DType.DTypeEnum.DECIMAL128)
      withResource(Scalar.fromNull(decimal128Type)) { nullDecimal =>
        val checked = overflowFlags.ifElse(nullDecimal, sums)
        GpuColumnVector.from(checked, dataType)
      }
    }
  }
}
/**
* All decimal processing in Spark has overflow detection as a part of it. Either it replaces
* the value with a null in non-ANSI mode, or it throws an exception in ANSI mode. Spark will also
* do the processing for larger values as `Decimal` values which are based on `BigDecimal` and have
* unbounded precision. So in most cases it is impossible to overflow/underflow so much that an
* incorrect value is returned. Spark will just use more and more memory to hold the value and
* then check for overflow at some point when the result needs to be turned back into a 128-bit
* value.
*
* We cannot do the same thing. Instead we take four strategies to detect overflow.
*
* 1. For decimal values with a precision of 8 or under we follow Spark and do the SUM
* on the unscaled value as a long, and then bit-cast the result back to a Decimal value.
* this means that we can SUM `174,467,442,481` maximum or minimum decimal values with a
* precision of 8 before overflow can no longer be detected. It is much higher for decimal
* values with a smaller precision.
* 2. For decimal values with a precision from 9 to 20 inclusive we sum them as 128-bit values.
* this is very similar to what we do in the first strategy. The main differences are that we
* use a 128-bit value when doing the sum, and we check for overflow after processing each batch.
* In the case of group-by and reduction that happens after the update stage and also after each
* merge stage. This gives us enough room that we can always detect overflow when summing a
* single batch. Even on a merge where we could be doing the aggregation on a batch that has
* all max output values in it.
* 3. For values from 21 to 28 inclusive we have enough room to not check for overflow on the update
* aggregation, but for the merge aggregation we need to do some extra checks. This is done by
* taking the digits above 28 and sum them separately. We then check to see if they would have
* overflowed the original limits. This lets us detect overflow in cases where the original
* value would have wrapped around. The reason this works is because we have a hard limit on the
* maximum number of values in a single batch being processed. `Int.MaxValue`, or about 2.2
* billion values. So we use a precision on the higher values that is large enough to handle
* 2.2 billion values and still detect overflow. This equates to a precision of about 10 more
* than is needed to hold the higher digits. This effectively gives us unlimited overflow
* detection.
* 4. For anything larger than precision 28 we do the same overflow detection for strategy 3, but
* also do it on the update aggregation. This lets us fully detect overflows in any stage of
* an aggregation.
*
* Note that for Window operations either there is no merge stage or it only has a single value
* being merged into a batch instead of an entire batch being merged together. This lets us handle
* the overflow detection with what is built into GpuAdd.
*/
object GpuDecimalSumOverflow {
/**
* The increase in precision for the output of a SUM from the input. This is hard coded by
* Spark so we just have it here. This means that for most types without being limited to
* a precision of 38 you get 10-billion+ values before an overflow would even be possible.
*/
val sumPrecisionIncrease: Int = 10
/**
* Generally we want a guarantee that is at least 10x larger than the original overflow.
* One extra decimal digit of precision provides that factor of 10.
*/
val extraGuaranteePrecision: Int = 1
/**
* The precision above which we need extra overflow checks while doing an update. This is because
* anything above this precision could in theory overflow beyond detection within a single input
* batch.
*/
val updateCutoffPrecision: Int = 28
}
/**
 * This is equivalent to what Spark does after a sum to check for overflow
 * `
 * If(isEmpty, Literal.create(null, resultType),
 * CheckOverflowInSum(sum, d, !SQLConf.get.ansiEnabled))`
 *
 * It is named differently to avoid confusion with the overflow detection done as part of
 * the sum itself, which takes the place of the overflow checking that happens with add.
 *
 * @param data expression producing the sum; a null marks an overflow
 * @param isEmpty expression flagging rows for which no value was summed
 * @param dataType the decimal result type
 * @param nullOnOverflow when false (ANSI mode) an overflow raises an ArithmeticException
 */
case class GpuCheckOverflowAfterSum(
    data: Expression,
    isEmpty: Expression,
    dataType: DecimalType,
    nullOnOverflow: Boolean) extends GpuExpression with ShimExpression {

  override def children: Seq[Expression] = Seq(data, isEmpty)

  override def nullable: Boolean = true

  override def toString: String = s"CheckOverflowInSum($data, $isEmpty, $dataType, $nullOnOverflow)"

  override def sql: String = data.sql

  // Throws when any row has a null sum although it is not flagged as empty.
  private def throwOnOverflow(sums: ColumnVector, emptyFlags: ColumnVector): Unit = {
    val overflowed = withResource(sums.isNull) { nullSums =>
      withResource(emptyFlags.not()) { nonEmpty =>
        nullSums.and(nonEmpty)
      }
    }
    withResource(overflowed) { _ =>
      withResource(overflowed.any()) { anyOverflow =>
        if (anyOverflow.isValid && anyOverflow.getBoolean) {
          throw new ArithmeticException("Overflow in sum of decimals.")
        }
      }
    }
  }

  override def columnarEval(batch: ColumnarBatch): GpuColumnVector = {
    withResource(data.columnarEval(batch)) { sumCol =>
      val sums = sumCol.getBase
      withResource(isEmpty.columnarEval(batch)) { emptyCol =>
        val emptyFlags = emptyCol.getBase
        val ansiMode = !nullOnOverflow
        if (ansiMode) {
          throwOnOverflow(sums, emptyFlags)
          // No problems fall through...
        }
        // Empty rows become null; every other row keeps its sum.
        withResource(GpuScalar.from(null, dataType)) { nullResult =>
          GpuColumnVector.from(emptyFlags.ifElse(nullResult, sums), dataType)
        }
      }
    }
  }
}
/**
 * This extracts the highest digits from a Decimal value as a part of doing a SUM.
 *
 * The input is reinterpreted as a 128-bit integer and divided by
 * 10^`GpuDecimalSumOverflow.updateCutoffPrecision`.
 *
 * @param input expression producing the decimal values
 * @param originalInputType decimal type of the original SUM input, used to size the result
 */
case class GpuDecimalSumHighDigits(
    input: Expression,
    originalInputType: DecimalType) extends GpuExpression with ShimExpression {

  override def children: Seq[Expression] = Seq(input)

  override def nullable: Boolean = input.nullable

  override def toString: String = s"GpuDecimalSumHighDigits($input)"

  override def sql: String = input.sql

  override val dataType: DecimalType = {
    val highDigitsPrecision = originalInputType.precision +
      GpuDecimalSumOverflow.sumPrecisionIncrease + GpuDecimalSumOverflow.extraGuaranteePrecision -
      GpuDecimalSumOverflow.updateCutoffPrecision
    DecimalType(highDigitsPrecision, 0)
  }

  // Marking these as lazy because they are not serializable
  private lazy val outputDType = GpuColumnVector.getNonNestedRapidsType(dataType)

  private lazy val intermediateDType =
    DType.create(DType.DTypeEnum.DECIMAL128, outputDType.getScale)

  private lazy val divisionFactor: Decimal =
    Decimal(math.pow(10, GpuDecimalSumOverflow.updateCutoffPrecision))

  private val divisionType = DecimalType(38, 0)

  override def columnarEval(batch: ColumnarBatch): GpuColumnVector = {
    withResource(input.columnarEval(batch)) { evaluated =>
      val rawInput = evaluated.getBase
      // We don't have direct access to 128 bit ints so we use a decimal with a scale of 0
      // as a stand in.
      val int128StandIn = DType.create(DType.DTypeEnum.DECIMAL128, 0)
      val quotient = withResource(rawInput.bitCastTo(int128StandIn)) { asInt128 =>
        withResource(GpuScalar.from(divisionFactor, divisionType)) { divisor =>
          asInt128.div(divisor, intermediateDType)
        }
      }
      val highDigits = withResource(quotient) { _ =>
        // Already the output type: take an extra reference so it survives the close.
        // Otherwise cast to the output type.
        if (quotient.getType.equals(outputDType)) {
          quotient.incRefCount()
        } else {
          quotient.castTo(outputDType)
        }
      }
      GpuColumnVector.from(highDigits, dataType)
    }
  }
}
object GpuSum {
  /**
   * Picks the GpuSum implementation that matches `resultType`.
   *
   * Decimal results with a precision above `Decimal.MAX_LONG_DIGITS` use
   * [[GpuDecimal128Sum]], other decimal results use [[GpuBasicDecimalSum]], and every
   * non-decimal result uses [[GpuBasicSum]].
   *
   * @param child the expression being summed
   * @param resultType the type of the sum
   * @param failOnErrorOverride defaults to the session's ANSI setting
   * @param forceWindowSumToNotBeReplaced only forwarded to [[GpuDecimal128Sum]]
   */
  def apply(
      child: Expression,
      resultType: DataType,
      failOnErrorOverride: Boolean = SQLConf.get.ansiEnabled,
      forceWindowSumToNotBeReplaced: Boolean = false): GpuSum = resultType match {
    case wide: DecimalType if wide.precision > Decimal.MAX_LONG_DIGITS =>
      GpuDecimal128Sum(child, wide, failOnErrorOverride, forceWindowSumToNotBeReplaced)
    case narrow: DecimalType =>
      GpuBasicDecimalSum(child, narrow, failOnErrorOverride)
    case _ =>
      GpuBasicSum(child, resultType, failOnErrorOverride)
  }
}
/**
 * Shared implementation of SUM for regular aggregation, general window aggregation and
 * running window aggregation. The aggregation buffer is a single `sum` column of
 * `resultType`.
 */
abstract class GpuSum(
    child: Expression,
    resultType: DataType,
    failOnErrorOverride: Boolean)
  extends GpuAggregateFunction
    with ImplicitCastInputTypes
    with GpuBatchedRunningWindowWithFixer
    with GpuAggregateWindowFunction
    with GpuRunningWindowFunction
    with Serializable {

  // ---- Expression metadata, copied from Sum ----

  override def children: Seq[Expression] = Seq(child)

  override def dataType: DataType = resultType

  override def nullable: Boolean = true

  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtilsShims.checkForNumericExpr(child.dataType, "function gpu sum")

  // ---- Aggregation buffer ----

  // output of GpuSum
  protected lazy val sum: AttributeReference = AttributeReference("sum", resultType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = Seq(sum)

  override lazy val initialValues: Seq[GpuLiteral] = Seq(GpuLiteral(null, resultType))

  // ---- Update ----

  // The input has to be cast to `resultType` here because Spark no longer widens the
  // types the way it did before Spark 3.2.0. See CudfSum for more info.
  override lazy val inputProjection: Seq[Expression] = Seq(GpuCast(child, resultType))

  protected lazy val updateSum: CudfAggregate = new CudfSum(resultType)

  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(updateSum)

  // ---- Merge and final evaluation ----

  protected lazy val mergeSum: CudfAggregate = new CudfSum(resultType)

  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(mergeSum)

  override lazy val evaluateExpression: Expression = sum

  // ---- GENERAL WINDOW FUNCTION ----

  // Spark 3.2.0+ stopped casting the input data to the output type before the sum
  // operation, so the cast is added back whenever the types differ.
  override lazy val windowInputProjection: Seq[Expression] = {
    val windowInput: Expression =
      if (child.dataType == resultType) child else GpuCast(child, resultType)
    Seq(windowInput)
  }

  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = {
    val (_, columnIndex) = inputs.head
    RollingAggregation.sum().onColumn(columnIndex)
  }

  override def windowOutput(result: ColumnVector): ColumnVector = result.incRefCount()

  // ---- RUNNING WINDOW ----

  override def newFixer(): BatchedRunningWindowFixer =
    new SumBinaryFixer(resultType, failOnErrorOverride)

  override def groupByScanInputProjection(isRunningBatched: Boolean): Seq[Expression] =
    windowInputProjection

  override def groupByScanAggregation(
      isRunningBatched: Boolean): Seq[AggAndReplace[GroupByScanAggregation]] =
    Seq(AggAndReplace(GroupByScanAggregation.sum(), Some(ReplacePolicy.PRECEDING)))

  override def scanInputProjection(isRunningBatched: Boolean): Seq[Expression] =
    windowInputProjection

  override def scanAggregation(isRunningBatched: Boolean): Seq[AggAndReplace[ScanAggregation]] =
    Seq(AggAndReplace(ScanAggregation.sum(), Some(ReplacePolicy.PRECEDING)))

  // The scan result is already the running sum, so it is handed back with an extra reference.
  override def scanCombine(isRunningBatched: Boolean, cols: Seq[ColumnVector]): ColumnVector =
    cols.head.incRefCount()
}
/**
 * Sum aggregation for non-decimal types. Everything is inherited from [[GpuSum]].
 */
case class GpuBasicSum(child: Expression, resultType: DataType, failOnErrorOverride: Boolean)
  extends GpuSum(child, resultType, failOnErrorOverride)
/**
 * Base for decimal SUM. On top of the `sum` buffer column it carries a boolean `isEmpty`
 * column, and during merge a third overflow column, so that an overflow (a null sum) can
 * be told apart from a group that never saw any input.
 */
abstract class GpuDecimalSum(
    child: Expression,
    dt: DecimalType,
    failOnErrorOverride: Boolean)
  extends GpuSum(child, dt, failOnErrorOverride) {

  private lazy val zeroDec = GpuLiteral(Decimal(0, dt.precision, dt.scale), dt)

  // Used for Decimal overflow detection
  protected lazy val isEmpty: AttributeReference = AttributeReference("isEmpty", BooleanType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = Seq(sum, isEmpty)

  override lazy val initialValues: Seq[GpuLiteral] = Seq(zeroDec, GpuLiteral(true, BooleanType))

  // ---- Update ----

  // Spark tracks null columns for decimal through a second isEmpty column. So null values
  // are replaced with 0 in the value column (which is also cast to `dt`), and a separate
  // boolean column holding isNull(child) is added.
  override lazy val inputProjection: Seq[Expression] = {
    val valueOrZero = GpuIf(GpuIsNull(child), zeroDec, GpuCast(child, dt))
    Seq(valueOrZero, GpuIsNull(child))
  }

  protected lazy val updateIsEmpty: CudfAggregate = new CudfMin(BooleanType)

  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(updateSum, updateIsEmpty)

  // ---- Merge ----

  // To be able to do decimal overflow detection, we need a CudfSum that does **not** ignore
  // nulls. Cudf does not have such an aggregation, so for merge an extra isNull(sum) column
  // is carried along and aggregated, similar to what happens with isEmpty.
  override lazy val preMerge: Seq[Expression] = Seq(sum, isEmpty, GpuIsNull(sum))

  protected lazy val mergeIsEmpty: CudfAggregate = new CudfMin(BooleanType)

  protected lazy val mergeIsOverflow: CudfAggregate = new CudfMax(BooleanType)

  override lazy val mergeAggregates: Seq[CudfAggregate] =
    Seq(mergeSum, mergeIsEmpty, mergeIsOverflow)

  override lazy val postMerge: Seq[Expression] = {
    // Put the null back in for any group where an overflow was recorded.
    val sumOrNullOnOverflow =
      GpuIf(mergeIsOverflow.attr, GpuLiteral.create(null, dt), mergeSum.attr)
    Seq(sumOrNullOnOverflow, mergeIsEmpty.attr)
  }

  override lazy val evaluateExpression: Expression =
    GpuCheckOverflowAfterSum(sum, isEmpty, dt, !failOnErrorOverride)

  // ---- Window ----

  // Check for overflow
  override def windowOutput(result: ColumnVector): ColumnVector =
    GpuCast.checkNFixDecimalBounds(result, dt, failOnErrorOverride)

  override def scanCombine(isRunningBatched: Boolean, cols: Seq[ColumnVector]): ColumnVector = {
    if (isRunningBatched) {
      // The running fixer will do the bounds checks for us on the actual final values.
      super.scanCombine(isRunningBatched, cols)
    } else {
      // No fixer is going to run, so the overflow check has to happen here.
      GpuCast.checkNFixDecimalBounds(cols.head, dt, failOnErrorOverride)
    }
  }
}
/**
 * Sum aggregations for decimals up to and including DECIMAL64. Everything is inherited
 * from [[GpuDecimalSum]].
 */
case class GpuBasicDecimalSum(child: Expression, dt: DecimalType, failOnErrorOverride: Boolean)
  extends GpuDecimalSum(child, dt, failOnErrorOverride)
/**
* Sum aggregations for DECIMAL128.
*
* The sum aggregation is performed by splitting the original 128-bit values into 32-bit "chunks"
* and summing those. The chunking accomplishes two things. First, it helps avoid cudf resorting
* to a much slower aggregation since currently DECIMAL128 sums are only implemented for
* sort-based aggregations. Second, chunking allows detection of overflows.
*
* The chunked approach to sum aggregation works as follows. The 128-bit value is split into its
* four 32-bit chunks, with the most significant chunk being an INT32 and the remaining three
* chunks being UINT32. When these are sum aggregated, cudf will implicitly upscale the accumulated
* result to a 64-bit value. Since cudf only allows up to 2**31 rows to be aggregated at a time,
* the "extra" upper 32-bits of the upscaled 64-bit accumulation values will be enough to hold the
* worst-case "carry" bits from summing each 32-bit chunk.
*
* After the cudf aggregation has completed, the four 64-bit chunks are reassembled into a 128-bit
* value. The lowest 32-bits of the least significant 64-bit chunk are used directly as the lowest
* 32-bits of the final value, and the remaining 32-bits are added to the next most significant
* 64-bit chunk. The lowest 32-bits of that chunk then become the next 32-bits of the 128-bit value
* and the remaining 32-bits are added to the next 64-bit chunk, and so on. Finally after the
* 128-bit value is constructed, the remaining "carry" bits of the most significant chunk after
* reconstruction are checked against the sign bit of the 128-bit result to see if there was an
* overflow.
*
* @param child the expression being summed
* @param dt the decimal type of the sum
* @param failOnErrorOverride passed to the parent class; its negation is used as the
*                            `nullOnOverflow` argument when the chunks are reassembled
* @param forceWindowSumToNotBeReplaced when true, `shouldReplaceWindow` always returns false
*/
case class GpuDecimal128Sum(
child: Expression,
dt: DecimalType,
failOnErrorOverride: Boolean,
forceWindowSumToNotBeReplaced: Boolean)
extends GpuDecimalSum(child, dt, failOnErrorOverride) with GpuReplaceWindowFunction {
private lazy val childIsDecimal: Boolean =
child.dataType.isInstanceOf[DecimalType]
// Only safe to access when childIsDecimal is true; the cast throws for any other type.
private lazy val childDecimalType: DecimalType =
child.dataType.asInstanceOf[DecimalType]
// The && short-circuits, so childDecimalType is only touched for a decimal child.
private lazy val needsDec128UpdateOverflowChecks: Boolean =
childIsDecimal &&
childDecimalType.precision > GpuDecimalSumOverflow.updateCutoffPrecision
// For some operations we need to sum the higher digits in addition to the regular value so
// we can detect overflow. This is the type of the higher digits SUM value.
private lazy val higherDigitsCheckType: DecimalType = {
DecimalType(dt.precision - GpuDecimalSumOverflow.updateCutoffPrecision, 0)
}
// Projects five columns: the four 32-bit chunks of the child cast to `dt`, followed by
// isNull(child). The order has to line up with updateAggregates below.
override lazy val inputProjection: Seq[Expression] = {
val chunks = (0 until 4).map {
GpuExtractChunk32(GpuCast(child, dt), _, replaceNullsWithZero = true)
}
// Spark tracks null columns through a second column isEmpty for decimal. So null values
// are replaced with 0, and a separate boolean column for isNull is added
chunks :+ GpuIsNull(child)
}
// One LongType sum per chunk.
private lazy val updateSumChunks = (0 until 4).map(_ => new CudfSum(LongType))
override lazy val updateAggregates: Seq[CudfAggregate] = updateSumChunks :+ updateIsEmpty
// Reassembles the four chunk sums into a single value of type `dt` and passes the
// isEmpty aggregate through unchanged.
override lazy val postUpdate: Seq[Expression] = {
Seq(
GpuAssembleSumChunks(updateSumChunks.map(_.attr), dt, !failOnErrorOverride),
updateIsEmpty.attr)
}
// Projects six columns: the four chunks of the buffered sum, isEmpty, and isNull(sum).
// The order has to line up with mergeAggregates below.
override lazy val preMerge: Seq[Expression] = {
val chunks = (0 until 4).map {
GpuExtractChunk32(sum, _, replaceNullsWithZero = false)
}
// Unlike inputProjection, nulls in the buffered sum are NOT replaced with 0 here. The
// isNull(sum) column is carried along so that merge can tell if any partial sum was null.
chunks ++ Seq(isEmpty, GpuIsNull(sum))
}
private lazy val mergeSumChunks = (0 until 4).map(_ => new CudfSum(LongType))
// To be able to do decimal overflow detection, we need a CudfSum that does **not** ignore nulls.
// Cudf does not have such an aggregation, so for merge we have to work around that similar to
// what happens with isEmpty
override lazy val mergeAggregates: Seq[CudfAggregate] = {
mergeSumChunks ++ Seq(mergeIsEmpty, mergeIsOverflow)
}
// Reassembles the merged chunks, but outputs null instead wherever mergeIsOverflow is set.
override lazy val postMerge: Seq[Expression] = {
val assembleExpr = GpuAssembleSumChunks(mergeSumChunks.map(_.attr), dt, !failOnErrorOverride)
Seq(
GpuIf(mergeIsOverflow.attr, GpuLiteral.create(null, dt), assembleExpr),
mergeIsEmpty.attr)
}
// Replacement Window Function
override def shouldReplaceWindow(spec: GpuWindowSpecDefinition): Boolean = {
// We only will replace this if we think an update will fail. In the cases where we can
// handle a window function larger than a single batch, we already have merge overflow
// detection enabled.
!forceWindowSumToNotBeReplaced && needsDec128UpdateOverflowChecks
}
override def windowReplacement(spec: GpuWindowSpecDefinition): Expression = {
// We need extra overflow checks for some larger decimal type. To do these checks we
// extract the higher digits and SUM them separately to see if they would overflow.
// If they do we know that the regular SUM also overflowed. If not we know that we can rely on
// the existing overflow code to detect it.
// forceWindowSumToNotBeReplaced = true makes shouldReplaceWindow return false for this
// inner sum.
val regularSum = GpuWindowExpression(
GpuSum(child, dt, failOnErrorOverride = failOnErrorOverride,
forceWindowSumToNotBeReplaced = true),
spec)
// The cast below throws if the child is not a decimal.
// NOTE(review): presumably this is only reached after shouldReplaceWindow returned true,
// which requires a decimal child - confirm against the caller.
val highOrderDigitsSum = GpuWindowExpression(
GpuSum(
GpuDecimalSumHighDigits(GpuCast(child, dt), child.dataType.asInstanceOf[DecimalType]),
higherDigitsCheckType,
failOnErrorOverride = failOnErrorOverride),
spec)
// A null high-digits sum yields a null result; otherwise the regular sum is used.
GpuIf(GpuIsNull(highOrderDigitsSum), GpuLiteral(null, dt), regularSum)
}
}
/*
 * GpuPivotFirst is an aggregate function used in the second phase of a two phase pivot to do
 * the required rearrangement of values into pivoted form.
 *
 * For example on an input of
 *
 * type | A | B
 * -----+---+---
 *   b  | x | 1
 *   a  | x | 2
 *   b  | y | 3
 *
 * with type=groupbyKey, pivotColumn=A, valueColumn=B, and pivotColumnValues=[x,y]
 *
 * update - In the partial_pivot stage one new column is created for each entry of
 * pivotColumnValues. Each new column holds the value where the pivot column matches that entry
 * and null everywhere else. A Last aggregation (excluding nulls) is then run on these columns
 * grouped by `type`, and the results are converted into an array (as per Spark's expectation).
 * Last excluding nulls works here because there would be at most one entry in each new column
 * when grouped by `type`. After the update aggregation the intermediate result would be
 *
 * type | x | y
 * -----+---+------
 *   b  | 1 | 3
 *   a  | 2 | null
 *
 * merge - this is the final pivot aggregation after shuffle. We do another `Last` aggregation
 * to merge the results. In this example all the data was combined in the partial_pivot hash
 * aggregation, so this stage does not change anything.
 *
 * The final result would be:
 *
 * type | x | y
 * -----+---+------
 *   b  | 1 | 3
 *   a  | 2 | null
 *
 * @param pivotColumn column that determines which output position to put valueColumn in.
 * @param valueColumn the column that is being rearranged.
 * @param pivotColumnValues the list of pivotColumn values in the order of desired output.
 *                          Values not listed here will be ignored.
 */
case class GpuPivotFirst(
    pivotColumn: Expression,
    valueColumn: Expression,
    pivotColumnValues: Seq[Any]) extends GpuAggregateFunction {

  private val valueDataType = valueColumn.dataType

  // A typed null literal, used for every row that does not belong to a given pivot value.
  private def nullValue: GpuLiteral = GpuLiteral(null, valueDataType)

  override def children: Seq[Expression] = Seq(pivotColumn, valueColumn)

  // NOTE(review): evaluateExpression builds an array of the pivoted columns, while dataType
  // reports the element type - confirm that this is intended.
  override val dataType: DataType = valueDataType

  override val nullable: Boolean = false

  override lazy val initialValues: Seq[GpuLiteral] =
    Seq.fill(pivotColumnValues.length)(nullValue)

  // One projected column per requested pivot value.
  override lazy val inputProjection: Seq[Expression] = pivotColumnValues.map {
    case null =>
      GpuIf(GpuIsNull(pivotColumn), valueColumn, nullValue)
    case pivotValue =>
      // Need to use an equal to comparison that is != when both values are NaN to be
      // consistent with Spark's inconsistency with regards to PivotFirst
      GpuIf(GpuEqualToNoNans(pivotColumn, GpuLiteral(pivotValue, pivotColumn.dataType)),
        valueColumn, nullValue)
  }

  // One buffer attribute per pivot value. A null pivot value is named after the string
  // form of a null literal.
  private lazy val pivotColAttr = pivotColumnValues.map { pivotValue =>
    val attrName = if (pivotValue == null) nullValue.toString else pivotValue.toString
    AttributeReference(attrName, valueDataType)()
  }

  override lazy val aggBufferAttributes: Seq[AttributeReference] = pivotColAttr

  override lazy val updateAggregates: Seq[CudfAggregate] =
    pivotColAttr.map(attr => CudfNthLikeAggregate.newLastExcludeNulls(attr.dataType))

  override lazy val mergeAggregates: Seq[CudfAggregate] =
    pivotColAttr.map(attr => CudfNthLikeAggregate.newLastExcludeNulls(attr.dataType))

  override lazy val evaluateExpression: Expression =
    GpuCreateArray(pivotColAttr, false)
}
/**
 * COUNT for regular aggregation, general window aggregation, running windows and
 * unbounded-to-unbounded windows. The result is always a non-nullable LongType.
 *
 * @param children the counted expressions; only the first one is projected
 * @param failOnError handed to the unbounded-to-unbounded fixer; defaults to the session's
 *                    ANSI setting
 */
case class GpuCount(
    children: Seq[Expression],
    failOnError: Boolean = SQLConf.get.ansiEnabled)
  extends GpuAggregateFunction
    with GpuBatchedRunningWindowWithFixer
    with GpuUnboundToUnboundWindowWithFixer
    with GpuAggregateWindowFunction
    with GpuRunningWindowFunction {

  // Copied from Count
  override def dataType: DataType = LongType

  override def nullable: Boolean = false

  // ---- Aggregation buffer ----

  // This is the spark API
  private lazy val count = AttributeReference("count", dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = Seq(count)

  override lazy val initialValues: Seq[GpuLiteral] = Seq(GpuLiteral(0L, LongType))

  override lazy val evaluateExpression: Expression = count

  // ---- Update and merge ----

  // inputAttr
  override lazy val inputProjection: Seq[Expression] = Seq(children.head)

  private lazy val cudfCountUpdate = new CudfCount(IntegerType)

  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(cudfCountUpdate)

  // Integer->Long before we are done with the update aggregate
  override lazy val postUpdate: Seq[Expression] = Seq(GpuCast(cudfCountUpdate.attr, dataType))

  // Partial counts are combined by summing them.
  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(new CudfSum(dataType))

  // ---- GENERAL WINDOW FUNCTION ----

  // countDistinct is not supported for window functions in spark right now.
  // we could support it by doing an `Aggregation.nunique(false)`
  override lazy val windowInputProjection: Seq[Expression] = inputProjection

  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = {
    val (_, columnIndex) = inputs.head
    RollingAggregation.count(NullPolicy.EXCLUDE).onColumn(columnIndex)
  }

  // The output needs to be a long
  override def windowOutput(result: ColumnVector): ColumnVector = result.castTo(DType.INT64)

  // minPeriods should be 0.
  // Consider the following rows:
  //   v = [ 0, 1, 2, 3, 4, 5 ]
  // A `COUNT` window aggregation over (2, -1) should yield 0, not null,
  // for the first row.
  override def getMinPeriods: Int = 0

  // ---- RUNNING WINDOW ----

  override def newFixer(): BatchedRunningWindowFixer =
    new BatchedRunningWindowBinaryFixer(BinaryOp.ADD, "count")

  // Scan and group by scan do not support COUNT with nulls excluded.
  // one of them does not even support count at all, so we are going to SUM
  // ones and zeros based off of the validity
  override def groupByScanInputProjection(isRunningBatched: Boolean): Seq[Expression] = {
    // There can be only one child according to requirements for count right now
    require(children.length == 1)
    val child = children.head
    val contribution: Expression = if (!child.nullable) {
      GpuLiteral(1, IntegerType)
    } else {
      GpuIf(GpuIsNull(child), GpuLiteral(0, IntegerType), GpuLiteral(1, IntegerType))
    }
    Seq(contribution)
  }

  override def groupByScanAggregation(
      isRunningBatched: Boolean): Seq[AggAndReplace[GroupByScanAggregation]] =
    Seq(AggAndReplace(GroupByScanAggregation.sum(), None))

  override def scanInputProjection(isRunningBatched: Boolean): Seq[Expression] =
    groupByScanInputProjection(isRunningBatched)

  override def scanAggregation(isRunningBatched: Boolean): Seq[AggAndReplace[ScanAggregation]] =
    Seq(AggAndReplace(ScanAggregation.sum(), None))

  // The ones and zeros are summed as integers, so the result is widened to a long here.
  override def scanCombine(isRunningBatched: Boolean, cols: Seq[ColumnVector]): ColumnVector =
    cols.head.castTo(DType.INT64)

  // ---- UNBOUNDED TO UNBOUNDED WINDOW ----

  override def newUnboundedToUnboundedFixer: BatchedUnboundedToUnboundedWindowFixer =
    new CountUnboundedToUnboundedFixer(failOnError)
}
object GpuAverage {
  /**
   * Picks the GpuAverage implementation that matches the type of `child`.
   *
   * For a decimal child the intermediate sum gets 10 more digits of precision (bounded to
   * what a decimal can hold). If that sum type has a precision above
   * `Decimal.MAX_LONG_DIGITS` the result is a [[GpuDecimal128Average]], otherwise a
   * [[GpuBasicDecimalAverage]]. Any other child type is summed as a double by
   * [[GpuBasicAverage]].
   */
  def apply(child: Expression): GpuAverage = child.dataType match {
    case DecimalType.Fixed(precision, scale) =>
      DecimalType.bounded(precision + 10, scale) match {
        case wideSum if wideSum.precision > Decimal.MAX_LONG_DIGITS =>
          GpuDecimal128Average(child, wideSum)
        case narrowSum =>
          GpuBasicDecimalAverage(child, narrowSum)
      }
    case _ =>
      GpuBasicAverage(child, DoubleType)
  }
}
/**
 * Shared implementation of AVG. The aggregation buffer holds a `sum` column of
 * `sumDataType` and a `count` column of LongType, and the final value is sum / count.
 */
abstract class GpuAverage(child: Expression, sumDataType: DataType)
  extends GpuAggregateFunction
    with GpuReplaceWindowFunction
    with Serializable {

  // Copied from Average
  override def prettyName: String = "avg"

  override def children: Seq[Expression] = Seq(child)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtilsShims.checkForNumericExpr(child.dataType, "function gpu average")

  override def nullable: Boolean = true

  override val dataType: DataType = DoubleType

  // ---- Aggregation buffer ----

  protected lazy val sum: AttributeReference = AttributeReference("sum", sumDataType)()

  protected lazy val count: AttributeReference = AttributeReference("count", LongType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = Seq(sum, count)

  override lazy val initialValues: Seq[GpuLiteral] =
    Seq(GpuLiteral.default(sumDataType), GpuLiteral(0L, LongType))

  // ---- Update ----

  override lazy val inputProjection: Seq[Expression] = {
    // Replace the nulls with 0s in the SUM column because Spark does not protect against
    // nulls in the merge phase. It does this to be able to detect overflow errors in
    // decimal aggregations. The null gets inserted back in with evaluateExpression where
    // a divide by 0 gets replaced with a null.
    val sumInput = GpuCoalesce(Seq(
      GpuCast(child, sumDataType),
      GpuLiteral.default(sumDataType)))
    // 1 for every non-null input row and 0 for every null one.
    val countInput = GpuCast(GpuIsNotNull(child), LongType)
    Seq(sumInput, countInput)
  }

  // Rows rejected by the filter contribute the default value of each projected column.
  override def filteredInputProjection(filter: Expression): Seq[Expression] =
    for (projected <- inputProjection)
      yield GpuIf(filter, projected, GpuLiteral.default(projected.dataType))

  protected lazy val updateSum = new CudfSum(sumDataType)

  protected lazy val updateCount = new CudfSum(LongType)

  // The count input projection will need to be collected as a sum (of counts) instead of
  // counts (of counts) as the GpuIsNotNull o/p is casted to count=0 for null and 1 otherwise,
  // and the total count can be correctly evaluated only by summing them. eg.
  // avg(col(null, 27)) should be 27, with count column projection as (0, 1) and total count
  // for dividing the average = (0 + 1) and not 2 which is the rowcount of the projected
  // column.
  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(updateSum, updateCount)

  // ---- Merge and final evaluation ----

  protected lazy val mergeSum = new CudfSum(sumDataType)

  protected lazy val mergeCount = new CudfSum(LongType)

  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(mergeSum, mergeCount)

  // NOTE: this sets `failOnError=false` in `GpuDivide` to force it not to throw
  // divide-by-zero exceptions, even when ansi mode is enabled in Spark.
  // This is to conform with Spark's behavior in the Average aggregate function.
  override lazy val evaluateExpression: Expression =
    GpuDivide(sum, GpuCast(count, DoubleType), failOnError = false)

  // ---- Window ----

  // Replace average with SUM/COUNT. This lets us run average in running window mode without
  // recreating everything that would have to go into doing the SUM and the COUNT here.
  override def windowReplacement(spec: GpuWindowSpecDefinition): Expression = {
    val windowedCount = GpuWindowExpression(GpuCount(Seq(child)), spec)
    val windowedSum = GpuWindowExpression(
      GpuSum(GpuCast(child, dataType), dataType, failOnErrorOverride = false), spec)
    GpuDivide(windowedSum, GpuCast(windowedCount, dataType), failOnError = false)
  }
}
/**
 * Average whose intermediate sum has type `dt`. Everything is inherited from [[GpuAverage]].
 */
case class GpuBasicAverage(child: Expression, dt: DataType)
  extends GpuAverage(child, dt)
/**
 * Base for decimal AVG. Adds overflow checks on the decimal sum after update and after
 * merge, and reports a decimal result type instead of a double.
 */
abstract class GpuDecimalAverageBase(child: Expression, sumDataType: DecimalType)
  extends GpuAverage(child, sumDataType) {

  // The result gets 4 more digits of precision and of scale than the input, bounded to what
  // a decimal can hold.
  override val dataType: DecimalType = child.dataType match {
    case DecimalType.Fixed(precision, scale) => DecimalType.bounded(precision + 4, scale + 4)
    case other => throw new IllegalStateException(s"child type $other is not DecimalType")
  }

  // This is here to be bug for bug compatible with Spark. They round in the divide and then
  // cast the result to the final value. This loses some data in many cases and we need to be
  // able to match that. This bug appears to have been fixed in Spark 3.4.0.
  lazy val intermediateSparkDivideType = DecimalDivideChecks.calcOrigSparkOutputType(sumDataType,
    DecimalType.LongDecimal)

  // ---- Update ----

  override lazy val postUpdate: Seq[Expression] = {
    val checkedSum = GpuCheckOverflow(updateSum.attr, sumDataType, nullOnOverflow = true)
    Seq(checkedSum, updateCount.attr)
  }

  // ---- Merge ----

  // To be able to do decimal overflow detection, we need a CudfSum that does **not** ignore
  // nulls. Cudf does not have such an aggregation, so for merge we have to work around that
  // with an extra isOverflow column. We only do this for Decimal because that is the only one
  // that can have a null inserted as a part of overflow checks. Spark does this for all
  // overflow columns.
  override lazy val preMerge: Seq[Expression] = Seq(sum, count, GpuIsNull(sum))

  protected lazy val mergeIsOverflow = new CudfMax(BooleanType)

  override lazy val mergeAggregates: Seq[CudfAggregate] =
    Seq(mergeSum, mergeCount, mergeIsOverflow)

  override lazy val postMerge: Seq[Expression] = {
    // Put the null back in wherever an overflow was recorded, then check the merged sum.
    val sumOrNullOnOverflow =
      GpuIf(mergeIsOverflow.attr, GpuLiteral.create(null, sumDataType), mergeSum.attr)
    Seq(
      GpuCheckOverflow(sumOrNullOnOverflow, sumDataType, nullOnOverflow = true),
      mergeCount.attr)
  }
}
/**
 * Decimal average whose intermediate sum has type `dt`. Everything is inherited from
 * GpuDecimalAverage.
 */
case class GpuBasicDecimalAverage(
    child: Expression,
    dt: DecimalType) extends GpuDecimalAverage(child, dt)
/**
* Average aggregations for DECIMAL128.
*
* To avoid the significantly slower sort-based aggregations in cudf for DECIMAL128 columns,
* the incoming DECIMAL128 values are split into four 32-bit chunks which are summed separately
* into 64-bit intermediate results and then recombined into a 128-bit result with overflow
* checking. See GpuDecimal128Sum for more details.
*
* @param child the expression being averaged
* @param dt the decimal type of the intermediate sum
*/
case class GpuDecimal128Average(child: Expression, dt: DecimalType)
extends GpuDecimalAverage(child, dt) {
// Projects five columns: the four 32-bit chunks of the child cast to `dt`, followed by the
// 0/1 count input. The order has to line up with updateAggregates below.
override lazy val inputProjection: Seq[Expression] = {
// Replace the nulls with 0s in the SUM column because Spark does not protect against
// nulls in the merge phase. It does this to be able to detect overflow errors in
// decimal aggregations. The null gets inserted back in with evaluateExpression where
// a divide by 0 gets replaced with a null.
val chunks = (0 until 4).map { chunkIdx =>
// The chunk is extracted with its nulls intact and the coalesce then swaps them for the
// default value of the chunk's type.
val extract = GpuExtractChunk32(GpuCast(child, dt), chunkIdx, replaceNullsWithZero = false)
GpuCoalesce(Seq(extract, GpuLiteral.default(extract.dataType)))
}
// 1 for every non-null input row and 0 for every null one.
val forCount = GpuCast(GpuIsNotNull(child), LongType)
chunks :+ forCount
}
// One LongType sum per chunk.
private lazy val updateSumChunks = (0 until 4).map(_ => new CudfSum(LongType))
override lazy val updateAggregates: Seq[CudfAggregate] = updateSumChunks :+ updateCount
// Reassembles the four chunk sums into a single value of type `dt`, runs the overflow check
// on it, and passes the count through unchanged.
override lazy val postUpdate: Seq[Expression] = {
val assembleExpr = GpuAssembleSumChunks(updateSumChunks.map(_.attr), dt, nullOnOverflow = true)
Seq(GpuCheckOverflow(assembleExpr, dt, nullOnOverflow = true), updateCount.attr)
}
// To be able to do decimal overflow detection, we need a CudfSum that does **not** ignore nulls.
// Cudf does not have such an aggregation, so for merge we have to work around that with an extra
// isOverflow column. We only do this for Decimal because that is the only one that can have a
// null inserted as a part of overflow checks. Spark does this for all overflow columns.
// Projects six columns: the four chunks of the buffered sum, the count, and isNull(sum).
override lazy val preMerge: Seq[Expression] = {
val chunks = (0 until 4).map(GpuExtractChunk32(sum, _, replaceNullsWithZero = false))
chunks ++ Seq(count, GpuIsNull(sum))
}
private lazy val mergeSumChunks = (0 until 4).map(_ => new CudfSum(LongType))
override lazy val mergeAggregates: Seq[CudfAggregate] =
mergeSumChunks ++ Seq(mergeCount, mergeIsOverflow)
// Reassembles the merged chunks, substitutes null wherever mergeIsOverflow is set, and then
// runs the overflow check on the result.
override lazy val postMerge: Seq[Expression] = {
val assembleExpr = GpuAssembleSumChunks(mergeSumChunks.map(_.attr), dt, nullOnOverflow = true)
Seq(
GpuCheckOverflow(GpuIf(mergeIsOverflow.attr,
GpuLiteral.create(null, dt),
assembleExpr), dt, nullOnOverflow = true),
mergeCount.attr)
}
}
/*
 * First/Last are "good enough" for the hash aggregate, and should only be used when we
 * want to collapse to a grouped key. The hash aggregate doesn't make guarantees on the
 * ordering of how batches are processed, so this is as good as picking any old function
 * (we picked max)
 *
 * These functions have an extra field they expect to be around in the aggregation buffer.
 * So this adds a "max" of that, and currently sends it to the GPU. The CPU version uses it
 * to check if the value was set (if we don't ignore nulls, valueSet is true, that's what we do
 * here).
 */
/**
 * FIRST for regular aggregation, general window aggregation and running windows.
 *
 * @param child the expression to take the first value of
 * @param ignoreNulls when true the null-excluding aggregations are used, otherwise the
 *                    null-including ones
 */
case class GpuFirst(child: Expression, ignoreNulls: Boolean)
  extends GpuAggregateFunction
    with GpuBatchedRunningWindowWithFixer
    with GpuAggregateWindowFunction
    with GpuDeterministicFirstLastCollectShim
    with ImplicitCastInputTypes
    with Serializable {

  private lazy val cudfFirst = AttributeReference("first", child.dataType)()
  private lazy val valueSet = AttributeReference("valueSet", BooleanType)()

  // NOTE(review): the valueSet flag projected here is `ignoreNulls`, while GpuLast in this
  // file projects `!ignoreNulls` and the comment above says valueSet is true when nulls are
  // not ignored. evaluateExpression below never reads valueSet, so results computed here are
  // unaffected, but confirm which polarity is intended.
  override lazy val inputProjection: Seq[Expression] =
    Seq(child, GpuLiteral(ignoreNulls, BooleanType))

  // The same pair of aggregations (value, valueSet) serves both update and merge.
  private lazy val commonExpressions: Seq[CudfAggregate] = if (ignoreNulls) {
    Seq(CudfNthLikeAggregate.newFirstExcludeNulls(cudfFirst.dataType),
      CudfNthLikeAggregate.newFirstExcludeNulls(valueSet.dataType))
  } else {
    Seq(CudfNthLikeAggregate.newFirstIncludeNulls(cudfFirst.dataType),
      CudfNthLikeAggregate.newFirstIncludeNulls(valueSet.dataType))
  }

  // Expected input data type.
  override lazy val initialValues: Seq[GpuLiteral] = Seq(
    GpuLiteral(null, child.dataType),
    GpuLiteral(false, BooleanType))
  override lazy val updateAggregates: Seq[CudfAggregate] = commonExpressions
  override lazy val mergeAggregates: Seq[CudfAggregate] = commonExpressions
  override lazy val evaluateExpression: Expression = cudfFirst
  override lazy val aggBufferAttributes: Seq[AttributeReference] = cudfFirst :: valueSet :: Nil

  // Copied from First
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, BooleanType)
  override def children: Seq[Expression] = child :: Nil
  override def nullable: Boolean = true
  override def dataType: DataType = child.dataType

  // The suffix needs an explicit empty else branch: a one-armed `if` evaluates to Unit when
  // the condition is false, which the interpolator would render as "()".
  override def toString: String = {
    val ignoreNullsSuffix = if (ignoreNulls) " ignore nulls" else ""
    s"gpufirst($child)$ignoreNullsSuffix"
  }

  override def checkInputDataTypes(): TypeCheckResult = {
    val defaultCheck = super.checkInputDataTypes()
    if (defaultCheck.isFailure) {
      defaultCheck
    } else {
      TypeCheckSuccess
    }
  }

  // GENERAL WINDOW FUNCTION
  override lazy val windowInputProjection: Seq[Expression] = inputProjection

  // Index 0 selects the first row of the window.
  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn =
    RollingAggregation.nth(0, if (ignoreNulls) NullPolicy.EXCLUDE else NullPolicy.INCLUDE)
      .onColumn(inputs.head._2)

  override def newFixer(): BatchedRunningWindowFixer =
    new FirstRunningWindowFixer(ignoreNulls)
}
/**
 * LAST for regular aggregation, general window aggregation and running windows.
 *
 * @param child the expression to take the last value of
 * @param ignoreNulls when true the null-excluding aggregations are used, otherwise the
 *                    null-including ones
 */
case class GpuLast(child: Expression, ignoreNulls: Boolean)
  extends GpuAggregateFunction
    with GpuBatchedRunningWindowWithFixer
    with GpuAggregateWindowFunction
    with GpuDeterministicFirstLastCollectShim
    with ImplicitCastInputTypes
    with Serializable {

  private lazy val cudfLast = AttributeReference("last", child.dataType)()
  private lazy val valueSet = AttributeReference("valueSet", BooleanType)()

  // The second column is the valueSet flag: true when nulls are not being ignored.
  override lazy val inputProjection: Seq[Expression] =
    Seq(child, GpuLiteral(!ignoreNulls, BooleanType))

  // The same pair of aggregations (value, valueSet) serves both update and merge.
  private lazy val commonExpressions: Seq[CudfAggregate] = if (ignoreNulls) {
    Seq(CudfNthLikeAggregate.newLastExcludeNulls(cudfLast.dataType),
      CudfNthLikeAggregate.newLastExcludeNulls(valueSet.dataType))
  } else {
    Seq(CudfNthLikeAggregate.newLastIncludeNulls(cudfLast.dataType),
      CudfNthLikeAggregate.newLastIncludeNulls(valueSet.dataType))
  }

  override lazy val initialValues: Seq[GpuLiteral] = Seq(
    GpuLiteral(null, child.dataType),
    GpuLiteral(false, BooleanType))
  override lazy val updateAggregates: Seq[CudfAggregate] = commonExpressions
  override lazy val mergeAggregates: Seq[CudfAggregate] = commonExpressions
  override lazy val evaluateExpression: Expression = cudfLast
  override lazy val aggBufferAttributes: Seq[AttributeReference] = cudfLast :: valueSet :: Nil

  // Copied from Last
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, BooleanType)
  override def children: Seq[Expression] = child :: Nil
  override def nullable: Boolean = true
  override def dataType: DataType = child.dataType

  // The suffix needs an explicit empty else branch: a one-armed `if` evaluates to Unit when
  // the condition is false, which the interpolator would render as "()".
  override def toString: String = {
    val ignoreNullsSuffix = if (ignoreNulls) " ignore nulls" else ""
    s"gpulast($child)$ignoreNullsSuffix"
  }

  override def checkInputDataTypes(): TypeCheckResult = {
    val defaultCheck = super.checkInputDataTypes()
    if (defaultCheck.isFailure) {
      defaultCheck
    } else {
      TypeCheckSuccess
    }
  }

  // GENERAL WINDOW FUNCTION
  override lazy val windowInputProjection: Seq[Expression] = inputProjection

  // Index -1 selects the last row of the window.
  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn =
    RollingAggregation.nth(-1, if (ignoreNulls) NullPolicy.EXCLUDE else NullPolicy.INCLUDE)
      .onColumn(inputs.head._2)

  override def newFixer(): BatchedRunningWindowFixer = new LastRunningWindowFixer(ignoreNulls)
}
/**
 * GPU implementation of the `nth_value` window function.
 *
 * The rolling aggregation is cudf's `nth` with the zero-based index `offset - 1`. Batched
 * running-window fixup is only available when the offset literal is 1 (handled by
 * `FirstRunningWindowFixer`) or -1 (handled by `LastRunningWindowFixer`); see `canFixUp`.
 *
 * @param child       the expression whose N-th value is returned
 * @param offset      the N of `nth_value`; evaluated eagerly as an `Int`
 * @param ignoreNulls when true, nulls are excluded by the underlying cudf aggregation
 */
case class GpuNthValue(child: Expression, offset: Expression, ignoreNulls: Boolean)
  extends GpuAggregateWindowFunction
  with GpuBatchedRunningWindowWithFixer // Only if N == 1 or N == -1, see canFixUp.
  with ImplicitCastInputTypes
  with Serializable {

  // offset is foldable, get value as Spark does
  private lazy val offsetVal = offset.eval().asInstanceOf[Int]

  // Copied from First
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, BooleanType)
  override def children: Seq[Expression] = child :: Nil
  override def nullable: Boolean = true
  override def dataType: DataType = child.dataType

  override def toString: String = {
    // An `if` without an `else` evaluates to `()` when the condition is false, which would be
    // rendered literally as "()" by the interpolator. Use an explicit empty suffix instead.
    val ignoreNullsSuffix = if (ignoreNulls) " ignore nulls" else ""
    s"gpu_nth_value($child, $offset)$ignoreNullsSuffix"
  }

  override def checkInputDataTypes(): TypeCheckResult = {
    val defaultCheck = super.checkInputDataTypes()
    if (defaultCheck.isFailure) {
      defaultCheck
    } else {
      TypeCheckSuccess
    }
  }

  // GENERAL WINDOW FUNCTION
  override lazy val windowInputProjection: Seq[Expression] =
    Seq(child)

  /**
   * Rolling `nth` aggregation on the column index of the first input.
   *
   * NOTE(review): for N == -1 this passes index -2 to cudf, whereas GpuLast passes -1.
   * Confirm that a negative N can never reach this method.
   *
   * @param inputs pairs of (column, column index); only the index of the first pair is used
   */
  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn =
    RollingAggregation.nth(offsetVal - 1,
      if (ignoreNulls) NullPolicy.EXCLUDE else NullPolicy.INCLUDE)
      .onColumn(inputs.head._2)

  /** The offset as an integer literal, or None when it is not an `IntegerType` literal. */
  private[this] def getN: Option[Int] = GpuOverrides.extractLit(offset) match {
    // Only Integer literals are supported for N.
    case Some(Literal(value: Int, IntegerType)) => Some(value)
    case _ => None
  }

  override def canFixUp: Boolean = {
    getN match {
      case Some(1) => true // First is supported.
      case Some(-1) => true // Last is also supported.
      case _ => false // No other index is currently supported for fixup.
    }
  }

  override def newFixer(): BatchedRunningWindowFixer = {
    assert(canFixUp, "NthValue fixup can only be done when offset is 1 or -1.")
    getN match {
      case Some(1) => new FirstRunningWindowFixer(ignoreNulls)
      case _ => new LastRunningWindowFixer(ignoreNulls)
    }
  }
}
/**
 * Members shared by the GPU collect aggregations (see GpuCollectList and GpuCollectSet).
 *
 * The result is a non-nullable array of the child's type whose elements are declared
 * non-null. The aggregation starts from an empty array, and the child is the single input
 * for both the regular and the window code paths.
 */
trait GpuCollectBase
  extends GpuAggregateFunction
  with GpuDeterministicFirstLastCollectShim
  with GpuAggregateWindowFunction {

  /** The expression whose values are collected. */
  def child: Expression

  override def nullable: Boolean = false

  override def dataType: DataType = ArrayType(child.dataType, containsNull = false)

  override def children: Seq[Expression] = Seq(child)

  // WINDOW FUNCTION
  override val windowInputProjection: Seq[Expression] = child :: Nil

  // The initial buffer value is an empty array of the result type.
  override val initialValues: Seq[Expression] =
    Seq(GpuLiteral.create(new GenericArrayData(Array.empty[Any]), dataType))

  override val inputProjection: Seq[Expression] = child :: Nil

  // Single buffer attribute; implementations in this file expose it as both the
  // aggregation buffer and the final result.
  protected final lazy val outputBuf: AttributeReference =
    AttributeReference("inputBuf", dataType)()
}
/**
 * GPU `collect_list`: gathers the input values into a list, keeping duplicates.
 *
 * The two 'offset' parameters are ignored by the GPU implementation. They exist only to stay
 * compatible with the CPU version and with automated checks.
 */
case class GpuCollectList(
    child: Expression,
    mutableAggBufferOffset: Int = 0,
    inputAggBufferOffset: Int = 0)
  extends GpuCollectBase {

  override def prettyName: String = "collect_list"

  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(new CudfCollectList(dataType))

  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(new CudfMergeLists(dataType))

  override lazy val evaluateExpression: Expression = outputBuf

  override def aggBufferAttributes: Seq[AttributeReference] = Seq(outputBuf)

  /**
   * Rolling collect-list aggregation on the column index of the first input.
   *
   * @param inputs pairs of (column, column index); only the index of the first pair is used
   */
  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = {
    val (_, columnIndex) = inputs.head
    RollingAggregation.collectList().onColumn(columnIndex)
  }

  // minPeriods must be 0.
  // Take the rows v = [ 0, 1, 2, 3, 4, 5 ]: a `COLLECT_LIST` window aggregation over (2, -1)
  // has to produce an empty array [] for the first row, not null.
  override def getMinPeriods: Int = 0
}
/**
 * GPU `collect_set`: gathers the distinct input values into a set.
 *
 * The two 'offset' parameters are ignored by the GPU implementation. They exist only to stay
 * compatible with the CPU version and with automated checks.
 */
case class GpuCollectSet(
    child: Expression,
    mutableAggBufferOffset: Int = 0,
    inputAggBufferOffset: Int = 0)
  extends GpuCollectBase with GpuUnboundedToUnboundedWindowAgg {

  override def prettyName: String = "collect_set"

  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(new CudfCollectSet(dataType))

  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(new CudfMergeSets(dataType))

  override lazy val evaluateExpression: Expression = outputBuf

  override def aggBufferAttributes: Seq[AttributeReference] = Seq(outputBuf)

  /**
   * Rolling collect-set aggregation on the column index of the first input.
   *
   * Spark treats NaN equality differently for top-level float/double values and for
   * float/double values nested inside other types: top-level NaNs are considered unequal,
   * while NaNs inside nested types are considered equal on the CPU. The NaN equality is
   * therefore picked from the child's data type.
   *
   * @param inputs pairs of (column, column index); only the index of the first pair is used
   */
  override def windowAggregation(
      inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = {
    val nanEquality = child.dataType match {
      case FloatType | DoubleType => NaNEquality.UNEQUAL
      case _ => NaNEquality.ALL_EQUAL
    }
    RollingAggregation
      .collectSet(NullPolicy.EXCLUDE, NullEquality.EQUAL, nanEquality)
      .onColumn(inputs.head._2)
  }

  // minPeriods must be 0.
  // Take the rows v = [ 0, 1, 2, 3, 4, 5 ]: a `COLLECT_SET` window aggregation over (2, -1)
  // has to produce an empty array [] for the first row, not null.
  override def getMinPeriods: Int = 0
}
/**
 * Builds the CPU-to-GPU buffer transition for collect aggregations.
 *
 * @param elementType element type of the collected array
 */
class CpuToGpuCollectBufferConverter(
    elementType: DataType) extends CpuToGpuAggregateBufferConverter {

  /** Wraps `child` in a [[CpuToGpuCollectBufferTransition]] for this element type. */
  def createExpression(child: Expression): CpuToGpuBufferTransition =
    CpuToGpuCollectBufferTransition(child, elementType)
}
/**
 * Turns the binary aggregation buffer of the CPU collect aggregations into array data.
 *
 * @param child       expression producing the binary buffer
 * @param elementType element type of the collected array
 */
case class CpuToGpuCollectBufferTransition(
    override val child: Expression,
    private val elementType: DataType) extends CpuToGpuBufferTransition {

  // Single-field row that is re-pointed at each incoming buffer.
  private lazy val row = new UnsafeRow(1)

  override def dataType: DataType = ArrayType(elementType, containsNull = false)

  /**
   * Mirrors the deserialize method of Collect: the incoming bytes are the binary view of an
   * UnsafeRow holding exactly one field, an array of `elementType`. Because that array layout
   * already matches the GPU format, no in-memory conversion is required. The bytes are bound
   * to the reused UnsafeRow and its only field is read back as ArrayData.
   */
  override protected def nullSafeEval(input: Any): ArrayData = {
    val serialized = input.asInstanceOf[Array[Byte]]
    row.pointTo(serialized, serialized.length)
    val collected = row.getArray(0)
    // Copy so the result does not alias the reused row.
    collected.copy()
  }
}
/** Builds the GPU-to-CPU buffer transition for collect aggregations. */
class GpuToCpuCollectBufferConverter extends GpuToCpuAggregateBufferConverter {

  /** Wraps `child` in a [[GpuToCpuCollectBufferTransition]]. */
  def createExpression(child: Expression): GpuToCpuBufferTransition =
    GpuToCpuCollectBufferTransition(child)
}
/**
 * Turns the array produced by the GPU collect aggregations into the binary buffer format
 * used by the CPU collect aggregations.
 *
 * @param child expression producing the array data
 */
case class GpuToCpuCollectBufferTransition(
    override val child: Expression) extends GpuToCpuBufferTransition {

  // Projects a single-field row of the child's type into an UnsafeRow.
  private lazy val projection = UnsafeProjection.create(Array(child.dataType))

  /**
   * Mirrors the serialize method of Collect: the result is the binary view of an UnsafeRow
   * holding exactly one field, an array of the element type. As in Collect.serialize, an
   * UnsafeProjection converts the ArrayData into that single-field UnsafeRow. Unlike
   * Collect.serialize, there is no need to build the ArrayData from an on-heap array first,
   * because the input already arrives as ArrayData (UnsafeArrayData).
   */
  override protected def nullSafeEval(input: Any): Array[Byte] = {
    val singleFieldRow = InternalRow(input.asInstanceOf[ArrayData])
    projection(singleFieldRow).getBytes
  }
}
/**
 * Common base of the GPU standard deviation and variance aggregations.
 * It is the GPU counterpart of Spark's 'CentralMomentAgg' aggregation class with the
 * 'momentOrder' variable fixed to '2'.
 *
 * The aggregation buffer consists of three non-nullable Double attributes: `n`, `avg`, `m2`.
 */
abstract class GpuM2(child: Expression, nullOnDivideByZero: Boolean)
  extends GpuAggregateFunction with ImplicitCastInputTypes with Serializable {

  override def children: Seq[Expression] = child :: Nil
  override def dataType: DataType = DoubleType
  override def nullable: Boolean = true
  override def inputTypes: Seq[AbstractDataType] = NumericType :: Nil

  /** Result for a division by zero: null when `nullOnDivideByZero` is set, NaN otherwise. */
  protected def divideByZeroEvalResult: Expression =
    GpuLiteral(if (nullOnDivideByZero) null else Double.NaN, DoubleType)

  override lazy val initialValues: Seq[GpuLiteral] =
    Seq(GpuLiteral(0.0), GpuLiteral(0.0), GpuLiteral(0.0))

  // The child is projected three times, once for each update aggregate.
  override lazy val inputProjection: Seq[Expression] = Seq(child, child, child)

  // cudf aggregates
  lazy val cudfCountN: CudfAggregate = new CudfCount(IntegerType)
  lazy val cudfMean: CudfMean = new CudfMean
  lazy val cudfM2: CudfM2 = new CudfM2

  // The local update computes all 3 aggregates: n, avg, m2.
  override lazy val updateAggregates: Seq[CudfAggregate] = Seq(cudfCountN, cudfMean, cudfM2)

  // The count coming out of the update step is of Integer type, while Spark expects the
  // `n` buffer to be a `DoubleType`, so it is cast to `DoubleType` here.
  //
  // In the future, when CudfM2 is changed to output all the buffers at once, bufferN must be
  // made a LongType.
  //
  // The avg and m2 outputs of libcudf's M2 aggregate are nullable, whereas Spark's
  // corresponding buffers must be non-nullable. Those nulls are therefore replaced with
  // Double(0.0) in this postUpdate step. This does not affect the outcome of the merge step.
  override lazy val postUpdate: Seq[Expression] = {
    val avgOrZero = GpuCoalesce(Seq(cudfMean.attr, GpuLiteral(0.0, DoubleType)))
    val m2OrZero = GpuCoalesce(Seq(cudfM2.attr, GpuLiteral(0.0, DoubleType)))
    Seq(GpuCast(cudfCountN.attr, DoubleType), avgOrZero, m2OrZero)
  }

  protected lazy val bufferN: AttributeReference =
    AttributeReference("n", DoubleType, nullable = false)()
  protected lazy val bufferAvg: AttributeReference =
    AttributeReference("avg", DoubleType, nullable = false)()
  protected lazy val bufferM2: AttributeReference =
    AttributeReference("m2", DoubleType, nullable = false)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] =
    Seq(bufferN, bufferAvg, bufferM2)

  // Merging is done with libcudf's native MERGE_M2 aggregate, which accepts only a single
  // input column, so the 3 buffer columns are first combined into one structs column.
  //
  // `n` is cast to Integer because that is what MERGE_M2 expects. Spark keeps `n` as Double,
  // so it has to be cast back to Double after merging.
  // In the future, CudfMergeM2 should be rewritten to accept and output `n` in Double type.
  override lazy val preMerge: Seq[Expression] = {
    val namesAndValues: Seq[Expression] = Seq(
      GpuLiteral("n", StringType), GpuCast(bufferN, IntegerType),
      GpuLiteral("avg", StringType), bufferAvg,
      GpuLiteral("m2", StringType), bufferM2)
    Seq(GpuCreateNamedStruct(namesAndValues))
  }

  private lazy val mergeM2 = new CudfMergeM2

  override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(mergeM2)

  // The postMerge step pulls the 3 columns (n, avg, m2) back out of the structs column
  // produced by the merge step. The first one is cast to Double to match Spark.
  //
  // In the future, when CudfMergeM2 is rewritten, it will need to output `n` in Double type.
  override lazy val postMerge: Seq[Expression] = {
    // Field `ordinal` of the merged struct, cast to Double.
    def mergedField(ordinal: Int): Expression =
      GpuCast(GpuGetStructField(mergeM2.attr, ordinal), DoubleType)
    // Replaces nulls with 0.0.
    def zeroIfNull(value: Expression): Expression =
      GpuCoalesce(Seq(value, GpuLiteral(0.0, DoubleType)))

    Seq(mergedField(0), zeroIfNull(mergedField(1)), zeroIfNull(mergedField(2)))
  }
}
/**
 * GPU `stddev_pop`: sqrt(m2 / n), or null when n == 0.
 */
case class GpuStddevPop(child: Expression, nullOnDivideByZero: Boolean)
  extends GpuM2(child, nullOnDivideByZero) {

  override def prettyName: String = "stddev_pop"

  override lazy val evaluateExpression: Expression = {
    // stddev_pop = sqrt(m2 / n).
    val variance = GpuDivide(bufferM2, bufferN, failOnError = false)
    val populationStddev = GpuSqrt(variance)
    // Rows with n == 0 evaluate to null.
    val isEmpty = GpuEqualTo(bufferN, GpuLiteral(0.0))
    GpuIf(isEmpty, GpuLiteral(null, DoubleType), populationStddev)
  }
}
/**
 * Window function that runs cudf's rolling standard deviation aggregation on `child`.
 *
 * `nullOnDivideByZero` is not read inside this class.
 */
case class WindowStddevSamp(
    child: Expression,
    nullOnDivideByZero: Boolean)
  extends GpuAggregateWindowFunction {

  override def dataType: DataType = DoubleType
  override def children: Seq[Expression] = child :: Nil
  override def nullable: Boolean = true

  /**
   * Using child references, define the shape of the vectors sent to the window operations
   */
  override val windowInputProjection: Seq[Expression] = child :: Nil

  /**
   * Rolling standard deviation on the column index of the first input.
   *
   * @param inputs pairs of (column, column index); only the index of the first pair is used
   */
  override def windowAggregation(inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = {
    val (_, columnIndex) = inputs.head
    RollingAggregation.standardDeviation().onColumn(columnIndex)
  }
}
/**
 * GPU `stddev_samp`: sqrt(m2 / (n - 1.0)).
 *
 * The result is null when n == 0, and null or NaN (see `divideByZeroEvalResult`) when n == 1.
 * When used as a window function it is replaced by an expression built on
 * [[WindowStddevSamp]] and a windowed count.
 */
case class GpuStddevSamp(child: Expression, nullOnDivideByZero: Boolean)
  extends GpuM2(child, nullOnDivideByZero) with GpuReplaceWindowFunction {

  override def prettyName: String = "stddev_samp"

  override lazy val evaluateExpression: Expression = {
    // stddev_samp = sqrt(m2 / (n - 1.0)).
    val numerator = bufferM2
    val denominator = GpuSubtract(bufferN, GpuLiteral(1.0), failOnError = false)
    val sampleStddev = GpuSqrt(GpuDivide(numerator, denominator, failOnError = false))
    // n == 0 yields null.
    val nullWhenEmpty =
      GpuIf(GpuEqualTo(bufferN, GpuLiteral(0.0)), GpuLiteral(null, DoubleType), sampleStddev)
    // n == 1 yields null (or NaN).
    GpuIf(GpuEqualTo(bufferN, GpuLiteral(1.0)), divideByZeroEvalResult, nullWhenEmpty)
  }

  /**
   * Builds the window replacement: null when the windowed count is 0, the divide-by-zero
   * result when it is 1, and the windowed standard deviation otherwise.
   *
   * @param spec the window specification applied to both the count and the standard deviation
   */
  override def windowReplacement(spec: GpuWindowSpecDefinition): Expression = {
    // n, as a Double
    val n = GpuCast(GpuWindowExpression(GpuCount(Seq(child)), spec), DoubleType)
    val windowedStddev = GpuWindowExpression(WindowStddevSamp(child, nullOnDivideByZero), spec)
    // n == 1.0 ? divideByZeroEval : stddev
    val whenNotEmpty =
      GpuIf(GpuEqualTo(n, GpuLiteral(1.0)), divideByZeroEvalResult, windowedStddev)
    // n == 0.0 ? null : (the above)
    GpuIf(GpuEqualTo(n, GpuLiteral(0.0)), GpuLiteral(null, DoubleType), whenNotEmpty)
  }
}
/**
 * GPU `var_pop`: m2 / n, or null when n == 0.
 */
case class GpuVariancePop(child: Expression, nullOnDivideByZero: Boolean)
  extends GpuM2(child, nullOnDivideByZero) {

  override def prettyName: String = "var_pop"

  override lazy val evaluateExpression: Expression = {
    // var_pop = m2 / n.
    val populationVariance = GpuDivide(bufferM2, bufferN, failOnError = false)
    // Rows with n == 0 evaluate to null.
    val isEmpty = GpuEqualTo(bufferN, GpuLiteral(0.0))
    GpuIf(isEmpty, GpuLiteral(null, DoubleType), populationVariance)
  }
}
/**
 * GPU `var_samp`: m2 / (n - 1.0).
 *
 * The result is null when n == 0, and null or NaN (see `divideByZeroEvalResult`) when n == 1.
 */
case class GpuVarianceSamp(child: Expression, nullOnDivideByZero: Boolean)
  extends GpuM2(child, nullOnDivideByZero) {

  override def prettyName: String = "var_samp"

  override lazy val evaluateExpression: Expression = {
    // var_samp = m2 / (n - 1.0).
    val numerator = bufferM2
    val denominator = GpuSubtract(bufferN, GpuLiteral(1.0), failOnError = false)
    val sampleVariance = GpuDivide(numerator, denominator, failOnError = false)
    // n == 0 yields null.
    val nullWhenEmpty =
      GpuIf(GpuEqualTo(bufferN, GpuLiteral(0.0)), GpuLiteral(null, DoubleType), sampleVariance)
    // n == 1 yields null (or NaN).
    GpuIf(GpuEqualTo(bufferN, GpuLiteral(1.0)), divideByZeroEvalResult, nullWhenEmpty)
  }
}