/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids.window
import java.util.concurrent.TimeUnit
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import ai.rapids.cudf
import ai.rapids.cudf.{AggregationOverWindow, DType, GroupByOptions, GroupByScanAggregation, NullPolicy, ReplacePolicy, ReplacePolicyWithColumn, Scalar, ScanAggregation, ScanType, Table, WindowOptions}
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import com.nvidia.spark.rapids.shims.GpuWindowUtil
import org.apache.spark.sql.catalyst.expressions.{Expression, FrameType, RangeFrame, RowFrame, SortOrder}
import org.apache.spark.sql.types.{ByteType, CalendarIntervalType, DataType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType}
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
import org.apache.spark.unsafe.types.CalendarInterval
/**
* For Scan and GroupBy Scan aggregations, nulls are not always treated the same way as they are
* in window operations. Often we have to run a post-processing step and replace them. This
* groups those two together so we can have a complete picture of how to perform these types of
* aggregations.
*/
case class AggAndReplace[T](agg: T, nullReplacePolicy: Option[ReplacePolicy])
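// Illustrative example (not from the original source): a running-window SUM might pair a sum
// scan with a PRECEDING null-replacement policy, roughly
// AggAndReplace(ScanAggregation.sum(), Some(ReplacePolicy.PRECEDING)),
// while an aggregation whose scan already produces the desired nulls would use None for the policy.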
/**
* Represents a window function and the locations of its deduplicated inputs after the initial
* projection.
*/
case class BoundGpuWindowFunction(
windowFunc: GpuWindowFunction,
boundInputLocations: Array[Int]) {
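// Illustrative example: if the deduplicated initial projection produces columns
// [partCol, orderCol, valueCol] and this window function reads only valueCol, then
// boundInputLocations would be Array(2), i.e. the offsets of its inputs in that projection.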
/**
* Get the operations to perform a scan aggregation.
* @param isRunningBatched is this for a batched running window operation?
* @return the sequence of aggregation operations to perform. There will be one `AggAndReplace`
* for each value in `boundInputLocations` so that they can be zipped together.
*/
def scan(isRunningBatched: Boolean): Seq[AggAndReplace[ScanAggregation]] = {
val aggFunc = windowFunc.asInstanceOf[GpuRunningWindowFunction]
aggFunc.scanAggregation(isRunningBatched)
}
/**
* Get the operations to perform a group by scan aggregation.
* @param isRunningBatched is this for a batched running window operation?
* @return the sequence of aggregation operations to perform. There will be one `AggAndReplace`
* for each value in `boundInputLocations` so that they can be zipped together.
*/
def groupByScan(isRunningBatched: Boolean): Seq[AggAndReplace[GroupByScanAggregation]] = {
val aggFunc = windowFunc.asInstanceOf[GpuRunningWindowFunction]
aggFunc.groupByScanAggregation(isRunningBatched)
}
/**
* After a scan or group by scan, if there are multiple columns they need to be combined
* into a single final output column. This does that job.
* @param isRunningBatched is this for a batched running window operation?
* @param cols the columns to be combined. This should not close them.
* @return a single result column.
*/
def scanCombine(isRunningBatched: Boolean,
cols: Seq[cudf.ColumnVector]): cudf.ColumnVector = {
val aggFunc = windowFunc.asInstanceOf[GpuRunningWindowFunction]
aggFunc.scanCombine(isRunningBatched, cols)
}
def aggOverWindow(cb: ColumnarBatch,
windowOpts: WindowOptions): AggregationOverWindow = {
val aggFunc = windowFunc.asInstanceOf[GpuAggregateWindowFunction]
val inputs = boundInputLocations.map { pos =>
(cb.column(pos).asInstanceOf[GpuColumnVector].getBase, pos)
}
aggFunc.windowAggregation(inputs).overWindow(windowOpts)
}
def windowOutput(cv: cudf.ColumnVector): cudf.ColumnVector = {
val aggFunc = windowFunc.asInstanceOf[GpuAggregateWindowFunction]
aggFunc.windowOutput(cv)
}
val dataType: DataType = windowFunc.dataType
}
/**
* Abstraction for possible range-boundary specifications.
*
* This provides type disjunction for Long, BigInt and Double,
* the three types that might represent a range boundary.
*/
abstract class RangeBoundaryValue {
def long: Long = RangeBoundaryValue.long(this)
def bigInt: BigInt = RangeBoundaryValue.bigInt(this)
def double: Double = RangeBoundaryValue.double(this)
}
case class LongRangeBoundaryValue(value: Long) extends RangeBoundaryValue
case class BigIntRangeBoundaryValue(value: BigInt) extends RangeBoundaryValue
case class DoubleRangeBoundaryValue(value: Double) extends RangeBoundaryValue
object RangeBoundaryValue {
def long(boundary: RangeBoundaryValue): Long = boundary match {
case LongRangeBoundaryValue(l) => l
case other => throw new NoSuchElementException(s"Cannot get `long` from $other")
}
def bigInt(boundary: RangeBoundaryValue): BigInt = boundary match {
case BigIntRangeBoundaryValue(b) => b
case other => throw new NoSuchElementException(s"Cannot get `bigInt` from $other")
}
def double(boundary: RangeBoundaryValue): Double = boundary match {
case DoubleRangeBoundaryValue(d) => d
case other => throw new NoSuchElementException(s"Cannot get `double` from $other")
}
def long(value: Long): LongRangeBoundaryValue = LongRangeBoundaryValue(value)
def bigInt(value: BigInt): BigIntRangeBoundaryValue = BigIntRangeBoundaryValue(value)
def double(value: Double): DoubleRangeBoundaryValue = DoubleRangeBoundaryValue(value)
}
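// For example: RangeBoundaryValue.long(5L).long returns 5L, whereas
// RangeBoundaryValue.long(5L).double throws NoSuchElementException because the boundary does
// not hold a Double.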
case class ParsedBoundary(isUnbounded: Boolean, value: RangeBoundaryValue)
object GroupedAggregations {
/**
* Get the window options for an aggregation.
* @param orderSpec the order by spec
* @param orderPositions the positions of the order by columns
* @param minPeriods the minimum number of rows required in a window to produce a result
* @param frame the frame to translate
* @return the options to use when doing the aggregation.
*/
private def getWindowOptions(
orderSpec: Seq[SortOrder],
orderPositions: Seq[Int],
frame: GpuSpecifiedWindowFrame,
minPeriods: Int): WindowOptions = {
frame.frameType match {
case RowFrame =>
withResource(getRowBasedLower(frame)) { lower =>
withResource(getRowBasedUpper(frame)) { upper =>
val builder = WindowOptions.builder().minPeriods(minPeriods)
if (isUnbounded(frame.lower)) builder.unboundedPreceding() else builder.preceding(lower)
if (isUnbounded(frame.upper)) builder.unboundedFollowing() else builder.following(upper)
builder.build
}
}
case RangeFrame =>
// This gets to be a little more complicated
// We only support a single column to order by right now, so just verify that.
require(orderSpec.length == 1)
require(orderPositions.length == orderSpec.length)
val orderExpr = orderSpec.head
// We only support basic types for now too
val orderType = GpuColumnVector.getNonNestedRapidsType(orderExpr.dataType)
val orderByIndex = orderPositions.head
val lower = getRangeBoundaryValue(frame.lower, orderType)
val upper = getRangeBoundaryValue(frame.upper, orderType)
withResource(asScalarRangeBoundary(orderType, lower)) { preceding =>
withResource(asScalarRangeBoundary(orderType, upper)) { following =>
val windowOptionBuilder = WindowOptions.builder()
.minPeriods(1) // Does not currently support custom minPeriods.
.orderByColumnIndex(orderByIndex)
if (preceding.isEmpty) {
windowOptionBuilder.unboundedPreceding()
} else {
if (orderType == DType.STRING) { // Bounded STRING bounds can only mean "CURRENT ROW".
windowOptionBuilder.currentRowPreceding()
} else {
windowOptionBuilder.preceding(preceding.get)
}
}
if (following.isEmpty) {
windowOptionBuilder.unboundedFollowing()
} else {
if (orderType == DType.STRING) { // Bounded STRING bounds can only mean "CURRENT ROW".
windowOptionBuilder.currentRowFollowing()
} else {
windowOptionBuilder.following(following.get)
}
}
if (orderExpr.isAscending) {
windowOptionBuilder.orderByAscending()
} else {
windowOptionBuilder.orderByDescending()
}
windowOptionBuilder.build()
}
}
}
}
private def isUnbounded(boundary: Expression): Boolean = boundary match {
case special: GpuSpecialFrameBoundary => special.isUnbounded
case _ => false
}
private def getRowBasedLower(windowFrameSpec : GpuSpecifiedWindowFrame): Scalar = {
val lower = getRowBoundaryValue(windowFrameSpec.lower)
// Translate the lower bound value to CUDF semantics:
// In Spark, 0 is the current row and the lower bound is negative relative to that.
// In CUDF, the preceding bound is 1 for the current row and increases the further the
// window reaches back from the current row.
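// For example, a Spark lower bound of 0 (CURRENT ROW) becomes a CUDF preceding of 1, and a
// Spark lower bound of -3 (3 PRECEDING) becomes a CUDF preceding of 4, i.e. -(lower - 1).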
val ret = if (lower >= Int.MaxValue) {
Int.MinValue
} else if (lower <= Int.MinValue) {
Int.MaxValue
} else {
-(lower-1)
}
Scalar.fromInt(ret)
}
private def getRowBasedUpper(windowFrameSpec : GpuSpecifiedWindowFrame): Scalar =
Scalar.fromInt(getRowBoundaryValue(windowFrameSpec.upper))
private def getRowBoundaryValue(boundary : Expression) : Int = boundary match {
case literal: GpuLiteral if literal.dataType.equals(IntegerType) =>
literal.value.asInstanceOf[Int]
case special: GpuSpecialFrameBoundary =>
special.value
case anythingElse =>
throw new UnsupportedOperationException(s"Unsupported window frame expression $anythingElse")
}
/**
* Create a Scalar from boundary value according to order by column type.
*
* Timestamp types will be converted into interval types.
*
* @param orderByType the type of order by column
* @param bound boundary value
* @return a Scalar holding boundary value or None if the boundary is unbounded.
*/
private def asScalarRangeBoundary(orderByType: DType, bound: ParsedBoundary): Option[Scalar] = {
if (bound.isUnbounded) {
None
} else {
val s = orderByType match {
case DType.INT8 => Scalar.fromByte(bound.value.long.toByte)
case DType.INT16 => Scalar.fromShort(bound.value.long.toShort)
case DType.INT32 => Scalar.fromInt(bound.value.long.toInt)
case DType.INT64 => Scalar.fromLong(bound.value.long)
case DType.FLOAT32 => Scalar.fromFloat(bound.value.double.toFloat)
case DType.FLOAT64 => Scalar.fromDouble(bound.value.double)
// Interval is not working for DateType
case DType.TIMESTAMP_DAYS => Scalar.durationFromLong(DType.DURATION_DAYS, bound.value.long)
case DType.TIMESTAMP_MICROSECONDS =>
Scalar.durationFromLong(DType.DURATION_MICROSECONDS, bound.value.long)
case x if x.getTypeId == DType.DTypeEnum.DECIMAL32 =>
Scalar.fromDecimal(x.getScale, bound.value.long.toInt)
case x if x.getTypeId == DType.DTypeEnum.DECIMAL64 =>
Scalar.fromDecimal(x.getScale, bound.value.long)
case x if x.getTypeId == DType.DTypeEnum.DECIMAL128 =>
Scalar.fromDecimal(x.getScale, bound.value.bigInt.underlying())
case x if x.getTypeId == DType.DTypeEnum.STRING =>
// Not UNBOUNDED. The only other supported boundary for String is CURRENT ROW, i.e. 0.
Scalar.fromString("")
case _ => throw new RuntimeException(s"Not supported order by type, Found $orderByType")
}
Some(s)
}
}
private def getRangeBoundaryValue(boundary: Expression, orderByType: DType): ParsedBoundary =
boundary match {
case special: GpuSpecialFrameBoundary =>
ParsedBoundary(
isUnbounded = special.isUnbounded,
value = orderByType.getTypeId match {
case DType.DTypeEnum.DECIMAL128 => RangeBoundaryValue.bigInt(special.value)
case DType.DTypeEnum.FLOAT32 | DType.DTypeEnum.FLOAT64 =>
RangeBoundaryValue.double(special.value)
case _ => RangeBoundaryValue.long(special.value)
}
)
case GpuLiteral(ci: CalendarInterval, CalendarIntervalType) =>
// Get the total microseconds for TIMESTAMP_MICROSECONDS
var x = TimeUnit.DAYS.toMicros(ci.days) + ci.microseconds
if (x == Long.MinValue) x = Long.MaxValue
ParsedBoundary(isUnbounded = false, RangeBoundaryValue.long(Math.abs(x)))
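// For example, an interval of 1 day and 500 microseconds becomes
// 86,400,000,000 + 500 = 86,400,000,500 microseconds.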
case GpuLiteral(value, ByteType) =>
var x = value.asInstanceOf[Byte]
if (x == Byte.MinValue) x = Byte.MaxValue
ParsedBoundary(isUnbounded = false, RangeBoundaryValue.long(Math.abs(x)))
case GpuLiteral(value, ShortType) =>
var x = value.asInstanceOf[Short]
if (x == Short.MinValue) x = Short.MaxValue
ParsedBoundary(isUnbounded = false, RangeBoundaryValue.long(Math.abs(x)))
case GpuLiteral(value, IntegerType) =>
var x = value.asInstanceOf[Int]
if (x == Int.MinValue) x = Int.MaxValue
ParsedBoundary(isUnbounded = false, RangeBoundaryValue.long(Math.abs(x)))
case GpuLiteral(value, LongType) =>
var x = value.asInstanceOf[Long]
if (x == Long.MinValue) x = Long.MaxValue
ParsedBoundary(isUnbounded = false, RangeBoundaryValue.long(Math.abs(x)))
case GpuLiteral(value, FloatType) =>
var x = value.asInstanceOf[Float]
if (x == Float.MinValue) x = Float.MaxValue
ParsedBoundary(isUnbounded = false, RangeBoundaryValue.double(Math.abs(x)))
case GpuLiteral(value, DoubleType) =>
var x = value.asInstanceOf[Double]
if (x == Double.MinValue) x = Double.MaxValue
ParsedBoundary(isUnbounded = false, RangeBoundaryValue.double(Math.abs(x)))
case GpuLiteral(value: Decimal, DecimalType()) =>
orderByType.getTypeId match {
case DType.DTypeEnum.DECIMAL32 | DType.DTypeEnum.DECIMAL64 =>
ParsedBoundary(isUnbounded = false,
RangeBoundaryValue.long(Math.abs(value.toUnscaledLong)))
case DType.DTypeEnum.DECIMAL128 =>
ParsedBoundary(isUnbounded = false,
RangeBoundaryValue.bigInt(value.toJavaBigDecimal.unscaledValue().abs))
case anythingElse =>
throw new UnsupportedOperationException(s"Unexpected Decimal type: $anythingElse")
}
case anything => GpuWindowUtil.getRangeBoundaryValue(anything)
}
}
/**
* Window aggregations that are grouped together. It holds the aggregation and the offsets of
* its input columns, along with the output columns it should write the result to.
*/
class GroupedAggregations {
import GroupedAggregations._
// Maps a window frame to a map from window function to the output locations for its result
private val data = mutable.HashMap[GpuSpecifiedWindowFrame,
mutable.HashMap[BoundGpuWindowFunction, ArrayBuffer[Int]]]()
// This is similar to data, but specific to running windows. We don't divide it up by the
// window frame because the frame is the same for all of them: unbounded rows preceding to
// the current row.
private val runningWindowOptimizedData =
mutable.HashMap[BoundGpuWindowFunction, ArrayBuffer[Int]]()
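// Illustrative layout (hypothetical): data might hold
//   {ROWS BETWEEN 2 PRECEDING AND CURRENT ROW -> {sum(x) -> [2, 5], min(y) -> [3]}}
// meaning that sum(x) over that frame writes its result to output columns 2 and 5, and
// min(y) over the same frame writes to output column 3.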
/**
* Add an aggregation.
* @param win the window this aggregation is over.
* @param inputLocs the locations of the input columns for this aggregation.
* @param outputIndex the output index this will write to in the final output.
*/
def addAggregation(win: GpuWindowExpression, inputLocs: Array[Int], outputIndex: Int): Unit = {
val forSpec = if (win.isOptimizedRunningWindow) {
runningWindowOptimizedData
} else {
data.getOrElseUpdate(win.normalizedFrameSpec, mutable.HashMap.empty)
}
forSpec.getOrElseUpdate(BoundGpuWindowFunction(win.wrappedWindowFunc, inputLocs),
ArrayBuffer.empty) += outputIndex
}
private def doAggInternal(
frameType: FrameType,
boundOrderSpec: Seq[SortOrder],
orderByPositions: Array[Int],
partByPositions: Array[Int],
inputCb: ColumnarBatch,
outputColumns: Array[cudf.ColumnVector],
aggIt: (Table.GroupByOperation, Seq[AggregationOverWindow]) => Table): Unit = {
data.foreach {
case (frameSpec, functions) =>
if (frameSpec.frameType == frameType) {
// For now I am going to assume that we don't need to combine calls across frame specs
// because it would just not help that much
val result = {
val allWindowOpts = functions.map { f =>
getWindowOptions(boundOrderSpec, orderByPositions, frameSpec,
f._1.windowFunc.getMinPeriods)
}
withResource(allWindowOpts.toSeq) { allWindowOpts =>
val allAggs = allWindowOpts.zip(functions).map { case (windowOpt, f) =>
f._1.aggOverWindow(inputCb, windowOpt)
}
withResource(GpuColumnVector.from(inputCb)) { initProjTab =>
aggIt(initProjTab.groupBy(partByPositions: _*), allAggs)
}
}
}
withResource(result) { result =>
functions.zipWithIndex.foreach {
case ((func, outputIndexes), resultIndex) =>
val aggColumn = result.getColumn(resultIndex)
outputIndexes.foreach { outIndex =>
require(outputColumns(outIndex) == null,
"Attempted to overwrite a window output column!!")
outputColumns(outIndex) = func.windowOutput(aggColumn)
}
}
}
}
}
}
private def doRowAggs(boundOrderSpec: Seq[SortOrder],
orderByPositions: Array[Int],
partByPositions: Array[Int],
inputCb: ColumnarBatch,
outputColumns: Array[cudf.ColumnVector]): Unit = {
doAggInternal(
RowFrame, boundOrderSpec, orderByPositions, partByPositions, inputCb, outputColumns,
(groupBy, aggs) => groupBy.aggregateWindows(aggs: _*))
}
private def doRangeAggs(boundOrderSpec: Seq[SortOrder],
orderByPositions: Array[Int],
partByPositions: Array[Int],
inputCb: ColumnarBatch,
outputColumns: Array[cudf.ColumnVector]): Unit = {
doAggInternal(
RangeFrame, boundOrderSpec, orderByPositions, partByPositions, inputCb, outputColumns,
(groupBy, aggs) => groupBy.aggregateWindowsOverRanges(aggs: _*))
}
private final def doRunningWindowScan(
isRunningBatched: Boolean,
inputCb: ColumnarBatch,
outputColumns: Array[cudf.ColumnVector]): Unit = {
runningWindowOptimizedData.foreach {
case (func, outputIndexes) =>
val aggAndReplaces = func.scan(isRunningBatched)
// For now we need at least one column. For row number in the future we might be able
// to change that, but I think this is fine.
require(func.boundInputLocations.length == aggAndReplaces.length,
s"Input locations for ${func.windowFunc} do not match aggregations " +
s"${func.boundInputLocations.toSeq} vs $aggAndReplaces")
val combined = withResource(
new ArrayBuffer[cudf.ColumnVector](aggAndReplaces.length)) { replacedCols =>
func.boundInputLocations.indices.foreach { aggIndex =>
val inputColIndex = func.boundInputLocations(aggIndex)
val inputCol = inputCb.column(inputColIndex).asInstanceOf[GpuColumnVector].getBase
val anr = aggAndReplaces(aggIndex)
val agg = anr.agg
val replacePolicy = anr.nullReplacePolicy
replacedCols +=
withResource(inputCol.scan(agg, ScanType.INCLUSIVE, NullPolicy.EXCLUDE)) {
scanned =>
// For scans, when nulls are excluded, each input row that contains a null produces
// a null in the corresponding output row. Typically this is not what we want,
// because for windows a null result should only happen when the leading values
// are nulls. So we call replaceNulls as needed to fix that up. Typically the
// replacement policy is PRECEDING.
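// For example (illustrative), an inclusive sum scan over [1, null, 2] with nulls excluded
// yields [1, null, 3]; replacing nulls with the PRECEDING policy then yields [1, 1, 3],
// which matches the expected running-window result.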
replacePolicy.map(scanned.replaceNulls).getOrElse(scanned.incRefCount())
}
}
func.scanCombine(isRunningBatched, replacedCols.toSeq)
}
withResource(combined) { combined =>
outputIndexes.foreach { outIndex =>
require(outputColumns(outIndex) == null,
"Attempted to overwrite a window output column!!")
outputColumns(outIndex) = combined.incRefCount()
}
}
}
}
// Part by is always ascending with nulls first, which is the default for group by options too
private[this] val sortedGroupingOpts = GroupByOptions.builder()
.withKeysSorted(true)
.build()
/**
* Do just the grouped scan portion of a grouped scan aggregation.
* @param isRunningBatched is this optimized for a running batch?
* @param partByPositions the positions of the part by columns.
* @param inputCb the input data to process
* @return a Table that is the result of the aggregations. The partition
* by columns will be first, followed by one column for each aggregation in the order of
* `runningWindowOptimizedData`.
*/
private final def justGroupedScan(
isRunningBatched: Boolean,
partByPositions: Array[Int],
inputCb: ColumnarBatch): Table = {
val allAggsWithInputs = runningWindowOptimizedData.map { case (func, _) =>
func.groupByScan(isRunningBatched).zip(func.boundInputLocations)
}.toArray
val allAggs = allAggsWithInputs.flatMap { aggsWithInputs =>
aggsWithInputs.map { case (aggAndReplace, index) =>
aggAndReplace.agg.onColumn(index)
}
}
val unoptimizedResult = withResource(GpuColumnVector.from(inputCb)) { initProjTab =>
initProjTab.groupBy(sortedGroupingOpts, partByPositions: _*).scan(allAggs: _*)
}
// Our scan is sorted, but to comply with the API requirements of a non-sorted scan
// the group/partition by columns are copied out. This is more memory than we want,
// so we will replace them in the result with the same columns from the input batch.
withResource(unoptimizedResult) { unoptimizedResult =>
withResource(new Array[cudf.ColumnVector](unoptimizedResult.getNumberOfColumns)) { cols =>
// First copy over the part by columns
partByPositions.zipWithIndex.foreach { case (inPos, outPos) =>
cols(outPos) = inputCb.column(inPos).asInstanceOf[GpuColumnVector].getBase.incRefCount()
}
// Now copy over the scan results
(partByPositions.length until unoptimizedResult.getNumberOfColumns).foreach { pos =>
cols(pos) = unoptimizedResult.getColumn(pos).incRefCount()
}
new Table(cols: _*)
}
}
}
private final def groupedReplace(
isRunningBatched: Boolean,
partByPositions: Array[Int],
tabFromScan: Table): Table = {
// This gets a little complicated, because scan does not typically treat nulls the
// way window treats nulls. So in some cases we need to do another group by and replace
// the nulls to make them match what we want. But this is not needed all of the time, so we
// keep track of which aggregations need to have a replace called on them, and where
// we need to copy the results back out to. To help keep track of it all: the output of
// scan has the group by columns first, followed by the scan result columns in the order
// of `runningWindowOptimizedData`, and the output of replace has the group by columns
// first, followed by only the replaced columns. So scans that don't need a replace
// don't show up in the output of the replace call.
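// Illustrative example: with one partition by column and three scan results where only the
// second needs a replace, the scan output is [pb, s0, s1, s2]. copyFromScan ends up as
// [0, 1, 3], allReplace holds a policy on column 2, and copyFromReplace is [(1, 2)] because
// the replace output is [pb, replaced s1] and its column 1 must be copied to output column 2.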
val allReplace = ArrayBuffer[ReplacePolicyWithColumn]()
val copyFromScan = ArrayBuffer[Int]()
// We will not drop the partition by columns
copyFromScan.appendAll(partByPositions.indices)
// Columns to copy from the output of replace in the format of (fromReplaceIndex, toOutputIndex)
val copyFromReplace = ArrayBuffer[(Int, Int)]()
// Index of a column after it went through replace
var afterReplaceIndex = partByPositions.length
// Index of a column before it went through replace (this should be the same as the scan input
// and the final output)
var beforeReplaceIndex = partByPositions.length
runningWindowOptimizedData.foreach { case (func, _) =>
func.groupByScan(isRunningBatched).foreach { aggAndReplace =>
val replace = aggAndReplace.nullReplacePolicy
if (replace.isDefined) {
allReplace.append(replace.get.onColumn(beforeReplaceIndex))
copyFromReplace.append((afterReplaceIndex, beforeReplaceIndex))
afterReplaceIndex += 1
} else {
copyFromScan.append(beforeReplaceIndex)
}
beforeReplaceIndex += 1
}
}
withResource(new Array[cudf.ColumnVector](tabFromScan.getNumberOfColumns)) { columns =>
copyFromScan.foreach { index =>
columns(index) = tabFromScan.getColumn(index).incRefCount()
}
if (allReplace.nonEmpty) {
// Don't bother to do the replace if none of them want anything replaced
withResource(tabFromScan
.groupBy(sortedGroupingOpts, partByPositions.indices: _*)
.replaceNulls(allReplace.toSeq: _*)) { replaced =>
copyFromReplace.foreach { case (from, to) =>
columns(to) = replaced.getColumn(from).incRefCount()
}
}
}
new Table(columns: _*)
}
}
/**
* Take the aggregation results and run `scanCombine` on them if needed before copying them to
* the output location.
*/
private final def combineAndOutput(isRunningBatched: Boolean,
partByPositions: Array[Int],
scannedAndReplaced: Table,
outputColumns: Array[cudf.ColumnVector]): Unit = {
var readIndex = partByPositions.length
runningWindowOptimizedData.foreach { case (func, outputLocations) =>
val numScans = func.boundInputLocations.length
val columns =
(readIndex until (readIndex + numScans)).map(scannedAndReplaced.getColumn).toArray
withResource(func.scanCombine(isRunningBatched, columns)) { col =>
outputLocations.foreach { outIndex =>
require(outputColumns(outIndex) == null,
"Attempted to overwrite a window output column!!")
outputColumns(outIndex) = col.incRefCount()
}
}
readIndex += numScans
}
}
/**
* Do any running window grouped scan aggregations.
*/
private final def doRunningWindowGroupedScan(
isRunningBatched: Boolean,
partByPositions: Array[Int],
inputCb: ColumnarBatch,
outputColumns: Array[cudf.ColumnVector]): Unit = {
val replaced =
withResource(justGroupedScan(isRunningBatched, partByPositions, inputCb)) { scanned =>
groupedReplace(isRunningBatched, partByPositions, scanned)
}
withResource(replaced) { replaced =>
combineAndOutput(isRunningBatched, partByPositions, replaced, outputColumns)
}
}
/**
* Do any running window optimized aggregations.
*/
private def doRunningWindowOptimizedAggs(
isRunningBatched: Boolean,
partByPositions: Array[Int],
inputCb: ColumnarBatch,
outputColumns: Array[cudf.ColumnVector]): Unit = {
if (runningWindowOptimizedData.nonEmpty) {
if (partByPositions.isEmpty) {
// This is implemented in terms of a scan on a column
doRunningWindowScan(isRunningBatched, inputCb, outputColumns)
} else {
doRunningWindowGroupedScan(isRunningBatched, partByPositions, inputCb, outputColumns)
}
}
}
/**
* Do all of the aggregations and put them in the output columns. There may be extra processing
* after this before you get to a final result.
*/
def doAggs(isRunningBatched: Boolean,
boundOrderSpec: Seq[SortOrder],
orderByPositions: Array[Int],
partByPositions: Array[Int],
inputCb: ColumnarBatch,
outputColumns: Array[cudf.ColumnVector]): Unit = {
doRunningWindowOptimizedAggs(isRunningBatched, partByPositions, inputCb, outputColumns)
doRowAggs(boundOrderSpec, orderByPositions, partByPositions, inputCb, outputColumns)
doRangeAggs(boundOrderSpec, orderByPositions, partByPositions, inputCb, outputColumns)
}
/**
* Turn the final result of the aggregations into a ColumnarBatch.
*/
def convertToColumnarBatch(dataTypes: Array[DataType],
aggOutputColumns: Array[cudf.ColumnVector]): ColumnarBatch = {
assert(dataTypes.length == aggOutputColumns.length)
val numRows = aggOutputColumns.head.getRowCount.toInt
closeOnExcept(new Array[ColumnVector](aggOutputColumns.length)) { finalOutputColumns =>
dataTypes.indices.foreach { index =>
val dt = dataTypes(index)
val col = aggOutputColumns(index)
finalOutputColumns(index) = GpuColumnVector.from(col, dt).incRefCount()
}
new ColumnarBatch(finalOutputColumns, numRows)
}
}
}
/**
* Calculates the results of window operations. It assumes that any batching of the data,
* and any fix-ups needed after the fact to get the right answer, are done outside of this.
*/
trait BasicWindowCalc {
val boundWindowOps: Seq[GpuExpression]
val boundPartitionSpec: Seq[GpuExpression]
val boundOrderSpec: Seq[SortOrder]
/**
* Is this going to do a batched running window optimization or not.
*/
def isRunningBatched: Boolean
// In order to dedupe aggregations we take a slightly different approach from
// group by aggregations. Instead of using named expressions to line up the different
// parts of the aggregation (pre-processing, aggregation, post-processing) we
// keep track of the offsets directly. This is quite a bit more complex, but it lets us
// see that, for example, 5 aggregations all want a column of just the literal 1, and
// dedupe it so that it is only materialized once.
// `initialProjections` is the list of projections that provide the inputs to `aggregations`.
// The order of these matters, and `aggregations` keeps track of it.
// `passThrough` holds columns that go directly from the input to the output. The first value
// is the index in the original input batch. The second value is the index in the final output
// batch.
// `orderByPositions` and `partByPositions` are the positions in `initialProjections` for
// the order by columns and the part by columns respectively.
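// Illustrative example (hypothetical column names): for window ops
//   [a, sum(x) OVER w, avg(x) OVER w] with PARTITION BY p ORDER BY o,
// `a` becomes a passThrough entry, `x` is added to initialProjections once and shared by both
// aggregations, and `p` and `o` are appended (or reused) with their positions recorded in
// partByPositions and orderByPositions.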
private val (initialProjections,
passThrough,
aggregations,
orderByPositions,
partByPositions) = {
val initialProjections = ArrayBuffer[Expression]()
val dedupedInitialProjections = mutable.HashMap[Expression, Int]()
def getOrAddInitialProjectionIndex(expr: Expression): Int =
dedupedInitialProjections.getOrElseUpdate(expr, {
val at = initialProjections.length
initialProjections += expr
at
})
val passThrough = ArrayBuffer[(Int, Int)]()
val aggregations = new GroupedAggregations()
boundWindowOps.zipWithIndex.foreach {
case (GpuAlias(GpuBoundReference(inputIndex, _, _), _), outputIndex) =>
passThrough.append((inputIndex, outputIndex))
case (GpuBoundReference(inputIndex, _, _), outputIndex) =>
passThrough.append((inputIndex, outputIndex))
case (GpuAlias(win: GpuWindowExpression, _), outputIndex) =>
val inputLocations = win.initialProjections(isRunningBatched)
.map(getOrAddInitialProjectionIndex).toArray
aggregations.addAggregation(win, inputLocations, outputIndex)
case _ =>
throw new IllegalArgumentException("Unexpected operation found in window expression")
}
val partByPositions = boundPartitionSpec.map(getOrAddInitialProjectionIndex).toArray
val orderByPositions = boundOrderSpec.map { so =>
getOrAddInitialProjectionIndex(so.child)
}.toArray
(initialProjections, passThrough, aggregations, orderByPositions, partByPositions)
}
/**
* Compute the basic aggregations. In some cases the resulting columns may not be the expected
* types. This could be caused by cudf type differences, which can be fixed by calling
* `castResultsIfNeeded`, or the types could differ because the window operations know about a
* post-processing step that needs to happen prior to `castResultsIfNeeded`.
* @param cb the batch to do window aggregations on.
* @return the cudf columns that are the results of doing the aggregations.
*/
def computeBasicWindow(cb: ColumnarBatch): Array[cudf.ColumnVector] = {
closeOnExcept(new Array[cudf.ColumnVector](boundWindowOps.length)) { outputColumns =>
withResource(GpuProjectExec.project(cb, initialProjections.toSeq)) { proj =>
aggregations.doAggs(
isRunningBatched,
boundOrderSpec,
orderByPositions,
partByPositions,
proj,
outputColumns)
}
// if the window aggregates were successful, let's splice in the passThrough
// columns
passThrough.foreach {
case (inputIndex, outputIndex) =>
outputColumns(outputIndex) =
cb.column(inputIndex).asInstanceOf[GpuColumnVector].getBase.incRefCount()
}
outputColumns
}
}
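// Typical usage (a sketch; the actual driver code lives outside this trait): call
// computeBasicWindow(cb) to get the raw cudf result columns, then hand them to convertToBatch
// with the expected output types, closing the intermediate columns afterwards since
// convertToBatch increments their reference counts.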
def convertToBatch(dataTypes: Array[DataType],
cols: Array[cudf.ColumnVector]): ColumnarBatch =
aggregations.convertToColumnarBatch(dataTypes, cols)
}