com.nvidia.spark.rapids.AutoCloseColumnBatchIterator.scala Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of rapids-4-spark_2.12 Show documentation

Creates the distribution package of the RAPIDS plugin for Apache Spark

There is a newer version: 24.10.1

/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.nvidia.spark.rapids

import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.vectorized.ColumnarBatch

/**
 * For columnar code on the CPU it is the responsibility of the SparkPlan exec that creates a
 * `ColumnarBatch` to close it.  In the case of code running on the GPU that would waste too
 * much memory, so it is the responsibility of the code receiving the batch to close it, when it
 * is no longer needed.
 *
 * This class provides a simple way for CPU batch code to be sure that a batch gets closed. If your
 * code is executing on the GPU do not use this class.
 */
class AutoCloseColumnBatchIterator[U](itr: Iterator[U], nextBatch: Iterator[U] => ColumnarBatch)
    extends Iterator[ColumnarBatch] {
  /** The batch most recently returned by `next()`, or `null` when no batch is currently open. */
  var cb: ColumnarBatch = null

  /** Closes the batch held in `cb`, if any, and resets `cb` to `null`. */
  private def closeCurrentBatch(): Unit =
    Option(cb).foreach { open =>
      open.close()
      cb = null
    }

  // `TaskContext.get()` can be null (e.g. when running in a unit test); only register the
  // task-completion cleanup when a task context is actually present.
  for (tc <- Option(TaskContext.get())) {
    onTaskCompletion(tc) {
      closeCurrentBatch()
    }
  }

  /**
   * Closes the batch returned by the previous `next()` call (if any) and then reports whether
   * the wrapped iterator has more input. A previously returned batch must therefore not be used
   * after this method has been called.
   */
  override def hasNext: Boolean = {
    closeCurrentBatch()
    itr.hasNext
  }

  /**
   * Closes the batch returned by the previous `next()` call (if any), builds the next batch by
   * applying `nextBatch` to the wrapped iterator, remembers it in `cb` so it can be closed
   * later, and returns it.
   */
  override def next(): ColumnarBatch = {
    closeCurrentBatch()
    val produced = nextBatch(itr)
    cb = produced
    produced
  }
}

object AutoCloseColumnBatchIterator {
  /**
   * Converts every element of `rdd` into a `ColumnarBatch` using `f`, wrapping each partition's
   * iterator in an [[AutoCloseColumnBatchIterator]].
   *
   * @param rdd the input RDD whose elements are converted one at a time
   * @param f   the function that turns a single input element into a `ColumnarBatch`
   * @tparam U  the element type of the input RDD
   * @return an RDD of the batches produced by `f`
   */
  def map[U](rdd: RDD[U], f: (U) => ColumnarBatch) : RDD[ColumnarBatch] = {
    rdd.mapPartitions { partition =>
      // Pull exactly one element from the source iterator per produced batch.
      val pullOne: Iterator[U] => ColumnarBatch = source => f(source.next())
      new AutoCloseColumnBatchIterator[U](partition, pullOne)
    }
  }
}