/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nvidia.spark.rapids

import java.io.File
import java.nio.channels.WritableByteChannel

import scala.collection.mutable.ArrayBuffer

import ai.rapids.cudf.{Cuda, DeviceMemoryBuffer, HostMemoryBuffer, MemoryBuffer, Table}
import com.nvidia.spark.rapids.Arm.withResource
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.StorageTier.StorageTier
import com.nvidia.spark.rapids.format.TableMeta

import org.apache.spark.internal.Logging
import org.apache.spark.sql.rapids.RapidsDiskBlockManager
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.vectorized.ColumnarBatch

/**
 * An identifier for a RAPIDS buffer that can be automatically spilled between buffer stores.
 * NOTE: Derived classes MUST implement proper hashCode and equals methods, as these objects are
 *       used as keys in hash maps. Scala case classes are recommended.
 */
trait RapidsBufferId {
  val tableId: Int

  /**
   * Indicates whether the buffer may share a spill file with other buffers.
   * If false then the spill file will be automatically removed when the buffer is freed.
   * If true then the spill file will not be automatically removed, and another subsystem needs
   * to be responsible for cleaning up the spill files for those types of buffers.
   */
  val canShareDiskPaths: Boolean = false

  /**
   * Generate a path to a local file that can be used to spill the corresponding buffer to disk.
   * The path must be unique across all buffers unless canShareDiskPaths is true.
   */
  def getDiskPath(diskBlockManager: RapidsDiskBlockManager): File
}
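
// A minimal, hypothetical sketch (not part of the plugin) of a RapidsBufferId
// implementation, following the scaladoc advice above to use a case class so that
// hashCode/equals are correct for use as hash map keys. The temp-file naming below is
// purely illustrative; real ids resolve their spill path via the RapidsDiskBlockManager
// passed to getDiskPath.
case class ExampleTempBufferId(tableId: Int) extends RapidsBufferId {
  override def getDiskPath(diskBlockManager: RapidsDiskBlockManager): File =
    new File(System.getProperty("java.io.tmpdir"), s"example-spill-$tableId.bin")
}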

/** Enumeration of the storage tiers */
object StorageTier extends Enumeration {
  type StorageTier = Value
  val DEVICE: StorageTier = Value(0, "device memory")
  val HOST: StorageTier = Value(1, "host memory")
  val DISK: StorageTier = Value(2, "local disk")
}

/**
 * ChunkedPacker is an Iterator that uses a cudf::chunked_pack to copy a cuDF `Table`
 * to a target buffer in chunks.
 *
 * Each chunk is sized at most `bounceBuffer.getLength`, and the caller should cudaMemcpy
 * bytes from `bounceBuffer` to a target buffer after each call to `next()`.
 *
 * @note `ChunkedPacker` must be closed by the caller as it has GPU and host resources
 *       associated with it.
 *
 * @param id The RapidsBufferId for this pack operation to be included in the metadata
 * @param table cuDF Table to chunk_pack
 * @param bounceBuffer GPU memory to be used for packing. The buffer should be at least 1MB
 *                     in length.
 */
class ChunkedPacker(
    id: RapidsBufferId,
    table: Table,
    bounceBuffer: DeviceMemoryBuffer)
    extends Iterator[MemoryBuffer]
        with Logging
        with AutoCloseable {

  private var closed: Boolean = false

  // When creating cudf::chunked_pack use a pool if available, otherwise default to the
  // per-device memory resource
  private val chunkedPack = {
    val pool = GpuDeviceManager.chunkedPackMemoryResource
    val cudfChunkedPack = try {
      pool.flatMap { chunkedPool =>
        Some(table.makeChunkedPack(bounceBuffer.getLength, chunkedPool))
      }
    } catch {
      case _: OutOfMemoryError =>
        if (!ChunkedPacker.warnedAboutPoolFallback) {
          ChunkedPacker.warnedAboutPoolFallback = true
          logWarning(
            s"OOM while creating chunked_pack using pool sized ${pool.map(_.getMaxSize)}B. " +
                "Falling back to the per-device memory resource.")
        }
        None
    }

    // if the pool is not configured, or we got an OOM, try again with the per-device pool
    cudfChunkedPack.getOrElse {
      table.makeChunkedPack(bounceBuffer.getLength)
    }
  }

  private val tableMeta = withResource(chunkedPack.buildMetadata()) { packedMeta =>
    MetaUtils.buildTableMeta(
      id.tableId,
      chunkedPack.getTotalContiguousSize,
      packedMeta.getMetadataDirectBuffer,
      table.getRowCount)
  }

  // take out a lease on the bounce buffer
  bounceBuffer.incRefCount()

  def getTotalContiguousSize: Long = chunkedPack.getTotalContiguousSize

  def getMeta: TableMeta = {
    tableMeta
  }

  override def hasNext: Boolean = synchronized {
    if (closed) {
      throw new IllegalStateException(s"ChunkedPacker for $id is closed")
    }
    chunkedPack.hasNext
  }

  def next(): MemoryBuffer = synchronized {
    if (closed) {
      throw new IllegalStateException(s"ChunkedPacker for $id is closed")
    }
    val bytesWritten = chunkedPack.next(bounceBuffer)
    // we increment the refcount because the caller has no idea where
    // this memory came from, so it should close it.
    bounceBuffer.slice(0, bytesWritten)
  }

  override def close(): Unit = synchronized {
    if (!closed) {
      closed = true
      val toClose = new ArrayBuffer[AutoCloseable]()
      toClose.append(chunkedPack, bounceBuffer)
      toClose.safeClose()
    }
  }
}

object ChunkedPacker {
  private var warnedAboutPoolFallback: Boolean = false
}
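
/**
 * A minimal usage sketch for `ChunkedPacker` (illustrative only, not part of the plugin
 * API). The table, bounce buffer, and `copyChunk` callback are assumed to be supplied by
 * the caller; the sketch simply drains the packer and hands each packed slice of the
 * bounce buffer to the callback, closing everything it owns.
 */
object ChunkedPackerUsageSketch {
  def drainTo(id: RapidsBufferId, table: Table, bounceBuffer: DeviceMemoryBuffer)
      (copyChunk: MemoryBuffer => Unit): Unit = {
    // the packer holds GPU and host resources, so it must be closed when done
    withResource(new ChunkedPacker(id, table, bounceBuffer)) { packer =>
      while (packer.hasNext) {
        // each chunk is a slice of the bounce buffer sized to the bytes just packed;
        // the caller owns the slice and must close it after copying the bytes out
        withResource(packer.next()) { chunk =>
          copyChunk(chunk)
        }
      }
    }
  }
}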

/**
 * This iterator encapsulates a buffer's internal `MemoryBuffer` access
 * for spill reasons. Internally, there are two known implementations:
 * - either this is a "single shot" copy, where the entirety of the `RapidsBuffer` is
 *   already represented as a single contiguous blob of memory, in which case the
 *   expectation is that this iterator is exhausted with a single call to `next`
 * - or, we have a `RapidsBuffer` that isn't contiguous. The iterator will then
 *   drive a `ChunkedPacker` to pack the `RapidsBuffer`'s table as needed, and
 *   will likely need several calls to `next` to be exhausted.
 *
 * @param buffer `RapidsBuffer` to copy out of its tier.
 */
class RapidsBufferCopyIterator(buffer: RapidsBuffer)
    extends Iterator[MemoryBuffer] with AutoCloseable with Logging {

  private val chunkedPacker: Option[ChunkedPacker] = if (buffer.supportsChunkedPacker) {
    Some(buffer.makeChunkedPacker)
  } else {
    None
  }
  def isChunked: Boolean = chunkedPacker.isDefined

  // this flags, for the single-shot case, whether `next` can still be called,
  // in order to satisfy the Iterator interface
  private var singleShotCopyHasNext: Boolean = false
  private var singleShotBuffer: MemoryBuffer = _

  if (!isChunked) {
    singleShotCopyHasNext = true
    singleShotBuffer = buffer.getMemoryBuffer
  }

  override def hasNext: Boolean =
    chunkedPacker.map(_.hasNext).getOrElse(singleShotCopyHasNext)

  override def next(): MemoryBuffer = {
    require(hasNext,
      "next called on exhausted iterator")
    chunkedPacker.map(_.next()).getOrElse {
      singleShotCopyHasNext = false
      singleShotBuffer.slice(0, singleShotBuffer.getLength)
    }
  }

  def getTotalCopySize: Long = {
    chunkedPacker
        .map(_.getTotalContiguousSize)
        .getOrElse(singleShotBuffer.getLength)
  }

  override def close(): Unit = {
    val toClose = new ArrayBuffer[AutoCloseable]()
    toClose.appendAll(chunkedPacker)
    toClose.appendAll(Option(singleShotBuffer))

    toClose.safeClose()
  }
}
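
/**
 * A minimal spill-copy sketch (illustrative only, not part of the plugin API): drains a
 * buffer's `RapidsBufferCopyIterator` and hands each chunk, with its running offset, to
 * an assumed `copyToTarget` callback standing in for the tier-specific copy.
 */
object RapidsBufferCopySketch {
  def copyOut(buffer: RapidsBuffer)(copyToTarget: (Long, MemoryBuffer) => Unit): Long = {
    withResource(buffer.getCopyIterator) { copyIter =>
      var offset = 0L
      while (copyIter.hasNext) {
        withResource(copyIter.next()) { chunk =>
          copyToTarget(offset, chunk)
          offset += chunk.getLength
        }
      }
      // the total copied should match copyIter.getTotalCopySize
      offset
    }
  }
}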

/** Interface provided by all types of RAPIDS buffers */
trait RapidsBuffer extends AutoCloseable {
  /** The buffer identifier for this buffer. */
  val id: RapidsBufferId

  /**
   * The size of this buffer in bytes in its _current_ store. As the buffer goes through
   * contiguous split (either added as a contiguous table already, or spilled to host),
   * its size changes because contiguous_split adds its own alignment padding.
   *
   * @note Do not use this size to allocate a target buffer to copy; always use
   *       `getPackedSizeBytes`.
   */
  val memoryUsedBytes: Long

  /**
   * The size of this buffer if it has already gone through contiguous_split.
   *
   * @note Use this function when allocating a target buffer for spill or shuffle purposes.
   */
  def getPackedSizeBytes: Long = memoryUsedBytes
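
  // Illustrative note (an assumption, not part of this trait): when staging a copy of
  // this buffer for spill or shuffle, size the target allocation from the packed size,
  // e.g.
  //   val target = HostMemoryBuffer.allocate(buffer.getPackedSizeBytes)
  // rather than from `memoryUsedBytes`, which reflects the current store only.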

  /**
   * At spill time, obtain an iterator used to copy this buffer to a different tier.
   */
  def getCopyIterator: RapidsBufferCopyIterator =
    new RapidsBufferCopyIterator(this)

  /** Descriptor for how the memory buffer is formatted */
  def meta: TableMeta

  /** The storage tier for this buffer */
  val storageTier: StorageTier

  /**
   * Get the columnar batch within this buffer. The caller must have
   * successfully acquired the buffer beforehand.
   * @param sparkTypes the spark data types the batch should have
   * @see [[addReference]]
   * @note It is the responsibility of the caller to close the batch.
   * @note If the buffer is compressed data then the resulting batch will be built using
   *       `GpuCompressedColumnVector`, and it is the responsibility of the caller to deal
   *       with decompressing the data if necessary.
   */
  def getColumnarBatch(sparkTypes: Array[DataType]): ColumnarBatch

  /**
   * Get the host-backed columnar batch from this buffer. The caller must have
   * successfully acquired the buffer beforehand.
   *
   * If this `RapidsBuffer` was added originally to the device tier, or if this is
   * just a buffer (not a batch), this function will throw.
   *
   * @param sparkTypes the spark data types the batch should have
   * @see [[addReference]]
   * @note It is the responsibility of the caller to close the batch.
   */
  def getHostColumnarBatch(sparkTypes: Array[DataType]): ColumnarBatch = {
    throw new IllegalStateException(s"$this does not support host columnar batches.")
  }

  /**
   * Get the underlying memory buffer. This may be either a HostMemoryBuffer or a DeviceMemoryBuffer
   * depending on where the buffer currently resides.
   * The caller must have successfully acquired the buffer beforehand.
   * @see [[addReference]]
   * @note It is the responsibility of the caller to close the buffer.
   */
  def getMemoryBuffer: MemoryBuffer

  val supportsChunkedPacker: Boolean = false

  /**
   * Makes a new chunked packer. It is the responsibility of the caller to close this.
   */
  def makeChunkedPacker: ChunkedPacker = {
    throw new NotImplementedError("not implemented for this store")
  }

  /**
   * Copy the content of this buffer into the specified memory buffer, starting from the given
   * offset.
   *
   * @param srcOffset offset to start copying from.
   * @param dst the memory buffer to copy into.
   * @param dstOffset offset to copy into.
   * @param length number of bytes to copy.
   * @param stream CUDA stream to use
   */
  def copyToMemoryBuffer(
      srcOffset: Long, dst: MemoryBuffer, dstOffset: Long, length: Long, stream: Cuda.Stream): Unit

  /**
   * Get the device memory buffer from the underlying storage. If the buffer currently resides
   * outside of device memory, a new DeviceMemoryBuffer is created with the data copied over.
   * The caller must have successfully acquired the buffer beforehand.
   * @see [[addReference]]
   * @note It is the responsibility of the caller to close the buffer.
   */
  def getDeviceMemoryBuffer: DeviceMemoryBuffer

  /**
   * Get the host memory buffer from the underlying storage. If the buffer currently resides
   * outside of host memory, a new HostMemoryBuffer is created with the data copied over.
   * The caller must have successfully acquired the buffer beforehand.
   * @see [[addReference]]
   * @note It is the responsibility of the caller to close the buffer.
   */
  def getHostMemoryBuffer: HostMemoryBuffer

  /**
   * Try to add a reference to this buffer to acquire it.
   * @note The close method must be called for every successfully obtained reference.
   * @return true if the reference was added or false if this buffer is no longer valid
   */
  def addReference(): Boolean

  /**
   * Schedule the release of the buffer's underlying resources.
   * Subsequent attempts to acquire the buffer will fail. As soon as the
   * buffer has no outstanding references, the resources will be released.
   *
   * This is separate from the close method which does not normally release
   * resources. close will only release resources if called as the last
   * outstanding reference and the buffer was previously marked as freed.
   */
  def free(): Unit

  /**
   * Get the spill priority value for this buffer. Lower values are higher
   * priority for spilling, meaning buffers with lower values will be
   * preferred for spilling over buffers with a higher value.
   */
  def getSpillPriority: Long

  /**
   * Set the spill priority for this buffer. Lower values are higher priority
   * for spilling, meaning buffers with lower values will be preferred for
   * spilling over buffers with a higher value.
   * @note should only be called from the buffer catalog
   * @param priority new priority value for this buffer
   */
  def setSpillPriority(priority: Long): Unit

  /**
   * Function invoked by the `RapidsBufferStore.addBuffer` method that prompts
   * the specific `RapidsBuffer` to check its reference counting to make itself
   * spillable or not. Only `RapidsTable` and `RapidsHostMemoryBuffer` implement
   * this method.
   */
  def updateSpillability(): Unit = {}

  /**
   * Obtains a read lock on this instance of `RapidsBuffer` and calls the function
   * in `body` while holding the lock.
   * @param body function that takes a `MemoryBuffer` and produces `K`
   * @tparam K any return type specified by `body`
   * @return the result of body(memoryBuffer)
   */
  def withMemoryBufferReadLock[K](body: MemoryBuffer => K): K

  /**
   * Obtains a write lock on this instance of `RapidsBuffer` and calls the function
   * in `body` while holding the lock.
   * @param body function that takes a `MemoryBuffer` and produces `K`
   * @tparam K any return type specified by `body`
   * @return the result of body(memoryBuffer)
   */
  def withMemoryBufferWriteLock[K](body: MemoryBuffer => K): K
}

/**
 * A buffer with no corresponding device data (zero rows or columns).
 * These buffers are not tracked in buffer stores since they have no
 * device memory. They are only tracked in the catalog and provide
 * a representative `ColumnarBatch` but cannot provide a
 * `MemoryBuffer`.
 * @param id buffer ID to associate with the buffer
 * @param meta schema metadata
 */
sealed class DegenerateRapidsBuffer(
    override val id: RapidsBufferId,
    override val meta: TableMeta) extends RapidsBuffer {

  override val memoryUsedBytes: Long = 0L

  override val storageTier: StorageTier = StorageTier.DEVICE

  override def getColumnarBatch(sparkTypes: Array[DataType]): ColumnarBatch = {
    val rowCount = meta.rowCount
    val packedMeta = meta.packedMetaAsByteBuffer()
    if (packedMeta != null) {
      withResource(DeviceMemoryBuffer.allocate(0)) { deviceBuffer =>
        withResource(Table.fromPackedTable(meta.packedMetaAsByteBuffer(), deviceBuffer)) { table =>
          GpuColumnVectorFromBuffer.from(table, deviceBuffer, meta, sparkTypes)
        }
      }
    } else {
      // no packed metadata, must be a table with zero columns
      new ColumnarBatch(Array.empty, rowCount.toInt)
    }
  }

  override def free(): Unit = {}

  override def getMemoryBuffer: MemoryBuffer =
    throw new UnsupportedOperationException("degenerate buffer has no memory buffer")

  override def copyToMemoryBuffer(srcOffset: Long, dst: MemoryBuffer, dstOffset: Long,
      length: Long, stream: Cuda.Stream): Unit =
    throw new UnsupportedOperationException("degenerate buffer cannot copy to memory buffer")

  override def getDeviceMemoryBuffer: DeviceMemoryBuffer =
    throw new UnsupportedOperationException("degenerate buffer has no device memory buffer")

  override def getHostMemoryBuffer: HostMemoryBuffer =
    throw new UnsupportedOperationException("degenerate buffer has no host memory buffer")

  override def addReference(): Boolean = true

  override def getSpillPriority: Long = Long.MaxValue

  override def setSpillPriority(priority: Long): Unit = {}

  override def withMemoryBufferReadLock[K](body: MemoryBuffer => K): K = {
    throw new UnsupportedOperationException("degenerate buffer has no memory buffer")
  }

  override def withMemoryBufferWriteLock[K](body: MemoryBuffer => K): K = {
    throw new UnsupportedOperationException("degenerate buffer has no memory buffer")
  }

  override def close(): Unit = {}
}

trait RapidsHostBatchBuffer extends AutoCloseable {
  /**
   * Get the host-backed columnar batch from this buffer. The caller must have
   * successfully acquired the buffer beforehand.
   *
   * If this `RapidsBuffer` was added originally to the device tier, or if this is
   * just a buffer (not a batch), this function will throw.
   *
   * @param sparkTypes the spark data types the batch should have
   * @see [[addReference]]
   * @note It is the responsibility of the caller to close the batch.
   */
  def getHostColumnarBatch(sparkTypes: Array[DataType]): ColumnarBatch

  val memoryUsedBytes: Long
}

trait RapidsBufferChannelWritable {
  /**
   * At spill time, write this buffer to an NIO WritableByteChannel.
   * @param writableChannel that this buffer can just write itself to, either byte-for-byte
   *                        or via serialization if needed.
   * @param stream the Cuda.Stream for the spilling thread. If the `RapidsBuffer` that
   *               implements this method is on the device, synchronization may be needed
   *               for staged copies.
   * @return the amount of bytes written to the channel
   */
  def writeToChannel(writableChannel: WritableByteChannel, stream: Cuda.Stream): Long
}
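
/**
 * A minimal, hypothetical acquisition helper (illustrative only, not part of the plugin):
 * mirrors the contract documented on `addReference`, taking a reference before using a
 * `RapidsBuffer` and releasing it with `close()` when done. Returns None if the buffer
 * has already been freed and can no longer be acquired.
 */
object RapidsBufferAcquireSketch {
  def withAcquired[T](buffer: RapidsBuffer)(body: RapidsBuffer => T): Option[T] = {
    if (buffer.addReference()) {
      try {
        Some(body(buffer))
      } finally {
        // every successfully obtained reference must be closed
        buffer.close()
      }
    } else {
      None // the buffer was freed and is no longer valid
    }
  }
}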




