/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids

import scala.collection.mutable.ArrayBuffer

import ai.rapids.cudf.{BaseDeviceMemoryBuffer, ContiguousTable, Cuda, DeviceMemoryBuffer, NvtxColor, NvtxRange}
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.format.{BufferMeta, CodecType, TableMeta}

import org.apache.spark.internal.Logging
/**
* Compressed table descriptor
* @param compressedSize size of the compressed data in bytes
* @param meta metadata describing the table layout when uncompressed
* @param buffer buffer containing the compressed data
*/
case class CompressedTable(
compressedSize: Long,
meta: TableMeta,
buffer: DeviceMemoryBuffer) extends AutoCloseable {
override def close(): Unit = buffer.close()
}
/** An interface to a compression codec that can compress a contiguous Table on the GPU */
trait TableCompressionCodec {
/** The name of the codec, used for logging. */
val name: String
/** The ID used for this codec. See the definitions in `CodecType`. */
val codecId: Byte
/**
* Create a batched compressor instance
* @param maxBatchMemorySize The upper limit in bytes of temporary and output memory usage at
* which a batch should be compressed. A single table that requires
* temporary and output memory above this limit is allowed but will
* be compressed individually.
* @param stream CUDA stream to use for compression
* @return batched compressor instance
*/
def createBatchCompressor(maxBatchMemorySize: Long, stream: Cuda.Stream): BatchedTableCompressor
/**
* Create a batched decompressor instance
* @param maxBatchMemorySize The upper limit in bytes of temporary and output memory usage at
* which a batch should be decompressed. A single buffer that requires
* temporary and output memory above this limit is allowed but will
* be decompressed individually.
* @param stream CUDA stream to use for decompression
* @return batched decompressor instance
*/
def createBatchDecompressor(
maxBatchMemorySize: Long,
stream: Cuda.Stream): BatchedBufferDecompressor
}
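// A minimal usage sketch of the compression side (hypothetical driver code: `conf` is
// assumed to be the active RapidsConf and `contigTables` an Array[ContiguousTable]):
//
//   val codecConfig = TableCompressionCodec.makeCodecConfig(conf)
//   val codec = TableCompressionCodec.getCodec("lz4", codecConfig)
//   withResource(codec.createBatchCompressor(128L * 1024 * 1024, Cuda.DEFAULT_STREAM)) {
//     compressor =>
//       compressor.addTables(contigTables) // the compressor now owns the tables
//       compressor.finish()                // Array[CompressedTable], owned by the caller
//   }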
/**
* A small case class used to carry codec-specific settings.
*/
case class TableCompressionCodecConfig(lz4ChunkSize: Long, zstdChunkSize: Long)
object TableCompressionCodec extends Logging {
private val codecNameToId = Map(
"copy" -> CodecType.COPY,
"zstd" -> CodecType.NVCOMP_ZSTD,
"lz4" -> CodecType.NVCOMP_LZ4)
  /** Make a codec configuration object that can be serialized and shipped to tasks */
def makeCodecConfig(rapidsConf: RapidsConf): TableCompressionCodecConfig =
TableCompressionCodecConfig(
rapidsConf.shuffleCompressionLz4ChunkSize,
rapidsConf.shuffleCompressionZstdChunkSize)
  /** Get a compression codec by short name */
def getCodec(name: String, codecConfigs: TableCompressionCodecConfig): TableCompressionCodec = {
val codecId = codecNameToId.getOrElse(name,
throw new IllegalArgumentException(s"Unknown table codec: $name"))
getCodec(codecId, codecConfigs)
}
  /** Get a compression codec by ID. A new codec instance is constructed on each call. */
def getCodec(codecId: Byte, codecConfig: TableCompressionCodecConfig): TableCompressionCodec = {
val ret = codecId match {
case CodecType.NVCOMP_ZSTD => new NvcompZSTDCompressionCodec(codecConfig)
case CodecType.NVCOMP_LZ4 => new NvcompLZ4CompressionCodec(codecConfig)
case CodecType.COPY => new CopyCompressionCodec
case _ => throw new IllegalArgumentException(s"Unknown codec ID: $codecId")
}
logDebug(s"Using codec: ${ret.name}")
ret
}
}
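// Sketch of the read-side lookup (hypothetical: `bufferMeta` is assumed to be the
// BufferMeta describing a compressed shuffle buffer, as in addBufferToDecompress below):
//
//   val codecId = bufferMeta.codecBufferDescrs(0).codec
//   val codec = TableCompressionCodec.getCodec(codecId, codecConfig)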
/**
* Base class for batched compressors
 * @param maxBatchMemorySize The upper limit in bytes of temporary and output memory usage at
 *                           which a batch should be compressed. A single table that requires
 *                           temporary and output memory above this limit is allowed but will
 *                           be compressed individually.
* @param stream CUDA stream to use
*/
abstract class BatchedTableCompressor(maxBatchMemorySize: Long, stream: Cuda.Stream)
extends AutoCloseable with Logging {
// The tables that need to be compressed in the next batch
private[this] val tables = new ArrayBuffer[ContiguousTable]
// The compressed outputs of all tables across all batches
private[this] val results = new ArrayBuffer[CompressedTable]
// temporary and output memory being used as part of the current batch
private[this] var batchMemUsed: Long = 0
/**
* Add a contiguous table to be batch-compressed. Ownership of the table is transferred to the
* batch compressor which is responsible for closing the table.
* @param contigTable the contiguous table to be compressed
*/
def addTableToCompress(contigTable: ContiguousTable): Unit = {
closeOnExcept(contigTable) { contigTable =>
// use original input size as a conservative estimate of compressed output size
val memNeededToCompressThisBuffer = contigTable.getBuffer.getLength
if (batchMemUsed + memNeededToCompressThisBuffer > maxBatchMemorySize) {
compressBatch()
}
tables += contigTable
batchMemUsed += memNeededToCompressThisBuffer
}
}
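  // Illustration of the batching policy above: with maxBatchMemorySize = 100 and incoming
  // table buffers of 60, 50, and 120 bytes, adding the 50-byte table first flushes the
  // 60-byte batch (60 + 50 > 100), adding the 120-byte table flushes the 50-byte batch, and
  // the oversized 120-byte table then forms a batch of its own when finish() is called.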
  /**
   * Add an array of contiguous tables to be compressed. The tables will be closed by the
   * batch compressor.
   * @param contigTables contiguous tables to compress
   */
  def addTables(contigTables: Array[ContiguousTable]): Unit = {
    var i = 0
    try {
      contigTables.foreach { ct =>
        addTableToCompress(ct)
        i += 1
      }
    } catch {
      case t: Throwable =>
        // the table at index i was already closed by closeOnExcept inside
        // addTableToCompress, so only the tables after it still need to be closed here
        contigTables.drop(i + 1).foreach(_.safeClose())
        throw t
    }
  }
/**
* This must be called after all tables to be compressed have been added to retrieve the
* compression results.
* @note the table IDs in the TableMeta of all tables will be set to zero
* @return compressed tables
*/
def finish(): Array[CompressedTable] = {
// compress the last batch
compressBatch()
val compressedTables = results.toArray
results.clear()
// Ensure we synchronize on the CUDA stream, because `CompressedTable` instances
// could be copied to host during a spill before we are done.
// TODO: A better way to do this would be via CUDA events, synchronizing on the event
// instead of the whole stream
stream.sync()
compressedTables
}
/** Must be closed to release the resources owned by the batch compressor */
override def close(): Unit = {
tables.safeClose()
tables.clear()
results.safeClose()
results.clear()
}
private def compressBatch(): Unit = if (tables.nonEmpty) {
withResource(new NvtxRange("batch compress", NvtxColor.ORANGE)) { _ =>
val startTime = System.nanoTime()
      val compressedTables = compress(tables.toArray, stream)
      require(compressedTables.length == tables.length)
      results ++= compressedTables
if (log.isDebugEnabled) {
val duration = (System.nanoTime() - startTime).toFloat
val inputSize = tables.map(_.getBuffer.getLength).sum
val outputSize = compressedTables.map(_.compressedSize).sum
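        // duration is in nanoseconds, so bytes per nanosecond is numerically the same as GB/s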
logDebug(s"Compressed ${tables.length} tables from $inputSize to $outputSize " +
s"in ${duration / 1000000} msec rate=${inputSize / duration} GB/s " +
s"ratio=${outputSize.toFloat/inputSize}")
}
// free the inputs to this batch
tables.safeClose()
tables.clear()
batchMemUsed = 0
}
}
/**
* Reallocates and copies data for oversized compressed data buffers due to inaccurate estimates
* of the compressed output size. If the buffer is already the appropriate size then no copy
* is performed.
* @note This method takes ownership of the tables and is responsible for closing them.
* @param tables compressed tables to resize
* @return right-sized compressed tables
*/
protected def resizeOversizedOutputs(tables: Array[CompressedTable]): Array[CompressedTable] = {
withResource(new NvtxRange("copy compressed buffers", NvtxColor.PURPLE)) { _ =>
withResource(tables) { _ =>
tables.safeMap { ct =>
val newBuffer = if (ct.buffer.getLength > ct.compressedSize) {
closeOnExcept(DeviceMemoryBuffer.allocate(ct.compressedSize)) { buffer =>
buffer.copyFromDeviceBufferAsync(
0, ct.buffer, 0, ct.compressedSize, stream)
buffer
}
} else {
ct.buffer.incRefCount()
ct.buffer
}
CompressedTable(ct.compressedSize, ct.meta, newBuffer)
}
}
}
}
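  // For example, if compression produced 300 KiB of valid data in a 1 MiB scratch output
  // buffer, the method above copies the data asynchronously on `stream` into a right-sized
  // 300 KiB buffer so the oversized allocation can be released promptly; otherwise the
  // original buffer is reused with an extra reference.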
/**
* Batch-compress contiguous tables
* @param tables contiguous tables to compress
* @param stream CUDA stream to use
* @return compressed tables. Table IDs in the `TableMeta` should be set to 0.
*/
protected def compress(
tables: Array[ContiguousTable],
stream: Cuda.Stream): Array[CompressedTable]
}
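// A minimal sketch of what a concrete subclass might look like, loosely modeled on the
// COPY codec (hypothetical: the real implementations live elsewhere in the plugin, and the
// exact MetaUtils.buildTableMeta signature is assumed):
//
//   class SketchCopyCompressor(maxBatchMemorySize: Long, stream: Cuda.Stream)
//       extends BatchedTableCompressor(maxBatchMemorySize, stream) {
//     override protected def compress(
//         tables: Array[ContiguousTable],
//         stream: Cuda.Stream): Array[CompressedTable] = {
//       tables.safeMap { ct =>
//         val input = ct.getBuffer
//         closeOnExcept(DeviceMemoryBuffer.allocate(input.getLength)) { output =>
//           output.copyFromDeviceBufferAsync(0, input, 0, input.getLength, stream)
//           val meta = MetaUtils.buildTableMeta(None, ct, CodecType.COPY, output.getLength)
//           CompressedTable(output.getLength, meta, output)
//         }
//       }
//     }
//   }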
/**
* Base class for batched decompressors
 * @param maxBatchMemorySize The upper limit in bytes of output memory usage at which a batch
 *                           should be decompressed. A single buffer that requires output memory
 *                           above this limit is allowed but will be decompressed individually.
* @param stream CUDA stream to use
*/
abstract class BatchedBufferDecompressor(maxBatchMemorySize: Long, stream: Cuda.Stream)
extends AutoCloseable with Logging {
// The buffers of compressed data that will be decompressed in the next batch
private[this] val inputBuffers = new ArrayBuffer[BaseDeviceMemoryBuffer]
// The output buffers that will contain the decompressed data in the next batch
private[this] val bufferMetas = new ArrayBuffer[BufferMeta]
// The decompressed data results for all input buffers across all batches
private[this] val results = new ArrayBuffer[DeviceMemoryBuffer]
// temporary and output memory being used as part of the current batch
private[this] var batchMemUsed: Long = 0
/** The codec ID corresponding to this decompressor */
val codecId: Byte
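  /**
   * Add a buffer to be batch-decompressed. Ownership of the buffer is transferred to the
   * batch decompressor which is responsible for closing it.
   * @param buffer buffer containing the compressed data
   * @param meta metadata describing the compressed buffer
   */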
def addBufferToDecompress(buffer: BaseDeviceMemoryBuffer, meta: BufferMeta): Unit = {
closeOnExcept(buffer) { buffer =>
// Only supports a single codec per buffer for now.
require(meta.codecBufferDescrsLength == 1)
val descr = meta.codecBufferDescrs(0)
require(descr.codec == codecId)
// Only support codec that consumes entire input buffer for now.
require(descr.compressedOffset == 0)
require(descr.compressedSize == buffer.getLength)
val outputNeeded = descr.uncompressedSize
if (batchMemUsed + outputNeeded > maxBatchMemorySize) {
decompressBatch()
}
batchMemUsed += outputNeeded
bufferMetas += meta
inputBuffers += buffer
}
}
/**
* This must be called after all buffers to be decompressed have been added to retrieve the
* decompression results. Note that the decompression may still be occurring asynchronously
* using the CUDA stream specified when the decompressor was instantiated.
 * @return decompressed buffers
*/
def finishAsync(): Array[DeviceMemoryBuffer] = {
// decompress the last batch
decompressBatch()
val resultsArray = results.toArray
results.clear()
resultsArray
}
override def close(): Unit = {
inputBuffers.safeClose()
inputBuffers.clear()
bufferMetas.clear()
results.safeClose()
results.clear()
}
protected def decompressBatch(): Unit = {
if (inputBuffers.nonEmpty) {
withResource(new NvtxRange("batch decompress", NvtxColor.ORANGE)) { _ =>
val startTime = System.nanoTime()
        val uncompressedBuffers =
          decompressAsync(inputBuffers.toArray, bufferMetas.toArray, stream)
        require(uncompressedBuffers.length == inputBuffers.length)
        results ++= uncompressedBuffers
        if (log.isDebugEnabled) {
          val duration = (System.nanoTime() - startTime).toFloat
val inputSize = inputBuffers.map(_.getLength).sum
val outputSize = uncompressedBuffers.map(_.getLength).sum
logDebug(s"Decompressed ${inputBuffers.length} buffers from $inputSize " +
s"to $outputSize in ${duration / 1000000} msec rate=${outputSize / duration} GB/s")
}
// free all the inputs to this batch
inputBuffers.safeClose()
inputBuffers.clear()
bufferMetas.clear()
batchMemUsed = 0
}
}
}
/**
* Decompress a batch of compressed buffers
* @param inputBuffers buffers that contain the compressed input
* @param bufferMetas corresponding metadata for each compressed input buffer
* @param stream CUDA stream to use
* @return buffers that contain the uncompressed output
*/
protected def decompressAsync(
inputBuffers: Array[BaseDeviceMemoryBuffer],
bufferMetas: Array[BufferMeta],
stream: Cuda.Stream): Array[DeviceMemoryBuffer]
}
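// A minimal usage sketch of the decompression side (hypothetical: `codec`, `buffers`, and
// `metas` are assumed to come from the surrounding shuffle code). finishAsync() may return
// before the work on the stream completes, so the caller must synchronize before reading
// the results from another stream or from the host:
//
//   withResource(codec.createBatchDecompressor(maxBatchMemorySize, Cuda.DEFAULT_STREAM)) {
//     decompressor =>
//       buffers.zip(metas).foreach { case (buf, meta) =>
//         decompressor.addBufferToDecompress(buf, meta)
//       }
//       val uncompressed = decompressor.finishAsync()
//       Cuda.DEFAULT_STREAM.sync()
//       uncompressed
//   }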