/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids

import java.io.{File, FileOutputStream, OutputStream}
import java.util.Random

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import ai.rapids.cudf._
import ai.rapids.cudf.ColumnWriterOptions._
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration

import org.apache.spark.internal.Logging
import org.apache.spark.sql.vectorized.ColumnarBatch

object DumpUtils extends Logging {
/**
* Debug utility to dump a host memory buffer to a file.
* @param conf Hadoop configuration
* @param data buffer containing data to dump to a file
* @param offset starting offset in the buffer of the data
* @param len size of the data in bytes
* @param prefix prefix for a path to write the data
* @param suffix suffix for a path to write the data
 * @return Hadoop path for where the data was written, or an empty string on error
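 *
 * Example (a sketch; assumes a Hadoop `conf` and a HostMemoryBuffer `hostBuf`
 * already exist in the caller):
 * {{{
 *   val path = DumpUtils.dumpBuffer(conf, hostBuf, 0, hostBuf.getLength,
 *     "/tmp/debug-buffer-", ".bin")
 * }}}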
*/
def dumpBuffer(
conf: Configuration,
data: HostMemoryBuffer,
offset: Long,
len: Long,
prefix: String,
suffix: String): String = {
try {
val (out, path) = FileUtils.createTempFile(conf, prefix, suffix)
withResource(out) { _ =>
withResource(data.slice(offset, len)) { hmb =>
withResource(new HostMemoryInputStream(hmb, hmb.getLength)) { in =>
IOUtils.copy(in, out)
}
}
}
path.toString
    } catch {
      case e: Exception =>
        logError("Error attempting to dump data", e)
        ""
    }
}
/**
 * Debug utility to dump a columnar batch to a Parquet file.
 * It runs on the GPU. Parquet column names are generated from the batch's type info.
 *
 * @param columnarBatch the columnar batch to be dumped; must be a GPU columnar batch
 * @param filePrefix parquet file prefix, e.g. /tmp/my-debug-prefix-
 * @return parquet file path if the dump succeeds, e.g. /tmp/my-debug-prefix-123.parquet
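 *
 * Example (a sketch; assumes `batch` is a GPU ColumnarBatch owned by the caller):
 * {{{
 *   DumpUtils.dumpToParquetFile(batch, "/tmp/my-debug-prefix-")
 *     .foreach(path => logWarning(s"batch dumped to $path"))
 * }}}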
*/
def dumpToParquetFile(columnarBatch: ColumnarBatch, filePrefix: String): Option[String] = {
if (columnarBatch.numCols() == 0) {
logWarning("dump to parquet failed, has no column, file prefix is " + filePrefix)
None
} else {
Some(dumpToParquetFileImpl(columnarBatch, filePrefix))
}
}
/**
 * Dump a columnar batch to an output stream in Parquet format.
 *
 * @param columnarBatch the columnar batch to be dumped; must be a GPU columnar batch,
 *                      and it remains owned (and closed) by the caller
 * @param outputStream the stream to write to; it will be closed after writing
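 *
 * Example (a sketch; dumps a `batch` into an in-memory buffer, which this call closes):
 * {{{
 *   val bytes = new java.io.ByteArrayOutputStream()
 *   DumpUtils.dumpToParquet(batch, bytes)
 * }}}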
*/
def dumpToParquet(columnarBatch: ColumnarBatch, outputStream: OutputStream): Unit = {
closeOnExcept(outputStream) { _ =>
withResource(GpuColumnVector.from(columnarBatch)) { table =>
withResource(new ParquetDumper(outputStream, table)) { dumper =>
dumper.writeTable(table)
}
}
}
}
/**
 * Debug utility to dump a table to a Parquet file.
 * It runs on the GPU. Parquet column names are generated from the table's column type info.
 *
 * @param table the table to be dumped; must be a GPU table
 * @param filePrefix parquet file prefix, e.g. /tmp/my-debug-prefix-
 * @return parquet file path if the dump succeeds, e.g. /tmp/my-debug-prefix-123.parquet
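 *
 * Example (a sketch; assumes `table` is a GPU Table owned by the caller):
 * {{{
 *   DumpUtils.dumpToParquetFile(table, "/tmp/my-debug-prefix-")
 * }}}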
*/
def dumpToParquetFile(table: Table, filePrefix: String): Option[String] = {
if (table.getNumberOfColumns == 0) {
logWarning("dump to parquet failed, has no column, file prefix is " + filePrefix)
None
} else {
      Some(dumpToParquetFileImpl(table, filePrefix))
}
}
  // Writes the table to a randomly named Parquet file built from the prefix.
  private def dumpToParquetFileImpl(table: Table, filePrefix: String): String = {
val path = genPath(filePrefix)
withResource(new ParquetDumper(path, table)) { dumper =>
dumper.writeTable(table)
path
}
}
  private def dumpToParquetFileImpl(columnarBatch: ColumnarBatch, filePrefix: String): String = {
    // convert the batch to a table, then dump the table
    withResource(GpuColumnVector.from(columnarBatch)) { table =>
      dumpToParquetFileImpl(table, filePrefix)
}
}
  // Generates a path from the prefix plus a random numeric suffix, retrying
  // until the resulting file does not already exist.
  private def genPath(filePrefix: String): String = {
var path = ""
val random = new Random
var succeeded = false
while (!succeeded) {
path = filePrefix + random.nextInt(Int.MaxValue) + ".parquet"
if (!new File(path).exists()) {
succeeded = true
}
}
path
}
}
/**
 * Dumps cudf tables to an output stream in Parquet format, receiving the
 * encoded bytes via [[HostBufferConsumer]] callbacks from the chunked writer.
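 *
 * Usage sketch (the dumper takes ownership of the stream and closes it in close();
 * `out` here stands for any caller-provided OutputStream):
 * {{{
 *   withResource(new ParquetDumper(out, table)) { dumper =>
 *     dumper.writeTable(table)
 *   }
 * }}}
 */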
class ParquetDumper(private val outputStream: OutputStream, table: Table)
  extends HostBufferConsumer with AutoCloseable {
private[this] val tempBuffer = new Array[Byte](128 * 1024)
def this(path: String, table: Table) = {
this(new FileOutputStream(path), table)
}
private lazy val tableWriter: TableWriter = {
    // avoid any data conversion, just dump the table as it is
val builder = ParquetDumper.parquetWriterOptionsFromTable(ParquetWriterOptions.builder(), table)
.withCompressionType(ParquetDumper.COMPRESS_TYPE)
Table.writeParquetChunked(builder.build(), this)
}
  // Called by the cudf chunked writer for each buffer of encoded Parquet bytes;
  // copies the buffer contents to the output stream.
  override def handleBuffer(buffer: HostMemoryBuffer, len: Long): Unit =
    ColumnarOutputWriter.writeBufferedData(mutable.Queue((buffer, len)), tempBuffer,
      outputStream)
def writeTable(table: Table): Unit = {
tableWriter.write(table)
}
/**
* Closes the [[ParquetDumper]]. Invoked on the executor side after all columnar batches
* are persisted, before the task output is committed.
*/
def close(): Unit = {
tableWriter.close()
outputStream.close()
}
}
// Running counter used to generate unique, positionally numbered column names.
private class ColumnIndex() {
var i = 0
def inc(): Int = {
i = i + 1
i
}
}
object ParquetDumper {
val COMPRESS_TYPE = CompressionType.SNAPPY
  // Builds Parquet writer options that mirror the table's schema exactly,
  // generating synthetic column names from each column's type.
  def parquetWriterOptionsFromTable[T <: NestedBuilder[_, _], V <: ColumnWriterOptions](
builder: ColumnWriterOptions.NestedBuilder[T, V],
table: Table): T = {
val cIndex = new ColumnIndex
withResource(new ArrayBuffer[ColumnView]) { toClose =>
for (i <- 0 until table.getNumberOfColumns) {
parquetWriterOptionsFromColumnView(builder, table.getColumn(i), cIndex, toClose)
}
builder.asInstanceOf[T]
}
}
private def parquetWriterOptionsFromColumnView[T <: NestedBuilder[_, _],
V <: ColumnWriterOptions](
builder: ColumnWriterOptions.NestedBuilder[T, V],
cv: ColumnView,
cIndex: ColumnIndex,
toClose: ArrayBuffer[ColumnView]): T = {
val dType = cv.getType
if (dType.isDecimalType) {
val precision = dType.getTypeId match {
case DType.DTypeEnum.DECIMAL32 => DType.DECIMAL32_MAX_PRECISION
case DType.DTypeEnum.DECIMAL64 => DType.DECIMAL64_MAX_PRECISION
case DType.DTypeEnum.DECIMAL128 => DType.DECIMAL128_MAX_PRECISION
case _ =>
throw new UnsupportedOperationException("unsupported type " + dType.getTypeId)
}
builder.withDecimalColumn(getTypeName(dType) + cIndex.inc(), precision, true)
} else if (dType == DType.STRUCT) {
val subBuilder = structBuilder("c_struct" + cIndex.inc(), true)
for (i <- 0 until cv.getNumChildren) {
val subCv = cv.getChildColumnView(i)
toClose += subCv
parquetWriterOptionsFromColumnView(subBuilder, subCv, cIndex, toClose)
}
builder.withStructColumn(subBuilder.build())
} else if (dType == DType.LIST) {
val subCv = cv.getChildColumnView(0)
toClose += subCv
builder.withListColumn(
parquetWriterOptionsFromColumnView(
listBuilder("c_list" + cIndex.inc(), true),
subCv,
cIndex,
toClose).build())
} else {
builder.withColumns(true, getTypeName(dType) + cIndex.inc())
}
builder.asInstanceOf[T]
}
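
  // Naming sketch (illustration only; assumes DType.toString yields names like
  // "INT32"): a table with columns (INT32, LIST<STRING>) produces
  //   c_INT321    - the INT32 column (counter value 1)
  //   c_list2     - the list column (counter value 2)
  //   c_STRING3   - the list's element column (counter value 3)
  // because ColumnIndex.inc() increments before returning, starting from 0.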
  // Derives a column name from the column type. Decimal types render with a
  // space in DType.toString (the scale follows the type id), so spaces are
  // replaced to keep the generated name a single token.
  private def getTypeName(t: DType): String = {
"c_" + t.toString.replace(' ', '_')
}
}