org.apache.spark.sql.rapids.GpuAvroScan.scala
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.rapids
import java.io.{FileNotFoundException, IOException, OutputStream}
import java.net.URI
import java.util.concurrent.{Callable, TimeUnit}
import scala.annotation.tailrec
import scala.collection.JavaConverters.mapAsScalaMapConverter
import scala.collection.mutable.{ArrayBuffer, LinkedHashMap}
import scala.language.implicitConversions
import ai.rapids.cudf.{AvroOptions => CudfAvroOptions, HostMemoryBuffer, NvtxColor, NvtxRange, Table}
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import com.nvidia.spark.rapids.GpuMetric.{BUFFER_TIME, FILTER_TIME, GPU_DECODE_TIME, NUM_OUTPUT_BATCHES, READ_FS_TIME, WRITE_BUFFER_TIME}
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.shims.ShimFilePartitionReaderFactory
import org.apache.avro.Schema
import org.apache.avro.file.DataFileConstants.SYNC_SIZE
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.apache.spark.TaskContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro.{AvroOptions, SchemaConverters}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
import org.apache.spark.sql.execution.QueryExecutionException
import org.apache.spark.sql.execution.datasources.{PartitionedFile, PartitioningAwareFileIndex}
import org.apache.spark.sql.execution.datasources.v2.FileScan
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.rapids.execution.TrampolineUtil
import org.apache.spark.sql.rapids.shims.AvroUtils
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.sql.v2.avro.AvroScan
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.SerializableConfiguration
object GpuAvroScan {
def tagSupport(scanMeta: ScanMeta[AvroScan]) : Unit = {
val scan = scanMeta.wrapped
tagSupport(
scan.sparkSession,
scan.readDataSchema,
scan.options.asScala.toMap,
scanMeta)
}
def tagSupport(
sparkSession: SparkSession,
readSchema: StructType,
options: Map[String, String],
meta: RapidsMeta[_, _, _]): Unit = {
if (!meta.conf.isAvroEnabled) {
meta.willNotWorkOnGpu("Avro input and output has been disabled. To enable set " +
s"${RapidsConf.ENABLE_AVRO} to true")
}
if (!meta.conf.isAvroReadEnabled) {
meta.willNotWorkOnGpu("Avro input has been disabled. To enable set " +
s"${RapidsConf.ENABLE_AVRO_READ} to true")
}
val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options)
val parsedOptions = new AvroOptions(options, hadoopConf)
AvroUtils.tagSupport(parsedOptions, meta)
FileFormatChecks.tag(meta, readSchema, AvroFormatType, ReadFileOp)
}
}
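// For illustration only (not part of the plugin source): a minimal sketch of a Spark
// session configured so that `tagSupport` above does not fall back to the CPU. The
// configuration keys are assumed to be the ones behind RapidsConf.ENABLE_AVRO and
// RapidsConf.ENABLE_AVRO_READ.
//
//   val spark = SparkSession.builder()
//     .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
//     .config("spark.rapids.sql.format.avro.enabled", "true")
//     .config("spark.rapids.sql.format.avro.read.enabled", "true")
//     .getOrCreate()
//   spark.read.format("avro").load("/path/to/data").show()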
case class GpuAvroScan(
sparkSession: SparkSession,
fileIndex: PartitioningAwareFileIndex,
dataSchema: StructType,
readDataSchema: StructType,
readPartitionSchema: StructType,
options: CaseInsensitiveStringMap,
pushedFilters: Array[Filter],
rapidsConf: RapidsConf,
partitionFilters: Seq[Expression] = Seq.empty,
dataFilters: Seq[Expression] = Seq.empty,
queryUsesInputFile: Boolean = false) extends FileScan with GpuScan {
override def isSplitable(path: Path): Boolean = true
override def createReaderFactory(): PartitionReaderFactory = {
val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
// Hadoop Configurations are case sensitive.
val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
val broadcastedConf = sparkSession.sparkContext.broadcast(
new SerializableConfiguration(hadoopConf))
val parsedOptions = new AvroOptions(caseSensitiveMap, hadoopConf)
// The partition values are already truncated in `FileScan.partitions`.
// We should use `readPartitionSchema` as the partition schema here.
if (rapidsConf.isAvroPerFileReadEnabled) {
GpuAvroPartitionReaderFactory(sparkSession.sessionState.conf, rapidsConf, broadcastedConf,
dataSchema, readDataSchema, readPartitionSchema, parsedOptions, metrics,
options.asScala.toMap)
} else {
GpuAvroMultiFilePartitionReaderFactory(sparkSession.sessionState.conf,
rapidsConf, broadcastedConf, dataSchema, readDataSchema, readPartitionSchema,
parsedOptions, metrics, pushedFilters, queryUsesInputFile)
}
}
// overrides nothing in 330
def withFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan =
this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters)
override def equals(obj: Any): Boolean = obj match {
case a: GpuAvroScan =>
super.equals(a) && dataSchema == a.dataSchema && options == a.options &&
equivalentFilters(pushedFilters, a.pushedFilters) && rapidsConf == a.rapidsConf &&
queryUsesInputFile == a.queryUsesInputFile
case _ => false
}
override def hashCode(): Int = getClass.hashCode()
override def description(): String = {
super.description() + ", PushedFilters: " + seqToString(pushedFilters)
}
override def withInputFile(): GpuScan = copy(queryUsesInputFile = true)
}
/** Avro partition reader factory to build columnar reader */
case class GpuAvroPartitionReaderFactory(
@transient sqlConf: SQLConf,
@transient rapidsConf: RapidsConf,
broadcastedConf: Broadcast[SerializableConfiguration],
dataSchema: StructType,
readDataSchema: StructType,
partitionSchema: StructType,
avroOptions: AvroOptions,
metrics: Map[String, GpuMetric],
@transient params: Map[String, String])
extends ShimFilePartitionReaderFactory(params) with Logging {
private val debugDumpPrefix = rapidsConf.avroDebugDumpPrefix
private val debugDumpAlways = rapidsConf.avroDebugDumpAlways
private val maxReadBatchSizeRows = rapidsConf.maxReadBatchSizeRows
private val maxReadBatchSizeBytes = rapidsConf.maxReadBatchSizeBytes
private val maxGpuColumnSizeBytes = rapidsConf.maxGpuColumnSizeBytes
override def supportColumnarReads(partition: InputPartition): Boolean = true
override def buildReader(partitionedFile: PartitionedFile): PartitionReader[InternalRow] = {
throw new IllegalStateException("ROW BASED PARSING IS NOT SUPPORTED ON THE GPU...")
}
override def buildColumnarReader(partFile: PartitionedFile): PartitionReader[ColumnarBatch] = {
val conf = broadcastedConf.value.value
val startTime = System.nanoTime()
val blockMeta = AvroFileFilterHandler(conf, avroOptions).filterBlocks(partFile)
metrics.get(FILTER_TIME).foreach {
_ += (System.nanoTime() - startTime)
}
val reader = new PartitionReaderWithBytesRead(new GpuAvroPartitionReader(conf, partFile,
blockMeta, readDataSchema, debugDumpPrefix, debugDumpAlways, maxReadBatchSizeRows,
maxReadBatchSizeBytes, metrics))
ColumnarPartitionReaderWithPartitionValues.newReader(partFile, reader, partitionSchema,
maxGpuColumnSizeBytes)
}
}
/**
* The multi-file partition reader factory for cloud or coalescing reading of avro file format.
*/
case class GpuAvroMultiFilePartitionReaderFactory(
@transient sqlConf: SQLConf,
@transient rapidsConf: RapidsConf,
broadcastedConf: Broadcast[SerializableConfiguration],
dataSchema: StructType,
readDataSchema: StructType,
partitionSchema: StructType,
options: AvroOptions,
metrics: Map[String, GpuMetric],
filters: Array[Filter],
queryUsesInputFile: Boolean)
extends MultiFilePartitionReaderFactoryBase(sqlConf, broadcastedConf, rapidsConf) {
private val debugDumpPrefix = rapidsConf.avroDebugDumpPrefix
private val debugDumpAlways = rapidsConf.avroDebugDumpAlways
private val ignoreMissingFiles = sqlConf.ignoreMissingFiles
private val ignoreCorruptFiles = sqlConf.ignoreCorruptFiles
private val numThreads = rapidsConf.multiThreadReadNumThreads
private val maxNumFileProcessed = rapidsConf.maxNumAvroFilesParallel
// We can't use the coalescing files reader when InputFileName, InputFileBlockStart,
// or InputFileBlockLength is used, because we are combining all the files into a single
// buffer and we don't know which file is associated with each row.
override val canUseCoalesceFilesReader: Boolean =
rapidsConf.isAvroCoalesceFileReadEnabled && !(queryUsesInputFile || ignoreCorruptFiles)
override val canUseMultiThreadReader: Boolean = rapidsConf.isAvroMultiThreadReadEnabled
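// A rough sketch of how the two flags above translate into a reader choice, assuming
// the base factory prefers the coalescing reader whenever it can be used and otherwise
// falls back to the multi-threaded cloud reader:
//
//   if (canUseCoalesceFilesReader)    -> GpuMultiFileAvroPartitionReader (coalescing)
//   else if (canUseMultiThreadReader) -> GpuMultiFileCloudAvroPartitionReader (cloud)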
/**
* File format short name used for logging and other things to uniquely identify
* which file format is being used.
*/
override final def getFileFormatShortName: String = "AVRO"
/**
* Build the PartitionReader for cloud reading
*/
@scala.annotation.nowarn(
"msg=value ignoreExtension in class AvroOptions is deprecated*"
)
override def buildBaseColumnarReaderForCloud(
partFiles: Array[PartitionedFile],
conf: Configuration): PartitionReader[ColumnarBatch] = {
val files = if (options.ignoreExtension) {
partFiles
} else {
partFiles.filter(_.filePath.toString().endsWith(".avro"))
}
new GpuMultiFileCloudAvroPartitionReader(conf, files, numThreads, maxNumFileProcessed,
filters, metrics, ignoreCorruptFiles, ignoreMissingFiles, debugDumpPrefix, debugDumpAlways,
readDataSchema, partitionSchema, maxReadBatchSizeRows, maxReadBatchSizeBytes,
maxGpuColumnSizeBytes)
}
/**
* Build the PartitionReader for coalescing reading
*
* @param files files to be read
* @param conf the configuration
* @return a PartitionReader of coalescing reading
*/
override def buildBaseColumnarReaderForCoalescing(
files: Array[PartitionedFile],
conf: Configuration): PartitionReader[ColumnarBatch] = {
val clippedBlocks = ArrayBuffer[AvroSingleDataBlockInfo]()
val mapPathHeader = LinkedHashMap[Path, Header]()
val filterHandler = AvroFileFilterHandler(conf, options)
val startTime = System.nanoTime()
files.foreach { file =>
val singleFileInfo = try {
filterHandler.filterBlocks(file)
} catch {
case e: FileNotFoundException if ignoreMissingFiles =>
logWarning(s"Skipped missing file: ${file.filePath}", e)
AvroBlockMeta(null, 0L, Seq.empty)
// Throw FileNotFoundException even if `ignoreCorruptFiles` is true
case e: FileNotFoundException if !ignoreMissingFiles => throw e
case e @(_: RuntimeException | _: IOException) if ignoreCorruptFiles =>
logWarning(
s"Skipped the rest of the content in the corrupted file: ${file.filePath}", e)
AvroBlockMeta(null, 0L, Seq.empty)
}
val fPath = new Path(new URI(file.filePath.toString()))
clippedBlocks ++= singleFileInfo.blocks.map(block =>
AvroSingleDataBlockInfo(
fPath,
AvroDataBlock(block),
file.partitionValues,
AvroSchemaWrapper(SchemaConverters.toAvroType(readDataSchema)),
readDataSchema,
AvroExtraInfo()))
if (singleFileInfo.blocks.nonEmpty) {
// No need to check the header since it cannot be null when blocks is non-empty here.
mapPathHeader.put(fPath, singleFileInfo.header)
}
}
val filterTime = System.nanoTime() - startTime
metrics.get(FILTER_TIME).foreach {
_ += filterTime
}
metrics.get("scanTime").foreach {
_ += TimeUnit.NANOSECONDS.toMillis(filterTime)
}
new GpuMultiFileAvroPartitionReader(conf, files, clippedBlocks.toSeq, readDataSchema,
partitionSchema, maxReadBatchSizeRows, maxReadBatchSizeBytes, maxGpuColumnSizeBytes,
numThreads, debugDumpPrefix, debugDumpAlways, metrics, mapPathHeader.toMap)
}
}
/** A trait collecting common methods across the 3 kinds of avro readers */
trait GpuAvroReaderBase extends Logging { self: FilePartitionReaderBase =>
def debugDumpPrefix: Option[String]
def debugDumpAlways: Boolean
def readDataSchema: StructType
def conf: Configuration
// Same default buffer size as the Parquet readers.
val cacheBufferSize = conf.getInt("avro.read.allocation.size", 8 * 1024 * 1024)
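// A hedged example of overriding the copy buffer size above ("avro.read.allocation.size")
// through the Hadoop configuration, here to 16 MiB, using Spark's standard
// "spark.hadoop." prefix at session build time:
//
//   val spark = SparkSession.builder()
//     .config("spark.hadoop.avro.read.allocation.size", (16 * 1024 * 1024).toString)
//     .getOrCreate()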
/**
* Read the host data to GPU for decoding, and return it as a cuDF Table.
* The input host buffer should contain valid data, otherwise the behavior is
* undefined.
* 'splits' is used only for debugging.
*/
protected final def sendToGpuUnchecked(
hostBuf: HostMemoryBuffer,
bufSize: Long,
splits: Array[PartitionedFile]): Table = {
val readOpts = CudfAvroOptions.builder()
.includeColumn(readDataSchema.fieldNames.toSeq: _*)
.build()
// about to start using the GPU
GpuSemaphore.acquireIfNecessary(TaskContext.get())
val table = try {
RmmRapidsRetryIterator.withRetryNoSplit[Table] {
withResource(new NvtxWithMetrics("Avro decode",
NvtxColor.DARK_GREEN, metrics(GPU_DECODE_TIME))) { _ =>
Table.readAvro(readOpts, hostBuf, 0, bufSize)
}
}
} catch {
case e: Exception =>
val dumpMsg = debugDumpPrefix.map { prefix =>
val p = DumpUtils.dumpBuffer(conf, hostBuf, 0, bufSize, prefix, ".avro")
s", data dumped to $p"
}.getOrElse("")
throw new IOException(
s"Error when processing file splits [${splits.mkString("; ")}]$dumpMsg", e)
}
closeOnExcept(table) { _ =>
debugDumpPrefix.foreach { prefix =>
if (debugDumpAlways) {
val p = DumpUtils.dumpBuffer(conf, hostBuf, 0, bufSize, prefix, ".avro")
logWarning(s"Wrote data for ${splits.mkString("; ")} to $p")
}
}
}
table
}
/**
* Send a host buffer to the GPU for decoding, and return it as a ColumnarBatch.
* The input hostBuf will be closed by this method, so do not use it afterwards.
* 'splits' is used only for debugging.
*/
protected final def sendToGpu(
hostBuf: HostMemoryBuffer,
bufSize: Long,
splits: Array[PartitionedFile]): Option[ColumnarBatch] = {
withResource(hostBuf) { _ =>
if (bufSize == 0) {
None
} else {
withResource(sendToGpuUnchecked(hostBuf, bufSize, splits)) { t =>
val batchSizeBytes = GpuColumnVector.getTotalDeviceMemoryUsed(t)
logDebug(s"GPU batch size: $batchSizeBytes bytes")
metrics(NUM_OUTPUT_BATCHES) += 1
// convert to batch
Some(GpuColumnVector.from(t, GpuColumnVector.extractTypes(readDataSchema)))
}
} // end of else
}
}
/**
* Get the block chunk according to the max batch size and max rows.
*
* @param blockIter blocks to be evaluated
* @param maxReadBatchSizeRows soft limit on the maximum number of rows the reader
* reads per batch
* @param maxReadBatchSizeBytes soft limit on the maximum number of bytes the reader
* reads per batch
* @return the next chunk of blocks to read into a single batch
*/
protected final def populateCurrentBlockChunk(
blockIter: BufferedIterator[BlockInfo],
maxReadBatchSizeRows: Int,
maxReadBatchSizeBytes: Long): Seq[BlockInfo] = {
val currentChunk = new ArrayBuffer[BlockInfo]
var numRows, numBytes, numAvroBytes: Long = 0
@tailrec
def readNextBatch(): Unit = {
if (blockIter.hasNext) {
val peekedRowGroup = blockIter.head
if (peekedRowGroup.count > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("Too many rows in split")
}
if (numRows == 0 || numRows + peekedRowGroup.count <= maxReadBatchSizeRows) {
val estBytes = GpuBatchUtils.estimateGpuMemory(readDataSchema, peekedRowGroup.count)
if (numBytes == 0 || numBytes + estBytes <= maxReadBatchSizeBytes) {
currentChunk += blockIter.next()
numRows += currentChunk.last.count
numAvroBytes += currentChunk.last.dataSize
numBytes += estBytes
readNextBatch()
}
}
}
}
readNextBatch()
logDebug(s"Loaded $numRows rows from Avro. bytes read: $numAvroBytes. " +
s"Estimated GPU bytes: $numBytes")
currentChunk.toSeq
}
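// A worked example of the soft limits above (hypothetical numbers): with
// maxReadBatchSizeRows = 1000 and buffered blocks holding 600, 600 and 300 rows,
// the first call returns only the first block (600 rows), because adding the second
// would exceed the row limit; the next call returns the remaining two blocks
// (900 rows). A single block larger than either limit is still returned on its own,
// since the first block of a chunk is always taken.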
/** Read a split into a host buffer, preparing for sending to GPU */
protected final def readPartFile(
partFilePath: Path,
blocks: Seq[BlockInfo],
headerSize: Long,
conf: Configuration): (HostMemoryBuffer, Long) = {
withResource(new NvtxRange("Avro buffer file split", NvtxColor.YELLOW)) { _ =>
if (blocks.isEmpty) {
// No need to check the header here since it cannot be null when blocks is non-empty.
return (null, 0L)
}
val estOutSize = estimateOutputSize(blocks, headerSize)
withResource(partFilePath.getFileSystem(conf).open(partFilePath)) { in =>
closeOnExcept(HostMemoryBuffer.allocate(estOutSize)) { hmb =>
withResource(new HostMemoryOutputStream(hmb)) { out =>
val headerAndBlocks = BlockInfo(0, headerSize, 0, 0) +: blocks
copyBlocksData(headerAndBlocks, in, out)
// check that we didn't write past the estimated size
if (out.getPos > estOutSize) {
throw new QueryExecutionException(s"Calculated buffer size $estOutSize is" +
s" too small, actual written: ${out.getPos}")
}
(hmb, out.getPos)
}
}
}
}
}
/** Estimate the total size from the given blocks and header */
protected final def estimateOutputSize(blocks: Seq[BlockInfo], headerSize: Long): Long = {
// Start from the Header
var totalSize: Long = headerSize
// Add all blocks
totalSize += blocks.map(_.blockSize).sum
totalSize
}
/** Copy the data specified by the blocks from `in` to `out` */
protected final def copyBlocksData(
blocks: Seq[BlockInfo],
in: FSDataInputStream,
out: OutputStream,
sync: Option[Array[Byte]] = None): Seq[BlockInfo] = {
val copyRanges = sync.map { s =>
assert(s.size >= SYNC_SIZE)
// If a sync marker is given, copy every block without its trailing sync marker. This
// is for the coalescing reader, which needs to append the given sync marker to
// each block, so sequential blocks cannot be merged.
blocks.map(b => CopyRange(b.blockStart, b.blockSize - SYNC_SIZE))
}.getOrElse(computeCopyRanges(blocks))
val copySyncFunc: OutputStream => Unit = if (sync.isEmpty) {
out => // do nothing
} else {
out => out.write(sync.get, 0, SYNC_SIZE)
}
// copy cache, default to 8MB
val copyCache = new Array[Byte](cacheBufferSize)
var readTime, writeTime = 0L
copyRanges.foreach { range =>
if (in.getPos != range.offset) {
in.seek(range.offset)
}
var bytesLeft = range.length
while (bytesLeft > 0) {
// downcast is safe because copyCache.length is an int
val readLength = Math.min(bytesLeft, copyCache.length).toInt
val start = System.nanoTime()
in.readFully(copyCache, 0, readLength)
val mid = System.nanoTime()
out.write(copyCache, 0, readLength)
val end = System.nanoTime()
readTime += (mid - start)
writeTime += (end - mid)
bytesLeft -= readLength
}
// append the sync marker to the block data for the coalescing reader.
copySyncFunc(out)
}
metrics.get(READ_FS_TIME).foreach(_.add(readTime))
metrics.get(WRITE_BUFFER_TIME).foreach(_.add(writeTime))
blocks
}
/**
* Calculate the copy ranges from the blocks, combining sequential
* blocks into a single range when possible.
*/
private def computeCopyRanges(blocks: Seq[BlockInfo]): Seq[CopyRange] = {
var currentCopyStart, currentCopyEnd = 0L
val copyRanges = new ArrayBuffer[CopyRange]()
blocks.foreach { block =>
if (currentCopyEnd != block.blockStart) {
if (currentCopyEnd != 0) {
copyRanges.append(CopyRange(currentCopyStart, currentCopyEnd - currentCopyStart))
}
currentCopyStart = block.blockStart
currentCopyEnd = currentCopyStart
}
currentCopyEnd += block.blockSize
}
if (currentCopyEnd != currentCopyStart) {
copyRanges.append(CopyRange(currentCopyStart, currentCopyEnd - currentCopyStart))
}
copyRanges.toSeq
}
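// For illustration (hypothetical offsets): blocks located at (start=100, size=50),
// (start=150, size=60) and (start=300, size=40) produce two ranges,
// CopyRange(100, 110) and CopyRange(300, 40), because the first two blocks are
// contiguous and get merged into a single copy.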
}
/** A PartitionReader that reads an AVRO file split on the GPU. */
class GpuAvroPartitionReader(
override val conf: Configuration,
partFile: PartitionedFile,
blockMeta: AvroBlockMeta,
override val readDataSchema: StructType,
override val debugDumpPrefix: Option[String],
override val debugDumpAlways: Boolean,
maxReadBatchSizeRows: Integer,
maxReadBatchSizeBytes: Long,
execMetrics: Map[String, GpuMetric])
extends FilePartitionReaderBase(conf, execMetrics) with GpuAvroReaderBase {
private val partFilePath = new Path(new URI(partFile.filePath.toString()))
private val blockIterator: BufferedIterator[BlockInfo] = blockMeta.blocks.iterator.buffered
override def next(): Boolean = {
if (batchIter.hasNext) {
return true
}
batchIter = EmptyGpuColumnarBatchIterator
if (!isDone) {
if (!blockIterator.hasNext) {
isDone = true
} else {
batchIter = readBatch() match {
case Some(batch) => new SingleGpuColumnarBatchIterator(batch)
case _ => EmptyGpuColumnarBatchIterator
}
}
}
// NOTE: At this point, the task may not have yet acquired the semaphore if `batch` is `None`.
// We are not acquiring the semaphore here since this next() is getting called from
// the `PartitionReaderIterator` which implements a standard iterator pattern, and
// advertises `hasNext` as false when we return false here. No downstream tasks should
// try to call next after `hasNext` returns false, and any task that produces some kind of
// data when `hasNext` is false is responsible for acquiring the semaphore itself.
batchIter.hasNext
}
private def readBatch(): Option[ColumnarBatch] = {
withResource(new NvtxRange("Avro readBatch", NvtxColor.GREEN)) { _ =>
val currentChunkedBlocks = populateCurrentBlockChunk(blockIterator,
maxReadBatchSizeRows, maxReadBatchSizeBytes)
if (readDataSchema.isEmpty) {
// not reading any data, so return a degenerate ColumnarBatch with the row count
val numRows = currentChunkedBlocks.map(_.count).sum.toInt
if (numRows == 0) {
None
} else {
Some(new ColumnarBatch(Array.empty, numRows.toInt))
}
} else {
if (currentChunkedBlocks.isEmpty) {
None
} else {
val (dataBuffer, dataSize) = metrics(BUFFER_TIME).ns {
readPartFile(partFilePath, currentChunkedBlocks,
blockMeta.headerSize, conf)
}
sendToGpu(dataBuffer, dataSize, Array(partFile))
}
}
}
}
}
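// Example (a sketch) of a query that exercises the empty readDataSchema path in
// readBatch() above: a plain row count reads no columns, so only degenerate
// batches carrying row counts are produced.
//
//   spark.read.format("avro").load("/path/to/data").count()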
/**
* A PartitionReader that can read multiple AVRO files in parallel.
* This is most efficient when running in a cloud environment where the I/O of reading is slow.
*
* When reading a file, it
* - seeks to the start position of the first block located in this partition.
* - next, parses the meta and sync, rewrites the meta and sync, and copies the data to a
* batch buffer per block, until reaching the last one of the current partition.
* - finally, sends the batches to the GPU.
*
* @param conf the Hadoop configuration
* @param files the partitioned files to read
* @param numThreads the size of the threadpool
* @param maxNumFileProcessed a threshold controlling the maximum number of files to be
* submitted to the thread pool at the same time
* @param filters filters passed into the filterHandler
* @param execMetrics the metrics
* @param ignoreCorruptFiles Whether to ignore corrupt files
* @param ignoreMissingFiles Whether to ignore missing files
* @param debugDumpPrefix an optional path prefix to use for dumping the fabricated AVRO data
* @param debugDumpAlways whether to debug dump always or only on errors
* @param readDataSchema the Spark schema describing what will be read
* @param partitionSchema Schema of partitions.
* @param maxReadBatchSizeRows soft limit on the maximum number of rows to be read per batch
* @param maxReadBatchSizeBytes soft limit on the maximum number of bytes to be read per batch
* @param maxGpuColumnSizeBytes maximum number of bytes for a GPU column
*/
class GpuMultiFileCloudAvroPartitionReader(
override val conf: Configuration,
files: Array[PartitionedFile],
numThreads: Int,
maxNumFileProcessed: Int,
filters: Array[Filter],
execMetrics: Map[String, GpuMetric],
ignoreCorruptFiles: Boolean,
ignoreMissingFiles: Boolean,
override val debugDumpPrefix: Option[String],
override val debugDumpAlways: Boolean,
override val readDataSchema: StructType,
partitionSchema: StructType,
maxReadBatchSizeRows: Integer,
maxReadBatchSizeBytes: Long,
maxGpuColumnSizeBytes: Long)
extends MultiFileCloudPartitionReaderBase(conf, files, numThreads, maxNumFileProcessed, filters,
execMetrics, maxReadBatchSizeRows, maxReadBatchSizeBytes,
ignoreCorruptFiles) with MultiFileReaderFunctions with GpuAvroReaderBase {
override def readBatches(fileBufsAndMeta: HostMemoryBuffersWithMetaDataBase):
Iterator[ColumnarBatch] = fileBufsAndMeta match {
case buffer: AvroHostBuffersWithMeta =>
val bufsAndSizes = buffer.memBuffersAndSizes
val bufAndSizeInfo = bufsAndSizes.head
val partitionValues = buffer.partitionedFile.partitionValues
val batchIter = if (bufAndSizeInfo.hmb == null) {
// Not reading any data, but add in partition data if needed
// Someone is going to process this data, even if it is just a row count
GpuSemaphore.acquireIfNecessary(TaskContext.get())
val emptyBatch = new ColumnarBatch(Array.empty, bufAndSizeInfo.numRows.toInt)
BatchWithPartitionDataUtils.addSinglePartitionValueToBatch(emptyBatch,
partitionValues, partitionSchema, maxGpuColumnSizeBytes)
} else {
val maybeBatch = sendToGpu(bufAndSizeInfo.hmb, bufAndSizeInfo.bytes, files)
// We have to add the partition values here for this batch; we already verified that
// they are the same for all the blocks in this batch.
maybeBatch match {
case Some(batch) => BatchWithPartitionDataUtils.addSinglePartitionValueToBatch(batch,
partitionValues, partitionSchema, maxGpuColumnSizeBytes)
case None => EmptyGpuColumnarBatchIterator
}
}
// Update the current buffers
closeOnExcept(batchIter) { _ =>
if (bufsAndSizes.length > 1) {
val updatedBuffers = bufsAndSizes.drop(1)
currentFileHostBuffers = Some(buffer.copy(memBuffersAndSizes = updatedBuffers))
} else {
currentFileHostBuffers = None
}
}
batchIter
case t =>
throw new RuntimeException(s"Unknown avro buffer type: ${t.getClass.getSimpleName}")
}
override final def getFileFormatShortName: String = "AVRO"
override def getBatchRunner(
tc: TaskContext,
file: PartitionedFile,
origFile: Option[PartitionedFile],
config: Configuration,
filters: Array[Filter]): Callable[HostMemoryBuffersWithMetaDataBase] =
new ReadBatchRunner(tc, file, config, filters)
/** Two utility classes */
private case class AvroHostBuffersWithMeta(
override val partitionedFile: PartitionedFile,
override val memBuffersAndSizes: Array[SingleHMBAndMeta],
override val bytesRead: Long) extends HostMemoryBuffersWithMetaDataBase
private class ReadBatchRunner(
taskContext: TaskContext,
partFile: PartitionedFile,
config: Configuration,
filters: Array[Filter]) extends Callable[HostMemoryBuffersWithMetaDataBase] with Logging {
override def call(): HostMemoryBuffersWithMetaDataBase = {
TrampolineUtil.setTaskContext(taskContext)
try {
doRead()
} catch {
case e: FileNotFoundException if ignoreMissingFiles =>
logWarning(s"Skipped missing file: ${partFile.filePath}", e)
AvroHostBuffersWithMeta(partFile, Array(SingleHMBAndMeta.empty()), 0)
// Throw FileNotFoundException even if `ignoreCorruptFiles` is true
case e: FileNotFoundException if !ignoreMissingFiles => throw e
case e @(_: RuntimeException | _: IOException) if ignoreCorruptFiles =>
logWarning(
s"Skipped the rest of the content in the corrupted file: ${partFile.filePath}", e)
AvroHostBuffersWithMeta(partFile, Array(SingleHMBAndMeta.empty()), 0)
} finally {
TrampolineUtil.unsetTaskContext()
}
}
private def createBufferAndMeta(
arrayBufSize: Array[SingleHMBAndMeta],
startingBytesRead: Long): HostMemoryBuffersWithMetaDataBase = {
val bytesRead = fileSystemBytesRead() - startingBytesRead
AvroHostBuffersWithMeta(partFile, arrayBufSize, bytesRead)
}
private val stopPosition = partFile.start + partFile.length
/**
* Read the split to one or more batches.
* Here is an overview of the process:
* - some preparation
* - while (has next block in this split) {
* - 1) peek the current head block and estimate the batch buffer size
* - 2) read blocks as many as possible to fill the batch buffer
* - 3) One batch is done, append it to the list
* - }
* - post processing
*/
private def doRead(): HostMemoryBuffersWithMetaDataBase = {
val bufferStartTime = System.nanoTime()
val startingBytesRead = fileSystemBytesRead()
val result =
withResource(AvroFileReader.openDataReader(partFile.filePath.toString(), config)) {
reader =>
// Go to the start of the first block after the start position
reader.sync(partFile.start)
if (!reader.hasNextBlock || isDone) {
// no data, or we got closed before finishing; return a null buffer and zero size
createBufferAndMeta(
Array(SingleHMBAndMeta.empty()),
startingBytesRead)
} else {
val hostBuffers = new ArrayBuffer[SingleHMBAndMeta]
try {
val headerSize = reader.headerSize
var isBlockSizeEstimated = false
var estBlocksSize = 0L
var totalRowsNum = 0
var curBlock: MutableBlockInfo = null
while (reader.hasNextBlock && !reader.pastSync(stopPosition)) {
// Get the block metadata first
curBlock = reader.peekBlock(curBlock)
if (!isBlockSizeEstimated) {
// Initialize the estimated total block size.
// An AVRO file has no dedicated section for block metadata, and collecting the
// block metadata by walking the file is quite expensive for files in the cloud,
// so the target buffer size is not known ahead of time and has to be estimated:
//   "estimated total block size = partFile.length + additional space"
// The additional space is "one block length * 1.2" because we may move the
// start and stop positions when reading this split to keep the integrity of
// edge blocks. In the worst case the stop position is one byte after a block
// start, so the whole block has to be read into the current batch, and that
// block may be larger than the first block. Reserving an additional space of
// "one block length * 1.2" tries to avoid spilling it into a new batch.
estBlocksSize = partFile.length + (curBlock.blockSize * 1.2F).toLong
isBlockSizeEstimated = true
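// A worked example with hypothetical sizes: for a 128 MiB split whose first
// block is 4 MiB, the estimate is 128 MiB + (4 MiB * 1.2) ≈ 132.8 MiB.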
}
var estSizeToRead = if (estBlocksSize > maxReadBatchSizeBytes) {
maxReadBatchSizeBytes
} else if (estBlocksSize < curBlock.blockSize) {
// This may happen only for the last block.
logInfo("Less buffer is estimated, read the last block into a new batch.")
curBlock.blockSize
} else {
estBlocksSize
}
val optHmb = if (readDataSchema.nonEmpty) {
Some(HostMemoryBuffer.allocate(headerSize + estSizeToRead))
} else None
// Allocate the buffer for the header and blocks for a batch
closeOnExcept(optHmb) { _ =>
val optOut = optHmb.map { hmb =>
val out = new HostMemoryOutputStream(hmb)
// Write the header to the output stream
AvroFileWriter(out).writeHeader(reader.header)
out
}
// Read the block data to the output stream
var batchRowsNum: Int = 0
var batchSize: Long = 0
var hasNextBlock = true
do {
if (optOut.nonEmpty) {
reader.readNextRawBlock(optOut.get)
} else {
// skip the current block
reader.skipCurrentBlock()
}
batchRowsNum += curBlock.count.toInt
estSizeToRead -= curBlock.blockSize
batchSize += curBlock.blockSize
// Continue reading the next block into the current batch when
// - the next block exists, and
// - the remaining buffer is enough to hold the next block, and
// - the batch rows number does not go beyond the upper limit.
hasNextBlock = reader.hasNextBlock && !reader.pastSync(stopPosition)
if (hasNextBlock) {
curBlock = reader.peekBlock(curBlock)
}
} while (hasNextBlock && curBlock.blockSize <= estSizeToRead &&
batchRowsNum <= maxReadBatchSizeRows)
// One batch is done
optOut.foreach(out => hostBuffers +=
(SingleHMBAndMeta(optHmb.get, out.getPos, batchRowsNum, Seq.empty)))
totalRowsNum += batchRowsNum
estBlocksSize -= batchSize
}
} // end of while
val bufAndSize: Array[SingleHMBAndMeta] = if (readDataSchema.isEmpty) {
hostBuffers.foreach(_.hmb.safeClose(new Exception))
Array(SingleHMBAndMeta.empty(totalRowsNum))
} else if (isDone) {
// got closed before finishing; return a null buffer and zero size
hostBuffers.foreach(_.hmb.safeClose(new Exception))
Array(SingleHMBAndMeta.empty())
} else {
hostBuffers.toArray
}
createBufferAndMeta(bufAndSize, startingBytesRead)
} catch {
case e: Throwable =>
hostBuffers.foreach(_.hmb.safeClose(e))
throw e
}
}
} // end of withResource(reader)
val bufferTime = System.nanoTime() - bufferStartTime
// multi-file avro scanner does not filter and then buffer, it just buffers
result.setMetrics(0, bufferTime)
result
} // end of doRead
} // end of Class ReadBatchRunner
}
/**
* A PartitionReader that can read multiple AVRO files up to the certain size. It will
* coalesce small files together and copy the block data in a separate thread pool to speed
* up processing the small files before sending down to the GPU.
*/
class GpuMultiFileAvroPartitionReader(
override val conf: Configuration,
splits: Array[PartitionedFile],
clippedBlocks: Seq[AvroSingleDataBlockInfo],
override val readDataSchema: StructType,
partitionSchema: StructType,
maxReadBatchSizeRows: Integer,
maxReadBatchSizeBytes: Long,
maxGpuColumnSizeBytes: Long,
numThreads: Int,
override val debugDumpPrefix: Option[String],
override val debugDumpAlways: Boolean,
execMetrics: Map[String, GpuMetric],
mapPathHeader: Map[Path, Header])
extends MultiFileCoalescingPartitionReaderBase(conf, clippedBlocks,
partitionSchema, maxReadBatchSizeRows, maxReadBatchSizeBytes, maxGpuColumnSizeBytes, numThreads,
execMetrics) with GpuAvroReaderBase {
override def checkIfNeedToSplitDataBlock(
currentBlockInfo: SingleDataBlockInfo,
nextBlockInfo: SingleDataBlockInfo): Boolean = {
val nextHeader = mapPathHeader.get(nextBlockInfo.filePath).get
val curHeader = mapPathHeader.get(currentBlockInfo.filePath).get
// Split into another batch when a key exists in both headers' metadata
// and maps to different values.
if (Header.hasConflictInMetadata(nextHeader, curHeader)) {
logInfo(s"Avro metadata in the next file ${nextBlockInfo.filePath}" +
s" conflicts with the current one in file ${currentBlockInfo.filePath}," +
s" splitting it into a new batch!")
return true
}
false
}
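// For illustration (hypothetical metadata): if the current file's header carries
// "avro.codec" -> "snappy" while the next file's header carries
// "avro.codec" -> "deflate", the key maps to different values, so the next file's
// blocks start a new batch instead of being coalesced into the current one.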
override protected def createBatchContext(
chunkedBlocks: LinkedHashMap[Path, ArrayBuffer[DataBlockBase]],
clippedSchema: SchemaBase): BatchContext = {
// Get headers according to the input paths.
val headers = chunkedBlocks.keys.map(mapPathHeader.get(_).get)
// Merge the metadata; this is safe because compatibility has been verified
// in 'checkIfNeedToSplitDataBlock'.
val mergedHeader = Header.mergeMetadata(headers.toSeq)
assert(mergedHeader.nonEmpty, "No header exists")
AvroBatchContext(chunkedBlocks, clippedSchema, mergedHeader.get)
}
override def calculateEstimatedBlocksOutputSize(batchContext: BatchContext): Long = {
val allBlocks = batchContext.origChunkedBlocks.values.flatten.toSeq
val headerSize = Header.headerSizeInBytes(batchContext.mergedHeader)
estimateOutputSize(allBlocks, headerSize)
}
override def writeFileHeader(buffer: HostMemoryBuffer, bContext: BatchContext): Long = {
withResource(new HostMemoryOutputStream(buffer)) { out =>
AvroFileWriter(out).writeHeader(bContext.mergedHeader)
out.getPos
}
}
override def calculateFinalBlocksOutputSize(footerOffset: Long,
blocks: collection.Seq[DataBlockBase], batchContext: BatchContext): Long = {
// In 'calculateEstimatedBlocksOutputSize', we already computed the exact size of
// the header plus all the blocks.
footerOffset
}
override def writeFileFooter(buffer: HostMemoryBuffer, bufferSize: Long, footerOffset: Long,
blocks: Seq[DataBlockBase], bContext: BatchContext): (HostMemoryBuffer, Long) = {
// AVRO files have no footer, do nothing
(buffer, footerOffset)
}
override def readBufferToTablesAndClose(dataBuffer: HostMemoryBuffer, dataSize: Long,
clippedSchema: SchemaBase, readSchema: StructType,
extraInfo: ExtraInfo): GpuDataProducer[Table] = {
val tableReader = withResource(dataBuffer) { _ =>
new SingleGpuDataProducer(sendToGpuUnchecked(dataBuffer, dataSize, splits))
}
GpuDataProducer.wrap(tableReader) { table =>
if (readDataSchema.length < table.getNumberOfColumns) {
throw new QueryExecutionException(s"Expected ${readDataSchema.length} columns " +
s"but read ${table.getNumberOfColumns}")
}
metrics(NUM_OUTPUT_BATCHES) += 1
table
}
}
override final def getFileFormatShortName: String = "AVRO"
override def getBatchRunner(
tc: TaskContext,
file: Path,
outhmb: HostMemoryBuffer,
blocks: ArrayBuffer[DataBlockBase],
offset: Long,
batchContext: BatchContext): Callable[(Seq[DataBlockBase], Long)] =
new AvroCopyBlocksRunner(tc, file, outhmb, blocks, offset, batchContext)
// The runner that copies blocks to the given offset of a HostMemoryBuffer
class AvroCopyBlocksRunner(
taskContext: TaskContext,
file: Path,
outhmb: HostMemoryBuffer,
blocks: ArrayBuffer[DataBlockBase],
offset: Long,
batchContext: BatchContext) extends Callable[(Seq[DataBlockBase], Long)] {
private val headerSync = Some(batchContext.mergedHeader.sync)
override def call(): (Seq[DataBlockBase], Long) = {
TrampolineUtil.setTaskContext(taskContext)
try {
val startBytesRead = fileSystemBytesRead()
val res = withResource(outhmb) { _ =>
withResource(file.getFileSystem(conf).open(file)) { in =>
withResource(new HostMemoryOutputStream(outhmb)) { out =>
copyBlocksData(blocks.toSeq, in, out, headerSync)
}
}
}
val bytesRead = fileSystemBytesRead() - startBytesRead
(res, bytesRead)
} finally {
TrampolineUtil.unsetTaskContext()
}
}
}
// Some implicits for conversions between the base classes and the sub-classes
implicit def toAvroSchema(schema: SchemaBase): Schema =
schema.asInstanceOf[AvroSchemaWrapper].schema
implicit def toBlockInfo(block: DataBlockBase): BlockInfo =
block.asInstanceOf[AvroDataBlock].blockInfo
implicit def toBlockInfos(blocks: Seq[DataBlockBase]): Seq[BlockInfo] =
blocks.map(toBlockInfo(_))
implicit def toBlockBases(blocks: Seq[BlockInfo]): Seq[DataBlockBase] =
blocks.map(AvroDataBlock(_))
implicit def toAvroExtraInfo(in: ExtraInfo): AvroExtraInfo =
in.asInstanceOf[AvroExtraInfo]
implicit def toAvroBatchContext(in: BatchContext): AvroBatchContext =
in.asInstanceOf[AvroBatchContext]
}
/** A tool to filter Avro blocks */
case class AvroFileFilterHandler(
hadoopConf: Configuration,
@transient options: AvroOptions) extends Logging {
@scala.annotation.nowarn(
"msg=value ignoreExtension in class AvroOptions is deprecated*"
)
val ignoreExtension = options.ignoreExtension
def filterBlocks(partFile: PartitionedFile): AvroBlockMeta = {
if (ignoreExtension || partFile.filePath.toString().endsWith(".avro")) {
withResource(AvroFileReader.openMetaReader(partFile.filePath.toString(), hadoopConf)) {
reader =>
// Get only the blocks that belong to this split
reader.sync(partFile.start)
val partBlocks = reader.getPartialBlocks(partFile.start + partFile.length)
AvroBlockMeta(reader.header, reader.headerSize, partBlocks)
}
} else {
AvroBlockMeta(null, 0L, Seq.empty)
}
}
}
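// The (deprecated) ignoreExtension option checked above comes from the Avro data
// source options; a sketch of setting it from the read path:
//
//   spark.read.format("avro").option("ignoreExtension", "true").load("/path/with/suffixless/files")
//
// When it is false, only files whose names end in ".avro" have their blocks scanned.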
/**
* Avro block meta info
*
* @param header the header of the avro file
* @param headerSize the size of the header in bytes
* @param blocks the info of the blocks belonging to this file (or file split)
*/
case class AvroBlockMeta(header: Header, headerSize: Long, blocks: Seq[BlockInfo])
/**
* CopyRange to indicate from where to copy.
*
* @param offset from where to copy
* @param length how many bytes to copy
*/
private case class CopyRange(offset: Long, length: Long)
/** Extra information */
case class AvroExtraInfo() extends ExtraInfo
/** avro schema wrapper */
case class AvroSchemaWrapper(schema: Schema) extends SchemaBase {
override def isEmpty: Boolean = schema.getFields.isEmpty
}
/** avro BlockInfo wrapper */
case class AvroDataBlock(blockInfo: BlockInfo) extends DataBlockBase {
override def getRowCount: Long = blockInfo.count
override def getReadDataSize: Long = blockInfo.dataSize
override def getBlockSize: Long = blockInfo.blockSize
}
case class AvroSingleDataBlockInfo(
filePath: Path,
dataBlock: AvroDataBlock,
partitionValues: InternalRow,
schema: AvroSchemaWrapper,
readSchema: StructType,
extraInfo: AvroExtraInfo) extends SingleDataBlockInfo
case class AvroBatchContext(
override val origChunkedBlocks: LinkedHashMap[Path, ArrayBuffer[DataBlockBase]],
override val schema: SchemaBase,
mergedHeader: Header) extends BatchContext(origChunkedBlocks, schema)