/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.comet.parquet
import scala.collection.JavaConverters
import scala.collection.mutable
import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
import org.apache.parquet.hadoop.ParquetInputFormat
import org.apache.parquet.hadoop.metadata.ParquetMetadata
import org.apache.spark.TaskContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec
import org.apache.spark.sql.connector.read.InputPartition
import org.apache.spark.sql.connector.read.PartitionReader
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions
import org.apache.spark.sql.execution.datasources.v2.FilePartitionReaderFactory
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.SerializableConfiguration
import org.apache.comet.{CometConf, CometRuntimeException}
import org.apache.comet.shims.ShimSQLConf
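/**
 * A [[FilePartitionReaderFactory]] that creates Comet's native [[BatchReader]]s to read Parquet
 * files into [[ColumnarBatch]]es. Only columnar reads are supported: the row-based `buildReader`
 * and `createReader` paths throw [[UnsupportedOperationException]].
 *
 * @param sqlConf the SQL configuration captured on the driver (marked `@transient`, so it is not
 *   serialized to executors)
 * @param broadcastedConf the broadcast Hadoop configuration used on executors
 * @param readDataSchema the schema of the data columns to read
 * @param partitionSchema the schema of the partition columns
 * @param filters data source filters that may be pushed down to the Parquet reader
 * @param options Parquet read options
 * @param metrics SQL metrics updated by the native reader
 */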
case class CometParquetPartitionReaderFactory(
@transient sqlConf: SQLConf,
broadcastedConf: Broadcast[SerializableConfiguration],
readDataSchema: StructType,
partitionSchema: StructType,
filters: Array[Filter],
options: ParquetOptions,
metrics: Map[String, SQLMetric])
extends FilePartitionReaderFactory
with ShimSQLConf
with Logging {
private val isCaseSensitive = sqlConf.caseSensitiveAnalysis
private val useFieldId = CometParquetUtils.readFieldId(sqlConf)
private val ignoreMissingIds = CometParquetUtils.ignoreMissingIds(sqlConf)
private val pushDownDate = sqlConf.parquetFilterPushDownDate
private val pushDownTimestamp = sqlConf.parquetFilterPushDownTimestamp
private val pushDownDecimal = sqlConf.parquetFilterPushDownDecimal
private val pushDownStringPredicate = getPushDownStringPredicate(sqlConf)
private val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold
private val datetimeRebaseModeInRead = options.datetimeRebaseModeInRead
private val parquetFilterPushDown = sqlConf.parquetFilterPushDown
// Comet specific configurations
private val batchSize = CometConf.COMET_BATCH_SIZE.get(sqlConf)
// This is only accessed on executors, via the broadcast variable, so we don't want it to be
// materialized on the driver.
@transient private lazy val preFetchEnabled = {
val conf = broadcastedConf.value.value
conf.getBoolean(
CometConf.COMET_SCAN_PREFETCH_ENABLED.key,
CometConf.COMET_SCAN_PREFETCH_ENABLED.defaultValue.get)
}
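// State used only on the prefetch path: the readers pre-built for the current `FilePartition`,
// and any exception captured while building a reader, keyed by file so it can be rethrown later
// in `buildColumnarReader`.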
private var cometReaders: Iterator[BatchReader] = _
private val cometReaderExceptionMap = new mutable.HashMap[PartitionedFile, Throwable]()
// TODO: we may want to revisit this, as we're only going to support flat types at the beginning
override def supportColumnarReads(partition: InputPartition): Boolean = true
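// When prefetch is enabled, eagerly build a Comet reader for each file in the partition and
// submit prefetch tasks to a shared thread pool before delegating to the parent implementation,
// which creates the actual `PartitionReader` by calling `buildColumnarReader` per file.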
override def createColumnarReader(partition: InputPartition): PartitionReader[ColumnarBatch] = {
if (preFetchEnabled) {
val filePartition = partition.asInstanceOf[FilePartition]
val conf = broadcastedConf.value.value
val threadNum = conf.getInt(
CometConf.COMET_SCAN_PREFETCH_THREAD_NUM.key,
CometConf.COMET_SCAN_PREFETCH_THREAD_NUM.defaultValue.get)
val prefetchThreadPool = CometPrefetchThreadPool.getOrCreateThreadPool(threadNum)
this.cometReaders = filePartition.files
.map { file =>
// The `init()` call is deferred until the prefetch task begins. Otherwise we would hold
// too many resources for readers that are not yet ready to prefetch.
val cometReader = buildCometReader(file)
if (cometReader != null) {
cometReader.submitPrefetchTask(prefetchThreadPool)
}
cometReader
}
.toSeq
.toIterator
}
super.createColumnarReader(partition)
}
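// Comet only produces columnar batches, so the row-based `buildReader` path is not supported.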
override def buildReader(partitionedFile: PartitionedFile): PartitionReader[InternalRow] =
throw new UnsupportedOperationException("Comet doesn't support 'buildReader'")
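// Builds a native `BatchReader` for a single Parquet file, pushing down any convertible filters
// and registering a task-completion listener that closes the reader. When prefetch is enabled,
// a failure here is recorded in `cometReaderExceptionMap` and `null` is returned instead of
// throwing, so the error can be surfaced later from `buildColumnarReader`.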
private def buildCometReader(file: PartitionedFile): BatchReader = {
val conf = broadcastedConf.value.value
try {
val (datetimeRebaseSpec, footer, filters) = getFilter(file)
filters.foreach(pushed => ParquetInputFormat.setFilterPredicate(conf, pushed))
val cometReader = new BatchReader(
conf,
file,
footer,
batchSize,
readDataSchema,
isCaseSensitive,
useFieldId,
ignoreMissingIds,
datetimeRebaseSpec.mode == CORRECTED,
partitionSchema,
file.partitionValues,
JavaConverters.mapAsJavaMap(metrics))
val taskContext = Option(TaskContext.get)
taskContext.foreach(_.addTaskCompletionListener[Unit](_ => cometReader.close()))
return cometReader
} catch {
case e: Throwable if preFetchEnabled =>
// Keep the original exception so it can be rethrown later in `buildColumnarReader`
cometReaderExceptionMap.put(file, e)
}
null
}
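// Creates the per-file partition reader. Without prefetch, the Comet reader is built and
// initialized here; with prefetch, the pre-built reader is taken from `cometReaders` after
// rethrowing any exception recorded for this file in `buildCometReader`.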
override def buildColumnarReader(file: PartitionedFile): PartitionReader[ColumnarBatch] = {
val cometReader = if (!preFetchEnabled) {
// Prefetch is not enabled, so create the Comet reader and initialize it.
val cometReader = buildCometReader(file)
cometReader.init()
cometReader
} else {
// If prefetch is enabled, we already tried to access the file in `buildCometReader`.
// It is possible that we got an exception such as `FileNotFoundException`, and we need to
// throw it now to let Spark handle it.
val reader = cometReaders.next()
val exception = cometReaderExceptionMap.get(file)
exception.foreach(e => throw e)
if (reader == null) {
throw new CometRuntimeException(s"Cannot find comet file reader for $file")
}
reader
}
CometPartitionReader(cometReader)
}
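/**
 * Reads the Parquet footer of `file`, derives the datetime rebase spec, and, if filter pushdown
 * is enabled, converts the data source filters into a single Parquet [[FilterPredicate]].
 * Predicates that cannot be converted are dropped.
 */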
def getFilter(file: PartitionedFile): (RebaseSpec, ParquetMetadata, Option[FilterPredicate]) = {
val sharedConf = broadcastedConf.value.value
val footer = FooterReader.readFooter(sharedConf, file)
val footerFileMetaData = footer.getFileMetaData
val datetimeRebaseSpec = CometParquetFileFormat.getDatetimeRebaseSpec(
file,
readDataSchema,
sharedConf,
footerFileMetaData,
datetimeRebaseModeInRead)
val pushed = if (parquetFilterPushDown) {
val parquetSchema = footerFileMetaData.getSchema
val parquetFilters = new ParquetFilters(
parquetSchema,
pushDownDate,
pushDownTimestamp,
pushDownDecimal,
pushDownStringPredicate,
pushDownInFilterThreshold,
isCaseSensitive,
datetimeRebaseSpec)
filters
// Collects all converted Parquet filter predicates. Notice that not all predicates can be
// converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
// is used here.
.flatMap(parquetFilters.createFilter)
.reduceOption(FilterApi.and)
} else {
None
}
(datetimeRebaseSpec, footer, pushed)
}
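// Like `buildReader`, the row-based `createReader` path is unsupported; Spark is expected to
// use `createColumnarReader` since `supportColumnarReads` returns true.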
override def createReader(inputPartition: InputPartition): PartitionReader[InternalRow] =
throw new UnsupportedOperationException("Only 'createColumnarReader' is supported.")
/**
* A simple adapter on Comet's [[BatchReader]].
*/
protected case class CometPartitionReader(reader: BatchReader)
extends PartitionReader[ColumnarBatch] {
override def next(): Boolean = {
reader.nextBatch()
}
override def get(): ColumnarBatch = {
reader.currentBatch()
}
override def close(): Unit = {
reader.close()
}
}
}