/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.comet

import org.apache.spark._
import org.apache.spark.sql.comet.CometMetricNode
import org.apache.spark.sql.vectorized._

import org.apache.comet.CometConf.{COMET_BATCH_SIZE, COMET_BLOCKING_THREADS, COMET_DEBUG_ENABLED, COMET_EXEC_MEMORY_FRACTION, COMET_EXPLAIN_NATIVE_ENABLED, COMET_WORKER_THREADS}
import org.apache.comet.vector.NativeUtil

/**
 * An iterator used to execute a Comet native query. It takes input iterators that come from
 * Comet Scan and are expected to produce batches of Arrow arrays. While this iterator is being
 * consumed, it drains the input iterators and passes the Arrow arrays to the Comet native engine
 * by address. Even after the input iterators are exhausted, this iterator may keep executing the
 * native query, since blocking operators such as Sort and Aggregate can still be producing
 * output. `hasNext` can be used to check whether this iterator is exhausted (i.e. the native
 * query is done).
 *
 * @param inputs
 *   The input iterators, each producing a sequence of batches of Arrow arrays.
 * @param protobufQueryPlan
 *   The serialized bytes of the Spark execution plan.
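 * @example
 *   A minimal usage sketch (variable names here are illustrative, assuming a single upstream
 *   iterator of Arrow batches):
 *   {{{
 *   val it = new CometExecIterator(id, Seq(scanIterator), numOutputCols, serializedPlan, metrics)
 *   try {
 *     while (it.hasNext) {
 *       val batch = it.next()
 *       // consume batch; it is released when next() or close() is called
 *     }
 *   } finally {
 *     it.close()
 *   }
 *   }}}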
 */
class CometExecIterator(
    val id: Long,
    inputs: Seq[Iterator[ColumnarBatch]],
    numOutputCols: Int,
    protobufQueryPlan: Array[Byte],
    nativeMetrics: CometMetricNode)
    extends Iterator[ColumnarBatch] {

  private val nativeLib = new Native()
  private val nativeUtil = new NativeUtil()
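
  // Wrap each input iterator so its Arrow batches can be exported to the native plan by address.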
  private val cometBatchIterators = inputs.map { iterator =>
    new CometBatchIterator(iterator, nativeUtil)
  }.toArray
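
  // Create the native query plan from the serialized protobuf, passing the native configuration,
  // the wrapped input iterators, the metrics node, and a per-task memory manager.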
  private val plan = {
    val configs = createNativeConf
    nativeLib.createPlan(
      id,
      configs,
      cometBatchIterators,
      protobufQueryPlan,
      nativeMetrics,
      new CometTaskMemoryManager(id))
  }

  private var nextBatch: Option[ColumnarBatch] = None
  private var currentBatch: ColumnarBatch = null
  private var closed: Boolean = false

  /**
   * Creates a new configuration map to be passed to the native side.
   */
  private def createNativeConf: java.util.HashMap[String, String] = {
    val result = new java.util.HashMap[String, String]()

    val conf = SparkEnv.get.conf
    val maxMemory = CometSparkSessionExtensions.getCometMemoryOverhead(conf)
    // Only enable the unified memory manager when off-heap mode is enabled. Otherwise we use
    // DataFusion's built-in memory pool, initialized with the `memory_limit` and
    // `memory_fraction` settings below.
    result.put(
      "use_unified_memory_manager",
      String.valueOf(conf.get("spark.memory.offHeap.enabled", "false")))
result.put("memory_limit", String.valueOf(maxMemory))
result.put("memory_fraction", String.valueOf(COMET_EXEC_MEMORY_FRACTION.get()))
result.put("batch_size", String.valueOf(COMET_BATCH_SIZE.get()))
result.put("debug_native", String.valueOf(COMET_DEBUG_ENABLED.get()))
result.put("explain_native", String.valueOf(COMET_EXPLAIN_NATIVE_ENABLED.get()))
result.put("worker_threads", String.valueOf(COMET_WORKER_THREADS.get()))
result.put("blocking_threads", String.valueOf(COMET_BLOCKING_THREADS.get()))
    // Pass DataFusion session parameters through, stripping the mandatory `spark.` prefix, which
    // is not part of the DataFusion config key.
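    // For example, a key such as `spark.datafusion.execution.target_partitions` (illustrative)
    // would be forwarded to the native side as `datafusion.execution.target_partitions`.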
    conf.getAll.foreach {
      case (k, v) if k.startsWith("spark.datafusion") =>
        result.put(k.replaceFirst("spark\\.", ""), v)
      case _ =>
    }

    result
  }

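  /**
   * Executes the native plan far enough to produce the next output batch. Returns `None` once
   * the native query has finished producing output.
   */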
  def getNextBatch(): Option[ColumnarBatch] = {
    nativeUtil.getNextBatch(
      numOutputCols,
      (arrayAddrs, schemaAddrs) => {
        nativeLib.executePlan(plan, arrayAddrs, schemaAddrs)
      })
  }

  override def hasNext: Boolean = {
    if (closed) return false

    if (nextBatch.isDefined) {
      return true
    }

    nextBatch = getNextBatch()

    if (nextBatch.isEmpty) {
      close()
      false
    } else {
      true
    }
  }

  override def next(): ColumnarBatch = {
    if (currentBatch != null) {
      // Eagerly release the Arrow arrays in the previous batch
      currentBatch.close()
      currentBatch = null
    }

    if (nextBatch.isEmpty && !hasNext) {
      throw new NoSuchElementException("No more elements")
    }

    currentBatch = nextBatch.get
    nextBatch = None
    currentBatch
  }

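  /**
   * Releases the current batch, the Arrow FFI helper, and the native plan. The `closed` flag
   * makes repeated calls a no-op.
   */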
  def close(): Unit = synchronized {
    if (!closed) {
      if (currentBatch != null) {
        currentBatch.close()
        currentBatch = null
      }
      nativeUtil.close()
      nativeLib.releasePlan(plan)
      // The allocator thinks the exported ArrowArray and ArrowSchema structs have not been
      // released, so it will report:
      //   Caused by: java.lang.IllegalStateException: Memory was leaked by query.
      //   Memory leaked: (516) Allocator(ROOT) 0/516/808/9223372036854775807 (res/actual/peak/limit)
      // This appears to be a false positive, because profiling shows no memory leak on the JVM
      // side. The allocator reports a leak because it tracks the accumulated amount of memory
      // allocated for ArrowArray and ArrowSchema, but these exported structs are released on the
      // native side later.
      // To clarify: for ArrowArray and ArrowSchema, Arrow puts a release field into the memory
      // region, which is a callback function pointer (a C function) that can also be called from
      // native code to release these structs. Once their memory addresses are wrapped on the
      // native side using FFI ArrowArray and ArrowSchema and later dropped, the callback is
      // invoked and the memory is released.
      // The JVM allocator, however, doesn't know about this, so it still keeps the accumulated
      // count.
      // Manually calling `release` and `close` would make the allocator happy, but it causes a
      // JVM runtime failure.
      // allocator.close()
      closed = true
    }
  }
}