All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.comet.vector.NativeUtil.scala Maven / Gradle / Ivy

There is a newer version: 0.4.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.comet.vector

import scala.collection.mutable

import org.apache.arrow.c.{ArrowArray, ArrowImporter, ArrowSchema, CDataDictionaryProvider, Data}
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.dictionary.DictionaryProvider
import org.apache.spark.SparkException
import org.apache.spark.sql.comet.util.Utils
import org.apache.spark.sql.vectorized.ColumnarBatch

import org.apache.comet.CometArrowAllocator

/**
 * Provides functionality for importing Arrow vectors from native code and wrapping them as
 * CometVectors.
 *
 * Also provides functionality for exporting Comet columnar batches to native code.
 *
 * Each instance of NativeUtil creates an instance of CDataDictionaryProvider (a
 * DictionaryProvider that is used in C Data Interface for imports).
 *
 * NativeUtil must be closed after use to release resources in the dictionary provider.
 */
class NativeUtil {
  import Utils._

  /** Use the global allocator */
  private val allocator = CometArrowAllocator

  /** ArrowImporter does not hold any state and does not need to be closed */
  private val importer = new ArrowImporter(allocator)

  /**
   * Dictionary provider to use for the lifetime of this instance of NativeUtil. The dictionary
   * provider is closed when NativeUtil is closed.
   */
  private val dictionaryProvider: CDataDictionaryProvider = new CDataDictionaryProvider

  /**
   * Allocates one empty `ArrowArray` struct and one empty `ArrowSchema` struct per column, using
   * the global allocator.
   *
   * The caller owns the returned structs: they must either be handed to [[importVector]] or be
   * closed explicitly.
   *
   * @param numCols
   *   the number of columns
   * @return
   *   a pair of Arrow arrays and Arrow schemas, each of length `numCols`
   */
  def allocateArrowStructs(numCols: Int): (Array[ArrowArray], Array[ArrowSchema]) = {
    val arrays = new Array[ArrowArray](numCols)
    val schemas = new Array[ArrowSchema](numCols)

    (0 until numCols).foreach { index =>
      val arrowSchema = ArrowSchema.allocateNew(allocator)
      val arrowArray = ArrowArray.allocateNew(allocator)
      arrays(index) = arrowArray
      schemas(index) = arrowSchema
    }

    (arrays, schemas)
  }

  /**
   * Exports every column of a Comet `ColumnarBatch` into the Arrow C Data Interface structs
   * located at the given memory addresses, so that they can be consumed by the native execution.
   *
   * @param arrayAddrs
   *   addresses of the `ArrowArray` structs to fill; entry `i` receives column `i`
   * @param schemaAddrs
   *   addresses of the `ArrowSchema` structs to fill; entry `i` receives column `i`
   * @param batch
   *   the input Comet columnar batch
   * @return
   *   the number of rows in `batch`
   * @throws SparkException
   *   if a column of `batch` is not a `CometVector`
   */
  def exportBatch(
      arrayAddrs: Array[Long],
      schemaAddrs: Array[Long],
      batch: ColumnarBatch): Int = {
    (0 until batch.numCols()).foreach { index =>
      batch.column(index) match {
        case a: CometVector =>
          val valueVector = a.getValueVector

          // Only dictionary-encoded vectors need a provider; the Arrow export API takes null
          // for "no dictionary".
          val provider = if (valueVector.getField.getDictionary != null) {
            a.getDictionaryProvider
          } else {
            null
          }

          // The array and schema structures are allocated by native side.
          // Don't need to deallocate them here.
          val arrowSchema = ArrowSchema.wrap(schemaAddrs(index))
          val arrowArray = ArrowArray.wrap(arrayAddrs(index))
          Data.exportVector(
            allocator,
            getFieldVector(valueVector, "export"),
            provider,
            arrowArray,
            arrowSchema)
        case c =>
          throw new SparkException(
            "Comet execution only takes Arrow Arrays, but got " +
              s"${c.getClass}")
      }
    }

    batch.numRows()
  }

  /**
   * Gets the next batch from native execution.
   *
   * Fresh Arrow structs are allocated for every call and their addresses are passed to `func`.
   * When `func` reports a batch, the structs are imported (which hands their ownership to the
   * importer); otherwise they are closed here so they do not leak from the allocator.
   *
   * @param numOutputCols
   *   The number of output columns
   * @param func
   *   The function to call to get the next batch. It receives the addresses of the Arrow arrays
   *   and Arrow schemas to populate and returns the number of rows produced, or -1 on EOF.
   * @return
   *   The next batch, or None if there are no more batches
   * @throws IllegalStateException
   *   if `func` returns a value that is neither -1 nor a row count in `[0, Int.MaxValue]`
   */
  def getNextBatch(
      numOutputCols: Int,
      func: (Array[Long], Array[Long]) => Long): Option[ColumnarBatch] = {
    val (arrays, schemas) = allocateArrowStructs(numOutputCols)

    val arrayAddrs = arrays.map(_.memoryAddress())
    val schemaAddrs = schemas.map(_.memoryAddress())

    val result = func(arrayAddrs, schemaAddrs)

    result match {
      case -1 =>
        // EOF: nothing will import the structs, so free them here.
        closeArrowStructs(arrays, schemas)
        None
      case numRows if numRows >= 0 && numRows <= Int.MaxValue =>
        val cometVectors = importVector(arrays, schemas)
        Some(new ColumnarBatch(cometVectors.toArray, numRows.toInt))
      case flag =>
        // Without the guard above this branch could never be reached.
        closeArrowStructs(arrays, schemas)
        throw new IllegalStateException(s"Invalid native flag: $flag")
    }
  }

  /** Closes Arrow structs that were allocated for a batch but never imported. */
  private def closeArrowStructs(arrays: Array[ArrowArray], schemas: Array[ArrowSchema]): Unit = {
    arrays.foreach(_.close())
    schemas.foreach(_.close())
  }

  /**
   * Imports a list of Arrow addresses from native execution, and return a list of Comet vectors.
   *
   * Entry `i` of `arrays` is paired with entry `i` of `schemas`; dictionaries are resolved
   * through this instance's dictionary provider.
   *
   * @param arrays
   *   a list of Arrow array
   * @param schemas
   *   a list of Arrow schema
   * @return
   *   a list of Comet vectors, one per entry of `arrays`
   */
  def importVector(arrays: Array[ArrowArray], schemas: Array[ArrowSchema]): Seq[CometVector] = {
    val arrayVectors = mutable.ArrayBuffer.empty[CometVector]

    arrays.indices.foreach { i =>
      // Native execution should always have 'useDecimal128' set to true since it doesn't support
      // other cases.
      arrayVectors += CometVector.getVector(
        importer.importVector(arrays(i), schemas(i), dictionaryProvider),
        true,
        dictionaryProvider)
    }
    arrayVectors.toSeq
  }

  /**
   * Takes zero-copy slices of the input batch with given start index and maximum number of rows.
   *
   * Every column of `batch` must be a `CometVector`. The returned batch always reports
   * `maxNumRows` rows, so the caller is expected to pass a range that lies within the input
   * batch.
   *
   * @param batch
   *   Input batch
   * @param startIndex
   *   Start index of the slice
   * @param maxNumRows
   *   Maximum number of rows in the slice
   * @return
   *   A new batch with the sliced vectors
   */
  def takeRows(batch: ColumnarBatch, startIndex: Int, maxNumRows: Int): ColumnarBatch = {
    val slicedColumns = (0 until batch.numCols()).map { i =>
      batch.column(i).asInstanceOf[CometVector].slice(startIndex, maxNumRows)
    }

    new ColumnarBatch(slicedColumns.toArray, maxNumRows)
  }

  /** Releases the resources held by this instance's dictionary provider. */
  def close(): Unit = {
    // closing the dictionary provider also closes the dictionary arrays
    dictionaryProvider.close()
  }
}

object NativeUtil {

  /**
   * Wraps the field vectors of an Arrow root as a `ColumnarBatch`, without a dictionary
   * provider.
   *
   * @param arrowRoot
   *   the Arrow root whose field vectors become the batch columns
   * @return
   *   a batch with one Comet vector per field vector and the root's row count
   */
  def rootAsBatch(arrowRoot: VectorSchemaRoot): ColumnarBatch = rootAsBatch(arrowRoot, null)

  /**
   * Wraps the field vectors of an Arrow root as a `ColumnarBatch`.
   *
   * @param arrowRoot
   *   the Arrow root whose field vectors become the batch columns
   * @param provider
   *   the dictionary provider handed to every created Comet vector; may be null
   * @return
   *   a batch with one Comet vector per field vector and the root's row count
   */
  def rootAsBatch(arrowRoot: VectorSchemaRoot, provider: DictionaryProvider): ColumnarBatch = {
    val columnCount = arrowRoot.getFieldVectors.size()

    val columns = for (idx <- 0 until columnCount) yield {
      // Native shuffle always uses decimal128, hence the `true` flag.
      CometVector.getVector(arrowRoot.getFieldVectors.get(idx), true, provider)
    }

    new ColumnarBatch(columns.toArray, arrowRoot.getRowCount)
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy