/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql

import org.apache.hudi.{HoodieUnsafeRDD, SparkAdapterSupport}

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.MutablePair

/**
 * Suite of utilities for handling instances of [[HoodieUnsafeRDD]]
 */
object HoodieUnsafeUtils {

  /**
   * Fetches the expected number of output partitions (based on the output [[Partitioning]]) of the provided [[DataFrame]]
   *
   * NOTE: Invoking [[QueryExecution#executedPlan]] does not actually execute the query (ie does not start
   *       pumping the data); it only runs Spark's resolution, optimization and physical planning stages,
   *       returning an instance of [[SparkPlan]] that is ready for execution
   */
  def getNumPartitions(df: DataFrame): Int = {
    // NOTE: In general, we rely on the [[outputPartitioning]] of the executable [[SparkPlan]] to determine
    //       the number of partitions the plan is going to be executed with.
    //       However, in the case of a [[LogicalRDD]] the plan's output partitioning will be stubbed as
    //       [[UnknownPartitioning]], and we therefore fall back to determining the number of partitions
    //       by looking at the RDD itself
    df.queryExecution.logical match {
      case LogicalRDD(_, rdd, outputPartitioning, _, _) =>
        outputPartitioning match {
          case _: UnknownPartitioning => rdd.getNumPartitions
          case _ => outputPartitioning.numPartitions
        }

      case _ => df.queryExecution.executedPlan.outputPartitioning.numPartitions
    }
  }
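
  // Illustrative usage sketch (not part of the original utility): for a DataFrame whose logical plan is
  // not a [[LogicalRDD]], the partition count comes from the executed plan's output partitioning;
  // `spark` below stands for an assumed active [[SparkSession]].
  //
  //   val df = spark.range(0, 100, 1, 4).toDF("id")
  //   val numPartitions = HoodieUnsafeUtils.getNumPartitions(df)   // expected to be 4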

  /**
   * Creates [[DataFrame]] from provided [[plan]]
   *
   * @param spark spark's session
   * @param plan given plan to wrap into [[DataFrame]]
   */
  def createDataFrameFrom(spark: SparkSession, plan: LogicalPlan): DataFrame =
    Dataset.ofRows(spark, plan)

  /**
   * Creates [[DataFrame]] from the in-memory [[Seq]] of [[Row]]s with provided [[schema]]
   *
   * NOTE: [[DataFrame]] is based on [[LocalRelation]], entailing that most computations with it
   *       will be executed by Spark locally
   *
   * @param spark  spark's session
   * @param rows   collection of rows to base [[DataFrame]] on
   * @param schema target [[DataFrame]]'s schema
   */
  def createDataFrameFromRows(spark: SparkSession, rows: Seq[Row], schema: StructType): DataFrame =
    Dataset.ofRows(spark, LocalRelation.fromExternalRows(
      SparkAdapterSupport.sparkAdapter.getSchemaUtils.toAttributes(schema), rows))
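
  // Illustrative usage sketch (assumes imports of [[Row]] and org.apache.spark.sql.types._, and an
  // active [[SparkSession]] named `spark`):
  //
  //   val schema = StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))
  //   val rows = Seq(Row(1, "a"), Row(2, "b"))
  //   val df = HoodieUnsafeUtils.createDataFrameFromRows(spark, rows, schema)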

  /**
   * Creates [[DataFrame]] from the in-memory [[Seq]] of [[InternalRow]]s with provided [[schema]]
   *
   * NOTE: [[DataFrame]] is based on [[LocalRelation]], entailing that most computations with it
   *       will be executed by Spark locally
   *
   * @param spark spark's session
   * @param rows collection of rows to base [[DataFrame]] on
   * @param schema target [[DataFrame]]'s schema
   */
  def createDataFrameFromInternalRows(spark: SparkSession, rows: Seq[InternalRow], schema: StructType): DataFrame =
    Dataset.ofRows(spark, LocalRelation(SparkAdapterSupport.sparkAdapter.getSchemaUtils.toAttributes(schema), rows))
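
  // Illustrative usage sketch; note that values have to be supplied in Catalyst's internal representation
  // (e.g. [[UTF8String]] for strings). `spark` is an assumed active [[SparkSession]]:
  //
  //   import org.apache.spark.unsafe.types.UTF8String
  //   val schema = StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))
  //   val rows = Seq(InternalRow(1, UTF8String.fromString("a")), InternalRow(2, UTF8String.fromString("b")))
  //   val df = HoodieUnsafeUtils.createDataFrameFromInternalRows(spark, rows, schema)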


  /**
   * Creates [[DataFrame]] from the [[RDD]] of [[InternalRow]]s with provided [[schema]]
   *
   * @param spark spark's session
   * @param rdd RDD w/ [[InternalRow]]s to base [[DataFrame]] on
   * @param schema target [[DataFrame]]'s schema
   */
  def createDataFrameFromRDD(spark: SparkSession, rdd: RDD[InternalRow], schema: StructType): DataFrame =
    spark.internalCreateDataFrame(rdd, schema)
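
  // Illustrative usage sketch (the RDD has to carry [[InternalRow]]s matching the provided schema);
  // `spark` is an assumed active [[SparkSession]]:
  //
  //   val rdd: RDD[InternalRow] = spark.sparkContext.parallelize(Seq(InternalRow(1L), InternalRow(2L)))
  //   val schema = StructType(Seq(StructField("id", LongType)))
  //   val df = HoodieUnsafeUtils.createDataFrameFromRDD(spark, rdd, schema)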

  /**
   * Canonical implementation of the [[RDD#collect]] for [[HoodieUnsafeRDD]], returning a properly
   * copied [[Array]] of [[InternalRow]]s
   */
  def collect(rdd: HoodieUnsafeRDD): Array[InternalRow] = {
    rdd.mapPartitionsInternal { iter =>
      // NOTE: We're leveraging [[MutablePair]] here to avoid unnecessary allocations, since
      //       a) iteration is performed lazily and b) iteration is single-threaded (w/in partition)
      val pair = new MutablePair[InternalRow, Null]()
      iter.map(row => pair.update(row.copy(), null))
    }
      .map(p => p._1)
      .collect()
  }
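
  // Illustrative usage sketch (`hoodieRDD` is an assumed instance of [[HoodieUnsafeRDD]]): because every
  // row is defensively copied before collection, the returned rows are safe to retain, unlike rows pulled
  // directly off a reused unsafe-row iterator.
  //
  //   val rows: Array[InternalRow] = HoodieUnsafeUtils.collect(hoodieRDD)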
}