/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hudi
import org.apache.hudi.client.utils.SparkRowSerDe
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.storage.StoragePath
import org.apache.avro.Schema
import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql._
import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSchemaConverters, HoodieAvroSerializer}
import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, InterpretedPredicate}
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
import org.apache.spark.sql.catalyst.util.DateFormatter
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, SparkParquetReader}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.parser.HoodieExtendedParserInterface
import org.apache.spark.sql.sources.{BaseRelation, Filter}
import org.apache.spark.sql.types.{DataType, Metadata, StructType}
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}
import org.apache.spark.storage.StorageLevel
import java.util.{Locale, TimeZone}
/**
* Interface adapting discrepancies and incompatibilities between different Spark versions
*/
trait SparkAdapter extends Serializable {
/**
* Checks whether the provided instance of [[InternalRow]] is actually an instance of [[ColumnarBatchRow]]
*/
def isColumnarBatchRow(r: InternalRow): Boolean
/**
* Creates Catalyst [[Metadata]] for Hudi's meta-fields, designating these with
* [[METADATA_COL_ATTR_KEY]] where available (i.e. in Spark >= 3.2)
*/
def createCatalystMetadataForMetaField: Metadata
/**
* Injects table-valued functions into [[SparkSessionExtensions]]
*/
def injectTableFunctions(extensions: SparkSessionExtensions): Unit = {}
/**
* Returns an instance of [[HoodieCatalogUtils]] providing common utilities for operating on Spark's
* [[TableCatalog]]s
*/
def getCatalogUtils: HoodieCatalogUtils
/**
* Returns an instance of [[HoodieCatalystExpressionUtils]] providing common utilities for operating
* on Catalyst [[Expression]]s
*/
def getCatalystExpressionUtils: HoodieCatalystExpressionUtils
/**
* Returns an instance of [[HoodieCatalystPlansUtils]] providing common utilities for operating
* on Catalyst [[LogicalPlan]]s
*/
def getCatalystPlanUtils: HoodieCatalystPlansUtils
/**
* Returns an instance of [[HoodieSchemaUtils]] providing schema utils.
*/
def getSchemaUtils: HoodieSchemaUtils
/**
* Creates an instance of [[HoodieAvroSerializer]] providing the ability to serialize
* Spark's [[InternalRow]]s into Avro payloads
*/
def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializer
/**
* Creates an instance of [[HoodieAvroDeserializer]] providing the ability to deserialize
* Avro payloads into Spark's [[InternalRow]]s
*/
def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializer
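// Round-trip sketch (illustrative; `avroSchema` and a matching Catalyst `catalystType` are assumed):
//   val serializer   = createAvroSerializer(catalystType, avroSchema, nullable = false)
//   val deserializer = createAvroDeserializer(avroSchema, catalystType)
// The serializer turns an [[InternalRow]] into an Avro payload that the deserializer maps back.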
/**
* Creates an instance of [[HoodieAvroSchemaConverters]] allowing conversion between Avro and Catalyst schemas
*/
def getAvroSchemaConverters: HoodieAvroSchemaConverters
/**
* Creates the [[SparkRowSerDe]] for the given schema.
*/
def createSparkRowSerDe(schema: StructType): SparkRowSerDe
/**
* Creates Hudi's extended Spark SQL parser.
*/
def createExtendedSparkParser(spark: SparkSession, delegate: ParserInterface): HoodieExtendedParserInterface
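// Sketch of how a concrete adapter typically wires the parser in (assumes an `adapter` instance;
// `injectParser` is the standard [[SparkSessionExtensions]] hook):
//   extensions.injectParser { (session, delegate) =>
//     adapter.createExtendedSparkParser(session, delegate)
//   }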
/**
* Gets the [[SparkParsePartitionUtil]].
*/
def getSparkParsePartitionUtil: SparkParsePartitionUtil
/**
* Gets the [[HoodieSparkPartitionedFileUtils]].
*/
def getSparkPartitionedFileUtils: HoodieSparkPartitionedFileUtils
/**
* Gets the [[DateFormatter]] for the given time zone.
*/
def getDateFormatter(tz: TimeZone): DateFormatter
/**
* Combines [[PartitionedFile]]s into [[FilePartition]]s according to `maxSplitBytes`.
*/
def getFilePartitions(sparkSession: SparkSession, partitionedFiles: Seq[PartitionedFile],
maxSplitBytes: Long): Seq[FilePartition]
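// Illustrative packing: with maxSplitBytes = 128 MB, four ~64 MB [[PartitionedFile]]s are typically
// grouped into two [[FilePartition]]s (the exact packing also depends on the configured file open cost).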
/**
* Checks whether the [[LogicalPlan]] refers to a Hudi table and, if so, extracts the
* corresponding [[CatalogTable]]
*/
def resolveHoodieTable(plan: LogicalPlan): Option[CatalogTable] = {
EliminateSubqueryAliases(plan) match {
// First, we need to weed out unresolved plans
case plan if !plan.resolved => None
// NOTE: When resolving a Hudi table we allow [[Filter]]s and [[Project]]s to be applied
// on top of it
case PhysicalOperation(_, _, LogicalRelation(_, _, Some(table), _)) if isHoodieTable(table) => Some(table)
case _ => None
}
}
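// A minimal usage sketch (illustrative; `plan` is an analyzed [[LogicalPlan]]):
//   val hudiTable: Option[CatalogTable] = resolveHoodieTable(plan)
// A Project/Filter stacked on top of a Hudi [[LogicalRelation]] still resolves to Some(table);
// unresolved plans and non-Hudi relations yield None.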
def isHoodieTable(map: java.util.Map[String, String]): Boolean = {
isHoodieTable(map.getOrDefault("provider", ""))
}
def isHoodieTable(table: CatalogTable): Boolean = {
isHoodieTable(table.provider.map(_.toLowerCase(Locale.ROOT)).orNull)
}
def isHoodieTable(tableId: TableIdentifier, spark: SparkSession): Boolean = {
val table = spark.sessionState.catalog.getTableMetadata(tableId)
isHoodieTable(table)
}
def isHoodieTable(provider: String): Boolean = {
"hudi".equalsIgnoreCase(provider)
}
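// Illustrative provider checks (the comparison is case-insensitive):
//   isHoodieTable("hudi")    // true
//   isHoodieTable("HUDI")    // true
//   isHoodieTable("parquet") // false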
/**
* Creates an instance of the legacy Hudi [[ParquetFileFormat]]
*/
def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat]
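/**
* Creates a [[ColumnarBatch]] from the given [[ColumnVector]]s and row count
*/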
def makeColumnarBatch(vectors: Array[ColumnVector], numRows: Int): ColumnarBatch
/**
* Creates an instance of [[InterpretedPredicate]]
*
* TODO move to HoodieCatalystExpressionUtils
*/
def createInterpretedPredicate(e: Expression): InterpretedPredicate
/**
* Creates a Hudi relation based on `globPaths`; if `globPaths` is empty, the table path is used instead
*/
def createRelation(sqlContext: SQLContext,
metaClient: HoodieTableMetaClient,
schema: Schema,
globPaths: Array[StoragePath],
parameters: java.util.Map[String, String]): BaseRelation
/**
* Creates an instance of [[HoodieFileScanRDD]].
* SPARK-37273: the [[FileScanRDD]] constructor changed in Spark 3.3
*/
def createHoodieFileScanRDD(sparkSession: SparkSession,
readFunction: PartitionedFile => Iterator[InternalRow],
filePartitions: Seq[FilePartition],
readDataSchema: StructType,
metadataColumns: Seq[AttributeReference] = Seq.empty): FileScanRDD
/**
* Extracts the condition from [[DeleteFromTable]].
* SPARK-38626: the condition is no longer an Option in Spark 3.3
*/
def extractDeleteCondition(deleteFromTable: Command): Expression
/**
* Converts an instance of [[StorageLevel]] to its corresponding string representation
*/
def convertStorageLevelToString(level: StorageLevel): String
/**
* Tries to translate a Catalyst [[Expression]] into a data source [[Filter]]
*/
def translateFilter(predicate: Expression, supportNestedPredicatePushdown: Boolean = false): Option[Filter]
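// Illustrative use (hypothetical Catalyst expression over the scanned relation):
//   val pushed: Option[Filter] = translateFilter(EqualTo(col, Literal(42)))
// None is returned when the predicate cannot be expressed as a data source [[Filter]].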
/**
* Gets a Parquet file reader
*
* @param vectorized true if vectorized reading is not prohibited due to schema, reading mode, etc.
* @param sqlConf    the [[SQLConf]] used for the read
* @param options    options passed to the file format
* @param hadoopConf the Hadoop configuration; read-specific configs are set on it
* @return the Parquet file reader
*/
def createParquetFileReader(vectorized: Boolean,
sqlConf: SQLConf,
options: Map[String, String],
hadoopConf: Configuration): SparkParquetReader
/**
* Runs `body` for the given [[QueryExecution]] under a new SQL execution id
*/
def sqlExecutionWithNewExecutionId[T](sparkSession: SparkSession,
queryExecution: QueryExecution,
name: Option[String] = None)(body: => T): T
}
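// A minimal consumer sketch (hypothetical `loadAdapter` factory; Hudi provides concrete
// per-Spark-version implementations of this trait elsewhere):
//   val adapter: SparkAdapter = loadAdapter(spark.version)
//   val serde   = adapter.createSparkRowSerDe(schema)
//   val reader  = adapter.createParquetFileReader(
//     vectorized = true,
//     sqlConf    = spark.sessionState.conf,
//     options    = Map.empty,
//     hadoopConf = spark.sessionState.newHadoopConf())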