
org.apache.hudi.BaseFileOnlyRelation.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.DataSourceReadOptions.ENABLE_HOODIE_FILE_INDEX
import org.apache.hudi.HoodieBaseRelation.projectReader
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.hadoop.HoodieROTablePathFilter
import org.apache.hudi.storage.{StoragePath, StoragePathInfo}

import org.apache.hadoop.conf.Configuration
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources.{BaseRelation, Filter}
import org.apache.spark.sql.types.StructType

/**
 * [[BaseRelation]] implementation reading only the Base files of Hudi tables, essentially supporting the following
 * querying modes:
 * 
 * <ul>
 *   <li>For COW tables: Snapshot</li>
 *   <li>For MOR tables: Read-optimized</li>
 * </ul>
 *
 * NOTE: The reason this Relation is used in lieu of Spark's default [[HadoopFsRelation]] is primarily due to the
 * fact that it injects the real partition's path as the value of the partition field, which Hudi ultimately persists
 * as part of the record payload. In some cases, however, the partition path might not necessarily be equal to the
 * verbatim value of the partition path field (when a custom [[KeyGenerator]] is used), therefore leading to incorrect
 * partition field values being written
 */
case class BaseFileOnlyRelation(override val sqlContext: SQLContext,
                                override val metaClient: HoodieTableMetaClient,
                                override val optParams: Map[String, String],
                                private val userSchema: Option[StructType],
                                private val globPaths: Seq[StoragePath],
                                private val prunedDataSchema: Option[StructType] = None)
  extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema, prunedDataSchema)
    with SparkAdapterSupport {

  case class HoodieBaseFileSplit(filePartition: FilePartition) extends HoodieFileSplit

  override type FileSplit = HoodieBaseFileSplit
  override type Relation = BaseFileOnlyRelation

  // TODO(HUDI-3204) this is to override behavior (exclusively) for COW tables to always extract
  //      partition values from the partition path.
  //      For more details please check HUDI-4161
  // NOTE: This override has to mirror the semantic of whenever this Relation is converted into [[HadoopFsRelation]],
  //       which is currently done for all cases, except when Schema Evolution is enabled
  override protected val shouldExtractPartitionValuesFromPartitionPath: Boolean =
    internalSchemaOpt.isEmpty

  override lazy val mandatoryFields: Seq[String] = Seq.empty

  // Before Spark 3.4.0: PartitioningAwareFileIndex.BASE_PATH_PARAM
  // Since Spark 3.4.0: FileIndexOptions.BASE_PATH_PARAM
  val BASE_PATH_PARAM = "basePath"

  override def updatePrunedDataSchema(prunedSchema: StructType): Relation =
    this.copy(prunedDataSchema = Some(prunedSchema))

  protected override def composeRDD(fileSplits: Seq[HoodieBaseFileSplit],
                                    tableSchema: HoodieTableSchema,
                                    requiredSchema: HoodieTableSchema,
                                    requestedColumns: Array[String],
                                    filters: Array[Filter]): RDD[InternalRow] = {
    val (partitionSchema, dataSchema, requiredDataSchema) =
      tryPrunePartitionColumns(tableSchema, requiredSchema)

    val baseFileReader = createBaseFileReader(
      spark = sparkSession,
      partitionSchema = partitionSchema,
      dataSchema = dataSchema,
      requiredDataSchema = requiredDataSchema,
      filters = filters,
      options = optParams,
      // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
      //       to configure the Parquet reader appropriately
      hadoopConf = embedInternalSchema(new Configuration(conf), requiredSchema.internalSchema)
    )

    // NOTE: In some cases the schema of the reader's output (reader's schema) might not match the schema expected
    //       by the caller. This could occur, for example, when the requested schema contains partition columns which
    //       might not be persisted w/in the data file, but instead would be parsed from the partition path. In that
    //       case the output of the file-reader will have a different ordering of the fields than the original required
    //       schema (for more details please check out the [[ParquetFileFormat]] impl). In that case we have to project
    //       the rows from the file-reader's schema back into the one expected by the caller
    val projectedReader = projectReader(baseFileReader, requiredSchema.structTypeSchema)

    // SPARK-37273 FileScanRDD constructor changed in Spark 3.3
    sparkAdapter.createHoodieFileScanRDD(sparkSession, projectedReader.apply, fileSplits.map(_.filePartition), requiredSchema.structTypeSchema)
      .asInstanceOf[HoodieUnsafeRDD]
  }

  protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[HoodieBaseFileSplit] = {
    val fileSlices = listLatestFileSlices(globPaths, partitionFilters, dataFilters)
    val fileSplits = fileSlices.flatMap { fileSlice =>
      // TODO fix, currently assuming parquet as the underlying format
      val pathInfo: StoragePathInfo = fileSlice.getBaseFile.get.getPathInfo
      HoodieDataSourceHelper.splitFiles(
        sparkSession = sparkSession,
        file = pathInfo,
        partitionValues = getPartitionColumnsAsInternalRow(pathInfo)
      )
    }
      // NOTE: It's important to order the splits in the reverse order of their
      //       size so that we can subsequently bucket them in an efficient manner
      .sortBy(_.length)(implicitly[Ordering[Long]].reverse)

    val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes

    sparkAdapter.getFilePartitions(sparkSession, fileSplits, maxSplitBytes)
      .map(HoodieBaseFileSplit.apply)
  }

  /**
   * NOTE: We have to fall back to [[HadoopFsRelation]] to make sure that all of the Spark optimizations can be
   *       equally applied to Hudi tables, since some of those are predicated on the usage of [[HadoopFsRelation]]
   *       and won't be applicable when we use our own custom relations (one such optimization is the [[SchemaPruning]]
   *       rule; you can find more details in HUDI-3896)
   */
  def toHadoopFsRelation: HadoopFsRelation = {
    val enableFileIndex = HoodieSparkConfUtils.getConfigValue(optParams, sparkSession.sessionState.conf,
      ENABLE_HOODIE_FILE_INDEX.key, ENABLE_HOODIE_FILE_INDEX.defaultValue.toString).toBoolean

    if (enableFileIndex && globPaths.isEmpty) {
      // NOTE: There are currently 2 ways partition values could be fetched:
      //         - Source columns (producing the values used for physical partitioning) will be read
      //           from the data file
      //         - Values parsed from the actual partition path would be appended to the final dataset
      //
      //       In the former case, we don't need to provide the partition-schema to the relation,
      //       therefore we simply stub it w/ an empty schema and use the full table-schema as the one being
      //       read from the data file.
      //
      //       In the latter, we have to specify the proper partition schema as well as the "data"-schema, essentially
      //       being the table-schema with all partition columns stripped out
      val (partitionSchema, dataSchema) = if (shouldExtractPartitionValuesFromPartitionPath) {
        (fileIndex.partitionSchema, fileIndex.dataSchema)
      } else {
        (StructType(Nil), tableStructSchema)
      }

      HadoopFsRelation(
        location = fileIndex,
        partitionSchema = partitionSchema,
        dataSchema = dataSchema,
        bucketSpec = None,
        fileFormat = fileFormat,
        optParams)(sparkSession)
    } else {
      val readPathsStr = optParams.get(DataSourceReadOptions.READ_PATHS.key)
      val extraReadPaths = readPathsStr.map(p => p.split(",").toSeq).getOrElse(Seq())

      // NOTE: Spark is able to infer partitioning values from the partition path only when the Hive-style partitioning
      //       scheme is used. Therefore, we fall back to reading the table as non-partitioned (specifying
      //       partitionColumns = Seq.empty) whenever Hive-style partitioning is not involved
      val partitionColumns: Seq[String] = if (tableConfig.getHiveStylePartitioningEnable.toBoolean) {
        this.partitionColumns
      } else {
        Seq.empty
      }

      DataSource.apply(
        sparkSession = sparkSession,
        paths = extraReadPaths,
        // Here we should specify the schema of the latest commit, since the table schema might have evolved
        userSpecifiedSchema = userSchema.orElse(Some(tableStructSchema)),
        className = fileFormatClassName,
        options = optParams ++ Map(
          // Since we're reading the table as just a collection of files we have to make sure
          // we only read the latest version of every Hudi file-group, which might have been compacted, clustered, etc.,
          // while previous versions of the files are still kept around.
          //
          // We rely on [[HoodieROTablePathFilter]] to do this filtering
          "mapreduce.input.pathFilter.class" -> classOf[HoodieROTablePathFilter].getName,

          // We have to override the [[EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH]] setting, since
          // the relation might have this setting overridden
          DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key -> shouldExtractPartitionValuesFromPartitionPath.toString,

          // NOTE: We have to specify the table's base-path explicitly, since we're requesting Spark to read it as a
          //       list of globbed paths, which complicates partitioning discovery for Spark.
          //       Please check the [[PartitioningAwareFileIndex#basePaths]] comment for more details.
          BASE_PATH_PARAM -> metaClient.getBasePath.toString
        ),
        partitionColumns = partitionColumns
      )
        .resolveRelation()
        .asInstanceOf[HadoopFsRelation]
    }
  }
}
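
Usage note (not part of the upstream source): this relation backs base-file-only reads, i.e. snapshot queries on COW tables and read-optimized queries on MOR tables issued through the regular Spark DataSource API. The sketch below is a minimal illustration assuming a hypothetical table path, the hudi-spark bundle on the classpath, and standard Hudi read options; whether the query is ultimately served by this relation or by the converted [[HadoopFsRelation]] depends on the settings discussed above (e.g. ENABLE_HOODIE_FILE_INDEX and schema evolution).

import org.apache.spark.sql.SparkSession

object BaseFileOnlyReadExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("hudi-base-file-only-read")
      // Kryo is the serializer Hudi's docs typically recommend; adjust to your Spark/Hudi versions
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .master("local[*]")
      .getOrCreate()

    // Hypothetical base path of an existing Hudi table -- replace with a real location
    val basePath = "file:///tmp/hudi_trips_table"

    // For a MOR table, a read-optimized query reads only the base files (the mode this relation serves);
    // on a COW table, dropping this option yields the default snapshot query, which reads base files anyway
    val df = spark.read
      .format("hudi")
      .option("hoodie.datasource.query.type", "read_optimized")
      .load(basePath)

    df.printSchema()
    df.show(10, truncate = false)

    spark.stop()
  }
}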



