/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.HoodieBaseRelation.{BaseFileReader, convertToAvroSchema, projectReader}
import org.apache.hudi.HoodieBootstrapRelation.{createPartitionedFile, validate}
import org.apache.hudi.common.model.FileSlice
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.hudi.storage.StoragePath

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionedFile}
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.isMetaField
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

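/**
 * A file split for bootstrapped tables, pairing the original (external) bootstrap data file with the
 * optional Hudi-written skeleton file holding the meta columns for the same records.
 */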
trait BaseHoodieBootstrapSplit extends HoodieFileSplit {
  val dataFile: PartitionedFile
  val skeletonFile: Option[PartitionedFile]
}

case class HoodieBootstrapSplit(dataFile: PartitionedFile,
                                skeletonFile: Option[PartitionedFile]) extends BaseHoodieBootstrapSplit

case class HoodieBootstrapRelation(override val sqlContext: SQLContext,
                                   private val userSchema: Option[StructType],
                                   private val globPaths: Seq[StoragePath],
                                   override val metaClient: HoodieTableMetaClient,
                                   override val optParams: Map[String, String],
                                   private val prunedDataSchema: Option[StructType] = None)
  extends BaseHoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, optParams, prunedDataSchema) {

  override type Relation = HoodieBootstrapRelation

  override protected def createFileSplit(fileSlice: FileSlice, dataFile: PartitionedFile, skeletonFile: Option[PartitionedFile]): FileSplit = {
    HoodieBootstrapSplit(dataFile, skeletonFile)
  }

  override def updatePrunedDataSchema(prunedSchema: StructType): Relation =
    this.copy(prunedDataSchema = Some(prunedSchema))

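  /**
   * Exposes this relation as a plain [[HadoopFsRelation]] backed by the same file index,
   * partition/data schemas and file format.
   */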
  def toHadoopFsRelation: HadoopFsRelation = {
    HadoopFsRelation(
      location = fileIndex,
      partitionSchema = fileIndex.partitionSchema,
      dataSchema = fileIndex.dataSchema,
      bucketSpec = None,
      fileFormat = fileFormat,
      optParams)(sparkSession)
  }
}
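
// Usage sketch (illustrative only; the path and the "driver" column below are hypothetical): a
// bootstrapped Hudi table is read through the regular datasource API, which may resolve to this
// relation for snapshot queries of metadata-bootstrapped tables:
//
//   val df = spark.read.format("hudi").load("/tmp/bootstrapped_table")
//   // Selecting both meta and data columns exercises the skeleton/data-file merge described below
//   df.select("_hoodie_record_key", "driver").show()
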
/**
 * This is a Spark relation that can be used for querying metadata-bootstrapped and fully-bootstrapped Hudi tables,
 * as well as non-bootstrapped tables. It implements the PrunedFilteredScan interface in order to support column
 * pruning and filter push-down. For metadata-bootstrapped files, if columns from both the metadata and the actual
 * data are queried, it will merge both to return the result.
 *
 * Caveat: Filter push-down does not work when querying both metadata and actual data columns over
 * metadata-bootstrapped files, because the metadata file and the data file can then return different numbers of
 * rows, causing errors while merging.
 *
 * @param sqlContext Spark SQL Context
 * @param userSchema User-specified schema in the datasource query
 * @param globPaths  The glob paths to query. If non-empty, read from the globPaths,
 *                   else read data from the table path using HoodieFileIndex.
 * @param metaClient Hoodie table meta client
 * @param optParams  DataSource options passed by the user
 */
abstract class BaseHoodieBootstrapRelation(override val sqlContext: SQLContext,
                                           private val userSchema: Option[StructType],
                                           private val globPaths: Seq[StoragePath],
                                           override val metaClient: HoodieTableMetaClient,
                                           override val optParams: Map[String, String],
                                           private val prunedDataSchema: Option[StructType] = None)
  extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema, prunedDataSchema) {

  override type FileSplit = BaseHoodieBootstrapSplit

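  // Schema of the Hudi meta columns (_hoodie_commit_time, _hoodie_record_key, etc.) that are
  // stored in the skeleton files of a metadata-bootstrapped table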
  private lazy val skeletonSchema = HoodieSparkUtils.getMetaSchema

  override lazy val mandatoryFields: Seq[String] = Seq.empty

  protected def getFileSlices(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[FileSlice] = {
    listLatestFileSlices(globPaths, partitionFilters, dataFilters)
  }

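  /**
   * Builds the concrete file split for a file slice from its resolved data file and optional skeleton file.
   */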
  protected def createFileSplit(fileSlice: FileSlice, dataFile: PartitionedFile, skeletonFile: Option[PartitionedFile]): FileSplit

  protected override def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[FileSplit] = {
    val fileSlices = getFileSlices(partitionFilters, dataFilters)
    val isPartitioned = metaClient.getTableConfig.isTablePartitioned
    fileSlices.map { fileSlice =>
      val baseFile = fileSlice.getBaseFile.get()
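      // A metadata-bootstrapped slice references both the original (external) data file and a
      // Hudi-written skeleton file carrying the meta columns; both are captured in the split so
      // their rows can be stitched back together at read time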
      if (baseFile.getBootstrapBaseFile.isPresent) {
        val partitionValues = getPartitionColumnsAsInternalRowInternal(baseFile.getPathInfo,
          metaClient.getBasePath, extractPartitionValuesFromPartitionPath = isPartitioned)
        val dataFile = createPartitionedFile(
          partitionValues, baseFile.getBootstrapBaseFile.get.getPathInfo.getPath,
          0, baseFile.getBootstrapBaseFile.get().getFileLen)
        val skeletonFile = Option(createPartitionedFile(
          InternalRow.empty, baseFile.getStoragePath, 0, baseFile.getFileLen))

        createFileSplit(fileSlice, dataFile, skeletonFile)
      } else {
        val dataFile = createPartitionedFile(
          getPartitionColumnsAsInternalRow(baseFile.getPathInfo), baseFile.getStoragePath, 0, baseFile.getFileLen)
        createFileSplit(fileSlice, dataFile, Option.empty)
      }
    }
  }

  /**
   * Gets all the file readers required for [[composeRDD]].
   */
  protected def getFileReaders(tableSchema: HoodieTableSchema,
                               requiredSchema: HoodieTableSchema,
                               requestedColumns: Array[String],
                               filters: Array[Filter]): (BaseFileReader, BaseFileReader, BaseFileReader) = {
    val requiredSkeletonFileSchema =
      StructType(skeletonSchema.filter(f => requestedColumns.exists(col => resolver(f.name, col))))

    val (bootstrapDataFileReader, bootstrapSkeletonFileReader) =
      createBootstrapFileReaders(tableSchema, requiredSchema, requiredSkeletonFileSchema, filters)

    val regularFileReader = createRegularFileReader(tableSchema, requiredSchema, filters)
    (bootstrapDataFileReader, bootstrapSkeletonFileReader, regularFileReader)
  }

  protected override def composeRDD(fileSplits: Seq[FileSplit],
                                    tableSchema: HoodieTableSchema,
                                    requiredSchema: HoodieTableSchema,
                                    requestedColumns: Array[String],
                                    filters: Array[Filter]): RDD[InternalRow] = {

    val (bootstrapDataFileReader, bootstrapSkeletonFileReader, regularFileReader) = getFileReaders(tableSchema,
      requiredSchema, requestedColumns, filters)
    new HoodieBootstrapRDD(sqlContext.sparkSession, bootstrapDataFileReader, bootstrapSkeletonFileReader, regularFileReader,
      requiredSchema, fileSplits)
  }

  /**
   * Creates the bootstrap data-file and skeleton-file readers.
   */
  private def createBootstrapFileReaders(tableSchema: HoodieTableSchema,
                                         requiredSchema: HoodieTableSchema,
                                         requiredSkeletonFileSchema: StructType,
                                         filters: Array[Filter]): (BaseFileReader, BaseFileReader) = {
    // NOTE: "Data" schema here refers to the whole table's schema excluding only the partition columns,
    //       as opposed to the data-file schema which, in the case of a Bootstrap relation, also excludes
    //       the meta-field columns
    val (partitionSchema, dataSchema, requiredDataSchema) =
      tryPrunePartitionColumnsInternal(tableSchema, requiredSchema, extractPartitionValuesFromPartitionPath = true)

    val bootstrapDataFileSchema = StructType(dataSchema.structTypeSchema.filterNot(sf => isMetaField(sf.name)))
    val requiredBootstrapDataFileSchema = StructType(requiredDataSchema.structTypeSchema.filterNot(sf => isMetaField(sf.name)))

    validate(requiredDataSchema, requiredBootstrapDataFileSchema, requiredSkeletonFileSchema)

    val bootstrapDataFileReader = createBaseFileReader(
      spark = sqlContext.sparkSession,
      dataSchema = new HoodieTableSchema(bootstrapDataFileSchema, convertToAvroSchema(bootstrapDataFileSchema, tableName).toString),
      partitionSchema = partitionSchema,
      requiredDataSchema = new HoodieTableSchema(requiredBootstrapDataFileSchema, convertToAvroSchema(requiredBootstrapDataFileSchema, tableName).toString),
      // NOTE: For bootstrapped files we can't apply any filtering in case we'd need to merge it with
      //       a skeleton-file as we rely on matching ordering of the records across bootstrap- and skeleton-files
      filters = if (requiredSkeletonFileSchema.isEmpty) filters else Seq(),
      options = optParams,
      hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf(),
      // NOTE: The Bootstrap relation always has to extract partition values from the partition-path, as this is
      //       the default Spark behavior: Spark by default strips partition-columns from the data schema and does
      //       NOT persist them in the data files, instead parsing them from partition-paths (on the fly) whenever
      //       the table is queried
      shouldAppendPartitionValuesOverride = Some(true)
    )

    val bootstrapSkeletonFileReader = createBaseFileReader(
      spark = sqlContext.sparkSession,
      dataSchema = new HoodieTableSchema(skeletonSchema, convertToAvroSchema(skeletonSchema, tableName).toString),
      // NOTE: Here we specify partition-schema as empty since we don't need Spark to inject partition-values
      //       parsed from the partition-path
      partitionSchema = StructType(Seq.empty),
      requiredDataSchema = new HoodieTableSchema(requiredSkeletonFileSchema, convertToAvroSchema(requiredSkeletonFileSchema, tableName).toString),
      // NOTE: For bootstrapped files we can't apply any filtering in case we'd need to merge it with
      //       a skeleton-file as we rely on matching ordering of the records across bootstrap- and skeleton-files
      filters = if (requiredBootstrapDataFileSchema.isEmpty) filters else Seq(),
      options = optParams,
      hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf(),
      // NOTE: We override Spark to avoid injecting partition values into the records read from
      //       skeleton-file
      shouldAppendPartitionValuesOverride = Some(false)
    )

    (bootstrapDataFileReader, bootstrapSkeletonFileReader)
  }

  /**
   * Creates a reader for Hudi-written base files.
   */
  private def createRegularFileReader(tableSchema: HoodieTableSchema,
                                      requiredSchema: HoodieTableSchema,
                                      filters: Array[Filter]): BaseFileReader = {
    // NOTE: "Data" schema here refers to the whole table's schema excluding only the partition columns,
    //       as opposed to the data-file schema which, in the case of a Bootstrap relation, also excludes
    //       the meta-field columns
    val (partitionSchema, dataSchema, requiredDataSchema) =
      tryPrunePartitionColumns(tableSchema, requiredSchema)

    // NOTE: Bootstrapped table allows Hudi created file-slices to be co-located w/ the "bootstrapped"
    //       ones (ie persisted by Spark). Therefore to be able to read the data from Bootstrapped
    //       table we also need to create regular file-reader to read file-slices created by Hudi
    val regularFileReader = createBaseFileReader(
      spark = sqlContext.sparkSession,
      dataSchema = dataSchema,
      partitionSchema = partitionSchema,
      requiredDataSchema = requiredDataSchema,
      filters = filters,
      options = optParams,
      hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
    )

    // NOTE: In some cases the schema of the reader's output (reader's schema) might not match the schema expected by
    //       the caller. This could occur, for example, when the requested schema contains partition columns which
    //       might not be persisted within the data file, but instead are parsed from the partition path. In that case
    //       the output of the file-reader will have a different ordering of the fields than the original required
    //       schema (for more details please check out the [[ParquetFileFormat]] impl), and we have to project the
    //       rows from the file-reader's schema back into the one expected by the caller
    projectReader(regularFileReader, requiredSchema.structTypeSchema)
  }
}

object HoodieBootstrapRelation extends SparkAdapterSupport {
  def validate(requiredDataSchema: HoodieTableSchema, requiredDataFileSchema: StructType, requiredSkeletonFileSchema: StructType): Unit = {
    val requiredDataColumns: Seq[String] = requiredDataSchema.structTypeSchema.fieldNames.toSeq
    val combinedColumns = (requiredSkeletonFileSchema.fieldNames ++ requiredDataFileSchema.fieldNames).toSeq

    // NOTE: Here we validate that all required data columns are covered by the combination of the columns
    //       from both skeleton file and the corresponding data file
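    //       (e.g., illustratively: required data columns ("id", "name", "_hoodie_record_key") must equal,
    //       up to ordering, the skeleton columns ("_hoodie_record_key") combined with the data-file
    //       columns ("id", "name"); "id" and "name" are hypothetical column names)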
    checkState(combinedColumns.sorted == requiredDataColumns.sorted)
  }

  def createPartitionedFile(partitionValues: InternalRow,
                            filePath: StoragePath,
                            start: Long,
                            length: Long): PartitionedFile = {
    sparkAdapter.getSparkPartitionedFileUtils.createPartitionedFile(
      partitionValues, filePath, start, length)
  }
}
