/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.HoodieConversionUtils.toScalaOption
import org.apache.hudi.MergeOnReadSnapshotRelation.{createPartitionedFile, isProjectionCompatible}
import org.apache.hudi.common.model.{FileSlice, HoodieLogFile, OverwriteWithLatestAvroPayload}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.storage.StoragePath

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources.PartitionedFile
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

import scala.collection.JavaConverters._

case class HoodieMergeOnReadFileSplit(dataFile: Option[PartitionedFile],
                                      logFiles: List[HoodieLogFile]) extends HoodieFileSplit
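
// Illustrative sketch (an assumption, not part of the original file): a file slice
// whose latest version has a base file plus delta logs maps to
//
//   HoodieMergeOnReadFileSplit(dataFile = Some(baseParquetFile),      // hypothetical values
//                              logFiles = List(logFile1, logFile2))
//
// whereas a log-only file-group (no base file written yet) carries dataFile = None.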

case class MergeOnReadSnapshotRelation(override val sqlContext: SQLContext,
                                       override val optParams: Map[String, String],
                                       override val metaClient: HoodieTableMetaClient,
                                       private val globPaths: Seq[StoragePath],
                                       private val userSchema: Option[StructType],
                                       private val prunedDataSchema: Option[StructType] = None)
  extends BaseMergeOnReadSnapshotRelation(sqlContext, optParams, metaClient, globPaths, userSchema, prunedDataSchema) {

  override type Relation = MergeOnReadSnapshotRelation

  override def updatePrunedDataSchema(prunedSchema: StructType): Relation =
    this.copy(prunedDataSchema = Some(prunedSchema))

  override protected def shouldIncludeLogFiles(): Boolean = {
    true
  }

}
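
// Hedged usage sketch (an assumption, not part of the original file): this relation
// is what a MOR table typically resolves to for snapshot queries issued through the
// DataSource API, e.g.
//
//   val df = spark.read.format("hudi")
//     .option(DataSourceReadOptions.QUERY_TYPE.key,
//             DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
//     .load(basePath)  // `spark` and `basePath` assumed defined by the caller
//
// Each scanned file-group then becomes one HoodieMergeOnReadFileSplit (defined above).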

/**
 * Base implementation of the Merge-on-Read snapshot relation
 *
 * NOTE: This is extracted as a standalone base class so that both the MOR Snapshot
 *       and Incremental relations can inherit from it while each remaining a Scala
 *       case class
 */
abstract class BaseMergeOnReadSnapshotRelation(sqlContext: SQLContext,
                                               optParams: Map[String, String],
                                               metaClient: HoodieTableMetaClient,
                                               globPaths: Seq[StoragePath],
                                               userSchema: Option[StructType],
                                               prunedDataSchema: Option[StructType])
  extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema, prunedDataSchema) {

  override type FileSplit = HoodieMergeOnReadFileSplit

  /**
   * NOTE: These are the fields that are required to properly fulfil Merge-on-Read (MOR)
   *       semantic:
   *
   *       <ol>
   *         <li>Primary key is required to make sure we're able to correlate records from the base
   *         file with the updated records from the delta-log file</li>
   *         <li>Pre-combine key is required to properly perform the combining (or merging) of the
   *         existing and updated records</li>
   *       </ol>
   *
   * However, in cases when merging is NOT performed (for ex, if file-group only contains base
   * files but no delta-log files, or if the query-type is equal to [["skip_merge"]]) neither
   * of primary-key or pre-combine-key are required to be fetched from storage (unless requested
   * by the query), therefore saving on throughput
   */
  protected lazy val mandatoryFieldsForMerging: Seq[String] =
    Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())

  override lazy val mandatoryFields: Seq[String] = mandatoryFieldsForMerging

  protected val mergeType: String = optParams.getOrElse(DataSourceReadOptions.REALTIME_MERGE.key,
    DataSourceReadOptions.REALTIME_MERGE.defaultValue)

  /**
   * Determines whether relation's schema could be pruned by Spark's Optimizer
   */
  override def canPruneRelationSchema: Boolean =
    super.canPruneRelationSchema && isProjectionCompatible(tableState)

  protected override def composeRDD(fileSplits: Seq[HoodieMergeOnReadFileSplit],
                                    tableSchema: HoodieTableSchema,
                                    requiredSchema: HoodieTableSchema,
                                    requestedColumns: Array[String],
                                    filters: Array[Filter]): RDD[InternalRow] = {
    val requiredFilters = Seq.empty
    val optionalFilters = filters
    val readers = createBaseFileReaders(tableSchema, requiredSchema, requestedColumns, requiredFilters, optionalFilters)

    new HoodieMergeOnReadRDDV2(
      sqlContext.sparkContext,
      config = jobConf,
      fileReaders = readers,
      tableSchema = tableSchema,
      requiredSchema = requiredSchema,
      tableState = tableState,
      mergeType = mergeType,
      fileSplits = fileSplits)
  }

  protected override def collectFileSplits(partitionFilters: Seq[Expression],
                                           dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = {
    val convertedPartitionFilters =
      HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilters)

    if (globPaths.isEmpty) {
      val fileSlices = fileIndex.filterFileSlices(dataFilters, convertedPartitionFilters).flatMap(s => s._2)
      buildSplits(fileSlices)
    } else {
      val fileSlices = listLatestFileSlices(globPaths, partitionFilters, dataFilters)
      buildSplits(fileSlices)
    }
  }

  protected def buildSplits(fileSlices: Seq[FileSlice]): List[HoodieMergeOnReadFileSplit] = {
    fileSlices.map { fileSlice =>
      val baseFile = toScalaOption(fileSlice.getBaseFile)
      val logFiles = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala.toList

      val partitionedBaseFile = baseFile.map { file =>
        createPartitionedFile(
          getPartitionColumnsAsInternalRow(file.getPathInfo), file.getPathInfo.getPath, 0, file.getFileLen)
      }

      HoodieMergeOnReadFileSplit(partitionedBaseFile, logFiles)
    }.toList
  }
}
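
// Hedged usage sketch (an assumption, not part of the original file): the mandatory
// merging fields above only have to be fetched when records are actually merged.
// A caller can opt out of merging entirely via the "skip_merge" strategy, e.g.
//
//   spark.read.format("hudi")
//     .option(DataSourceReadOptions.REALTIME_MERGE.key,
//             DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
//     .load(basePath)  // `spark` and `basePath` assumed defined by the caller
//
// in which case base-file rows are returned as-is and log records are read unmerged.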

object MergeOnReadSnapshotRelation extends SparkAdapterSupport {

  /**
   * List of [[HoodieRecordPayload]] classes capable of merging projected records:
   * in some cases, when for example, user is only interested in a handful of columns rather
   * than the full row we will be able to optimize data throughput by only fetching the required
   * columns. However, to properly fulfil MOR semantic particular [[HoodieRecordPayload]] in
   * question should be able to merge records based on just such projected representation (including
   * columns required for merging, such as primary-key, pre-combine key, etc)
   */
  private val projectionCompatiblePayloadClasses: Set[String] = Seq(
    classOf[OverwriteWithLatestAvroPayload]
  ).map(_.getName).toSet

  def isProjectionCompatible(tableState: HoodieTableState): Boolean =
    projectionCompatiblePayloadClasses.contains(tableState.recordPayloadClassName)

  def createPartitionedFile(partitionValues: InternalRow,
                            filePath: StoragePath,
                            start: Long,
                            length: Long): PartitionedFile = {
    sparkAdapter.getSparkPartitionedFileUtils.createPartitionedFile(
      partitionValues, filePath, start, length)
  }
}
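
// Hedged example (an assumption, not part of the original file): schema pruning is
// only attempted when the table's payload class can merge projected rows, e.g.
//
//   MergeOnReadSnapshotRelation.isProjectionCompatible(tableState)
//
// returns true here only for OverwriteWithLatestAvroPayload, so relations backed by
// custom payload classes keep reading full rows before merging.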