All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.HoodieMergeOnReadRDD.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.HoodieBaseRelation.{BaseFileReader, projectReader}
import org.apache.hudi.HoodieMergeOnReadRDD.CONFIG_INSTANTIATION_LOCK
import org.apache.hudi.MergeOnReadSnapshotRelation.isProjectionCompatible
import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.util.StringUtils
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext}

import java.io.Closeable
import java.util.function.Predicate

/**
 * Spark [[Partition]] wrapping a single Merge-on-Read file split.
 *
 * @param index position of this partition within the RDD
 * @param split file split this partition reads
 */
case class HoodieMergeOnReadPartition(
    index: Int,
    split: HoodieMergeOnReadFileSplit
) extends Partition

/**
 * Class holding base-file readers for 3 different use-cases:
 *
 * <ol>
 *   <li>Full-schema reader: is used when whole row has to be read to perform merging correctly.
 *   This could occur, when no optimizations could be applied and we have to fallback to read the whole row from
 *   the base file and the corresponding delta-log file to merge them correctly</li>
 *
 *   <li>Required-schema reader: is used when it's fine to only read row's projected columns.
 *   This could occur, when row could be merged with corresponding delta-log record while leveraging only
 *   projected columns</li>
 *
 *   <li>Required-schema reader (skip-merging): is used when no merging will be performed (skip-merged).
 *   This could occur, when file-group has no delta-log files</li>
 * </ol>
 */
private[hudi] case class HoodieMergeOnReadBaseFileReaders(fullSchemaReader: BaseFileReader,
                                                          requiredSchemaReader: BaseFileReader,
                                                          requiredSchemaReaderSkipMerging: BaseFileReader)

/**
 * RDD enabling Hudi's Merge-on-Read (MOR) semantic
 *
 * @param sc               spark's context
 * @param config           hadoop configuration
 * @param fileReaders      suite of base file readers
 * @param tableSchema      table's full schema
 * @param requiredSchema   expected (potentially) projected schema
 * @param tableState       table's state
 * @param mergeType        type of merge performed
 * @param fileSplits       target file-splits this RDD will be iterating over
 * @param includeStartTime whether to include the commit with the commitTime
 * @param startTimestamp   start timestamp to filter records
 * @param endTimestamp     end timestamp to filter records
 */
class HoodieMergeOnReadRDD(@transient sc: SparkContext,
                           @transient config: Configuration,
                           fileReaders: HoodieMergeOnReadBaseFileReaders,
                           tableSchema: HoodieTableSchema,
                           requiredSchema: HoodieTableSchema,
                           tableState: HoodieTableState,
                           mergeType: String,
                           @transient fileSplits: Seq[HoodieMergeOnReadFileSplit],
                           includeStartTime: Boolean = false,
                           startTimestamp: String = null,
                           endTimestamp: String = null)
  extends RDD[InternalRow](sc, Nil) with HoodieUnsafeRDD {

  // Derived from the hadoop configuration when the RDD is constructed
  protected val maxCompactionMemoryInBytes: Long = getMaxCompactionMemoryInBytes(new JobConf(config))

  // `config` is @transient, so the configuration is made available to `compute` through a broadcast
  private val hadoopConfBroadcast = sc.broadcast(new SerializableWritable(config))

  /**
   * Builds the row iterator for a single [[HoodieMergeOnReadPartition]].
   *
   * The iterator is selected from the shape of the file split (base file only, log files only, or both) and,
   * when both are present, from `mergeType`. Rows are additionally filtered by commit time when the required
   * schema carries the commit-time meta field and both `startTimestamp` and `endTimestamp` are set.
   *
   * @param split   partition to read; has to be a [[HoodieMergeOnReadPartition]]
   * @param context task context provided by Spark
   * @return iterator over the rows of the partition
   * @throws HoodieException               if the file split has neither a base file nor log files
   * @throws UnsupportedOperationException if `mergeType` is not one of the supported merge types
   */
  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
    val partition = split.asInstanceOf[HoodieMergeOnReadPartition]

    val iter = partition.split match {
      // NOTE: This guard has to come first. Previously such a split was caught by the base-file-only case
      //       and failed with an opaque `None.get`, while the diagnostic meant for it sat in a trailing
      //       `case _` that could never be reached (it followed an unguarded, catch-all case)
      case emptySplit if emptySplit.dataFile.isEmpty && emptySplit.logFiles.isEmpty =>
        throw new HoodieException("Unable to select an Iterator to read the Hoodie MOR File Split: " +
          "it has neither a base file nor log files; " +
          s"hoodie table path: ${tableState.tablePath}, " +
          s"spark partition index: ${partition.index}, " +
          s"merge type: $mergeType")

      // No delta-log files: nothing to merge, only the base file is read
      case dataFileOnlySplit if dataFileOnlySplit.logFiles.isEmpty =>
        val projectedReader = projectReader(fileReaders.requiredSchemaReaderSkipMerging, requiredSchema.structTypeSchema)
        projectedReader(dataFileOnlySplit.dataFile.get)

      // No base file: only the delta-log files are read
      case logFileOnlySplit if logFileOnlySplit.dataFile.isEmpty =>
        new LogFileIterator(logFileOnlySplit, tableSchema, requiredSchema, tableState, getHadoopConf)

      // Both base file and delta-log files are present: merge type decides how they are combined
      case baseAndLogSplit => mergeType match {
        case DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL =>
          val reader = fileReaders.requiredSchemaReaderSkipMerging
          new SkipMergeIterator(baseAndLogSplit, reader, tableSchema, requiredSchema, tableState, getHadoopConf)

        case DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL =>
          val reader = pickBaseFileReader()
          new RecordMergingFileIterator(baseAndLogSplit, reader, tableSchema, requiredSchema, tableState, getHadoopConf)

        case _ => throw new UnsupportedOperationException(s"Not supported merge type ($mergeType)")
      }
    }

    iter match {
      case closeable: Closeable =>
        // register a callback to close logScanner which will be executed on task completion.
        // when tasks finished, this method will be called, and release resources.
        Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => closeable.close()))
      case _ => // nothing to release
    }

    val commitTimeMetadataFieldIdx =
      requiredSchema.structTypeSchema.fieldNames.indexOf(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
    val needsFiltering = commitTimeMetadataFieldIdx >= 0 &&
      !StringUtils.isNullOrEmpty(startTimestamp) &&
      !StringUtils.isNullOrEmpty(endTimestamp)

    if (needsFiltering) {
      val filterT: Predicate[InternalRow] = getCommitTimeFilter(includeStartTime, commitTimeMetadataFieldIdx)
      iter.filter(filterT.test)
    } else {
      iter
    }
  }

  /**
   * Creates the predicate keeping only rows whose commit time falls into the configured time range.
   *
   * The end of the range is always inclusive; the start is inclusive only if `includeStartTime` is set.
   * Commit times are compared as strings.
   *
   * @param includeStartTime           whether a commit time equal to `startTimestamp` is accepted
   * @param commitTimeMetadataFieldIdx ordinal of the commit-time meta field within the row
   */
  private def getCommitTimeFilter(includeStartTime: Boolean, commitTimeMetadataFieldIdx: Int): Predicate[InternalRow] = {
    if (includeStartTime) {
      new Predicate[InternalRow] {
        override def test(row: InternalRow): Boolean = {
          val commitTime = row.getString(commitTimeMetadataFieldIdx)
          commitTime >= startTimestamp && commitTime <= endTimestamp
        }
      }
    } else {
      new Predicate[InternalRow] {
        override def test(row: InternalRow): Boolean = {
          val commitTime = row.getString(commitTimeMetadataFieldIdx)
          commitTime > startTimestamp && commitTime <= endTimestamp
        }
      }
    }
  }

  /**
   * Selects the base-file reader used when base-file records are merged with delta-log records.
   */
  private def pickBaseFileReader(): BaseFileReader = {
    // NOTE: This is an optimization making sure that even for MOR tables we fetch absolute minimum
    //       of the stored data possible, while still properly executing corresponding relation's semantic
    //       and meet the query's requirements.
    //
    //       Here we assume that iff queried table does use one of the standard (and whitelisted)
    //       Record Payload classes then we can avoid reading and parsing the records w/ _full_ schema,
    //       and instead only rely on projected one, nevertheless being able to perform merging correctly
    if (isProjectionCompatible(tableState)) {
      fileReaders.requiredSchemaReader
    } else {
      fileReaders.fullSchemaReader
    }
  }

  /**
   * Creates one partition per file split, indexed by the split's position in `fileSplits`.
   */
  override protected def getPartitions: Array[Partition] =
    fileSplits.zipWithIndex.map { case (fileSplit, idx) => HoodieMergeOnReadPartition(idx, fileSplit) }.toArray

  /**
   * Returns a fresh copy of the broadcast hadoop configuration.
   */
  private def getHadoopConf: Configuration = {
    val conf = hadoopConfBroadcast.value.value
    // TODO clean up, this lock is unnecessary
    CONFIG_INSTANTIATION_LOCK.synchronized {
      new Configuration(conf)
    }
  }
}

object HoodieMergeOnReadRDD {
  // Monitor guarding instantiation of hadoop configurations (see `getHadoopConf`)
  val CONFIG_INSTANTIATION_LOCK = new Object()
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy