/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.HoodieBaseRelation.BaseFileReader
import org.apache.hudi.common.util.ValidationUtils.checkState

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.HoodieCatalystExpressionUtils.generateUnsafeProjection
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.JoinedRow
import org.apache.spark.sql.execution.datasources.PartitionedFile
import org.apache.spark.sql.types.StructType
import org.apache.spark.{Partition, TaskContext}

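/**
 * [[RDD]] reading a bootstrapped Hudi table. For bootstrap splits (those carrying a skeleton
 * file) it stitches together rows read from the skeleton file (holding the Hudi metadata
 * columns) and from the original bootstrap data file; for regular splits it delegates to the
 * regular file reader, which is already projected into the required schema.
 */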
class HoodieBootstrapRDD(@transient spark: SparkSession,
                         bootstrapDataFileReader: BaseFileReader,
                         bootstrapSkeletonFileReader: BaseFileReader,
                         regularFileReader: BaseFileReader,
                         requiredSchema: HoodieTableSchema,
                         @transient splits: Seq[BaseHoodieBootstrapSplit])
  extends RDD[InternalRow](spark.sparkContext, Nil) {

  protected def getSkeletonIteratorSchema(dataFile: PartitionedFile, skeletonFile: PartitionedFile): (Iterator[InternalRow], StructType) = {
    if (bootstrapDataFileReader.schema.isEmpty) {
      // No data column to fetch, hence fetch only from skeleton file
      (bootstrapSkeletonFileReader.read(skeletonFile), bootstrapSkeletonFileReader.schema)
    } else if (bootstrapSkeletonFileReader.schema.isEmpty) {
      // No metadata column to fetch, hence fetch only from data file
      (bootstrapDataFileReader.read(dataFile), bootstrapDataFileReader.schema)
    } else {
      // Fetch from both data and skeleton file, and merge
      val dataFileIterator = bootstrapDataFileReader.read(dataFile)
      val skeletonFileIterator = bootstrapSkeletonFileReader.read(skeletonFile)
      val mergedSchema = StructType(bootstrapSkeletonFileReader.schema.fields ++ bootstrapDataFileReader.schema.fields)

      (merge(skeletonFileIterator, dataFileIterator), mergedSchema)
    }
  }

  /**
   * Projects the [[InternalRow]]s fetched from the file into the expected target schema.
   * The two schemas could diverge, for example, when the requested schema contains partition
   * columns that might not be persisted within the data file, but would instead be parsed from
   * the partition path. In that case the output of the file reader will have a different
   * ordering of the fields than the original required schema (for more details please check
   * out the [[ParquetFileFormat]] implementation).
   */
  protected def unsafeProjectIterator(iterator: Iterator[InternalRow], schema: StructType): Iterator[InternalRow] = {
    val unsafeProjection = generateUnsafeProjection(schema, requiredSchema.structTypeSchema)
    iterator.map(unsafeProjection)
  }

  protected def maybeLog(bootstrapPartition: HoodieBootstrapPartition): Unit = {
    if (log.isDebugEnabled) {
      // Append the skeleton-file path only for bootstrap splits that have one
      val skeletonFileSuffix = bootstrapPartition.split.skeletonFile
        .map(skeletonFile => s", Skeleton File: ${skeletonFile.filePath}")
        .getOrElse("")
      logDebug(s"Got Split => Index: ${bootstrapPartition.index}, " +
        s"Data File: ${bootstrapPartition.split.dataFile.filePath}$skeletonFileSuffix")
    }
  }

  protected def getIterator(bootstrapPartition: HoodieBootstrapPartition): Iterator[InternalRow] = {
    bootstrapPartition.split.skeletonFile match {
      case Some(skeletonFile) =>
        // It is a bootstrap split. Check both skeleton and data files.
        val (iterator, schema) = getSkeletonIteratorSchema(bootstrapPartition.split.dataFile, skeletonFile)
        unsafeProjectIterator(iterator, schema)
      case _ =>
        // NOTE: Regular file-reader is already projected into the required schema
        regularFileReader.read(bootstrapPartition.split.dataFile)
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
    val bootstrapPartition = split.asInstanceOf[HoodieBootstrapPartition]
    maybeLog(bootstrapPartition)
    getIterator(bootstrapPartition)
  }

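  /**
   * Combines the skeleton-file and data-file iterators into a single iterator of merged rows.
   * Both files are expected to hold the same records in the same order, hence the two
   * iterators are advanced in lock-step and have to stay in-sync.
   */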
  def merge(skeletonFileIterator: Iterator[InternalRow], dataFileIterator: Iterator[InternalRow]): Iterator[InternalRow] = {
    new Iterator[InternalRow] {
      private val combinedRow = new JoinedRow()

      override def hasNext: Boolean = {
        checkState(dataFileIterator.hasNext == skeletonFileIterator.hasNext,
          "Bootstrap data-file iterator and skeleton-file iterator have to be in-sync!")
        dataFileIterator.hasNext && skeletonFileIterator.hasNext
      }

      override def next(): InternalRow = {
        combinedRow(skeletonFileIterator.next(), dataFileIterator.next())
      }
    }
  }

  override protected def getPartitions: Array[Partition] = {
    splits.zipWithIndex.map { case (split, index) =>
      if (split.skeletonFile.isDefined) {
        logDebug(s"Forming partition with => Index: $index, Files: ${split.dataFile.filePath}," +
          s"${split.skeletonFile.get.filePath}")
      } else {
        logDebug(s"Forming partition with => Index: $index, File: ${split.dataFile.filePath}")
      }
      HoodieBootstrapPartition(index, split)
    }.toArray
  }
}
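
// NOTE: The sketch below is not part of the original file. It is a minimal, self-contained
// illustration of the field re-ordering problem that unsafeProjectIterator solves, using
// Spark's UnsafeProjection directly instead of Hudi's generateUnsafeProjection helper;
// the schema, row values and object name are hypothetical.
object UnsafeProjectionSketch {
  import org.apache.spark.sql.catalyst.InternalRow
  import org.apache.spark.sql.catalyst.expressions.{BoundReference, UnsafeProjection}
  import org.apache.spark.sql.types.{IntegerType, StringType}
  import org.apache.spark.unsafe.types.UTF8String

  def main(args: Array[String]): Unit = {
    // Hypothetical reader output: the partition column (parsed from the partition path)
    // comes first, while the required schema expects (value, partition_col)
    val readerRow = InternalRow(UTF8String.fromString("2024-01-01"), 42)

    // BoundReference selects an input field by ordinal, so listing field 1 before
    // field 0 re-orders the row into the required schema
    val projection = UnsafeProjection.create(Seq(
      BoundReference(1, IntegerType, nullable = true),
      BoundReference(0, StringType, nullable = true)))

    val projected = projection(readerRow)
    assert(projected.getInt(0) == 42) // 'value' is now the first field
  }
}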

case class HoodieBootstrapPartition(index: Int, split: BaseHoodieBootstrapSplit) extends Partition
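
// NOTE: The sketch below is not part of the original file. It is a minimal, self-contained
// illustration of how JoinedRow lets the merge above stitch a skeleton row (Hudi metadata
// columns) to a data row without copying any values; the column layout, values and object
// name are hypothetical.
object JoinedRowSketch {
  import org.apache.spark.sql.catalyst.InternalRow
  import org.apache.spark.sql.catalyst.expressions.JoinedRow
  import org.apache.spark.unsafe.types.UTF8String

  def main(args: Array[String]): Unit = {
    // One hypothetical metadata column coming from the skeleton file...
    val skeletonRow = InternalRow(UTF8String.fromString("20240101000000"))
    // ...and two hypothetical source columns coming from the bootstrap data file
    val dataRow = InternalRow(42, UTF8String.fromString("value"))

    // JoinedRow is a view over both rows: ordinals 0..(skeleton arity - 1) resolve into
    // the skeleton row, the remaining ordinals into the data row
    val merged = new JoinedRow(skeletonRow, dataRow)
    assert(merged.numFields == 3)
    assert(merged.getInt(1) == 42) // field 1 falls through to dataRow's field 0
  }
}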