org.apache.hudi.HoodieBootstrapMORRelation.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hudi-spark-bundle_2.11 Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.common.model.{FileSlice, HoodieLogFile}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.storage.StoragePath
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources.PartitionedFile
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType

import scala.collection.JavaConverters._

/**
 * File split used when reading a bootstrapped merge-on-read table: a data file, an optional
 * skeleton file and the log files that belong to the same file slice.
 *
 * @param dataFile     the data file of the split
 * @param skeletonFile the skeleton file paired with the data file, if there is one
 * @param logFiles     the log files of the file slice this split was created from
 */
case class HoodieBootstrapMORSplit(
    dataFile: PartitionedFile,
    skeletonFile: Option[PartitionedFile],
    logFiles: List[HoodieLogFile])
  extends BaseHoodieBootstrapSplit

/**
 * This is a Spark relation that can be used for querying metadata/fully bootstrapped hoodie tables, as well as
 * non-bootstrapped tables. It implements PrunedFilteredScan interface in order to support column pruning and filter
 * push-down. For metadata bootstrapped files, if we query columns from both metadata and actual data then it will
 * perform a merge of both to return the result.
 *
 * Caveat: Filter push-down does not work when querying both metadata and actual data columns over metadata
 * bootstrapped files, because then the metadata file and data file can return a different number of rows, causing errors
 * when merging.
 *
 * @param sqlContext Spark SQL Context
 * @param userSchema User specified schema in the datasource query
 * @param globPaths  The glob paths to query. If non-empty, data is read from the globPaths;
 *                   otherwise data is read from tablePath using HoodieFileIndex.
 * @param metaClient Hoodie table meta client
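 * @param optParams  DataSource options passed by the user
 * @param prunedDataSchema Optional pruned data schema passed on to the base relation; defaults to None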
 */
case class HoodieBootstrapMORRelation(override val sqlContext: SQLContext,
                                      private val userSchema: Option[StructType],
                                      private val globPaths: Seq[StoragePath],
                                      override val metaClient: HoodieTableMetaClient,
                                      override val optParams: Map[String, String],
                                      private val prunedDataSchema: Option[StructType] = None)
  extends BaseHoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient,
    optParams, prunedDataSchema) {

  /** Concrete relation type; this is what [[updatePrunedDataSchema]] returns. */
  override type Relation = HoodieBootstrapMORRelation

  /**
   * Fields required for merging: the record key field, followed by the pre-combine field
   * when one is defined.
   */
  protected lazy val mandatoryFieldsForMerging: Seq[String] = {
    val preCombineFields = preCombineFieldOpt.map(Seq(_)).getOrElse(Seq.empty)
    recordKeyField +: preCombineFields
  }

  /** The mandatory fields of this relation are exactly the fields required for merging. */
  override lazy val mandatoryFields: Seq[String] = mandatoryFieldsForMerging

  /**
   * Lists the file slices to read.
   *
   * When glob paths were supplied, the latest file slices under those paths are listed using
   * both the partition and the data filters. Otherwise the slices come from the file index,
   * using only the partition filters (after conversion through
   * `HoodieFileIndex.convertFilterForTimestampKeyGenerator`); `dataFilters` is not consulted
   * on that path.
   *
   * @param partitionFilters filters on partition columns
   * @param dataFilters      filters on data columns
   * @return the file slices selected for the scan
   */
  protected override def getFileSlices(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[FileSlice] = {
    if (globPaths.nonEmpty) {
      listLatestFileSlices(globPaths, partitionFilters, dataFilters)
    } else {
      val convertedPartitionFilters =
        HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilters)
      val slicesPerPartition = fileIndex.listFileSlices(convertedPartitionFilters)
      slicesPerPartition.values.flatten.toSeq
    }
  }

  /**
   * Builds the split for one file slice, attaching the slice's log files ordered by
   * `HoodieLogFile.getLogFileComparator`.
   *
   * @param fileSlice    file slice the split is created for
   * @param dataFile     data file of the split
   * @param skeletonFile skeleton file of the split, if any
   * @return a [[HoodieBootstrapMORSplit]] carrying the data file, skeleton file and sorted log files
   */
  protected override def createFileSplit(fileSlice: FileSlice, dataFile: PartitionedFile, skeletonFile: Option[PartitionedFile]): FileSplit = {
    val orderedLogFiles: List[HoodieLogFile] =
      fileSlice.getLogFiles
        .sorted(HoodieLogFile.getLogFileComparator)
        .iterator()
        .asScala
        .toList

    HoodieBootstrapMORSplit(dataFile, skeletonFile, orderedLogFiles)
  }

  /**
   * Creates the RDD that scans the given splits.
   *
   * The three file readers (bootstrap data file, bootstrap skeleton file, regular file) are
   * obtained from `getFileReaders` and handed to a [[HoodieBootstrapMORRDD]] together with the
   * schemas, the table state and the splits.
   *
   * @param fileSplits       splits to scan
   * @param tableSchema      schema of the table
   * @param requiredSchema   schema required by the query
   * @param requestedColumns columns requested by the query
   * @param filters          filters handed to the file readers
   * @return RDD of internal rows produced by [[HoodieBootstrapMORRDD]]
   */
  protected override def composeRDD(fileSplits: Seq[FileSplit],
                                    tableSchema: HoodieTableSchema,
                                    requiredSchema: HoodieTableSchema,
                                    requestedColumns: Array[String],
                                    filters: Array[Filter]): RDD[InternalRow] = {
    val (dataFileReader, skeletonFileReader, baseFileReader) =
      getFileReaders(tableSchema, requiredSchema, requestedColumns, filters)

    new HoodieBootstrapMORRDD(
      sqlContext.sparkSession,
      config = jobConf,
      bootstrapDataFileReader = dataFileReader,
      bootstrapSkeletonFileReader = skeletonFileReader,
      regularFileReader = baseFileReader,
      tableSchema = tableSchema,
      requiredSchema = requiredSchema,
      tableState = tableState,
      fileSplits)
  }

  /**
   * Returns a copy of this relation whose pruned data schema is set to `prunedSchema`.
   *
   * @param prunedSchema the pruned schema to record on the copy
   */
  override def updatePrunedDataSchema(prunedSchema: StructType): Relation = {
    val updatedSchema = Some(prunedSchema)
    copy(prunedDataSchema = updatedSchema)
  }
}