/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.cdc

import org.apache.hudi.common.table.cdc.HoodieCDCExtractor
import org.apache.hudi.common.table.cdc.HoodieCDCOperation._
import org.apache.hudi.common.table.cdc.HoodieCDCSupplementalLoggingMode._
import org.apache.hudi.common.table.cdc.HoodieCDCUtils._
import org.apache.hudi.common.table.log.InstantRange
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.internal.schema.InternalSchema
import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, HoodieDataSourceHelper, HoodieTableSchema}

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.unsafe.types.UTF8String

import scala.collection.JavaConverters._
import scala.util.{Failure, Success, Try}

/**
 * Hoodie CDC Relation. Extends Spark's [[BaseRelation]] to provide the CDC schema and a
 * [[buildScan]] implementation that returns the change data captured within a specified
 * instant range.
 */
class CDCRelation(
    override val sqlContext: SQLContext,
    metaClient: HoodieTableMetaClient,
    startInstant: String,
    endInstant: String,
    options: Map[String, String]
) extends BaseRelation with PrunedFilteredScan with Logging {
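
  // Illustrative usage (a sketch, not part of this class's API): a CDC read typically reaches
  // this relation through the Hudi Spark DataSource as an incremental query in CDC format.
  // The option keys below are taken from DataSourceReadOptions; exact names may differ across
  // Hudi versions:
  //
  //   spark.read.format("hudi")
  //     .option("hoodie.datasource.query.type", "incremental")
  //     .option("hoodie.datasource.query.incremental.format", "cdc")
  //     .option("hoodie.datasource.read.begin.instanttime", "20240101000000")
  //     .load(basePath)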

  imbueConfigs(sqlContext)

  val spark: SparkSession = sqlContext.sparkSession
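
  // Resolve the table's Avro schema (and, when available, the internal schema recorded in
  // commit metadata); fails fast if the table schema cannot be resolved.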
  val (tableAvroSchema, _) = {
    val schemaUtil = new TableSchemaResolver(metaClient)
    val avroSchema = Try(schemaUtil.getTableAvroSchema) match {
      case Success(schema) => schema
      case Failure(e) =>
        throw new IllegalArgumentException("Failed to fetch schema from the table", e)
    }
    // try to find internalSchema
    val internalSchemaFromMeta = try {
      schemaUtil.getTableInternalSchemaFromCommitMetadata.orElse(InternalSchema.getEmptyInternalSchema)
    } catch {
      case _: Exception => InternalSchema.getEmptyInternalSchema
    }
    (avroSchema, internalSchemaFromMeta)
  }

  val tableStructSchema: StructType = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
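
  // Extractor that identifies the CDC file splits for commits falling in the
  // (startInstant, endInstant] range (OPEN_CLOSED: start exclusive, end inclusive).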
  val cdcExtractor: HoodieCDCExtractor =
    new HoodieCDCExtractor(
      metaClient,
      InstantRange.builder()
        .startInstant(startInstant)
        .endInstant(endInstant)
        .nullableBoundary(true)
        .rangeType(InstantRange.RangeType.OPEN_CLOSED).build())
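
  // Returning false tells Spark that buildScan already produces InternalRow,
  // so no Row-to-InternalRow conversion is needed.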
  override final def needConversion: Boolean = false

  override def schema: StructType = CDCRelation.FULL_CDC_SPARK_SCHEMA
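
  // Because needConversion is false, the RDD[InternalRow] built below can be returned
  // directly; the cast to RDD[Row] only satisfies the PrunedFilteredScan signature.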
  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val internalRows = buildScan0(requiredColumns, filters)
    internalRows.asInstanceOf[RDD[Row]]
  }
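
  /**
   * Builds the CDC scan: reads the base files with a parquet reader, extracts the CDC file
   * splits in the instant range, and assembles them into a [[HoodieCDCRDD]].
   */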
  def buildScan0(requiredColumns: Array[String], filters: Array[Filter]): RDD[InternalRow] = {
    val nameToField = schema.fields.map(f => f.name -> f).toMap
    val requiredSchema = StructType(requiredColumns.map(nameToField))
    val originTableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString)
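
    // Reader for the base parquet files: the full table schema is read and no filters are
    // pushed down, so that complete before/after images can be reconstructed per record.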
    val parquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
      sparkSession = spark,
      dataSchema = tableStructSchema,
      partitionSchema = StructType(Nil),
      requiredSchema = tableStructSchema,
      filters = Nil,
      options = options,
      hadoopConf = spark.sessionState.newHadoopConf()
    )
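
    // Group the extracted CDC file splits per file group and sort the splits within each group.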
    val changes = cdcExtractor.extractCDCFileSplits().values().asScala.map { splits =>
      HoodieCDCFileGroupSplit(
        splits.asScala.sorted.toArray
      )
    }
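
    // The RDD that materializes the change data, one Spark partition per file group split.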
    val cdcRdd = new HoodieCDCRDD(
      spark,
      metaClient,
      parquetReader,
      originTableSchema,
      schema,
      requiredSchema,
      changes.toArray
    )
    cdcRdd.asInstanceOf[RDD[InternalRow]]
  }
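
  /**
   * Adjusts the Spark session configuration required for CDC reads.
   */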
  def imbueConfigs(sqlContext: SQLContext): Unit = {
    // Disable vectorized reading for CDC relation
    sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false")
  }
}

object CDCRelation {

  val CDC_OPERATION_DELETE: UTF8String = UTF8String.fromString(DELETE.getValue)
  val CDC_OPERATION_INSERT: UTF8String = UTF8String.fromString(INSERT.getValue)
  val CDC_OPERATION_UPDATE: UTF8String = UTF8String.fromString(UPDATE.getValue)

  /**
   * The full CDC schema for Spark, following the Debezium format.
   * It is also the schema used when `hoodie.table.cdc.supplemental.logging.mode` is [[DATA_BEFORE_AFTER]].
   */
  val FULL_CDC_SPARK_SCHEMA: StructType = {
    StructType(
      Seq(
        StructField(CDC_OPERATION_TYPE, StringType),
        StructField(CDC_COMMIT_TIMESTAMP, StringType),
        StructField(CDC_BEFORE_IMAGE, StringType),
        StructField(CDC_AFTER_IMAGE, StringType)
      )
    )
  }

  /**
   * CDC schema for Spark when `hoodie.table.cdc.supplemental.logging.mode` is [[OP_KEY_ONLY]].
   */
  val MIN_CDC_SPARK_SCHEMA: StructType = {
    StructType(
      Seq(
        StructField(CDC_OPERATION_TYPE, StringType),
        StructField(CDC_RECORD_KEY, StringType)
      )
    )
  }

  /**
   * CDC schema for Spark when `hoodie.table.cdc.supplemental.logging.mode` is [[DATA_BEFORE]].
   */
  val CDC_WITH_BEFORE_SPARK_SCHEMA: StructType = {
    StructType(
      Seq(
        StructField(CDC_OPERATION_TYPE, StringType),
        StructField(CDC_RECORD_KEY, StringType),
        StructField(CDC_BEFORE_IMAGE, StringType)
      )
    )
  }
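
  /**
   * Whether the CDC feature is enabled in the given table's config.
   */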
  def isCDCEnabled(metaClient: HoodieTableMetaClient): Boolean = {
    metaClient.getTableConfig.isCDCEnabled
  }

  /**
   * The only entry point for creating a [[CDCRelation]].
   */
  def getCDCRelation(
      sqlContext: SQLContext,
      metaClient: HoodieTableMetaClient,
      options: Map[String, String]): CDCRelation = {

    if (!isCDCEnabled(metaClient)) {
      throw new IllegalArgumentException(s"CDC is not enabled on the Hudi table at ${metaClient.getBasePath}")
    }

    val startingInstant = options.getOrElse(DataSourceReadOptions.BEGIN_INSTANTTIME.key(),
      throw new HoodieException("A CDC query must provide a valid start version or timestamp")
    )
    val endingInstant = options.getOrElse(DataSourceReadOptions.END_INSTANTTIME.key(),
      getTimestampOfLatestInstant(metaClient)
    )
    if (startingInstant > endingInstant) {
      throw new HoodieException(s"Invalid instant range: start instant $startingInstant is greater than end instant $endingInstant")
    }

    new CDCRelation(sqlContext, metaClient, startingInstant, endingInstant, options)
  }
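
  /**
   * Returns the timestamp of the latest instant on the active timeline, failing if the
   * timeline is empty.
   */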
  def getTimestampOfLatestInstant(metaClient: HoodieTableMetaClient): String = {
    val latestInstant = metaClient.getActiveTimeline.lastInstant()
    if (latestInstant.isPresent) {
      latestInstant.get().getTimestamp
    } else {
      throw new HoodieException("No valid instant in Active Timeline.")
    }
  }
}