org.apache.hudi.HoodieFileIndex.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.hudi.BaseHoodieTableFileIndex.PartitionPath
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD}
import org.apache.hudi.HoodieFileIndex.{DataSkippingFailureMode, collectReferencedColumns, convertFilterForTimestampKeyGenerator, getConfigProperties}
import org.apache.hudi.HoodieSparkConfUtils.getConfigValue
import org.apache.hudi.common.config.TimestampKeyGeneratorConfig.{TIMESTAMP_INPUT_DATE_FORMAT, TIMESTAMP_OUTPUT_DATE_FORMAT}
import org.apache.hudi.common.config.{HoodieMetadataConfig, TypedProperties}
import org.apache.hudi.common.model.{FileSlice, HoodieBaseFile, HoodieLogFile}
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
import org.apache.hudi.common.util.StringUtils
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator}
import org.apache.hudi.storage.{StoragePath, StoragePathInfo}
import org.apache.hudi.util.JFunction

import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory}
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

import java.text.SimpleDateFormat
import java.util.stream.Collectors
import javax.annotation.concurrent.NotThreadSafe

import scala.collection.JavaConverters._
import scala.util.control.NonFatal
import scala.util.{Failure, Success, Try}

/**
 * A file index which supports partition pruning for Hudi snapshot and read-optimized queries.
 *
 * Main steps to get the file list for a query:
 * 1. Load all files and partition values from the table path.
 * 2. Prune the partitions using the partition filter conditions.
 *
 * There are 3 cases for this:
 * 1. If the number of partition columns equals the number of partition path levels, we
 * read it as a partitioned table (e.g. partition column is "dt", the partition path is "2021-03-10").
 *
 * 2. If the number of partition columns does not equal the number of partition path levels, but
 * there is exactly one partition column (e.g. partition column is "dt", but the partition path is
 * "2021/03/10", whose directory level is 3), we can still read it as a partitioned table by mapping
 * the whole partition path (e.g. "2021/03/10") to the single partition column (e.g. "dt").
 *
 * 3. Otherwise, if the number of partition columns does not equal the number of partition path
 * levels and there is more than one partition column (e.g. partition columns are "dt,hh", the
 * partition path is "2021/03/10/12"), we read it as a non-partitioned table because we cannot know
 * how to map the partition path to the partition columns in this case.
 *
 * When the flag `shouldUseStringTypeForTimestampPartitionKeyType` is set to true, the schema type for
 * timestamp partition columns in the custom key generator is set to STRING instead of the base schema
 * type for that field. This makes sure that with an output partition format such as DD/MM/YYYY, there
 * are no incompatible-schema errors while reading the table. The partition values are also converted
 * to strings. For example, if the output partition format is YYYY, the value could be represented as
 * an integer when the base schema type is integer; but since we upgrade the schema to STRING for all
 * output formats, we need to convert the partition values to STRING as well to avoid errors.
 *
 * TODO rename to HoodieSparkSqlFileIndex
 */
@NotThreadSafe
case class HoodieFileIndex(spark: SparkSession,
                           metaClient: HoodieTableMetaClient,
                           schemaSpec: Option[StructType],
                           options: Map[String, String],
                           @transient fileStatusCache: FileStatusCache = NoopCache,
                           includeLogFiles: Boolean = false,
                           shouldEmbedFileSlices: Boolean = false)
  extends SparkHoodieTableFileIndex(
    spark = spark,
    metaClient = metaClient,
    schemaSpec = schemaSpec,
    configProperties = getConfigProperties(spark, options, metaClient.getTableConfig),
    queryPaths = HoodieFileIndex.getQueryPaths(options),
    specifiedQueryInstant = options.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key).map(HoodieSqlCommonUtils.formatQueryInstant),
    fileStatusCache = fileStatusCache,
    startCompletionTime = options.get(DataSourceReadOptions.START_COMMIT.key),
    endCompletionTime = options.get(DataSourceReadOptions.END_COMMIT.key)) with FileIndex {
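
  // Illustrative usage sketch (not part of this class): constructing the index and listing files the
  // way Spark's planner would. The table path below is hypothetical, and `spark`/`metaClient` are
  // assumed to be an active SparkSession and a resolved HoodieTableMetaClient.
  //
  //   val fileIndex = HoodieFileIndex(
  //     spark,
  //     metaClient,
  //     schemaSpec = None,
  //     options = Map("path" -> "s3://bucket/hudi_table"),
  //     includeLogFiles = true)
  //   val partitionDirs = fileIndex.listFiles(partitionFilters = Seq.empty, dataFilters = Seq.empty)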

  @transient protected var hasPushedDownPartitionPredicates: Boolean = false

  /**
   * NOTE: [[indicesSupport]] is transient state, since it's only relevant while the logical plan
   *       is handled by Spark's driver.
   *       The order of elements is important, as the indices are applied in this order
   *       during `lookupCandidateFilesInMetadataTable`
   */
  @transient private lazy val indicesSupport: List[SparkBaseIndexSupport] = List(
    new RecordLevelIndexSupport(spark, metadataConfig, metaClient),
    new BucketIndexSupport(spark, metadataConfig, metaClient),
    new SecondaryIndexSupport(spark, metadataConfig, metaClient),
    new ExpressionIndexSupport(spark, schema, metadataConfig, metaClient),
    new BloomFiltersIndexSupport(spark, metadataConfig, metaClient),
    new ColumnStatsIndexSupport(spark, schema, metadataConfig, metaClient)
  )

  private val enableHoodieExtension = spark.sessionState.conf.getConfString("spark.sql.extensions", "")
    .split(",")
    .map(_.trim)
    .contains("org.apache.spark.sql.hudi.HoodieSparkSessionExtension")

  override def rootPaths: Seq[Path] = getQueryPaths.asScala.map(e => new Path(e.toUri)).toSeq

  /**
   * Returns the FileStatus for all the base files (excluding log files). This should be used only for
   * cases where Spark directly fetches the list of files via HoodieFileIndex, or for read-optimized query logic
   * implemented internally within Hudi, like HoodieBootstrapRelation. This helps avoid the use of a path filter
   * to filter out log files within Spark.
   *
   * @return List of FileStatus for base files
   */
  def allBaseFiles: Seq[StoragePathInfo] = {
    getAllInputFileSlices.values.asScala.flatMap(_.asScala)
      .map(fs => fs.getBaseFile.orElse(null))
      .filter(_ != null)
      .map(_.getPathInfo)
      .toSeq
  }

  /**
   * Returns the FileStatus for all the base files and log files.
   *
   * @return List of FileStatus for base files and log files
   */
  private def allBaseFilesAndLogFiles: Seq[StoragePathInfo] = {
    getAllInputFileSlices.values.asScala.flatMap(_.asScala)
      .flatMap(fs => {
        val baseFileStatusOpt = getBaseFileInfo(Option.apply(fs.getBaseFile.orElse(null)))
        val logFilesStatus = fs.getLogFiles.map[StoragePathInfo](JFunction.toJavaFunction[HoodieLogFile, StoragePathInfo](lf => lf.getPathInfo))
        val files = logFilesStatus.collect(Collectors.toList[StoragePathInfo]).asScala
        baseFileStatusOpt.foreach(f => files.append(f))
        files
      }).toSeq
  }

  /**
   * Invoked by Spark to fetch the list of latest base files per partition.
   *
   * @param partitionFilters partition column filters
   * @param dataFilters      data column filters
   * @return list of PartitionDirectory containing the partition-to-base-files mapping
   */
  override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
    val prunedPartitionsAndFilteredFileSlices = filterFileSlices(dataFilters, partitionFilters).map {
      case (partitionOpt, fileSlices) =>
        if (shouldEmbedFileSlices) {
          val baseFileStatusesAndLogFileOnly: Seq[FileStatus] = fileSlices.map(slice => {
            if (slice.getBaseFile.isPresent) {
              slice.getBaseFile.get().getPathInfo
            } else if (slice.hasLogFiles) {
              slice.getLogFiles.findAny().get().getPathInfo
            } else {
              null
            }
          }).filter(slice => slice != null)
            .map(fileInfo => new FileStatus(fileInfo.getLength, fileInfo.isDirectory, 0, fileInfo.getBlockSize,
              fileInfo.getModificationTime, new Path(fileInfo.getPath.toUri)))
          val c = fileSlices.filter(f => f.hasLogFiles || f.hasBootstrapBase).foldLeft(Map[String, FileSlice]()) { (m, f) => m + (f.getFileId -> f) }
          if (c.nonEmpty) {
            sparkAdapter.getSparkPartitionedFileUtils.newPartitionDirectory(
              new HoodiePartitionFileSliceMapping(InternalRow.fromSeq(partitionOpt.get.values), c), baseFileStatusesAndLogFileOnly)
          } else {
            sparkAdapter.getSparkPartitionedFileUtils.newPartitionDirectory(
              InternalRow.fromSeq(partitionOpt.get.values), baseFileStatusesAndLogFileOnly)
          }

        } else {
          val allCandidateFiles: Seq[FileStatus] = fileSlices.flatMap(fs => {
            val baseFileStatusOpt = getBaseFileInfo(Option.apply(fs.getBaseFile.orElse(null)))
            val logPathInfoStream = fs.getLogFiles.map[StoragePathInfo](JFunction.toJavaFunction[HoodieLogFile, StoragePathInfo](lf => lf.getPathInfo))
            val files = logPathInfoStream.collect(Collectors.toList[StoragePathInfo]).asScala
            baseFileStatusOpt.foreach(f => files.append(f))
            files
          })
            .map(fileInfo => new FileStatus(fileInfo.getLength, fileInfo.isDirectory, 0, fileInfo.getBlockSize,
              fileInfo.getModificationTime, new Path(fileInfo.getPath.toUri)))
          sparkAdapter.getSparkPartitionedFileUtils.newPartitionDirectory(
            InternalRow.fromSeq(partitionOpt.get.values), allCandidateFiles)
        }
    }

    hasPushedDownPartitionPredicates = true

    if (shouldReadAsPartitionedTable()) {
      prunedPartitionsAndFilteredFileSlices
    } else if (shouldEmbedFileSlices) {
      assert(partitionSchema.isEmpty)
      prunedPartitionsAndFilteredFileSlices
    } else {
      Seq(PartitionDirectory(InternalRow.empty, prunedPartitionsAndFilteredFileSlices.flatMap(_.files)))
    }
  }
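
  // Illustrative sketch of consuming the result above (hypothetical `fileIndex`): each
  // PartitionDirectory carries the partition values as an InternalRow plus the pruned file listing.
  //
  //   fileIndex.listFiles(partitionFilters, dataFilters).foreach { dir =>
  //     logInfo(s"partition values: ${dir.values}, number of files: ${dir.files.size}")
  //   }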

  /**
   * Prunes the partition paths based on the input partition filters. For every partition path, the file
   * slices are further filtered, after querying the metadata table, based on the data filters.
   *
   * @param dataFilters       data column filters
   * @param partitionFilters  partition column filters
   * @param isPartitionPruned if true, the call originates from the HoodiePruneFileSourcePartitions rule and
   *                          only partition pruning is performed (no candidate-file lookup)
   * @return A sequence of pruned partitions and corresponding filtered file slices
   */
  def filterFileSlices(dataFilters: Seq[Expression], partitionFilters: Seq[Expression], isPartitionPruned: Boolean = false)
  : Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {

    val (isPruned, prunedPartitionsAndFileSlices) =
      prunePartitionsAndGetFileSlices(dataFilters, partitionFilters)
    hasPushedDownPartitionPredicates = true

    // If there are no data filters, return all the file slices.
    // If isPartitionPruned is true, this call is triggered by HoodiePruneFileSourcePartitions; don't look up candidate files.
    // If there are no file slices, return an empty list.
    if (prunedPartitionsAndFileSlices.isEmpty || dataFilters.isEmpty || isPartitionPruned ) {
      prunedPartitionsAndFileSlices
    } else {
      // Look up candidate file names in the metadata table indices, if all of the following conditions are true
      //    - Data-skipping is enabled
      //    - At least one supported index (e.g. col-stats or record-level index) is present
      //    - List of predicates (filters) is present
      val candidateFilesNamesOpt: Option[Set[String]] =
        lookupCandidateFilesInMetadataTable(dataFilters, prunedPartitionsAndFileSlices, isPruned) match {
        case Success(opt) => opt
        case Failure(e) =>
          logError("Failed to lookup candidate files in File Index", e)

          spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
            case DataSkippingFailureMode.Fallback.value => Option.empty
            case DataSkippingFailureMode.Strict.value => throw new HoodieException(e);
          }
      }

      logDebug(s"Overlapping candidate files from Column Stats or Record Level Index: ${candidateFilesNamesOpt.getOrElse(Set.empty)}")

      var totalFileSliceSize = 0
      var candidateFileSliceSize = 0

      val prunedPartitionsAndFilteredFileSlices = prunedPartitionsAndFileSlices.map {
        case (partitionOpt, fileSlices) =>
          // Filter in candidate files based on the col-stats or record level index lookup
          val candidateFileSlices: Seq[FileSlice] = {
            fileSlices.filter(fs => {
              val fileSliceFiles = fs.getLogFiles.map[String](JFunction.toJavaFunction[HoodieLogFile, String](lf => lf.getPath.getName))
                .collect(Collectors.toSet[String])
              val baseFileStatusOpt = getBaseFileInfo(Option.apply(fs.getBaseFile.orElse(null)))
              baseFileStatusOpt.exists(f => fileSliceFiles.add(f.getPath.getName))
              if (candidateFilesNamesOpt.isDefined) {
                // If any file in the file slice is part of the candidate file names, we need to include the file slice.
                // In other words, if none of the files in the file slice are present in the candidate file names, we can filter out the file slice.
                fileSliceFiles.stream().filter(fileSliceFile => candidateFilesNamesOpt.get.contains(fileSliceFile)).findAny().isPresent
              } else {
                true
              }
            })
          }

          totalFileSliceSize += fileSlices.size
          candidateFileSliceSize += candidateFileSlices.size
          (partitionOpt, candidateFileSlices)
      }

      val skippingRatio =
        if (!areAllFileSlicesCached) -1
        else if (getAllFiles().nonEmpty && totalFileSliceSize > 0)
          (totalFileSliceSize - candidateFileSliceSize) / totalFileSliceSize.toDouble
        else 0

      logInfo(s"Total file slices: $totalFileSliceSize; " +
        s"candidate file slices after data skipping: $candidateFileSliceSize; " +
        s"skipping percentage $skippingRatio")

      prunedPartitionsAndFilteredFileSlices
    }
  }

  /**
   * Prunes the table partitions to list, if possible.
   *
   * @param dataFilters      filters based on data columns
   * @param partitionFilters filters based on partition columns
   * @return a pair of elements, with the first element indicating whether partition pruning
   *         was applied, and the second element being a list of partition paths and file slices
   */
  def prunePartitionsAndGetFileSlices(dataFilters: Seq[Expression],
                                      partitionFilters: Seq[Expression]):
  (Boolean, Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])]) = {
    val isPartitionedTable = getPartitionColumns.length > 0
    val prunedPartitionsTuple: (Boolean, Seq[PartitionPath]) =
      if (isPartitionedTable && partitionFilters.nonEmpty) {
        // For partitioned table and partition filters, prune the partitions by the partition filters
        if (shouldEmbedFileSlices) {
          (true, listMatchingPartitionPaths(convertFilterForTimestampKeyGenerator(metaClient, partitionFilters)))
        } else {
          (true, listMatchingPartitionPaths(partitionFilters))
        }
      } else if (isPartitionedTable && isDataSkippingEnabled) {
        // For partitioned table and no partition filters, if data skipping is enabled,
        // try using the PARTITION_STATS index to prune the partitions
        lazy val filterReferencedColumns = collectReferencedColumns(spark, dataFilters, schema)
        val prunedPartitionPaths = new PartitionStatsIndexSupport(spark, schema, metadataConfig, metaClient)
          .prunePartitions(this, dataFilters, filterReferencedColumns)
        if (prunedPartitionPaths.nonEmpty) {
          try {
            (true, prunedPartitionPaths.get.map(e => convertToPartitionPath(e)).toSeq)
          } catch {
            // If the partition values cannot be parsed by [[convertToPartitionPath]],
            // fall back to listing all partitions
            case _: HoodieException => (false, listMatchingPartitionPaths(Seq.empty))
          }
        } else {
          // Cannot use partition stats index (not available) for pruning partitions,
          // fall back to listing all partitions
          (false, listMatchingPartitionPaths(Seq.empty))
        }
      } else {
        // Listing all partitions for non-partitioned table,
        // or partitioned table without partition filter or data skipping or PARTITION_STATS index
        (false, listMatchingPartitionPaths(Seq.empty))
      }

    (prunedPartitionsTuple._1, getInputFileSlices(prunedPartitionsTuple._2: _*).asScala.map(
      { case (partition, fileSlices) =>
        (Option.apply(partition), fileSlices.asScala.map(f => f.withLogFiles(includeLogFiles)).toSeq) }).toSeq)
  }

  /**
   * In the fast bootstrap read code path, this returns the path info for the bootstrap base file instead of
   * the skeleton file. Otherwise, returns the path info for the base file, if available.
   */
  protected def getBaseFileInfo(baseFileOpt: Option[HoodieBaseFile]): Option[StoragePathInfo] = {
    baseFileOpt.map(baseFile => {
      if (shouldFastBootstrap) {
        if (baseFile.getBootstrapBaseFile.isPresent) {
          baseFile.getBootstrapBaseFile.get().getPathInfo
        } else {
          baseFile.getPathInfo
        }
      } else {
        baseFile.getPathInfo
      }
    })
  }

  /**
   * Computes a pruned list of candidate base-file names based on the provided {@code queryFilters}
   * conditions, by leveraging the Metadata Table's Record Level Index and Column Statistics Index (hereon referred
   * to as ColStats for brevity), bearing "min", "max", "num_nulls" statistics for all columns.
   *
   * NOTE: This method has to return a complete set of candidate files, since only the provided candidates will
   * ultimately be scanned as part of query execution. Hence, this method has to maintain the
   * invariant of conservatively including every base file and log file whose name is NOT referenced in the index.
   *
   * @param queryFilters list of original data filters passed down from the querying engine
   * @return list of pruned (data-skipped) candidate base-file and log-file names
   */
  // scalastyle:off return
  private def lookupCandidateFilesInMetadataTable(queryFilters: Seq[Expression],
                                                  prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
                                                  shouldPushDownFilesFilter: Boolean): Try[Option[Set[String]]] = Try {
    // NOTE: For column stats, Data Skipping is only effective when it references columns that are indexed w/in
    //       the Column Stats Index (CSI). The following cases cannot be effectively handled by Data Skipping:
    //          - Expressions on top-level columns' nested fields (e.g. filters like "struct.field > 0", since
    //          CSI only contains stats for top-level columns, in this case for "struct")
    //          - Any expression not directly referencing a top-level column (e.g. sub-queries, since there's
    //          nothing in particular CSI could be applied to)
    //       For the record index, Data Skipping is only effective when one of the query filters is an EqualTo
    //       or IN predicate on simple record keys. In such a case the record index is used to filter the file slices
    //       and candidate files are obtained from these file slices.

    lazy val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema)
    if (isDataSkippingEnabled) {
      for(indexSupport: SparkBaseIndexSupport <- indicesSupport) {
        if (indexSupport.isIndexAvailable && indexSupport.supportsQueryType(options)) {
          val prunedFileNames = indexSupport.computeCandidateIsStrict(spark, this, queryFilters, queryReferencedColumns,
            prunedPartitionsAndFileSlices, shouldPushDownFilesFilter)
          if (prunedFileNames.nonEmpty) {
            return Try(prunedFileNames)
          }
        }
      }
    }
    validateConfig()
    Option.empty
  }
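
  // Illustrative sketch of filters and how they interact with the data skipping described above
  // (column names are hypothetical):
  //
  //   spark.read.format("hudi").load(basePath)
  //     .where("price > 100")            // top-level column: prunable via the column stats index
  //     .where("uuid = 'key-1'")         // EqualTo on a simple record key: record-level index applies
  //     .where("address.zip = '94107'")  // nested field: the column stats index cannot prune this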

  override def refresh(): Unit = {
    super.refresh()
    indicesSupport.foreach(idx => idx.invalidateCaches())
    hasPushedDownPartitionPredicates = false
  }

  private def getAllFiles(): Seq[StoragePathInfo] = {
    if (includeLogFiles) allBaseFilesAndLogFiles else allBaseFiles
  }

  override def inputFiles: Array[String] =
    getAllFiles().map(_.getPath.toString).toArray

  override def sizeInBytes: Long = {
    val size = getTotalCachedFilesSize
    if (size == 0 && !enableHoodieExtension) {
      // Avoid always broadcasting the Hudi table when the Hudi Spark extension is not enabled
      logWarning("Note: Please add 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension' to the Spark SQL configuration property " +
        "'spark.sql.extensions'.\n Multiple extensions can be set using a comma-separated list.")
      Long.MaxValue
    } else {
      size
    }
  }

  def hasPredicatesPushedDown: Boolean =
    hasPushedDownPartitionPredicates

  private def isDataSkippingEnabled: Boolean = getConfigValue(options, spark.sessionState.conf,
    DataSourceReadOptions.ENABLE_DATA_SKIPPING.key, DataSourceReadOptions.ENABLE_DATA_SKIPPING.defaultValue.toString).toBoolean

  private def isMetadataTableEnabled: Boolean = metadataConfig.isEnabled()

  private def isColumnStatsIndexEnabled: Boolean = metadataConfig.isColumnStatsIndexEnabled

  private def isRecordIndexEnabled: Boolean = indicesSupport.exists(idx =>
    idx.getIndexName == RecordLevelIndexSupport.INDEX_NAME && idx.isIndexAvailable)

  private def isExpressionIndexEnabled: Boolean = indicesSupport.exists(idx =>
    idx.getIndexName == ExpressionIndexSupport.INDEX_NAME && idx.isIndexAvailable)

  private def isBucketIndexEnabled: Boolean = indicesSupport.exists(idx =>
    idx.getIndexName == BucketIndexSupport.INDEX_NAME && idx.isIndexAvailable)

  private def isPartitionStatsIndexEnabled: Boolean = indicesSupport.exists(idx =>
    idx.getIndexName == PartitionStatsIndexSupport.INDEX_NAME && idx.isIndexAvailable)

  private def isBloomFiltersIndexEnabled: Boolean = indicesSupport.exists(idx =>
    idx.getIndexName == BloomFiltersIndexSupport.INDEX_NAME && idx.isIndexAvailable)

  private def isSecondaryIndexEnabled: Boolean = indicesSupport.exists(idx =>
    idx.getIndexName == SecondaryIndexSupport.INDEX_NAME && idx.isIndexAvailable)

  private def isIndexAvailable: Boolean = indicesSupport.exists(idx => idx.isIndexAvailable)

  private def validateConfig(): Unit = {
    if (isDataSkippingEnabled && (!isMetadataTableEnabled || !isIndexAvailable)) {
      logWarning("Data skipping requires Metadata Table and at least one of the indices to be enabled! "
        + s"(isMetadataTableEnabled = $isMetadataTableEnabled, isColumnStatsIndexEnabled = $isColumnStatsIndexEnabled"
        + s", isRecordIndexApplicable = $isRecordIndexEnabled, isExpressionIndexEnabled = $isExpressionIndexEnabled, " +
        s"isBucketIndexEnable = $isBucketIndexEnabled, isPartitionStatsIndexEnabled = $isPartitionStatsIndexEnabled)"
        + s", isBloomFiltersIndexEnabled = $isBloomFiltersIndexEnabled)")
    }
  }
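
  // Illustrative sketch: the warning above can be avoided by enabling both data skipping and the
  // metadata table on the read path (option keys referenced via their config constants):
  //
  //   spark.read.format("hudi")
  //     .option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key, "true")
  //     .option(HoodieMetadataConfig.ENABLE.key, "true")
  //     .load(basePath)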

}

object HoodieFileIndex extends Logging {

  object DataSkippingFailureMode extends Enumeration {
    val configName = "hoodie.fileIndex.dataSkippingFailureMode"

    type DataSkippingFailureMode = Value

    case class Val(value: String) extends super.Val {
      override def toString(): String = value
    }

    import scala.language.implicitConversions
    implicit def valueToVal(x: Value): DataSkippingFailureMode = x.asInstanceOf[Val]

    val Fallback: Val = Val("fallback")
    val Strict: Val   = Val("strict")
  }
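
  // Illustrative sketch: the failure mode can be switched per session via the Spark SQL conf, using
  // the values defined above ("fallback" is the default consulted in `filterFileSlices`):
  //
  //   spark.sql("SET hoodie.fileIndex.dataSkippingFailureMode=strict")   // fail the query on index lookup errors
  //   spark.sql("SET hoodie.fileIndex.dataSkippingFailureMode=fallback") // fall back to listing without data skipping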

  private def collectReferencedColumns(spark: SparkSession, queryFilters: Seq[Expression], schema: StructType): Seq[String] = {
    val resolver = spark.sessionState.analyzer.resolver
    val refs = queryFilters.flatMap(_.references)
    schema.fieldNames.filter { colName => refs.exists(r => resolver.apply(colName, r.name)) }
  }

  def getConfigProperties(spark: SparkSession, options: Map[String, String], tableConfig: HoodieTableConfig) = {
    val sqlConf: SQLConf = spark.sessionState.conf
    val properties = TypedProperties.fromMap(options.filter(p => p._2 != null).asJava)

    // TODO(HUDI-5361) clean up properties carry-over

    // To support metadata listing via Spark SQL, we allow users to pass the config via the SQL conf of the Spark session.
    // Users can run "SET hoodie.metadata.enable=true" in the Spark SQL session to enable metadata listing.
    val isMetadataTableEnabled = getConfigValue(options, sqlConf, HoodieMetadataConfig.ENABLE.key, null)
    if (isMetadataTableEnabled != null) {
      properties.setProperty(HoodieMetadataConfig.ENABLE.key(), String.valueOf(isMetadataTableEnabled))
    }

    val listingModeOverride = getConfigValue(options, sqlConf,
      DataSourceReadOptions.FILE_INDEX_LISTING_MODE_OVERRIDE.key, null)
    if (listingModeOverride != null) {
      properties.setProperty(DataSourceReadOptions.FILE_INDEX_LISTING_MODE_OVERRIDE.key, listingModeOverride)
    }

    if (tableConfig != null) {
      properties.setProperty(RECORDKEY_FIELD.key, tableConfig.getRecordKeyFields.orElse(Array.empty).mkString(","))
      properties.setProperty(PRECOMBINE_FIELD.key, Option(tableConfig.getPreCombineField).getOrElse(""))
      properties.setProperty(PARTITIONPATH_FIELD.key, HoodieTableConfig.getPartitionFieldPropForKeyGenerator(tableConfig).orElse(""))
    }

    properties
  }
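
  // Illustrative sketch: metadata-table listing can be toggled either through the Spark SQL session
  // conf or through the read options; both are probed by `getConfigProperties` above
  // (table path hypothetical):
  //
  //   spark.sql("SET hoodie.metadata.enable=true")
  //   // or, per read:
  //   spark.read.format("hudi").option(HoodieMetadataConfig.ENABLE.key, "true").load("s3://bucket/hudi_table")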

  def convertFilterForTimestampKeyGenerator(metaClient: HoodieTableMetaClient,
      partitionFilters: Seq[Expression]): Seq[Expression] = {

    val tableConfig = metaClient.getTableConfig
    val keyGenerator = tableConfig.getKeyGeneratorClassName

    if (keyGenerator != null && (keyGenerator.equals(classOf[TimestampBasedKeyGenerator].getCanonicalName) ||
        keyGenerator.equals(classOf[TimestampBasedAvroKeyGenerator].getCanonicalName))) {
      val inputFormat = tableConfig.getString(TIMESTAMP_INPUT_DATE_FORMAT)
      val outputFormat = tableConfig.getString(TIMESTAMP_OUTPUT_DATE_FORMAT)
      if (StringUtils.isNullOrEmpty(inputFormat) || StringUtils.isNullOrEmpty(outputFormat) ||
        inputFormat.equals(outputFormat)) {
        partitionFilters
      } else {
        try {
          val inDateFormat = new SimpleDateFormat(inputFormat)
          val outDateFormat = new SimpleDateFormat(outputFormat)
          partitionFilters.toArray.map {
            _.transformDown {
              case Literal(value, dataType) if dataType.isInstanceOf[StringType] =>
                try {
                  val converted = outDateFormat.format(inDateFormat.parse(value.toString))
                  Literal(UTF8String.fromString(converted), StringType)
                } catch {
                  case _: java.text.ParseException =>
                    try {
                      outDateFormat.parse(value.toString)
                    } catch {
                      case e: Exception => throw new HoodieException("Partition filter for TimestampKeyGenerator cannot be converted to format " + outDateFormat.toString, e)
                    }
                    Literal(UTF8String.fromString(value.toString), StringType)
                }
            }
          }
        } catch {
          case NonFatal(e) =>
            logWarning("Fail to convert filters for TimestampBaseAvroKeyGenerator", e)
            partitionFilters
        }
      }
    } else {
      partitionFilters
    }
  }
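
  // Illustrative sketch (hypothetical formats) of the literal rewrite performed above: with a
  // timestamp-based key generator whose input format is "yyyy-MM-dd" and output format is
  // "yyyy/MM/dd", a partition filter literal "2021-03-10" is rewritten to "2021/03/10" so that it
  // matches the stored partition path:
  //
  //   val in  = new SimpleDateFormat("yyyy-MM-dd")
  //   val out = new SimpleDateFormat("yyyy/MM/dd")
  //   out.format(in.parse("2021-03-10"))  // "2021/03/10"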

  private def getQueryPaths(options: Map[String, String]): Seq[StoragePath] = {
    // NOTE: To make sure that globbing is appropriately handled w/in the
    //       `path`, we need to:
    //          - First, probe whether the requested globbed paths have been resolved (and `glob.paths` was provided
    //          in the options); otherwise
    //          - Treat `path` as a fully-qualified (i.e. non-globbed) path
    val paths = options.get("glob.paths") match {
      case Some(globbed) =>
        globbed.split(",").toSeq
      case None =>
        val path = options.getOrElse("path",
          throw new IllegalArgumentException("'path' or 'glob.paths' option required"))
        Seq(path)
    }

    paths.map(new StoragePath(_))
  }
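
  // Illustrative sketch (hypothetical paths): either a single fully-qualified `path`, or pre-resolved
  // globbed paths under `glob.paths`, must be supplied in the options:
  //
  //   Map("path" -> "s3://bucket/hudi_table")
  //   Map("glob.paths" -> "s3://bucket/hudi_table/2021/*,s3://bucket/hudi_table/2022/*")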
}