org.apache.hudi.ExpressionIndexSupport.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi

import org.apache.hudi.ColumnStatsIndexSupport.{composeColumnStatStructType, deserialize, tryUnpackValueWrapper}
import org.apache.hudi.ExpressionIndexSupport._
import org.apache.hudi.HoodieCatalystUtils.{withPersistedData, withPersistedDataset}
import org.apache.hudi.HoodieConversionUtils.toScalaOption
import org.apache.hudi.HoodieSparkExpressionIndex.SPARK_FUNCTION_MAP
import org.apache.hudi.RecordLevelIndexSupport.{fetchQueryWithAttribute, filterQueryWithRecordKey}
import org.apache.hudi.avro.model.{HoodieMetadataColumnStats, HoodieMetadataRecord}
import org.apache.hudi.common.config.HoodieMetadataConfig
import org.apache.hudi.common.data.HoodieData
import org.apache.hudi.common.model.{FileSlice, HoodieIndexDefinition, HoodieRecord}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.hudi.common.util.collection
import org.apache.hudi.common.util.hash.{ColumnIndexID, PartitionIndexID}
import org.apache.hudi.data.HoodieJavaRDD
import org.apache.hudi.index.functional.HoodieExpressionIndex.SPARK_FROM_UNIXTIME
import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadataUtil, MetadataPartitionType}
import org.apache.hudi.util.JFunction
import org.apache.spark.sql.HoodieUnsafeUtils.{createDataFrameFromInternalRows, createDataFrameFromRDD, createDataFrameFromRows}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{EqualTo, Expression, FromUnixTime, In, Literal, UnaryExpression}
import org.apache.spark.sql.catalyst.util.TimestampFormatter
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

import scala.collection.JavaConverters._
import scala.collection.immutable.TreeSet
import scala.collection.mutable.ListBuffer
import scala.collection.parallel.mutable.ParHashMap

class ExpressionIndexSupport(spark: SparkSession,
                             tableSchema: StructType,
                             metadataConfig: HoodieMetadataConfig,
                             metaClient: HoodieTableMetaClient)
  extends SparkBaseIndexSupport(spark, metadataConfig, metaClient) {

  @transient private lazy val cachedColumnStatsIndexViews: ParHashMap[Seq[String], DataFrame] = ParHashMap()


  // NOTE: Since [[metadataConfig]] is transient, this has to be eagerly persisted before it is passed on to the executor
  private val inMemoryProjectionThreshold = metadataConfig.getColumnStatsIndexInMemoryProjectionThreshold

  override def getIndexName: String = ExpressionIndexSupport.INDEX_NAME

  override def computeCandidateFileNames(fileIndex: HoodieFileIndex,
                                         queryFilters: Seq[Expression],
                                         queryReferencedColumns: Seq[String],
                                         prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
                                         shouldPushDownFilesFilter: Boolean
                                        ): Option[Set[String]] = {
    lazy val expressionIndexPartitionOpt = getExpressionIndexPartitionAndLiterals(queryFilters)
    if (isIndexAvailable && queryFilters.nonEmpty && expressionIndexPartitionOpt.nonEmpty) {
      val (indexPartition, expressionIndexQuery, literals) = expressionIndexPartitionOpt.get
      val indexDefinition = metaClient.getIndexMetadata.get().getIndexDefinitions.get(indexPartition)
      if (indexDefinition.getIndexType.equals(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS)) {
        val readInMemory = shouldReadInMemory(fileIndex, queryReferencedColumns, inMemoryProjectionThreshold)
        val (prunedPartitions, prunedFileNames) = getPrunedPartitionsAndFileNames(fileIndex, prunedPartitionsAndFileSlices)
        val expressionIndexRecords = loadExpressionIndexRecords(indexPartition, prunedPartitions, readInMemory)
        loadTransposed(queryReferencedColumns, readInMemory, expressionIndexRecords, expressionIndexQuery) {
          transposedColStatsDF => Some(getCandidateFiles(transposedColStatsDF, Seq(expressionIndexQuery), prunedFileNames, isExpressionIndex = true))
        }
      } else if (indexDefinition.getIndexType.equals(HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS)) {
        val prunedPartitionAndFileNames = getPrunedPartitionsAndFileNamesMap(prunedPartitionsAndFileSlices, includeLogFiles = true)
        Option.apply(getCandidateFilesForKeys(indexPartition, prunedPartitionAndFileNames, literals))
      } else {
        Option.empty
      }
    } else {
      Option.empty
    }
  }
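
  // Illustrative sketch (not part of the original source): for a query filter such as
  //   from_unixtime(ts, 'yyyy-MM-dd') = '2025-01-01'
  // getExpressionIndexPartitionAndLiterals tries to match an expression index whose definition has
  // indexFunction 'from_unixtime' and source field 'ts'. If the matched index partition is of the
  // column-stats type, pruning goes through loadTransposed / getCandidateFiles; if it is of the
  // bloom-filters type, the extracted literals are probed against per-file bloom filters via
  // getCandidateFilesForKeys. The column name and literal above are hypothetical.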

  /**
   * Loads a view of the Column Stats Index in a transposed format, where a single row coalesces every column's
   * statistics for a single file, returning it as a [[DataFrame]]
   *
   * Please check out the scala-doc of the [[transpose]] method explaining this view in more detail
   */
  def loadTransposed[T](targetColumns: Seq[String],
                        shouldReadInMemory: Boolean,
                        colStatRecords: HoodieData[HoodieMetadataColumnStats],
                        expressionIndexQuery: Expression) (block: DataFrame => T): T = {
    cachedColumnStatsIndexViews.get(targetColumns) match {
      case Some(cachedDF) =>
        block(cachedDF)
      case None =>
        val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = colStatRecords
        withPersistedData(colStatsRecords, StorageLevel.MEMORY_ONLY) {
          val (transposedRows, indexSchema) = transpose(colStatsRecords, targetColumns, expressionIndexQuery)
          val df = if (shouldReadInMemory) {
            // NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows
            //       of the transposed table in memory, facilitating execution of the subsequently chained operations
            //       on it locally (on the driver; all such operations are actually going to be performed by Spark's
            //       Optimizer)
            createDataFrameFromRows(spark, transposedRows.collectAsList().asScala.toSeq, indexSchema)
          } else {
            val rdd = HoodieJavaRDD.getJavaRDD(transposedRows)
            spark.createDataFrame(rdd, indexSchema)
          }

          val allowCaching: Boolean = false
          if (allowCaching) {
            cachedColumnStatsIndexViews.put(targetColumns, df)
            // NOTE: Instead of collecting the rows from the index and holding them in memory, we instead rely
            //       on Spark as a (potentially distributed) cache managing the data lifecycle, while we simply keep
            //       the reference to the persisted [[DataFrame]] instance
            df.persist(StorageLevel.MEMORY_ONLY)

            block(df)
          } else {
            withPersistedDataset(df) {
              block(df)
            }
          }
        }
    }
  }
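
  // Usage sketch (illustrative only, mirroring the call in computeCandidateFileNames above):
  //   loadTransposed(queryReferencedColumns, readInMemory, expressionIndexRecords, expressionIndexQuery) {
  //     transposedColStatsDF =>
  //       Some(getCandidateFiles(transposedColStatsDF, Seq(expressionIndexQuery), prunedFileNames, isExpressionIndex = true))
  //   }
  // The block receives the transposed column-stats view and is evaluated while the underlying data is
  // persisted; per-target-column caching of the view is currently disabled (allowCaching = false).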

  /**
   * Transposes and converts the raw table format of the Column Stats Index representation,
   * where each row/record corresponds to an individual (column, file) pair, into the table format
   * where each row corresponds to a single file, with statistics for individual columns collated
   * w/in such row:
   *
   * Metadata Table Column Stats Index format:
   *
   * 
   *  +---------------------------+------------+------------+------------+-------------+
   *  |        fileName           | columnName |  minValue  |  maxValue  |  num_nulls  |
   *  +---------------------------+------------+------------+------------+-------------+
   *  | one_base_file.parquet     |          A |          1 |         10 |           0 |
   *  | another_base_file.parquet |          A |        -10 |          0 |           5 |
   *  +---------------------------+------------+------------+------------+-------------+
   *
   * Returned table format
   *
   *  +---------------------------+------------+------------+-------------+
   *  |          file             | A_minValue | A_maxValue | A_nullCount |
   *  +---------------------------+------------+------------+-------------+
   *  | one_base_file.parquet     |          1 |         10 |           0 |
   *  | another_base_file.parquet |        -10 |          0 |           5 |
   *  +---------------------------+------------+------------+-------------+
   *
   * NOTE: Column Stats Index might potentially contain statistics for many columns (if not all), while
   *       query at hand might only be referencing a handful of those. As such, we collect all the
   *       column references from the filtering expressions, and only transpose records corresponding to the
   *       columns referenced in those
   *
   * @param colStatsRecords [[HoodieData[HoodieMetadataColumnStats]]] bearing raw Column Stats Index records
   * @param queryColumns target columns to be included into the final table
   * @return reshaped table according to the format outlined above
   */
  private def transpose(colStatsRecords: HoodieData[HoodieMetadataColumnStats], queryColumns: Seq[String], expressionIndexQuery: Expression): (HoodieData[Row], StructType) = {
    val tableSchemaFieldMap = tableSchema.fields.map(f => (f.name, f)).toMap
    // NOTE: We're sorting the columns to make sure final index schema matches layout
    //       of the transposed table
    val sortedTargetColumnsSet = TreeSet(queryColumns:_*)

    // NOTE: This is a trick to avoid pulling all of [[ColumnStatsIndexSupport]] object into the lambdas'
    //       closures below
    val indexedColumns = queryColumns.toSet

    // NOTE: It's crucial to maintain appropriate ordering of the columns
    //       matching table layout: hence, we cherry-pick individual columns
    //       instead of simply filtering in the ones we're interested in the schema
    val (indexSchema, targetIndexedColumns) = composeIndexSchema(sortedTargetColumnsSet.toSeq, indexedColumns, tableSchema, expressionIndexQuery)

    // Here we perform complex transformation which requires us to modify the layout of the rows
    // of the dataset, and therefore we rely on low-level RDD API to avoid incurring encoding/decoding
    // penalty of the [[Dataset]], since it's required to adhere to its schema at all times, while
    // RDDs are not
    val transposedRows: HoodieData[Row] = colStatsRecords
      //TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
      .filter(JFunction.toJavaSerializableFunction(r => sortedTargetColumnsSet.contains(r.getColumnName)))
      .mapToPair(JFunction.toJavaSerializablePairFunction(r => {
        if (r.getMinValue == null && r.getMaxValue == null) {
          // Corresponding row could be null in either of the 2 cases
          //    - Column contains only null values (in that case both min/max have to be nulls)
          //    - This is a stubbed Column Stats record (used as a tombstone)
          collection.Pair.of(r.getFileName, r)
        } else {
          val minValueWrapper = r.getMinValue
          val maxValueWrapper = r.getMaxValue

          checkState(minValueWrapper != null && maxValueWrapper != null, "Invalid Column Stats record: either both min/max have to be null, or both have to be non-null")

          val colName = r.getColumnName
          val colType = tableSchemaFieldMap(colName).dataType

          val minValue = deserialize(tryUnpackValueWrapper(minValueWrapper), colType)
          val maxValue = deserialize(tryUnpackValueWrapper(maxValueWrapper), colType)

          // Update min-/max-value structs w/ unwrapped values in-place
          r.setMinValue(minValue)
          r.setMaxValue(maxValue)

          collection.Pair.of(r.getFileName, r)
        }
      }))
      .groupByKey()
      .map(JFunction.toJavaSerializableFunction(p => {
        val columnRecordsSeq: Seq[HoodieMetadataColumnStats] = p.getValue.asScala.toSeq
        val fileName: String = p.getKey
        val valueCount: Long = columnRecordsSeq.head.getValueCount

        // To properly align individual rows (corresponding to a file) w/in the transposed projection, we need
        // to align existing column-stats for individual file with the list of expected ones for the
        // whole transposed projection (a superset of all files)
        val columnRecordsMap = columnRecordsSeq.map(r => (r.getColumnName, r)).toMap
        val alignedColStatRecordsSeq = targetIndexedColumns.map(columnRecordsMap.get)

        val coalescedRowValuesSeq =
          alignedColStatRecordsSeq.foldLeft(ListBuffer[Any](fileName, valueCount)) {
            case (acc, opt) =>
              opt match {
                case Some(colStatRecord) =>
                  acc ++= Seq(colStatRecord.getMinValue, colStatRecord.getMaxValue, colStatRecord.getNullCount)
                case None =>
                  // NOTE: This could occur in either of the following cases:
                  //    1. When certain columns exist in the schema but are absent in some data files due to
                  //       schema evolution or other reasons, these columns will not be present in the column stats.
                  //       In this case, we fill in default values by setting the min, max and null-count to null
                  //       (this behavior is consistent with reading non-existent columns from Parquet).
                  //    2. When certain columns are present both in the schema and the data files,
                  //       but the column stats are absent for these columns due to their types not supporting indexing,
                  //       we also set these columns to default values.
                  //
                  // This approach prevents errors during data skipping and, because the filter includes an isNull check,
                  // these conditions will not affect the accurate return of files from data skipping.
                  acc ++= Seq(null, null, null)
              }
          }

        Row(coalescedRowValuesSeq.toSeq: _*)
      }))

    (transposedRows, indexSchema)
  }
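
  // Illustrative note (not from the original source): for a single indexed column A, each transposed
  // row produced above has the shape
  //   Row(fileName, valueCount, A_minValue, A_maxValue, A_nullCount)
  // matching the schema returned by composeIndexSchema below; files lacking stats for A carry
  // (null, null, null) in those slots.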

  def composeIndexSchema(targetColumnNames: Seq[String], indexedColumns: Set[String], tableSchema: StructType, expressionIndexQuery: Expression): (StructType, Seq[String]) = {
    val fileNameField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, StringType, nullable = true, Metadata.empty)
    val valueCountField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, LongType, nullable = true, Metadata.empty)

    val targetIndexedColumns = targetColumnNames.filter(indexedColumns.contains(_))
    val targetIndexedFields = targetIndexedColumns.map(colName => tableSchema.fields.find(f => f.name == colName).get)
    val dataType: DataType = expressionIndexQuery match {
      case eq: EqualTo => eq.right.asInstanceOf[Literal].dataType
      case in: In => in.list(0).asInstanceOf[Literal].dataType
      case _ => targetIndexedFields(0).dataType
    }

    (StructType(
      targetIndexedFields.foldLeft(Seq(fileNameField, valueCountField)) {
        case (acc, field) =>
          acc ++ Seq(
            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, dataType),
            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, dataType),
            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, LongType))
      }
    ), targetIndexedColumns)
  }
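
  // Illustrative note (assumption, not from the original source): the min/max struct type above is
  // derived from the query literal (e.g. the right side of EqualTo) rather than from the source
  // column's type, e.g. for from_unixtime(ts, 'yyyy-MM-dd') = '2025-01-01' the stats columns are
  // typed as StringType, presumably because the expression index stores the transformed values.
  // Column name and literal here are hypothetical.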

  def loadColumnStatsIndexRecords(targetColumns: Seq[String], prunedPartitions: Option[Set[String]] = None, shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = {
    // Read Metadata Table's Column Stats Index records into [[HoodieData]] container by
    //    - Fetching the records from CSI by key-prefixes (encoded column names)
    //    - Extracting [[HoodieMetadataColumnStats]] records
    //    - Filtering out nulls
    checkState(targetColumns.nonEmpty)

    // TODO encoding should be done internally w/in HoodieBackedTableMetadata
    val encodedTargetColumnNames = targetColumns.map(colName => new ColumnIndexID(colName).asBase64EncodedString())

    // Encode column name and partition name if partition list is available
    val keyPrefixes = if (prunedPartitions.isDefined) {
      prunedPartitions.get.map(partitionPath =>
        new PartitionIndexID(HoodieTableMetadataUtil.getPartitionIdentifier(partitionPath)).asBase64EncodedString()
      ).flatMap(encodedPartition => {
        encodedTargetColumnNames.map(encodedTargetColumn => encodedTargetColumn.concat(encodedPartition))
      })
    } else {
      encodedTargetColumnNames
    }

    val metadataRecords: HoodieData[HoodieRecord[HoodieMetadataPayload]] =
      metadataTable.getRecordsByKeyPrefixes(keyPrefixes.toSeq.asJava, HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS, shouldReadInMemory)

    val columnStatsRecords: HoodieData[HoodieMetadataColumnStats] =
      //TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
      metadataRecords.map(JFunction.toJavaSerializableFunction(record => {
          toScalaOption(record.getData.getInsertValue(null, null))
            .map(metadataRecord => metadataRecord.asInstanceOf[HoodieMetadataRecord].getColumnStatsMetadata)
            .orNull
        }))
        .filter(JFunction.toJavaSerializableFunction(columnStatsRecord => columnStatsRecord != null))

    columnStatsRecords
  }

  override def invalidateCaches(): Unit = {
    cachedColumnStatsIndexViews.foreach { case (_, df) => df.unpersist() }
    cachedColumnStatsIndexViews.clear()
  }

  /**
   * Return true if metadata table is enabled and expression index metadata partition is available.
   */
  def isIndexAvailable: Boolean = {
    metadataConfig.isEnabled && metaClient.getIndexMetadata.isPresent && !metaClient.getIndexMetadata.get().getIndexDefinitions.isEmpty
  }

  private def filterQueriesWithFunctionalFilterKey(queryFilters: Seq[Expression], sourceFieldOpt: Option[String]): List[Tuple2[Expression, List[String]]] = {
    var expressionIndexQueries: List[Tuple2[Expression, List[String]]] = List.empty
    for (query <- queryFilters) {
      val attributeFetcher = (expr: Expression) => {
        expr match {
          case expression: UnaryExpression => expression.child
          case expression: FromUnixTime => expression.sec
          case other => other
        }
      }
      filterQueryWithRecordKey(query, sourceFieldOpt, attributeFetcher).foreach({
        case (exp: Expression, literals: List[String]) =>
          expressionIndexQueries = expressionIndexQueries :+ Tuple2.apply(exp, literals)
      })
    }
    expressionIndexQueries
  }
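
  // Illustrative note (not from the original source): the attributeFetcher above unwraps the indexed
  // expression so that the underlying source field can be matched, e.g. for a filter such as
  //   from_unixtime(ts) = '2025-01-01'
  // FromUnixTime is unwrapped to its 'sec' child (the 'ts' attribute), and filterQueryWithRecordKey then
  // returns the filter together with its extracted literal(s). Column name and literal are hypothetical.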

  /**
   * Searches for an index partition based on the specified index function and target column name.
   *
   * This method looks up the index definitions available in the metadata of a `metaClient` instance
   * and attempts to find an index partition where the index function and the source fields match
   * the provided arguments. If a matching index definition is found, the partition identifier for
   * that index is returned.
   *
   * @param queryFilters A sequence of `Expression` objects to analyze. Each expression should involve a single column
   *                     for the method to consider it (expressions involving multiple columns are skipped).
   * @return An `Option` containing the index partition identifier if a matching index definition is found.
   *         Returns `None` if no matching index definition is found.
   */
  private def getExpressionIndexPartitionAndLiterals(queryFilters: Seq[Expression]): Option[Tuple3[String, Expression, List[String]]] = {
    val indexDefinitions = metaClient.getIndexMetadata.get().getIndexDefinitions.asScala
    if (indexDefinitions.nonEmpty) {
      val functionDefinitions = indexDefinitions.values
        .filter(definition => MetadataPartitionType.fromPartitionPath(definition.getIndexName).equals(MetadataPartitionType.EXPRESSION_INDEX))
        .toList
      var indexPartitionAndLiteralsOpt: Option[Tuple3[String, Expression, List[String]]] = Option.empty
      functionDefinitions.foreach(indexDefinition => {
        val queryInfoOpt = extractQueryAndLiterals(queryFilters, indexDefinition)
        if (queryInfoOpt.isDefined) {
          indexPartitionAndLiteralsOpt = Option.apply(Tuple3.apply(indexDefinition.getIndexName, queryInfoOpt.get._1, queryInfoOpt.get._2))
        }
      })
      indexPartitionAndLiteralsOpt
    } else {
      Option.empty
    }
  }

  /**
   * Extracts mappings from function names to column names from a sequence of expressions.
   *
   * This method iterates over a given sequence of Spark SQL expressions and identifies expressions
   * that contain function calls corresponding to keys in the `SPARK_FUNCTION_MAP`. It supports only
   * expressions that are simple binary expressions involving a single column. If an expression contains
   * one of the functions and operates on a single column, this method maps the function name to the
   * column name.
   */
  private def extractQueryAndLiterals(queryFilters: Seq[Expression], indexDefinition: HoodieIndexDefinition): Option[(Expression, List[String])] = {
    val expressionIndexQueries = filterQueriesWithFunctionalFilterKey(queryFilters, Option.apply(indexDefinition.getSourceFields.get(0)))
    var queryAndLiteralsOpt: Option[(Expression, List[String])] = Option.empty
    expressionIndexQueries.foreach { tuple =>
      val (expr, literals) = (tuple._1, tuple._2)
      val functionNameOption = SPARK_FUNCTION_MAP.asScala.keys.find(expr.toString.contains)
      val functionName = functionNameOption.getOrElse("identity")
      if (indexDefinition.getIndexFunction.equals(functionName)) {
        val attributeFetcher = (expr: Expression) => {
          expr match {
            case expression: UnaryExpression => expression.child
            case expression: FromUnixTime => expression.sec
            case other => other
          }
        }
        if (functionName.equals(SPARK_FROM_UNIXTIME)) {
          val configuredFormat = indexDefinition.getIndexOptions.getOrDefault("format", TimestampFormatter.defaultPattern)
          if (expr.toString().contains(configuredFormat)) {
            val pruningExpr = fetchQueryWithAttribute(expr, Option.apply(indexDefinition.getSourceFields.get(0)), RecordLevelIndexSupport.getSimpleLiteralGenerator(), attributeFetcher)._1.get._1
            queryAndLiteralsOpt = Option.apply(Tuple2.apply(pruningExpr, literals))
          }
        } else {
          val pruningExpr = fetchQueryWithAttribute(expr, Option.apply(indexDefinition.getSourceFields.get(0)), RecordLevelIndexSupport.getSimpleLiteralGenerator(), attributeFetcher)._1.get._1
          queryAndLiteralsOpt = Option.apply(Tuple2.apply(pruningExpr, literals))
        }
      }
    }
    queryAndLiteralsOpt
  }

  private def loadExpressionIndexRecords(indexPartition: String, prunedPartitions: Set[String], shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = {
    val indexDefinition = metaClient.getIndexMetadata.get().getIndexDefinitions.get(indexPartition)
    val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = loadExpressionIndexForColumnsInternal(
      indexDefinition.getSourceFields.asScala.toSeq, prunedPartitions, indexPartition, shouldReadInMemory)
    //TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
    colStatsRecords
  }
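
  // Illustrative note (not from the original source): for the SPARK_FROM_UNIXTIME branch in
  // extractQueryAndLiterals above, an index defined with option format = 'yyyy-MM-dd' only matches
  // filters whose string form contains that same format, e.g.
  //   from_unixtime(ts, 'yyyy-MM-dd') = '2025-01-01'   // matched when the index format is 'yyyy-MM-dd'
  //   from_unixtime(ts, 'dd-MM-yyyy') = '01-01-2025'   // skipped, the configured format does not appear in the expression
  // Column name, formats and literals here are hypothetical.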

  def loadExpressionIndexDataFrame(indexPartition: String, prunedPartitions: Set[String], shouldReadInMemory: Boolean): DataFrame = {
    val colStatsDF = {
      val indexDefinition = metaClient.getIndexMetadata.get().getIndexDefinitions.get(indexPartition)
      val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = loadExpressionIndexForColumnsInternal(
        indexDefinition.getSourceFields.asScala.toSeq, prunedPartitions, indexPartition, shouldReadInMemory)
      //TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
      val catalystRows: HoodieData[InternalRow] = colStatsRecords.map[InternalRow](JFunction.toJavaSerializableFunction(r => {
        val converter = AvroConversionUtils.createAvroToInternalRowConverter(HoodieMetadataColumnStats.SCHEMA$, columnStatsRecordStructType)
        converter(r).orNull
      }))

      if (shouldReadInMemory) {
        // NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows
        //       of the transposed table in memory, facilitating execution of the subsequently chained operations
        //       on it locally (on the driver; all such operations are actually going to be performed by Spark's
        //       Optimizer)
        createDataFrameFromInternalRows(spark, catalystRows.collectAsList().asScala.toSeq, columnStatsRecordStructType)
      } else {
        createDataFrameFromRDD(spark, HoodieJavaRDD.getJavaRDD(catalystRows), columnStatsRecordStructType)
      }
    }

    colStatsDF.select(targetColumnStatsIndexColumns.map(col): _*)
  }

  private def loadExpressionIndexForColumnsInternal(targetColumns: Seq[String], prunedPartitions: Set[String], indexPartition: String, shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = {
    // Read Metadata Table's Expression Index records into [[HoodieData]] container by
    //    - Fetching the records from CSI by key-prefixes (encoded column names)
    //    - Extracting [[HoodieMetadataColumnStats]] records
    //    - Filtering out nulls
    checkState(targetColumns.nonEmpty)
    val encodedTargetColumnNames = targetColumns.map(colName => new ColumnIndexID(colName).asBase64EncodedString())
    val keyPrefixes = if (prunedPartitions.nonEmpty) {
      prunedPartitions.map(partitionPath =>
        new PartitionIndexID(HoodieTableMetadataUtil.getPartitionIdentifier(partitionPath)).asBase64EncodedString()
      ).flatMap(encodedPartition => {
        encodedTargetColumnNames.map(encodedTargetColumn => encodedTargetColumn.concat(encodedPartition))
      })
    } else {
      encodedTargetColumnNames
    }
    val metadataRecords: HoodieData[HoodieRecord[HoodieMetadataPayload]] =
      metadataTable.getRecordsByKeyPrefixes(keyPrefixes.toSeq.asJava, indexPartition, shouldReadInMemory)
    val columnStatsRecords: HoodieData[HoodieMetadataColumnStats] =
      //TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
      metadataRecords.map(JFunction.toJavaSerializableFunction(record => {
          toScalaOption(record.getData.getInsertValue(null, null))
            .map(metadataRecord => metadataRecord.asInstanceOf[HoodieMetadataRecord].getColumnStatsMetadata)
            .orNull
        }))
        .filter(JFunction.toJavaSerializableFunction(columnStatsRecord => columnStatsRecord != null))

    columnStatsRecords
  }
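
  // Illustrative note (not from the original source): metadata-table keys for these index partitions are
  // prefixed with the Base64-encoded column id, optionally concatenated with the encoded partition id,
  // so a lookup for column 'ts' restricted to partition '2025/01/01' would conceptually use a prefix like
  //   new ColumnIndexID("ts").asBase64EncodedString() +
  //     new PartitionIndexID(HoodieTableMetadataUtil.getPartitionIdentifier("2025/01/01")).asBase64EncodedString()
  // Column and partition values are hypothetical.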

  def getPrunedPartitionsAndFileNamesMap(prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
                                         includeLogFiles: Boolean = false): Map[String, Set[String]] = {
    prunedPartitionsAndFileSlices.foldLeft(Map.empty[String, Set[String]]) {
      case (partitionToFileMap, (partitionPathOpt, fileSlices)) =>
        partitionPathOpt match {
          case Some(partitionPath) =>
            val fileNames = fileSlices.flatMap { fileSlice =>
              val baseFile = Option(fileSlice.getBaseFile.orElse(null)).map(_.getFileName).toSeq
              val logFiles = if (includeLogFiles) {
                fileSlice.getLogFiles.iterator().asScala.map(_.getFileName).toSeq
              } else Seq.empty[String]
              baseFile ++ logFiles
            }.toSet

            // Update the map with the new partition and its file names
            partitionToFileMap.updated(partitionPath.path, partitionToFileMap.getOrElse(partitionPath.path, Set.empty) ++ fileNames)
          case None =>
            partitionToFileMap // Skip if no partition path
        }
    }
  }

  private def getCandidateFilesForKeys(indexPartition: String, prunedPartitionAndFileNames: Map[String, Set[String]], keys: List[String]): Set[String] = {
    val candidateFiles = prunedPartitionAndFileNames.flatMap { case (partition, fileNames) =>
      fileNames.filter { fileName =>
        val bloomFilterOpt = toScalaOption(metadataTable.getBloomFilter(partition, fileName, indexPartition))
        bloomFilterOpt match {
          case Some(bloomFilter) =>
            keys.exists(bloomFilter.mightContain)
          case None =>
            true // If bloom filter is empty or undefined, assume the file might contain the record key
        }
      }
    }.toSet

    candidateFiles
  }
}

object ExpressionIndexSupport {
  val INDEX_NAME = "FUNCTIONAL"

  /**
   * Target Column Stats Index columns which internally are mapped onto fields of the corresponding
   * Column Stats record payload ([[HoodieMetadataColumnStats]]) persisted w/in Metadata Table
   */
  private val targetColumnStatsIndexColumns = Seq(
    HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME,
    HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE,
    HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE,
    HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT,
    HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT,
    HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME
  )

  private val columnStatsRecordStructType: StructType =
    AvroConversionUtils.convertAvroSchemaToStructType(HoodieMetadataColumnStats.SCHEMA$)
}
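
// Usage sketch (illustrative only; not part of the original source). A file index would typically
// construct this support class and ask it for candidate files during data skipping, e.g.
//   val eiSupport = new ExpressionIndexSupport(spark, tableSchema, metadataConfig, metaClient)
//   if (eiSupport.isIndexAvailable) {
//     val candidates = eiSupport.computeCandidateFileNames(
//       fileIndex, queryFilters, queryReferencedColumns, prunedPartitionsAndFileSlices, shouldPushDownFilesFilter = false)
//   }
// All names other than the constructor and method parameters shown in this file are hypothetical.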



