/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi
import org.apache.hudi.ColumnStatsIndexSupport.{composeColumnStatStructType, deserialize, tryUnpackValueWrapper}
import org.apache.hudi.ExpressionIndexSupport._
import org.apache.hudi.HoodieCatalystUtils.{withPersistedData, withPersistedDataset}
import org.apache.hudi.HoodieConversionUtils.toScalaOption
import org.apache.hudi.HoodieSparkExpressionIndex.SPARK_FUNCTION_MAP
import org.apache.hudi.RecordLevelIndexSupport.{fetchQueryWithAttribute, filterQueryWithRecordKey}
import org.apache.hudi.avro.model.{HoodieMetadataColumnStats, HoodieMetadataRecord}
import org.apache.hudi.common.config.HoodieMetadataConfig
import org.apache.hudi.common.data.HoodieData
import org.apache.hudi.common.model.{FileSlice, HoodieIndexDefinition, HoodieRecord}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.hudi.common.util.collection
import org.apache.hudi.common.util.hash.{ColumnIndexID, PartitionIndexID}
import org.apache.hudi.data.HoodieJavaRDD
import org.apache.hudi.index.functional.HoodieExpressionIndex.SPARK_FROM_UNIXTIME
import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadataUtil, MetadataPartitionType}
import org.apache.hudi.util.JFunction
import org.apache.spark.sql.HoodieUnsafeUtils.{createDataFrameFromInternalRows, createDataFrameFromRDD, createDataFrameFromRows}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{EqualTo, Expression, FromUnixTime, In, Literal, UnaryExpression}
import org.apache.spark.sql.catalyst.util.TimestampFormatter
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import scala.collection.JavaConverters._
import scala.collection.immutable.TreeSet
import scala.collection.mutable.ListBuffer
import scala.collection.parallel.mutable.ParHashMap
class ExpressionIndexSupport(spark: SparkSession,
tableSchema: StructType,
metadataConfig: HoodieMetadataConfig,
metaClient: HoodieTableMetaClient)
extends SparkBaseIndexSupport(spark, metadataConfig, metaClient) {
@transient private lazy val cachedColumnStatsIndexViews: ParHashMap[Seq[String], DataFrame] = ParHashMap()
// NOTE: Since [[metadataConfig]] is transient, this value has to be eagerly evaluated before it is passed on to the executors
private val inMemoryProjectionThreshold = metadataConfig.getColumnStatsIndexInMemoryProjectionThreshold
override def getIndexName: String = ExpressionIndexSupport.INDEX_NAME
override def computeCandidateFileNames(fileIndex: HoodieFileIndex,
queryFilters: Seq[Expression],
queryReferencedColumns: Seq[String],
prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
shouldPushDownFilesFilter: Boolean
): Option[Set[String]] = {
lazy val expressionIndexPartitionOpt = getExpressionIndexPartitionAndLiterals(queryFilters)
if (isIndexAvailable && queryFilters.nonEmpty && expressionIndexPartitionOpt.nonEmpty) {
val (indexPartition, expressionIndexQuery, literals) = expressionIndexPartitionOpt.get
val indexDefinition = metaClient.getIndexMetadata.get().getIndexDefinitions.get(indexPartition)
if (indexDefinition.getIndexType.equals(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS)) {
val readInMemory = shouldReadInMemory(fileIndex, queryReferencedColumns, inMemoryProjectionThreshold)
val (prunedPartitions, prunedFileNames) = getPrunedPartitionsAndFileNames(fileIndex, prunedPartitionsAndFileSlices)
val expressionIndexRecords = loadExpressionIndexRecords(indexPartition, prunedPartitions, readInMemory)
loadTransposed(queryReferencedColumns, readInMemory, expressionIndexRecords, expressionIndexQuery) {
transposedColStatsDF => Some(getCandidateFiles(transposedColStatsDF, Seq(expressionIndexQuery), prunedFileNames, isExpressionIndex = true))
}
} else if (indexDefinition.getIndexType.equals(HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS)) {
val prunedPartitionAndFileNames = getPrunedPartitionsAndFileNamesMap(prunedPartitionsAndFileSlices, includeLogFiles = true)
Option.apply(getCandidateFilesForKeys(indexPartition, prunedPartitionAndFileNames, literals))
} else {
Option.empty
}
} else {
Option.empty
}
}
/**
* Loads a view of the Column Stats Index in a transposed format, where a single row coalesces every
* column's statistics for a single file, returning it as a [[DataFrame]]
*
* Please check the scala-doc of the [[transpose]] method for a more detailed explanation of this view
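*
* A minimal usage sketch (the `support`, `records` and `query` values below are hypothetical; the
* records would normally be produced by [[loadExpressionIndexRecords]]):
* {{{
*   // shouldReadInMemory = true keeps the transposed rows in a driver-local relation
*   val fileCount: Long = support.loadTransposed(Seq("ts"), true, records, query) { transposedDF =>
*     // each row of transposedDF describes one file, with per-column "<col>_minValue" /
*     // "<col>_maxValue" / "<col>_nullCount" statistics collated into that row
*     transposedDF.count()
*   }
* }}}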
*/
def loadTransposed[T](targetColumns: Seq[String],
shouldReadInMemory: Boolean,
colStatRecords: HoodieData[HoodieMetadataColumnStats],
expressionIndexQuery: Expression)(block: DataFrame => T): T = {
cachedColumnStatsIndexViews.get(targetColumns) match {
case Some(cachedDF) =>
block(cachedDF)
case None =>
val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = colStatRecords
withPersistedData(colStatsRecords, StorageLevel.MEMORY_ONLY) {
val (transposedRows, indexSchema) = transpose(colStatsRecords, targetColumns, expressionIndexQuery)
val df = if (shouldReadInMemory) {
// NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows
// of the transposed table in memory, facilitating execution of the subsequently chained operations
// on it locally (on the driver; all such operations are actually going to be performed by Spark's
// Optimizer)
createDataFrameFromRows(spark, transposedRows.collectAsList().asScala.toSeq, indexSchema)
} else {
val rdd = HoodieJavaRDD.getJavaRDD(transposedRows)
spark.createDataFrame(rdd, indexSchema)
}
val allowCaching: Boolean = false
if (allowCaching) {
cachedColumnStatsIndexViews.put(targetColumns, df)
// NOTE: Instead of collecting the rows from the index and holding them in memory, we instead rely
// on Spark as a (potentially distributed) cache managing the data lifecycle, while we simply keep
// the reference to the persisted [[DataFrame]] instance
df.persist(StorageLevel.MEMORY_ONLY)
block(df)
} else {
withPersistedDataset(df) {
block(df)
}
}
}
}
}
/**
* Transposes and converts the raw table format of the Column Stats Index representation,
* where each row/record corresponds to an individual (column, file) pair, into a table format
* where each row corresponds to a single file, with the statistics for individual columns collated
* within that row:
*
* Metadata Table Column Stats Index format:
*
*
* +---------------------------+------------+------------+------------+-------------+
* | fileName | columnName | minValue | maxValue | num_nulls |
* +---------------------------+------------+------------+------------+-------------+
* | one_base_file.parquet | A | 1 | 10 | 0 |
* | another_base_file.parquet | A | -10 | 0 | 5 |
* +---------------------------+------------+------------+------------+-------------+
*
*
* Returned table format
*
*
* +---------------------------+------------+------------+-------------+
* | file | A_minValue | A_maxValue | A_nullCount |
* +---------------------------+------------+------------+-------------+
* | one_base_file.parquet | 1 | 10 | 0 |
* | another_base_file.parquet | -10 | 0 | 5 |
* +---------------------------+------------+------------+-------------+
*
*
* NOTE: The Column Stats Index might potentially contain statistics for many columns (if not all), while
* the query at hand might only reference a handful of them. As such, we collect all the
* column references from the filtering expressions, and only transpose records corresponding to the
* columns referenced in them
*
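* The reshaping is conceptually equivalent to a group-by-file pivot. A standalone Spark sketch of the
* same idea (simplified, with hypothetical data; the actual implementation below works on the raw
* records to avoid Dataset encoding overhead):
* {{{
*   import org.apache.spark.sql.functions._
*   // assumes a SparkSession `spark` in scope with `import spark.implicits._`
*   val colStats = Seq(
*     ("one_base_file.parquet", "A", 1, 10, 0L),
*     ("another_base_file.parquet", "A", -10, 0, 5L)
*   ).toDF("fileName", "columnName", "minValue", "maxValue", "nullCount")
*
*   val transposed = colStats
*     .groupBy("fileName")
*     .pivot("columnName")
*     .agg(first("minValue").as("minValue"),
*          first("maxValue").as("maxValue"),
*          first("nullCount").as("nullCount"))
*   // => one row per file with columns A_minValue, A_maxValue, A_nullCount
* }}}
*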
* @param colStatsRecords [[HoodieData[HoodieMetadataColumnStats]]] bearing raw Column Stats Index records
* @param queryColumns target columns to be included into the final table
* @return reshaped table according to the format outlined above
*/
private def transpose(colStatsRecords: HoodieData[HoodieMetadataColumnStats], queryColumns: Seq[String], expressionIndexQuery: Expression): (HoodieData[Row], StructType) = {
val tableSchemaFieldMap = tableSchema.fields.map(f => (f.name, f)).toMap
// NOTE: We're sorting the columns to make sure final index schema matches layout
// of the transposed table
val sortedTargetColumnsSet = TreeSet(queryColumns:_*)
// NOTE: This is a trick to avoid pulling all of [[ColumnStatsIndexSupport]] object into the lambdas'
// closures below
val indexedColumns = queryColumns.toSet
// NOTE: It's crucial to maintain the appropriate ordering of the columns
// matching the table layout: hence, we cherry-pick individual columns
// instead of simply filtering the schema for the ones we're interested in
val (indexSchema, targetIndexedColumns) = composeIndexSchema(sortedTargetColumnsSet.toSeq, indexedColumns, tableSchema, expressionIndexQuery)
// Here we perform a complex transformation that requires us to modify the layout of the rows
// of the dataset, and therefore we rely on the low-level RDD API to avoid incurring the encoding/decoding
// penalty of the [[Dataset]], since it's required to adhere to its schema at all times, while
// RDDs are not
val transposedRows: HoodieData[Row] = colStatsRecords
//TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
.filter(JFunction.toJavaSerializableFunction(r => sortedTargetColumnsSet.contains(r.getColumnName)))
.mapToPair(JFunction.toJavaSerializablePairFunction(r => {
if (r.getMinValue == null && r.getMaxValue == null) {
// Corresponding min/max values could be null in either of the 2 cases
// - The column contains only null values (in that case both min/max have to be null)
// - This is a stubbed Column Stats record (used as a tombstone)
collection.Pair.of(r.getFileName, r)
} else {
val minValueWrapper = r.getMinValue
val maxValueWrapper = r.getMaxValue
checkState(minValueWrapper != null && maxValueWrapper != null, "Invalid Column Stats record: either both min/max have to be null, or both have to be non-null")
val colName = r.getColumnName
val colType = tableSchemaFieldMap(colName).dataType
val minValue = deserialize(tryUnpackValueWrapper(minValueWrapper), colType)
val maxValue = deserialize(tryUnpackValueWrapper(maxValueWrapper), colType)
// Update min-/max-value structs w/ unwrapped values in-place
r.setMinValue(minValue)
r.setMaxValue(maxValue)
collection.Pair.of(r.getFileName, r)
}
}))
.groupByKey()
.map(JFunction.toJavaSerializableFunction(p => {
val columnRecordsSeq: Seq[HoodieMetadataColumnStats] = p.getValue.asScala.toSeq
val fileName: String = p.getKey
val valueCount: Long = columnRecordsSeq.head.getValueCount
// To properly align individual rows (corresponding to a file) w/in the transposed projection, we need
// to align existing column-stats for individual file with the list of expected ones for the
// whole transposed projection (a superset of all files)
val columnRecordsMap = columnRecordsSeq.map(r => (r.getColumnName, r)).toMap
val alignedColStatRecordsSeq = targetIndexedColumns.map(columnRecordsMap.get)
val coalescedRowValuesSeq =
alignedColStatRecordsSeq.foldLeft(ListBuffer[Any](fileName, valueCount)) {
case (acc, opt) =>
opt match {
case Some(colStatRecord) =>
acc ++= Seq(colStatRecord.getMinValue, colStatRecord.getMaxValue, colStatRecord.getNullCount)
case None =>
// NOTE: This could occur in either of the following cases:
// 1. When certain columns exist in the schema but are absent in some data files due to
// schema evolution or other reasons, these columns will not be present in the column stats.
// In this case, we fill in default values by setting the min, max and null-count to null
// (this behavior is consistent with reading non-existent columns from Parquet).
// 2. When certain columns are present both in the schema and the data files,
// but the column stats are absent for these columns due to their types not supporting indexing,
// we also set these columns to default values.
//
// This approach prevents errors during data skipping and, because the filter includes an isNull check,
// these conditions will not affect the accurate return of files from data skipping.
acc ++= Seq(null, null, null)
}
}
Row(coalescedRowValuesSeq.toSeq: _*)
}))
(transposedRows, indexSchema)
}
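/**
* Composes the schema of the transposed Column Stats view produced by [[transpose]]: a file-name and a
* value-count column, followed by min-value, max-value and null-count columns for every indexed column
* referenced by the query. The min/max column type is derived from the literal of the expression index
* query, falling back to the table schema type of the first indexed field.
*
* @return the composed [[StructType]] along with the ordered list of indexed columns it covers
*/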
def composeIndexSchema(targetColumnNames: Seq[String], indexedColumns: Set[String], tableSchema: StructType, expressionIndexQuery: Expression): (StructType, Seq[String]) = {
val fileNameField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, StringType, nullable = true, Metadata.empty)
val valueCountField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, LongType, nullable = true, Metadata.empty)
val targetIndexedColumns = targetColumnNames.filter(indexedColumns.contains(_))
val targetIndexedFields = targetIndexedColumns.map(colName => tableSchema.fields.find(f => f.name == colName).get)
val dataType: DataType = expressionIndexQuery match {
case eq: EqualTo => eq.right.asInstanceOf[Literal].dataType
case in: In => in.list(0).asInstanceOf[Literal].dataType
case _ => targetIndexedFields(0).dataType
}
(StructType(
targetIndexedFields.foldLeft(Seq(fileNameField, valueCountField)) {
case (acc, field) =>
acc ++ Seq(
composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, dataType),
composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, dataType),
composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, LongType))
}
), targetIndexedColumns)
}
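/**
* Loads raw Column Stats Index records from the metadata table for the given target columns, optionally
* narrowing the read to a set of pruned partitions. Records are looked up by key prefix: the Base64-encoded
* column-name hash, optionally concatenated with the Base64-encoded partition hash.
*
* A sketch of how a single key prefix is composed (the column and partition values are hypothetical):
* {{{
*   val columnPrefix = new ColumnIndexID("ts").asBase64EncodedString()
*   val partitionPrefix = new PartitionIndexID(
*     HoodieTableMetadataUtil.getPartitionIdentifier("2024/01/01")).asBase64EncodedString()
*   // the column prefix alone is used when no pruned partitions are supplied
*   val keyPrefix = columnPrefix.concat(partitionPrefix)
* }}}
*/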
def loadColumnStatsIndexRecords(targetColumns: Seq[String], prunedPartitions: Option[Set[String]] = None, shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = {
// Read Metadata Table's Column Stats Index records into [[HoodieData]] container by
// - Fetching the records from CSI by key-prefixes (encoded column names)
// - Extracting [[HoodieMetadataColumnStats]] records
// - Filtering out nulls
checkState(targetColumns.nonEmpty)
// TODO encoding should be done internally w/in HoodieBackedTableMetadata
val encodedTargetColumnNames = targetColumns.map(colName => new ColumnIndexID(colName).asBase64EncodedString())
// encode the column name, and the partition name if a pruned partition list is available
val keyPrefixes = if (prunedPartitions.isDefined) {
prunedPartitions.get.map(partitionPath =>
new PartitionIndexID(HoodieTableMetadataUtil.getPartitionIdentifier(partitionPath)).asBase64EncodedString()
).flatMap(encodedPartition => {
encodedTargetColumnNames.map(encodedTargetColumn => encodedTargetColumn.concat(encodedPartition))
})
} else {
encodedTargetColumnNames
}
val metadataRecords: HoodieData[HoodieRecord[HoodieMetadataPayload]] =
metadataTable.getRecordsByKeyPrefixes(keyPrefixes.toSeq.asJava, HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS, shouldReadInMemory)
val columnStatsRecords: HoodieData[HoodieMetadataColumnStats] =
//TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
metadataRecords.map(JFunction.toJavaSerializableFunction(record => {
toScalaOption(record.getData.getInsertValue(null, null))
.map(metadataRecord => metadataRecord.asInstanceOf[HoodieMetadataRecord].getColumnStatsMetadata)
.orNull
}))
.filter(JFunction.toJavaSerializableFunction(columnStatsRecord => columnStatsRecord != null))
columnStatsRecords
}
override def invalidateCaches(): Unit = {
cachedColumnStatsIndexViews.foreach { case (_, df) => df.unpersist() }
cachedColumnStatsIndexViews.clear()
}
/**
* Returns true if the metadata table is enabled and index definitions are present in the table's index metadata.
*/
def isIndexAvailable: Boolean = {
metadataConfig.isEnabled && metaClient.getIndexMetadata.isPresent && !metaClient.getIndexMetadata.get().getIndexDefinitions.isEmpty
}
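/**
* Filters the given query predicates down to those referencing the expression index source field,
* unwrapping unary expressions (and the `sec` child of `FromUnixTime`) so that the underlying attribute
* can be matched against the source field. Returns each matching predicate along with the literal values
* it compares against.
*/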
private def filterQueriesWithFunctionalFilterKey(queryFilters: Seq[Expression], sourceFieldOpt: Option[String]): List[Tuple2[Expression, List[String]]] = {
var expressionIndexQueries: List[Tuple2[Expression, List[String]]] = List.empty
for (query <- queryFilters) {
val attributeFetcher = (expr: Expression) => {
expr match {
case expression: UnaryExpression => expression.child
case expression: FromUnixTime => expression.sec
case other => other
}
}
filterQueryWithRecordKey(query, sourceFieldOpt, attributeFetcher).foreach({
case (exp: Expression, literals: List[String]) =>
expressionIndexQueries = expressionIndexQueries :+ Tuple2.apply(exp, literals)
})
}
expressionIndexQueries
}
/**
* Searches for an expression index partition matching one of the given query filters.
*
* This method looks up the index definitions available in the metadata of the `metaClient` instance
* and attempts to find an expression index definition whose index function and source field match
* one of the provided query filters. If a matching index definition is found, the partition name for
* that index is returned together with the matching pruning expression and its literal values.
*
* @param queryFilters A sequence of `Expression` objects to analyze. Each expression should involve a single column
* for the method to consider it (expressions involving multiple columns are skipped).
* @return An `Option` containing the index partition name, the pruning expression and the extracted literals
* if a matching index definition is found. Returns `None` if no matching index definition is found.
*/
private def getExpressionIndexPartitionAndLiterals(queryFilters: Seq[Expression]): Option[Tuple3[String, Expression, List[String]]] = {
val indexDefinitions = metaClient.getIndexMetadata.get().getIndexDefinitions.asScala
if (indexDefinitions.nonEmpty) {
val functionDefinitions = indexDefinitions.values
.filter(definition => MetadataPartitionType.fromPartitionPath(definition.getIndexName).equals(MetadataPartitionType.EXPRESSION_INDEX))
.toList
var indexPartitionAndLiteralsOpt: Option[Tuple3[String, Expression, List[String]]] = Option.empty
functionDefinitions.foreach(indexDefinition => {
val queryInfoOpt = extractQueryAndLiterals(queryFilters, indexDefinition)
if (queryInfoOpt.isDefined) {
indexPartitionAndLiteralsOpt = Option.apply(Tuple3.apply(indexDefinition.getIndexName, queryInfoOpt.get._1, queryInfoOpt.get._2))
}
})
indexPartitionAndLiteralsOpt
} else {
Option.empty
}
}
/**
* Extracts the pruning expression and literal values for a query filter matching the given index definition.
*
* This method iterates over the given sequence of Spark SQL expressions and identifies expressions
* that reference the index definition's source field and whose function call corresponds to a key
* in the `SPARK_FUNCTION_MAP` (falling back to `identity` when no function is present). Only simple
* expressions involving a single column are supported. For `from_unixtime` based indexes, a filter is
* matched only if its format equals the format the index was configured with.
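*
* For example (illustrative only, `ts` being a hypothetical source field), an index defined with the
* function `from_unixtime` would be matched by a filter of the following shape:
* {{{
*   // Catalyst form of the SQL predicate:  from_unixtime(ts, 'yyyy-MM-dd') = '2024-01-01'
*   EqualTo(FromUnixTime(col("ts").expr, Literal("yyyy-MM-dd")), Literal("2024-01-01"))
* }}}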
*/
private def extractQueryAndLiterals(queryFilters: Seq[Expression], indexDefinition: HoodieIndexDefinition): Option[(Expression, List[String])] = {
val expressionIndexQueries = filterQueriesWithFunctionalFilterKey(queryFilters, Option.apply(indexDefinition.getSourceFields.get(0)))
var queryAndLiteralsOpt: Option[(Expression, List[String])] = Option.empty
expressionIndexQueries.foreach { tuple =>
val (expr, literals) = (tuple._1, tuple._2)
val functionNameOption = SPARK_FUNCTION_MAP.asScala.keys.find(expr.toString.contains)
val functionName = functionNameOption.getOrElse("identity")
if (indexDefinition.getIndexFunction.equals(functionName)) {
val attributeFetcher = (expr: Expression) => {
expr match {
case expression: UnaryExpression => expression.child
case expression: FromUnixTime => expression.sec
case other => other
}
}
if (functionName.equals(SPARK_FROM_UNIXTIME)) {
val configuredFormat = indexDefinition.getIndexOptions.getOrDefault("format", TimestampFormatter.defaultPattern)
if (expr.toString().contains(configuredFormat)) {
val pruningExpr = fetchQueryWithAttribute(expr, Option.apply(indexDefinition.getSourceFields.get(0)), RecordLevelIndexSupport.getSimpleLiteralGenerator(), attributeFetcher)._1.get._1
queryAndLiteralsOpt = Option.apply(Tuple2.apply(pruningExpr, literals))
}
} else {
val pruningExpr = fetchQueryWithAttribute(expr, Option.apply(indexDefinition.getSourceFields.get(0)), RecordLevelIndexSupport.getSimpleLiteralGenerator(), attributeFetcher)._1.get._1
queryAndLiteralsOpt = Option.apply(Tuple2.apply(pruningExpr, literals))
}
}
}
queryAndLiteralsOpt
}
private def loadExpressionIndexRecords(indexPartition: String,
prunedPartitions: Set[String],
shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = {
val indexDefinition = metaClient.getIndexMetadata.get().getIndexDefinitions.get(indexPartition)
val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = loadExpressionIndexForColumnsInternal(
indexDefinition.getSourceFields.asScala.toSeq, prunedPartitions, indexPartition, shouldReadInMemory)
//TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
colStatsRecords
}
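/**
* Loads the expression index records of the given index partition as a [[DataFrame]] in the raw
* (non-transposed) Column Stats layout, i.e. one row per (column, file) pair, restricted to the
* file-name, min/max, null-count, value-count and column-name fields.
*/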
def loadExpressionIndexDataFrame(indexPartition: String,
prunedPartitions: Set[String],
shouldReadInMemory: Boolean): DataFrame = {
val colStatsDF = {
val indexDefinition = metaClient.getIndexMetadata.get().getIndexDefinitions.get(indexPartition)
val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = loadExpressionIndexForColumnsInternal(
indexDefinition.getSourceFields.asScala.toSeq, prunedPartitions, indexPartition, shouldReadInMemory)
//TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
val catalystRows: HoodieData[InternalRow] = colStatsRecords.map[InternalRow](JFunction.toJavaSerializableFunction(r => {
val converter = AvroConversionUtils.createAvroToInternalRowConverter(HoodieMetadataColumnStats.SCHEMA$, columnStatsRecordStructType)
converter(r).orNull
}))
if (shouldReadInMemory) {
// NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows
// of the index in memory, facilitating execution of the subsequently chained operations
// on it locally (on the driver; all such operations are actually going to be performed by Spark's
// Optimizer)
createDataFrameFromInternalRows(spark, catalystRows.collectAsList().asScala.toSeq, columnStatsRecordStructType)
} else {
createDataFrameFromRDD(spark, HoodieJavaRDD.getJavaRDD(catalystRows), columnStatsRecordStructType)
}
}
colStatsDF.select(targetColumnStatsIndexColumns.map(col): _*)
}
private def loadExpressionIndexForColumnsInternal(targetColumns: Seq[String],
prunedPartitions: Set[String],
indexPartition: String,
shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = {
// Read Metadata Table's Expression Index records into a [[HoodieData]] container by
// - Fetching the records from the expression index partition by key prefixes (encoded column names)
// - Extracting [[HoodieMetadataColumnStats]] records
// - Filtering out nulls
checkState(targetColumns.nonEmpty)
val encodedTargetColumnNames = targetColumns.map(colName => new ColumnIndexID(colName).asBase64EncodedString())
val keyPrefixes = if (prunedPartitions.nonEmpty) {
prunedPartitions.map(partitionPath =>
new PartitionIndexID(HoodieTableMetadataUtil.getPartitionIdentifier(partitionPath)).asBase64EncodedString()
).flatMap(encodedPartition => {
encodedTargetColumnNames.map(encodedTargetColumn => encodedTargetColumn.concat(encodedPartition))
})
} else {
encodedTargetColumnNames
}
val metadataRecords: HoodieData[HoodieRecord[HoodieMetadataPayload]] =
metadataTable.getRecordsByKeyPrefixes(keyPrefixes.toSeq.asJava, indexPartition, shouldReadInMemory)
val columnStatsRecords: HoodieData[HoodieMetadataColumnStats] =
//TODO: [HUDI-8303] Explicit conversion might not be required for Scala 2.12+
metadataRecords.map(JFunction.toJavaSerializableFunction(record => {
toScalaOption(record.getData.getInsertValue(null, null))
.map(metadataRecord => metadataRecord.asInstanceOf[HoodieMetadataRecord].getColumnStatsMetadata)
.orNull
}))
.filter(JFunction.toJavaSerializableFunction(columnStatsRecord => columnStatsRecord != null))
columnStatsRecords
}
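/**
* Builds a mapping from partition path to the names of the base files (and, when `includeLogFiles` is
* set, log files) contained in the pruned file slices of that partition.
*/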
def getPrunedPartitionsAndFileNamesMap(prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
includeLogFiles: Boolean = false): Map[String, Set[String]] = {
prunedPartitionsAndFileSlices.foldLeft(Map.empty[String, Set[String]]) {
case (partitionToFileMap, (partitionPathOpt, fileSlices)) =>
partitionPathOpt match {
case Some(partitionPath) =>
val fileNames = fileSlices.flatMap { fileSlice =>
val baseFile = Option(fileSlice.getBaseFile.orElse(null)).map(_.getFileName).toSeq
val logFiles = if (includeLogFiles) {
fileSlice.getLogFiles.iterator().asScala.map(_.getFileName).toSeq
} else Seq.empty[String]
baseFile ++ logFiles
}.toSet
// Update the map with the new partition and its file names
partitionToFileMap.updated(partitionPath.path, partitionToFileMap.getOrElse(partitionPath.path, Set.empty) ++ fileNames)
case None =>
partitionToFileMap // Skip if no partition path
}
}
}
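/**
* Probes the bloom filters stored in the given expression index partition for each pruned file and
* returns the files that might contain any of the supplied keys. Files for which no bloom filter is
* found are conservatively kept as candidates.
*/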
private def getCandidateFilesForKeys(indexPartition: String, prunedPartitionAndFileNames: Map[String, Set[String]], keys: List[String]): Set[String] = {
val candidateFiles = prunedPartitionAndFileNames.flatMap { case (partition, fileNames) =>
fileNames.filter { fileName =>
val bloomFilterOpt = toScalaOption(metadataTable.getBloomFilter(partition, fileName, indexPartition))
bloomFilterOpt match {
case Some(bloomFilter) =>
keys.exists(bloomFilter.mightContain)
case None =>
true // If bloom filter is empty or undefined, assume the file might contain the record key
}
}
}.toSet
candidateFiles
}
}
object ExpressionIndexSupport {
val INDEX_NAME = "FUNCTIONAL"
/**
* Target Column Stats Index columns which internally are mapped onto fields of the corresponding
* Column Stats record payload ([[HoodieMetadataColumnStats]]) persisted w/in Metadata Table
*/
private val targetColumnStatsIndexColumns = Seq(
HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME,
HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE,
HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE,
HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT,
HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT,
HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME
)
private val columnStatsRecordStructType: StructType = AvroConversionUtils.convertAvroSchemaToStructType(HoodieMetadataColumnStats.SCHEMA$)
}