/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi
import org.apache.hudi.ColumnStatsIndexSupport._
import org.apache.hudi.HoodieCatalystUtils.{withPersistedData, withPersistedDataset}
import org.apache.hudi.HoodieConversionUtils.toScalaOption
import org.apache.hudi.avro.model._
import org.apache.hudi.common.config.HoodieMetadataConfig
import org.apache.hudi.common.data.HoodieData
import org.apache.hudi.common.function.SerializableFunction
import org.apache.hudi.common.model.{FileSlice, HoodieRecord}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.util.BinaryUtil.toBytes
import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.hudi.common.util.collection
import org.apache.hudi.common.util.hash.ColumnIndexID
import org.apache.hudi.data.HoodieJavaRDD
import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata, HoodieTableMetadataUtil, MetadataPartitionType}
import org.apache.hudi.util.JFunction
import org.apache.avro.Conversions.DecimalConversion
import org.apache.avro.generic.GenericData
import org.apache.spark.sql.HoodieUnsafeUtils.{createDataFrameFromInternalRows, createDataFrameFromRDD, createDataFrameFromRows}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import java.nio.ByteBuffer
import scala.collection.JavaConverters._
import scala.collection.immutable.TreeSet
import scala.collection.mutable.ListBuffer
import scala.collection.parallel.mutable.ParHashMap
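/**
* Data-skipping support backed by the Column Stats partition of the Metadata Table: exposes
* utilities to load the Column Stats Index (either in raw or transposed form) and to prune
* the set of candidate files based on the query filters
*/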
class ColumnStatsIndexSupport(spark: SparkSession,
tableSchema: StructType,
@transient metadataConfig: HoodieMetadataConfig,
@transient metaClient: HoodieTableMetaClient,
allowCaching: Boolean = false)
extends SparkBaseIndexSupport(spark, metadataConfig, metaClient) {
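// Caches transposed Column Stats Index views keyed by the target column set; only populated
// when [[allowCaching]] is enabled, and released via [[invalidateCaches]]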
@transient private lazy val cachedColumnStatsIndexViews: ParHashMap[Seq[String], DataFrame] = ParHashMap()
// NOTE: Since [[metadataConfig]] is transient, this value has to be captured eagerly here,
// before the instance is passed on to the executors
protected val inMemoryProjectionThreshold = metadataConfig.getColumnStatsIndexInMemoryProjectionThreshold
private lazy val indexedColumns: Set[String] = {
val customIndexedColumns = metadataConfig.getColumnsEnabledForColumnStatsIndex
// Column Stats Index could index either
// - The whole table
// - Only configured columns
if (customIndexedColumns.isEmpty) {
tableSchema.fieldNames.toSet
} else {
customIndexedColumns.asScala.toSet
}
}
override def getIndexName: String = ColumnStatsIndexSupport.INDEX_NAME
override def computeCandidateFileNames(fileIndex: HoodieFileIndex,
queryFilters: Seq[Expression],
queryReferencedColumns: Seq[String],
prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
shouldPushDownFilesFilter: Boolean
): Option[Set[String]] = {
if (isIndexAvailable && queryFilters.nonEmpty && queryReferencedColumns.nonEmpty) {
val readInMemory = shouldReadInMemory(fileIndex, queryReferencedColumns, inMemoryProjectionThreshold)
val prunedFileNames = getPrunedFileNames(prunedPartitionsAndFileSlices)
// NOTE: If partition pruning doesn't prune any files, then there's no need to apply file filters
// when loading the Column Statistics Index
val prunedFileNamesOpt = if (shouldPushDownFilesFilter) Some(prunedFileNames) else None
loadTransposed(queryReferencedColumns, readInMemory, prunedFileNamesOpt) { transposedColStatsDF =>
Some(getCandidateFiles(transposedColStatsDF, queryFilters, prunedFileNames))
}
} else {
Option.empty
}
}
override def invalidateCaches(): Unit = {
cachedColumnStatsIndexViews.foreach { case (_, df) => df.unpersist() }
cachedColumnStatsIndexViews.clear()
}
/**
* Returns true when the Column Stats Index is built and available as a standalone partition
* w/in the Metadata Table
*/
def isIndexAvailable: Boolean = {
metadataConfig.isEnabled &&
metaClient.getTableConfig.getMetadataPartitions.contains(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS)
}
/**
* Loads a view of the Column Stats Index in a transposed format, where a single row coalesces
* every column's statistics for a single file, returning it as a [[DataFrame]]
*
* Please check out the scala-doc of the [[transpose]] method explaining this view in more detail
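*
* A minimal usage sketch (illustrative only: `columnStatsIndex` denotes an instance of this class,
* and the "price" column is assumed to be indexed, with the resulting stat columns named per the
* convention shown in the [[transpose]] scala-doc):
*
* {{{
*   columnStatsIndex.loadTransposed(Seq("price"), shouldReadInMemory = true) { transposedDF =>
*     // One row per file, carrying "price_minValue", "price_maxValue" and "price_nullCount"
*     // alongside the file name and value count
*     transposedDF.select("price_minValue", "price_maxValue").collect()
*   }
* }}}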
*/
def loadTransposed[T](targetColumns: Seq[String],
shouldReadInMemory: Boolean,
prunedFileNamesOpt: Option[Set[String]] = None)(block: DataFrame => T): T = {
cachedColumnStatsIndexViews.get(targetColumns) match {
case Some(cachedDF) =>
block(cachedDF)
case None =>
val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = prunedFileNamesOpt match {
case Some(prunedFileNames) =>
val filterFunction = new SerializableFunction[HoodieMetadataColumnStats, java.lang.Boolean] {
override def apply(r: HoodieMetadataColumnStats): java.lang.Boolean = {
prunedFileNames.contains(r.getFileName)
}
}
loadColumnStatsIndexRecords(targetColumns, shouldReadInMemory).filter(filterFunction)
case None =>
loadColumnStatsIndexRecords(targetColumns, shouldReadInMemory)
}
withPersistedData(colStatsRecords, StorageLevel.MEMORY_ONLY) {
val (transposedRows, indexSchema) = transpose(colStatsRecords, targetColumns)
val df = if (shouldReadInMemory) {
// NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows
// of the transposed table in memory, facilitating execution of the subsequently chained operations
// on it locally (on the driver; all such operations are actually going to be performed by Spark's
// Optimizer)
createDataFrameFromRows(spark, transposedRows.collectAsList().asScala.toSeq, indexSchema)
} else {
val rdd = HoodieJavaRDD.getJavaRDD(transposedRows)
spark.createDataFrame(rdd, indexSchema)
}
if (allowCaching) {
cachedColumnStatsIndexViews.put(targetColumns, df)
// NOTE: Instead of collecting the rows from the index and holding them in memory, we rely
// on Spark as a (potentially distributed) cache managing the data lifecycle, while we simply
// keep the reference to the persisted [[DataFrame]] instance
df.persist(StorageLevel.MEMORY_ONLY)
block(df)
} else {
withPersistedDataset(df) {
block(df)
}
}
}
}
}
/**
* Loads a view of the Column Stats Index in a raw format, returning it as a [[DataFrame]]
*
* Please check out the scala-doc of the [[transpose]] method explaining this view in more detail
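*
* A minimal usage sketch (illustrative only: `columnStatsIndex` denotes an instance of this class,
* and "price" is assumed to be an indexed column):
*
* {{{
*   val rawColStatsDF = columnStatsIndex.load(Seq("price"), shouldReadInMemory = true)
*   // The raw view carries one row per (column, file) pair, projected onto the fields of
*   // [[HoodieMetadataColumnStats]] (file name, column name, min/max values, null/value counts)
*   rawColStatsDF.show()
* }}}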
*/
def load(targetColumns: Seq[String] = Seq.empty, shouldReadInMemory: Boolean = false): DataFrame = {
// NOTE: If specific columns have been provided, we can considerably trim down the amount of data fetched
// by only fetching Column Stats Index records pertaining to the requested columns.
// Otherwise, we fall back to reading the whole Column Stats Index
if (targetColumns.nonEmpty) {
loadColumnStatsIndexForColumnsInternal(targetColumns, shouldReadInMemory)
} else {
loadFullColumnStatsIndexInternal()
}
}
/**
* Transposes and converts the raw table format of the Column Stats Index representation,
* where each row/record corresponds to an individual (column, file) pair, into a table format
* where each row corresponds to a single file, with the statistics for individual columns collated
* w/in such a row:
*
* Metadata Table Column Stats Index format:
*
*
* +---------------------------+------------+------------+------------+-------------+
* | fileName | columnName | minValue | maxValue | num_nulls |
* +---------------------------+------------+------------+------------+-------------+
* | one_base_file.parquet | A | 1 | 10 | 0 |
* | another_base_file.parquet | A | -10 | 0 | 5 |
* +---------------------------+------------+------------+------------+-------------+
*
*
* Returned table format
*
*
* +---------------------------+------------+------------+-------------+
* | file | A_minValue | A_maxValue | A_nullCount |
* +---------------------------+------------+------------+-------------+
* | one_base_file.parquet | 1 | 10 | 0 |
* | another_base_file.parquet | -10 | 0 | 5 |
* +---------------------------+------------+------------+-------------+
*
*
* NOTE: Column Stats Index might potentially contain statistics for many columns (if not all), while
* the query at hand might only be referencing a handful of those. As such, we collect all the
* column references from the filtering expressions, and only transpose records corresponding to the
* columns referenced in those
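*
* NOTE: Besides the per-column stats shown above, every transposed row also carries the file-level
* value count, i.e. for the example above the full layout is
* "file, valueCount, A_minValue, A_maxValue, A_nullCount" (see [[composeIndexSchema]])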
*
* @param colStatsRecords [[HoodieData[HoodieMetadataColumnStats]]] bearing raw Column Stats Index records
* @param queryColumns target columns to be included into the final table
* @return reshaped table according to the format outlined above
*/
private def transpose(colStatsRecords: HoodieData[HoodieMetadataColumnStats], queryColumns: Seq[String]): (HoodieData[Row], StructType) = {
val tableSchemaFieldMap = tableSchema.fields.map(f => (f.name, f)).toMap
// NOTE: We're sorting the columns to make sure the final index schema matches the layout
// of the transposed table
val sortedTargetColumnsSet = TreeSet(queryColumns:_*)
// NOTE: This is a trick to avoid pulling the whole [[ColumnStatsIndexSupport]] object into the lambdas'
// closures below
val indexedColumns = this.indexedColumns
// NOTE: It's crucial to maintain the appropriate ordering of the columns
// matching the table layout: hence, we cherry-pick individual columns
// instead of simply filtering the schema for the ones we're interested in
val (indexSchema, targetIndexedColumns) = composeIndexSchema(sortedTargetColumnsSet.toSeq, indexedColumns, tableSchema)
// Here we perform a complex transformation that requires us to modify the layout of the rows
// of the dataset, and therefore we rely on the low-level RDD API to avoid incurring the
// encoding/decoding penalty of the [[Dataset]], since a [[Dataset]] is required to adhere to
// its schema at all times, while RDDs are not
val transposedRows: HoodieData[Row] = colStatsRecords
// NOTE: Explicit conversion is required for Scala 2.11
.filter(JFunction.toJavaSerializableFunction(r => sortedTargetColumnsSet.contains(r.getColumnName)))
.mapToPair(JFunction.toJavaSerializablePairFunction(r => {
if (r.getMinValue == null && r.getMaxValue == null) {
// Min/max values of the corresponding record could be null in either of the 2 cases:
// - The column contains only null values (in that case both min/max have to be null)
// - This is a stubbed Column Stats record (used as a tombstone)
collection.Pair.of(r.getFileName, r)
} else {
val minValueWrapper = r.getMinValue
val maxValueWrapper = r.getMaxValue
checkState(minValueWrapper != null && maxValueWrapper != null, "Invalid Column Stats record: either both min/max have to be null, or both have to be non-null")
val colName = r.getColumnName
val colType = tableSchemaFieldMap(colName).dataType
val minValue = deserialize(tryUnpackValueWrapper(minValueWrapper), colType)
val maxValue = deserialize(tryUnpackValueWrapper(maxValueWrapper), colType)
// Update min-/max-value structs w/ unwrapped values in-place
r.setMinValue(minValue)
r.setMaxValue(maxValue)
collection.Pair.of(r.getFileName, r)
}
}))
.groupByKey()
.map(JFunction.toJavaSerializableFunction(p => {
val columnRecordsSeq: Seq[HoodieMetadataColumnStats] = p.getValue.asScala.toSeq
val fileName: String = p.getKey
val valueCount: Long = columnRecordsSeq.head.getValueCount
// To properly align individual rows (corresponding to a file) w/in the transposed projection, we need
// to align the existing column stats for an individual file with the list of expected ones for the
// whole transposed projection (a superset spanning all files)
val columnRecordsMap = columnRecordsSeq.map(r => (r.getColumnName, r)).toMap
val alignedColStatRecordsSeq = targetIndexedColumns.map(columnRecordsMap.get)
val coalescedRowValuesSeq =
alignedColStatRecordsSeq.foldLeft(ListBuffer[Any](fileName, valueCount)) {
case (acc, opt) =>
opt match {
case Some(colStatRecord) =>
acc ++= Seq(colStatRecord.getMinValue, colStatRecord.getMaxValue, colStatRecord.getNullCount)
case None =>
// NOTE: This could occur in either of the following cases:
// 1. When certain columns exist in the schema but are absent in some data files due to
// schema evolution or other reasons, these columns will not be present in the column stats.
// In this case, we fill in default values by setting the min, max and null-count to null
// (this behavior is consistent with reading non-existent columns from Parquet).
// 2. When certain columns are present both in the schema and the data files,
// but the column stats are absent for these columns due to their types not supporting indexing,
// we also set these columns to default values.
//
// This approach prevents errors during data skipping and, because the filter includes an isNull check,
// these conditions will not affect the accurate return of files from data skipping.
acc ++= Seq(null, null, null)
}
}
Row(coalescedRowValuesSeq.toSeq: _*)
}))
(transposedRows, indexSchema)
}
private def loadColumnStatsIndexForColumnsInternal(targetColumns: Seq[String], shouldReadInMemory: Boolean): DataFrame = {
val colStatsDF = {
val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = loadColumnStatsIndexRecords(targetColumns, shouldReadInMemory)
// NOTE: Explicit conversion is required for Scala 2.11
val catalystRows: HoodieData[InternalRow] = colStatsRecords.mapPartitions(JFunction.toJavaSerializableFunction(it => {
val converter = AvroConversionUtils.createAvroToInternalRowConverter(HoodieMetadataColumnStats.SCHEMA$, columnStatsRecordStructType)
it.asScala.map(r => converter(r).orNull).asJava
}), false)
if (shouldReadInMemory) {
// NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows
// of the Column Stats Index in memory, facilitating execution of the subsequently chained operations
// on it locally (on the driver; all such operations are actually going to be performed by Spark's
// Optimizer)
createDataFrameFromInternalRows(spark, catalystRows.collectAsList().asScala.toSeq, columnStatsRecordStructType)
} else {
createDataFrameFromRDD(spark, HoodieJavaRDD.getJavaRDD(catalystRows), columnStatsRecordStructType)
}
}
colStatsDF.select(targetColumnStatsIndexColumns.map(col): _*)
}
def loadColumnStatsIndexRecords(targetColumns: Seq[String], shouldReadInMemory: Boolean): HoodieData[HoodieMetadataColumnStats] = {
// Read Metadata Table's Column Stats Index records into [[HoodieData]] container by
// - Fetching the records from CSI by key-prefixes (encoded column names)
// - Extracting [[HoodieMetadataColumnStats]] records
// - Filtering out nulls
checkState(targetColumns.nonEmpty)
// TODO encoding should be done internally w/in HoodieBackedTableMetadata
val encodedTargetColumnNames = targetColumns.map(colName => new ColumnIndexID(colName).asBase64EncodedString())
val metadataRecords: HoodieData[HoodieRecord[HoodieMetadataPayload]] =
metadataTable.getRecordsByKeyPrefixes(encodedTargetColumnNames.asJava, HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS, shouldReadInMemory)
val columnStatsRecords: HoodieData[HoodieMetadataColumnStats] =
// NOTE: Explicit conversion is required for Scala 2.11
metadataRecords.map(JFunction.toJavaSerializableFunction(record => {
toScalaOption(record.getData.getInsertValue(null, null))
.map(metadataRecord => metadataRecord.asInstanceOf[HoodieMetadataRecord].getColumnStatsMetadata)
.orNull
}))
.filter(JFunction.toJavaSerializableFunction(columnStatsRecord => columnStatsRecord != null))
columnStatsRecords
}
private def loadFullColumnStatsIndexInternal(): DataFrame = {
val metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePath)
// Read Metadata Table's Column Stats Index into Spark's [[DataFrame]]
val colStatsDF = spark.read.format("org.apache.hudi")
.options(metadataConfig.getProps.asScala)
.load(s"$metadataTablePath/${MetadataPartitionType.COLUMN_STATS.getPartitionPath}")
val requiredIndexColumns =
targetColumnStatsIndexColumns.map(colName =>
col(s"${HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS}.${colName}"))
colStatsDF.where(col(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).isNotNull)
.select(requiredIndexColumns: _*)
}
}
object ColumnStatsIndexSupport {
val INDEX_NAME = "COLUMN_STATS"
private val expectedAvroSchemaValues = Set("BooleanWrapper", "IntWrapper", "LongWrapper", "FloatWrapper", "DoubleWrapper",
"BytesWrapper", "StringWrapper", "DateWrapper", "DecimalWrapper", "TimeMicrosWrapper", "TimestampMicrosWrapper")
/**
* Target Column Stats Index columns which are internally mapped onto fields of the corresponding
* Column Stats record payload ([[HoodieMetadataColumnStats]]) persisted w/in the Metadata Table
*/
private val targetColumnStatsIndexColumns = Seq(
HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME,
HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE,
HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE,
HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT,
HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT,
HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME
)
private val columnStatsRecordStructType: StructType = AvroConversionUtils.convertAvroSchemaToStructType(HoodieMetadataColumnStats.SCHEMA$)
/**
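* Composes the schema of the transposed Column Stats Index view: the file-name and value-count
* fields, followed by a "<column>_minValue", "<column>_maxValue", "<column>_nullCount" triplet
* (per [[formatColName]]) for every target column that is actually indexed.
*
* Illustrative sketch (assuming "price" is an indexed column present in the provided table schema):
*
* {{{
*   val (indexSchema, targetIndexedColumns) = composeIndexSchema(Seq("price"), Set("price"), tableSchema)
*   // indexSchema: file-name and value-count fields, followed by
*   //              "price_minValue", "price_maxValue", "price_nullCount" stat fields
*   // targetIndexedColumns: Seq("price")
* }}}
*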
* @VisibleForTesting
*/
def composeIndexSchema(targetColumnNames: Seq[String], indexedColumns: Set[String], tableSchema: StructType): (StructType, Seq[String]) = {
val fileNameField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, StringType, nullable = true, Metadata.empty)
val valueCountField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, LongType, nullable = true, Metadata.empty)
val targetIndexedColumns = targetColumnNames.filter(indexedColumns.contains(_))
val targetIndexedFields = targetIndexedColumns.map(colName => tableSchema.fields.find(f => f.name == colName).get)
(StructType(
targetIndexedFields.foldLeft(Seq(fileNameField, valueCountField)) {
case (acc, field) =>
acc ++ Seq(
composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, field.dataType),
composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, field.dataType),
composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, LongType))
}
), targetIndexedColumns)
}
@inline def getMinColumnNameFor(colName: String): String =
formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE)
@inline def getMaxColumnNameFor(colName: String): String =
formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE)
@inline def getNullCountColumnNameFor(colName: String): String =
formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT)
@inline def getValueCountColumnNameFor: String =
HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT
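// Formats a per-column stat column name as "<columnName>_<statName>", e.g. "A_minValue"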
@inline private def formatColName(col: String, statName: String) = { // TODO add escaping for
String.format("%s_%s", col, statName)
}
@inline private def composeColumnStatStructType(col: String, statName: String, dataType: DataType) =
StructField(formatColName(col, statName), dataType, nullable = true, Metadata.empty)
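// Unwraps the raw value from one of the Avro value-wrapper records (e.g. [[IntWrapper]],
// [[StringWrapper]]) that the Column Stats Index uses to persist min/max values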
private def tryUnpackValueWrapper(valueWrapper: AnyRef): Any = {
valueWrapper match {
case w: BooleanWrapper => w.getValue
case w: IntWrapper => w.getValue
case w: LongWrapper => w.getValue
case w: FloatWrapper => w.getValue
case w: DoubleWrapper => w.getValue
case w: BytesWrapper => w.getValue
case w: StringWrapper => w.getValue
case w: DateWrapper => w.getValue
case w: DecimalWrapper => w.getValue
case w: TimeMicrosWrapper => w.getValue
case w: TimestampMicrosWrapper => w.getValue
case r: GenericData.Record if expectedAvroSchemaValues.contains(r.getSchema.getName) =>
r.get("value")
case _ => throw new UnsupportedOperationException(s"Not recognized value wrapper type (${valueWrapper.getClass.getSimpleName})")
}
}
val decConv = new DecimalConversion()
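// Converts an unwrapped min/max value into the JVM representation matching the table column's
// Spark [[DataType]] (decoding dates, timestamps, decimals and binary payloads along the way)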
private def deserialize(value: Any, dataType: DataType): Any = {
dataType match {
// NOTE: Since we can't rely on Avro's "date" and "timestamp-micros" logical types, we're
// manually encoding the corresponding values as int and long w/in the Column Stats Index, and
// here we have to decode those back into the corresponding logical representation.
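// (e.g. a DateType value persisted as Int 18262 decodes back to java.sql.Date "2020-01-01")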
case TimestampType => DateTimeUtils.toJavaTimestamp(value.asInstanceOf[Long])
case DateType => DateTimeUtils.toJavaDate(value.asInstanceOf[Int])
// Standard types
case StringType => value
case BooleanType => value
// Numeric types
case FloatType => value
case DoubleType => value
case LongType => value
case IntegerType => value
// NOTE: All integral types of size less than Int are encoded as Ints in MT
case ShortType => value.asInstanceOf[Int].toShort
case ByteType => value.asInstanceOf[Int].toByte
// TODO fix
case _: DecimalType =>
value match {
case buffer: ByteBuffer =>
val logicalType = DecimalWrapper.SCHEMA$.getField("value").schema().getLogicalType
decConv.fromBytes(buffer, null, logicalType)
case _ => value
}
case BinaryType =>
value match {
case b: ByteBuffer => toBytes(b)
case other => other
}
case _ =>
throw new UnsupportedOperationException(s"Data type for the statistic value is not recognized $dataType")
}
}
}