org.apache.hudi.ColumnStatsIndexSupport.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi

import org.apache.avro.Schema.Parser
import org.apache.avro.generic.GenericRecord
import org.apache.hudi.ColumnStatsIndexSupport.{composeIndexSchema, deserialize, metadataRecordSchemaString, metadataRecordStructType, tryUnpackNonNullVal}
import org.apache.hudi.HoodieConversionUtils.toScalaOption
import org.apache.hudi.avro.model.HoodieMetadataRecord
import org.apache.hudi.client.common.HoodieSparkEngineContext
import org.apache.hudi.common.config.HoodieMetadataConfig
import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig
import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.hudi.common.util.hash.ColumnIndexID
import org.apache.hudi.data.HoodieJavaRDD
import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata, HoodieTableMetadataUtil, MetadataPartitionType}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, HoodieUnsafeRDDUtils, Row, SparkSession}

import scala.collection.JavaConverters._
import scala.collection.immutable.TreeSet

/**
 * Mixin trait abstracting away the heavy lifting of interacting with the Metadata Table's Column Stats Index,
 * providing convenient interfaces to read and transpose it
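 *
 * A minimal usage sketch (for illustration only; assumes a Hudi table at `basePath` with the Column
 * Stats Index enabled in its Metadata Table, `tableSchema` bearing the table's [[StructType]], and a
 * caller mixing in this trait):
 *
 * {{{
 *   val metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).build()
 *   val colStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, Seq("a", "b"))
 *   val transposedDF = transposeColumnStatsIndex(spark, colStatsDF, Seq("a", "b"), tableSchema)
 * }}}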
 */
trait ColumnStatsIndexSupport extends SparkAdapterSupport {

  def readColumnStatsIndex(spark: SparkSession,
                           tableBasePath: String,
                           metadataConfig: HoodieMetadataConfig,
                           targetColumns: Seq[String] = Seq.empty): DataFrame = {
    val targetColStatsIndexColumns = Seq(
      HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME,
      HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE,
      HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE,
      HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT,
      HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT)

    val requiredMetadataIndexColumns =
      (targetColStatsIndexColumns :+ HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME).map(colName =>
        s"${HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS}.${colName}")

    val metadataTableDF: DataFrame = {
      // NOTE: If specific columns have been provided, we can considerably trim down amount of data fetched
      //       by only fetching Column Stats Index records pertaining to the requested columns.
      //       Otherwise, we fall back to reading the whole Column Stats Index
      if (targetColumns.nonEmpty) {
        readColumnStatsIndexForColumnsInternal(spark, targetColumns, metadataConfig, tableBasePath)
      } else {
        readFullColumnStatsIndexInternal(spark, metadataConfig, tableBasePath)
      }
    }

    val colStatsDF = metadataTableDF.where(col(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).isNotNull)
      .select(requiredMetadataIndexColumns.map(col): _*)

    colStatsDF
  }
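
  // For illustration: the DataFrame returned above carries one row per (file, column) pair, roughly of
  // the shape (fileName | minValue | maxValue | nullCount | valueCount | columnName);
  // transposeColumnStatsIndex below reshapes it into a row-per-file table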

  /**
   * Transposes and converts the raw table format of the Column Stats Index representation,
   * where each row/record corresponds to an individual (column, file) pair, into a table format
   * where each row corresponds to a single file, with the statistics for individual columns collated
   * w/in such a row:
   *
   * Metadata Table Column Stats Index format:
   *
   * 
   *  +---------------------------+------------+------------+------------+-------------+
   *  |        fileName           | columnName |  minValue  |  maxValue  |  num_nulls  |
   *  +---------------------------+------------+------------+------------+-------------+
   *  | one_base_file.parquet     |          A |          1 |         10 |           0 |
   *  | another_base_file.parquet |          A |        -10 |          0 |           5 |
   *  +---------------------------+------------+------------+------------+-------------+
   * 
   *
   * Returned table format
   *
   *  +---------------------------+------------+------------+-------------+
   *  |          file             | A_minValue | A_maxValue | A_nullCount |
   *  +---------------------------+------------+------------+-------------+
   *  | one_base_file.parquet     |          1 |         10 |           0 |
   *  | another_base_file.parquet |        -10 |          0 |           5 |
   *  +---------------------------+------------+------------+-------------+
   * 
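   * Column names in the transposed table follow the `<columnName>_<statName>` convention (for ex,
   * `A_minValue`), as produced by [[composeIndexSchema]]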
   *
   * NOTE: Column Stats Index might potentially contain statistics for many columns (if not all), while
   *       query at hand might only be referencing a handful of those. As such, we collect all the
   *       column references from the filtering expressions, and only transpose records corresponding to the
   *       columns referenced in those
   *
   * @param spark Spark session ref
   * @param colStatsDF [[DataFrame]] bearing raw Column Stats Index table
   * @param queryColumns target columns to be included into the final table
   * @param tableSchema schema of the source data table
   * @return reshaped table according to the format outlined above
   */
  def transposeColumnStatsIndex(spark: SparkSession,
                                colStatsDF: DataFrame,
                                queryColumns: Seq[String],
                                tableSchema: StructType): DataFrame = {
    val colStatsSchema = colStatsDF.schema
    val colStatsSchemaOrdinalsMap = colStatsSchema.fields.zipWithIndex.map({
      case (field, ordinal) => (field.name, ordinal)
    }).toMap

    val tableSchemaFieldMap = tableSchema.fields.map(f => (f.name, f)).toMap

    val colNameOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME)
    val minValueOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE)
    val maxValueOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE)
    val fileNameOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME)
    val nullCountOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT)
    val valueCountOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT)

    // NOTE: We have to collect list of indexed columns to make sure we properly align the rows
    //       w/in the transposed dataset: since some files might not have all of the columns indexed
    //       either due to the Column Stats Index config changes, schema evolution, etc, we have
    //       to make sure that all of the rows w/in transposed data-frame are properly padded (with null
    //       values) for such file-column combinations
    val indexedColumns: Seq[String] = colStatsDF.rdd.map(row => row.getString(colNameOrdinal)).distinct().collect()

    // NOTE: We're sorting the columns to make sure final index schema matches layout
    //       of the transposed table
    val sortedTargetColumns = TreeSet(queryColumns.intersect(indexedColumns): _*)

    val transposedRDD = colStatsDF.rdd
      .filter(row => sortedTargetColumns.contains(row.getString(colNameOrdinal)))
      .map { row =>
        if (row.isNullAt(minValueOrdinal) && row.isNullAt(maxValueOrdinal)) {
          // Corresponding row could be null in either of the 2 cases
          //    - Column contains only null values (in that case both min/max have to be nulls)
          //    - This is a stubbed Column Stats record (used as a tombstone)
          row
        } else {
          val minValueStruct = row.getAs[Row](minValueOrdinal)
          val maxValueStruct = row.getAs[Row](maxValueOrdinal)

          checkState(minValueStruct != null && maxValueStruct != null,
            "Invalid Column Stats record: either both min/max have to be null, or both have to be non-null")

          val colName = row.getString(colNameOrdinal)
          val colType = tableSchemaFieldMap(colName).dataType

          val (minValue, _) = tryUnpackNonNullVal(minValueStruct)
          val (maxValue, _) = tryUnpackNonNullVal(maxValueStruct)

          val rowValsSeq = row.toSeq.toArray

          // Update min-/max-value structs w/ unwrapped values in-place
          rowValsSeq(minValueOrdinal) = deserialize(minValue, colType)
          rowValsSeq(maxValueOrdinal) = deserialize(maxValue, colType)

          Row(rowValsSeq: _*)
        }
      }
      .groupBy(r => r.getString(fileNameOrdinal))
      .foldByKey(Seq[Row]()) {
        case (_, columnRowsSeq) =>
          // Rows seq is always non-empty (otherwise it won't be grouped into)
          val fileName = columnRowsSeq.head.get(fileNameOrdinal)
          val valueCount = columnRowsSeq.head.get(valueCountOrdinal)

          // To properly align individual rows (corresponding to a file) w/in the transposed projection, we need
          // to align existing column-stats for individual file with the list of expected ones for the
          // whole transposed projection (a superset of all files)
          val columnRowsMap = columnRowsSeq.map(row => (row.getString(colNameOrdinal), row)).toMap
          val alignedColumnRowsSeq = sortedTargetColumns.toSeq.map(columnRowsMap.get)

          val coalescedRowValuesSeq = alignedColumnRowsSeq.foldLeft(Seq[Any](fileName, valueCount)) {
            case (acc, opt) =>
              opt match {
                case Some(columnStatsRow) =>
                  acc ++ Seq(minValueOrdinal, maxValueOrdinal, nullCountOrdinal).map(ord => columnStatsRow.get(ord))
                case None =>
                  // NOTE: Since we're assuming missing column to essentially contain exclusively
                  //       null values, we set null-count to be equal to value-count (this behavior is
                  //       consistent with reading non-existent columns from Parquet)
                  acc ++ Seq(null, null, valueCount)
              }
          }

          Seq(Row(coalescedRowValuesSeq: _*))
      }
      .values
      .flatMap(it => it)

    // NOTE: It's crucial to maintain appropriate ordering of the columns
    //       matching table layout: hence, we cherry-pick individual columns
    //       instead of simply filtering in the ones we're interested in the schema
    val indexSchema = composeIndexSchema(sortedTargetColumns.toSeq, tableSchema)

    spark.createDataFrame(transposedRDD, indexSchema)
  }

  private def readFullColumnStatsIndexInternal(spark: SparkSession,
                                               metadataConfig: HoodieMetadataConfig,
                                               tableBasePath: String): DataFrame = {
    val metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(tableBasePath)
    // Read Metadata Table's Column Stats Index into Spark's [[DataFrame]]
    spark.read.format("org.apache.hudi")
      .options(metadataConfig.getProps.asScala)
      .load(s"$metadataTablePath/${MetadataPartitionType.COLUMN_STATS.getPartitionPath}")
  }

  private def readColumnStatsIndexForColumnsInternal(spark: SparkSession,
                                                     targetColumns: Seq[String],
                                                     metadataConfig: HoodieMetadataConfig,
                                                     tableBasePath: String) = {
    val ctx = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext))

    // Read Metadata Table's Column Stats Index into Spark's [[DataFrame]] by
    //    - Fetching the records from CSI by key-prefixes (encoded column names)
    //    - Deserializing fetched records into [[InternalRow]]s
    //    - Composing [[DataFrame]]
    val metadataTableDF = {
      val metadataTable = HoodieTableMetadata.create(ctx, metadataConfig, tableBasePath,
        FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue)

      // TODO encoding should be done internally w/in HoodieBackedTableMetadata
      val encodedTargetColumnNames = targetColumns.map(colName => new ColumnIndexID(colName).asBase64EncodedString())

      val recordsRDD: RDD[HoodieRecord[HoodieMetadataPayload]] =
        HoodieJavaRDD.getJavaRDD(
          metadataTable.getRecordsByKeyPrefixes(encodedTargetColumnNames.asJava, HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS)
        )

      val catalystRowsRDD: RDD[InternalRow] = recordsRDD.mapPartitions { it =>
        val metadataRecordSchema = new Parser().parse(metadataRecordSchemaString)
        val converter = AvroConversionUtils.createAvroToInternalRowConverter(metadataRecordSchema, metadataRecordStructType)

        it.map { record =>
          // schema and props are ignored for generating metadata record from the payload
          // instead, the underlying file system, or bloom filter, or columns stats metadata (part of payload) are directly used
          toScalaOption(record.getData.getInsertValue(null, null))
            .flatMap(avroRecord => converter(avroRecord.asInstanceOf[GenericRecord]))
            .orNull
        }
      }

      HoodieUnsafeRDDUtils.createDataFrame(spark, catalystRowsRDD, metadataRecordStructType)
    }

    metadataTableDF
  }
}

object ColumnStatsIndexSupport {

  private val metadataRecordSchemaString: String = HoodieMetadataRecord.SCHEMA$.toString
  private val metadataRecordStructType: StructType = AvroConversionUtils.convertAvroSchemaToStructType(HoodieMetadataRecord.SCHEMA$)

  /**
   * @VisibleForTesting
   */
  def composeIndexSchema(targetColumnNames: Seq[String], tableSchema: StructType): StructType = {
    val fileNameField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME, StringType, nullable = true, Metadata.empty)
    val valueCountField = StructField(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT, LongType, nullable = true, Metadata.empty)

    val targetFields = targetColumnNames.map(colName => tableSchema.fields.find(f => f.name == colName).get)

    StructType(
      targetFields.foldLeft(Seq(fileNameField, valueCountField)) {
        case (acc, field) =>
          acc ++ Seq(
            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE, field.dataType),
            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE, field.dataType),
            composeColumnStatStructType(field.name, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT, LongType))
      }
    )
  }

  @inline def getMinColumnNameFor(colName: String): String =
    formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE)

  @inline def getMaxColumnNameFor(colName: String): String =
    formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE)

  @inline def getNullCountColumnNameFor(colName: String): String =
    formatColName(colName, HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT)

  @inline def getValueCountColumnNameFor: String =
    HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT

  @inline private def formatColName(col: String, statName: String) = { // TODO add escaping for
    String.format("%s_%s", col, statName)
  }

  @inline private def composeColumnStatStructType(col: String, statName: String, dataType: DataType) =
    StructField(formatColName(col, statName), dataType, nullable = true, Metadata.empty)

  private def tryUnpackNonNullVal(statStruct: Row): (Any, Int) =
    statStruct.toSeq.zipWithIndex
      .find(_._1 != null)
      // NOTE: First non-null value will be a wrapper (converted into Row), bearing a single value
      .map { case (value, ord) => (value.asInstanceOf[Row].get(0), ord) }
      .getOrElse((null, -1))

  private def deserialize(value: Any, dataType: DataType): Any = {
    dataType match {
      // NOTE: Since we can't rely on Avro's "date", and "timestamp-micros" logical-types, we're
      //       manually encoding corresponding values as int and long w/in the Column Stats Index and
      //       here we have to decode those back into corresponding logical representation.
      case TimestampType => DateTimeUtils.toJavaTimestamp(value.asInstanceOf[Long])
      case DateType => DateTimeUtils.toJavaDate(value.asInstanceOf[Int])

      // NOTE: All integral types of size less than Int are encoded as Ints in MT
      case ShortType => value.asInstanceOf[Int].toShort
      case ByteType => value.asInstanceOf[Int].toByte

      case _ => value
    }
  }
}
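
// For illustration only: a sketch of how the transposed index and the column-name helpers above might
// be used to prune files for a predicate on column "A" (assumes `transposedDF` was produced by
// `transposeColumnStatsIndex` and that column "A" is indexed):
//
//   import org.apache.spark.sql.functions.col
//   import org.apache.hudi.ColumnStatsIndexSupport.{getMinColumnNameFor, getMaxColumnNameFor}
//
//   // keep only the files whose [min, max] range for "A" may contain the literal 5
//   val candidateFilesDF = transposedDF.where(
//     col(getMinColumnNameFor("A")) <= 5 && col(getMaxColumnNameFor("A")) >= 5)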



