org.apache.spark.sql.hudi.DataSkippingUtils.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hudi-spark3.5-bundle_2.13 Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hudi

import org.apache.hudi.ColumnStatsIndexSupport.{getMaxColumnNameFor, getMinColumnNameFor, getNullCountColumnNameFor, getValueCountColumnNameFor}
import org.apache.hudi.SparkAdapterSupport
import org.apache.hudi.common.util.ValidationUtils.checkState
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, HoodieCatalystExpressionUtils}
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, EqualNullSafe, EqualTo, Expression, ExtractValue, GetStructField, GreaterThan, GreaterThanOrEqual, In, InSet, IsNotNull, IsNull, LessThan, LessThanOrEqual, Literal, Not, Or, StartsWith, SubqueryExpression}
import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.hudi.ColumnStatsExpressionUtils._
import org.apache.spark.sql.types.StructType
import org.apache.spark.unsafe.types.UTF8String

object DataSkippingUtils extends Logging {

  /**
   * Translates the provided `dataTableFilterExpr` into the corresponding filter expression for the
   * column-stats index table, so that only candidate files which may hold records matching the original
   * filter are retained.
   * In case the column stats were created using an expression index, the index filter must also account
   * for the expression.
   *
   * @param dataTableFilterExpr source table's query's filter expression
   * @param indexSchema         index table schema
   * @param isExpressionIndex   whether the index is an expression index
   * @return filter for column-stats index's table; sub-expressions that can not be translated are replaced
   *         with `TrueLiteral` (ie they do not prune any file)
   * @throws AnalysisException re-thrown (after being logged at debug level) if raised during translation
   */
  def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, indexSchema: StructType, isExpressionIndex: Boolean = false): Expression = {
    try {
      createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, indexSchema, isExpressionIndex)
    } catch {
      case e: AnalysisException =>
        logDebug(s"Failed to translated provided data table filter expr into column stats one ($dataTableFilterExpr)", e)
        throw e
    }
  }

  /**
   * Tries to transform the source table's filter expression into a column-stats index filter expression.
   *
   * NOTE: In case the source filter expression can't be transformed, we fall back to `TrueLiteral`,
   *       to essentially avoid pruning any indexed files from scanning.
   */
  private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr: Expression, indexSchema: StructType, isExpressionIndex: Boolean = false): Expression =
    tryComposeIndexFilterExpr(dataTableFilterExpr, indexSchema, isExpressionIndex).getOrElse(TrueLiteral)

  /**
   * Resolves the name of the indexed column `attrRef` refers to and pairs it with a builder that
   * re-targets `sourceExpr` from `attrRef` onto an arbitrary (index-table) expression.
   *
   * @return `None` if the column is not covered by the index schema
   */
  private def resolveIndexedTarget(sourceExpr: Expression,
                                   attrRef: AttributeReference,
                                   indexSchema: StructType): Option[(String, Expression => Expression)] =
    getTargetIndexedColumnName(attrRef, indexSchema).map { colName =>
      // NOTE: Since we're supporting (almost) arbitrary expressions of the form `f(colA) = B`, we have to
      //       appropriately translate such original expression targeted at Data Table, to corresponding
      //       expression targeted at Column Stats Index Table. For that, we take original expression holding
      //       [[AttributeReference]] referring to the Data Table, and swap it w/ expression referring to
      //       corresponding column in the Column Stats Index
      val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _)
      (colName, targetExprBuilder)
    }

  /**
   * Attempts to translate `sourceFilterExpr` (targeting the Data Table) into an expression targeting the
   * Column Stats Index.
   *
   * @return `Some(expr)` holding the translated filter, or `None` if the expression is not supported
   *         (in which case the caller must not prune any file based on it)
   */
  private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression, indexSchema: StructType, isExpressionIndex: Boolean = false): Option[Expression] = {
    //
    // For translation of the Filter Expression for the Data Table into Filter Expression for Column Stats Index, we're
    // assuming that
    //    - The column A is queried in the Data Table (hereafter referred to as "colA")
    //    - Filter Expression is a relational expression (ie "=", "<", "<=", ...) of the following form
    //
    //      ```transform_expr(colA) = value_expr```
    //
    //      Where
    //        - "transform_expr" is an expression of the _transformation_ which preserves ordering of the "colA"
    //        - "value_expr" is a "value"-expression (ie one NOT referring to other attributes/columns or containing sub-queries)
    //
    // We translate original Filter Expr into the one querying Column Stats Index like following: let's consider
    // equality Filter Expr referred to above:
    //
    //   ```transform_expr(colA) = value_expr```
    //
    // This expression will be translated into following Filter Expression for the Column Stats Index:
    //
    //   ```(transform_expr(colA_minValue) <= value_expr) AND (value_expr <= transform_expr(colA_maxValue))```
    //
    // Which will enable us to match files with the range of values in column A containing the target ```value_expr```
    //
    // NOTE: That we can apply ```transform_expr``` transformation precisely b/c it preserves the ordering of the
    //       values of the source column, ie following holds true:
    //
    //       colA_minValue = min(colA)  =>  transform_expr(colA_minValue) = min(transform_expr(colA))
    //       colA_maxValue = max(colA)  =>  transform_expr(colA_maxValue) = max(transform_expr(colA))
    //
    sourceFilterExpr match {
      // If Expression is not resolved, we can't perform the analysis accurately, bailing
      // (unless we're translating for an expression index)
      case expr if !expr.resolved && !isExpressionIndex => None

      // Filter "expr(colA) = B" and "B = expr(colA)"
      // Translates to "(expr(colA_minValue) <= B) AND (B <= expr(colA_maxValue))" condition for index lookup
      case EqualTo(sourceExpr @ AllowedTransformationExpression(attrRef), valueExpr: Expression) if isValueExpression(valueExpr) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          genColumnValuesEqualToExpression(colName, valueExpr, targetExprBuilder)
        }

      case EqualTo(valueExpr: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(valueExpr) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          genColumnValuesEqualToExpression(colName, valueExpr, targetExprBuilder)
        }

      // Filter "expr(colA) != B" and "B != expr(colA)"
      // Translates to "NOT(expr(colA_minValue) = B AND expr(colA_maxValue) = B)"
      // NOTE: This is NOT an inversion of `colA = b`, instead this filter ONLY excludes files for which `colA = B`
      //       holds true for every record
      case Not(EqualTo(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression)) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          Not(genColumnOnlyValuesEqualToExpression(colName, value, targetExprBuilder))
        }

      case Not(EqualTo(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef))) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          Not(genColumnOnlyValuesEqualToExpression(colName, value, targetExprBuilder))
        }

      // Filter "colA <=> null" (null-safe equality against NULL)
      // Translates to "colA_nullCount > 0" for index lookup
      // NOTE: Null-safe equality against NULL holds for exactly the records where colA is null, ie it is
      //       equivalent to "colA is null". Comparing the null-count w/ NULL using plain equality would
      //       evaluate to NULL for every index row and therefore (incorrectly) prune every file.
      case EqualNullSafe(attrRef: AttributeReference, Literal(null, _)) =>
        getTargetIndexedColumnName(attrRef, indexSchema)
          .map(colName => GreaterThan(genColNumNullsExpr(colName), Literal(0)))

      // Filter "expr(colA) < B" and "B > expr(colA)"
      // Translates to "expr(colA_minValue) < B" for index lookup
      case LessThan(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          LessThan(targetExprBuilder(genColMinValueExpr(colName)), value)
        }

      case GreaterThan(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          LessThan(targetExprBuilder(genColMinValueExpr(colName)), value)
        }

      // Filter "B < expr(colA)" and "expr(colA) > B"
      // Translates to "expr(colA_maxValue) > B" for index lookup
      case LessThan(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          GreaterThan(targetExprBuilder(genColMaxValueExpr(colName)), value)
        }

      case GreaterThan(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          GreaterThan(targetExprBuilder(genColMaxValueExpr(colName)), value)
        }

      // Filter "expr(colA) <= B" and "B >= expr(colA)"
      // Translates to "expr(colA_minValue) <= B" for index lookup
      case LessThanOrEqual(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          LessThanOrEqual(targetExprBuilder(genColMinValueExpr(colName)), value)
        }

      case GreaterThanOrEqual(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          LessThanOrEqual(targetExprBuilder(genColMinValueExpr(colName)), value)
        }

      // Filter "B <= expr(colA)" and "expr(colA) >= B"
      // Translates to "expr(colA_maxValue) >= B" for index lookup
      case LessThanOrEqual(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          GreaterThanOrEqual(targetExprBuilder(genColMaxValueExpr(colName)), value)
        }

      case GreaterThanOrEqual(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          GreaterThanOrEqual(targetExprBuilder(genColMaxValueExpr(colName)), value)
        }

      // Filter "colA is null"
      // Translates to "colA_nullCount > 0" for index lookup
      case IsNull(attribute: AttributeReference) =>
        getTargetIndexedColumnName(attribute, indexSchema)
          .map(colName => GreaterThan(genColNumNullsExpr(colName), Literal(0)))

      // Filter "colA is not null"
      // Translates to "colA_nullCount is null or valueCount is null or colA_nullCount < valueCount" for index lookup
      // "colA_nullCount is null or valueCount is null" means we are not certain whether the column is null or not,
      // hence we return True to ensure this does not affect the query.
      case IsNotNull(attribute: AttributeReference) =>
        getTargetIndexedColumnName(attribute, indexSchema)
          .map { colName =>
            val numNullExpr = genColNumNullsExpr(colName)
            val valueCountExpr = genColValueCountExpr
            Or(Or(IsNull(numNullExpr), IsNull(valueCountExpr)), LessThan(numNullExpr, valueCountExpr))
          }

      // Filter "expr(colA) in (B1, B2, ...)"
      // Translates to "(colA_minValue <= B1 AND colA_maxValue >= B1) OR (colA_minValue <= B2 AND colA_maxValue >= B2) ... "
      // for index lookup
      // NOTE: This is equivalent to "colA = B1 OR colA = B2 OR ..."
      // NOTE: An empty list yields `None` (ie no pruning) rather than failing on reducing an empty collection
      case In(sourceExpr @ AllowedTransformationExpression(attrRef), list: Seq[Expression]) if list.forall(isValueExpression) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).flatMap { case (colName, targetExprBuilder) =>
          list.map(lit => genColumnValuesEqualToExpression(colName, lit, targetExprBuilder)).reduceOption(Or)
        }

      // Filter "expr(colA) in (B1, B2, ...)"
      // NOTE: [[InSet]] is an optimized version of the [[In]] expression, where every sub-expression w/in the
      //       set is a static literal
      // NOTE: An empty set yields `None` (ie no pruning) rather than failing on reducing an empty collection
      case InSet(sourceExpr @ AllowedTransformationExpression(attrRef), hset: Set[Any]) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).flatMap { case (colName, targetExprBuilder) =>
          hset.map { value =>
            // NOTE: [[Literal]] has a gap where it could hold [[UTF8String]], but [[Literal#apply]] doesn't
            //       accept [[UTF8String]]. As such we have to handle it separately
            val lit = value match {
              case str: UTF8String => Literal(str.toString)
              case _ => Literal(value)
            }
            genColumnValuesEqualToExpression(colName, lit, targetExprBuilder)
          }.reduceOption(Or)
        }

      // Filter "expr(colA) not in (B1, B2, ...)"
      // Translates to "NOT((colA_minValue = B1 AND colA_maxValue = B1) OR (colA_minValue = B2 AND colA_maxValue = B2))" for index lookup
      // NOTE: This is NOT an inversion of `in (B1, B2, ...)` expr, this is equivalent to "colA != B1 AND colA != B2 AND ..."
      // NOTE: An empty list yields `None` (ie no pruning) rather than failing on reducing an empty collection
      case Not(In(sourceExpr @ AllowedTransformationExpression(attrRef), list: Seq[Expression])) if list.forall(_.foldable) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).flatMap { case (colName, targetExprBuilder) =>
          list.map(lit => genColumnOnlyValuesEqualToExpression(colName, lit, targetExprBuilder))
            .reduceOption(Or)
            .map(onlyListedValues => Not(onlyListedValues))
        }

      // Filter "colA like 'xxx%'"
      // Translates to "colA_maxValue >= xxx AND (colA_minValue <= xxx OR colA_minValue like 'xxx%')" for index lookup
      //
      // NOTE: Given that this column is ordered lexicographically, all strings sharing the prefix "xxx" form a
      //       contiguous range starting at "xxx" itself. A file may hold such a string iff that range overlaps
      //       [colA_minValue, colA_maxValue], ie
      //         - colA_maxValue is not below the start of the range (colA_maxValue >= xxx), and
      //         - colA_minValue is not past the end of the range (colA_minValue <= xxx, or colA_minValue
      //           itself starts w/ "xxx")
      //       Checking only "colA_minValue <= xxx" would wrongly prune files whose min value already starts
      //       w/ the prefix (for ex, min = "xxxa")
      case StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), prefix @ Literal(_: UTF8String, _)) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          val minValueExpr = targetExprBuilder(genColMinValueExpr(colName))
          val maxValueExpr = targetExprBuilder(genColMaxValueExpr(colName))
          And(
            Or(LessThanOrEqual(minValueExpr, prefix), StartsWith(minValueExpr, prefix)),
            GreaterThanOrEqual(maxValueExpr, prefix))
        }

      // Filter "expr(colA) not like 'xxx%'"
      // Translates to "NOT(expr(colA_minValue) like 'xxx%' AND expr(colA_maxValue) like 'xxx%')" for index lookup
      // NOTE: This is NOT an inversion of "colA like xxx"
      case Not(StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), value @ Literal(_: UTF8String, _))) =>
        resolveIndexedTarget(sourceExpr, attrRef, indexSchema).map { case (colName, targetExprBuilder) =>
          val minValueExpr = targetExprBuilder(genColMinValueExpr(colName))
          val maxValueExpr = targetExprBuilder(genColMaxValueExpr(colName))
          Not(And(StartsWith(minValueExpr, value), StartsWith(maxValueExpr, value)))
        }

      // NOTE: `isExpressionIndex` has to be propagated into the nested translations, otherwise unresolved
      //       operands of an expression-index filter would be rejected as soon as they're nested under AND/OR
      case or: Or =>
        val resLeft = createColumnStatsIndexFilterExprInternal(or.left, indexSchema, isExpressionIndex)
        val resRight = createColumnStatsIndexFilterExprInternal(or.right, indexSchema, isExpressionIndex)

        Option(Or(resLeft, resRight))

      case and: And =>
        val resLeft = createColumnStatsIndexFilterExprInternal(and.left, indexSchema, isExpressionIndex)
        val resRight = createColumnStatsIndexFilterExprInternal(and.right, indexSchema, isExpressionIndex)

        Option(And(resLeft, resRight))

      //
      // Pushing Logical NOT inside the AND/OR expressions
      // NOTE: This is required to make sure we're properly handling negations in
      //       cases like {@code NOT(colA = 0)}, {@code NOT(colA in (a, b, ...)}
      //

      case Not(And(left: Expression, right: Expression)) =>
        Option(createColumnStatsIndexFilterExprInternal(Or(Not(left), Not(right)), indexSchema, isExpressionIndex))

      case Not(Or(left: Expression, right: Expression)) =>
        Option(createColumnStatsIndexFilterExprInternal(And(Not(left), Not(right)), indexSchema, isExpressionIndex))

      // Anything else is not supported
      case _: Expression => None
    }
  }

  /**
   * Checks whether the index schema holds the min-value, max-value and null-count stats columns for the
   * column `colName`.
   */
  private def checkColIsIndexed(colName: String, indexSchema: StructType): Boolean = {
    val requiredStatsColumns = Set(
      getMinColumnNameFor(colName),
      getMaxColumnNameFor(colName),
      getNullCountColumnNameFor(colName)
    )
    requiredStatsColumns.forall(stat => indexSchema.exists(_.name == stat))
  }

  /**
   * Derives the name of the column referenced by `resolvedExpr`.
   *
   * @return `Some(name)` if the column is indexed (see [[checkColIsIndexed]]), `None` otherwise
   */
  private def getTargetIndexedColumnName(resolvedExpr: AttributeReference, indexSchema: StructType): Option[String] = {
    val colName = UnresolvedAttribute(getTargetColNameParts(resolvedExpr)).name

    // Verify that the column is indexed
    if (checkColIsIndexed(colName, indexSchema)) Some(colName) else None
  }

  /**
   * Converts a column reference into the sequence of its name parts, descending through aliases and
   * struct-field accesses.
   *
   * @throws AnalysisException if the expression is any other kind of expression
   */
  private def getTargetColNameParts(resolvedTargetCol: Expression): Seq[String] = {
    resolvedTargetCol match {
      case attr: Attribute => Seq(attr.name)
      case Alias(c, _) => getTargetColNameParts(c)
      case GetStructField(c, _, Some(name)) => getTargetColNameParts(c) :+ name
      case ex: ExtractValue =>
        throw new AnalysisException(s"convert reference to name failed, Updating nested fields is only supported for StructType: ${ex}.")
      case other =>
        throw new AnalysisException(s"convert reference to name failed,  Found unsupported expression ${other}")
    }
  }
}

object ColumnStatsExpressionUtils {

  /** Expression referring to the min-value stats column of `colName` in the Column Stats Index. */
  @inline def genColMinValueExpr(colName: String): Expression = {
    val minColumn = getMinColumnNameFor(colName)
    col(minColumn).expr
  }

  /** Expression referring to the max-value stats column of `colName` in the Column Stats Index. */
  @inline def genColMaxValueExpr(colName: String): Expression = {
    val maxColumn = getMaxColumnNameFor(colName)
    col(maxColumn).expr
  }

  /** Expression referring to the null-count stats column of `colName` in the Column Stats Index. */
  @inline def genColNumNullsExpr(colName: String): Expression = {
    val nullCountColumn = getNullCountColumnNameFor(colName)
    col(nullCountColumn).expr
  }

  /** Expression referring to the value-count column of the Column Stats Index. */
  @inline def genColValueCountExpr: Expression = {
    val valueCountColumn = getValueCountColumnNameFor
    col(valueCountColumn).expr
  }

  /**
   * Builds the predicate matching index rows whose column range may contain `value`:
   * `min(C) <= value AND max(C) >= value`.
   *
   * @param colName           name of the (data table) column
   * @param value             expression the column is compared against
   * @param targetExprBuilder transformation applied to the min/max column expressions before comparison
   */
  @inline def genColumnValuesEqualToExpression(colName: String,
                                               value: Expression,
                                               targetExprBuilder: Function[Expression, Expression] = Predef.identity): Expression = {
    val lowerBound = targetExprBuilder(genColMinValueExpr(colName))
    val upperBound = targetExprBuilder(genColMaxValueExpr(colName))
    // Column C can hold value V only if V lies w/in [min(C), max(C)]
    val notBelowMin = LessThanOrEqual(lowerBound, value)
    val notAboveMax = GreaterThanOrEqual(upperBound, value)
    And(notBelowMin, notAboveMax)
  }

  /**
   * Builds the predicate matching index rows whose column holds _only_ `value`:
   * `min(C) = value AND max(C) = value`.
   *
   * @param colName           name of the (data table) column
   * @param value             expression the column is compared against
   * @param targetExprBuilder transformation applied to the min/max column expressions before comparison
   */
  def genColumnOnlyValuesEqualToExpression(colName: String,
                                           value: Expression,
                                           targetExprBuilder: Function[Expression, Expression] = Predef.identity): Expression = {
    val lowerBound = targetExprBuilder(genColMinValueExpr(colName))
    val upperBound = targetExprBuilder(genColMaxValueExpr(colName))
    // Column C holds nothing but V only if both of its bounds collapse onto V
    val minIsValue = EqualTo(lowerBound, value)
    val maxIsValue = EqualTo(upperBound, value)
    And(minIsValue, maxIsValue)
  }

  /**
   * Returns a copy of `sourceExpr` in which every [[AttributeReference]] that is the same reference as
   * `from` is substituted w/ `to`.
   *
   * NOTE: `sourceExpr` is required to reference exactly one attribute; the state check fails otherwise.
   */
  def swapAttributeRefInExpr(sourceExpr: Expression, from: AttributeReference, to: Expression): Expression = {
    checkState(sourceExpr.references.size == 1)
    val substituteMatchingRef: PartialFunction[Expression, Expression] = {
      case candidate: AttributeReference if candidate.sameRef(from) => to
    }
    sourceExpr.transformDown(substituteMatchingRef)
  }

  /**
   * This check is used to validate that the expression the target column is compared against
   *
   *  - Has no references to other attributes (for ex, columns)
   *  - Does not contain sub-queries
   *
   * This in turn allows us to be certain that Spark will be able to evaluate such expression
   * against Column Stats Index as well
   */
  def isValueExpression(expr: Expression): Boolean =
    !(expr.references.nonEmpty || SubqueryExpression.hasSubquery(expr))

  /**
   * This utility pattern-matches an expression iff
   *
   *  - It references *exactly* 1 attribute (column)
   *  - It does NOT contain sub-queries
   *  - It contains only whitelisted transformations that preserve ordering of the source column [1]
   *
   * [1] This is required to make sure that we can correspondingly map Column Stats Index values as well. Applying
   * transformations that do not preserve the ordering might lead to incorrect results being returned by Data
   * Skipping flow.
   *
   * Returns only the [[AttributeReference]] contained as a sub-expression
   */
  object AllowedTransformationExpression extends SparkAdapterSupport {
    val exprUtils: HoodieCatalystExpressionUtils = sparkAdapter.getCatalystExpressionUtils

    def unapply(expr: Expression): Option[AttributeReference] = {
      // First step: the expression has to be free of sub-queries and refer to exactly 1 attribute
      val isCandidate = !SubqueryExpression.hasSubquery(expr) && expr.references.size == 1
      if (isCandidate) {
        // Second step: the expression has to be an actually permitted transformation, which is decided
        // by the expression utils
        // NOTE: That transformation composition is permitted
        exprUtils.tryMatchAttributeOrderingPreservingTransformation(expr)
      } else {
        None
      }
    }
  }
}