/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi

import org.apache.hudi.HoodieFileIndex.DataSkippingFailureMode
import org.apache.hudi.client.common.HoodieSparkEngineContext
import org.apache.hudi.common.config.HoodieMetadataConfig
import org.apache.hudi.common.model.FileSlice
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata}
import org.apache.hudi.util.JFunction

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.catalyst.expressions.{And, Expression}
import org.apache.spark.sql.hudi.DataSkippingUtils.translateIntoColumnStatsIndexFilterExpr
import org.apache.spark.sql.{Column, DataFrame, SparkSession}

import scala.collection.JavaConverters._
import scala.util.control.NonFatal
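
/**
 * Base class for the Spark-based metadata-table index implementations (e.g. column-stats
 * and record-level index support) that prune the set of candidate files for a query.
 * Subclasses implement the actual pruning in [[computeCandidateFileNames]].
 *
 * A minimal sketch of a subclass, using a hypothetical `NoOpIndexSupport` that never prunes,
 * for illustration only:
 * {{{
 *   class NoOpIndexSupport(spark: SparkSession,
 *                          metadataConfig: HoodieMetadataConfig,
 *                          metaClient: HoodieTableMetaClient)
 *     extends SparkBaseIndexSupport(spark, metadataConfig, metaClient) {
 *     override def getIndexName: String = "NOOP"
 *     override def isIndexAvailable: Boolean = false
 *     override def computeCandidateFileNames(fileIndex: HoodieFileIndex,
 *                                            queryFilters: Seq[Expression],
 *                                            queryReferencedColumns: Seq[String],
 *                                            prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
 *                                            shouldPushDownFilesFilter: Boolean): Option[Set[String]] = None
 *     override def invalidateCaches(): Unit = ()
 *   }
 * }}}
 */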
abstract class SparkBaseIndexSupport(spark: SparkSession,
                                     metadataConfig: HoodieMetadataConfig,
                                     metaClient: HoodieTableMetaClient) {

  @transient protected lazy val engineCtx = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext))
  @transient protected lazy val metadataTable: HoodieTableMetadata =
    HoodieTableMetadata.create(engineCtx, metaClient.getStorage, metadataConfig, metaClient.getBasePath.toString)
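
  /** Returns the name of this index. */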
  def getIndexName: String
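
  /** Returns whether this index is available for the table and can be used to prune files. */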
  def isIndexAvailable: Boolean
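
  /**
   * Wraps [[computeCandidateFileNames]] with the configured [[DataSkippingFailureMode]]:
   * in "fallback" mode any non-fatal error disables data skipping (returns None, i.e. no
   * pruning), while in "strict" mode the error is rethrown.
   */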
  def computeCandidateIsStrict(spark: SparkSession,
                               fileIndex: HoodieFileIndex,
                               queryFilters: Seq[Expression],
                               queryReferencedColumns: Seq[String],
                               prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
                               shouldPushDownFilesFilter: Boolean): Option[Set[String]] = {
    try {
      computeCandidateFileNames(fileIndex, queryFilters, queryReferencedColumns, prunedPartitionsAndFileSlices, shouldPushDownFilesFilter)
    } catch {
      case NonFatal(e) =>
        spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
          case DataSkippingFailureMode.Fallback.value => Option.empty
          case DataSkippingFailureMode.Strict.value => throw e
        }
    }
  }
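
  /**
   * Computes the set of candidate file names that may match the given query filters, based on
   * this index. Returning None indicates the index could not be used to prune for this query,
   * so all files remain candidates.
   */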
  def computeCandidateFileNames(fileIndex: HoodieFileIndex,
                                queryFilters: Seq[Expression],
                                queryReferencedColumns: Seq[String],
                                prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
                                shouldPushDownFilesFilter: Boolean): Option[Set[String]]
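
  /** Invalidates any caches maintained by this index support implementation. */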
  def invalidateCaches(): Unit
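
  /**
   * Extracts the file names (base files, and log files when `includeLogFiles` is set) from the
   * already partition-pruned file slices.
   */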
  protected def getPrunedFileNames(prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
                                   includeLogFiles: Boolean = false): Set[String] = {
    prunedPartitionsAndFileSlices
      .flatMap {
        case (_, fileSlices) => fileSlices
      }
      .flatMap { fileSlice =>
        // A file slice may or may not have a base file (e.g. a log-only slice on a MOR table)
        val baseFileOption = Option(fileSlice.getBaseFile.orElse(null))
        val logFiles = if (includeLogFiles) {
          fileSlice.getLogFiles.iterator().asScala.map(_.getFileName).toList
        } else Nil
        baseFileOption.map(_.getFileName).toList ++ logFiles
      }
      .toSet
  }
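
  /**
   * Prunes the given file names down to the candidates that may match the query filters,
   * according to the column-stats entries in `indexDf`; files with no entry in the index are
   * conservatively kept as candidates.
   */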
  protected def getCandidateFiles(indexDf: DataFrame, queryFilters: Seq[Expression], prunedFileNames: Set[String]): Set[String] = {
    val indexSchema = indexDf.schema
    val indexFilter = queryFilters.map(translateIntoColumnStatsIndexFilterExpr(_, indexSchema)).reduce(And)
    val prunedCandidateFileNames =
      indexDf.where(new Column(indexFilter))
        .select(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME)
        .collect()
        .map(_.getString(0))
        .toSet

    // NOTE: The Column Stats Index isn't guaranteed to have a complete set of statistics for
    //       every base file or log file: it's bound to clustering, which could occur
    //       asynchronously at an arbitrary point in time, and is not likely to touch all of
    //       the base files.
    //
    //       To close that gap, we manually compute the difference b/w all files indexed by the
    //       col-stats index and all outstanding base files or log files, and make sure that any
    //       base or log file not represented w/in the index is included in the output of this
    //       method
    val allIndexedFileNames =
      indexDf.select(HoodieMetadataPayload.COLUMN_STATS_FIELD_FILE_NAME)
        .collect()
        .map(_.getString(0))
        .toSet
    val notIndexedFileNames = prunedFileNames -- allIndexedFileNames

    prunedCandidateFileNames ++ notIndexedFileNames
  }

  /**
   * Determines whether it is more optimal to read the Column Stats Index a) in memory within
   * the invoking process, or b) on-cluster via Spark's [[Dataset]] and [[RDD]] APIs.
   */
  protected def shouldReadInMemory(fileIndex: HoodieFileIndex, queryReferencedColumns: Seq[String], inMemoryProjectionThreshold: Integer): Boolean = {
    // NOTE: Since executing on-cluster via the Spark APIs carries its own non-trivial overhead,
    //       it's most often preferable to fetch the index w/in the same process (usually the
    //       driver), w/o resorting to on-cluster execution.
    //       For that we use a simple heuristic to determine whether we should read and process
    //       the index in-memory or on-cluster: the total number of rows of the expected projected
    //       portion of the index has to be below the threshold (of 100k records)
    Option(metadataConfig.getColumnStatsIndexProcessingModeOverride) match {
      case Some(mode) =>
        // An explicit processing-mode override takes precedence over the heuristic
        mode == HoodieMetadataConfig.COLUMN_STATS_INDEX_PROCESSING_MODE_IN_MEMORY
      case None =>
        fileIndex.getFileSlicesCount * queryReferencedColumns.length < inMemoryProjectionThreshold
    }
  }

  /**
   * Given the query filters, extracts the EqualTo and IN queries on simple record-key columns
   * and returns a tuple of (list of such queries, list of record-key literals present in the
   * query). If the record index is not available, returns empty lists for both.
   *
   * @param queryFilters The queries that need to be filtered.
   * @return Tuple of the list of filtered queries and the list of record-key literals that need
   *         to be matched
   */
  protected def filterQueriesWithRecordKey(queryFilters: Seq[Expression]): (List[Expression], List[String]) = {
    if (!isIndexAvailable) {
      (List.empty, List.empty)
    } else {
      var recordKeyQueries: List[Expression] = List.empty
      var recordKeys: List[String] = List.empty
      // The record-key configuration is per-table and does not change across filters,
      // so it only needs to be resolved once
      val recordKeyOpt = getRecordKeyConfig
      for (query <- queryFilters) {
        RecordLevelIndexSupport.filterQueryWithRecordKey(query, recordKeyOpt).foreach {
          case (exp: Expression, recKeys: List[String]) =>
            recordKeys = recordKeys ++ recKeys
            recordKeyQueries = recordKeyQueries :+ exp
        }
      }

      (recordKeyQueries, recordKeys)
    }
  }

  /**
   * Returns the configured record-key field for the table if it is a simple (single-field)
   * record key; otherwise returns an empty Option.
   */
  private def getRecordKeyConfig: Option[String] = {
    val recordKeysOpt: org.apache.hudi.common.util.Option[Array[String]] = metaClient.getTableConfig.getRecordKeyFields
    // Only a single-field (simple) record key qualifies; composite keys are mapped to null,
    // which the final Option(...) turns into None
    val recordKeyOpt = recordKeysOpt.map[String](JFunction.toJavaFunction[Array[String], String](arr =>
      if (arr.length == 1) {
        arr(0)
      } else {
        null
      }))
    Option(recordKeyOpt.orElse(null))
  }
}