/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql
import scala.collection.mutable
import org.apache.spark.annotation.{DeveloperApi, Experimental, Unstable}
import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TableFunctionRegistry}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
import org.apache.spark.sql.catalyst.analysis.TableFunctionRegistry.TableFunctionBuilder
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{ColumnarRule, SparkPlan}
/**
* :: Experimental ::
 * Holder for injection points to the [[SparkSession]]. We make NO guarantee about the binary
 * and source compatibility of the methods here.
*
 * This currently provides the following extension points:
 *
* - Analyzer Rules.
* - Check Analysis Rules.
* - Cache Plan Normalization Rules.
* - Optimizer Rules.
* - Pre CBO Rules.
* - Planning Strategies.
* - Customized Parser.
* - (External) Catalog listeners.
* - Columnar Rules.
* - Adaptive Query Post Planner Strategy Rules.
* - Adaptive Query Stage Preparation Rules.
* - Adaptive Query Execution Runtime Optimizer Rules.
* - Adaptive Query Stage Optimizer Rules.
 *
* The extensions can be used by calling `withExtensions` on the [[SparkSession.Builder]], for
* example:
* {{{
* SparkSession.builder()
* .master("...")
* .config("...", true)
* .withExtensions { extensions =>
* extensions.injectResolutionRule { session =>
* ...
* }
* extensions.injectParser { (session, parser) =>
* ...
* }
* }
* .getOrCreate()
* }}}
*
* The extensions can also be used by setting the Spark SQL configuration property
* `spark.sql.extensions`. Multiple extensions can be set using a comma-separated list. For example:
* {{{
* SparkSession.builder()
* .master("...")
* .config("spark.sql.extensions", "org.example.MyExtensions,org.example.YourExtensions")
* .getOrCreate()
*
* class MyExtensions extends Function1[SparkSessionExtensions, Unit] {
* override def apply(extensions: SparkSessionExtensions): Unit = {
* extensions.injectResolutionRule { session =>
* ...
* }
* extensions.injectParser { (session, parser) =>
* ...
* }
* }
* }
*
* class YourExtensions extends SparkSessionExtensionsProvider {
* override def apply(extensions: SparkSessionExtensions): Unit = {
* extensions.injectResolutionRule { session =>
* ...
* }
* extensions.injectFunction(...)
* }
* }
* }}}
*
 * Note that none of the injected builders should assume that the [[SparkSession]] is fully
 * initialized, and they should not touch the session's internals (e.g. the SessionState).
*/
@DeveloperApi
@Experimental
@Unstable
class SparkSessionExtensions {
type RuleBuilder = SparkSession => Rule[LogicalPlan]
type CheckRuleBuilder = SparkSession => LogicalPlan => Unit
type StrategyBuilder = SparkSession => Strategy
type ParserBuilder = (SparkSession, ParserInterface) => ParserInterface
type FunctionDescription = (FunctionIdentifier, ExpressionInfo, FunctionBuilder)
type TableFunctionDescription = (FunctionIdentifier, ExpressionInfo, TableFunctionBuilder)
type ColumnarRuleBuilder = SparkSession => ColumnarRule
type QueryPostPlannerStrategyBuilder = SparkSession => Rule[SparkPlan]
type QueryStagePrepRuleBuilder = SparkSession => Rule[SparkPlan]
type QueryStageOptimizerRuleBuilder = SparkSession => Rule[SparkPlan]
private[this] val columnarRuleBuilders = mutable.Buffer.empty[ColumnarRuleBuilder]
private[this] val queryPostPlannerStrategyRuleBuilders =
mutable.Buffer.empty[QueryPostPlannerStrategyBuilder]
private[this] val queryStagePrepRuleBuilders = mutable.Buffer.empty[QueryStagePrepRuleBuilder]
private[this] val runtimeOptimizerRules = mutable.Buffer.empty[RuleBuilder]
private[this] val queryStageOptimizerRuleBuilders =
mutable.Buffer.empty[QueryStageOptimizerRuleBuilder]
/**
* Build the override rules for columnar execution.
*/
private[sql] def buildColumnarRules(session: SparkSession): Seq[ColumnarRule] = {
columnarRuleBuilders.map(_.apply(session)).toSeq
}
/**
* Build the override rules for the query post planner strategy phase of adaptive query execution.
*/
private[sql] def buildQueryPostPlannerStrategyRules(
session: SparkSession): Seq[Rule[SparkPlan]] = {
queryPostPlannerStrategyRuleBuilders.map(_.apply(session)).toSeq
}
/**
* Build the override rules for the query stage preparation phase of adaptive query execution.
*/
private[sql] def buildQueryStagePrepRules(session: SparkSession): Seq[Rule[SparkPlan]] = {
queryStagePrepRuleBuilders.map(_.apply(session)).toSeq
}
/**
* Build the override rules for the optimizer of adaptive query execution.
*/
private[sql] def buildRuntimeOptimizerRules(session: SparkSession): Seq[Rule[LogicalPlan]] = {
runtimeOptimizerRules.map(_.apply(session)).toSeq
}
/**
* Build the override rules for the query stage optimizer phase of adaptive query execution.
*/
private[sql] def buildQueryStageOptimizerRules(session: SparkSession): Seq[Rule[SparkPlan]] = {
queryStageOptimizerRuleBuilders.map(_.apply(session)).toSeq
}
/**
* Inject a rule that can override the columnar execution of an executor.
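 *
 * For example (a minimal sketch; `MyColumnarRule` is a hypothetical user-defined
 * [[org.apache.spark.sql.execution.ColumnarRule]] that swaps supported operators for
 * columnar implementations):
 * {{{
 *   extensions.injectColumnar { session =>
 *     MyColumnarRule(session)
 *   }
 * }}}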
*/
def injectColumnar(builder: ColumnarRuleBuilder): Unit = {
columnarRuleBuilders += builder
}
/**
 * Inject a rule that is applied between `plannerStrategy` and `queryStagePrepRules`, so
 * it can see the whole physical plan before exchanges are injected.
 * Note that these rules can only be applied within AQE.
*/
def injectQueryPostPlannerStrategyRule(builder: QueryPostPlannerStrategyBuilder): Unit = {
queryPostPlannerStrategyRuleBuilders += builder
}
/**
* Inject a rule that can override the query stage preparation phase of adaptive query
* execution.
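 *
 * For example (a sketch; `EnsureMyRequirements` is a hypothetical `Rule[SparkPlan]` that
 * adjusts the physical plan before query stages are created):
 * {{{
 *   extensions.injectQueryStagePrepRule { session =>
 *     EnsureMyRequirements(session)
 *   }
 * }}}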
*/
def injectQueryStagePrepRule(builder: QueryStagePrepRuleBuilder): Unit = {
queryStagePrepRuleBuilders += builder
}
/**
* Inject a runtime `Rule` builder into the [[SparkSession]].
* The injected rules will be executed after built-in
* [[org.apache.spark.sql.execution.adaptive.AQEOptimizer]] rules are applied.
 * A runtime optimizer rule is used to improve the quality of a logical plan during execution,
 * and it can leverage accurate statistics from shuffles.
 *
 * Note that it has no effect if adaptive query execution is disabled.
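 *
 * For example (a minimal sketch; the anonymous rule below is a no-op placeholder where a
 * real rule would inspect runtime shuffle statistics and rewrite the plan):
 * {{{
 *   extensions.injectRuntimeOptimizerRule { session =>
 *     new Rule[LogicalPlan] {
 *       override def apply(plan: LogicalPlan): LogicalPlan = plan
 *     }
 *   }
 * }}}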
*/
def injectRuntimeOptimizerRule(builder: RuleBuilder): Unit = {
runtimeOptimizerRules += builder
}
/**
* Inject a rule that can override the query stage optimizer phase of adaptive query
* execution.
*/
def injectQueryStageOptimizerRule(builder: QueryStageOptimizerRuleBuilder): Unit = {
queryStageOptimizerRuleBuilders += builder
}
private[this] val resolutionRuleBuilders = mutable.Buffer.empty[RuleBuilder]
/**
* Build the analyzer resolution `Rule`s using the given [[SparkSession]].
*/
private[sql] def buildResolutionRules(session: SparkSession): Seq[Rule[LogicalPlan]] = {
resolutionRuleBuilders.map(_.apply(session)).toSeq
}
/**
* Inject an analyzer resolution `Rule` builder into the [[SparkSession]]. These analyzer
* rules will be executed as part of the resolution phase of analysis.
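 *
 * For example (a minimal sketch of a rule that only logs the plan being resolved; `Rule`
 * already mixes in Spark's internal `Logging`):
 * {{{
 *   case class LogResolution(session: SparkSession) extends Rule[LogicalPlan] {
 *     override def apply(plan: LogicalPlan): LogicalPlan = {
 *       logInfo(s"Resolving:\n$plan")
 *       plan
 *     }
 *   }
 *
 *   extensions.injectResolutionRule { session => LogResolution(session) }
 * }}}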
*/
def injectResolutionRule(builder: RuleBuilder): Unit = {
resolutionRuleBuilders += builder
}
private[this] val postHocResolutionRuleBuilders = mutable.Buffer.empty[RuleBuilder]
/**
* Build the analyzer post-hoc resolution `Rule`s using the given [[SparkSession]].
*/
private[sql] def buildPostHocResolutionRules(session: SparkSession): Seq[Rule[LogicalPlan]] = {
postHocResolutionRuleBuilders.map(_.apply(session)).toSeq
}
/**
* Inject an analyzer `Rule` builder into the [[SparkSession]]. These analyzer
* rules will be executed after resolution.
*/
def injectPostHocResolutionRule(builder: RuleBuilder): Unit = {
postHocResolutionRuleBuilders += builder
}
private[this] val checkRuleBuilders = mutable.Buffer.empty[CheckRuleBuilder]
/**
* Build the check analysis `Rule`s using the given [[SparkSession]].
*/
private[sql] def buildCheckRules(session: SparkSession): Seq[LogicalPlan => Unit] = {
checkRuleBuilders.map(_.apply(session)).toSeq
}
/**
 * Inject a check analysis `Rule` builder into the [[SparkSession]]. The injected rules will
* be executed after the analysis phase. A check analysis rule is used to detect problems with a
* LogicalPlan and should throw an exception when a problem is found.
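 *
 * For example (a sketch; the operator limit and the exception type are illustrative only):
 * {{{
 *   extensions.injectCheckRule { session => plan =>
 *     val numOperators = plan.collect { case n => n }.size
 *     if (numOperators > 10000) {
 *       throw new UnsupportedOperationException(
 *         s"Plan has $numOperators operators, which exceeds the allowed maximum")
 *     }
 *   }
 * }}}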
*/
def injectCheckRule(builder: CheckRuleBuilder): Unit = {
checkRuleBuilders += builder
}
private[this] val planNormalizationRules = mutable.Buffer.empty[RuleBuilder]
/**
 * Build the plan normalization `Rule`s using the given [[SparkSession]].
 */
private[sql] def buildPlanNormalizationRules(session: SparkSession): Seq[Rule[LogicalPlan]] = {
planNormalizationRules.map(_.apply(session)).toSeq
}
/**
* Inject a plan normalization `Rule` builder into the [[SparkSession]]. The injected rules will
* be executed just before query caching decisions are made. Such rules can be used to improve the
* cache hit rate by normalizing different plans to the same form. These rules should never modify
* the result of the LogicalPlan.
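 *
 * For example (a sketch; `CanonicalizeFilterOrder` is a hypothetical rule that reorders
 * commutative filter predicates into a canonical order, so that semantically equal plans
 * can hit the same cache entry):
 * {{{
 *   extensions.injectPlanNormalizationRule { session =>
 *     CanonicalizeFilterOrder(session)
 *   }
 * }}}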
*/
def injectPlanNormalizationRule(builder: RuleBuilder): Unit = {
planNormalizationRules += builder
}
private[this] val optimizerRules = mutable.Buffer.empty[RuleBuilder]
private[sql] def buildOptimizerRules(session: SparkSession): Seq[Rule[LogicalPlan]] = {
optimizerRules.map(_.apply(session)).toSeq
}
/**
* Inject an optimizer `Rule` builder into the [[SparkSession]]. The injected rules will be
* executed during the operator optimization batch. An optimizer rule is used to improve the
* quality of an analyzed logical plan; these rules should never modify the result of the
* LogicalPlan.
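 *
 * For example (a minimal sketch; the match below is an illustrative no-op, and
 * `transformAllExpressions` is the usual hook for expression rewrites):
 * {{{
 *   case class MyExpressionRewrite(session: SparkSession) extends Rule[LogicalPlan] {
 *     override def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
 *       case e => e // rewrite matched expressions here
 *     }
 *   }
 *
 *   extensions.injectOptimizerRule { session => MyExpressionRewrite(session) }
 * }}}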
*/
def injectOptimizerRule(builder: RuleBuilder): Unit = {
optimizerRules += builder
}
private[this] val preCBORules = mutable.Buffer.empty[RuleBuilder]
private[sql] def buildPreCBORules(session: SparkSession): Seq[Rule[LogicalPlan]] = {
preCBORules.map(_.apply(session)).toSeq
}
/**
* Inject an optimizer `Rule` builder that rewrites logical plans into the [[SparkSession]].
* The injected rules will be executed once after the operator optimization batch and
* before any cost-based optimization rules that depend on stats.
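 *
 * For example (a sketch; `RewriteBeforeCBO` is a hypothetical rule that must see the fully
 * optimized plan exactly once, before any stats-based rewrites run):
 * {{{
 *   extensions.injectPreCBORule { session => RewriteBeforeCBO(session) }
 * }}}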
*/
def injectPreCBORule(builder: RuleBuilder): Unit = {
preCBORules += builder
}
private[this] val plannerStrategyBuilders = mutable.Buffer.empty[StrategyBuilder]
private[sql] def buildPlannerStrategies(session: SparkSession): Seq[Strategy] = {
plannerStrategyBuilders.map(_.apply(session)).toSeq
}
/**
* Inject a planner `Strategy` builder into the [[SparkSession]]. The injected strategy will
 * be used to convert a `LogicalPlan` into an executable
* [[org.apache.spark.sql.execution.SparkPlan]].
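 *
 * For example (a minimal sketch; returning `Nil` means the strategy does not apply, and the
 * planner falls through to the built-in strategies):
 * {{{
 *   case class MyStrategy(session: SparkSession) extends Strategy {
 *     override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
 *       // case MyLogicalNode(...) => a physical plan for the custom node
 *       case _ => Nil
 *     }
 *   }
 *
 *   extensions.injectPlannerStrategy { session => MyStrategy(session) }
 * }}}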
*/
def injectPlannerStrategy(builder: StrategyBuilder): Unit = {
plannerStrategyBuilders += builder
}
private[this] val parserBuilders = mutable.Buffer.empty[ParserBuilder]
private[sql] def buildParser(
session: SparkSession,
initial: ParserInterface): ParserInterface = {
parserBuilders.foldLeft(initial) { (parser, builder) =>
builder(session, parser)
}
}
/**
* Inject a custom parser into the [[SparkSession]]. Note that the builder is passed a session
 * and an initial parser. The latter allows a user to create a partial parser and delegate to
 * the underlying parser for completeness. If multiple parsers are injected, they are stacked
 * on top of each other.
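 *
 * For example (a sketch; `MyDdlParser` is a hypothetical `ParserInterface` that handles
 * custom statements in `parsePlan` and forwards every other method to `delegate`):
 * {{{
 *   extensions.injectParser { (session, delegate) =>
 *     new MyDdlParser(session, delegate)
 *   }
 * }}}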
*/
def injectParser(builder: ParserBuilder): Unit = {
parserBuilders += builder
}
private[this] val injectedFunctions = mutable.Buffer.empty[FunctionDescription]
private[this] val injectedTableFunctions = mutable.Buffer.empty[TableFunctionDescription]
private[sql] def registerFunctions(functionRegistry: FunctionRegistry): FunctionRegistry = {
for ((name, expressionInfo, function) <- injectedFunctions) {
functionRegistry.registerFunction(name, expressionInfo, function)
}
functionRegistry
}
private[sql] def registerTableFunctions(
    tableFunctionRegistry: TableFunctionRegistry): TableFunctionRegistry = {
for ((name, expressionInfo, function) <- injectedTableFunctions) {
tableFunctionRegistry.registerFunction(name, expressionInfo, function)
}
tableFunctionRegistry
}
/**
* Injects a custom function into the [[org.apache.spark.sql.catalyst.analysis.FunctionRegistry]]
* at runtime for all sessions.
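 *
 * For example (a minimal sketch; `MyUpper` is a hypothetical unary `Expression`, and the
 * two-argument `ExpressionInfo` constructor takes the implementing class name and the
 * function name):
 * {{{
 *   extensions.injectFunction((
 *     FunctionIdentifier("my_upper"),
 *     new ExpressionInfo(classOf[MyUpper].getName, "my_upper"),
 *     (children: Seq[Expression]) => MyUpper(children.head)))
 * }}}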
*/
def injectFunction(functionDescription: FunctionDescription): Unit = {
injectedFunctions += functionDescription
}
/**
* Injects a custom function into the
* [[org.apache.spark.sql.catalyst.analysis.TableFunctionRegistry]] at runtime for all sessions.
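 *
 * For example (a sketch; `MyGeneratorPlan` is a hypothetical `LogicalPlan` built from the
 * call's argument expressions):
 * {{{
 *   extensions.injectTableFunction((
 *     FunctionIdentifier("my_generator"),
 *     new ExpressionInfo(classOf[MyGeneratorPlan].getName, "my_generator"),
 *     (args: Seq[Expression]) => MyGeneratorPlan(args)))
 * }}}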
*/
def injectTableFunction(functionDescription: TableFunctionDescription): Unit = {
injectedTableFunctions += functionDescription
}
}