org.apache.spark.sql.MergeIntoWriter.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-sql-api_2.13 Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql

import org.apache.spark.annotation.Experimental

/**
 * `MergeIntoWriter` provides methods to define and execute merge actions based on specified
 * conditions.
 *
 * Please note that schema evolution is disabled by default.
 *
 * @tparam T
 *   the type of data in the Dataset.
 * @since 4.0.0
 */
@Experimental
abstract class MergeIntoWriter[T] {
  private var schemaEvolution: Boolean = false

  private[sql] def schemaEvolutionEnabled: Boolean = schemaEvolution

  /**
   * Initialize a `WhenMatched` action without any condition.
   *
   * This `WhenMatched` action will be executed when a source row matches a target table row based
   * on the merge condition.
   *
   * This `WhenMatched` can be followed by one of the following merge actions:
   *   - `updateAll`: Update all the matched target table rows with source dataset rows.
   *   - `update(Map)`: Update all the matched target table rows while changing only a subset of
   *     columns based on the provided assignment.
   *   - `delete`: Delete all target rows that have a match in the source table.
   *
   * @return
   *   a new `WhenMatched` object.
   */
  def whenMatched(): WhenMatched[T] = {
    new WhenMatched[T](this, None)
  }

  /**
   * Initialize a `WhenMatched` action with a condition.
   *
   * This `WhenMatched` action will be executed when a source row matches a target table row based
   * on the merge condition and the specified `condition` is satisfied.
   *
   * This `WhenMatched` can be followed by one of the following merge actions:
   *   - `updateAll`: Update all the matched target table rows with source dataset rows.
   *   - `update(Map)`: Update all the matched target table rows while changing only a subset of
   *     columns based on the provided assignment.
   *   - `delete`: Delete all target rows that have a match in the source table.
   *
   * @param condition
   *   a `Column` representing the condition to be evaluated for the action.
   * @return
   *   a new `WhenMatched` object configured with the specified condition.
   */
  def whenMatched(condition: Column): WhenMatched[T] = {
    new WhenMatched[T](this, Some(condition))
  }

  /**
   * Initialize a `WhenNotMatched` action without any condition.
   *
   * This `WhenNotMatched` action will be executed when a source row does not match any target row
   * based on the merge condition.
   *
   * This `WhenNotMatched` can be followed by one of the following merge actions:
   *   - `insertAll`: Insert all rows from the source that are not already in the target table.
   *   - `insert(Map)`: Insert all rows from the source that are not already in the target table,
   *     with the specified columns based on the provided assignment.
   *
   * @return
   *   a new `WhenNotMatched` object.
   */
  def whenNotMatched(): WhenNotMatched[T] = {
    new WhenNotMatched[T](this, None)
  }

  /**
   * Initialize a `WhenNotMatched` action with a condition.
   *
   * This `WhenNotMatched` action will be executed when a source row does not match any target row
   * based on the merge condition and the specified `condition` is satisfied.
   *
   * This `WhenNotMatched` can be followed by one of the following merge actions:
   *   - `insertAll`: Insert all rows from the source that are not already in the target table.
   *   - `insert(Map)`: Insert all rows from the source that are not already in the target table,
   *     with the specified columns based on the provided assignment.
   *
   * @param condition
   *   a `Column` representing the condition to be evaluated for the action.
   * @return
   *   a new `WhenNotMatched` object configured with the specified condition.
   */
  def whenNotMatched(condition: Column): WhenNotMatched[T] = {
    new WhenNotMatched[T](this, Some(condition))
  }

  /**
   * Initialize a `WhenNotMatchedBySource` action without any condition.
   *
   * This `WhenNotMatchedBySource` action will be executed when a target row does not match any
   * rows in the source table based on the merge condition.
   *
   * This `WhenNotMatchedBySource` can be followed by one of the following merge actions:
   *   - `updateAll`: Update all the not matched target table rows with source dataset rows.
   *   - `update(Map)`: Update all the not matched target table rows while changing only the
   *     specified columns based on the provided assignment.
   *   - `delete`: Delete all target rows that have no matches in the source table.
   *
   * @return
   *   a new `WhenNotMatchedBySource` object.
   */
  def whenNotMatchedBySource(): WhenNotMatchedBySource[T] = {
    new WhenNotMatchedBySource[T](this, None)
  }

  /**
   * Initialize a `WhenNotMatchedBySource` action with a condition.
   *
   * This `WhenNotMatchedBySource` action will be executed when a target row does not match any
   * rows in the source table based on the merge condition and the specified `condition` is
   * satisfied.
   *
   * This `WhenNotMatchedBySource` can be followed by one of the following merge actions:
   *   - `updateAll`: Update all the not matched target table rows with source dataset rows.
   *   - `update(Map)`: Update all the not matched target table rows while changing only the
   *     specified columns based on the provided assignment.
   *   - `delete`: Delete all target rows that have no matches in the source table.
   *
   * @param condition
   *   a `Column` representing the condition to be evaluated for the action.
   * @return
   *   a new `WhenNotMatchedBySource` object configured with the specified condition.
   */
  def whenNotMatchedBySource(condition: Column): WhenNotMatchedBySource[T] = {
    new WhenNotMatchedBySource[T](this, Some(condition))
  }

  /**
   * Enable automatic schema evolution for this merge operation.
   *
   * @return
   *   A `MergeIntoWriter` instance with schema evolution enabled.
   */
  def withSchemaEvolution(): MergeIntoWriter[T] = {
    schemaEvolution = true
    this
  }

  /**
   * Executes the merge operation.
   */
  def merge(): Unit

  // Action callbacks.
  protected[sql] def insertAll(condition: Option[Column]): MergeIntoWriter[T]

  protected[sql] def insert(
      condition: Option[Column],
      map: Map[String, Column]): MergeIntoWriter[T]

  protected[sql] def updateAll(
      condition: Option[Column],
      notMatchedBySource: Boolean): MergeIntoWriter[T]

  protected[sql] def update(
      condition: Option[Column],
      map: Map[String, Column],
      notMatchedBySource: Boolean): MergeIntoWriter[T]

  protected[sql] def delete(
      condition: Option[Column],
      notMatchedBySource: Boolean): MergeIntoWriter[T]
}

/**
 * A class for defining actions to be taken when matching rows in a DataFrame during a merge
 * operation.
 *
 * @param mergeIntoWriter
 *   The MergeIntoWriter instance responsible for writing data to a target DataFrame.
 * @param condition
 *   An optional condition Expression that specifies when the actions should be applied. If the
 *   condition is None, the actions will be applied to all matched rows.
 * @tparam T
 *   The type of data in the MergeIntoWriter.
 */
case class WhenMatched[T] private[sql] (
    mergeIntoWriter: MergeIntoWriter[T],
    condition: Option[Column]) {

  /**
   * Specifies an action to update all matched rows in the DataFrame.
   *
   * @return
   *   The MergeIntoWriter instance with the update all action configured.
   */
  def updateAll(): MergeIntoWriter[T] =
    mergeIntoWriter.updateAll(condition, notMatchedBySource = false)

  /**
   * Specifies an action to update matched rows in the DataFrame with the provided column
   * assignments.
   *
   * @param map
   *   A Map of column names to Column expressions representing the updates to be applied.
   * @return
   *   The MergeIntoWriter instance with the update action configured.
   */
  def update(map: Map[String, Column]): MergeIntoWriter[T] =
    mergeIntoWriter.update(condition, map, notMatchedBySource = false)

  /**
   * Specifies an action to delete matched rows from the DataFrame.
   *
   * @return
   *   The MergeIntoWriter instance with the delete action configured.
   */
  def delete(): MergeIntoWriter[T] =
    mergeIntoWriter.delete(condition, notMatchedBySource = false)
}

/**
 * A class for defining actions to be taken when no matching rows are found in a DataFrame during
 * a merge operation.
 *
 * @param mergeIntoWriter
 *   The MergeIntoWriter instance responsible for writing data to a target DataFrame.
 * @param condition
 *   An optional condition Expression that specifies when the actions defined in this
 *   configuration should be applied. If the condition is None, the actions will be applied when
 *   there are no matching rows.
 * @tparam T
 *   The type of data in the MergeIntoWriter.
 */
case class WhenNotMatched[T] private[sql] (
    mergeIntoWriter: MergeIntoWriter[T],
    condition: Option[Column]) {

  /**
   * Specifies an action to insert all non-matched rows into the DataFrame.
   *
   * @return
   *   The MergeIntoWriter instance with the insert all action configured.
   */
  def insertAll(): MergeIntoWriter[T] =
    mergeIntoWriter.insertAll(condition)

  /**
   * Specifies an action to insert non-matched rows into the DataFrame with the provided column
   * assignments.
   *
   * @param map
   *   A Map of column names to Column expressions representing the values to be inserted.
   * @return
   *   The MergeIntoWriter instance with the insert action configured.
   */
  def insert(map: Map[String, Column]): MergeIntoWriter[T] =
    mergeIntoWriter.insert(condition, map)
}

/**
 * A class for defining actions to be performed when there is no match by source during a merge
 * operation in a MergeIntoWriter.
 *
 * @param mergeIntoWriter
 *   the MergeIntoWriter instance to which the merge actions will be applied.
 * @param condition
 *   an optional condition to be used with the merge actions.
 * @tparam T
 *   the type parameter for the MergeIntoWriter.
 */
case class WhenNotMatchedBySource[T] private[sql] (
    mergeIntoWriter: MergeIntoWriter[T],
    condition: Option[Column]) {

  /**
   * Specifies an action to update all non-matched rows in the target DataFrame when not matched
   * by the source.
   *
   * @return
   *   The MergeIntoWriter instance with the update all action configured.
   */
  def updateAll(): MergeIntoWriter[T] =
    mergeIntoWriter.updateAll(condition, notMatchedBySource = true)

  /**
   * Specifies an action to update non-matched rows in the target DataFrame with the provided
   * column assignments when not matched by the source.
   *
   * @param map
   *   A Map of column names to Column expressions representing the updates to be applied.
   * @return
   *   The MergeIntoWriter instance with the update action configured.
   */
  def update(map: Map[String, Column]): MergeIntoWriter[T] =
    mergeIntoWriter.update(condition, map, notMatchedBySource = true)

  /**
   * Specifies an action to delete non-matched rows from the target DataFrame when not matched by
   * the source.
   *
   * @return
   *   The MergeIntoWriter instance with the delete action configured.
   */
  def delete(): MergeIntoWriter[T] =
    mergeIntoWriter.delete(condition, notMatchedBySource = true)
}