/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.api

import org.apache.spark.api.java.function.{CoGroupFunction, FlatMapGroupsFunction,
  FlatMapGroupsWithStateFunction, MapFunction, MapGroupsFunction, MapGroupsWithStateFunction,
  ReduceFunction}
import org.apache.spark.sql.{Column, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.PrimitiveLongEncoder
import org.apache.spark.sql.functions.{count => cnt, lit}
import org.apache.spark.sql.internal.{ToScalaUDF, UDFAdaptors}
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode,
  StatefulProcessor, StatefulProcessorWithInitialState, TimeMode}

/**
 * A [[Dataset]] that has been logically grouped by a user-specified grouping key. Users should
 * not construct a [[KeyValueGroupedDataset]] directly, but should instead call `groupByKey` on
 * an existing [[Dataset]].
 *
 * @since 2.0.0
 */
abstract class KeyValueGroupedDataset[K, V, DS[U] <: Dataset[U, DS]] extends Serializable {
  type KVDS[KY, VL] <: KeyValueGroupedDataset[KY, VL, DS]

  /**
   * Returns a new [[KeyValueGroupedDataset]] where the type of the key has been mapped to the
   * specified type. The mapping of key columns to the type follows the same rules as `as` on
   * [[Dataset]].
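   *
   * A minimal sketch (assuming `spark.implicits._` is in scope and a `Dataset[(Int, String)]`
   * named `ds`; widening the key from `Int` to `Long` is one mapping the `as` rules permit):
   * {{{
   *   // Only the key type changes; the values keep their original type.
   *   ds.groupByKey(_._1).keyAs[Long] // KeyValueGroupedDataset[Long, (Int, String)]
   * }}}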
   *
   * @since 1.6.0
   */
  def keyAs[L: Encoder]: KVDS[L, V]

  /**
   * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied to
   * the data. The grouping key is unchanged by this.
   *
   * {{{
   *   // Create values grouped by key from a Dataset[(K, V)]
   *   ds.groupByKey(_._1).mapValues(_._2) // Scala
   * }}}
   *
   * @since 2.1.0
   */
  def mapValues[W: Encoder](func: V => W): KVDS[K, W]

  /**
   * Returns a new [[KeyValueGroupedDataset]] where the given function `func` has been applied to
   * the data. The grouping key is unchanged by this.
   *
   * {{{
   *   // Create Integer values grouped by String key from a Dataset<Tuple2<String, Integer>>
   *   Dataset<Tuple2<String, Integer>> ds = ...;
   *   KeyValueGroupedDataset<String, Integer> grouped =
   *     ds.groupByKey(t -> t._1, Encoders.STRING()).mapValues(t -> t._2, Encoders.INT());
   * }}}
   *
   * @since 2.1.0
   */
  def mapValues[W](func: MapFunction[V, W], encoder: Encoder[W]): KVDS[K, W] = {
    mapValues(ToScalaUDF(func))(encoder)
  }

  /**
   * Returns a [[Dataset]] that contains each unique key. This is equivalent to mapping over the
   * Dataset to extract the keys and then running a distinct operation on those.
   *
   * @since 1.6.0
   */
  def keys: DS[K]

  /**
   * (Scala-specific) Applies the given function to each group of data. For each unique group, the
   * function will be passed the group key and an iterator that contains all of the elements in
   * the group. The function can return an iterator containing elements of an arbitrary type which
   * will be returned as a new [[Dataset]].
   *
   * This function does not support partial aggregation, and as a result requires shuffling all
   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
   * key, it is best to use the reduce function or an
   * `org.apache.spark.sql.expressions#Aggregator`.
   *
   * Internally, the implementation will spill to disk if any given group is too large to fit into
   * memory. However, users must take care to avoid materializing the whole iterator for a group
   * (for example, by calling `toList`) unless they are sure that this is possible given the
   * memory constraints of their cluster.
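   *
   * A minimal sketch (assuming `spark.implicits._` is in scope and a `Dataset[(String, Int)]`
   * named `ds`):
   * {{{
   *   // Concatenate all values seen for each key into one output row per key.
   *   ds.groupByKey(_._1).flatMapGroups { (key, values) =>
   *     Iterator(key -> values.map(_._2).mkString(","))
   *   }
   * }}}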
   *
   * @since 1.6.0
   */
  def flatMapGroups[U: Encoder](f: (K, Iterator[V]) => IterableOnce[U]): DS[U] = {
    flatMapSortedGroups(Nil: _*)(f)
  }

  /**
   * (Java-specific) Applies the given function to each group of data. For each unique group, the
   * function will be passed the group key and an iterator that contains all of the elements in
   * the group. The function can return an iterator containing elements of an arbitrary type which
   * will be returned as a new [[Dataset]].
   *
   * This function does not support partial aggregation, and as a result requires shuffling all
   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
   * key, it is best to use the reduce function or an
   * `org.apache.spark.sql.expressions#Aggregator`.
   *
   * Internally, the implementation will spill to disk if any given group is too large to fit into
   * memory. However, users must take care to avoid materializing the whole iterator for a group
   * (for example, by calling `toList`) unless they are sure that this is possible given the
   * memory constraints of their cluster.
   *
   * @since 1.6.0
   */
  def flatMapGroups[U](f: FlatMapGroupsFunction[K, V, U], encoder: Encoder[U]): DS[U] = {
    flatMapGroups(ToScalaUDF(f))(encoder)
  }

  /**
   * (Scala-specific) Applies the given function to each group of data. For each unique group, the
   * function will be passed the group key and a sorted iterator that contains all of the elements
   * in the group. The function can return an iterator containing elements of an arbitrary type
   * which will be returned as a new [[Dataset]].
   *
   * This function does not support partial aggregation, and as a result requires shuffling all
   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
   * key, it is best to use the reduce function or an
   * `org.apache.spark.sql.expressions#Aggregator`.
   *
   * Internally, the implementation will spill to disk if any given group is too large to fit into
   * memory. However, users must take care to avoid materializing the whole iterator for a group
   * (for example, by calling `toList`) unless they are sure that this is possible given the
   * memory constraints of their cluster.
   *
   * This is equivalent to [[KeyValueGroupedDataset#flatMapGroups]], except that the iterator is
   * sorted according to the given sort expressions. That sorting does not add computational
   * complexity.
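   *
   * A minimal sketch (assuming `spark.implicits._` is in scope and a `Dataset[(String, Int)]`
   * named `ds`, whose value columns are `_1` and `_2`):
   * {{{
   *   // Concatenate each group's values in ascending order of the Int column.
   *   ds.groupByKey(_._1).flatMapSortedGroups($"_2") { (key, values) =>
   *     Iterator(key -> values.map(_._2).mkString(","))
   *   }
   * }}}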
   *
   * @see
   *   `org.apache.spark.sql.api.KeyValueGroupedDataset#flatMapGroups`
   * @since 3.4.0
   */
  def flatMapSortedGroups[U: Encoder](sortExprs: Column*)(
      f: (K, Iterator[V]) => IterableOnce[U]): DS[U]

  /**
   * (Java-specific) Applies the given function to each group of data. For each unique group, the
   * function will be passed the group key and a sorted iterator that contains all of the elements
   * in the group. The function can return an iterator containing elements of an arbitrary type
   * which will be returned as a new [[Dataset]].
   *
   * This function does not support partial aggregation, and as a result requires shuffling all
   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
   * key, it is best to use the reduce function or an
   * `org.apache.spark.sql.expressions#Aggregator`.
   *
   * Internally, the implementation will spill to disk if any given group is too large to fit into
   * memory. However, users must take care to avoid materializing the whole iterator for a group
   * (for example, by calling `toList`) unless they are sure that this is possible given the
   * memory constraints of their cluster.
   *
   * This is equivalent to [[KeyValueGroupedDataset#flatMapGroups]], except that the iterator is
   * sorted according to the given sort expressions. That sorting does not add computational
   * complexity.
   *
   * @see
   *   `org.apache.spark.sql.api.KeyValueGroupedDataset#flatMapGroups`
   * @since 3.4.0
   */
  def flatMapSortedGroups[U](
      SortExprs: Array[Column],
      f: FlatMapGroupsFunction[K, V, U],
      encoder: Encoder[U]): DS[U] = {
    import org.apache.spark.util.ArrayImplicits._
    flatMapSortedGroups(SortExprs.toImmutableArraySeq: _*)(ToScalaUDF(f))(encoder)
  }

  /**
   * (Scala-specific) Applies the given function to each group of data. For each unique group, the
   * function will be passed the group key and an iterator that contains all of the elements in
   * the group. The function can return an element of arbitrary type which will be returned as a
   * new [[Dataset]].
   *
   * This function does not support partial aggregation, and as a result requires shuffling all
   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
   * key, it is best to use the reduce function or an
   * `org.apache.spark.sql.expressions#Aggregator`.
   *
   * Internally, the implementation will spill to disk if any given group is too large to fit into
   * memory. However, users must take care to avoid materializing the whole iterator for a group
   * (for example, by calling `toList`) unless they are sure that this is possible given the
   * memory constraints of their cluster.
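   *
   * A minimal sketch (assuming `spark.implicits._` is in scope and a `Dataset[(String, Int)]`
   * named `ds`):
   * {{{
   *   // Produce one (key, count) pair per group.
   *   ds.groupByKey(_._1).mapGroups((key, values) => (key, values.size))
   * }}}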
   *
   * @since 1.6.0
   */
  def mapGroups[U: Encoder](f: (K, Iterator[V]) => U): DS[U] = {
    flatMapGroups(UDFAdaptors.mapGroupsToFlatMapGroups(f))
  }

  /**
   * (Java-specific) Applies the given function to each group of data. For each unique group, the
   * function will be passed the group key and an iterator that contains all of the elements in
   * the group. The function can return an element of arbitrary type which will be returned as a
   * new [[Dataset]].
   *
   * This function does not support partial aggregation, and as a result requires shuffling all
   * the data in the [[Dataset]]. If an application intends to perform an aggregation over each
   * key, it is best to use the reduce function or an
   * `org.apache.spark.sql.expressions#Aggregator`.
   *
   * Internally, the implementation will spill to disk if any given group is too large to fit into
   * memory. However, users must take care to avoid materializing the whole iterator for a group
   * (for example, by calling `toList`) unless they are sure that this is possible given the
   * memory constraints of their cluster.
   *
   * @since 1.6.0
   */
  def mapGroups[U](f: MapGroupsFunction[K, V, U], encoder: Encoder[U]): DS[U] = {
    mapGroups(ToScalaUDF(f))(encoder)
  }

  /**
   * (Scala-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See
   * [[org.apache.spark.sql.streaming.GroupState]] for more details.
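   *
   * A minimal sketch of a per-key running count (assuming `spark.implicits._` is in scope and a
   * streaming `Dataset[(String, Int)]` named `events`):
   * {{{
   *   events.groupByKey(_._1).mapGroupsWithState[Long, (String, Long)] {
   *     (key, values, state) =>
   *       // Add the size of this batch of values to the count kept in the group state.
   *       val newCount = state.getOption.getOrElse(0L) + values.size
   *       state.update(newCount)
   *       (key, newCount)
   *   }
   * }}}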
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 2.2.0
   */
  def mapGroupsWithState[S: Encoder, U: Encoder](
      func: (K, Iterator[V], GroupState[S]) => U): DS[U]

  /**
   * (Scala-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See
   * [[org.apache.spark.sql.streaming.GroupState]] for more details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 2.2.0
   */
  def mapGroupsWithState[S: Encoder, U: Encoder](timeoutConf: GroupStateTimeout)(
      func: (K, Iterator[V], GroupState[S]) => U): DS[U]

  /**
   * (Scala-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See
   * [[org.apache.spark.sql.streaming.GroupState]] for more details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while; see
   *   `GroupStateTimeout` for more details.
   * @param initialState
   *   The user provided state that will be initialized when the first batch of data is processed
   *   in the streaming query. The user defined function will be called on the state data even if
   *   there are no other values in the group. To convert a Dataset `ds` of type `Dataset[(K, S)]`
   *   to a `KeyValueGroupedDataset[K, S]`, use {{{ds.groupByKey(x => x._1).mapValues(_._2)}}}
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 3.2.0
   */
  def mapGroupsWithState[S: Encoder, U: Encoder](
      timeoutConf: GroupStateTimeout,
      initialState: KVDS[K, S])(func: (K, Iterator[V], GroupState[S]) => U): DS[U]

  /**
   * (Java-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See `GroupState` for more
   * details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param stateEncoder
   *   Encoder for the state type.
   * @param outputEncoder
   *   Encoder for the output type.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 2.2.0
   */
  def mapGroupsWithState[S, U](
      func: MapGroupsWithStateFunction[K, V, S, U],
      stateEncoder: Encoder[S],
      outputEncoder: Encoder[U]): DS[U] = {
    mapGroupsWithState[S, U](ToScalaUDF(func))(stateEncoder, outputEncoder)
  }

  /**
   * (Java-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See `GroupState` for more
   * details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param stateEncoder
   *   Encoder for the state type.
   * @param outputEncoder
   *   Encoder for the output type.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 2.2.0
   */
  def mapGroupsWithState[S, U](
      func: MapGroupsWithStateFunction[K, V, S, U],
      stateEncoder: Encoder[S],
      outputEncoder: Encoder[U],
      timeoutConf: GroupStateTimeout): DS[U] = {
    mapGroupsWithState[S, U](timeoutConf)(ToScalaUDF(func))(stateEncoder, outputEncoder)
  }

  /**
   * (Java-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See `GroupState` for more
   * details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param stateEncoder
   *   Encoder for the state type.
   * @param outputEncoder
   *   Encoder for the output type.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while.
   * @param initialState
   *   The user provided state that will be initialized when the first batch of data is processed
   *   in the streaming query. The user defined function will be called on the state data even if
   *   there are no other values in the group.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 3.2.0
   */
  def mapGroupsWithState[S, U](
      func: MapGroupsWithStateFunction[K, V, S, U],
      stateEncoder: Encoder[S],
      outputEncoder: Encoder[U],
      timeoutConf: GroupStateTimeout,
      initialState: KVDS[K, S]): DS[U] = {
    val f = ToScalaUDF(func)
    mapGroupsWithState[S, U](timeoutConf, initialState)(f)(stateEncoder, outputEncoder)
  }

  /**
   * (Scala-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See `GroupState` for more
   * details.
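   *
   * A minimal sketch of a per-key running count (assuming `spark.implicits._` is in scope and a
   * streaming `Dataset[(String, Int)]` named `events`):
   * {{{
   *   events.groupByKey(_._1).flatMapGroupsWithState[Long, (String, Long)](
   *       OutputMode.Update(), GroupStateTimeout.NoTimeout()) { (key, values, state) =>
   *     // Add the size of this batch of values to the count kept in the group state.
   *     val newCount = state.getOption.getOrElse(0L) + values.size
   *     state.update(newCount)
   *     Iterator((key, newCount))
   *   }
   * }}}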
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param outputMode
   *   The output mode of the function.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 2.2.0
   */
  def flatMapGroupsWithState[S: Encoder, U: Encoder](
      outputMode: OutputMode,
      timeoutConf: GroupStateTimeout)(func: (K, Iterator[V], GroupState[S]) => Iterator[U]): DS[U]

  /**
   * (Scala-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See `GroupState` for more
   * details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param outputMode
   *   The output mode of the function.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while.
   * @param initialState
   *   The user provided state that will be initialized when the first batch of data is processed
   *   in the streaming query. The user defined function will be called on the state data even if
   *   there are no other values in the group. To convert a Dataset `ds` of type
   *   `Dataset[(K, S)]` to a `KeyValueGroupedDataset[K, S]`, use
   *   {{{ds.groupByKey(x => x._1).mapValues(_._2)}}} See [[org.apache.spark.sql.Encoder]] for
   *   more details on what types are encodable to Spark SQL.
   * @since 3.2.0
   */
  def flatMapGroupsWithState[S: Encoder, U: Encoder](
      outputMode: OutputMode,
      timeoutConf: GroupStateTimeout,
      initialState: KVDS[K, S])(func: (K, Iterator[V], GroupState[S]) => Iterator[U]): DS[U]

  /**
   * (Java-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See `GroupState` for more
   * details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param outputMode
   *   The output mode of the function.
   * @param stateEncoder
   *   Encoder for the state type.
   * @param outputEncoder
   *   Encoder for the output type.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 2.2.0
   */
  def flatMapGroupsWithState[S, U](
      func: FlatMapGroupsWithStateFunction[K, V, S, U],
      outputMode: OutputMode,
      stateEncoder: Encoder[S],
      outputEncoder: Encoder[U],
      timeoutConf: GroupStateTimeout): DS[U] = {
    val f = ToScalaUDF(func)
    flatMapGroupsWithState[S, U](outputMode, timeoutConf)(f)(stateEncoder, outputEncoder)
  }

  /**
   * (Java-specific) Applies the given function to each group of data, while maintaining a
   * user-defined per-group state. The result Dataset will represent the objects returned by the
   * function. For a static batch Dataset, the function will be invoked once per group. For a
   * streaming Dataset, the function will be invoked for each group repeatedly in every trigger,
   * and updates to each group's state will be saved across invocations. See `GroupState` for more
   * details.
   *
   * @tparam S
   *   The type of the user-defined state. Must be encodable to Spark SQL types.
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param func
   *   Function to be called on every group.
   * @param outputMode
   *   The output mode of the function.
   * @param stateEncoder
   *   Encoder for the state type.
   * @param outputEncoder
   *   Encoder for the output type.
   * @param timeoutConf
   *   Timeout configuration for groups that do not receive data for a while.
   * @param initialState
   *   The user provided state that will be initialized when the first batch of data is processed
   *   in the streaming query. The user defined function will be called on the state data even if
   *   there are no other values in the group. To convert a Dataset `ds` of type
   *   `Dataset[(K, S)]` to a `KeyValueGroupedDataset[K, S]`, use
   *   {{{ds.groupByKey(x => x._1).mapValues(_._2)}}}
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   * @since 3.2.0
   */
  def flatMapGroupsWithState[S, U](
      func: FlatMapGroupsWithStateFunction[K, V, S, U],
      outputMode: OutputMode,
      stateEncoder: Encoder[S],
      outputEncoder: Encoder[U],
      timeoutConf: GroupStateTimeout,
      initialState: KVDS[K, S]): DS[U] = {
    flatMapGroupsWithState[S, U](outputMode, timeoutConf, initialState)(ToScalaUDF(func))(
      stateEncoder,
      outputEncoder)
  }

  /**
   * (Scala-specific) Invokes methods defined in the stateful processor used in arbitrary state
   * API v2. The user can act on a per-group set of input rows along with keyed state, and can
   * choose to output/return 0 or more rows. For a streaming DataFrame, the interface methods
   * will be invoked repeatedly for new rows in each trigger, and the user's state/state
   * variables will be stored persistently across invocations.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param timeMode
   *   The time mode semantics of the stateful processor for timers and TTL.
   * @param outputMode
   *   The output mode of the stateful processor.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder](
      statefulProcessor: StatefulProcessor[K, V, U],
      timeMode: TimeMode,
      outputMode: OutputMode): DS[U]

  /**
   * (Scala-specific) Invokes methods defined in the stateful processor used in arbitrary state
   * API v2. The user can act on a per-group set of input rows along with keyed state, and can
   * choose to output/return 0 or more rows. For a streaming DataFrame, the interface methods
   * will be invoked repeatedly for new rows in each trigger, and the user's state/state
   * variables will be stored persistently across invocations.
   *
   * Downstream operators would use the specified eventTimeColumnName to calculate the watermark.
   * Note that TimeMode is set to EventTime to ensure the correct flow of the watermark.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param eventTimeColumnName
   *   eventTime column in the output dataset. Any operations after transformWithState will use
   *   the new eventTimeColumn. The user needs to ensure that the eventTime for the emitted output
   *   adheres to the watermark boundary, otherwise the streaming query will fail.
   * @param outputMode
   *   The output mode of the stateful processor.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder](
      statefulProcessor: StatefulProcessor[K, V, U],
      eventTimeColumnName: String,
      outputMode: OutputMode): DS[U]

  /**
   * (Java-specific) Invokes methods defined in the stateful processor used in arbitrary state API
   * v2. The user can act on a per-group set of input rows along with keyed state, and can choose
   * to output/return 0 or more rows. For a streaming DataFrame, the interface methods will be
   * invoked repeatedly for new rows in each trigger, and the user's state/state variables will be
   * stored persistently across invocations.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param timeMode
   *   The time mode semantics of the stateful processor for timers and TTL.
   * @param outputMode
   *   The output mode of the stateful processor.
   * @param outputEncoder
   *   Encoder for the output type.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder](
      statefulProcessor: StatefulProcessor[K, V, U],
      timeMode: TimeMode,
      outputMode: OutputMode,
      outputEncoder: Encoder[U]): DS[U] = {
    transformWithState(statefulProcessor, timeMode, outputMode)(outputEncoder)
  }

  /**
   * (Java-specific) Invokes methods defined in the stateful processor used in arbitrary state API
   * v2. The user can act on a per-group set of input rows along with keyed state, and can choose
   * to output/return 0 or more rows.
   *
   * For a streaming DataFrame, the interface methods will be invoked repeatedly for new rows in
   * each trigger, and the user's state/state variables will be stored persistently across
   * invocations.
   *
   * Downstream operators would use the specified eventTimeColumnName to calculate the watermark.
   * Note that TimeMode is set to EventTime to ensure the correct flow of the watermark.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param eventTimeColumnName
   *   eventTime column in the output dataset. Any operations after transformWithState will use
   *   the new eventTimeColumn. The user needs to ensure that the eventTime for the emitted output
   *   adheres to the watermark boundary, otherwise the streaming query will fail.
   * @param outputMode
   *   The output mode of the stateful processor.
   * @param outputEncoder
   *   Encoder for the output type.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder](
      statefulProcessor: StatefulProcessor[K, V, U],
      eventTimeColumnName: String,
      outputMode: OutputMode,
      outputEncoder: Encoder[U]): DS[U] = {
    transformWithState(statefulProcessor, eventTimeColumnName, outputMode)(outputEncoder)
  }

  /**
   * (Scala-specific) Invokes methods defined in the stateful processor used in arbitrary state
   * API v2. Functions as the function above, but with additional initial state.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @tparam S
   *   The type of initial state objects. Must be encodable to Spark SQL types.
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param timeMode
   *   The time mode semantics of the stateful processor for timers and TTL.
   * @param outputMode
   *   The output mode of the stateful processor.
   * @param initialState
   *   User provided initial state that will be used to initiate state for the query in the first
   *   batch.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder, S: Encoder](
      statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S],
      timeMode: TimeMode,
      outputMode: OutputMode,
      initialState: KVDS[K, S]): DS[U]

  /**
   * (Scala-specific) Invokes methods defined in the stateful processor used in arbitrary state
   * API v2. Functions as the function above, but with additional eventTimeColumnName for output.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @tparam S
   *   The type of initial state objects. Must be encodable to Spark SQL types.
   *
   * Downstream operators would use the specified eventTimeColumnName to calculate the watermark.
   * Note that TimeMode is set to EventTime to ensure the correct flow of the watermark.
   *
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param eventTimeColumnName
   *   eventTime column in the output dataset. Any operations after transformWithState will use
   *   the new eventTimeColumn. The user needs to ensure that the eventTime for the emitted output
   *   adheres to the watermark boundary, otherwise the streaming query will fail.
   * @param outputMode
   *   The output mode of the stateful processor.
   * @param initialState
   *   User provided initial state that will be used to initiate state for the query in the first
   *   batch.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder, S: Encoder](
      statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S],
      eventTimeColumnName: String,
      outputMode: OutputMode,
      initialState: KVDS[K, S]): DS[U]

  /**
   * (Java-specific) Invokes methods defined in the stateful processor used in arbitrary state API
   * v2. Functions as the function above, but with additional initialStateEncoder for state
   * encoding.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @tparam S
   *   The type of initial state objects. Must be encodable to Spark SQL types.
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param timeMode
   *   The time mode semantics of the stateful processor for timers and TTL.
   * @param outputMode
   *   The output mode of the stateful processor.
   * @param initialState
   *   User provided initial state that will be used to initiate state for the query in the first
   *   batch.
   * @param outputEncoder
   *   Encoder for the output type.
   * @param initialStateEncoder
   *   Encoder for the initial state type.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder, S: Encoder](
      statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S],
      timeMode: TimeMode,
      outputMode: OutputMode,
      initialState: KVDS[K, S],
      outputEncoder: Encoder[U],
      initialStateEncoder: Encoder[S]): DS[U] = {
    transformWithState(statefulProcessor, timeMode, outputMode, initialState)(
      outputEncoder,
      initialStateEncoder)
  }

  /**
   * (Java-specific) Invokes methods defined in the stateful processor used in arbitrary state API
   * v2. Functions as the function above, but with additional eventTimeColumnName for output.
   *
   * Downstream operators would use the specified eventTimeColumnName to calculate the watermark.
   * Note that TimeMode is set to EventTime to ensure the correct flow of the watermark.
   *
   * @tparam U
   *   The type of the output objects. Must be encodable to Spark SQL types.
   * @tparam S
   *   The type of initial state objects. Must be encodable to Spark SQL types.
   * @param statefulProcessor
   *   Instance of statefulProcessor whose functions will be invoked by the operator.
   * @param outputMode
   *   The output mode of the stateful processor.
   * @param initialState
   *   User provided initial state that will be used to initiate state for the query in the first
   *   batch.
   * @param eventTimeColumnName
   *   eventTime column in the output dataset. Any operations after transformWithState will use
   *   the new eventTimeColumn. The user needs to ensure that the eventTime for the emitted output
   *   adheres to the watermark boundary, otherwise the streaming query will fail.
   * @param outputEncoder
   *   Encoder for the output type.
   * @param initialStateEncoder
   *   Encoder for the initial state type.
   *
   * See [[org.apache.spark.sql.Encoder]] for more details on what types are encodable to Spark
   * SQL.
   */
  private[sql] def transformWithState[U: Encoder, S: Encoder](
      statefulProcessor: StatefulProcessorWithInitialState[K, V, U, S],
      outputMode: OutputMode,
      initialState: KVDS[K, S],
      eventTimeColumnName: String,
      outputEncoder: Encoder[U],
      initialStateEncoder: Encoder[S]): DS[U] = {
    transformWithState(statefulProcessor, eventTimeColumnName, outputMode, initialState)(
      outputEncoder,
      initialStateEncoder)
  }

  /**
   * (Scala-specific) Reduces the elements of each group of data using the specified binary
   * function. The given function must be commutative and associative or the result may be
   * non-deterministic.
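   *
   * A minimal sketch (assuming `spark.implicits._` is in scope and a `Dataset[(String, Int)]`
   * named `ds`):
   * {{{
   *   // Sum the Int values per key; the result is a Dataset[(String, Int)].
   *   ds.groupByKey(_._1).mapValues(_._2).reduceGroups(_ + _)
   * }}}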
   *
   * @since 1.6.0
   */
  def reduceGroups(f: (V, V) => V): DS[(K, V)]

  /**
   * (Java-specific) Reduces the elements of each group of data using the specified binary
   * function. The given function must be commutative and associative or the result may be
   * non-deterministic.
   *
   * @since 1.6.0
   */
  def reduceGroups(f: ReduceFunction[V]): DS[(K, V)] = {
    reduceGroups(ToScalaUDF(f))
  }

  /**
   * Internal helper function for building typed aggregations that return tuples. For simplicity
   * and code reuse, we do this without the help of the type system and then use helper functions
   * that cast appropriately for the user-facing interface.
   */
  protected def aggUntyped(columns: TypedColumn[_, _]*): DS[_]

  /**
   * Computes the given aggregation, returning a [[Dataset]] of tuples for each unique key and the
   * result of computing this aggregation over all elements in the group.
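   *
   * A minimal sketch (assuming `spark.implicits._` and `org.apache.spark.sql.functions._` are in
   * scope and a `Dataset[(String, Int)]` named `ds`):
   * {{{
   *   // Per-key sum of the Int column, expressed as a typed aggregation.
   *   ds.groupByKey(_._1).agg(sum($"_2").as[Long]) // Dataset[(String, Long)]
   * }}}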
   *
   * @since 1.6.0
   */
  def agg[U1](col1: TypedColumn[V, U1]): DS[(K, U1)] =
    aggUntyped(col1).asInstanceOf[DS[(K, U1)]]

  /**
   * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key and
   * the result of computing these aggregations over all elements in the group.
   *
   * @since 1.6.0
   */
  def agg[U1, U2](col1: TypedColumn[V, U1], col2: TypedColumn[V, U2]): DS[(K, U1, U2)] =
    aggUntyped(col1, col2).asInstanceOf[DS[(K, U1, U2)]]

  /**
   * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key and
   * the result of computing these aggregations over all elements in the group.
   *
   * @since 1.6.0
   */
  def agg[U1, U2, U3](
      col1: TypedColumn[V, U1],
      col2: TypedColumn[V, U2],
      col3: TypedColumn[V, U3]): DS[(K, U1, U2, U3)] =
    aggUntyped(col1, col2, col3).asInstanceOf[DS[(K, U1, U2, U3)]]

  /**
   * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key and
   * the result of computing these aggregations over all elements in the group.
   *
   * @since 1.6.0
   */
  def agg[U1, U2, U3, U4](
      col1: TypedColumn[V, U1],
      col2: TypedColumn[V, U2],
      col3: TypedColumn[V, U3],
      col4: TypedColumn[V, U4]): DS[(K, U1, U2, U3, U4)] =
    aggUntyped(col1, col2, col3, col4).asInstanceOf[DS[(K, U1, U2, U3, U4)]]

  /**
   * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key and
   * the result of computing these aggregations over all elements in the group.
   *
   * @since 3.0.0
   */
  def agg[U1, U2, U3, U4, U5](
      col1: TypedColumn[V, U1],
      col2: TypedColumn[V, U2],
      col3: TypedColumn[V, U3],
      col4: TypedColumn[V, U4],
      col5: TypedColumn[V, U5]): DS[(K, U1, U2, U3, U4, U5)] =
    aggUntyped(col1, col2, col3, col4, col5).asInstanceOf[DS[(K, U1, U2, U3, U4, U5)]]

  /**
   * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key and
   * the result of computing these aggregations over all elements in the group.
   *
   * @since 3.0.0
   */
  def agg[U1, U2, U3, U4, U5, U6](
      col1: TypedColumn[V, U1],
      col2: TypedColumn[V, U2],
      col3: TypedColumn[V, U3],
      col4: TypedColumn[V, U4],
      col5: TypedColumn[V, U5],
      col6: TypedColumn[V, U6]): DS[(K, U1, U2, U3, U4, U5, U6)] =
    aggUntyped(col1, col2, col3, col4, col5, col6)
      .asInstanceOf[DS[(K, U1, U2, U3, U4, U5, U6)]]

  /**
   * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key and
   * the result of computing these aggregations over all elements in the group.
   *
   * @since 3.0.0
   */
  def agg[U1, U2, U3, U4, U5, U6, U7](
      col1: TypedColumn[V, U1],
      col2: TypedColumn[V, U2],
      col3: TypedColumn[V, U3],
      col4: TypedColumn[V, U4],
      col5: TypedColumn[V, U5],
      col6: TypedColumn[V, U6],
      col7: TypedColumn[V, U7]): DS[(K, U1, U2, U3, U4, U5, U6, U7)] =
    aggUntyped(col1, col2, col3, col4, col5, col6, col7)
      .asInstanceOf[DS[(K, U1, U2, U3, U4, U5, U6, U7)]]

  /**
   * Computes the given aggregations, returning a [[Dataset]] of tuples for each unique key and
   * the result of computing these aggregations over all elements in the group.
   *
   * @since 3.0.0
   */
  def agg[U1, U2, U3, U4, U5, U6, U7, U8](
      col1: TypedColumn[V, U1],
      col2: TypedColumn[V, U2],
      col3: TypedColumn[V, U3],
      col4: TypedColumn[V, U4],
      col5: TypedColumn[V, U5],
      col6: TypedColumn[V, U6],
      col7: TypedColumn[V, U7],
      col8: TypedColumn[V, U8]): DS[(K, U1, U2, U3, U4, U5, U6, U7, U8)] =
    aggUntyped(col1, col2, col3, col4, col5, col6, col7, col8)
      .asInstanceOf[DS[(K, U1, U2, U3, U4, U5, U6, U7, U8)]]

  /**
   * Returns a [[Dataset]] that contains a tuple with each key and the number of items present for
   * that key.
   *
   * @since 1.6.0
   */
  def count(): DS[(K, Long)] = agg(cnt(lit(1)).as(PrimitiveLongEncoder))

  /**
   * (Scala-specific) Applies the given function to each cogrouped data. For each unique group,
   * the function will be passed the grouping key and 2 iterators containing all elements in the
   * group from [[Dataset]] `this` and `other`. The function can return an iterator containing
   * elements of an arbitrary type which will be returned as a new [[Dataset]].
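   *
   * A minimal sketch (assuming `spark.implicits._` is in scope, `left: Dataset[(String, Int)]`,
   * and `right: Dataset[(String, String)]`):
   * {{{
   *   // Combine both sides of each key into a single summary row.
   *   left.groupByKey(_._1).cogroup(right.groupByKey(_._1)) { (key, ints, strings) =>
   *     Iterator((key, ints.map(_._2).sum, strings.size))
   *   }
   * }}}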
   *
   * @since 1.6.0
   */
  def cogroup[U, R: Encoder](other: KVDS[K, U])(
      f: (K, Iterator[V], Iterator[U]) => IterableOnce[R]): DS[R] = {
    cogroupSorted(other)(Nil: _*)(Nil: _*)(f)
  }

  /**
   * (Java-specific) Applies the given function to each cogrouped data. For each unique group, the
   * function will be passed the grouping key and 2 iterators containing all elements in the group
   * from [[Dataset]] `this` and `other`. The function can return an iterator containing elements
   * of an arbitrary type which will be returned as a new [[Dataset]].
   *
   * @since 1.6.0
   */
  def cogroup[U, R](
      other: KVDS[K, U],
      f: CoGroupFunction[K, V, U, R],
      encoder: Encoder[R]): DS[R] = {
    cogroup(other)(ToScalaUDF(f))(encoder)
  }

  /**
   * (Scala-specific) Applies the given function to each sorted cogrouped data. For each unique
   * group, the function will be passed the grouping key and 2 sorted iterators containing all
   * elements in the group from [[Dataset]] `this` and `other`. The function can return an
   * iterator containing elements of an arbitrary type which will be returned as a new
   * [[Dataset]].
   *
   * This is equivalent to [[KeyValueGroupedDataset#cogroup]], except that the iterators are
   * sorted according to the given sort expressions. That sorting does not add computational
   * complexity.
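   *
   * A minimal sketch (assuming `spark.implicits._` is in scope, `left: Dataset[(String, Int)]`,
   * and `right: Dataset[(String, String)]`, both with value columns `_1` and `_2`):
   * {{{
   *   // Walk both sides of each key with their values sorted by the second column.
   *   left.groupByKey(_._1)
   *     .cogroupSorted(right.groupByKey(_._1))($"_2")($"_2") { (key, ints, strings) =>
   *       Iterator((key, ints.map(_._2).toSeq, strings.map(_._2).toSeq))
   *     }
   * }}}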
   *
   * @see
   *   `org.apache.spark.sql.api.KeyValueGroupedDataset#cogroup`
   * @since 3.4.0
   */
  def cogroupSorted[U, R: Encoder](other: KVDS[K, U])(thisSortExprs: Column*)(
      otherSortExprs: Column*)(f: (K, Iterator[V], Iterator[U]) => IterableOnce[R]): DS[R]

  /**
   * (Java-specific) Applies the given function to each sorted cogrouped data. For each unique
   * group, the function will be passed the grouping key and 2 sorted iterators containing all
   * elements in the group from [[Dataset]] `this` and `other`. The function can return an
   * iterator containing elements of an arbitrary type which will be returned as a new
   * [[Dataset]].
   *
   * This is equivalent to [[KeyValueGroupedDataset#cogroup]], except that the iterators are
   * sorted according to the given sort expressions. That sorting does not add computational
   * complexity.
   *
   * @see
   *   `org.apache.spark.sql.api.KeyValueGroupedDataset#cogroup`
   * @since 3.4.0
   */
  def cogroupSorted[U, R](
      other: KVDS[K, U],
      thisSortExprs: Array[Column],
      otherSortExprs: Array[Column],
      f: CoGroupFunction[K, V, U, R],
      encoder: Encoder[R]): DS[R] = {
    import org.apache.spark.util.ArrayImplicits._
    cogroupSorted(other)(thisSortExprs.toImmutableArraySeq: _*)(
      otherSortExprs.toImmutableArraySeq: _*)(ToScalaUDF(f))(encoder)
  }
}