
// org.jetbrains.kotlinx.spark.api.KeyValueGroupedDataset.kt
/*-
* =LICENSE=
* Kotlin Spark API: API for Spark 3.0+ (Scala 2.12)
* ----------
* Copyright (C) 2019 - 2021 JetBrains
* ----------
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =LICENSEEND=
*/
/**
* This file contains several extension functions to work with [KeyValueGroupedDataset]s more easily
* from Kotlin. This includes automatically providing the right encoders, as well as mapping to `Arities`.
*/
package org.jetbrains.kotlinx.spark.api
import org.apache.spark.api.java.function.CoGroupFunction
import org.apache.spark.api.java.function.FlatMapGroupsFunction
import org.apache.spark.api.java.function.FlatMapGroupsWithStateFunction
import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.api.java.function.MapGroupsFunction
import org.apache.spark.api.java.function.MapGroupsWithStateFunction
import org.apache.spark.api.java.function.ReduceFunction
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.KeyValueGroupedDataset
import org.apache.spark.sql.streaming.GroupState
import org.apache.spark.sql.streaming.GroupStateTimeout
import org.apache.spark.sql.streaming.OutputMode
import scala.Tuple2
/**
 * Returns a new [KeyValueGroupedDataset] where the given function [func] has been applied
 * to the data. The grouping key is unchanged by this.
 *
 * ```kotlin
 * // Create values grouped by key from a Dataset<Tuple2<K, V>>
 * ds.groupByKey { it._1 }.mapValues { it._2 }
 * ```
 *
 * @param KEY The type of the grouping key; carried over unchanged to the result.
 * @param VALUE The type of the values currently held in each group.
 * @param R The type of the mapped values. It is reified so that an [Encoder] can be
 *  obtained for it through `encoder<R>()`.
 * @param func Function applied to every value; it is wrapped in a [MapFunction].
 * @return A [KeyValueGroupedDataset] with the same keys and the mapped values.
 */
inline fun <KEY, VALUE, reified R> KeyValueGroupedDataset<KEY, VALUE>.mapValues(
    noinline func: (VALUE) -> R,
): KeyValueGroupedDataset<KEY, R> =
    mapValues(MapFunction(func), encoder<R>())
/**
 * (Kotlin-specific)
 * Applies the given function to each group of data. For each unique group, the function will
 * be passed the group key and an iterator that contains all the elements in the group. The
 * function can return an element of arbitrary type which will be returned as a new [Dataset].
 *
 * This function does not support partial aggregation, and as a result requires shuffling all
 * the data in the [Dataset]. If an application intends to perform an aggregation over each
 * key, it is best to use the reduce function or an
 * [org.apache.spark.sql.expressions.Aggregator].
 *
 * Internally, the implementation will spill to disk if any given group is too large to fit into
 * memory. However, users must take care to avoid materializing the whole iterator for a group
 * (for example, by calling [toList]) unless they are sure that this is possible given the memory
 * constraints of their cluster.
 *
 * @param KEY The type of the grouping key.
 * @param VALUE The type of the elements in each group.
 * @param R The type of the result produced per group. It is reified so that an [Encoder]
 *  can be obtained for it through `encoder<R>()`.
 * @param func Function called once per group with the key and an iterator over the group's
 *  elements; it is wrapped in a [MapGroupsFunction].
 * @return A [Dataset] holding one result element per group.
 */
inline fun <KEY, VALUE, reified R> KeyValueGroupedDataset<KEY, VALUE>.mapGroups(
    noinline func: (KEY, Iterator<VALUE>) -> R,
): Dataset<R> =
    mapGroups(MapGroupsFunction(func), encoder<R>())
/**
 * (Kotlin-specific)
 * Reduces the elements of each group of data using the specified binary function.
 * The given function must be commutative and associative or the result may be non-deterministic.
 *
 * Note that you need to use [reduceGroupsK] always instead of the Java- or Scala-specific
 * [KeyValueGroupedDataset.reduceGroups] to make the compiler work.
 *
 * @param KEY The type of the grouping key.
 * @param VALUE The type of the elements in each group, and of the reduced result.
 * @param func Binary function used to combine two values of a group into one; it is wrapped
 *  in a [ReduceFunction].
 * @return A [Dataset] of [Tuple2]s pairing each key with the reduced value of its group.
 */
inline fun <reified KEY, reified VALUE> KeyValueGroupedDataset<KEY, VALUE>.reduceGroupsK(
    noinline func: (VALUE, VALUE) -> VALUE,
): Dataset<Tuple2<KEY, VALUE>> =
    reduceGroups(ReduceFunction(func))
/**
 * (Kotlin-specific)
 * Applies the given function to each group of data. For each unique group, the function will
 * be passed the group key and an iterator that contains all the elements in the group. The
 * function can return an iterator containing elements of an arbitrary type which will be returned
 * as a new [Dataset].
 *
 * This function does not support partial aggregation, and as a result requires shuffling all
 * the data in the [Dataset]. If an application intends to perform an aggregation over each
 * key, it is best to use the reduce function or an
 * [org.apache.spark.sql.expressions.Aggregator].
 *
 * Internally, the implementation will spill to disk if any given group is too large to fit into
 * memory. However, users must take care to avoid materializing the whole iterator for a group
 * (for example, by calling [toList]) unless they are sure that this is possible given the memory
 * constraints of their cluster.
 *
 * @param K The type of the grouping key.
 * @param V The type of the elements in each group.
 * @param U The type of the output elements. It is reified so that an [Encoder] can be
 *  obtained for it through `encoder<U>()`.
 * @param func Function called once per group with the key and an iterator over the group's
 *  elements; it is wrapped in a [FlatMapGroupsFunction].
 * @return A [Dataset] containing all elements of the iterators returned by [func].
 */
inline fun <K, V, reified U> KeyValueGroupedDataset<K, V>.flatMapGroups(
    noinline func: (key: K, values: Iterator<V>) -> Iterator<U>,
): Dataset<U> = flatMapGroups(
    FlatMapGroupsFunction(func),
    encoder<U>(),
)
/**
 * (Kotlin-specific)
 * Applies the given function to each group of data, while maintaining a user-defined per-group
 * state. The result Dataset will represent the objects returned by the function.
 * For a static batch Dataset, the function will be invoked once per group. For a streaming
 * Dataset, the function will be invoked for each group repeatedly in every trigger, and
 * updates to each group's state will be saved across invocations.
 * See [org.apache.spark.sql.streaming.GroupState] for more details.
 *
 * @param K The type of the grouping key.
 * @param V The type of the elements in each group.
 * @param S The type of the user-defined state. Must be encodable to Spark SQL types.
 * @param U The type of the output objects. Must be encodable to Spark SQL types.
 * @param func Function to be called on every group; it is wrapped in a
 *  [MapGroupsWithStateFunction].
 * @return A [Dataset] of the objects returned by [func].
 *
 * See [Encoder] for more details on what types are encodable to Spark SQL.
 */
inline fun <K, V, reified S, reified U> KeyValueGroupedDataset<K, V>.mapGroupsWithState(
    noinline func: (key: K, values: Iterator<V>, state: GroupState<S>) -> U,
): Dataset<U> = mapGroupsWithState(
    MapGroupsWithStateFunction(func),
    encoder<S>(), // state encoder
    encoder<U>(), // output encoder
)
/**
 * (Kotlin-specific)
 * Applies the given function to each group of data, while maintaining a user-defined per-group
 * state. The result Dataset will represent the objects returned by the function.
 * For a static batch Dataset, the function will be invoked once per group. For a streaming
 * Dataset, the function will be invoked for each group repeatedly in every trigger, and
 * updates to each group's state will be saved across invocations.
 * See [org.apache.spark.sql.streaming.GroupState] for more details.
 *
 * @param K The type of the grouping key.
 * @param V The type of the elements in each group.
 * @param S The type of the user-defined state. Must be encodable to Spark SQL types.
 * @param U The type of the output objects. Must be encodable to Spark SQL types.
 * @param timeoutConf Timeout configuration for groups that do not receive data for a while.
 * @param func Function to be called on every group; it is wrapped in a
 *  [MapGroupsWithStateFunction].
 * @return A [Dataset] of the objects returned by [func].
 *
 * See [Encoder] for more details on what types are encodable to Spark SQL.
 */
inline fun <K, V, reified S, reified U> KeyValueGroupedDataset<K, V>.mapGroupsWithState(
    timeoutConf: GroupStateTimeout,
    noinline func: (key: K, values: Iterator<V>, state: GroupState<S>) -> U,
): Dataset<U> = mapGroupsWithState(
    MapGroupsWithStateFunction(func),
    encoder<S>(), // state encoder
    encoder<U>(), // output encoder
    timeoutConf,
)
/**
 * (Kotlin-specific)
 * Applies the given function to each group of data, while maintaining a user-defined per-group
 * state. The result Dataset will represent the objects returned by the function.
 * For a static batch Dataset, the function will be invoked once per group. For a streaming
 * Dataset, the function will be invoked for each group repeatedly in every trigger, and
 * updates to each group's state will be saved across invocations.
 * See [GroupState] for more details.
 *
 * @param K The type of the grouping key.
 * @param V The type of the elements in each group.
 * @param S The type of the user-defined state. Must be encodable to Spark SQL types.
 * @param U The type of the output objects. Must be encodable to Spark SQL types.
 * @param outputMode The output mode of the function.
 * @param timeoutConf Timeout configuration for groups that do not receive data for a while.
 * @param func Function to be called on every group; it is wrapped in a
 *  [FlatMapGroupsWithStateFunction].
 * @return A [Dataset] containing all elements of the iterators returned by [func].
 *
 * See [Encoder] for more details on what types are encodable to Spark SQL.
 */
inline fun <K, V, reified S, reified U> KeyValueGroupedDataset<K, V>.flatMapGroupsWithState(
    outputMode: OutputMode,
    timeoutConf: GroupStateTimeout,
    noinline func: (key: K, values: Iterator<V>, state: GroupState<S>) -> Iterator<U>,
): Dataset<U> = flatMapGroupsWithState(
    FlatMapGroupsWithStateFunction(func),
    outputMode,
    encoder<S>(), // state encoder
    encoder<U>(), // output encoder
    timeoutConf,
)
/**
 * (Kotlin-specific)
 * Applies the given function to each cogrouped data. For each unique group, the function will
 * be passed the grouping key and 2 iterators containing all elements in the group from
 * [Dataset] [this] and [other]. The function can return an iterator containing elements of an
 * arbitrary type which will be returned as a new [Dataset].
 *
 * @param K The type of the grouping key shared by both grouped datasets.
 * @param V The type of the elements in the groups of the receiver.
 * @param U The type of the elements in the groups of [other].
 * @param R The type of the output elements. It is reified so that an [Encoder] can be
 *  obtained for it through `encoder<R>()`.
 * @param other The grouped dataset to cogroup with; it must use the same key type.
 * @param func Function called per key with the elements from the receiver (`left`) and from
 *  [other] (`right`); it is wrapped in a [CoGroupFunction].
 * @return A [Dataset] containing all elements of the iterators returned by [func].
 */
inline fun <K, V, U, reified R> KeyValueGroupedDataset<K, V>.cogroup(
    other: KeyValueGroupedDataset<K, U>,
    noinline func: (key: K, left: Iterator<V>, right: Iterator<U>) -> Iterator<R>,
): Dataset<R> = cogroup(
    other,
    CoGroupFunction(func),
    encoder<R>(),
)
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy