/**
 * /!\ Warning /!\
 *
 * This file is generated using zio-spark-codegen, you should not edit
 * this file directly.
 */

package zio.spark.sql

import org.apache.spark.sql.{
  Column,
  Dataset => UnderlyingDataset,
  Encoder,
  KeyValueGroupedDataset => UnderlyingKeyValueGroupedDataset,
  RelationalGroupedDataset => UnderlyingRelationalGroupedDataset
}

final case class RelationalGroupedDataset(underlying: UnderlyingRelationalGroupedDataset) { self =>
  // scalafix:off
  implicit private def liftKeyValueGroupedDataset[K, V](
      x: UnderlyingKeyValueGroupedDataset[K, V]
  ): KeyValueGroupedDataset[K, V] = KeyValueGroupedDataset(x)
  // scalafix:on

  /**
   * Unpacks the underlying RelationalGroupedDataset into a Dataset.
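   *
   * For instance, `count` (below) is defined in terms of `unpack`:
   * {{{
   *   def count: DataFrame = unpack(_.count())
   * }}}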
   */
  def unpack[U](f: UnderlyingRelationalGroupedDataset => UnderlyingDataset[U]): Dataset[U] = Dataset(f(underlying))

  /**
   * Unpacks the underlying RelationalGroupedDataset into a Dataset. It is
   * used for transformations that can fail due to an AnalysisException.
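   *
   * For instance, the Map-based `agg` (below) is defined in terms of
   * `unpackWithAnalysis`:
   * {{{
   *   def agg(exprs: Map[String, String]): TryAnalysis[DataFrame] =
   *     unpackWithAnalysis(_.agg(exprs))
   * }}}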
   */
  def unpackWithAnalysis[U](f: UnderlyingRelationalGroupedDataset => UnderlyingDataset[U]): TryAnalysis[Dataset[U]] =
    TryAnalysis(unpack(f))

  /**
   * Applies a transformation to the underlying
   * RelationalGroupedDataset.
   */
  def transformation(
      f: UnderlyingRelationalGroupedDataset => UnderlyingRelationalGroupedDataset
  ): RelationalGroupedDataset = RelationalGroupedDataset(f(underlying))

  /**
   * Applies a transformation to the underlying
   * RelationalGroupedDataset. It is used for transformations that can
   * fail due to an AnalysisException.
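   *
   * For instance, `pivot` (below) is defined in terms of
   * `transformationWithAnalysis`:
   * {{{
   *   def pivot(pivotColumn: String): TryAnalysis[RelationalGroupedDataset] =
   *     transformationWithAnalysis(_.pivot(pivotColumn))
   * }}}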
   */
  def transformationWithAnalysis(
      f: UnderlyingRelationalGroupedDataset => UnderlyingRelationalGroupedDataset
  ): TryAnalysis[RelationalGroupedDataset] = TryAnalysis(transformation(f))

  /** Applies an action to the underlying RelationalGroupedDataset. */
  def get[U](f: UnderlyingRelationalGroupedDataset => U): U = f(underlying)

  /**
   * Applies an action to the underlying RelationalGroupedDataset. It is
   * used for actions that can fail due to an AnalysisException.
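   *
   * For instance, `as` (below) is defined in terms of `getWithAnalysis`:
   * {{{
   *   def as[K: Encoder, T: Encoder]: TryAnalysis[KeyValueGroupedDataset[K, T]] =
   *     getWithAnalysis(_.as[K, T])
   * }}}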
   */
  def getWithAnalysis[U](f: UnderlyingRelationalGroupedDataset => U): TryAnalysis[U] = TryAnalysis(f(underlying))

  // Generated functions coming from spark
  /**
   * Returns a `KeyValueGroupedDataset` where the data is grouped by the
   * grouping expressions of the current `RelationalGroupedDataset`.
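   *
   * A usage sketch (assuming a hypothetical `grouped`
   * `RelationalGroupedDataset` and an `Employee` case class with
   * encoders for both type parameters in scope):
   * {{{
   *   val typed: TryAnalysis[KeyValueGroupedDataset[String, Employee]] =
   *     grouped.as[String, Employee]
   * }}}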
   *
   * @since 3.0.0
   */
  def as[K: Encoder, T: Encoder]: TryAnalysis[KeyValueGroupedDataset[K, T]] = getWithAnalysis(_.as[K, T])

  // ===============

  /**
   * Pivots a column of the current `DataFrame` and performs the
   * specified aggregation.
   *
   * There are two versions of the `pivot` function: one that requires the
   * caller to specify the list of distinct values to pivot on, and one
   * that does not. The latter is more concise but less efficient,
   * because Spark needs to first compute the list of distinct values
   * internally.
   *
   * {{{
   *   // Compute the sum of earnings for each year by course with each course as a separate column
   *   df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
   *
   *   // Or without specifying column values (less efficient)
   *   df.groupBy("year").pivot("course").sum("earnings")
   * }}}
   *
   * @param pivotColumn
   *   Name of the column to pivot.
   * @since 1.6.0
   */
  def pivot(pivotColumn: String): TryAnalysis[RelationalGroupedDataset] =
    transformationWithAnalysis(_.pivot(pivotColumn))

  /**
   * Pivots a column of the current `DataFrame` and performs the
   * specified aggregation. There are two versions of the `pivot` function:
   * one that requires the caller to specify the list of distinct values
   * to pivot on, and one that does not. The latter is more concise but
   * less efficient, because Spark needs to first compute the list of
   * distinct values internally.
   *
   * {{{
   *   // Compute the sum of earnings for each year by course with each course as a separate column
   *   df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
   *
   *   // Or without specifying column values (less efficient)
   *   df.groupBy("year").pivot("course").sum("earnings")
   * }}}
   *
   * From Spark 3.0.0, values can be literal columns, for instance,
   * struct. For pivoting by multiple columns, use the `struct` function
   * to combine the columns and values:
   *
   * {{{
   *   df.groupBy("year")
   *     .pivot("trainingCourse", Seq(struct(lit("java"), lit("Experts"))))
   *     .agg(sum($"earnings"))
   * }}}
   *
   * @param pivotColumn
   *   Name of the column to pivot.
   * @param values
   *   List of values that will be translated to columns in the output
   *   DataFrame.
   * @since 1.6.0
   */
  def pivot(pivotColumn: String, values: Seq[Any]): TryAnalysis[RelationalGroupedDataset] =
    transformationWithAnalysis(_.pivot(pivotColumn, values))

  /**
   * Pivots a column of the current `DataFrame` and performs the
   * specified aggregation. This is an overloaded version of the `pivot`
   * method with `pivotColumn` of the `String` type.
   *
   * {{{
   *   // Or without specifying column values (less efficient)
   *   df.groupBy($"year").pivot($"course").sum($"earnings");
   * }}}
   *
   * @param pivotColumn
   *   the column to pivot.
   * @since 2.4.0
   */
  def pivot(pivotColumn: Column): TryAnalysis[RelationalGroupedDataset] =
    transformationWithAnalysis(_.pivot(pivotColumn))

  /**
   * Pivots a column of the current `DataFrame` and performs the
   * specified aggregation. This is an overloaded version of the `pivot`
   * method with `pivotColumn` of the `String` type.
   *
   * {{{
   *   // Compute the sum of earnings for each year by course with each course as a separate column
   *   df.groupBy($"year").pivot($"course", Seq("dotNET", "Java")).sum($"earnings")
   * }}}
   *
   * @param pivotColumn
   *   the column to pivot.
   * @param values
   *   List of values that will be translated to columns in the output
   *   DataFrame.
   * @since 2.4.0
   */
  def pivot(pivotColumn: Column, values: Seq[Any]): TryAnalysis[RelationalGroupedDataset] =
    transformationWithAnalysis(_.pivot(pivotColumn, values))

  // ===============

  /**
   * Count the number of rows for each group. The resulting `DataFrame`
   * will also contain the grouping columns.
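   *
   * A usage sketch (assuming a hypothetical `grouped`
   * `RelationalGroupedDataset`):
   * {{{
   *   val counts: DataFrame = grouped.count
   * }}}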
   *
   * @since 1.3.0
   */
  def count: DataFrame = unpack(_.count())

  // ===============

  /**
   * Compute aggregates by specifying the column names and aggregate
   * methods. The resulting `DataFrame` will also contain the grouping
   * columns.
   *
   * The available aggregate methods are `avg`, `max`, `min`, `sum`,
   * `count`.
   * {{{
   *   // Selects the age of the oldest employee and the aggregate expense for each department
   *   df.groupBy("department").agg(
   *     "age" -> "max",
   *     "expense" -> "sum"
   *   )
   * }}}
   *
   * @since 1.3.0
   */
  def agg(aggExpr: (String, String), aggExprs: (String, String)*): TryAnalysis[DataFrame] =
    unpackWithAnalysis(_.agg(aggExpr, aggExprs: _*))

  /**
   * Compute aggregates by specifying a map from column name to
   * aggregate methods. The resulting `DataFrame` will also contain the
   * grouping columns.
   *
   * The available aggregate methods are `avg`, `max`, `min`, `sum`,
   * `count`.
   * {{{
   *   // Selects the age of the oldest employee and the aggregate expense for each department
   *   df.groupBy("department").agg(Map(
   *     "age" -> "max",
   *     "expense" -> "sum"
   *   ))
   * }}}
   *
   * @since 1.3.0
   */
  def agg(exprs: Map[String, String]): TryAnalysis[DataFrame] = unpackWithAnalysis(_.agg(exprs))

  /**
   * Compute aggregates by specifying a series of aggregate columns.
   * Note that this function by default retains the grouping columns in
   * its output. To not retain grouping columns, set
   * `spark.sql.retainGroupColumns` to false.
   *
   * The available aggregate methods are defined in
   * [[org.apache.spark.sql.functions]].
   *
   * {{{
   *   // Selects the age of the oldest employee and the aggregate expense for each department
   *
   *   // Scala:
   *   import org.apache.spark.sql.functions._
   *   df.groupBy("department").agg(max("age"), sum("expense"))
   *
   *   // Java:
   *   import static org.apache.spark.sql.functions.*;
   *   df.groupBy("department").agg(max("age"), sum("expense"));
   * }}}
   *
   * Note that before Spark 1.4, the default behavior is to NOT retain
   * grouping columns. To change to that behavior, set config variable
   * `spark.sql.retainGroupColumns` to `false`.
   * {{{
   *   // Scala, 1.3.x:
   *   df.groupBy("department").agg($"department", max("age"), sum("expense"))
   *
   *   // Java, 1.3.x:
   *   df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
   * }}}
   *
   * @since 1.3.0
   */
  def agg(expr: Column, exprs: Column*): TryAnalysis[DataFrame] = unpackWithAnalysis(_.agg(expr, exprs: _*))

  /**
   * Compute the mean value for each numeric column for each group. The
   * resulting `DataFrame` will also contain the grouping columns. When
   * specified columns are given, only compute the mean values for them.
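   *
   * A usage sketch (assuming a hypothetical `grouped`
   * `RelationalGroupedDataset` over an "expense" column; `max`, `mean`,
   * `min` and `sum` below follow the same shape):
   * {{{
   *   val averages: TryAnalysis[DataFrame] = grouped.avg("expense")
   * }}}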
   *
   * @since 1.3.0
   */
  def avg(colNames: String*): TryAnalysis[DataFrame] = unpackWithAnalysis(_.avg(colNames: _*))

  /**
   * Compute the max value for each numeric column for each group. The
   * resulting `DataFrame` will also contain the grouping columns. When
   * specified columns are given, only compute the max values for them.
   *
   * @since 1.3.0
   */
  def max(colNames: String*): TryAnalysis[DataFrame] = unpackWithAnalysis(_.max(colNames: _*))

  /**
   * Compute the average value for each numeric column for each group.
   * This is an alias for `avg`. The resulting `DataFrame` will also
   * contain the grouping columns. When specified columns are given,
   * only compute the average values for them.
   *
   * @since 1.3.0
   */
  def mean(colNames: String*): TryAnalysis[DataFrame] = unpackWithAnalysis(_.mean(colNames: _*))

  /**
   * Compute the min value for each numeric column for each group. The
   * resulting `DataFrame` will also contain the grouping columns. When
   * specified columns are given, only compute the min values for them.
   *
   * @since 1.3.0
   */
  def min(colNames: String*): TryAnalysis[DataFrame] = unpackWithAnalysis(_.min(colNames: _*))

  /**
   * Compute the sum for each numeric column for each group. The
   * resulting `DataFrame` will also contain the grouping columns. When
   * specified columns are given, only compute the sum for them.
   *
   * @since 1.3.0
   */
  def sum(colNames: String*): TryAnalysis[DataFrame] = unpackWithAnalysis(_.sum(colNames: _*))

  // ===============

  // Ignored methods
  //
  // [[org.apache.spark.sql.RelationalGroupedDataset.toString]]
}