/**
* /!\ Warning /!\
*
* This file is generated using zio-spark-codegen, you should not edit
* this file directly.
*/
package zio.spark.sql
import org.apache.spark.sql.{
Column,
DataFrameNaFunctions => UnderlyingDataFrameNaFunctions,
DataFrameStatFunctions => UnderlyingDataFrameStatFunctions,
Dataset => UnderlyingDataset,
Encoder,
KeyValueGroupedDataset => UnderlyingKeyValueGroupedDataset,
RelationalGroupedDataset => UnderlyingRelationalGroupedDataset,
Row,
Sniffer,
TypedColumn
}
import org.apache.spark.sql.execution.ExplainMode
import org.apache.spark.sql.types.Metadata
import org.apache.spark.sql.types.StructType
import org.apache.spark.storage.StorageLevel
import zio._
import zio.spark.rdd._
import zio.spark.sql.streaming.DataStreamWriter
import scala.jdk.CollectionConverters._
import scala.reflect.runtime.universe.TypeTag
import java.io.IOException
final case class Dataset[T](underlying: UnderlyingDataset[T]) { self =>
// scalafix:off
implicit private def lift[U](x: UnderlyingDataset[U]): Dataset[U] = Dataset(x)
implicit private def iteratorConversion[U](iterator: java.util.Iterator[U]): Iterator[U] = iterator.asScala
implicit private def liftDataFrameNaFunctions(x: UnderlyingDataFrameNaFunctions): DataFrameNaFunctions =
DataFrameNaFunctions(x)
implicit private def liftDataFrameStatFunctions(x: UnderlyingDataFrameStatFunctions): DataFrameStatFunctions =
DataFrameStatFunctions(x)
implicit private def liftRelationalGroupedDataset[U](
x: UnderlyingRelationalGroupedDataset
): RelationalGroupedDataset = RelationalGroupedDataset(x)
implicit private def liftKeyValueGroupedDataset[K, V](
x: UnderlyingKeyValueGroupedDataset[K, V]
): KeyValueGroupedDataset[K, V] = KeyValueGroupedDataset(x)
// scalafix:on
/** Applies an action to the underlying Dataset. */
def action[U](f: UnderlyingDataset[T] => U)(implicit trace: Trace): Task[U] = ZIO.attempt(get(f))
/** Applies a transformation to the underlying Dataset. */
def transformation[TNew](f: UnderlyingDataset[T] => UnderlyingDataset[TNew]): Dataset[TNew] = Dataset(f(underlying))
/**
* Applies a transformation to the underlying Dataset. It is used for
* transformations that can fail with an AnalysisException.
*/
def transformationWithAnalysis[TNew](f: UnderlyingDataset[T] => UnderlyingDataset[TNew]): TryAnalysis[Dataset[TNew]] =
TryAnalysis(transformation(f))
/** Applies an action to the underlying Dataset. */
def get[U](f: UnderlyingDataset[T] => U): U = f(underlying)
/**
* Applies an action to the underlying Dataset. It is used for
* operations that can fail with an AnalysisException.
*/
def getWithAnalysis[U](f: UnderlyingDataset[T] => U): TryAnalysis[U] = TryAnalysis(f(underlying))
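// A minimal sketch of how these wrappers compose (the `Person` type and the
// `people` value are hypothetical, not part of this file):
//
//   final case class Person(name: String, age: Long)
//   def example(people: Dataset[Person])(implicit trace: Trace): Task[Long] = {
//     val limited: Dataset[Person] = people.transformation(_.limit(10)) // pure wrapping, no job yet
//     limited.action(_.count())                                         // effectful Spark action
//   }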
// Handmade functions specific to zio-spark
/**
* Prints the plans (logical and physical) with a format specified by
* a given explain mode.
*
* @param mode
*   specifies the expected output format of plans.
*   - `simple`: Print only a physical plan.
*   - `extended`: Print both logical and physical plans.
*   - `codegen`: Print a physical plan and generated codes if they are available.
*   - `cost`: Print a logical plan and statistics if they are available.
*   - `formatted`: Split explain output into two sections: a physical plan outline and node details.
* @group basic
* @since 3.0.0
*/
def explain(mode: String)(implicit trace: Trace): SIO[Unit] = explain(ExplainMode.fromString(mode))
/**
* Prints the plans (logical and physical) with a format specified by
* a given explain mode.
*
* @group basic
* @since 3.0.0
*/
def explain(mode: ExplainMode)(implicit trace: Trace): SIO[Unit] =
for {
ss <- ZIO.service[SparkSession]
plan <- ss.withActive(underlying.queryExecution.explainString(mode))
_ <- Console.printLine(plan)
} yield ()
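// A usage sketch (`ds` is a hypothetical Dataset already in scope):
//
//   val printPlan: SIO[Unit] = ds.explain("formatted")
//
// The effect is an SIO because, as the implementation above shows, explaining a
// plan needs a SparkSession from the ZIO environment.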
/** Alias for [[headOption]]. */
def firstOption(implicit trace: Trace): Task[Option[T]] = headOption
/** Transforms the Dataset into a RelationalGroupedDataset. */
def group(f: UnderlyingDataset[T] => UnderlyingRelationalGroupedDataset): RelationalGroupedDataset =
RelationalGroupedDataset(f(underlying))
/**
* Groups the Dataset using the specified columns, so we can run
* aggregations on them.
*
* See [[UnderlyingDataset.groupBy]] for more information.
*/
def groupBy(cols: Column*): RelationalGroupedDataset = group(_.groupBy(cols: _*))
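// For example (a sketch; assumes the usual Spark implicits for `$` are in scope):
//
//   val byDepartment: RelationalGroupedDataset = ds.groupBy($"department")
//
// Aggregations can then be expressed on the grouped data, e.g. `byDepartment.avg()`,
// as in the `cube`/`rollup` examples further below.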
/** Takes the first element of a dataset or None. */
def headOption(implicit trace: Trace): Task[Option[T]] = head(1).map(_.headOption)
/** Alias for [[tail]]. */
def last(implicit trace: Trace): Task[T] = tail
/** Alias for [[tailOption]]. */
def lastOption(implicit trace: Trace): Task[Option[T]] = tailOption
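// Unlike `first`/`last`, these optional variants return `None` on an empty
// Dataset instead of failing; a sketch (`ds` is hypothetical):
//
//   val maybeFirst: Task[Option[T]] = ds.firstOption
//   val maybeLast: Task[Option[T]]  = ds.lastOption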
/**
* Prints the schema to the console in a nice tree format.
*
* @group basic
* @since 1.6.0
*/
def printSchema(implicit trace: Trace): IO[IOException, Unit] = printSchema(Int.MaxValue)
/**
* Prints the schema up to the given level to the console in a nice
* tree format.
*
* @group basic
* @since 3.0.0
*/
def printSchema(level: Int)(implicit trace: Trace): IO[IOException, Unit] =
Console.printLine(schema.treeString(level))
/**
* Transform the dataset into a [[RDD]].
*
* See [[UnderlyingDataset.rdd]] for more information.
*/
def rdd: RDD[T] = RDD(get(_.rdd))
/**
* Displays the top rows of the Dataset in a tabular form. Strings
* with more than 20 characters will be truncated.
*
* See [[UnderlyingDataset.show]] for more information.
*/
def show(numRows: Int)(implicit trace: Trace): IO[IOException, Unit] = show(numRows, truncate = true)
/**
* Displays the top 20 rows of the Dataset in a tabular form. Strings
* with more than 20 characters will be truncated.
*
* See [[UnderlyingDataset.show]] for more information.
*/
def show(implicit trace: Trace): IO[IOException, Unit] = show(20)
/**
* Displays the top 20 rows of the Dataset in a tabular form.
*
* See [[UnderlyingDataset.show]] for more information.
*/
def show(truncate: Boolean)(implicit trace: Trace): IO[IOException, Unit] = show(20, truncate)
/**
* Displays the top rows of the Dataset in a tabular form.
*
* See [[UnderlyingDataset.show]] for more information.
*/
def show(numRows: Int, truncate: Boolean)(implicit trace: Trace): IO[IOException, Unit] = {
val trunc = if (truncate) 20 else 0
val stringifiedDf = Sniffer.datasetShowString(underlying, numRows, truncate = trunc)
Console.printLine(stringifiedDf)
}
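// A usage sketch (`ds` is a hypothetical Dataset already in scope):
//
//   val display: IO[IOException, Unit] = ds.show(5, truncate = false)
//
// Note that `show` is an effect: nothing is printed until the returned value is run.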
/**
* Computes specified statistics for numeric and string columns.
*
* See [[org.apache.spark.sql.Dataset.summary]] for more information.
*/
def summary(statistics: Statistics*)(implicit d: DummyImplicit): DataFrame =
self.summary(statistics.map(_.toString): _*)
/**
* Takes the last element of a dataset or throws an exception.
*
* See [[Dataset.tail]] for more information.
*/
def tail(implicit trace: Trace): Task[T] = self.tail(1).map(_.head)
/** Takes the last element of a dataset or None. */
def tailOption(implicit trace: Trace): Task[Option[T]] = self.tail(1).map(_.headOption)
/** Alias for [[tail]]. */
def takeRight(n: Int)(implicit trace: Trace): Task[Seq[T]] = self.tail(n)
/**
* Chains custom transformations.
*
* See [[UnderlyingDataset.transform]] for more information.
*/
def transform[U](t: Dataset[T] => Dataset[U]): Dataset[U] = t(self)
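// A sketch of chaining reusable transformations (the `Person` type and the helper
// functions are hypothetical):
//
//   def keepAdults(ds: Dataset[Person]): Dataset[Person] = ds.filter(_.age >= 18)
//   def firstTen(ds: Dataset[Person]): Dataset[Person]   = ds.limit(10)
//
//   val pipeline: Dataset[Person] = people.transform(keepAdults).transform(firstTen)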
/**
* Mark the Dataset as non-persistent, and remove all blocks for it
* from memory and disk in a blocking way.
*
* See [[UnderlyingDataset.unpersist]] for more information.
*/
def unpersistBlocking(implicit trace: Trace): UIO[Dataset[T]] =
ZIO.succeed(transformation(_.unpersist(blocking = true)))
/** Alias for [[filter]]. */
def where(f: T => Boolean): Dataset[T] = filter(f)
/** Create a DataFrameWriter from this dataset. */
def write: DataFrameWriter[T] = DataFrameWriter(self)
/** Create a DataStreamWriter from this dataset. */
def writeStream: DataStreamWriter[T] = DataStreamWriter(self)
// Generated functions coming from spark
/**
* Returns all column names as an array.
*
* @group basic
* @since 1.6.0
*/
def columns: Seq[String] = get(_.columns.toSeq)
/**
* (Scala-specific) Returns a [[KeyValueGroupedDataset]] where the
* data is grouped by the given key `func`.
*
* @group typedrel
* @since 2.0.0
*/
def groupByKey[K: Encoder](func: T => K): KeyValueGroupedDataset[K, T] = get(_.groupByKey[K](func))
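// A sketch (assumes a hypothetical `Person` with a `department` field and an
// implicit Encoder[String] in scope, e.g. Spark's built-in encoders):
//
//   val byDept: KeyValueGroupedDataset[String, Person] = people.groupByKey(_.department)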
// scalastyle:on println
/**
* Returns a [[DataFrameNaFunctions]] for working with missing data.
* {{{
* // Dropping rows containing any null values.
* ds.na.drop()
* }}}
*
* @group untypedrel
* @since 1.6.0
*/
def na: DataFrameNaFunctions = get(_.na)
/**
* Returns the schema of this Dataset.
*
* @group basic
* @since 1.6.0
*/
def schema: StructType = get(_.schema)
/**
* Returns a [[DataFrameStatFunctions]] for working with statistic
* functions.
* {{{
* // Finding frequent items in column with name 'a'.
* ds.stat.freqItems(Seq("a"))
* }}}
*
* @group untypedrel
* @since 1.6.0
*/
def stat: DataFrameStatFunctions = get(_.stat)
// ===============
/**
* Selects column based on the column name and returns it as a
* [[Column]].
*
* @note
* The column name can also refer to a nested column like `a.b`.
*
* @group untypedrel
* @since 2.0.0
*/
def col(colName: String): TryAnalysis[Column] = getWithAnalysis(_.col(colName))
/**
* Selects column based on the column name specified as a regex and
* returns it as [[Column]].
* @group untypedrel
* @since 2.3.0
*/
def colRegex(colName: String): TryAnalysis[Column] = getWithAnalysis(_.colRegex(colName))
/**
* Create a multi-dimensional cube for the current Dataset using the
* specified columns, so we can run aggregation on them. See
* [[RelationalGroupedDataset]] for all the available aggregate
* functions.
*
* {{{
* // Compute the average for all numeric columns cubed by department and group.
* ds.cube($"department", $"group").avg()
*
* // Compute the max age and average salary, cubed by department and gender.
* ds.cube($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def cube(cols: Column*): TryAnalysis[RelationalGroupedDataset] = getWithAnalysis(_.cube(cols: _*))
/**
* Create a multi-dimensional cube for the current Dataset using the
* specified columns, so we can run aggregation on them. See
* [[RelationalGroupedDataset]] for all the available aggregate
* functions.
*
* This is a variant of cube that can only group by existing columns
* using column names (i.e. cannot construct expressions).
*
* {{{
* // Compute the average for all numeric columns cubed by department and group.
* ds.cube("department", "group").avg()
*
* // Compute the max age and average salary, cubed by department and gender.
* ds.cube($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
* @group untypedrel
* @since 2.0.0
*/
def cube(col1: String, cols: String*): TryAnalysis[RelationalGroupedDataset] = getWithAnalysis(_.cube(col1, cols: _*))
/**
* Create a multi-dimensional rollup for the current Dataset using the
* specified columns, so we can run aggregation on them. See
* [[RelationalGroupedDataset]] for all the available aggregate
* functions.
*
* {{{
* // Compute the average for all numeric columns rolled up by department and group.
* ds.rollup($"department", $"group").avg()
*
* // Compute the max age and average salary, rolled up by department and gender.
* ds.rollup($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def rollup(cols: Column*): TryAnalysis[RelationalGroupedDataset] = getWithAnalysis(_.rollup(cols: _*))
/**
* Create a multi-dimensional rollup for the current Dataset using the
* specified columns, so we can run aggregation on them. See
* [[RelationalGroupedDataset]] for all the available aggregate
* functions.
*
* This is a variant of rollup that can only group by existing columns
* using column names (i.e. cannot construct expressions).
*
* {{{
* // Compute the average for all numeric columns rolled up by department and group.
* ds.rollup("department", "group").avg()
*
* // Compute the max age and average salary, rolled up by department and gender.
* ds.rollup($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def rollup(col1: String, cols: String*): TryAnalysis[RelationalGroupedDataset] =
getWithAnalysis(_.rollup(col1, cols: _*))
// ===============
/**
* Returns an array that contains all rows in this Dataset.
*
* Running collect requires moving all the data into the application's
* driver process, and doing so on a very large dataset can crash the
* driver process with OutOfMemoryError.
*
* For Java API, use [[collectAsList]].
*
* @group action
* @since 1.6.0
*/
def collect(implicit trace: Trace): Task[Seq[T]] = action(_.collect().toSeq)
/**
* Returns the number of rows in the Dataset.
* @group action
* @since 1.6.0
*/
def count(implicit trace: Trace): Task[Long] = action(_.count())
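// Actions return ZIO effects, so they compose with the rest of a ZIO program;
// a sketch (`ds` is hypothetical):
//
//   val report: Task[Unit] =
//     for {
//       n    <- ds.count
//       rows <- ds.take(5)
//       _    <- ZIO.debug(s"$n rows, first five: $rows")
//     } yield ()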
/**
* Returns the first row. Alias for head().
* @group action
* @since 1.6.0
*/
def first(implicit trace: Trace): Task[T] = action(_.first())
/**
* Applies a function `f` to all rows.
*
* @group action
* @since 1.6.0
*/
def foreach(f: T => Unit)(implicit trace: Trace): Task[Unit] = action(_.foreach(f))
/**
* Applies a function `f` to each partition of this Dataset.
*
* @group action
* @since 1.6.0
*/
def foreachPartition(f: Iterator[T] => Unit)(implicit trace: Trace): Task[Unit] = action(_.foreachPartition(f))
/**
* Returns the first `n` rows.
*
* @note
* this method should only be used if the resulting array is
* expected to be small, as all the data is loaded into the driver's
* memory.
*
* @group action
* @since 1.6.0
*/
def head(n: => Int)(implicit trace: Trace): Task[Seq[T]] = action(_.head(n).toSeq)
/**
* Returns the first row.
* @group action
* @since 1.6.0
*/
def head(implicit trace: Trace): Task[T] = action(_.head())
/**
* Returns true if the `Dataset` is empty.
*
* @group basic
* @since 2.4.0
*/
def isEmpty(implicit trace: Trace): Task[Boolean] = action(_.isEmpty)
/**
* (Scala-specific) Reduces the elements of this Dataset using the
* specified binary function. The given `func` must be commutative and
* associative or the result may be non-deterministic.
*
* @group action
* @since 1.6.0
*/
def reduce(func: (T, T) => T)(implicit trace: Trace): Task[T] = action(_.reduce(func))
/**
* Returns the last `n` rows in the Dataset.
*
* Running tail requires moving data into the application's driver
* process, and doing so with a very large `n` can crash the driver
* process with OutOfMemoryError.
*
* @group action
* @since 3.0.0
*/
def tail(n: => Int)(implicit trace: Trace): Task[Seq[T]] = action(_.tail(n).toSeq)
/**
* Returns the first `n` rows in the Dataset.
*
* Running take requires moving data into the application's driver
* process, and doing so with a very large `n` can crash the driver
* process with OutOfMemoryError.
*
* @group action
* @since 1.6.0
*/
def take(n: => Int)(implicit trace: Trace): Task[Seq[T]] = action(_.take(n).toSeq)
/**
* Returns an iterator that contains all rows in this Dataset.
*
* The iterator will consume as much memory as the largest partition
* in this Dataset.
*
* @note
* this results in multiple Spark jobs, and if the input Dataset is
* the result of a wide transformation (e.g. join with different
* partitioners), the input Dataset should be cached first to avoid
* recomputing it.
*
* @group action
* @since 2.0.0
*/
def toLocalIterator(implicit trace: Trace): Task[Iterator[T]] = action(_.toLocalIterator())
// ===============
/**
* Persist this Dataset with the default storage level
* (`MEMORY_AND_DISK`).
*
* @group basic
* @since 1.6.0
*/
def cache(implicit trace: Trace): Task[Dataset[T]] = action(_.cache())
/**
* Eagerly checkpoint a Dataset and return the new Dataset.
* Checkpointing can be used to truncate the logical plan of this
* Dataset, which is especially useful in iterative algorithms where
* the plan may grow exponentially. It will be saved to files inside
* the checkpoint directory set with `SparkContext#setCheckpointDir`.
*
* @group basic
* @since 2.1.0
*/
def checkpoint(implicit trace: Trace): Task[Dataset[T]] = action(_.checkpoint())
/**
* Returns a checkpointed version of this Dataset. Checkpointing can
* be used to truncate the logical plan of this Dataset, which is
* especially useful in iterative algorithms where the plan may grow
* exponentially. It will be saved to files inside the checkpoint
* directory set with `SparkContext#setCheckpointDir`.
*
* @group basic
* @since 2.1.0
*/
def checkpoint(eager: => Boolean)(implicit trace: Trace): Task[Dataset[T]] = action(_.checkpoint(eager))
/**
* Creates a global temporary view using the given name. The lifetime
* of this temporary view is tied to this Spark application.
*
* Global temporary view is cross-session. Its lifetime is the
* lifetime of the Spark application,
* i.e. it will be automatically dropped when the application
* terminates. It's tied to a system-preserved database `global_temp`,
* and we must use the qualified name to refer to a global temp view,
* e.g. `SELECT * FROM global_temp.view1`.
*
* @throws AnalysisException
* if the view name is invalid or already exists
*
* @group basic
* @since 2.1.0
*/
def createGlobalTempView(viewName: => String)(implicit trace: Trace): Task[Unit] =
action(_.createGlobalTempView(viewName))
/**
* Creates or replaces a global temporary view using the given name.
* The lifetime of this temporary view is tied to this Spark
* application.
*
* Global temporary view is cross-session. Its lifetime is the
* lifetime of the Spark application,
* i.e. it will be automatically dropped when the application
* terminates. It's tied to a system-preserved database `global_temp`,
* and we must use the qualified name to refer to a global temp view,
* e.g. `SELECT * FROM global_temp.view1`.
*
* @group basic
* @since 2.2.0
*/
def createOrReplaceGlobalTempView(viewName: => String)(implicit trace: Trace): Task[Unit] =
action(_.createOrReplaceGlobalTempView(viewName))
/**
* Creates or replaces a local temporary view using the given name.
* The lifetime of this temporary view is tied to the [[SparkSession]]
* that was used to create this Dataset.
*
* @group basic
* @since 2.0.0
*/
def createOrReplaceTempView(viewName: => String)(implicit trace: Trace): Task[Unit] =
action(_.createOrReplaceTempView(viewName))
/**
* Creates a local temporary view using the given name. The lifetime
* of this temporary view is tied to the [[SparkSession]] that was
* used to create this Dataset.
*
* Local temporary view is session-scoped. Its lifetime is the
* lifetime of the session that created it, i.e. it will be
* automatically dropped when the session terminates. It's not tied to
* any databases, i.e. we can't use `db1.view1` to reference a local
* temporary view.
*
* @throws AnalysisException
* if the view name is invalid or already exists
*
* @group basic
* @since 2.0.0
*/
def createTempView(viewName: => String)(implicit trace: Trace): Task[Unit] = action(_.createTempView(viewName))
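// A usage sketch: register the Dataset so it can be referenced from SQL by name
// (the view name "people" is illustrative):
//
//   val register: Task[Unit] = ds.createOrReplaceTempView("people")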
/**
* Returns all column names and their data types as an array.
*
* @group basic
* @since 1.6.0
*/
def dtypes(implicit trace: Trace): Task[Seq[(String, String)]] = action(_.dtypes.toSeq)
/**
* Returns a best-effort snapshot of the files that compose this
* Dataset. This method simply asks each constituent BaseRelation for
* its respective files and takes the union of all results. Depending
* on the source relations, this may not find all input files.
* Duplicates are removed.
*
* @group basic
* @since 2.0.0
*/
def inputFiles(implicit trace: Trace): Task[Seq[String]] = action(_.inputFiles.toSeq)
/**
* Returns true if the `collect` and `take` methods can be run locally
* (without any Spark executors).
*
* @group basic
* @since 1.6.0
*/
def isLocal(implicit trace: Trace): Task[Boolean] = action(_.isLocal)
/**
* Returns true if this Dataset contains one or more sources that
* continuously return data as it arrives. A Dataset that reads data
* from a streaming source must be executed as a `StreamingQuery`
* using the `start()` method in `DataStreamWriter`. Methods that
* return a single answer, e.g. `count()` or `collect()`, will throw
* an [[AnalysisException]] when there is a streaming source present.
*
* @group streaming
* @since 2.0.0
*/
def isStreaming(implicit trace: Trace): Task[Boolean] = action(_.isStreaming)
/**
* Eagerly locally checkpoints a Dataset and returns the new Dataset.
* Checkpointing can be used to truncate the logical plan of this
* Dataset, which is especially useful in iterative algorithms where
* the plan may grow exponentially. Local checkpoints are written to
* executor storage and, despite being potentially faster, they are
* unreliable and may compromise job completion.
*
* @group basic
* @since 2.3.0
*/
def localCheckpoint(implicit trace: Trace): Task[Dataset[T]] = action(_.localCheckpoint())
/**
* Locally checkpoints a Dataset and returns the new Dataset.
* Checkpointing can be used to truncate the logical plan of this
* Dataset, which is especially useful in iterative algorithms where
* the plan may grow exponentially. Local checkpoints are written to
* executor storage and, despite being potentially faster, they are
* unreliable and may compromise job completion.
*
* @group basic
* @since 2.3.0
*/
def localCheckpoint(eager: => Boolean)(implicit trace: Trace): Task[Dataset[T]] = action(_.localCheckpoint(eager))
/**
* Persist this Dataset with the default storage level
* (`MEMORY_AND_DISK`).
*
* @group basic
* @since 1.6.0
*/
def persist(implicit trace: Trace): Task[Dataset[T]] = action(_.persist())
/**
* Persist this Dataset with the given storage level.
* @param newLevel
* One of: `MEMORY_ONLY`, `MEMORY_AND_DISK`, `MEMORY_ONLY_SER`,
* `MEMORY_AND_DISK_SER`, `DISK_ONLY`, `MEMORY_ONLY_2`,
* `MEMORY_AND_DISK_2`, etc.
*
* @group basic
* @since 1.6.0
*/
def persist(newLevel: => StorageLevel)(implicit trace: Trace): Task[Dataset[T]] = action(_.persist(newLevel))
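// A sketch of choosing an explicit storage level (`StorageLevel` is imported above):
//
//   val cached: Task[Dataset[T]] = ds.persist(StorageLevel.MEMORY_ONLY)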
/**
* Registers this Dataset as a temporary table using the given name.
* The lifetime of this temporary table is tied to the
* [[SparkSession]] that was used to create this Dataset.
*
* @group basic
* @since 1.6.0
*/
@deprecated("Use createOrReplaceTempView(viewName) instead.", "2.0.0")
def registerTempTable(tableName: => String)(implicit trace: Trace): Task[Unit] =
action(_.registerTempTable(tableName))
/**
* Get the Dataset's current storage level, or StorageLevel.NONE if
* not persisted.
*
* @group basic
* @since 2.1.0
*/
def storageLevel(implicit trace: Trace): Task[StorageLevel] = action(_.storageLevel)
/**
* Mark the Dataset as non-persistent, and remove all blocks for it
* from memory and disk. This will not un-persist any cached data that
* is built upon this Dataset.
*
* @param blocking
* Whether to block until all blocks are deleted.
*
* @group basic
* @since 1.6.0
*/
def unpersist(blocking: => Boolean)(implicit trace: Trace): Task[Dataset[T]] = action(_.unpersist(blocking))
/**
* Mark the Dataset as non-persistent, and remove all blocks for it
* from memory and disk. This will not un-persist any cached data that
* is built upon this Dataset.
*
* @group basic
* @since 1.6.0
*/
def unpersist(implicit trace: Trace): Task[Dataset[T]] = action(_.unpersist())
// ===============
/**
* Returns a new Dataset with an alias set. Same as `as`.
*
* @group typedrel
* @since 2.0.0
*/
def alias(alias: String): Dataset[T] = transformation(_.alias(alias))
/**
* Returns a new Dataset with an alias set. Same as `as`.
*
* @group typedrel
* @since 2.0.0
*/
def alias(alias: Symbol): Dataset[T] = transformation(_.alias(alias))
/**
* Returns a new Dataset with an alias set.
*
* @group typedrel
* @since 1.6.0
*/
def as(alias: String): Dataset[T] = transformation(_.as(alias))
/**
* Returns a new Dataset with an alias set.
*
* @group typedrel
* @since 2.0.0
*/
def as(alias: Symbol): Dataset[T] = transformation(_.as(alias))
/**
* Returns a new Dataset that has exactly `numPartitions` partitions,
* when fewer partitions are requested. If a larger number of
* partitions is requested, it will stay at the current number of
* partitions. Similar to coalesce defined on an `RDD`, this operation
* results in a narrow dependency, e.g. if you go from 1000 partitions
* to 100 partitions, there will not be a shuffle, instead each of the
* 100 new partitions will claim 10 of the current partitions.
*
* However, if you're doing a drastic coalesce, e.g. to
* {{{numPartitions = 1}}}, this may result in your computation taking
* place on fewer nodes than you like (e.g. one node in the case of
* {{{numPartitions = 1}}}). To avoid this, you can call repartition.
* This will add a shuffle step, but means the current upstream
* partitions will be executed in parallel (per whatever the current
* partitioning is).
*
* @group typedrel
* @since 1.6.0
*/
def coalesce(numPartitions: Int): Dataset[T] = transformation(_.coalesce(numPartitions))
/**
* Explicit cartesian join with another `DataFrame`.
*
* @param right
* Right side of the join operation.
*
* @note
* Cartesian joins are very expensive without an extra filter that
* can be pushed down.
*
* @group untypedrel
* @since 2.1.0
*/
def crossJoin(right: Dataset[_]): DataFrame = transformation(_.crossJoin(right.underlying))
/**
* Returns a new Dataset that contains only the unique rows from this
* Dataset. This is an alias for `dropDuplicates`.
*
* Note that for a streaming [[Dataset]], this method returns distinct
* rows only once regardless of the output mode, so the behavior may
* not be the same as `DISTINCT` in SQL against a streaming
* [[Dataset]].
*
* @note
* Equality checking is performed directly on the encoded
* representation of the data and thus is not affected by a custom
* `equals` function defined on `T`.
*
* @group typedrel
* @since 2.0.0
*/
def distinct: Dataset[T] = transformation(_.distinct())
/**
* Returns a new Dataset with a column dropped. This is a no-op if the
* schema doesn't contain the column name.
*
* This method can only be used to drop top level columns. The colName
* string is treated literally without further interpretation.
*
* @group untypedrel
* @since 2.0.0
*/
def drop(colName: String): DataFrame = transformation(_.drop(colName))
/**
* Returns a new Dataset with columns dropped. This is a no-op if the
* schema doesn't contain the column name(s).
*
* This method can only be used to drop top level columns. The colName
* strings are treated literally without further interpretation.
*
* @group untypedrel
* @since 2.0.0
*/
def drop(colNames: String*): DataFrame = transformation(_.drop(colNames: _*))
/**
* Returns a new Dataset with a column dropped. This version of drop
* accepts a [[Column]] rather than a name. This is a no-op if the
* Dataset doesn't have a column with an equivalent expression.
*
* @group untypedrel
* @since 2.0.0
*/
def drop(col: Column): DataFrame = transformation(_.drop(col))
/**
* Returns a new Dataset that contains only the unique rows from this
* Dataset. This is an alias for `distinct`.
*
* For a static batch [[Dataset]], it just drops duplicate rows. For a
* streaming [[Dataset]], it will keep all data across triggers as
* intermediate state to drop duplicate rows. You can use
* [[withWatermark]] to limit how late the duplicate data can be, and
* the system will accordingly limit the state. In addition, data
* older than the watermark will be dropped to avoid any possibility
* of duplicates.
*
* @group typedrel
* @since 2.0.0
*/
def dropDuplicates: Dataset[T] = transformation(_.dropDuplicates())
/**
* Returns a new Dataset containing rows in this Dataset but not in
* another Dataset. This is equivalent to `EXCEPT DISTINCT` in SQL.
*
* @note
* Equality checking is performed directly on the encoded
* representation of the data and thus is not affected by a custom
* `equals` function defined on `T`.
*
* @group typedrel
* @since 2.0.0
*/
def except(other: Dataset[T]): Dataset[T] = transformation(_.except(other.underlying))
/**
* Returns a new Dataset containing rows in this Dataset but not in
* another Dataset while preserving the duplicates. This is equivalent
* to `EXCEPT ALL` in SQL.
*
* @note
* Equality checking is performed directly on the encoded
* representation of the data and thus is not affected by a custom
* `equals` function defined on `T`. Also as standard in SQL, this
* function resolves columns by position (not by name).
*
* @group typedrel
* @since 2.4.0
*/
def exceptAll(other: Dataset[T]): Dataset[T] = transformation(_.exceptAll(other.underlying))
/**
* Returns a new Dataset where each row has been expanded to zero or
* more rows by the provided function. This is similar to a `LATERAL
* VIEW` in HiveQL. The columns of the input row are implicitly joined
* with each row that is output by the function.
*
* Given that this is deprecated, as an alternative, you can explode
* columns either using `functions.explode()` or `flatMap()`. The
* following example uses these alternatives to count the number of
* books that contain a given word:
*
* {{{
* case class Book(title: String, words: String)
* val ds: Dataset[Book]
*
* val allWords = ds.select($"title", explode(split($"words", " ")).as("word"))
*
* val bookCountPerWord = allWords.groupBy("word").agg(count_distinct("title"))
* }}}
*
* Using `flatMap()` this can similarly be exploded as:
*
* {{{
* ds.flatMap(_.words.split(" "))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@deprecated("use flatMap() or select() with functions.explode() instead", "2.0.0")
def explode[A <: Product: TypeTag](input: Column*)(f: Row => TraversableOnce[A]): DataFrame =
transformation(_.explode[A](input: _*)(f))
/**
* (Scala-specific) Returns a new Dataset that only contains elements
* where `func` returns `true`.
*
* @group typedrel
* @since 1.6.0
*/
def filter(func: T => Boolean): Dataset[T] = transformation(_.filter(func))
/**
* (Scala-specific) Returns a new Dataset by first applying a function
* to all elements of this Dataset, and then flattening the results.
*
* @group typedrel
* @since 1.6.0
*/
def flatMap[U: Encoder](func: T => TraversableOnce[U]): Dataset[U] = transformation(_.flatMap[U](func))
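// A typed-pipeline sketch (the `Person` type is hypothetical; the required implicit
// Encoders, e.g. Spark's product and primitive encoders, are assumed to be in scope):
//
//   val names: Dataset[String]  = people.map(_.name)
//   val words: Dataset[String]  = people.flatMap(_.name.split(" "))
//   val adults: Dataset[Person] = people.filter(_.age >= 18)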
/**
* Specifies some hint on the current Dataset. As an example, the
* following code specifies that one of the plans can be broadcast:
*
* {{{
* df1.join(df2.hint("broadcast"))
* }}}
*
* @group basic
* @since 2.2.0
*/
def hint(name: String, parameters: Any*): Dataset[T] = transformation(_.hint(name, parameters: _*))
/**
* Returns a new Dataset containing rows only in both this Dataset and
* another Dataset. This is equivalent to `INTERSECT` in SQL.
*
* @note
* Equality checking is performed directly on the encoded
* representation of the data and thus is not affected by a custom
* `equals` function defined on `T`.
*
* @group typedrel
* @since 1.6.0
*/
def intersect(other: Dataset[T]): Dataset[T] = transformation(_.intersect(other.underlying))
/**
* Returns a new Dataset containing rows only in both this Dataset and
* another Dataset while preserving the duplicates. This is equivalent
* to `INTERSECT ALL` in SQL.
*
* @note
* Equality checking is performed directly on the encoded
* representation of the data and thus is not affected by a custom
* `equals` function defined on `T`. Also as standard in SQL, this
* function resolves columns by position (not by name).
*
* @group typedrel
* @since 2.4.0
*/
def intersectAll(other: Dataset[T]): Dataset[T] = transformation(_.intersectAll(other.underlying))
/**
* Join with another `DataFrame`.
*
* Behaves as an INNER JOIN and requires a subsequent join predicate.
*
* @param right
* Right side of the join operation.
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_]): DataFrame = transformation(_.join(right.underlying))
/**
* Returns a new Dataset by taking the first `n` rows. The difference
* between this function and `head` is that `head` is an action and
* returns an array (by triggering query execution) while `limit`
* returns a new Dataset.
*
* @group typedrel
* @since 2.0.0
*/
def limit(n: Int): Dataset[T] = transformation(_.limit(n))
/**
* (Scala-specific) Returns a new Dataset that contains the result of
* applying `func` to each element.
*
* @group typedrel
* @since 1.6.0
*/
def map[U: Encoder](func: T => U): Dataset[U] = transformation(_.map[U](func))
/**
* (Scala-specific) Returns a new Dataset that contains the result of
* applying `func` to each partition.
*
* @group typedrel
* @since 1.6.0
*/
def mapPartitions[U: Encoder](func: Iterator[T] => Iterator[U]): Dataset[U] = transformation(_.mapPartitions[U](func))
/**
* Returns a new Dataset that has exactly `numPartitions` partitions.
*
* @group typedrel
* @since 1.6.0
*/
def repartition(numPartitions: Int): Dataset[T] = transformation(_.repartition(numPartitions))
/**
* Returns a new [[Dataset]] by sampling a fraction of rows (without
* replacement), using a user-supplied seed.
*
* @param fraction
* Fraction of rows to generate, range [0.0, 1.0].
* @param seed
* Seed for sampling.
*
* @note
* This is NOT guaranteed to provide exactly the fraction of the
* count of the given [[Dataset]].
*
* @group typedrel
* @since 2.3.0
*/
def sample(fraction: Double, seed: Long): Dataset[T] = transformation(_.sample(fraction, seed))
/**
* Returns a new [[Dataset]] by sampling a fraction of rows (without
* replacement), using a random seed.
*
* @param fraction
* Fraction of rows to generate, range [0.0, 1.0].
*
* @note
* This is NOT guaranteed to provide exactly the fraction of the
* count of the given [[Dataset]].
*
* @group typedrel
* @since 2.3.0
*/
def sample(fraction: Double): Dataset[T] = transformation(_.sample(fraction))
/**
* Returns a new [[Dataset]] by sampling a fraction of rows, using a
* user-supplied seed.
*
* @param withReplacement
* Sample with replacement or not.
* @param fraction
* Fraction of rows to generate, range [0.0, 1.0].
* @param seed
* Seed for sampling.
*
* @note
* This is NOT guaranteed to provide exactly the fraction of the
* count of the given [[Dataset]].
*
* @group typedrel
* @since 1.6.0
*/
def sample(withReplacement: Boolean, fraction: Double, seed: Long): Dataset[T] =
transformation(_.sample(withReplacement, fraction, seed))
/**
* Returns a new [[Dataset]] by sampling a fraction of rows, using a
* random seed.
*
* @param withReplacement
* Sample with replacement or not.
* @param fraction
* Fraction of rows to generate, range [0.0, 1.0].
*
* @note
* This is NOT guaranteed to provide exactly the fraction of the
* total count of the given [[Dataset]].
*
* @group typedrel
* @since 1.6.0
*/
def sample(withReplacement: Boolean, fraction: Double): Dataset[T] =
transformation(_.sample(withReplacement, fraction))
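// A sketch: keep roughly 10% of the rows, reproducibly and without replacement:
//
//   val tenPercent: Dataset[T] = ds.sample(fraction = 0.1, seed = 42L)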
/**
* Returns a new Dataset by computing the given [[Column]] expression
* for each element.
*
* {{{
* val ds = Seq(1, 2, 3).toDS()
* val newDS = ds.select(expr("value + 1").as[Int])
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def select[U1](c1: TypedColumn[T, U1]): Dataset[U1] = transformation(_.select[U1](c1))
/**
* Returns a new Dataset by computing the given [[Column]] expressions
* for each element.
*
* @group typedrel
* @since 1.6.0
*/
def select[U1, U2](c1: TypedColumn[T, U1], c2: TypedColumn[T, U2]): Dataset[(U1, U2)] =
transformation(_.select[U1, U2](c1, c2))
/**
* Returns a new Dataset by computing the given [[Column]] expressions
* for each element.
*
* @group typedrel
* @since 1.6.0
*/
def select[U1, U2, U3](
c1: TypedColumn[T, U1],
c2: TypedColumn[T, U2],
c3: TypedColumn[T, U3]
): Dataset[(U1, U2, U3)] = transformation(_.select[U1, U2, U3](c1, c2, c3))
/**
* Returns a new Dataset by computing the given [[Column]] expressions
* for each element.
*
* @group typedrel
* @since 1.6.0
*/
def select[U1, U2, U3, U4](
c1: TypedColumn[T, U1],
c2: TypedColumn[T, U2],
c3: TypedColumn[T, U3],
c4: TypedColumn[T, U4]
): Dataset[(U1, U2, U3, U4)] = transformation(_.select[U1, U2, U3, U4](c1, c2, c3, c4))
/**
* Returns a new Dataset by computing the given [[Column]] expressions
* for each element.
*
* @group typedrel
* @since 1.6.0
*/
def select[U1, U2, U3, U4, U5](
c1: TypedColumn[T, U1],
c2: TypedColumn[T, U2],
c3: TypedColumn[T, U3],
c4: TypedColumn[T, U4],
c5: TypedColumn[T, U5]
): Dataset[(U1, U2, U3, U4, U5)] = transformation(_.select[U1, U2, U3, U4, U5](c1, c2, c3, c4, c5))
/**
* Computes specified statistics for numeric and string columns.
* Available statistics are:
*   - count
*   - mean
*   - stddev
*   - min
*   - max
*   - arbitrary approximate percentiles specified as a percentage (e.g. 75%)
*   - count_distinct
*   - approx_count_distinct
*
* If no statistics are given, this function computes count, mean,
* stddev, min, approximate quartiles (percentiles at 25%, 50%, and
* 75%), and max.
*
* This function is meant for exploratory data analysis, as we make no
* guarantee about the backward compatibility of the schema of the
* resulting Dataset. If you want to programmatically compute summary
* statistics, use the `agg` function instead.
*
* {{{
* ds.summary().show()
*
* // output:
* // summary age height
* // count 10.0 10.0
* // mean 53.3 178.05
* // stddev 11.6 15.7
* // min 18.0 163.0
* // 25% 24.0 176.0
* // 50% 24.0 176.0
* // 75% 32.0 180.0
* // max 92.0 192.0
* }}}
*
* {{{
* ds.summary("count", "min", "25%", "75%", "max").show()
*
* // output:
* // summary age height
* // count 10.0 10.0
* // min 18.0 163.0
* // 25% 24.0 176.0
* // 75% 32.0 180.0
* // max 92.0 192.0
* }}}
*
* To do a summary for specific columns, first select them:
*
* {{{
* ds.select("age", "height").summary().show()
* }}}
*
* Specify statistics to output custom summaries:
*
* {{{
* ds.summary("count", "count_distinct").show()
* }}}
*
* The distinct count isn't included by default.
*
* You can also run approximate distinct counts which are faster:
*
* {{{
* ds.summary("count", "approx_count_distinct").show()
* }}}
*
* See also [[describe]] for basic statistics.
*
* @param statistics
* Statistics from above list to be computed.
*
* @group action
* @since 2.3.0
*/
def summary(statistics: String*): DataFrame = transformation(_.summary(statistics: _*))
/**
* Converts this strongly typed collection of data to a generic
* DataFrame. In contrast to the strongly typed objects that Dataset
* operations work on, a DataFrame returns generic [[Row]] objects
* that allow fields to be accessed by ordinal or name.
*
* @group basic
* @since 1.6.0
*/
// This is declared with parentheses to prevent the Scala compiler from treating
// `ds.toDF("1")` as invoking this toDF and then apply on the returned DataFrame.
def toDF: DataFrame = transformation(_.toDF())
/**
* Returns the content of the Dataset as a Dataset of JSON strings.
* @since 2.0.0
*/
def toJSON: Dataset[String] = transformation(_.toJSON)
/**
* Returns a new Dataset containing union of rows in this Dataset and
* another Dataset.
*
* This is equivalent to `UNION ALL` in SQL. To do a SQL-style set
* union (that does deduplication of elements), use this function
* followed by a [[distinct]].
*
* Also as standard in SQL, this function resolves columns by position
* (not by name):
*
* {{{
* val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2")
* val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0")
* df1.union(df2).show
*
* // output:
* // +----+----+----+
* // |col0|col1|col2|
* // +----+----+----+
* // | 1| 2| 3|
* // | 4| 5| 6|
* // +----+----+----+
* }}}
*
* Notice that the column positions in the schema aren't necessarily
* matched with the fields in the strongly typed objects in a Dataset.
* This function resolves columns by their positions in the schema,
* not the fields in the strongly typed objects. Use [[unionByName]]
* to resolve columns by field name in the typed objects.
*
* @group typedrel
* @since 2.0.0
*/
def union(other: Dataset[T]): Dataset[T] = transformation(_.union(other.underlying))
/**
* Returns a new Dataset containing union of rows in this Dataset and
* another Dataset. This is an alias for `union`.
*
* This is equivalent to `UNION ALL` in SQL. To do a SQL-style set
* union (that does deduplication of elements), use this function
* followed by a [[distinct]].
*
* Also as standard in SQL, this function resolves columns by position
* (not by name).
*
* @group typedrel
* @since 2.0.0
*/
def unionAll(other: Dataset[T]): Dataset[T] = transformation(_.unionAll(other.underlying))
/**
* Returns a new Dataset containing union of rows in this Dataset and
* another Dataset.
*
* This is different from both `UNION ALL` and `UNION DISTINCT` in
* SQL. To do a SQL-style set union (that does deduplication of
* elements), use this function followed by a [[distinct]].
*
* The difference between this function and [[union]] is that this
* function resolves columns by name (not by position):
*
* {{{
* val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2")
* val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0")
* df1.unionByName(df2).show
*
* // output:
* // +----+----+----+
* // |col0|col1|col2|
* // +----+----+----+
* // | 1| 2| 3|
* // | 6| 4| 5|
* // +----+----+----+
* }}}
*
* @group typedrel
* @since 2.3.0
*/
def unionByName(other: Dataset[T]): Dataset[T] = transformation(_.unionByName(other.underlying))
/**
* Returns a new Dataset with a column renamed. This is a no-op if
* schema doesn't contain existingName.
*
* @group untypedrel
* @since 2.0.0
*/
def withColumnRenamed(existingName: String, newName: String): DataFrame =
transformation(_.withColumnRenamed(existingName, newName))
/**
* Defines an event time watermark for this [[Dataset]]. A watermark
* tracks a point in time before which we assume no more late data is
* going to arrive.
*
* Spark will use this watermark for several purposes:
*   - To know when a given time window aggregation can be finalized
*     and thus can be emitted when using output modes that do not
*     allow updates.
*   - To minimize the amount of state that we need to keep for
*     on-going aggregations, `mapGroupsWithState` and `dropDuplicates`
*     operators.
*
* The current watermark is computed by looking at the `MAX(eventTime)`
* seen across all of the partitions in the query minus a user
* specified `delayThreshold`. Due to the cost of coordinating this
* value across partitions, the actual watermark used is only
* guaranteed to be at least `delayThreshold` behind the actual event
* time. In some cases we may still process records that arrive more
* than `delayThreshold` late.
*
* @param eventTime
* the name of the column that contains the event time of the row.
* @param delayThreshold
* the minimum delay to wait for data to arrive late, relative to the
* latest record that has been processed, expressed as an interval
* (e.g. "1 minute" or "5 hours"). NOTE: This should not be
* negative.
*
* @group streaming
* @since 2.1.0
*/
// We only accept an existing column name, not a derived column here as a watermark that is
// defined on a derived column cannot referenced elsewhere in the plan.
def withWatermark(eventTime: String, delayThreshold: String): Dataset[T] =
transformation(_.withWatermark(eventTime, delayThreshold))
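// A streaming sketch (the column names are illustrative; `dropDuplicates` returns a
// TryAnalysis because the referenced columns may not exist):
//
//   val deduplicated: TryAnalysis[Dataset[T]] =
//     ds.withWatermark("eventTime", "10 minutes")
//       .dropDuplicates(Seq("userId", "eventTime"))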
// ===============
/**
* Aggregates on the entire Dataset without groups.
* {{{
* // ds.agg(...) is a shorthand for ds.groupBy().agg(...)
* ds.agg("age" -> "max", "salary" -> "avg")
* ds.groupBy().agg("age" -> "max", "salary" -> "avg")
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def agg(aggExpr: (String, String), aggExprs: (String, String)*): TryAnalysis[DataFrame] =
transformationWithAnalysis(_.agg(aggExpr, aggExprs: _*))
/**
* Aggregates on the entire Dataset without groups.
* {{{
* // ds.agg(...) is a shorthand for ds.groupBy().agg(...)
* ds.agg(Map("age" -> "max", "salary" -> "avg"))
* ds.groupBy().agg(Map("age" -> "max", "salary" -> "avg"))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def agg(exprs: Map[String, String]): TryAnalysis[DataFrame] = transformationWithAnalysis(_.agg(exprs))
/**
* Aggregates on the entire Dataset without groups.
* {{{
* // ds.agg(...) is a shorthand for ds.groupBy().agg(...)
* ds.agg(max($"age"), avg($"salary"))
* ds.groupBy().agg(max($"age"), avg($"salary"))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def agg(expr: Column, exprs: Column*): TryAnalysis[DataFrame] = transformationWithAnalysis(_.agg(expr, exprs: _*))
/**
* Returns a new Dataset where each record has been mapped on to the
* specified type. The method used to map columns depends on the type
* of `U`:
*   - When `U` is a class, fields for the class will be mapped to
*     columns of the same name (case sensitivity is determined by
*     `spark.sql.caseSensitive`).
*   - When `U` is a tuple, the columns will be mapped by ordinal
*     (i.e. the first column will be assigned to `_1`).
*   - When `U` is a primitive type (i.e. String, Int, etc), then the
*     first column of the `DataFrame` will be used.
*
* If the schema of the Dataset does not match the desired `U` type,
* you can use `select` along with `alias` or `as` to rearrange or
* rename as required.
*
* Note that `as[]` only changes the view of the data that is passed
* into typed operations, such as `map()`, and does not eagerly
* project away any columns that are not present in the specified
* class.
*
* @group basic
* @since 1.6.0
*/
def as[U: Encoder]: TryAnalysis[Dataset[U]] = transformationWithAnalysis(_.as[U])
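// A sketch of moving from an untyped DataFrame to a typed Dataset (the `Person`
// schema is hypothetical; an implicit Encoder[Person] must be in scope):
//
//   final case class Person(name: String, age: Long)
//   val typed: TryAnalysis[Dataset[Person]] = df.as[Person]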
/**
* Computes basic statistics for numeric and string columns, including
* count, mean, stddev, min, and max. If no columns are given, this
* function computes statistics for all numerical or string columns.
*
* This function is meant for exploratory data analysis, as we make no
* guarantee about the backward compatibility of the schema of the
* resulting Dataset. If you want to programmatically compute summary
* statistics, use the `agg` function instead.
*
* {{{
* ds.describe("age", "height").show()
*
* // output:
* // summary age height
* // count 10.0 10.0
* // mean 53.3 178.05
* // stddev 11.6 15.7
* // min 18.0 163.0
* // max 92.0 192.0
* }}}
*
* Use [[summary]] for expanded statistics and control over which
* statistics to compute.
*
* @param cols
* Columns to compute statistics on.
*
* @group action
* @since 1.6.0
*/
def describe(cols: String*): TryAnalysis[DataFrame] = transformationWithAnalysis(_.describe(cols: _*))
/**
* Returns a new Dataset with duplicate rows removed, considering only
* the subset of columns.
*
* For a static batch [[Dataset]], it just drops duplicate rows. For a
* streaming [[Dataset]], it will keep all data across triggers as
* intermediate state to drop duplicate rows. You can use
* [[withWatermark]] to limit how late the duplicate data can be, and
* the system will accordingly limit the state. In addition, data
* older than the watermark will be dropped to avoid any possibility
* of duplicates.
*
* @group typedrel
* @since 2.0.0
*/
def dropDuplicates(colNames: Seq[String]): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.dropDuplicates(colNames))
/**
* Returns a new [[Dataset]] with duplicate rows removed, considering
* only the subset of columns.
*
* For a static batch [[Dataset]], it just drops duplicate rows. For a
* streaming [[Dataset]], it will keep all data across triggers as
* intermediate state to drop duplicate rows. You can use
* [[withWatermark]] to limit how late the duplicate data can be, and
* the system will accordingly limit the state. In addition, data
* older than the watermark will be dropped to avoid any possibility
* of duplicates.
*
* @group typedrel
* @since 2.0.0
*/
def dropDuplicates(col1: String, cols: String*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.dropDuplicates(col1, cols: _*))
/**
* Returns a new Dataset where a single column has been expanded to
* zero or more rows by the provided function. This is similar to a
* `LATERAL VIEW` in HiveQL. All columns of the input row are
* implicitly joined with each value that is output by the function.
*
* Given that this is deprecated, as an alternative, you can explode
* columns either using `functions.explode()`:
*
* {{{
* ds.select(explode(split($"words", " ")).as("word"))
* }}}
*
* or `flatMap()`:
*
* {{{
* ds.flatMap(_.words.split(" "))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@deprecated("use flatMap() or select() with functions.explode() instead", "2.0.0")
def explode[A, B: TypeTag](inputColumn: String, outputColumn: String)(
f: A => TraversableOnce[B]
): TryAnalysis[DataFrame] = transformationWithAnalysis(_.explode[A, B](inputColumn, outputColumn)(f))
/**
* Filters rows using the given condition.
* {{{
* // The following are equivalent:
* peopleDs.filter($"age" > 15)
* peopleDs.where($"age" > 15)
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def filter(condition: Column): TryAnalysis[Dataset[T]] = transformationWithAnalysis(_.filter(condition))
/**
* Filters rows using the given SQL expression.
* {{{
* peopleDs.filter("age > 15")
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def filter(conditionExpr: String): TryAnalysis[Dataset[T]] = transformationWithAnalysis(_.filter(conditionExpr))
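// Both the Column-based and the SQL-string filters return a TryAnalysis, since the
// referenced columns may not exist; a sketch (assumes `$` from Spark's implicits):
//
//   val adults1: TryAnalysis[Dataset[T]] = ds.filter($"age" > 15)
//   val adults2: TryAnalysis[Dataset[T]] = ds.filter("age > 15")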
/**
* Inner equi-join with another `DataFrame` using the given column.
*
* Different from other join functions, the join column will only
* appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
*
* {{{
* // Joining df1 and df2 using the column "user_id"
* df1.join(df2, "user_id")
* }}}
*
* @param right
* Right side of the join operation.
* @param usingColumn
* Name of the column to join on. This column must exist on both
* sides.
*
* @note
* If you perform a self-join using this function without aliasing
* the input `DataFrame`s, you will NOT be able to reference any
* columns after the join, since there is no way to disambiguate
* which side of the join you would like to reference.
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_], usingColumn: String): TryAnalysis[DataFrame] =
transformationWithAnalysis(_.join(right.underlying, usingColumn))
/**
* Inner equi-join with another `DataFrame` using the given columns.
*
* Different from other join functions, the join columns will only
* appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
*
* {{{
* // Joining df1 and df2 using the columns "user_id" and "user_name"
* df1.join(df2, Seq("user_id", "user_name"))
* }}}
*
* @param right
* Right side of the join operation.
* @param usingColumns
* Names of the columns to join on. These columns must exist on both
* sides.
*
* @note
* If you perform a self-join using this function without aliasing
* the input `DataFrame`s, you will NOT be able to reference any
* columns after the join, since there is no way to disambiguate
* which side of the join you would like to reference.
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_], usingColumns: Seq[String]): TryAnalysis[DataFrame] =
transformationWithAnalysis(_.join(right.underlying, usingColumns))
/**
* Equi-join with another `DataFrame` using the given columns. A cross
* join with a predicate is specified as an inner join. If you would
* explicitly like to perform a cross join use the `crossJoin` method.
*
* Different from other join functions, the join columns will only
* appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
*
* @param right
* Right side of the join operation.
* @param usingColumns
* Names of the columns to join on. These columns must exist on both
* sides.
* @param joinType
* Type of join to perform. Default `inner`. Must be one of:
* `inner`, `cross`, `outer`, `full`, `fullouter`, `full_outer`,
* `left`, `leftouter`, `left_outer`, `right`, `rightouter`,
* `right_outer`, `semi`, `leftsemi`, `left_semi`, `anti`,
* `leftanti`, `left_anti`.
*
* @note
* If you perform a self-join using this function without aliasing
* the input `DataFrame`s, you will NOT be able to reference any
* columns after the join, since there is no way to disambiguate
* which side of the join you would like to reference.
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_], usingColumns: Seq[String], joinType: String): TryAnalysis[DataFrame] =
transformationWithAnalysis(_.join(right.underlying, usingColumns, joinType))
/**
* Inner join with another `DataFrame`, using the given join
* expression.
*
* {{{
* // The following two are equivalent:
* df1.join(df2, $"df1Key" === $"df2Key")
* df1.join(df2).where($"df1Key" === $"df2Key")
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_], joinExprs: Column): TryAnalysis[DataFrame] =
transformationWithAnalysis(_.join(right.underlying, joinExprs))
/**
* Join with another `DataFrame`, using the given join expression. The
* following performs a full outer join between `df1` and `df2`.
*
* {{{
* // Scala:
* import org.apache.spark.sql.functions._
* df1.join(df2, $"df1Key" === $"df2Key", "outer")
*
* // Java:
* import static org.apache.spark.sql.functions.*;
* df1.join(df2, col("df1Key").equalTo(col("df2Key")), "outer");
* }}}
*
* @param right
* Right side of the join.
* @param joinExprs
* Join expression.
* @param joinType
* Type of join to perform. Default `inner`. Must be one of:
* `inner`, `cross`, `outer`, `full`, `fullouter`, `full_outer`,
* `left`, `leftouter`, `left_outer`, `right`, `rightouter`,
* `right_outer`, `semi`, `leftsemi`, `left_semi`, `anti`,
* `leftanti`, `left_anti`.
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_], joinExprs: Column, joinType: String): TryAnalysis[DataFrame] =
transformationWithAnalysis(_.join(right.underlying, joinExprs, joinType))
/**
* Joins this Dataset returning a `Tuple2` for each pair where
* `condition` evaluates to true.
*
* This is similar to the relation `join` function with one important
* difference in the result schema. Since `joinWith` preserves objects
* present on either side of the join, the result schema is similarly
* nested into a tuple under the column names `_1` and `_2`.
*
* This type of join can be useful both for preserving type-safety
* with the original object types as well as working with relational
* data where either side of the join has column names in common.
*
* @param other
* Right side of the join.
* @param condition
* Join expression.
* @param joinType
* Type of join to perform. Default `inner`. Must be one of:
* `inner`, `cross`, `outer`, `full`, `fullouter`,`full_outer`,
* `left`, `leftouter`, `left_outer`, `right`, `rightouter`,
* `right_outer`.
*
* @group typedrel
* @since 1.6.0
*/
def joinWith[U](other: Dataset[U], condition: Column, joinType: String): TryAnalysis[Dataset[(T, U)]] =
transformationWithAnalysis(_.joinWith[U](other.underlying, condition, joinType))
/**
* Joins this Dataset with another Dataset using an inner equi-join,
* returning a `Tuple2` for each pair where `condition` evaluates to
* true.
*
* @param other
* Right side of the join.
* @param condition
* Join expression.
*
* @group typedrel
* @since 1.6.0
*/
def joinWith[U](other: Dataset[U], condition: Column): TryAnalysis[Dataset[(T, U)]] =
transformationWithAnalysis(_.joinWith[U](other.underlying, condition))
/**
* Define (named) metrics to observe on the Dataset. This method
* returns an 'observed' Dataset that returns the same result as the
* input, with the following guarantees:
*   - It will compute the defined aggregates (metrics) on all the
*     data that is flowing through the Dataset at that point.
*   - It will report the value of the defined aggregate columns as
*     soon as we reach a completion point. A completion point is
*     either the end of a query (batch mode) or the end of a
*     streaming epoch. The value of the aggregates only reflects the
*     data processed since the previous completion point.
*
* Please note that continuous execution is currently not supported.
*
* The metrics columns must either contain a literal (e.g. lit(42)),
* or should contain one or more aggregate functions (e.g. sum(a) or
* sum(a + b) + avg(c) - lit(1)). Expressions that contain references
* to the input Dataset's columns must always be wrapped in an
* aggregate function.
*
* A user can observe these metrics by either adding
* [[org.apache.spark.sql.streaming.StreamingQueryListener]] or a
* [[org.apache.spark.sql.util.QueryExecutionListener]] to the spark
* session.
*
* {{{
* // Monitor the metrics using a listener.
* spark.streams.addListener(new StreamingQueryListener() {
* override def onQueryProgress(event: QueryProgressEvent): Unit = {
* event.progress.observedMetrics.asScala.get("my_event").foreach { row =>
* // Trigger if the number of errors exceeds 5 percent
* val num_rows = row.getAs[Long]("rc")
* val num_error_rows = row.getAs[Long]("erc")
* val ratio = num_error_rows.toDouble / num_rows
* if (ratio > 0.05) {
* // Trigger alert
* }
* }
* }
* def onQueryStarted(event: QueryStartedEvent): Unit = {}
* def onQueryTerminated(event: QueryTerminatedEvent): Unit = {}
* })
* // Observe row count (rc) and error row count (erc) in the streaming Dataset
* val observed_ds = ds.observe("my_event", count(lit(1)).as("rc"), count($"error").as("erc"))
* observed_ds.writeStream.format("...").start()
* }}}
*
* @group typedrel
* @since 3.0.0
*/
def observe(name: String, expr: Column, exprs: Column*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.observe(name, expr, exprs: _*))
/**
* Returns a new Dataset sorted by the given expressions. This is an
* alias of the `sort` function.
*
* @group typedrel
* @since 2.0.0
*/
def orderBy(sortCol: String, sortCols: String*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.orderBy(sortCol, sortCols: _*))
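// Illustrative sketch: assuming a hypothetical `people: Dataset[Person]` with `name` and `age`
// columns, sorting by column name resolves the columns at analysis time, hence `TryAnalysis`:
//
// {{{
//   val byName: TryAnalysis[Dataset[Person]] = people.orderBy("name")
//   val byAge: TryAnalysis[Dataset[Person]]  = people.orderBy("age", "name")
// }}}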
/**
* Returns a new Dataset sorted by the given expressions. This is an
* alias of the `sort` function.
*
* @group typedrel
* @since 2.0.0
*/
def orderBy(sortExprs: Column*): TryAnalysis[Dataset[T]] = transformationWithAnalysis(_.orderBy(sortExprs: _*))
/**
* Returns a new Dataset partitioned by the given partitioning
* expressions into `numPartitions`. The resulting Dataset is hash
* partitioned.
*
* This is the same operation as "DISTRIBUTE BY" in SQL (Hive QL).
*
* @group typedrel
* @since 2.0.0
*/
def repartition(numPartitions: Int, partitionExprs: Column*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.repartition(numPartitions, partitionExprs: _*))
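// Illustrative sketch: hash-partitioning a hypothetical `events: Dataset[Event]` into 8
// partitions keyed by a `country` column (both the dataset and the column are assumptions):
//
// {{{
//   import org.apache.spark.sql.functions.col
//
//   val distributed: TryAnalysis[Dataset[Event]] = events.repartition(8, col("country"))
// }}}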
/**
* Returns a new Dataset partitioned by the given partitioning
* expressions, using `spark.sql.shuffle.partitions` as number of
* partitions. The resulting Dataset is hash partitioned.
*
* This is the same operation as "DISTRIBUTE BY" in SQL (Hive QL).
*
* @group typedrel
* @since 2.0.0
*/
def repartition(partitionExprs: Column*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.repartition(partitionExprs: _*))
/**
* Returns a new Dataset partitioned by the given partitioning
* expressions into `numPartitions`. The resulting Dataset is range
* partitioned.
*
* At least one partition-by expression must be specified. When no
* explicit sort order is specified, "ascending nulls first" is
* assumed. Note, the rows are not sorted in each partition of the
* resulting Dataset.
*
* Note that due to performance reasons this method uses sampling to
* estimate the ranges. Hence, the output may not be consistent, since
* sampling can return different values. The sample size can be
* controlled by the config
* `spark.sql.execution.rangeExchange.sampleSizePerPartition`.
*
* @group typedrel
* @since 2.3.0
*/
def repartitionByRange(numPartitions: Int, partitionExprs: Column*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.repartitionByRange(numPartitions, partitionExprs: _*))
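// Illustrative sketch: range-partitioning a hypothetical `events: Dataset[Event]` by an
// assumed `eventTime` column. Since ranges are estimated by sampling, partition boundaries
// may differ between runs:
//
// {{{
//   import org.apache.spark.sql.functions.col
//
//   val ranged: TryAnalysis[Dataset[Event]] = events.repartitionByRange(8, col("eventTime"))
// }}}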
/**
* Returns a new Dataset partitioned by the given partitioning
* expressions, using `spark.sql.shuffle.partitions` as number of
* partitions. The resulting Dataset is range partitioned.
*
* At least one partition-by expression must be specified. When no
* explicit sort order is specified, "ascending nulls first" is
* assumed. Note, the rows are not sorted in each partition of the
* resulting Dataset.
*
* Note that due to performance reasons this method uses sampling to
* estimate the ranges. Hence, the output may not be consistent, since
* sampling can return different values. The sample size can be
* controlled by the config
* `spark.sql.execution.rangeExchange.sampleSizePerPartition`.
*
* @group typedrel
* @since 2.3.0
*/
def repartitionByRange(partitionExprs: Column*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.repartitionByRange(partitionExprs: _*))
/**
* Selects a set of column based expressions.
* {{{
* ds.select($"colA", $"colB" + 1)
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def select(cols: Column*): TryAnalysis[DataFrame] = transformationWithAnalysis(_.select(cols: _*))
/**
* Selects a set of columns. This is a variant of `select` that can
* only select existing columns using column names (i.e. cannot
* construct expressions).
*
* {{{
* // The following two are equivalent:
* ds.select("colA", "colB")
* ds.select($"colA", $"colB")
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def select(col: String, cols: String*): TryAnalysis[DataFrame] = transformationWithAnalysis(_.select(col, cols: _*))
/**
* Selects a set of SQL expressions. This is a variant of `select`
* that accepts SQL expressions.
*
* {{{
* // The following are equivalent:
* ds.selectExpr("colA", "colB as newName", "abs(colC)")
* ds.select(expr("colA"), expr("colB as newName"), expr("abs(colC)"))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
def selectExpr(exprs: String*): TryAnalysis[DataFrame] = transformationWithAnalysis(_.selectExpr(exprs: _*))
/**
* Returns a new Dataset sorted by the specified column, all in
* ascending order.
* {{{
* // The following 3 are equivalent
* ds.sort("sortcol")
* ds.sort($"sortcol")
* ds.sort($"sortcol".asc)
* }}}
*
* @group typedrel
* @since 2.0.0
*/
def sort(sortCol: String, sortCols: String*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.sort(sortCol, sortCols: _*))
/**
* Returns a new Dataset sorted by the given expressions. For example:
* {{{
* ds.sort($"col1", $"col2".desc)
* }}}
*
* @group typedrel
* @since 2.0.0
*/
def sort(sortExprs: Column*): TryAnalysis[Dataset[T]] = transformationWithAnalysis(_.sort(sortExprs: _*))
/**
* Returns a new Dataset with each partition sorted by the given
* expressions.
*
* This is the same operation as "SORT BY" in SQL (Hive QL).
*
* @group typedrel
* @since 2.0.0
*/
def sortWithinPartitions(sortCol: String, sortCols: String*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.sortWithinPartitions(sortCol, sortCols: _*))
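// Illustrative sketch: sorting rows inside each partition of a hypothetical
// `events: Dataset[Event]` (columns assumed), without a global sort across partitions:
//
// {{{
//   val locallySorted: TryAnalysis[Dataset[Event]] =
//     events.sortWithinPartitions("country", "eventTime")
// }}}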
/**
* Returns a new Dataset with each partition sorted by the given
* expressions.
*
* This is the same operation as "SORT BY" in SQL (Hive QL).
*
* @group typedrel
* @since 2.0.0
*/
def sortWithinPartitions(sortExprs: Column*): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.sortWithinPartitions(sortExprs: _*))
/**
* Converts this strongly typed collection of data to a generic
* `DataFrame` with columns renamed. This can be quite convenient when
* converting an RDD of tuples into a `DataFrame` with meaningful
* names. For example:
* {{{
* val rdd: RDD[(Int, String)] = ...
* rdd.toDF() // this implicit conversion creates a DataFrame with column name `_1` and `_2`
* rdd.toDF("id", "name") // this creates a DataFrame with column name "id" and "name"
* }}}
*
* @group basic
* @since 2.0.0
*/
def toDF(colNames: String*): TryAnalysis[DataFrame] = transformationWithAnalysis(_.toDF(colNames: _*))
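// Illustrative sketch: renaming the columns of a hypothetical `pairs: Dataset[(Int, String)]`,
// whose columns would otherwise be named `_1` and `_2`:
//
// {{{
//   val named: TryAnalysis[DataFrame] = pairs.toDF("id", "name")
// }}}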
/**
* Returns a new Dataset containing union of rows in this Dataset and
* another Dataset.
*
* The difference between this function and [[union]] is that this
* function resolves columns by name (not by position).
*
* When the parameter `allowMissingColumns` is `true`, the set of
* column names in this and other `Dataset` can differ; missing
* columns will be filled with null. Further, the missing columns of
* this `Dataset` will be added at the end in the schema of the union
* result:
*
* {{{
* val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2")
* val df2 = Seq((4, 5, 6)).toDF("col1", "col0", "col3")
* df1.unionByName(df2, true).show
*
* // output: "col3" is missing at left df1 and added at the end of schema.
* // +----+----+----+----+
* // |col0|col1|col2|col3|
* // +----+----+----+----+
* // | 1| 2| 3|null|
* // | 5| 4|null| 6|
* // +----+----+----+----+
*
* df2.unionByName(df1, true).show
*
* // output: "col2" is missing at left df2 and added at the end of schema.
* // +----+----+----+----+
* // |col1|col0|col3|col2|
* // +----+----+----+----+
* // | 4| 5| 6|null|
* // | 2| 1|null| 3|
* // +----+----+----+----+
* }}}
*
* Note that `allowMissingColumns` supports nested columns in struct
* types. Missing nested columns of struct columns with the same name
* will also be filled with null values and added to the end of the
* struct. This currently does not support nested columns in array and
* map types.
*
* @group typedrel
* @since 3.1.0
*/
def unionByName(other: Dataset[T], allowMissingColumns: Boolean): TryAnalysis[Dataset[T]] =
transformationWithAnalysis(_.unionByName(other.underlying, allowMissingColumns))
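// Illustrative sketch: with hypothetical `df1: DataFrame` and `df2: DataFrame` shaped like the
// frames in the example above, a by-name union that tolerates missing columns would be:
//
// {{{
//   val unioned: TryAnalysis[DataFrame] = df1.unionByName(df2, allowMissingColumns = true)
// }}}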
/**
* Filters rows using the given condition. This is an alias for
* `filter`.
* {{{
* // The following are equivalent:
* peopleDs.filter($"age" > 15)
* peopleDs.where($"age" > 15)
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def where(condition: Column): TryAnalysis[Dataset[T]] = transformationWithAnalysis(_.where(condition))
/**
* Filters rows using the given SQL expression.
* {{{
* peopleDs.where("age > 15")
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def where(conditionExpr: String): TryAnalysis[Dataset[T]] = transformationWithAnalysis(_.where(conditionExpr))
/**
* Returns a new Dataset by adding a column or replacing the existing
* column that has the same name.
*
* `column`'s expression must only refer to attributes supplied by
* this Dataset. It is an error to add a column that refers to some
* other Dataset.
*
* @note
* this method introduces a projection internally. Therefore,
* calling it multiple times (for instance, in a loop to add
* multiple columns) can generate big plans, which can cause
* performance issues and even a `StackOverflowError`. To avoid
* this, use `select` with all the desired columns at once.
*
* @group untypedrel
* @since 2.0.0
*/
def withColumn(colName: String, col: Column): TryAnalysis[DataFrame] =
transformationWithAnalysis(_.withColumn(colName, col))
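// Illustrative sketch: because each `withColumn` call adds a projection, adding many columns
// one by one can blow up the logical plan. With a hypothetical `df: DataFrame`, a single
// `select` keeps the plan small:
//
// {{{
//   import org.apache.spark.sql.functions.{col, lit}
//
//   // Adds (or replaces) a single column:
//   val withSource: TryAnalysis[DataFrame] = df.withColumn("source", lit("api"))
//
//   // When several columns are needed, one select avoids stacking projections:
//   val allAtOnce: TryAnalysis[DataFrame] =
//     df.select(col("*"), lit("api").as("source"), lit(2).as("version"))
// }}}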
// ===============
// Methods that need to be implemented
//
// [[org.apache.spark.sql.Dataset.writeTo]]
// ===============
// Methods with handmade implementations
//
// [[org.apache.spark.sql.Dataset.explain]]
// [[org.apache.spark.sql.Dataset.groupBy]]
// [[org.apache.spark.sql.Dataset.printSchema]]
// [[org.apache.spark.sql.Dataset.show]]
// [[org.apache.spark.sql.Dataset.transform]]
// [[org.apache.spark.sql.Dataset.write]]
// [[org.apache.spark.sql.Dataset.writeStream]]
// ===============
// Ignored methods
//
// [[org.apache.spark.sql.Dataset.apply]]
// [[org.apache.spark.sql.Dataset.collectAsList]]
// [[org.apache.spark.sql.Dataset.filter]]
// [[org.apache.spark.sql.Dataset.flatMap]]
// [[org.apache.spark.sql.Dataset.foreach]]
// [[org.apache.spark.sql.Dataset.foreachPartition]]
// [[org.apache.spark.sql.Dataset.groupByKey]]
// [[org.apache.spark.sql.Dataset.javaRDD]]
// [[org.apache.spark.sql.Dataset.map]]
// [[org.apache.spark.sql.Dataset.mapPartitions]]
// [[org.apache.spark.sql.Dataset.reduce]]
// [[org.apache.spark.sql.Dataset.takeAsList]]
// [[org.apache.spark.sql.Dataset.toJavaRDD]]
// [[org.apache.spark.sql.Dataset.toString]]
}