/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.api
import scala.jdk.CollectionConverters._
import _root_.java.{lang => jl, util => ju}
import org.apache.spark.annotation.Stable
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.BinaryEncoder
import org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin
import org.apache.spark.sql.functions.{count_min_sketch, lit}
import org.apache.spark.util.ArrayImplicits.SparkArrayOps
import org.apache.spark.util.sketch.{BloomFilter, CountMinSketch}
/**
* Statistic functions for `DataFrame`s.
*
* @since 1.4.0
*/
@Stable
abstract class DataFrameStatFunctions[DS[U] <: Dataset[U, DS]] {
protected def df: DS[Row]
/**
* Calculates the approximate quantiles of a numerical column of a DataFrame.
*
* The result of this algorithm has the following deterministic bound: If the DataFrame has N
* elements and if we request the quantile at probability `p` up to error `err`, then the
* algorithm will return a sample `x` from the DataFrame so that the *exact* rank of `x` is
* close to (p * N). More precisely,
*
* {{{
* floor((p - err) * N) <= rank(x) <= ceil((p + err) * N)
* }}}
*
* This method implements a variation of the Greenwald-Khanna algorithm (with some speed
* optimizations). The algorithm was first presented in "Space-efficient Online Computation of
* Quantile Summaries" by Greenwald and Khanna.
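*
* For example (a minimal sketch, assuming an existing SparkSession `spark`):
* {{{
*   val df = spark.range(0, 1000).toDF("value")
*   // median and 95th percentile, allowing a rank error of at most 1% of the row count
*   val Array(median, p95) = df.stat.approxQuantile("value", Array(0.5, 0.95), 0.01)
* }}}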
*
* @param col
* the name of the numerical column
* @param probabilities
* a list of quantile probabilities. Each number must belong to [0, 1]. For example, 0 is the
* minimum, 0.5 is the median, and 1 is the maximum.
* @param relativeError
* The relative target precision to achieve (greater than or equal to 0). If set to zero, the
* exact quantiles are computed, which could be very expensive. Note that values greater than
* 1 are accepted but give the same result as 1.
* @return
* the approximate quantiles at the given probabilities
*
* @note
* null and NaN values will be removed from the numerical column before calculation. If the
* DataFrame is empty or the column only contains null or NaN, an empty array is returned.
*
* @since 2.0.0
*/
def approxQuantile(
col: String,
probabilities: Array[Double],
relativeError: Double): Array[Double] = withOrigin {
approxQuantile(Array(col), probabilities, relativeError).head
}
/**
* Calculates the approximate quantiles of numerical columns of a DataFrame.
* @see
* `approxQuantile(col: String, probabilities: Array[Double], relativeError: Double)` for a
* detailed description.
*
* @param cols
* the names of the numerical columns
* @param probabilities
* a list of quantile probabilities. Each number must belong to [0, 1]. For example, 0 is the
* minimum, 0.5 is the median, and 1 is the maximum.
* @param relativeError
* The relative target precision to achieve (greater than or equal to 0). If set to zero, the
* exact quantiles are computed, which could be very expensive. Note that values greater than
* 1 are accepted but give the same result as 1.
* @return
* the approximate quantiles at the given probabilities of each column
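*
* For example (a minimal sketch, assuming `df` has numeric columns "x" and "y"):
* {{{
*   // one array of quantiles per requested column, in the order the columns were given
*   val Array(xQuantiles, yQuantiles) =
*     df.stat.approxQuantile(Array("x", "y"), Array(0.25, 0.5, 0.75), 0.05)
* }}}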
*
* @note
* null and NaN values will be ignored in numerical columns before calculation. For columns
* only containing null or NaN values, an empty array is returned.
*
* @since 2.2.0
*/
def approxQuantile(
cols: Array[String],
probabilities: Array[Double],
relativeError: Double): Array[Array[Double]]
/**
* Calculate the sample covariance of two numerical columns of a DataFrame.
*
* @param col1
* the name of the first column
* @param col2
* the name of the second column
* @return
* the covariance of the two columns.
*
* {{{
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
* df.stat.cov("rand1", "rand2")
* res1: Double = 0.065...
* }}}
* @since 1.4.0
*/
def cov(col1: String, col2: String): Double
/**
* Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
* Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
* MLlib's Statistics.
*
* @param col1
* the name of the column
* @param col2
* the name of the column to calculate the correlation against
* @param method
* the correlation method; currently only "pearson" is supported
* @return
* The Pearson Correlation Coefficient as a Double.
*
* {{{
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
* df.stat.corr("rand1", "rand2")
* res1: Double = 0.613...
* }}}
* @since 1.4.0
*/
def corr(col1: String, col2: String, method: String): Double
/**
* Calculates the Pearson Correlation Coefficient of two columns of a DataFrame.
*
* @param col1
* the name of the column
* @param col2
* the name of the column to calculate the correlation against
* @return
* The Pearson Correlation Coefficient as a Double.
*
* {{{
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
* df.stat.corr("rand1", "rand2", "pearson")
* res1: Double = 0.613...
* }}}
* @since 1.4.0
*/
def corr(col1: String, col2: String): Double = {
corr(col1, col2, "pearson")
}
/**
* Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
* The first column of each row will be the distinct values of `col1` and the column names will
* be the distinct values of `col2`. The name of the first column will be `col1_col2`. Counts
* will be returned as `Long`s. Pairs that have no occurrences will have zero as their counts.
* Null elements will be replaced by "null", and back ticks will be dropped from elements if
* they exist.
*
* @param col1
* The name of the first column. Distinct items will make the first item of each row.
* @param col2
* The name of the second column. Distinct items will make the column names of the DataFrame.
* @return
* A DataFrame containing the contingency table.
*
* {{{
* val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3)))
* .toDF("key", "value")
* val ct = df.stat.crosstab("key", "value")
* ct.show()
* +---------+---+---+---+
* |key_value| 1| 2| 3|
* +---------+---+---+---+
* | 2| 2| 0| 1|
* | 1| 1| 1| 0|
* | 3| 0| 1| 1|
* +---------+---+---+---+
* }}}
*
* @since 1.4.0
*/
def crosstab(col1: String, col2: String): DS[Row]
/**
* Finding frequent items for columns, possibly with false positives, using the frequent element
* count algorithm proposed by Karp, Schenker, and Papadimitriou. The `support` should be greater
* than 1e-4.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
* backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols
* the names of the columns to search frequent items in.
* @param support
* The minimum frequency for an item to be considered `frequent`. Should be greater than 1e-4.
* @return
* A Local DataFrame with the Array of frequent items for each column.
*
* {{{
* val rows = Seq.tabulate(100) { i =>
* if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
* }
* val df = spark.createDataFrame(rows).toDF("a", "b")
* // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
* // "a" and "b"
* val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4)
* freqSingles.show()
* +-----------+-------------+
* |a_freqItems| b_freqItems|
* +-----------+-------------+
* | [1, 99]|[-1.0, -99.0]|
* +-----------+-------------+
* // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
* val pairDf = df.select(struct("a", "b").as("a-b"))
* val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1)
* freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show()
* +----------+
* | freq_ab|
* +----------+
* | [1,-1.0]|
* | ... |
* +----------+
* }}}
* @since 1.4.0
*/
def freqItems(cols: Array[String], support: Double): DS[Row] =
freqItems(cols.toImmutableArraySeq, support)
/**
* Finding frequent items for columns, possibly with false positives, using the frequent element
* count algorithm proposed by Karp, Schenker, and Papadimitriou. Uses a default support of 1%.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
* backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols
* the names of the columns to search frequent items in.
* @return
* A Local DataFrame with the Array of frequent items for each column.
* @since 1.4.0
*/
def freqItems(cols: Array[String]): DS[Row] = freqItems(cols, 0.01)
/**
* (Scala-specific) Finding frequent items for columns, possibly with false positives, using the
* frequent element count algorithm proposed by Karp, Schenker, and Papadimitriou.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
* backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols
* the names of the columns to search frequent items in.
* @return
* A Local DataFrame with the Array of frequent items for each column.
*
* {{{
* val rows = Seq.tabulate(100) { i =>
* if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
* }
* val df = spark.createDataFrame(rows).toDF("a", "b")
* // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
* // "a" and "b"
* val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4)
* freqSingles.show()
* +-----------+-------------+
* |a_freqItems| b_freqItems|
* +-----------+-------------+
* | [1, 99]|[-1.0, -99.0]|
* +-----------+-------------+
* // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
* val pairDf = df.select(struct("a", "b").as("a-b"))
* val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1)
* freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show()
* +----------+
* | freq_ab|
* +----------+
* | [1,-1.0]|
* | ... |
* +----------+
* }}}
*
* @since 1.4.0
*/
def freqItems(cols: Seq[String], support: Double): DS[Row]
/**
* (Scala-specific) Finding frequent items for columns, possibly with false positives, using the
* frequent element count algorithm proposed by Karp, Schenker, and Papadimitriou. Uses a default
* support of 1%.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
* backward compatibility of the schema of the resulting `DataFrame`.
*
* @param cols
* the names of the columns to search frequent items in.
* @return
* A Local DataFrame with the Array of frequent items for each column.
* @since 1.4.0
*/
def freqItems(cols: Seq[String]): DS[Row] = freqItems(cols, 0.01)
/**
* Returns a stratified sample without replacement based on the fraction given on each stratum.
* @param col
* column that defines strata
* @param fractions
* sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as
* zero.
* @param seed
* random seed
* @tparam T
* stratum type
* @return
* a new `DataFrame` that represents the stratified sample
*
* {{{
* val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
* (3, 3))).toDF("key", "value")
* val fractions = Map(1 -> 1.0, 3 -> 0.5)
* df.stat.sampleBy("key", fractions, 36L).show()
* +---+-----+
* |key|value|
* +---+-----+
* | 1| 1|
* | 1| 2|
* | 3| 2|
* +---+-----+
* }}}
*
* @since 1.5.0
*/
def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DS[Row] = {
sampleBy(Column(col), fractions, seed)
}
/**
* Returns a stratified sample without replacement based on the fraction given on each stratum.
* @param col
* column that defines strata
* @param fractions
* sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as
* zero.
* @param seed
* random seed
* @tparam T
* stratum type
* @return
* a new `DataFrame` that represents the stratified sample
*
* @since 1.5.0
*/
def sampleBy[T](col: String, fractions: ju.Map[T, jl.Double], seed: Long): DS[Row] = {
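// Convert the Java map to an immutable Scala map; the cast is safe because java.lang.Double
// and scala.Double share the same boxed runtime representation.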
sampleBy(col, fractions.asScala.toMap.asInstanceOf[Map[T, Double]], seed)
}
/**
* Returns a stratified sample without replacement based on the fraction given on each stratum.
* @param col
* column that defines strata
* @param fractions
* sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as
* zero.
* @param seed
* random seed
* @tparam T
* stratum type
* @return
* a new `DataFrame` that represents the stratified sample
*
* The stratified sample can be performed over multiple columns:
* {{{
* import org.apache.spark.sql.Row
* import org.apache.spark.sql.functions.struct
*
* val df = spark.createDataFrame(Seq(("Bob", 17), ("Alice", 10), ("Nico", 8), ("Bob", 17),
* ("Alice", 10))).toDF("name", "age")
* val fractions = Map(Row("Alice", 10) -> 0.3, Row("Nico", 8) -> 1.0)
* df.stat.sampleBy(struct($"name", $"age"), fractions, 36L).show()
* +-----+---+
* | name|age|
* +-----+---+
* | Nico| 8|
* |Alice| 10|
* +-----+---+
* }}}
*
* @since 3.0.0
*/
def sampleBy[T](col: Column, fractions: Map[T, Double], seed: Long): DS[Row]
/**
* (Java-specific) Returns a stratified sample without replacement based on the fraction given
* on each stratum.
*
* @param col
* column that defines strata
* @param fractions
* sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as
* zero.
* @param seed
* random seed
* @tparam T
* stratum type
* @return
* a new `DataFrame` that represents the stratified sample
* @since 3.0.0
*/
def sampleBy[T](col: Column, fractions: ju.Map[T, jl.Double], seed: Long): DS[Row] = {
sampleBy(col, fractions.asScala.toMap.asInstanceOf[Map[T, Double]], seed)
}
/**
* Builds a Count-min Sketch over a specified column.
*
* @param colName
* name of the column over which the sketch is built
* @param depth
* depth of the sketch
* @param width
* width of the sketch
* @param seed
* random seed
* @return
* a `CountMinSketch` over column `colName`
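*
* For example (a minimal sketch, assuming `df` has a numeric "id" column):
* {{{
*   val sketch = df.stat.countMinSketch("id", depth = 10, width = 2000, seed = 42)
*   val approxCount = sketch.estimateCount(1L)
* }}}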
* @since 2.0.0
*/
def countMinSketch(colName: String, depth: Int, width: Int, seed: Int): CountMinSketch = {
countMinSketch(Column(colName), depth, width, seed)
}
/**
* Builds a Count-min Sketch over a specified column.
*
* @param colName
* name of the column over which the sketch is built
* @param eps
* relative error of the sketch
* @param confidence
* confidence of the sketch
* @param seed
* random seed
* @return
* a `CountMinSketch` over column `colName`
* @since 2.0.0
*/
def countMinSketch(
colName: String,
eps: Double,
confidence: Double,
seed: Int): CountMinSketch = {
countMinSketch(Column(colName), eps, confidence, seed)
}
/**
* Builds a Count-min Sketch over a specified column.
*
* @param col
* the column over which the sketch is built
* @param depth
* depth of the sketch
* @param width
* width of the sketch
* @param seed
* random seed
* @return
* a `CountMinSketch` over column `col`
* @since 2.0.0
*/
def countMinSketch(col: Column, depth: Int, width: Int, seed: Int): CountMinSketch = {
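// Translate the sketch dimensions into the equivalent error/confidence parameters:
// eps = 2 / width and confidence = 1 - 1 / 2^depth.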
val eps = 2.0 / width
val confidence = 1 - 1 / Math.pow(2, depth)
countMinSketch(col, eps, confidence, seed)
}
/**
* Builds a Count-min Sketch over a specified column.
*
* @param col
* the column over which the sketch is built
* @param eps
* relative error of the sketch
* @param confidence
* confidence of the sketch
* @param seed
* random seed
* @return
* a `CountMinSketch` over column `col`
* @since 2.0.0
*/
def countMinSketch(col: Column, eps: Double, confidence: Double, seed: Int): CountMinSketch =
withOrigin {
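// Aggregate the column into a serialized Count-min Sketch, collect the resulting bytes to
// the driver, and deserialize them into a CountMinSketch instance.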
val cms = count_min_sketch(col, lit(eps), lit(confidence), lit(seed))
val bytes: Array[Byte] = df.select(cms).as(BinaryEncoder).head()
CountMinSketch.readFrom(bytes)
}
/**
* Builds a Bloom filter over a specified column.
*
* @param colName
* name of the column over which the filter is built
* @param expectedNumItems
* expected number of items which will be put into the filter.
* @param fpp
* expected false positive probability of the filter.
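*
* For example (a minimal sketch, assuming `df` has a numeric "id" column):
* {{{
*   val filter = df.stat.bloomFilter("id", expectedNumItems = 1000000L, fpp = 0.03)
*   val maybeSeen = filter.mightContain(42L)
* }}}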
* @since 2.0.0
*/
def bloomFilter(colName: String, expectedNumItems: Long, fpp: Double): BloomFilter = {
bloomFilter(Column(colName), expectedNumItems, fpp)
}
/**
* Builds a Bloom filter over a specified column.
*
* @param col
* the column over which the filter is built
* @param expectedNumItems
* expected number of items which will be put into the filter.
* @param fpp
* expected false positive probability of the filter.
* @since 2.0.0
*/
def bloomFilter(col: Column, expectedNumItems: Long, fpp: Double): BloomFilter = {
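// Derive the filter size (in bits) from the expected item count and the target false
// positive probability.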
val numBits = BloomFilter.optimalNumOfBits(expectedNumItems, fpp)
bloomFilter(col, expectedNumItems, numBits)
}
/**
* Builds a Bloom filter over a specified column.
*
* @param colName
* name of the column over which the filter is built
* @param expectedNumItems
* expected number of items which will be put into the filter.
* @param numBits
* expected number of bits of the filter.
* @since 2.0.0
*/
def bloomFilter(colName: String, expectedNumItems: Long, numBits: Long): BloomFilter = {
bloomFilter(Column(colName), expectedNumItems, numBits)
}
/**
* Builds a Bloom filter over a specified column.
*
* @param col
* the column over which the filter is built
* @param expectedNumItems
* expected number of items which will be put into the filter.
* @param numBits
* expected number of bits of the filter.
* @since 2.0.0
*/
def bloomFilter(col: Column, expectedNumItems: Long, numBits: Long): BloomFilter = withOrigin {
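// Aggregate the column into a serialized Bloom filter via the internal bloom_filter_agg
// expression, collect the bytes, and deserialize them into a BloomFilter instance.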
val bf = Column.internalFn("bloom_filter_agg", col, lit(expectedNumItems), lit(numBits))
val bytes: Array[Byte] = df.select(bf).as(BinaryEncoder).head()
BloomFilter.readFrom(bytes)
}
}