/**
* /!\ Warning /!\
*
* This file is generated using zio-spark-codegen, you should not edit
* this file directly.
*/
package zio.spark.rdd
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.spark.{Dependency, Partition, Partitioner, TaskContext}
import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.rdd.{PartitionCoalescer, RDD => UnderlyingRDD, RDDBarrier}
import org.apache.spark.resource.ResourceProfile
import org.apache.spark.storage.StorageLevel
import zio._
import scala.collection.Map
import scala.io.Codec
import scala.reflect._
@SuppressWarnings(Array("scalafix:DisableSyntax.defaultArgs", "scalafix:DisableSyntax.null"))
final case class RDD[T](underlying: UnderlyingRDD[T]) { self =>
// scalafix:off
implicit private def lift[U](x: UnderlyingRDD[U]): RDD[U] = RDD(x)
implicit private def arrayToSeq2[U](x: UnderlyingRDD[Array[U]]): UnderlyingRDD[Seq[U]] = x.map(_.toIndexedSeq)
@inline private def noOrdering[U]: Ordering[U] = null
// scalafix:on
/** Applies an action to the underlying RDD. */
def action[U](f: UnderlyingRDD[T] => U)(implicit trace: Trace): Task[U] = ZIO.attempt(get(f))
/** Applies a transformation to the underlying RDD. */
def transformation[TNew](f: UnderlyingRDD[T] => UnderlyingRDD[TNew]): RDD[TNew] = RDD(f(underlying))
/** Applies an action to the underlying RDD. */
def get[U](f: UnderlyingRDD[T] => U): U = f(underlying)
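// Illustrative usage sketch (not part of the generated API; assumes an `rdd: RDD[Int]`
// exists in user code): `action` wraps a Spark action in a ZIO `Task`, while
// `transformation` stays lazy and returns a new wrapped RDD.
//
//   val total: Task[Int]  = rdd.action(_.reduce(_ + _))
//   val doubled: RDD[Int] = rdd.transformation(_.map(_ * 2))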
// Generated functions coming from spark
/** Returns the number of partitions of this RDD. */
def getNumPartitions: Int = get(_.getNumPartitions)
/**
* Get the array of partitions of this RDD, taking into account
* whether the RDD is checkpointed or not.
*/
def partitions: Seq[Partition] = get(_.partitions.toSeq)
/**
* Get the preferred locations of a partition, taking into account
* whether the RDD is checkpointed.
*/
def preferredLocations(split: Partition): Seq[String] = get(_.preferredLocations(split))
/**
* A description of this RDD and its recursive dependencies for
* debugging.
*/
def toDebugString: String = get(_.toDebugString)
// ===============
/**
* Aggregate the elements of each partition, and then the results for
* all the partitions, using given combine functions and a neutral
* "zero value". This function can return a different result type, U,
* than the type of this RDD, T. Thus, we need one operation for
* merging a T into an U and one operation for merging two U's, as in
* scala.TraversableOnce. Both of these functions are allowed to
* modify and return their first argument instead of creating a new U
* to avoid memory allocation.
*
* @param zeroValue
* the initial value for the accumulated result of each partition
* for the `seqOp` operator, and also the initial value for the
* combine results from different partitions for the `combOp`
* operator - this will typically be the neutral element (e.g. `Nil`
* for list concatenation or `0` for summation)
* @param seqOp
* an operator used to accumulate results within a partition
* @param combOp
* an associative operator used to combine results from different
* partitions
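*
* A minimal illustrative sketch (assuming an `rdd: RDD[Int]` in user code),
* computing the sum and the count in a single pass:
* {{{
* val sumAndCount: Task[(Int, Long)] =
*   rdd.aggregate((0, 0L))(
*     { case ((sum, count), x) => (sum + x, count + 1) },
*     { case ((s1, c1), (s2, c2)) => (s1 + s2, c1 + c2) }
*   )
* }}}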
*/
def aggregate[U: ClassTag](zeroValue: => U)(seqOp: (U, T) => U, combOp: (U, U) => U)(implicit trace: Trace): Task[U] =
action(_.aggregate[U](zeroValue)(seqOp, combOp))
/**
* Return an array that contains all of the elements in this RDD.
*
* @note
* This method should only be used if the resulting array is
* expected to be small, as all the data is loaded into the driver's
* memory.
*/
def collect(implicit trace: Trace): Task[Seq[T]] = action(_.collect().toSeq)
/** Return the number of elements in the RDD. */
def count(implicit trace: Trace): Task[Long] = action(_.count())
/**
* Approximate version of count() that returns a potentially
* incomplete result within a timeout, even if not all tasks have
* finished.
*
* The confidence is the probability that the error bounds of the
* result will contain the true value. That is, if countApprox were
* called repeatedly with confidence 0.9, we would expect 90% of the
* results to contain the true count. The confidence must be in the
* range [0,1] or an exception will be thrown.
*
* @param timeout
* maximum time to wait for the job, in milliseconds
* @param confidence
* the desired statistical confidence in the result
* @return
* a potentially incomplete result, with error bounds
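*
* For illustration only (assuming an `rdd: RDD[String]` in user code):
* {{{
* val approxCount: Task[PartialResult[BoundedDouble]] =
*   rdd.countApprox(timeout = 10000L, confidence = 0.90)
* }}}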
*/
def countApprox(timeout: => Long, confidence: => Double = 0.95)(implicit
trace: Trace
): Task[PartialResult[BoundedDouble]] = action(_.countApprox(timeout, confidence))
/**
* Return approximate number of distinct elements in the RDD.
*
* The algorithm used is based on streamlib's implementation of
* "HyperLogLog in Practice: Algorithmic Engineering of a State of The
* Art Cardinality Estimation Algorithm".
*
* The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting
* a nonzero `sp` (greater than `p`) triggers a sparse representation
* of registers, which may reduce the memory consumption and increase
* accuracy when the cardinality is small.
*
* @param p
* The precision value for the normal set. `p` must be a value
* between 4 and `sp` if `sp` is not zero (32 max).
* @param sp
* The precision value for the sparse set, between 0 and 32. If `sp`
* equals 0, the sparse representation is skipped.
*/
def countApproxDistinct(p: => Int, sp: => Int)(implicit trace: Trace): Task[Long] =
action(_.countApproxDistinct(p, sp))
/**
* Return approximate number of distinct elements in the RDD.
*
* The algorithm used is based on streamlib's implementation of
* "HyperLogLog in Practice: Algorithmic Engineering of a State of The
* Art Cardinality Estimation Algorithm".
*
* @param relativeSD
* Relative accuracy. Smaller values create counters that require
* more space. It must be greater than 0.000017.
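*
* A minimal sketch (illustrative, assuming an `rdd: RDD[String]`):
* {{{
* val approxDistinct: Task[Long] = rdd.countApproxDistinct(relativeSD = 0.01)
* }}}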
*/
def countApproxDistinct(relativeSD: => Double = 0.05)(implicit trace: Trace): Task[Long] =
action(_.countApproxDistinct(relativeSD))
/**
* Return the count of each unique value in this RDD as a local map of
* (value, count) pairs.
*
* @note
* This method should only be used if the resulting map is expected
* to be small, as the whole thing is loaded into the driver's
* memory. To handle very large results, consider using
*
* {{{
* rdd.map(x => (x, 1L)).reduceByKey(_ + _)
* }}}
*
* which returns an RDD[(T, Long)] instead of a map.
*/
def countByValue(implicit ord: Ordering[T] = noOrdering, trace: Trace): Task[Map[T, Long]] = action(_.countByValue())
/**
* Approximate version of countByValue().
*
* @param timeout
* maximum time to wait for the job, in milliseconds
* @param confidence
* the desired statistical confidence in the result
* @return
* a potentially incomplete result, with error bounds
*/
def countByValueApprox(timeout: => Long, confidence: => Double = 0.95)(implicit
ord: Ordering[T] = noOrdering,
trace: Trace
): Task[PartialResult[Map[T, BoundedDouble]]] = action(_.countByValueApprox(timeout, confidence))
/** Return the first element in this RDD. */
def first(implicit trace: Trace): Task[T] = action(_.first())
/**
* Aggregate the elements of each partition, and then the results for
* all the partitions, using a given associative function and a
* neutral "zero value". The function op(t1, t2) is allowed to modify
* t1 and return it as its result value to avoid object allocation;
* however, it should not modify t2.
*
* This behaves somewhat differently from fold operations implemented
* for non-distributed collections in functional languages like Scala.
* This fold operation may be applied to partitions individually, and
* then fold those results into the final result, rather than apply
* the fold to each element sequentially in some defined ordering. For
* functions that are not commutative, the result may differ from that
* of a fold applied to a non-distributed collection.
*
* @param zeroValue
* the initial value for the accumulated result of each partition
* for the `op` operator, and also the initial value for the combine
* results from different partitions for the `op` operator - this
* will typically be the neutral element (e.g. `Nil` for list
* concatenation or `0` for summation)
* @param op
* an operator used to both accumulate results within a partition
* and combine results from different partitions
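*
* For example (illustrative, assuming an `rdd: RDD[Int]`):
* {{{
* val total: Task[Int] = rdd.fold(0)(_ + _)
* }}}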
*/
def fold(zeroValue: => T)(op: (T, T) => T)(implicit trace: Trace): Task[T] = action(_.fold(zeroValue)(op))
// Actions (launch a job to return a value to the user program)
/** Applies a function f to all elements of this RDD. */
def foreach(f: T => Unit)(implicit trace: Trace): Task[Unit] = action(_.foreach(f))
/** Applies a function f to each partition of this RDD. */
def foreachPartition(f: Iterator[T] => Unit)(implicit trace: Trace): Task[Unit] = action(_.foreachPartition(f))
/**
* @note
* Due to complications in the internal implementation, this method
* will raise an exception if called on an RDD of `Nothing` or
* `Null`. This may come up in practice because, for example, the
* type of `parallelize(Seq())` is `RDD[Nothing]`.
* (`parallelize(Seq())` should be avoided anyway in favor of
* `parallelize(Seq[T]())`.)
* @return
* true if and only if the RDD contains no elements at all. Note
* that an RDD may be empty even when it has at least 1 partition.
*/
def isEmpty(implicit trace: Trace): Task[Boolean] = action(_.isEmpty())
/**
* Internal method to this RDD; will read from cache if applicable, or
* otherwise compute it. This should ''not'' be called by users
* directly, but is available for implementers of custom subclasses of
* RDD.
*/
def iterator(split: => Partition, context: => TaskContext)(implicit trace: Trace): Task[Iterator[T]] =
action(_.iterator(split, context))
/**
* Returns the max of this RDD as defined by the implicit Ordering[T].
* @return
* the maximum element of the RDD
*/
def max(implicit ord: Ordering[T], trace: Trace): Task[T] = action(_.max())
/**
* Returns the min of this RDD as defined by the implicit Ordering[T].
* @return
* the minimum element of the RDD
*/
def min(implicit ord: Ordering[T], trace: Trace): Task[T] = action(_.min())
/**
* Reduces the elements of this RDD using the specified commutative
* and associative binary operator.
*/
def reduce(f: (T, T) => T)(implicit trace: Trace): Task[T] = action(_.reduce(f))
/** Save this RDD as a SequenceFile of serialized objects. */
def saveAsObjectFile(path: => String)(implicit trace: Trace): Task[Unit] = action(_.saveAsObjectFile(path))
/**
* Save this RDD as a text file, using string representations of
* elements.
*/
def saveAsTextFile(path: => String)(implicit trace: Trace): Task[Unit] = action(_.saveAsTextFile(path))
/**
* Save this RDD as a compressed text file, using string
* representations of elements.
*/
def saveAsTextFile(path: => String, codec: => Class[_ <: CompressionCodec])(implicit trace: Trace): Task[Unit] =
action(_.saveAsTextFile(path, codec))
/**
* Take the first num elements of the RDD. It works by first scanning
* one partition, and using the results from that partition to estimate
* the number of additional partitions needed to satisfy the limit.
*
* @note
* This method should only be used if the resulting array is
* expected to be small, as all the data is loaded into the driver's
* memory.
*
* @note
* Due to complications in the internal implementation, this method
* will raise an exception if called on an RDD of `Nothing` or
* `Null`.
*/
def take(num: => Int)(implicit trace: Trace): Task[Seq[T]] = action(_.take(num).toSeq)
/**
* Returns the first k (smallest) elements from this RDD as defined by
* the specified implicit Ordering[T] and maintains the ordering. This
* does the opposite of [[top]]. For example:
* {{{
* sc.parallelize(Seq(10, 4, 2, 12, 3)).takeOrdered(1)
* // returns Array(2)
*
* sc.parallelize(Seq(2, 3, 4, 5, 6)).takeOrdered(2)
* // returns Array(2, 3)
* }}}
*
* @note
* This method should only be used if the resulting array is
* expected to be small, as all the data is loaded into the driver's
* memory.
*
* @param num
* k, the number of elements to return
* @param ord
* the implicit ordering for T
* @return
* an array of top elements
*/
def takeOrdered(num: => Int)(implicit ord: Ordering[T], trace: Trace): Task[Seq[T]] = action(_.takeOrdered(num).toSeq)
/**
* Return a fixed-size sampled subset of this RDD in an array
*
* @param withReplacement
* whether sampling is done with replacement
* @param num
* size of the returned sample
* @param seed
* seed for the random number generator
* @return
* sample of specified size in an array
*
* @note
* this method should only be used if the resulting array is
* expected to be small, as all the data is loaded into the driver's
* memory.
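*
* A minimal sketch (illustrative, assuming an `rdd: RDD[Int]`):
* {{{
* val sampled: Task[Seq[Int]] = rdd.takeSample(withReplacement = false, num = 5, seed = 42L)
* }}}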
*/
def takeSample(withReplacement: => Boolean, num: => Int, seed: Long)(implicit trace: Trace): Task[Seq[T]] =
action(_.takeSample(withReplacement, num, seed).toSeq)
/**
* Return an iterator that contains all of the elements in this RDD.
*
* The iterator will consume as much memory as the largest partition
* in this RDD.
*
* @note
* This results in multiple Spark jobs, and if the input RDD is the
* result of a wide transformation (e.g. join with different
* partitioners), the input RDD should be cached first to avoid
* recomputing it.
*/
def toLocalIterator(implicit trace: Trace): Task[Iterator[T]] = action(_.toLocalIterator)
/**
* Returns the top k (largest) elements from this RDD as defined by
* the specified implicit Ordering[T] and maintains the ordering. This
* does the opposite of [[takeOrdered]]. For example:
* {{{
* sc.parallelize(Seq(10, 4, 2, 12, 3)).top(1)
* // returns Array(12)
*
* sc.parallelize(Seq(2, 3, 4, 5, 6)).top(2)
* // returns Array(6, 5)
* }}}
*
* @note
* This method should only be used if the resulting array is
* expected to be small, as all the data is loaded into the driver's
* memory.
*
* @param num
* k, the number of top elements to return
* @param ord
* the implicit ordering for T
* @return
* an array of top elements
*/
def top(num: => Int)(implicit ord: Ordering[T], trace: Trace): Task[Seq[T]] = action(_.top(num).toSeq)
/**
* Aggregates the elements of this RDD in a multi-level tree pattern.
* This method is semantically identical to
* [[org.apache.spark.rdd.RDD#aggregate]].
*
* @param depth
* suggested depth of the tree (default: 2)
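*
* Illustrative sketch (assuming an `rdd: RDD[Int]`); semantically this behaves
* like `aggregate`, but partial results are merged in a tree of the given depth:
* {{{
* val total: Task[Int] = rdd.treeAggregate(0)(_ + _, _ + _, depth = 4)
* }}}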
*/
def treeAggregate[U: ClassTag](zeroValue: => U)(seqOp: (U, T) => U, combOp: (U, U) => U, depth: => Int = 2)(implicit
trace: Trace
): Task[U] = action(_.treeAggregate[U](zeroValue)(seqOp, combOp, depth))
/**
* Reduces the elements of this RDD in a multi-level tree pattern.
*
* @param depth
* suggested depth of the tree (default: 2)
* @see
* [[org.apache.spark.rdd.RDD#reduce]]
*/
def treeReduce(f: (T, T) => T, depth: => Int = 2)(implicit trace: Trace): Task[T] = action(_.treeReduce(f, depth))
// ===============
/**
* :: Experimental :: Marks the current stage as a barrier stage,
* where Spark must launch all tasks together. In case of a task
* failure, instead of only restarting the failed task, Spark will
* abort the entire stage and re-launch all tasks for this stage. The
* barrier execution mode feature is experimental and it only handles
* limited scenarios. Please read the linked SPIP and design docs to
* understand the limitations and future plans.
* @return
* an [[RDDBarrier]] instance that provides actions within a barrier
* stage
* @see
* [[org.apache.spark.BarrierTaskContext]]
* @see
* SPIP: Barrier Execution Mode
* @see
* Design Doc
*/
def barrier(implicit trace: Trace): Task[RDDBarrier[T]] = action(_.barrier())
/**
* Persist this RDD with the default storage level (`MEMORY_ONLY`).
*/
def cache(implicit trace: Trace): Task[RDD[T]] = action(_.cache())
/**
* Mark this RDD for checkpointing. It will be saved to a file inside
* the checkpoint directory set with `SparkContext#setCheckpointDir`
* and all references to its parent RDDs will be removed. This
* function must be called before any job has been executed on this
* RDD. It is strongly recommended that this RDD is persisted in
* memory, otherwise saving it on a file will require recomputation.
*/
def checkpoint(implicit trace: Trace): Task[Unit] = action(_.checkpoint())
/**
* Get the list of dependencies of this RDD, taking into account
* whether the RDD is checkpointed or not.
*/
def dependencies(implicit trace: Trace): Task[Seq[Dependency[_]]] = action(_.dependencies)
/**
* Gets the name of the directory to which this RDD was checkpointed.
* This is not defined if the RDD is checkpointed locally.
*/
def getCheckpointFile(implicit trace: Trace): Task[Option[String]] = action(_.getCheckpointFile)
/**
* Get the ResourceProfile specified with this RDD or null if it
* wasn't specified.
* @return
* the user specified ResourceProfile or null (for Java
* compatibility) if none was specified
*/
def getResourceProfile(implicit trace: Trace): Task[ResourceProfile] = action(_.getResourceProfile())
/**
* Get the RDD's current storage level, or StorageLevel.NONE if none
* is set.
*/
def getStorageLevel(implicit trace: Trace): Task[StorageLevel] = action(_.getStorageLevel)
/**
* Return whether this RDD is checkpointed and materialized, either
* reliably or locally.
*/
def isCheckpointed(implicit trace: Trace): Task[Boolean] = action(_.isCheckpointed)
/**
* Mark this RDD for local checkpointing using Spark's existing
* caching layer.
*
* This method is for users who wish to truncate RDD lineages while
* skipping the expensive step of replicating the materialized data in
* a reliable distributed file system. This is useful for RDDs with
* long lineages that need to be truncated periodically (e.g. GraphX).
*
* Local checkpointing sacrifices fault-tolerance for performance. In
* particular, checkpointed data is written to ephemeral local storage
* in the executors instead of to a reliable, fault-tolerant storage.
* The effect is that if an executor fails during the computation, the
* checkpointed data may no longer be accessible, causing an
* irrecoverable job failure.
*
* This is NOT safe to use with dynamic allocation, which removes
* executors along with their cached blocks. If you must use both
* features, you are advised to set
* `spark.dynamicAllocation.cachedExecutorIdleTimeout` to a high
* value.
*
* The checkpoint directory set through
* `SparkContext#setCheckpointDir` is not used.
*/
def localCheckpoint(implicit trace: Trace): Task[RDD[T]] = action(_.localCheckpoint())
/**
* Set this RDD's storage level to persist its values across
* operations after the first time it is computed. This can only be
* used to assign a new storage level if the RDD does not have a
* storage level set yet. Local checkpointing is an exception.
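*
* For example (illustrative, assuming an `rdd: RDD[Int]`):
* {{{
* val persisted: Task[RDD[Int]] = rdd.persist(StorageLevel.MEMORY_AND_DISK)
* }}}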
*/
def persist(newLevel: => StorageLevel)(implicit trace: Trace): Task[RDD[T]] = action(_.persist(newLevel))
/**
* Persist this RDD with the default storage level (`MEMORY_ONLY`).
*/
def persist(implicit trace: Trace): Task[RDD[T]] = action(_.persist())
/**
* Mark the RDD as non-persistent, and remove all blocks for it from
* memory and disk.
*
* @param blocking
* Whether to block until all blocks are deleted (default: false)
* @return
* This RDD.
*/
def unpersist(blocking: => Boolean = false)(implicit trace: Trace): Task[RDD[T]] = action(_.unpersist(blocking))
// ===============
/**
* Return the union of this RDD and another one. Any identical
* elements will appear multiple times (use `.distinct()` to eliminate
* them).
*/
def ++(other: RDD[T]): RDD[T] = transformation(_.++(other.underlying))
/**
* Return the Cartesian product of this RDD and another one, that is,
* the RDD of all pairs of elements (a, b) where a is in `this` and b
* is in `other`.
*/
def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)] = transformation(_.cartesian[U](other.underlying))
/**
* Return a new RDD that is reduced into `numPartitions` partitions.
*
* This results in a narrow dependency, e.g. if you go from 1000
* partitions to 100 partitions, there will not be a shuffle; instead,
* each of the 100 new partitions will claim 10 of the current
* partitions. If a larger number of partitions is requested, it will
* stay at the current number of partitions.
*
* However, if you're doing a drastic coalesce, e.g. to
* {{{numPartitions = 1}}}, this may result in your computation taking
* place on fewer nodes than you like (e.g. one node in the case of
* {{{numPartitions = 1}}}). To avoid this, you can pass shuffle =
* true. This will add a shuffle step, but means the current upstream
* partitions will be executed in parallel (per whatever the current
* partitioning is).
*
* @note
* With shuffle = true, you can actually coalesce to a larger number
* of partitions. This is useful if you have a small number of
* partitions, say 100, potentially with a few partitions being
* abnormally large. Calling coalesce(1000, shuffle = true) will
* result in 1000 partitions with the data distributed using a hash
* partitioner. The optional partition coalescer passed in must be
* serializable.
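*
* A hedged usage sketch (assuming an `rdd: RDD[Int]` with many small partitions):
* {{{
* val narrowed: RDD[Int]   = rdd.coalesce(10)                   // narrow dependency, no shuffle
* val rebalanced: RDD[Int] = rdd.coalesce(1000, shuffle = true) // adds a shuffle step
* }}}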
*/
def coalesce(
numPartitions: Int,
shuffle: Boolean = false,
partitionCoalescer: Option[PartitionCoalescer] = Option.empty
)(implicit ord: Ordering[T] = noOrdering): RDD[T] =
transformation(_.coalesce(numPartitions, shuffle, partitionCoalescer))
/** Return a new RDD containing the distinct elements in this RDD. */
def distinct(numPartitions: Int)(implicit ord: Ordering[T] = noOrdering): RDD[T] =
transformation(_.distinct(numPartitions))
/** Return a new RDD containing the distinct elements in this RDD. */
def distinct: RDD[T] = transformation(_.distinct())
/**
* Return a new RDD containing only the elements that satisfy a
* predicate.
*/
def filter(f: T => Boolean): RDD[T] = transformation(_.filter(f))
/**
* Return a new RDD by first applying a function to all elements of
* this RDD, and then flattening the results.
*/
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] = transformation(_.flatMap[U](f))
/**
* Return an RDD created by coalescing all elements within each
* partition into an array.
*/
def glom: RDD[Seq[T]] = transformation(_.glom())
/**
* Return the intersection of this RDD and another one. The output
* will not contain any duplicate elements, even if the input RDDs
* did.
*
* @note
* This method performs a shuffle internally.
*/
def intersection(other: RDD[T]): RDD[T] = transformation(_.intersection(other.underlying))
/**
* Return the intersection of this RDD and another one. The output
* will not contain any duplicate elements, even if the input RDDs
* did.
*
* @note
* This method performs a shuffle internally.
*
* @param partitioner
* Partitioner to use for the resulting RDD
*/
def intersection(other: RDD[T], partitioner: Partitioner)(implicit ord: Ordering[T] = noOrdering): RDD[T] =
transformation(_.intersection(other.underlying, partitioner))
/**
* Return the intersection of this RDD and another one. The output
* will not contain any duplicate elements, even if the input RDDs
* did. Performs a hash partition across the cluster.
*
* @note
* This method performs a shuffle internally.
*
* @param numPartitions
* How many partitions to use in the resulting RDD
*/
def intersection(other: RDD[T], numPartitions: Int): RDD[T] =
transformation(_.intersection(other.underlying, numPartitions))
/** Creates tuples of the elements in this RDD by applying `f`. */
def keyBy[K](f: T => K): RDD[(K, T)] = transformation(_.keyBy[K](f))
// Transformations (return a new RDD)
/**
* Return a new RDD by applying a function to all elements of this
* RDD.
*/
def map[U: ClassTag](f: T => U): RDD[U] = transformation(_.map[U](f))
/**
* Return a new RDD by applying a function to each partition of this
* RDD.
*
* `preservesPartitioning` indicates whether the input function
* preserves the partitioner, which should be `false` unless this is a
* pair RDD and the input function doesn't modify the keys.
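*
* Illustrative sketch (assuming an `rdd: RDD[String]`); the function runs once
* per partition, so expensive setup can be shared across its elements:
* {{{
* val lengths: RDD[Int] = rdd.mapPartitions { it =>
*   // any per-partition setup would go here
*   it.map(_.length)
* }
* }}}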
*/
def mapPartitions[U: ClassTag](f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] =
transformation(_.mapPartitions[U](f, preservesPartitioning))
/**
* Return a new RDD by applying a function to each partition of this
* RDD, while tracking the index of the original partition.
*
* `preservesPartitioning` indicates whether the input function
* preserves the partitioner, which should be `false` unless this is a
* pair RDD and the input function doesn't modify the keys.
*/
def mapPartitionsWithIndex[U: ClassTag](
f: (Int, Iterator[T]) => Iterator[U],
preservesPartitioning: Boolean = false
): RDD[U] = transformation(_.mapPartitionsWithIndex[U](f, preservesPartitioning))
/**
* Return an RDD created by piping elements to a forked external
* process.
*/
def pipe(command: String): RDD[String] = transformation(_.pipe(command))
/**
* Return an RDD created by piping elements to a forked external
* process.
*/
def pipe(command: String, env: Map[String, String]): RDD[String] = transformation(_.pipe(command, env))
/**
* Return an RDD created by piping elements to a forked external
* process. The resulting RDD is computed by executing the given
* process once per partition. All elements of each input partition
* are written to a process's stdin as lines of input separated by a
* newline. The resulting partition consists of the process's stdout
* output, with each line of stdout resulting in one element of the
* output partition. A process is invoked even for empty partitions.
*
* The print behavior can be customized by providing two functions.
*
* @param command
* command to run in forked process.
* @param env
* environment variables to set.
* @param printPipeContext
* Before piping elements, this function is called as an opportunity
* to pipe context data. Print line function (like out.println) will
* be passed as printPipeContext's parameter.
* @param printRDDElement
* Use this function to customize how to pipe elements. This
* function will be called with each RDD element as the 1st
* parameter, and the print line function (like out.println()) as
* the 2nd parameter. An example of piping the RDD data of groupBy()
* in a streaming way, instead of constructing a huge String to
* concatenate all the elements:
* {{{
* def printRDDElement(record:(String, Seq[String]), f:String=>Unit) =
* for (e <- record._2) {f(e)}
* }}}
* @param separateWorkingDir
* Use separate working directories for each task.
* @param bufferSize
* Buffer size for the stdin writer for the piped process.
* @param encoding
* Char encoding used for interacting (via stdin, stdout and stderr)
* with the piped process
* @return
* the result RDD
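*
* For example (illustrative; assumes the external command is available on the
* executors):
* {{{
* val lineCounts: RDD[String] = rdd.pipe(Seq("wc", "-l"))
* }}}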
*/
def pipe(
command: Seq[String],
env: Map[String, String] = Map(),
printPipeContext: (String => Unit) => Unit = null,
printRDDElement: (T, String => Unit) => Unit = null,
separateWorkingDir: Boolean = false,
bufferSize: Int = 8192,
encoding: String = Codec.defaultCharsetCodec.name
): RDD[String] =
transformation(_.pipe(command, env, printPipeContext, printRDDElement, separateWorkingDir, bufferSize, encoding))
/**
* Return a new RDD that has exactly numPartitions partitions.
*
* Can increase or decrease the level of parallelism in this RDD.
* Internally, this uses a shuffle to redistribute data.
*
* If you are decreasing the number of partitions in this RDD,
* consider using `coalesce`, which can avoid performing a shuffle.
*/
def repartition(numPartitions: Int)(implicit ord: Ordering[T] = noOrdering): RDD[T] =
transformation(_.repartition(numPartitions))
/**
* Return a sampled subset of this RDD.
*
* @param withReplacement
* can elements be sampled multiple times (replaced when sampled
* out)
* @param fraction
* expected size of the sample as a fraction of this RDD's size.
* Without replacement: probability that each element is chosen;
* fraction must be in [0, 1]. With replacement: expected number of
* times each element is chosen; fraction must be greater than or
* equal to 0.
* @param seed
* seed for the random number generator
*
* @note
* This is NOT guaranteed to provide exactly the fraction of the
* count of the given [[RDD]].
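*
* A minimal sketch (illustrative, assuming an `rdd: RDD[Int]`):
* {{{
* val tenPercent: RDD[Int] = rdd.sample(withReplacement = false, fraction = 0.1, seed = 42L)
* }}}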
*/
def sample(withReplacement: Boolean, fraction: Double, seed: Long): RDD[T] =
transformation(_.sample(withReplacement, fraction, seed))
/** Return this RDD sorted by the given key function. */
def sortBy[K](f: (T) => K, ascending: Boolean = true, numPartitions: Int = this.partitions.length)(implicit
ord: Ordering[K],
ctag: ClassTag[K]
): RDD[T] = transformation(_.sortBy[K](f, ascending, numPartitions))
/**
* Return an RDD with the elements from `this` that are not in
* `other`.
*
* Uses `this` RDD's partitioner/partition size, because even if
* `other` is huge, the resulting RDD will be no larger than this one.
*/
def subtract(other: RDD[T]): RDD[T] = transformation(_.subtract(other.underlying))
/**
* Return an RDD with the elements from `this` that are not in
* `other`.
*/
def subtract(other: RDD[T], numPartitions: Int): RDD[T] = transformation(_.subtract(other.underlying, numPartitions))
/**
* Return an RDD with the elements from `this` that are not in
* `other`.
*/
def subtract(other: RDD[T], p: Partitioner)(implicit ord: Ordering[T] = noOrdering): RDD[T] =
transformation(_.subtract(other.underlying, p))
/**
* Return the union of this RDD and another one. Any identical
* elements will appear multiple times (use `.distinct()` to eliminate
* them).
*/
def union(other: RDD[T]): RDD[T] = transformation(_.union(other.underlying))
/**
* Specify a ResourceProfile to use when calculating this RDD. This is
* only supported on certain cluster managers and currently requires
* dynamic allocation to be enabled. It will result in new executors
* with the resources specified being acquired to calculate the RDD.
*/
def withResources(rp: ResourceProfile): RDD[T] = transformation(_.withResources(rp))
/**
* Zips this RDD with another one, returning key-value pairs with the
* first element in each RDD, second element in each RDD, etc. Assumes
* that the two RDDs have the *same number of partitions* and the
* *same number of elements in each partition* (e.g. one was made
* through a map on the other).
*/
def zip[U: ClassTag](other: RDD[U]): RDD[(T, U)] = transformation(_.zip[U](other.underlying))
/**
* Zip this RDD's partitions with one (or more) RDD(s) and return a
* new RDD by applying a function to the zipped partitions. Assumes
* that all the RDDs have the *same number of partitions*, but does
* *not* require them to have the same number of elements in each
* partition.
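*
* A sketch (illustrative, assuming `as: RDD[Int]` and `bs: RDD[String]` with the
* same number of partitions):
* {{{
* val pairs: RDD[(Int, String)] = as.zipPartitions(bs)((ia, ib) => ia.zip(ib))
* }}}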
*/
def zipPartitions[B: ClassTag, V: ClassTag](rdd2: RDD[B], preservesPartitioning: Boolean)(
f: (Iterator[T], Iterator[B]) => Iterator[V]
): RDD[V] = transformation(_.zipPartitions[B, V](rdd2.underlying, preservesPartitioning)(f))
def zipPartitions[B: ClassTag, V: ClassTag](rdd2: RDD[B])(f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V] =
transformation(_.zipPartitions[B, V](rdd2.underlying)(f))
def zipPartitions[B: ClassTag, C: ClassTag, V: ClassTag](rdd2: RDD[B], rdd3: RDD[C], preservesPartitioning: Boolean)(
f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]
): RDD[V] = transformation(_.zipPartitions[B, C, V](rdd2.underlying, rdd3.underlying, preservesPartitioning)(f))
def zipPartitions[B: ClassTag, C: ClassTag, V: ClassTag](rdd2: RDD[B], rdd3: RDD[C])(
f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]
): RDD[V] = transformation(_.zipPartitions[B, C, V](rdd2.underlying, rdd3.underlying)(f))
def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag](
rdd2: RDD[B],
rdd3: RDD[C],
rdd4: RDD[D],
preservesPartitioning: Boolean
)(f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]): RDD[V] =
transformation(
_.zipPartitions[B, C, D, V](rdd2.underlying, rdd3.underlying, rdd4.underlying, preservesPartitioning)(f)
)
def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag](rdd2: RDD[B], rdd3: RDD[C], rdd4: RDD[D])(
f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]
): RDD[V] = transformation(_.zipPartitions[B, C, D, V](rdd2.underlying, rdd3.underlying, rdd4.underlying)(f))
/**
* Zips this RDD with its element indices. The ordering is first based
* on the partition index and then the ordering of items within each
* partition. So the first item in the first partition gets index 0,
* and the last item in the last partition receives the largest index.
*
* This is similar to Scala's zipWithIndex but it uses Long instead of
* Int as the index type. This method needs to trigger a Spark job
* when this RDD contains more than one partition.
*
* @note
* Some RDDs, such as those returned by groupBy(), do not guarantee
* order of elements in a partition. The index assigned to each
* element is therefore not guaranteed, and may even change if the
* RDD is reevaluated. If a fixed ordering is required to guarantee
* the same index assignments, you should sort the RDD with
* sortByKey() or save it to a file.
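*
* Illustrative sketch (assuming an `rdd: RDD[String]`):
* {{{
* val indexed: RDD[(String, Long)] = rdd.zipWithIndex
* }}}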
*/
def zipWithIndex: RDD[(T, Long)] = transformation(_.zipWithIndex())
/**
* Zips this RDD with generated unique Long ids. Items in the kth
* partition will get ids k, n+k, 2*n+k, ..., where n is the number of
* partitions. So there may exist gaps, but this method won't trigger
* a Spark job, which is different from
* [[org.apache.spark.rdd.RDD#zipWithIndex]].
*
* @note
* Some RDDs, such as those returned by groupBy(), do not guarantee
* order of elements in a partition. The unique ID assigned to each
* element is therefore not guaranteed, and may even change if the
* RDD is reevaluated. If a fixed ordering is required to guarantee
* the same index assignments, you should sort the RDD with
* sortByKey() or save it to a file.
*/
def zipWithUniqueId: RDD[(T, Long)] = transformation(_.zipWithUniqueId())
// ===============
// Methods that need to be implemented
//
// [[org.apache.spark.rdd.RDD.context]]
// [[org.apache.spark.rdd.RDD.sparkContext]]
// ===============
// Ignored methods
//
// [[org.apache.spark.rdd.RDD.collect]]
// [[org.apache.spark.rdd.RDD.groupBy]]
// [[org.apache.spark.rdd.RDD.setName]]
// [[org.apache.spark.rdd.RDD.toJavaRDD]]
// [[org.apache.spark.rdd.RDD.toString]]
}