/*
 * Copyright 2019 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.values

import java.lang.{Boolean => JBoolean, Double => JDouble, Iterable => JIterable}
import java.util.concurrent.ThreadLocalRandom
import com.spotify.scio.ScioContext
import com.spotify.scio.coders.{Coder, CoderMaterializer}
import com.spotify.scio.estimators.{
import com.spotify.scio.schemas.{Schema, SchemaMaterializer}
import com.spotify.scio.testing.TestDataManager
import com.spotify.scio.transforms.BatchDoFn
import com.spotify.scio.util.FilenamePolicySupplier
import com.spotify.scio.util._
import com.twitter.algebird.{Aggregator, Monoid, MonoidAggregator, Semigroup}
import org.apache.beam.sdk.coders.{ByteArrayCoder, Coder => BCoder}
import org.apache.beam.sdk.schemas.SchemaCoder
import{Compression, FileBasedSource}
import org.apache.beam.sdk.transforms.DoFn.{Element, OutputReceiver, ProcessElement, Timestamp}
import org.apache.beam.sdk.transforms._
import org.apache.beam.sdk.transforms.windowing._
import org.apache.beam.sdk.util.{CoderUtils, SerializableUtils}
import org.apache.beam.sdk.values.WindowingStrategy.AccumulationMode
import org.apache.beam.sdk.values._
import org.apache.beam.sdk.{io => beam}
import org.joda.time.{Duration, Instant, ReadableInstant}
import org.slf4j.LoggerFactory

import scala.jdk.CollectionConverters._
import scala.collection.compat._
import scala.collection.immutable.TreeMap
import scala.reflect.ClassTag
import scala.util.Try
import com.twitter.chill.ClosureCleaner
import org.typelevel.scalaccompat.annotation.{nowarn, unused}

/** Convenience functions for creating SCollections. */
object SCollection {
  // Logger visible inside the `values` package, named after this object's runtime class.
  private[values] val logger: org.slf4j.Logger = LoggerFactory.getLogger(getClass)

  /**
   * Create a union of multiple [[SCollection]] instances. Will throw an exception if the provided
   * iterable is empty. For a version that accepts empty iterables, see [[ScioContext#unionAll]].
   */
  // `T: Coder` context bound is required since `scs` might be empty.
  def unionAll[T: Coder](scs: Iterable[SCollection[T]]): SCollection[T] =

  /** Implicitly enrich an `SCollection[Double]` with [[DoubleSCollectionFunctions]]. */
  implicit def makeDoubleSCollectionFunctions(
    s: SCollection[Double]
  ): DoubleSCollectionFunctions = {
    val doubleOps = new DoubleSCollectionFunctions(s)
    doubleOps
  }

  /** Implicit conversion from SCollection to DoubleSCollectionFunctions. */
  implicit def makeDoubleSCollectionFunctions[T](
    s: SCollection[T]
  )(implicit num: Numeric[T]): DoubleSCollectionFunctions =
    new DoubleSCollectionFunctions(

  /** Implicitly enrich an `SCollection` of key-value tuples with [[PairSCollectionFunctions]]. */
  implicit def makePairSCollectionFunctions[K, V](
    s: SCollection[(K, V)]
  ): PairSCollectionFunctions[K, V] = {
    val pairOps = new PairSCollectionFunctions(s)
    pairOps
  }

  /** Implicitly enrich an `SCollection` of key-value tuples with [[PairHashSCollectionFunctions]]. */
  implicit def makePairHashSCollectionFunctions[K, V](
    s: SCollection[(K, V)]
  ): PairHashSCollectionFunctions[K, V] = {
    val hashOps = new PairHashSCollectionFunctions(s)
    hashOps
  }

  /** Implicitly enrich an `SCollection` of key-value tuples with [[PairSkewedSCollectionFunctions]]. */
  implicit def makePairSkewedSCollectionFunctions[K, V](
    s: SCollection[(K, V)]
  ): PairSkewedSCollectionFunctions[K, V] = {
    val skewedOps = new PairSkewedSCollectionFunctions(s)
    skewedOps
  }

  // Internal flags attached to an SCollection; `postGbkOp` defaults to `false`.
  // NOTE(review): presumably marks a collection produced right after a group-by-key -- confirm.
  private[scio] final case class State(postGbkOp: Boolean = false)

/**
 * A Scala wrapper for [[org.apache.beam.sdk.values.PCollection PCollection]]. Represents an
 * immutable, partitioned collection of elements that can be operated on in parallel. This class
 * contains the basic operations available on all SCollections, such as `map`, `filter`, and `sum`.
 * In addition, [[PairSCollectionFunctions]] contains operations available only on SCollections of
 * key-value pairs, such as `groupByKey` and `join`; [[DoubleSCollectionFunctions]] contains
 * operations available only on SCollections of `Double`s.
 *
 * @groupname collection
 * Collection Operations
 * @groupname hash
 * Hash Operations
 * @groupname output
 * Output Sinks
 * @groupname side
 * Side Input and Output Operations
 * @groupname transform
 * Transformations
 * @groupname window
 * Windowing Operations
 */
sealed trait SCollection[T] extends PCollectionWrapper[T] {
  self =>

  import TupleFunctions._

  // =======================================================================
  // States
  // =======================================================================

  // Mutable per-instance state, initialised with the defaults of `SCollection.State`;
  // reassigned by `withState` and exposed read-only through `state`.
  private var _state: SCollection.State = SCollection.State()

  private[scio] def withState(f: SCollection.State => SCollection.State): SCollection[T] = {
    _state = f(_state)

  // Read-only accessor for the current internal state.
  private[scio] def state: SCollection.State = {
    val current = _state
    current
  }

  // =======================================================================
  // Delegations for internal PCollection
  // =======================================================================

  /** The name of this SCollection, as reported by `getName` on the internal collection. */
  def name: String = self.internal.getName

  /** Assign a Coder to this SCollection. */
  def setCoder(coder: org.apache.beam.sdk.coders.Coder[T]): SCollection[T] =

  // Attaches `schema` to this SCollection. A collection whose internal PCollection already has
  // a schema is returned unchanged.
  def setSchema(schema: Schema[T])(implicit ct: ClassTag[T]): SCollection[T] =
    if (internal.hasSchema) {
      this
    } else {
      // Beam schema plus the two conversion functions produced by the materializer.
      val (s, to, from) = SchemaMaterializer.materialize(schema)
      val td = TypeDescriptor.of(ScioUtil.classOf[T])
      try {
        context.wrap(internal.setSchema(s, td, to, from))
      } catch {
        case _: IllegalStateException =>
          // Coder has already been set: fall back to an identity map whose output carries a
          // schema coder built from the same pieces.
          map(identity)(Coder.beam(SchemaCoder.of(s, td, to, from)))
      }
    }

  private def ensureSerializable[A](coder: BCoder[A]): Either[Throwable, BCoder[A]] =
    coder match {
      case c if !context.isTest =>
      case c if c.getClass.getPackage.getName.startsWith("org.apache.beam") =>
      case _ =>

  /**
   * Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
   * [[SCollection]].
   */
  def applyTransform[U: Coder](
    transform: PTransform[_ >: PCollection[T], PCollection[U]]
  ): SCollection[U] = {
    // Resolve the step name first, then delegate to the named overload.
    val stepName = tfName
    applyTransform(stepName, transform)
  }

  /**
   * Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
   * [[SCollection]].
   * @param name
   *   default transform name
   * @param transform
   *   [[org.apache.beam.sdk.transforms.PTransform PTransform]] to be applied
   */
  def applyTransform[U: Coder](
    name: String,
    transform: PTransform[_ >: PCollection[T], PCollection[U]]
  ): SCollection[U] = {
    // Beam coder for the output type, materialized from the implicit Scio coder.
    val coder = CoderMaterializer.beam(context, Coder[U])
    // A `Left` from the check is rethrown; a `Right` coder is set on the transform's output.
    ensureSerializable(coder).fold(throw _, pApply(name, transform).setCoder)
  }

  private[scio] def pApply[U](
    name: Option[String],
    transform: PTransform[_ >: PCollection[T], PCollection[U]]
  ): SCollection[U] = {
    val isCombineGlobally = classOf[Combine.Globally[T, U]].isAssignableFrom(transform.getClass)
    val t = if (isCombineGlobally && ScioUtil.isWindowed(this)) {
      // In case PCollection is windowed
      transform.asInstanceOf[Combine.Globally[T, U]].withoutDefaults()
    } else {
    context.wrap(this.applyInternal(name, t))

  // Unnamed variant: forwards to the `Option[String]` overload with no explicit name.
  private[scio] def pApply[U](
    transform: PTransform[_ >: PCollection[T], PCollection[U]]
  ): SCollection[U] = {
    val noName: Option[String] = None
    pApply(noName, transform)
  }

  // Named variant: a `null` name becomes `None` before forwarding to the `Option[String]` overload.
  private[scio] def pApply[U](
    name: String,
    transform: PTransform[_ >: PCollection[T], PCollection[U]]
  ): SCollection[U] = {
    val maybeName: Option[String] = Option(name)
    pApply(maybeName, transform)
  }

  private[scio] def parDo[U: Coder](fn: DoFn[T, U]): SCollection[U] =
      .setCoder(CoderMaterializer.beam(context, Coder[U]))

  /**
   * Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
   * [[SCollection]]. This is a special case of [[applyTransform]] for transforms with [[KV]]
   * output.
   */
  def applyKvTransform[K: Coder, V: Coder](
    transform: PTransform[_ >: PCollection[T], PCollection[KV[K, V]]]
  ): SCollection[KV[K, V]] = {
    // Resolve the step name first, then delegate to the named overload.
    val stepName = tfName
    applyKvTransform(stepName, transform)
  }

  /**
   * Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
   * [[SCollection]]. This is a special case of [[applyTransform]] for transforms with [[KV]]
   * output.
   * @param name
   *   default transform name
   * @param transform
   *   [[org.apache.beam.sdk.transforms.PTransform PTransform]] to be applied
   */
  def applyKvTransform[K: Coder, V: Coder](
    name: String,
    transform: PTransform[_ >: PCollection[T], PCollection[KV[K, V]]]
  ): SCollection[KV[K, V]] = {
    // Plain `applyTransform` with the output type fixed to `KV[K, V]`.
    applyTransform[KV[K, V]](name, transform)
  }

  /** Apply `f` to this SCollection as a single transform named after `tfName`. */
  def transform[U](f: SCollection[T] => SCollection[U]): SCollection[U] = {
    val stepName = this.tfName
    transform(stepName)(f)
  }

  def transform[U](name: String)(f: SCollection[T] => SCollection[U]): SCollection[U] =

  private[scio] def transform_[U <: POutput](f: SCollection[T] => U): U =

  private[scio] def transform_[U <: POutput](name: String)(f: SCollection[T] => U): U = {
      new PTransform[PCollection[T], U]() {
        override def expand(input: PCollection[T]): U = f(context.wrap(input))

  // =======================================================================
  // Collection operations
  // =======================================================================

  /** Widen this [[SCollection]] to a supertype `U` of `T`; implemented as a cast of this instance. */
  def covary[U >: T]: SCollection[U] = self.asInstanceOf[SCollection[U]]

  /**
   * Widen this [[SCollection]] to `U`, given implicit evidence that `T` is a subtype of `U`;
   * implemented as a cast of this instance.
   */
  def covary_[U](implicit @unused ev: T <:< U): SCollection[U] = self.asInstanceOf[SCollection[U]]

  /** Narrow this [[SCollection]] to a subtype `U` of `T`; implemented as a cast of this instance. */
  def contravary[U <: T]: SCollection[U] = self.asInstanceOf[SCollection[U]]

  /**
   * Convert this SCollection to an [[SCollectionWithFanout]] that uses an intermediate node to
   * combine parts of the data to reduce load on the final global combine step.
   * @param fanout
   *   the number of intermediate keys that will be used
   */
  def withFanout(fanout: Int): SCollectionWithFanout[T] = {
    // Wrap this collection together with the requested fanout.
    val fannedOut = new SCollectionWithFanout[T](this, fanout)
    fannedOut
  }

  /**
   * Return the union of this SCollection and another one. Any identical elements will appear
   * multiple times (use [[distinct]] to eliminate them).
   * @group collection
   */
  def ++(that: SCollection[T]): SCollection[T] = {
    // Symbolic alias for `union`.
    union(that)
  }

   * Return the union of this SCollection and another one. Any identical elements will appear
   * multiple times (use [[distinct]] to eliminate them).
   * @group collection
  def union(that: SCollection[T]): SCollection[T] = {
    val o = PCollectionList
      .apply(this.tfName, Flatten.pCollections())

   * Return the intersection of this SCollection and another one. The output will not contain any
   * duplicate elements, even if the input SCollections did.
   * Note that this method performs a shuffle internally.
   * @group collection
  def intersection(that: SCollection[T]): SCollection[T] =
    this.transform {, 1)).cogroup(, 1))).flatMap { t =>
        if (t._2._1.nonEmpty && t._2._2.nonEmpty) Seq(t._1) else Seq.empty

   * Partition this SCollection with the provided function.
   * @param numPartitions
   *   number of output partitions
   * @param f
   *   function that assigns an output partition to each element, should be in the range `[0,
   *   numPartitions - 1]`
   * @return
   *   partitioned SCollections in a `Seq`
   * @group collection
  def partition(numPartitions: Int, f: T => Int): Seq[SCollection[T]] = {
    require(numPartitions > 0, "Number of partitions should be positive")
    if (numPartitions == 1) {
    } else {
        .applyInternal(Partition.of[T](numPartitions, Functions.partitionFn[T](f)))

  /**
   * Partition this SCollection into a pair of SCollections according to a predicate.
   * @param p
   *   predicate on which to partition
   * @return
   *   a pair of SCollections: the first SCollection consists of all elements that satisfy the
   *   predicate p and the second consists of all elements that do not.
   * @group collection
   */
  def partition(p: T => Boolean): (SCollection[T], SCollection[T]) = {
    // Two-way split: index 0 receives the elements satisfying `p`, index 1 the rest.
    val Seq(left, right) = partition(2, t => if (p(t)) 0 else 1): @nowarn
    (left, right)
  }

  /**
   * Partition this SCollection into a map from possible key values to an SCollection of
   * corresponding elements based on the provided function.
   * @param partitionKeys
   *   The keys for the output partitions
   * @param f
   *   function that assigns an output partition to each element, should be in the range of
   *   `partitionKeys`
   * @return
   *   partitioned SCollections in a `Map`
   * @group collection
   */
  def partitionByKey[U](partitionKeys: Set[U])(f: T => U): Map[U, SCollection[T]] = {
    val partitionKeysIndexed = partitionKeys.toIndexedSeq

      .zip(partition(partitionKeys.size, (t: T) => partitionKeysIndexed.indexOf(f(t))))

   * Partition this SCollection using T.## into `n` partitions
   * @param numPartitions
   *   number of output partitions
   * @return
   *   partitioned SCollections in a `Seq`
   * @group collection
  def hashPartition(numPartitions: Int): Seq[SCollection[T]] =
      t => Math.floorMod(ScioUtil.consistentHashCode(t), numPartitions)

  // =======================================================================
  // Transformations
  // =======================================================================

  /**
   * Aggregate the elements using given combine functions and a neutral "zero value". This function
   * can return a different result type, `U`, than the type of this SCollection, `T`. Thus, we need
   * one operation for merging a `T` into an `U` and one operation for merging two `U`'s. Both of
   * these functions are allowed to modify and return their first argument instead of creating a new
   * `U` to avoid memory allocation.
   * @group transform
   */
  def aggregate[U: Coder](
    zeroValue: => U
  )(seqOp: (U, T) => U, combOp: (U, U) => U): SCollection[U] = {
    // Build the combine function from the zero value and the two merge operations,
    // then run it as a global combine.
    val combineFn = Functions.aggregateFn(context, zeroValue)(seqOp, combOp)
    this.pApply(Combine.globally(combineFn))
  }

   * Aggregate with [[com.twitter.algebird.Aggregator Aggregator]]. First each item `T` is mapped to
   * `A`, then we reduce with a [[com.twitter.algebird.Semigroup Semigroup]] of `A`, then finally we
   * present the results as `U`. This could be more powerful and better optimized in some cases.
   * @group transform
  def aggregate[A: Coder, U: Coder](aggregator: Aggregator[T, A, U]): SCollection[U] =
    this.transform { in =>
      val a = aggregator // defeat closure

   * Aggregate with [[com.twitter.algebird.MonoidAggregator MonoidAggregator]]. First each item `T`
   * is mapped to `A`, then we reduce with a [[com.twitter.algebird.Monoid Monoid]] of `A`, then
   * finally we present the results as `U`. This could be more powerful and better optimized in some
   * cases.
   * @group transform
  def aggregate[A: Coder, U: Coder](aggregator: MonoidAggregator[T, A, U]): SCollection[U] =
    this.transform { in =>
      val a = aggregator // defeat closure

   * Batches elements for amortized processing. Elements are batched per-window and batches emitted
   * in the window corresponding to its contents.
   * Batches are emitted even if the maximum size is not reached when bundle finishes or when there
   * are too many live windows.
   * @param batchSize
   *   desired number of elements in a batch
   * @param maxLiveWindows
   *   maximum number of window buffering
   * @group collection
  def batch(
    batchSize: Long,
    maxLiveWindows: Int = BatchDoFn.DEFAULT_MAX_LIVE_WINDOWS
  ): SCollection[Iterable[T]] = {
    val weigher = Functions.serializableFn[T, java.lang.Long](_ => 1)
      .parDo(new BatchDoFn[T](batchSize, weigher, maxLiveWindows))(Coder.aggregate)

  /**
   * Batches elements for amortized processing. Elements are batched per-window and batches emitted
   * in the window corresponding to its contents.
   *
   * Batches are emitted even if the maximum size is not reached when bundle finishes or when there
   * are too many live windows.
   * @param batchByteSize
   *   desired batch size in bytes, estimated using the [[Coder]]
   * @param maxLiveWindows
   *   maximum number of window buffering
   * @group collection
   */
  def batchByteSized(
    batchByteSize: Long,
    maxLiveWindows: Int = BatchDoFn.DEFAULT_MAX_LIVE_WINDOWS
  ): SCollection[Iterable[T]] =
    // Weighted batching where the byte budget is the weight limit and the cost function
    // comes from `ScioUtil.elementByteSize`.
    batchWeighted(
      batchWeight = batchByteSize,
      cost = ScioUtil.elementByteSize(context),
      maxLiveWindows = maxLiveWindows
    )

  /**
   * Batches elements for amortized processing. Elements are batched per-window and batches emitted
   * in the window corresponding to its contents.
   *
   * Batches are emitted even if the maximum size is not reached when bundle finishes or when there
   * are too many live windows.
   * @param batchWeight
   *   desired batch weight
   * @param cost
   *   function that associates a weight with an element
   * @param maxLiveWindows
   *   maximum number of window buffering
   * @group collection
   */
  def batchWeighted(
    batchWeight: Long,
    cost: T => Long,
    maxLiveWindows: Int = BatchDoFn.DEFAULT_MAX_LIVE_WINDOWS
  ): SCollection[Iterable[T]] = {
    val weigher = Functions.serializableFn(cost.andThen(
      .parDo(new BatchDoFn[T](batchWeight, weigher, maxLiveWindows))(Coder.aggregate)

   * Filter the elements for which the given `PartialFunction` is defined, and then map.
   * @group transform
  def collect[U: Coder](pfn: PartialFunction[T, U]): SCollection[U] =
    this.transform {

   * Generic function to combine the elements using a custom set of aggregation functions. Turns an
   * `SCollection[T]` into a result of type `SCollection[C]`, for a "combined type" `C`. Note that
   * `T` and `C` can be different -- for example, one might combine an SCollection of type `Int`
   * into an SCollection of type `Seq[Int]`. Users provide three functions:
   *   - `createCombiner`, which turns a `T` into a `C` (e.g., creates a one-element list)
   *   - `mergeValue`, to merge a `T` into a `C` (e.g., adds it to the end of a list)
   *   - `mergeCombiners`, to combine two `C`'s into a single one.
   * Both `mergeValue` and `mergeCombiners` are allowed to modify and return their first argument
   * instead of creating a new `U` to avoid memory allocation.
   * @group transform
  def combine[C: Coder](createCombiner: T => C)(
    mergeValue: (C, T) => C
  )(mergeCombiners: (C, C) => C): SCollection[C] = {
      "combine/sum does not support default value and may fail in some streaming scenarios. " +
        "Consider aggregate/fold instead."
        .globally(Functions.combineFn(context, createCombiner, mergeValue, mergeCombiners))

   * Count the number of elements in the SCollection.
   * @return
   *   a new SCollection with the count
   * @group transform
  def count: SCollection[Long] =

   * Count approximate number of distinct elements in the SCollection.
   * @param sampleSize
   *   the number of entries in the statistical sample; the higher this number, the more accurate
   *   the estimate will be; should be `>= 16`
   * @group transform
  def countApproxDistinct(sampleSize: Int): SCollection[Long] =

   * Count approximate number of distinct elements in the SCollection.
   * @param maximumEstimationError
   *   the maximum estimation error, which should be in the range `[0.01, 0.5]`
   * @group transform
  def countApproxDistinct(maximumEstimationError: Double = 0.02): SCollection[Long] =

  /**
   * Returns a single valued SCollection with estimated distinct count. Correctness depends on
   * the [[ApproxDistinctCounter]] estimator.
   * @example
   *   {{{
   *   val input: SCollection[T] = ...
   *   val distinctCount: SCollection[Long] = input.countApproxDistinct(ApproximateUniqueCounter(sampleSize))
   *   }}}
   * There are two different HLL++ implementations available in the `scio-extra` module.
   *   - [[com.spotify.scio.extra.hll.sketching.SketchHllPlusPlus]]
   *   - [[com.spotify.scio.extra.hll.zetasketch.ZetaSketchHllPlusPlus]]
   * @param estimator
   *   the [[ApproxDistinctCounter]] used to estimate the distinct count
   * @return
   *   a new SCollection with the estimated distinct count
   */
  def countApproxDistinct(estimator: ApproxDistinctCounter[T]): SCollection[Long] =

   * Count of each unique value in this SCollection as an SCollection of (value, count) pairs.
   * @group transform
  def countByValue: SCollection[(T, Long)] =
    this.transform {

  /**
   * Return a new SCollection containing the distinct elements in this SCollection.
   * @group transform
   */
  def distinct: SCollection[T] = {
    // Delegates to Beam's `Distinct` transform.
    val dedup = Distinct.create[T]()
    this.pApply(dedup)
  }

   * Returns a new SCollection with distinct elements using given function to obtain a
   * representative value for each input element.
   * @param f
   *   The function to use to get representative values.
   * @tparam U
   *   The type of representative values used to dedup.
   * @group transform
  // This is simpler than Distinct.withRepresentativeValueFn, and allows us to set Coders
  def distinctBy[U: Coder](f: T => U): SCollection[T] =
    this.transform { me =>
        // we use aggregate by key to avoid errors in streaming mode
        // when a pane would fire without any element for the key
        .aggregateByKey[Option[T]](None)(_ orElse Some(_), _ orElse _)

   * Return a new SCollection containing only the elements that satisfy a predicate.
   * @group transform
  def filter(f: T => Boolean): SCollection[T] =
    this.pApply([T => JBoolean])))

  /**
   * Return a new SCollection containing only the elements that don't satisfy a predicate.
   * @group transform
   */
  def filterNot(f: T => Boolean): SCollection[T] =
    // Same as `filter` with the predicate negated.
    filter(elem => !f(elem))

   * Return a new SCollection by first applying a function to all elements of this SCollection, and
   * then flattening the results.
   * @group transform
  def flatMap[U: Coder](f: T => TraversableOnce[U]): SCollection[U] =

   * Return a new `SCollection[U]` by flattening each element of an `SCollection[Traversable[U]]`.
   * @group transform
  // Cannot use `U: Coder` context bound here because `U` depends on `ev`.
  def flatten[U](implicit ev: T => TraversableOnce[U], coder: Coder[U]): SCollection[U] =

  /**
   * Aggregate the elements using a given associative function and a neutral "zero value". The
   * function op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object
   * allocation; however, it should not modify t2.
   * @group transform
   */
  def fold(zeroValue: => T)(op: (T, T) => T): SCollection[T] = {
    // `op` is used both to merge an element into the accumulator and to merge accumulators.
    val combineFn = Functions.aggregateFn(context, zeroValue)(op, op)
    this.pApply(Combine.globally(combineFn))
  }

  /**
   * Fold with [[com.twitter.algebird.Monoid Monoid]], which defines the associative function and
   * "zero value" for `T`. This could be more powerful and better optimized in some cases.
   * @group transform
   */
  def fold(implicit mon: Monoid[T]): SCollection[T] = {
    // Global combine driven by the monoid.
    val combineFn = Functions.reduceFn(context, mon)
    this.pApply(Combine.globally(combineFn))
  }

   * Return an SCollection of grouped items. Each group consists of a key and a sequence of elements
   * mapping to that key. The ordering of elements within each group is not guaranteed, and may even
   * differ each time the resulting SCollection is evaluated.
   * Note: This operation may be very expensive. If you are grouping in order to perform an
   * aggregation (such as a sum or average) over each key, using
   * [[PairSCollectionFunctions.aggregateByKey[U]* PairSCollectionFunctions.aggregateByKey]] or
   * [[PairSCollectionFunctions.reduceByKey]] will provide much better performance.
   * @group transform
  def groupBy[K: Coder](f: T => K): SCollection[(K, Iterable[T])] = {
    if (!context.isTest && CallSites.wasCalledExternally) {
        "groupBy will materialize all values for a key to a single worker," +
          " which is a very common cause of memory issues." +
          " Consider using aggregateByKey/reduceByKey on a keyed SCollection instead."

  /**
   * Return an SCollection of grouped items. Each group consists of a key and a sequence of elements
   * transformed into a value of type `U`. The ordering of elements within each group is not
   * guaranteed, and may even differ each time the resulting SCollection is evaluated.
   *
   * It is equivalent to `groupBy(f).mapValues(_.map(g))`, but more efficient.
   * @group transform
   */
  def groupMap[K: Coder, U: Coder](f: T => K)(
    g: T => U
  ): SCollection[(K, Iterable[U])] =
    this.transform {
      val cf = ClosureCleaner.clean(f)
      val cg = ClosureCleaner.clean(g) => KV.of(cf(t), cg(t)))
        .pApply(GroupByKey.create[K, U]())

   * Return an SCollection of grouped items. Each group consists of a key and the result of an
   * associative reduce function. The ordering of elements within each group is not guaranteed, and
   * may even differ each time the resulting SCollection is evaluated.
   * The associative function is performed locally on each mapper before sending results to a
   * reducer, similarly to a "combiner" in MapReduce
   * @group transform
  def groupMapReduce[K: Coder](f: T => K)(
    g: (T, T) => T
  ): SCollection[(K, T)] =
    this.transform {
      val cf = ClosureCleaner.clean(f) => KV.of(cf(t), t))
        .pApply(Combine.perKey(Functions.reduceFn(context, g)))

   * Return a new SCollection containing only the elements that also exist in the `SideInput`.
   * @group transform
  def hashFilter(sideInput: SideInput[Set[T]]): SCollection[T] =, ())).hashIntersectByKey(sideInput).keys

   * Create tuples of the elements in this SCollection by applying `f`.
   * @group transform
  // Scala lambda is simpler than transforms.WithKeys
  def keyBy[K: Coder](f: T => K): SCollection[(K, T)] = => (f(v), v))

  /**
   * Return a new SCollection by applying a function to all elements of this SCollection.
   * @group transform
   */
  def map[U: Coder](f: T => U): SCollection[U] = {
    // Wrap `f` in a `DoFn` and apply it through `parDo`.
    val mapper = Functions.mapFn[T, U](f)
    this.parDo(mapper)
  }

   * Return the min of this SCollection as defined by the implicit `Ordering[T]`.
   * @return
   *   a new SCollection with the minimum element
   * @group transform
  // Scala lambda is simpler and more powerful than transforms.Min
  def min(implicit ord: Ordering[T]): SCollection[T] =

   * Return the max of this SCollection as defined by the implicit `Ordering[T]`.
   * @return
   *   a new SCollection with the maximum element
   * @group transform
  // Scala lambda is simpler and more powerful than transforms.Max
  def max(implicit ord: Ordering[T]): SCollection[T] =

   * Return the latest of this SCollection according to its event time.
   * @return
   *   a new SCollection with the latest element
   * @group transform
  def latest: SCollection[T] =
    // widen to ReadableInstant for scala 2.12 implicit ordering
    this.withTimestamp.max( ReadableInstant)).keys

   * Reduce with [[com.twitter.algebird.Semigroup Semigroup]]. This could be more powerful and
   * better optimized than [[reduce]] in some cases.
   * @group transform
  def sum(implicit sg: Semigroup[T]): SCollection[T] = {
      "combine/sum does not support default value and may fail in some streaming scenarios. " +
        "Consider aggregate/fold instead."
    this.pApply(Combine.globally(Functions.reduceFn(context, sg)).withoutDefaults())

   * Return the mean of this SCollection as defined by the implicit `Numeric[T]`.
   * @return
   *   a new SCollection with the mean of elements
   * @group transform
  def mean(implicit ev: Numeric[T]): SCollection[Double] = this.transform { in =>
    val e = ev // defeat closure[JDouble](e.toDouble)

  /**
   * Compute the SCollection's data distribution using approximate `N`-tiles.
   * @return
   *   a new SCollection whose single value is an `Iterable` of the approximate `N`-tiles of the
   *   elements
   * @group transform
   */
  def quantilesApprox(numQuantiles: Int)(implicit ord: Ordering[T]): SCollection[Iterable[T]] =
    this.transform {
      // Beam yields a Java collection of quantiles; expose it as a Scala `Iterable`.
      _.pApply(ApproximateQuantiles.globally(numQuantiles, ord))
        .map((_: JIterable[T]).asScala)
    }

   * Randomly splits this SCollection with the provided weights.
   * @param weights
   *   weights for splits, will be normalized if they don't sum to 1
   * @return
   *   split SCollections in an array
   * @group transform
  def randomSplit(weights: Array[Double]): Array[SCollection[T]] = {
    val sum = weights.sum
    val normalizedCumWeights = / sum).scanLeft(0.0d)(_ + _)
    val m = TreeMap(normalizedCumWeights.toIndexedSeq.zipWithIndex: _*) // Map[lower bound, split]

    val sides = (1 until weights.length).map(_ => SideOutput[T]())
    val (head, tail) = this
      .withSideOutputs(sides: _*)
      .flatMap { (x, c) =>
        val i = m.rangeTo(ThreadLocalRandom.current().nextDouble()).last._2
        if (i == 0) {
          Seq(x) // Main output
        } else {
          c.output(sides(i - 1), x) // Side output
    (head +:

  /**
   * Randomly splits this SCollection into two parts.
   * @param weight
   *   weight for left hand side SCollection, should be in the range `(0, 1)`
   * @return
   *   split SCollections in a Tuple2
   * @group transform
   */
  def randomSplit(weight: Double): (SCollection[T], SCollection[T]) = {
    // Both sides must receive a strictly positive share.
    require(weight > 0.0 && weight < 1.0)
    // The right-hand side gets whatever weight is left.
    val splits = randomSplit(Array(weight, 1d - weight))
    (splits(0), splits(1))
  }

  /**
   * Randomly splits this SCollection into three parts. Note: `0 < weightA + weightB < 1`
   * @param weightA
   *   weight for first SCollection, should be in the range `(0, 1)`
   * @param weightB
   *   weight for second SCollection, should be in the range `(0, 1)`
   * @return
   *   split SCollections in a Tuple3
   * @group transform
   */
  def randomSplit(
    weightA: Double,
    weightB: Double
  ): (SCollection[T], SCollection[T], SCollection[T]) = {
    // All three parts must receive a strictly positive share.
    require(weightA > 0.0 && weightB > 0.0 && (weightA + weightB) < 1.0)
    // The third part gets whatever weight is left.
    val splits = randomSplit(Array(weightA, weightB, 1d - (weightA + weightB)))
    (splits(0), splits(1), splits(2))
  }

  /**
   * Reduce the elements of this SCollection using the specified commutative and associative binary
   * operator.
   * @group transform
   */
  def reduce(op: (T, T) => T): SCollection[T] = {
    // Global combine configured with `withoutDefaults()`.
    val combineFn = Functions.reduceFn(context, op)
    this.pApply(Combine.globally(combineFn).withoutDefaults())
  }

  /**
   * Return a sampled subset of this SCollection containing exactly `sampleSize` items. Involves
   * combine operation resulting in shuffling. All the elements of the output should fit into main
   * memory of a single worker machine.
   * @return
   *   a new SCollection whose single value is an `Iterable` of the samples
   * @group transform
   */
  // TODO move to implicit
  def sample(sampleSize: Int): SCollection[Iterable[T]] = {
    // Delegates to the sampling helper wrapped around this collection.
    val sampler = new SampleSCollectionFunctions(this)
    sampler.sample(sampleSize)
  }

  // TODO move to implicit
  def sampleWeighted(totalWeight: Long, cost: T => Long): SCollection[Iterable[T]] = {
    // Delegates to the sampling helper wrapped around this collection.
    val sampler = new SampleSCollectionFunctions(this)
    sampler.sampleWeighted(totalWeight, cost)
  }

  // TODO move to implicit
  def sampleByteSized(totalByteSize: Long): SCollection[Iterable[T]] = {
    // Delegates to the sampling helper wrapped around this collection.
    val sampler = new SampleSCollectionFunctions(this)
    sampler.sampleByteSized(totalByteSize)
  }

  /**
   * Return a sampled subset of this SCollection. Does not trigger shuffling.
   * @param withReplacement
   *   if `true` the same element can be produced more than once, otherwise the same element will be
   *   sampled only once
   * @param fraction
   *   the sampling fraction
   * @group transform
   */
  // TODO move to implicit
  def sample(withReplacement: Boolean, fraction: Double): SCollection[T] = {
    // Delegates to the sampling helper wrapped around this collection.
    val sampler = new SampleSCollectionFunctions(this)
    sampler.sample(withReplacement, fraction)
  }

  /**
   * Return an SCollection with the elements from `this` that are not in `that`.
   * @group transform
   */
  def subtract(that: SCollection[T]): SCollection[T] =
    this.transform {, ())).subtractByKey(that).keys

  /**
   * Return a sampled subset of any `num` elements of the SCollection.
   * @group transform
   */
  def take(num: Long): SCollection[T] = {
    // Delegates to Beam's `Sample.any`.
    this.pApply(Sample.any(num))
  }

  /**
   * Return the top k (largest) elements from this SCollection as defined by the specified implicit
   * `Ordering[T]`.
   * @return
   *   a new SCollection whose single value is an `Iterable` of the top k
   * @group transform
   */
  def top(num: Int)(implicit ord: Ordering[T]): SCollection[Iterable[T]] =
    this.transform {
      // Beam yields a Java collection of the top elements; expose it as a Scala `Iterable`.
      _.pApply(Top.of[T, Ordering[T]](num, ord)).map((l: JIterable[T]) => l.asScala)
    }

  // =======================================================================
  // Hash operations
  // =======================================================================

   * Return the cross product with another SCollection by replicating `that` to all workers. The
   * right side should be tiny and fit in memory.
   * @group hash
  def cross[U](that: SCollection[U]): SCollection[(T, U)] = {
    implicit val uCoder = that.coder
    this.transform { in =>
      // The right side is exposed as a list side input, then every left element is paired
      // with each value of that list.
      val side = that.asListSideInput
      in.withSideInputs(side)
        .flatMap((t, s) => s(side).map((t, _)))
        .toSCollection
    }
  }

   * Look up values in an `SCollection[(T, V)]` for each element `T` in this SCollection by
   * replicating `that` to all workers. The right side should be tiny and fit in memory.
   * @group hash
  def hashLookup[V](
    that: SCollection[(T, V)]
  ): SCollection[(T, Iterable[V])] = {
    implicit val vCoder = that.valueCoder
    this.transform { in =>
      // The right side is exposed as a multi-map side input; keys without a match yield an
      // empty `Iterable` instead of being dropped.
      val side = that.asMultiMapSingletonSideInput
      in.withSideInputs(side)
        .map((t, s) => (t, s(side).getOrElse(t, Iterable())))
        .toSCollection
    }
  }

   * Print content of an SCollection to `out()`.
   * @param out
   *   where to write the debug information. Default: stdout
   * @param prefix
   *   prefix for each logged entry. Default: empty string
   * @param enabled
   *   if debugging is enabled or not. Default: true. It can be useful to set this to sc.isTest to
   *   avoid debugging when running in production.
   * @group debug
  def debug(
    out: () => PrintStream = () => Console.out,
    prefix: String = "",
    enabled: Boolean = true
  ): SCollection[T] =
    if (enabled) {
      // `out` is a thunk, so the stream is looked up for every element at print time.
      tap(elem => out().println(prefix + elem))
    } else {
      // Debugging disabled: return the collection untouched, adding no step to the pipeline.
      this
    }

   * Applies f to each element of this [[SCollection]], and returns the original value.
   * @group debug
  def tap(f: T => Any): SCollection[T] = {
    // Run `f` for its side effect only and pass the element through unchanged.
    val passThrough = Functions.mapFn[T, T] { elem =>
      f(elem)
      elem
    }
    // Elements are unchanged, so the existing coder of the underlying collection is reused.
    pApply(ParDo.of(passThrough)).setCoder(internal.getCoder)
  }

  // =======================================================================
  // Side input operations
  // =======================================================================

   * Convert this SCollection of a single value per window to a [[SideInput]], to be used with
   * [[withSideInputs]].
   * @group side
  def asSingletonSideInput: SideInput[T] = {
    // Build the Beam singleton view first, then wrap it in the matching SideInput.
    val view = this.applyInternal(View.asSingleton[T]())
    new SingletonSideInput[T](view)
  }

   * Convert this SCollection of a single value per window to a [[SideInput]] with a default value,
   * to be used with [[withSideInputs]].
   * @group side
  def asSingletonSideInput(defaultValue: T): SideInput[T] = {
    // Same as the no-arg variant, but the Beam view is configured with `defaultValue`.
    val singleton = View.asSingleton[T]().withDefaultValue(defaultValue)
    val view = this.applyInternal(singleton)
    new SingletonSideInput[T](view)
  }

   * Convert this SCollection to a [[SideInput]], mapping each window to a `Seq`, to be used with
   * [[withSideInputs]].
   * The resulting `Seq` is required to fit in memory.
   * @group side
  // j.u.List#asScala returns s.c.mutable.Buffer which has an O(n) .toList method
  // returning Seq[T] here to avoid copying
  def asListSideInput: SideInput[Seq[T]] = {
    // Build the Beam list view first, then wrap it in the matching SideInput.
    val view = this.applyInternal(View.asList[T]())
    new ListSideInput[T](view)
  }

   * Convert this SCollection to a [[SideInput]], mapping each window to an `Iterable`, to be used
   * with [[withSideInputs]].
   * The values of the `Iterable` for a window are not required to fit in memory, but they may also
   * not be effectively cached. If it is known that every window fits in memory, and stronger
   * caching is desired, use [[asListSideInput]].
   * @group side
  def asIterableSideInput: SideInput[Iterable[T]] = {
    // Build the Beam iterable view first, then wrap it in the matching SideInput.
    val view = this.applyInternal(View.asIterable[T]())
    new IterableSideInput[T](view)
  }

   * Convert this SCollection to a [[SideInput]], mapping each window to a `Set[T]`, to be used with
   * [[withSideInputs]].
   * The resulting [[SideInput]] is a one element singleton which is a `Set` of all elements in the
   * SCollection for the given window. The complete Set must fit in memory of the worker.
   * @group side
  // Find the distinct elements in parallel and then convert to a Set and SingletonSideInput.
  // This is preferred over aggregating as we want to map each window to a Set.
  def asSetSingletonSideInput: SideInput[Set[T]] =
    // NOTE(review): the expression below starts with `.groupBy` and has no receiver, and nothing
    // here produces a `SideInput` — most of this body appears to be missing. Restore it from
    // version control before relying on this method.
    // The visible step groups every element under the single unit key.
          .groupBy(_ => ())

   * Convert this SCollection to an [[SCollectionWithSideInput]] with one or more [[SideInput]] s,
   * similar to Spark broadcast variables. Call [[SCollectionWithSideInput.toSCollection]] when done
   * with side inputs.
   * {{{
   * val s1: SCollection[Int] = // ...
   * val s2: SCollection[String] = // ...
   * val s3: SCollection[(String, Double)] = // ...
   * val s4: SCollection[(String, Double)] = // ...
   * // Prepare side inputs
   * val side1 = s1.asSingletonSideInput
   * val side2 = s2.asIterableSideInput
   * val side3 = s3.asMapSideInput
   * val side4 = s4.asMultiMapSideInput
   * val p: SCollection[MyRecord] = // ...
   * p.withSideInputs(side1, side2, side3, side4).map { (x, s) =>
   *   // Extract side inputs from context
   *   val s1: Int = s(side1)
   *   val s2: Iterable[String] = s(side2)
   *   val s3: Map[String, Double] = s(side3)
   *   val s4: Map[String, Iterable[Double]] = s(side4)
   *   // ...
   * }
   * }}}
   * @group side
  def withSideInputs(sides: SideInput[_]*): SCollectionWithSideInput[T] = {
    // The varargs sequence is handed to the wrapper unchanged.
    val self: SCollection[T] = this
    new SCollectionWithSideInput[T](self, sides)
  }

  // =======================================================================
  // Side output operations
  // =======================================================================

   * Convert this SCollection to an [[SCollectionWithSideOutput]] with one or more [[SideOutput]] s,
   * so that a single transform can write to multiple destinations.
   * {{{
   * // Prepare side outputs
   * val side1 = SideOutput[String]()
   * val side2 = SideOutput[Int]()
   * val p: SCollection[MyRecord] = // ...
   * p.withSideOutputs(side1, side2).map { (x, s) =>
   *   // Write to side outputs via context
   *   s.output(side1, "word").output(side2, 1)
   *   // ...
   * }
   * }}}
   * @group side
  def withSideOutputs(sides: SideOutput[_]*): SCollectionWithSideOutput[T] = {
    // The varargs sequence is handed to the wrapper unchanged.
    val self: SCollection[T] = this
    new SCollectionWithSideOutput[T](self, sides)
  }

  // =======================================================================
  // Windowing operations
  // =======================================================================

  /**
   * Convert this SCollection to a [[WindowedSCollection]].
   * @group window
   */
  def toWindowed: WindowedSCollection[T] = {
    // Pure wrapping: no transform is applied here.
    val self: SCollection[T] = this
    new WindowedSCollection[T](self)
  }

   * Window values with the given function.
   * @group window
  def withWindowFn[W <: BoundedWindow](
    fn: WindowFn[_ <: Any, W],
    options: WindowOptions = WindowOptions()
  ): SCollection[T] = {
    // Every option is applied only when it is set; a `null` option leaves the transform as is.
    var transform = Window.into(fn).asInstanceOf[Window[T]]
    if (options.trigger != null) {
      transform = transform.triggering(options.trigger)
    }
    if (options.accumulationMode != null) {
      if (options.accumulationMode == AccumulationMode.ACCUMULATING_FIRED_PANES) {
        transform = transform.accumulatingFiredPanes()
      } else if (options.accumulationMode == AccumulationMode.DISCARDING_FIRED_PANES) {
        transform = transform.discardingFiredPanes()
      } else {
        throw new RuntimeException(s"Unsupported accumulation mode ${options.accumulationMode}")
      }
    }
    if (options.allowedLateness != null) {
      // The closing behavior is only consulted when an allowed lateness is given.
      transform = if (options.closingBehavior == null) {
        transform.withAllowedLateness(options.allowedLateness)
      } else {
        transform.withAllowedLateness(options.allowedLateness, options.closingBehavior)
      }
    }
    if (options.timestampCombiner != null) {
      transform = transform.withTimestampCombiner(options.timestampCombiner)
    }
    if (options.onTimeBehavior != null) {
      transform = transform.withOnTimeBehavior(options.onTimeBehavior)
    }

    this.pApply(transform)
  }


   * Window values into fixed windows.
   * @group window
  def withFixedWindows(
    duration: Duration,
    offset: Duration = Duration.ZERO,
    options: WindowOptions = WindowOptions()
  ): SCollection[T] = {
    // Build the fixed-size window function, then defer to the generic windowing entry point.
    val windowFn = FixedWindows.of(duration).withOffset(offset)
    this.withWindowFn(windowFn, options)
  }

   * Window values into sliding windows.
   * @group window
  def withSlidingWindows(
    size: Duration,
    period: Duration = null,
    offset: Duration = Duration.ZERO,
    options: WindowOptions = WindowOptions()
  ): SCollection[T] = {
    // A `null` period keeps whatever period `SlidingWindows.of(size)` starts out with.
    val sized = SlidingWindows.of(size)
    val spaced = if (period != null) sized.every(period) else sized
    this.withWindowFn(spaced.withOffset(offset), options)
  }

   * Window values based on sessions.
   * @group window
  def withSessionWindows(
    gapDuration: Duration,
    options: WindowOptions = WindowOptions()
  ): SCollection[T] = {
    // Build the session window function, then defer to the generic windowing entry point.
    val sessions = Sessions.withGapDuration(gapDuration)
    this.withWindowFn(sessions, options)
  }

  /**
   * Group values into a single global window.
   * @group window
   */
  def withGlobalWindow(options: WindowOptions = WindowOptions()): SCollection[T] = {
    // Build the global window function, then defer to the generic windowing entry point.
    val global = new GlobalWindows()
    this.withWindowFn(global, options)
  }

  /**
   * Window values by years.
   * @group window
   */
  def windowByYears(number: Int, options: WindowOptions = WindowOptions()): SCollection[T] = {
    // Calendar-based window function spanning `number` years.
    val yearly = CalendarWindows.years(number)
    this.withWindowFn(yearly, options)
  }

  /**
   * Window values by months.
   * @group window
   */
  def windowByMonths(number: Int, options: WindowOptions = WindowOptions()): SCollection[T] = {
    // Calendar-based window function spanning `number` months.
    val monthly = CalendarWindows.months(number)
    this.withWindowFn(monthly, options)
  }

  /**
   * Window values by weeks.
   * @group window
   */
  def windowByWeeks(
    number: Int,
    startDayOfWeek: Int,
    options: WindowOptions = WindowOptions()
  ): SCollection[T] = {
    // Calendar-based window function spanning `number` weeks, starting on `startDayOfWeek`.
    val weekly = CalendarWindows.weeks(number, startDayOfWeek)
    this.withWindowFn(weekly, options)
  }

  /**
   * Window values by days.
   * @group window
   */
  def windowByDays(number: Int, options: WindowOptions = WindowOptions()): SCollection[T] = {
    // Calendar-based window function spanning `number` days.
    val daily = CalendarWindows.days(number)
    this.withWindowFn(daily, options)
  }

  /**
   * Convert values into pairs of (value, pane info).
   * @group window
   */
  def withPaneInfo: SCollection[(T, PaneInfo)] =
    this.parDo(new DoFn[T, (T, PaneInfo)] {
      // Beam only invokes methods carrying `@ProcessElement`; the element, receiver and pane
      // are supplied by Beam based on the parameter types and annotations.
      @ProcessElement
      private[scio] def processElement(
        @Element element: T,
        out: OutputReceiver[(T, PaneInfo)],
        pane: PaneInfo
      ): Unit =
        out.output((element, pane))
    })

   * Convert values into pairs of (value, timestamp).
   * @group window
  def withTimestamp: SCollection[(T, Instant)] =
    this.parDo(new DoFn[T, (T, Instant)] {
      // Beam only invokes methods carrying `@ProcessElement`; the element and its timestamp
      // are supplied by Beam based on the parameter annotations.
      @ProcessElement
      private[scio] def processElement(
        @Element element: T,
        @Timestamp timestamp: Instant,
        out: OutputReceiver[(T, Instant)]
      ): Unit =
        out.output((element, timestamp))
    })

  /**
   * Convert values into pairs of (value, window).
   * @tparam W
   *   window type, must be [[org.apache.beam.sdk.transforms.windowing.BoundedWindow BoundedWindow]]
   *   or one of its sub-types, e.g.
   *   [[org.apache.beam.sdk.transforms.windowing.GlobalWindow GlobalWindow]] if this SCollection is
   *   not windowed or [[org.apache.beam.sdk.transforms.windowing.IntervalWindow IntervalWindow]] if
   *   it is windowed.
   * @group window
   */
  def withWindow[W <: BoundedWindow: Coder]: SCollection[(T, W)] =
    this
      .parDo(new DoFn[T, (T, W)] {
        // Beam only invokes methods carrying `@ProcessElement`.
        @ProcessElement
        private[scio] def processElement(
          @Element element: T,
          out: OutputReceiver[(T, W)],
          window: BoundedWindow
        ): Unit =
          // Unchecked cast: the caller is responsible for choosing a `W` that matches the
          // collection's actual window type.
          out.output((element, window.asInstanceOf[W]))
      })

  /**
   * Assign timestamps to values, with an optional allowed timestamp skew.
   * @group window
   */
  def timestampBy(f: T => Instant, allowedTimestampSkew: Duration = Duration.ZERO): SCollection[T] =
    // NOTE(review): the expression below starts with `.withAllowedTimestampSkew` and has no
    // receiver, and `f` is never used — the start of this body appears to be missing. Restore
    // it from version control.
    // The `@nowarn` silences a deprecation warning raised by the annotated expression.
        .withAllowedTimestampSkew(allowedTimestampSkew): @nowarn("cat=deprecation")

  // =======================================================================
  // Read operations
  // =======================================================================

  /** @deprecated Use readTextFiles */
  @deprecated("Use readTextFiles", "0.14.5")
  def readFiles(implicit ev: T <:< String): SCollection[String] =
    // Deprecated alias: forward to the replacement named in the deprecation notice.
    // NOTE(review): the original body was missing here — confirm this matches the intent.
    readTextFiles(ev)

   * Reads each file, represented as a pattern, in this [[SCollection]].
   * @return
   *   each line of the input files.
  def readTextFiles(implicit ev: T <:< String): SCollection[String] = {
    // Thin forwarder to the file-reading helpers in FileSCollectionFunctions.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readTextFiles()
  }

  /**
   * Reads each file, represented as a pattern, in this [[SCollection]].
   * @return
   *   each file fully read as `Array[Byte]`.
   */
  def readFilesAsBytes(implicit ev: T <:< String): SCollection[Array[Byte]] = {
    // Thin forwarder to the file-reading helpers in FileSCollectionFunctions.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readFilesAsBytes()
  }

   * Reads each file, represented as a pattern, in this [[SCollection]].
   * @return
   *   each file fully read as [[String]].
  def readFilesAsString(implicit ev: T <:< String): SCollection[String] = {
    // Thin forwarder to the file-reading helpers in FileSCollectionFunctions.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readFilesAsString()
  }

   * Reads each file, represented as a pattern, in this [[SCollection]].
   * @see
   *   [[readFilesAsBytes]], [[readFilesAsString]]
  def readFiles[A: Coder](
    f: beam.FileIO.ReadableFile => A
  )(implicit ev: T <:< String): SCollection[A] = {
    // Thin forwarder: `f` is applied by FileSCollectionFunctions to each readable file.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readFiles(f)
  }

   * Reads each file, represented as a pattern, in this [[SCollection]].
   * @see
   *   [[readFilesAsBytes]], [[readFilesAsString]]
   * @param directoryTreatment
   *   Controls how to handle directories in the input.
   * @param compression
   *   Reads files using the given [[org.apache.beam.sdk.io.Compression]].
  def readFiles[A: Coder](directoryTreatment: DirectoryTreatment, compression: Compression)(
    f: beam.FileIO.ReadableFile => A
  )(implicit ev: T <:< String): SCollection[A] = {
    // Thin forwarder: all arguments are handed to FileSCollectionFunctions unchanged.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readFiles(directoryTreatment, compression)(f)
  }

   * Reads each file, represented as a pattern, in this [[SCollection]]. Files are split into
   * multiple offset ranges and read with the [[FileBasedSource]].
   * @param desiredBundleSizeBytes
   *   Desired size of bundles read by the sources.
   * @param directoryTreatment
   *   Controls how to handle directories in the input.
   * @param compression
   *   Reads files using the given [[org.apache.beam.sdk.io.Compression]].
  def readFiles[A: Coder](
    desiredBundleSizeBytes: Long,
    directoryTreatment: DirectoryTreatment,
    compression: Compression
  )(f: String => FileBasedSource[A])(implicit ev: T <:< String): SCollection[A] = {
    // Thin forwarder: `f` builds the FileBasedSource for a given file name.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readFiles(desiredBundleSizeBytes, directoryTreatment, compression)(f)
  }

   * Reads each file, represented as a pattern, in this [[SCollection]].
   * @see
   *   [[readFilesAsBytes]], [[readFilesAsString]], [[readFiles]]
   * @param directoryTreatment
   *   Controls how to handle directories in the input.
   * @param compression
   *   Reads files using the given [[org.apache.beam.sdk.io.Compression]].
  def readFiles[A: Coder](
    filesTransform: PTransform[_ >: PCollection[beam.FileIO.ReadableFile], PCollection[A]],
    directoryTreatment: DirectoryTreatment = DirectoryTreatment.SKIP,
    compression: Compression = Compression.AUTO
  )(implicit ev: T <:< String): SCollection[A] = {
    // Thin forwarder: the caller-supplied transform turns readable files into elements.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readFiles(filesTransform, directoryTreatment, compression)
  }

   * Reads each file, represented as a pattern, in this [[SCollection]]. Files are split into
   * multiple offset ranges and read with the [[FileBasedSource]].
   * @return
   *   origin file name paired with read line.
   * @param desiredBundleSizeBytes
   *   Desired size of bundles read by the sources.
   * @param directoryTreatment
   *   Controls how to handle directories in the input.
   * @param compression
   *   Reads files using the given [[org.apache.beam.sdk.io.Compression]].
  def readTextFilesWithPath(
    desiredBundleSizeBytes: Long = FileSCollectionFunctions.DefaultBundleSizeBytes,
    directoryTreatment: DirectoryTreatment = DirectoryTreatment.SKIP,
    compression: Compression = Compression.AUTO
  )(implicit ev: T <:< String): SCollection[(String, String)] = {
    // Thin forwarder: all arguments are handed to FileSCollectionFunctions unchanged.
    val files = new FileSCollectionFunctions(this.covary_)
    files.readTextFilesWithPath(desiredBundleSizeBytes, directoryTreatment, compression)
  }

   * Reads each file, represented as a pattern, in this [[SCollection]]. Files are split into
   * multiple offset ranges and read with the [[FileBasedSource]].
   * @return
   *   origin file name paired with read element.
   * @param desiredBundleSizeBytes
   *   Desired size of bundles read by the sources.
   * @param directoryTreatment
   *   Controls how to handle directories in the input.
   * @param compression
   *   Reads files using the given [[org.apache.beam.sdk.io.Compression]].
  def readFilesWithPath[A: Coder](
    desiredBundleSizeBytes: Long = FileSCollectionFunctions.DefaultBundleSizeBytes,
    directoryTreatment: DirectoryTreatment = DirectoryTreatment.SKIP,
    compression: Compression = Compression.AUTO
  )(
    // Kept in its own parameter list so the read options above can all be defaulted.
    f: String => FileBasedSource[A]
  )(implicit ev: T <:< String): SCollection[(String, A)] =
    new FileSCollectionFunctions(this.covary_)
      .readFilesWithPath(desiredBundleSizeBytes, directoryTreatment, compression)(f)

   * Pairs each element with the value of the provided [[SideInput]] in the element's window.
   * Reify as List:
   * {{{
   *   val other: SCollection[Int] = sc.parallelize(Seq(1))
   *   val coll: SCollection[(Int, Seq[Int])] =
   *     sc.parallelize(Seq(1, 2))
   *       .reifySideInputAsValues(other.asListSideInput)
   * }}}
   * Reify as Iterable:
   * {{{
   *   val other: SCollection[Int] = sc.parallelize(Seq(1))
   *   val coll: SCollection[(Int, Iterable[Int])] =
   *     sc.parallelize(Seq(1, 2))
   *       .reifySideInputAsValues(other.asIterableSideInput)
   * }}}
   * Reify as Map:
   * {{{
   *   val other: SCollection[(Int, Int)] = sc.parallelize(Seq((1, 1)))
   *   val coll: SCollection[(Int, Map[Int, Int])] =
   *     sc.parallelize(Seq(1, 2))
   *       .reifySideInputAsValues(other.asMapSideInput)
   * }}}
   * Reify as Multimap:
   * {{{
   *   val other: SCollection[(Int, Int)]  = sc.parallelize(Seq((1, 1)))
   *   val coll: SCollection[(Int, Map[Int, Iterable[Int]])]  =
   *     sc.parallelize(Seq(1, 2))
   *       .reifySideInputAsValues(other.asMultiMapSideInput)
   * }}}
  // `U: Coder` context bound is required since `PCollectionView` may be of different type
  def reifySideInputAsValues[U: Coder](side: SideInput[U]): SCollection[(T, U)] =
    this.transform { coll =>
      // Attach the side input, pair each element with the side value visible to it, and
      // return to a plain SCollection.
      coll
        .withSideInputs(side)
        .map((elem, ctx) => (elem, ctx(side)))
        .toSCollection
    }

  /** Returns an [[SCollection]] consisting of a single `Seq[T]` element. */
  def reifyAsListInGlobalWindow: SCollection[Seq[T]] =
    // View the whole collection as a list side input and reify that view as one element.
    reifyInGlobalWindow(_.asListSideInput)

  /** Returns an [[SCollection]] consisting of a single `Iterable[T]` element. */
  def reifyAsIterableInGlobalWindow: SCollection[Iterable[T]] =
    // View the whole collection as an iterable side input and reify that view as one element.
    reifyInGlobalWindow(_.asIterableSideInput)

   * Returns an [[SCollection]] consisting of a single element, containing the value of the given
   * side input in the global window.
   * Reify as List:
   * {{{
   *   val coll: SCollection[Seq[Int]] =
   *     sc.parallelize(Seq(1, 2)).reifyInGlobalWindow(_.asListSideInput)
   * }}}
   * Can be used to replace patterns like:
   * {{{
   *   val coll: SCollection[Iterable[Int]] = sc.parallelize(Seq(1, 2)).groupBy(_ => ())
   * }}}
   * where you want to actually get an empty [[Iterable]] even if no data is present.
  // `U: Coder` context bound is required since `PCollectionView` may be of different type
  // `view` turns this collection into the side input whose value is to be reified.
  private[scio] def reifyInGlobalWindow[U: Coder](
    view: SCollection[T] => SideInput[U]
  ): SCollection[U] =
    // NOTE(review): the lambda below has no body and its parenthesis is never closed — the rest
    // of this expression appears to be missing. Restore it from version control.
    this.transform(coll =>

  // =======================================================================
  // Write operations
  // =======================================================================

   * Extract data from this SCollection as a closed [[Tap]]. The Tap will be available once the
   * pipeline completes successfully. `.materialize()` must be called before the `ScioContext` is
   * run, as its implementation modifies the current pipeline graph.
   * {{{
   * val closedTap = sc.parallelize(1 to 10).materialize
   * }}}
   * @group output
  def materialize: ClosedTap[T] = {
    // Materialize to a temp location obtained for this context; never treated as a checkpoint.
    val tempPath = ScioUtil.getTempFile(context)
    materialize(tempPath, isCheckpoint = false)
  }

  // Writes this collection to `path` and returns a tap over it. `isCheckpoint` only controls
  // whether the test output lookup is performed when running under a test context.
  // NOTE(review): several lines of this body (closing braces, the transform that `.map` is
  // chained on, and the write step) appear to be missing — restore them from version control.
  private[scio] def materialize(path: String, isCheckpoint: Boolean): ClosedTap[T] =
    if (context.isTest) {
      // Do not run assertions on materialized value but still access test context to trigger
      // the test checking if we're running inside a JobTest
      if (!isCheckpoint) TestDataManager.getOutput(context.testId.get)
    } else {
      // Beam coder for the elements, derived from this collection's Scio coder.
      val elemCoder = CoderMaterializer.beam(context, coder)
      val arrCoder = ByteArrayCoder.of()
        .map { e =>
            // Each element is first serialized to bytes with the element coder.
            CoderUtils.encodeToByteArray(elemCoder, e),
            // encode record length
            BCoder.Context.NESTED: @nowarn("cat=deprecation")
      ClosedTap(MaterializeTap[T](path, context))

   * Save this SCollection as a text file. Note that elements must be of type `String`.
   * @group output
  def saveAsTextFile(
    path: String,
    numShards: Int = TextIO.WriteParam.DefaultNumShards,
    suffix: String = TextIO.WriteParam.DefaultSuffix,
    compression: Compression = TextIO.WriteParam.DefaultCompression,
    header: Option[String] = TextIO.WriteParam.DefaultHeader,
    footer: Option[String] = TextIO.WriteParam.DefaultFooter,
    shardNameTemplate: String = TextIO.WriteParam.DefaultShardNameTemplate,
    tempDirectory: String = TextIO.WriteParam.DefaultTempDirectory,
    // NOTE(review): the default value of `filenamePolicySupplier` is missing after the `=`.
    filenamePolicySupplier: FilenamePolicySupplier =
    prefix: String = TextIO.WriteParam.DefaultPrefix
  )(implicit ct: ClassTag[T]): ClosedTap[String] = {
    // The runtime class of `T` decides which branch produces the strings to write.
    // NOTE(review): both branches and the rest of the method body are missing from this view —
    // restore them from version control.
    val s = if (classOf[String] isAssignableFrom ct.runtimeClass) {
    } else {

   * Save this SCollection as raw bytes. Note that elements must be of type `Array[Byte]`.
   * @group output
  def saveAsBinaryFile(
    path: String,
    numShards: Int = BinaryIO.WriteParam.DefaultNumShards,
    prefix: String = BinaryIO.WriteParam.DefaultPrefix,
    suffix: String = BinaryIO.WriteParam.DefaultSuffix,
    compression: Compression = BinaryIO.WriteParam.DefaultCompression,
    header: Array[Byte] = BinaryIO.WriteParam.DefaultHeader,
    footer: Array[Byte] = BinaryIO.WriteParam.DefaultFooter,
    shardNameTemplate: String = BinaryIO.WriteParam.DefaultShardNameTemplate,
    // `framePrefix` / `frameSuffix` each map a record's bytes to extra bytes.
    // NOTE(review): presumably written before / after each record — confirm against BinaryIO.
    framePrefix: Array[Byte] => Array[Byte] = BinaryIO.WriteParam.DefaultFramePrefix,
    frameSuffix: Array[Byte] => Array[Byte] = BinaryIO.WriteParam.DefaultFrameSuffix,
    tempDirectory: String = BinaryIO.WriteParam.DefaultTempDirectory,
    // NOTE(review): the default value of `filenamePolicySupplier` is missing after the `=`.
    filenamePolicySupplier: FilenamePolicySupplier =
  )(implicit ev: T <:< Array[Byte]): ClosedTap[Nothing] =
    // NOTE(review): the method body is missing from this view — restore it from version control.

   * Creates a Zstd dictionary based on this SCollection targeting a dictionary of size
   * `zstdDictSizeBytes` to be trained with approximately `trainingBytesTarget` bytes. The exact
   * training size is determined by estimating the average element size with
   * `numElementsForSizeEstimation` encoded elements and sampling this SCollection at an appropriate
   * rate.
   * @param path
   *   The path to which the trained dictionary should be written.
   * @param zstdDictSizeBytes
   *   The size of the dictionary to train in bytes. Recommended dictionary sizes are in hundreds of
   *   KB. Over 10MB is not recommended and you may hit resource limits if the dictionary size is
   *   near 20MB.
   * @param numElementsForSizeEstimation
   *   The number of elements of the SCollection to use to estimate the average element size.
   * @param trainingBytesTarget
   *   The target number of bytes on which to train. Memory usage for training can be 10x this.
   *   `None` to infer from `zstdDictSizeBytes`. Must be able to fit in the memory of a single
   *   worker.
  def saveAsZstdDictionary(
    path: String,
    zstdDictSizeBytes: Int = ZstdDictIO.WriteParam.DefaultZstdDictSizeBytes,
    numElementsForSizeEstimation: Long = ZstdDictIO.WriteParam.DefaultNumElementsForSizeEstimation,
    trainingBytesTarget: Option[Int] = ZstdDictIO.WriteParam.DefaultTrainingBytesTarget
  ): ClosedTap[Nothing] = {
    // NOTE(review): the method body and closing brace are missing from this view — restore them
    // from version control.

   * Save this SCollection with a custom output transform. The transform should have a unique name.
   * @group output
  def saveAsCustomOutput[O <: POutput](
    name: String,
    transform: PTransform[PCollection[T], O]
  ): ClosedTap[Nothing] = {
    // NOTE(review): the test branch is empty, and the closing braces and the returned
    // `ClosedTap` are missing from this view — restore them from version control.
    if (context.isTest) {
    } else {
      // Outside of tests the transform is applied to the underlying PCollection under `name`.
      this.internal.apply(name, transform)


  private[scio] def saveAsInMemoryTap: ClosedTap[T] = {
    // NOTE(review): the line below is garbled (stray `, this)`), and the rest of the body is
    // missing from this view — restore it from version control.
    val tap = new InMemoryTap[T], this)

  /**
   * Generic write method for all `ScioIO[T]` implementations. In a test pipeline this evaluates
   * the pre-registered output IO implementation that matches the given `ScioIO[T]`; otherwise it
   * invokes the [[ScioIO[T]#write]] method with the supplied write configuration.
   * @param io
   *   an implementation of the `ScioIO[T]` trait
   * @param params
   *   configuration to pass to the underlying write implementation
   */
  def write(io: ScioIO[T])(params: io.WriteP): ClosedTap[io.tapT.T] = {
    // The IO decides how to write; this collection and the parameters are passed unchanged.
    val closedTap = io.writeWithContext(this, params)
    closedTap
  }

  def write(io: ScioIO[T] { type WriteP = Unit }): ClosedTap[io.tapT.T] = {
    // Convenience overload for IOs without write parameters: the unit value stands in for them.
    val closedTap = io.writeWithContext(this, ())
    closedTap
  }

// Default concrete SCollection: it only supplies the underlying PCollection and the context.
private[scio] class SCollectionImpl[T](
  val internal: PCollection[T],
  val context: ScioContext
) extends SCollection[T]

