
/*
* Copyright 2019 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.scio.values
import java.io.PrintStream
import java.lang.{Boolean => JBoolean, Double => JDouble, Iterable => JIterable}
import java.util.concurrent.ThreadLocalRandom
import com.spotify.scio.ScioContext
import com.spotify.scio.coders.{Coder, CoderMaterializer}
import com.spotify.scio.estimators.{
ApproxDistinctCounter,
ApproximateUniqueCounter,
ApproximateUniqueCounterByError
}
import com.spotify.scio.io._
import com.spotify.scio.schemas.{Schema, SchemaMaterializer}
import com.spotify.scio.testing.TestDataManager
import com.spotify.scio.transforms.BatchDoFn
import com.spotify.scio.util.FilenamePolicySupplier
import com.spotify.scio.util._
import com.twitter.algebird.{Aggregator, Monoid, MonoidAggregator, Semigroup}
import org.apache.beam.sdk.coders.{ByteArrayCoder, Coder => BCoder}
import org.apache.beam.sdk.schemas.SchemaCoder
import org.apache.beam.sdk.io.{Compression, FileBasedSource}
import org.apache.beam.sdk.io.FileIO.ReadMatches.DirectoryTreatment
import org.apache.beam.sdk.transforms.DoFn.{Element, OutputReceiver, ProcessElement, Timestamp}
import org.apache.beam.sdk.transforms._
import org.apache.beam.sdk.transforms.windowing._
import org.apache.beam.sdk.util.{CoderUtils, SerializableUtils}
import org.apache.beam.sdk.values.WindowingStrategy.AccumulationMode
import org.apache.beam.sdk.values._
import org.apache.beam.sdk.{io => beam}
import org.joda.time.{Duration, Instant, ReadableInstant}
import org.slf4j.LoggerFactory
import scala.jdk.CollectionConverters._
import scala.collection.compat._
import scala.collection.immutable.TreeMap
import scala.reflect.ClassTag
import scala.util.Try
import com.twitter.chill.ClosureCleaner
import org.typelevel.scalaccompat.annotation.{nowarn, unused}
/** Convenience functions for creating SCollections. */
object SCollection {
private[values] val logger = LoggerFactory.getLogger(this.getClass)
/**
* Create a union of multiple [[SCollection]] instances. Will throw an exception if the provided
* iterable is empty. For a version that accepts empty iterables, see [[ScioContext#unionAll]].
*/
// `T: Coder` context bound is required since `scs` might be empty.
def unionAll[T: Coder](scs: Iterable[SCollection[T]]): SCollection[T] =
scs.head.context.unionAll(scs)
/** Implicit conversion from SCollection to DoubleSCollectionFunctions. */
implicit def makeDoubleSCollectionFunctions(s: SCollection[Double]): DoubleSCollectionFunctions =
new DoubleSCollectionFunctions(s)
/** Implicit conversion from SCollection to DoubleSCollectionFunctions. */
implicit def makeDoubleSCollectionFunctions[T](
s: SCollection[T]
)(implicit num: Numeric[T]): DoubleSCollectionFunctions =
new DoubleSCollectionFunctions(s.map(num.toDouble))
/** Implicit conversion from SCollection to PairSCollectionFunctions. */
implicit def makePairSCollectionFunctions[K, V](
s: SCollection[(K, V)]
): PairSCollectionFunctions[K, V] =
new PairSCollectionFunctions(s)
implicit def makePairHashSCollectionFunctions[K, V](
s: SCollection[(K, V)]
): PairHashSCollectionFunctions[K, V] =
new PairHashSCollectionFunctions(s)
implicit def makePairSkewedSCollectionFunctions[K, V](
s: SCollection[(K, V)]
): PairSkewedSCollectionFunctions[K, V] =
new PairSkewedSCollectionFunctions(s)
final private[scio] case class State(postGbkOp: Boolean = false)
}
/**
* A Scala wrapper for [[org.apache.beam.sdk.values.PCollection PCollection]]. Represents an
* immutable, partitioned collection of elements that can be operated on in parallel. This class
* contains the basic operations available on all SCollections, such as `map`, `filter`, and `sum`.
* In addition, [[PairSCollectionFunctions]] contains operations available only on SCollections of
* key-value pairs, such as `groupByKey` and `join`; [[DoubleSCollectionFunctions]] contains
* operations available only on SCollections of `Double`s.
*
* @groupname collection
* Collection Operations
* @groupname hash
* Hash Operations
* @groupname output
* Output Sinks
* @groupname side
* Side Input and Output Operations
* @groupname transform
* Transformations
* @groupname window
* Windowing Operations
*/
sealed trait SCollection[T] extends PCollectionWrapper[T] {
self =>
import TupleFunctions._
// =======================================================================
// States
// =======================================================================
private var _state: SCollection.State = SCollection.State()
private[scio] def withState(f: SCollection.State => SCollection.State): SCollection[T] = {
_state = f(_state)
this
}
private[scio] def state: SCollection.State = _state
// =======================================================================
// Delegations for internal PCollection
// =======================================================================
/** A friendly name for this SCollection. */
def name: String = internal.getName
/** Assign a Coder to this SCollection. */
def setCoder(coder: org.apache.beam.sdk.coders.Coder[T]): SCollection[T] =
context.wrap(internal.setCoder(coder))
def setSchema(schema: Schema[T])(implicit ct: ClassTag[T]): SCollection[T] =
if (!internal.hasSchema) {
val (s, to, from) = SchemaMaterializer.materialize(schema)
val td = TypeDescriptor.of(ScioUtil.classOf[T])
try {
context.wrap(internal.setSchema(s, td, to, from))
} catch {
case _: IllegalStateException =>
// Coder has already been set
map(identity)(Coder.beam(SchemaCoder.of(s, td, to, from)))
}
} else this
private def ensureSerializable[A](coder: BCoder[A]): Either[Throwable, BCoder[A]] =
coder match {
case c if !context.isTest =>
Right(c)
// https://issues.apache.org/jira/browse/BEAM-5645
case c if c.getClass.getPackage.getName.startsWith("org.apache.beam") =>
Right(c)
case _ =>
Try[BCoder[A]](SerializableUtils.ensureSerializable(coder)).toEither
}
/**
* Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
* [[SCollection]].
*/
def applyTransform[U: Coder](
transform: PTransform[_ >: PCollection[T], PCollection[U]]
): SCollection[U] = applyTransform(tfName, transform)
/**
* Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
* [[SCollection]].
*
* @param name
* default transform name
* @param transform
* [[org.apache.beam.sdk.transforms.PTransform PTransform]] to be applied
*/
def applyTransform[U: Coder](
name: String,
transform: PTransform[_ >: PCollection[T], PCollection[U]]
): SCollection[U] = {
val coder = CoderMaterializer.beam(context, Coder[U])
ensureSerializable(coder).fold(throw _, pApply(name, transform).setCoder)
}
private[scio] def pApply[U](
name: Option[String],
transform: PTransform[_ >: PCollection[T], PCollection[U]]
): SCollection[U] = {
val isCombineGlobally = classOf[Combine.Globally[T, U]].isAssignableFrom(transform.getClass)
val t = if (isCombineGlobally && ScioUtil.isWindowed(this)) {
// In case PCollection is windowed
transform.asInstanceOf[Combine.Globally[T, U]].withoutDefaults()
} else {
transform
}
context.wrap(this.applyInternal(name, t))
}
private[scio] def pApply[U](
transform: PTransform[_ >: PCollection[T], PCollection[U]]
): SCollection[U] =
pApply(None, transform)
private[scio] def pApply[U](
name: String,
transform: PTransform[_ >: PCollection[T], PCollection[U]]
): SCollection[U] =
pApply(Option(name), transform)
private[scio] def parDo[U: Coder](fn: DoFn[T, U]): SCollection[U] =
this
.pApply(ParDo.of(fn))
.setCoder(CoderMaterializer.beam(context, Coder[U]))
/**
* Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
* [[SCollection]]. This is a special case of [[applyTransform]] for transforms with [[KV]]
* output.
*/
def applyKvTransform[K: Coder, V: Coder](
transform: PTransform[_ >: PCollection[T], PCollection[KV[K, V]]]
): SCollection[KV[K, V]] =
applyKvTransform(tfName, transform)
/**
* Apply a [[org.apache.beam.sdk.transforms.PTransform PTransform]] and wrap the output in an
* [[SCollection]]. This is a special case of [[applyTransform]] for transforms with [[KV]]
* output.
*
* @param name
* default transform name
* @param transform
* [[org.apache.beam.sdk.transforms.PTransform PTransform]] to be applied
*/
def applyKvTransform[K: Coder, V: Coder](
name: String,
transform: PTransform[_ >: PCollection[T], PCollection[KV[K, V]]]
): SCollection[KV[K, V]] =
applyTransform(name, transform)
/** Apply a transform. */
def transform[U](f: SCollection[T] => SCollection[U]): SCollection[U] = transform(this.tfName)(f)
def transform[U](name: String)(f: SCollection[T] => SCollection[U]): SCollection[U] =
context.wrap(transform_(name)(f(_).internal))
private[scio] def transform_[U <: POutput](f: SCollection[T] => U): U =
transform_(tfName)(f)
private[scio] def transform_[U <: POutput](name: String)(f: SCollection[T] => U): U = {
applyInternal(
name,
new PTransform[PCollection[T], U]() {
override def expand(input: PCollection[T]): U = f(context.wrap(input))
}
)
}
// =======================================================================
// Collection operations
// =======================================================================
/** lifts this [[SCollection]] to the specified type */
def covary[U >: T]: SCollection[U] = this.asInstanceOf[SCollection[U]]
/** lifts this [[SCollection]] to the specified type */
def covary_[U](implicit @unused ev: T <:< U): SCollection[U] = this.asInstanceOf[SCollection[U]]
/** lifts this [[SCollection]] to the specified type */
def contravary[U <: T]: SCollection[U] = this.asInstanceOf[SCollection[U]]
/**
* Convert this SCollection to an [[SCollectionWithFanout]] that uses an intermediate node to
* combine parts of the data to reduce load on the final global combine step.
* @param fanout
* the number of intermediate keys that will be used
*/
def withFanout(fanout: Int): SCollectionWithFanout[T] =
new SCollectionWithFanout[T](this, fanout)
/**
* Return the union of this SCollection and another one. Any identical elements will appear
* multiple times (use [[distinct]] to eliminate them).
* @group collection
*/
def ++(that: SCollection[T]): SCollection[T] = this.union(that)
/**
* Return the union of this SCollection and another one. Any identical elements will appear
* multiple times (use [[distinct]] to eliminate them).
* @group collection
*/
def union(that: SCollection[T]): SCollection[T] = {
val o = PCollectionList
.of(internal)
.and(that.internal)
.apply(this.tfName, Flatten.pCollections())
context.wrap(o)
}
/**
* Return the intersection of this SCollection and another one. The output will not contain any
* duplicate elements, even if the input SCollections did.
*
* Note that this method performs a shuffle internally.
* @group collection
*/
def intersection(that: SCollection[T]): SCollection[T] =
this.transform {
_.map((_, 1)).cogroup(that.map((_, 1))).flatMap { t =>
if (t._2._1.nonEmpty && t._2._2.nonEmpty) Seq(t._1) else Seq.empty
}
}
/**
* Partition this SCollection with the provided function.
*
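* A minimal sketch splitting by parity (assuming an existing `sc: ScioContext`):
* {{{
* val parts: Seq[SCollection[Int]] = sc.parallelize(1 to 10).partition(2, _ % 2)
* }}}
*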
* @param numPartitions
* number of output partitions
* @param f
* function that assigns an output partition to each element, should be in the range `[0,
* numPartitions - 1]`
* @return
* partitioned SCollections in a `Seq`
* @group collection
*/
def partition(numPartitions: Int, f: T => Int): Seq[SCollection[T]] = {
require(numPartitions > 0, "Number of partitions should be positive")
if (numPartitions == 1) {
Seq(this)
} else {
this
.applyInternal(Partition.of[T](numPartitions, Functions.partitionFn[T](f)))
.getAll
.iterator
.asScala
.map(context.wrap)
.toSeq
}
}
/**
* Partition this SCollection into a pair of SCollections according to a predicate.
*
* @param p
* predicate on which to partition
* @return
* a pair of SCollections: the first SCollection consists of all elements that satisfy the
* predicate p and the second consists of all elements that do not.
* @group collection
*/
def partition(p: T => Boolean): (SCollection[T], SCollection[T]) = {
val Seq(left, right) = partition(2, t => if (p(t)) 0 else 1): @nowarn
(left, right)
}
/**
* Partition this SCollection into a map from possible key values to an SCollection of
* corresponding elements based on the provided function.
*
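* A minimal sketch (assuming an existing `sc: ScioContext`):
* {{{
* val parts: Map[String, SCollection[Int]] = sc
*   .parallelize(1 to 10)
*   .partitionByKey(Set("even", "odd"))(i => if (i % 2 == 0) "even" else "odd")
* }}}
*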
* @param partitionKeys
* The keys for the output partitions
* @param f
* function that assigns an output partition to each element, should be in the range of
* `partitionKeys`
* @return
* partitioned SCollections in a `Map`
* @group collection
*/
def partitionByKey[U](partitionKeys: Set[U])(f: T => U): Map[U, SCollection[T]] = {
val partitionKeysIndexed = partitionKeys.toIndexedSeq
partitionKeysIndexed
.zip(partition(partitionKeys.size, (t: T) => partitionKeysIndexed.indexOf(f(t))))
.toMap
}
/**
* Partition this SCollection using `T.##` into `numPartitions` partitions.
*
* @param numPartitions
* number of output partitions
* @return
* partitioned SCollections in a `Seq`
* @group collection
*/
def hashPartition(numPartitions: Int): Seq[SCollection[T]] =
self.partition(
numPartitions,
t => Math.floorMod(ScioUtil.consistentHashCode(t), numPartitions)
)
// =======================================================================
// Transformations
// =======================================================================
/**
* Aggregate the elements using given combine functions and a neutral "zero value". This function
* can return a different result type, `U`, than the type of this SCollection, `T`. Thus, we need
* one operation for merging a `T` into an `U` and one operation for merging two `U`'s. Both of
* these functions are allowed to modify and return their first argument instead of creating a new
* `U` to avoid memory allocation.
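* A minimal sketch summing string lengths (assuming an existing `sc: ScioContext`):
* {{{
* val totalLength: SCollection[Int] = sc
*   .parallelize(Seq("a", "bb", "ccc"))
*   .aggregate(0)((acc, s) => acc + s.length, _ + _)
* }}}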
* @group transform
*/
def aggregate[U: Coder](
zeroValue: => U
)(seqOp: (U, T) => U, combOp: (U, U) => U): SCollection[U] =
this.pApply(Combine.globally(Functions.aggregateFn(context, zeroValue)(seqOp, combOp)))
/**
* Aggregate with [[com.twitter.algebird.Aggregator Aggregator]]. First each item `T` is mapped to
* `A`, then we reduce with a [[com.twitter.algebird.Semigroup Semigroup]] of `A`, then finally we
* present the results as `U`. This could be more powerful and better optimized in some cases.
* @group transform
*/
def aggregate[A: Coder, U: Coder](aggregator: Aggregator[T, A, U]): SCollection[U] =
this.transform { in =>
val a = aggregator // defeat closure
in.map(a.prepare).sum(a.semigroup).map(a.present)
}
/**
* Aggregate with [[com.twitter.algebird.MonoidAggregator MonoidAggregator]]. First each item `T`
* is mapped to `A`, then we reduce with a [[com.twitter.algebird.Monoid Monoid]] of `A`, then
* finally we present the results as `U`. This could be more powerful and better optimized in some
* cases.
* @group transform
*/
def aggregate[A: Coder, U: Coder](aggregator: MonoidAggregator[T, A, U]): SCollection[U] =
this.transform { in =>
val a = aggregator // defeat closure
in.map(a.prepare).fold(a.monoid).map(a.present)
}
/**
* Batches elements for amortized processing. Elements are batched per-window and batches are
* emitted in the window corresponding to their contents.
*
* Batches are emitted even if the maximum size is not reached when the bundle finishes or when
* there are too many live windows.
*
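* A minimal sketch batching elements into groups of up to 100 (assuming an existing `sc: ScioContext`):
* {{{
* val batched: SCollection[Iterable[Int]] = sc.parallelize(1 to 1000).batch(100)
* }}}
*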
* @param batchSize
* desired number of elements in a batch
* @param maxLiveWindows
* maximum number of windows to buffer
*
* @group collection
*/
def batch(
batchSize: Long,
maxLiveWindows: Int = BatchDoFn.DEFAULT_MAX_LIVE_WINDOWS
): SCollection[Iterable[T]] = {
val weigher = Functions.serializableFn[T, java.lang.Long](_ => 1)
this
.parDo(new BatchDoFn[T](batchSize, weigher, maxLiveWindows))(Coder.aggregate)
.map(_.asScala)
}
/**
* Batches elements for amortized processing. Elements are batched per-window and batches are
* emitted in the window corresponding to their contents.
*
* Batches are emitted even if the maximum size is not reached when the bundle finishes or when
* there are too many live windows.
*
* @param batchByteSize
* desired batch size in bytes, estimated using the [[Coder]]
* @param maxLiveWindows
* maximum number of windows to buffer
*
* @group collection
*/
def batchByteSized(
batchByteSize: Long,
maxLiveWindows: Int = BatchDoFn.DEFAULT_MAX_LIVE_WINDOWS
): SCollection[Iterable[T]] =
batchWeighted(batchByteSize, ScioUtil.elementByteSize(context), maxLiveWindows)
/**
* Batches elements for amortized processing. Elements are batched per-window and batches are
* emitted in the window corresponding to their contents.
*
* Batches are emitted even if the maximum size is not reached when the bundle finishes or when
* there are too many live windows.
*
* @param batchWeight
* desired batch weight
* @param cost
* function that associates a weight with each element
* @param maxLiveWindows
* maximum number of windows to buffer
* @group collection
*/
def batchWeighted(
batchWeight: Long,
cost: T => Long,
maxLiveWindows: Int = BatchDoFn.DEFAULT_MAX_LIVE_WINDOWS
): SCollection[Iterable[T]] = {
val weigher = Functions.serializableFn(cost.andThen(Long.box))
this
.parDo(new BatchDoFn[T](batchWeight, weigher, maxLiveWindows))(Coder.aggregate)
.map(_.asScala)
}
/**
* Filter the elements for which the given `PartialFunction` is defined, and then map.
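* A minimal sketch keeping only numeric strings (assuming an existing `sc: ScioContext`):
* {{{
* val ints: SCollection[Int] = sc
*   .parallelize(Seq("1", "x", "2"))
*   .collect { case s if s.forall(_.isDigit) => s.toInt }
* }}}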
* @group transform
*/
def collect[U: Coder](pfn: PartialFunction[T, U]): SCollection[U] =
this.transform {
_.filter(pfn.isDefinedAt).map(pfn)
}
/**
* Generic function to combine the elements using a custom set of aggregation functions. Turns an
* `SCollection[T]` into a result of type `SCollection[C]`, for a "combined type" `C`. Note that
* `T` and `C` can be different -- for example, one might combine an SCollection of type `Int`
* into an SCollection of type `Seq[Int]`. Users provide three functions:
*
* - `createCombiner`, which turns a `T` into a `C` (e.g., creates a one-element list)
*
* - `mergeValue`, to merge a `T` into a `C` (e.g., adds it to the end of a list)
*
* - `mergeCombiners`, to combine two `C`'s into a single one.
*
* Both `mergeValue` and `mergeCombiners` are allowed to modify and return their first argument
* instead of creating a new `C` to avoid memory allocation.
*
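* A minimal sketch combining elements into a `Set` (assuming an existing `sc: ScioContext`):
* {{{
* val sets: SCollection[Set[Int]] = sc
*   .parallelize(Seq(1, 2, 2, 3))
*   .combine(Set(_))(_ + _)(_ ++ _)
* }}}
*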
* @group transform
*/
def combine[C: Coder](createCombiner: T => C)(
mergeValue: (C, T) => C
)(mergeCombiners: (C, C) => C): SCollection[C] = {
SCollection.logger.warn(
"combine/sum does not support default value and may fail in some streaming scenarios. " +
"Consider aggregate/fold instead."
)
this.pApply(
Combine
.globally(Functions.combineFn(context, createCombiner, mergeValue, mergeCombiners))
.withoutDefaults()
)
}
/**
* Count the number of elements in the SCollection.
* @return
* a new SCollection with the count
* @group transform
*/
def count: SCollection[Long] =
this.pApply(Count.globally[T]()).asInstanceOf[SCollection[Long]]
/**
* Count approximate number of distinct elements in the SCollection.
* @param sampleSize
* the number of entries in the statistical sample; the higher this number, the more accurate
* the estimate will be; should be `>= 16`
* @group transform
*/
def countApproxDistinct(sampleSize: Int): SCollection[Long] =
ApproximateUniqueCounter(sampleSize).estimateDistinctCount(this)
/**
* Count approximate number of distinct elements in the SCollection.
* @param maximumEstimationError
* the maximum estimation error, which should be in the range `[0.01, 0.5]`
* @group transform
*/
def countApproxDistinct(maximumEstimationError: Double = 0.02): SCollection[Long] =
ApproximateUniqueCounterByError(maximumEstimationError)
.estimateDistinctCount(this)
/**
* Returns a single-valued SCollection with the estimated distinct count. Accuracy depends on
* the [[ApproxDistinctCounter]] estimator.
*
* @example
* {{{
* val input: SCollection[T] = ...
* val distinctCount: SCollection[Long] = input.countApproxDistinct(ApproximateUniqueCounter(sampleSize))
* }}}
*
* There are two different HLL++ implementations available in the `scio-extra` module.
* - [[com.spotify.scio.extra.hll.sketching.SketchHllPlusPlus]]
* - [[com.spotify.scio.extra.hll.zetasketch.ZetaSketchHllPlusPlus]]
* @param estimator
* @return
*/
def countApproxDistinct(estimator: ApproxDistinctCounter[T]): SCollection[Long] =
estimator.estimateDistinctCount(this)
/**
* Count of each unique value in this SCollection as an SCollection of (value, count) pairs.
* @group transform
*/
def countByValue: SCollection[(T, Long)] =
this.transform {
_.pApply(Count.perElement[T]()).map(TupleFunctions.klToTuple)
}
/**
* Return a new SCollection containing the distinct elements in this SCollection.
* @group transform
*/
def distinct: SCollection[T] = this.pApply(Distinct.create[T]())
/**
* Returns a new SCollection with distinct elements using given function to obtain a
* representative value for each input element.
*
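* A minimal sketch keeping one element per key (assuming an existing `sc: ScioContext`):
* {{{
* val deduped: SCollection[(Int, String)] = sc
*   .parallelize(Seq((1, "a"), (1, "b"), (2, "c")))
*   .distinctBy(_._1)
* }}}
*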
* @param f
* The function to use to get representative values.
* @tparam U
* The type of representative values used to dedup.
* @group transform
*/
// This is simpler than Distinct.withRepresentativeValueFn, and allows us to set Coders
def distinctBy[U: Coder](f: T => U): SCollection[T] =
this.transform { me =>
me
.keyBy(f)
// we use aggregate by key to avoid errors in streaming mode
// when a pane would fire without any element for the key
.aggregateByKey[Option[T]](None)(_ orElse Some(_), _ orElse _)
.values
.flatten
}
/**
* Return a new SCollection containing only the elements that satisfy a predicate.
* @group transform
*/
def filter(f: T => Boolean): SCollection[T] =
this.pApply(Filter.by(Functions.processFn(f.asInstanceOf[T => JBoolean])))
/**
* Return a new SCollection containing only the elements that don't satisfy a predicate.
* @group transform
*/
def filterNot(f: T => Boolean): SCollection[T] = filter(!f(_))
/**
* Return a new SCollection by first applying a function to all elements of this SCollection, and
* then flattening the results.
* @group transform
*/
def flatMap[U: Coder](f: T => TraversableOnce[U]): SCollection[U] =
this.parDo(Functions.flatMapFn(f))
/**
* Return a new `SCollection[U]` by flattening each element of an `SCollection[Traversable[U]]`.
* @group transform
*/
// Cannot use `U: Coder` context bound here because `U` depends on `ev`.
def flatten[U](implicit ev: T => TraversableOnce[U], coder: Coder[U]): SCollection[U] =
flatMap(ev)
/**
* Aggregate the elements using a given associative function and a neutral "zero value". The
* function op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object
* allocation; however, it should not modify t2.
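* A minimal sketch summing integers (assuming an existing `sc: ScioContext`):
* {{{
* val total: SCollection[Int] = sc.parallelize(Seq(1, 2, 3)).fold(0)(_ + _)
* }}}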
* @group transform
*/
def fold(zeroValue: => T)(op: (T, T) => T): SCollection[T] =
this.pApply(Combine.globally(Functions.aggregateFn(context, zeroValue)(op, op)))
/**
* Fold with [[com.twitter.algebird.Monoid Monoid]], which defines the associative function and
* "zero value" for `T`. This could be more powerful and better optimized in some cases.
* @group transform
*/
def fold(implicit mon: Monoid[T]): SCollection[T] =
this.pApply(Combine.globally(Functions.reduceFn(context, mon)))
/**
* Return an SCollection of grouped items. Each group consists of a key and a sequence of elements
* mapping to that key. The ordering of elements within each group is not guaranteed, and may even
* differ each time the resulting SCollection is evaluated.
*
* Note: This operation may be very expensive. If you are grouping in order to perform an
* aggregation (such as a sum or average) over each key, using
* [[PairSCollectionFunctions.aggregateByKey[U]* PairSCollectionFunctions.aggregateByKey]] or
* [[PairSCollectionFunctions.reduceByKey]] will provide much better performance.
* @group transform
*/
def groupBy[K: Coder](f: T => K): SCollection[(K, Iterable[T])] = {
if (!context.isTest && CallSites.wasCalledExternally) {
SCollection.logger.warn(
"groupBy will materialize all values for a key to a single worker," +
" which is a very common cause of memory issues." +
" Consider using aggregateByKey/reduceByKey on a keyed SCollection instead."
)
}
groupMap(f)(identity)
}
/**
* Return an SCollection of grouped items. Each group consists of a key and a sequence of elements
* transformed into a value of type `U`. The ordering of elements within each group is not
* guaranteed, and may even differ each time the resulting SCollection is evaluated.
*
* It is equivalent to `groupBy(f).mapValues(_.map(g))`, but more efficient.
*
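* A minimal sketch grouping word lengths by first character (assuming an existing `sc: ScioContext`):
* {{{
* val lengths: SCollection[(Char, Iterable[Int])] = sc
*   .parallelize(Seq("apple", "avocado", "banana"))
*   .groupMap(_.head)(_.length)
* }}}
*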
* @group transform
*/
def groupMap[K: Coder, U: Coder](f: T => K)(
g: T => U
): SCollection[(K, Iterable[U])] =
this.transform {
val cf = ClosureCleaner.clean(f)
val cg = ClosureCleaner.clean(g)
_.map(t => KV.of(cf(t), cg(t)))
.pApply(GroupByKey.create[K, U]())
.map(kvIterableToTuple)
}
/**
* Return an SCollection of grouped items. Each group consists of a key and the result of an
* associative reduce function. The ordering of elements within each group is not guaranteed, and
* may even differ each time the resulting SCollection is evaluated.
*
* The associative function is performed locally on each mapper before sending results to a
* reducer, similarly to a "combiner" in MapReduce.
*
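* A minimal sketch keeping the longest word per first character (assuming an existing `sc: ScioContext`):
* {{{
* val longest: SCollection[(Char, String)] = sc
*   .parallelize(Seq("apple", "avocado", "banana"))
*   .groupMapReduce(_.head)((a, b) => if (a.length >= b.length) a else b)
* }}}
*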
* @group transform
*/
def groupMapReduce[K: Coder](f: T => K)(
g: (T, T) => T
): SCollection[(K, T)] =
this.transform {
val cf = ClosureCleaner.clean(f)
_.map(t => KV.of(cf(t), t))
.pApply(Combine.perKey(Functions.reduceFn(context, g)))
.map(kvToTuple)
}
/**
* Return a new SCollection containing only the elements that also exist in the `SideInput`.
*
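* A minimal sketch (assuming hypothetical `ids: SCollection[String]` and `allowed: SCollection[String]`):
* {{{
* val kept: SCollection[String] = ids.hashFilter(allowed.asSetSingletonSideInput)
* }}}
*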
* @group transform
*/
def hashFilter(sideInput: SideInput[Set[T]]): SCollection[T] =
self.map((_, ())).hashIntersectByKey(sideInput).keys
/**
* Create tuples of the elements in this SCollection by applying `f`.
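* A minimal sketch keying strings by length (assuming an existing `sc: ScioContext`):
* {{{
* val keyed: SCollection[(Int, String)] = sc.parallelize(Seq("a", "bb")).keyBy(_.length)
* }}}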
* @group transform
*/
// Scala lambda is simpler than transforms.WithKeys
def keyBy[K: Coder](f: T => K): SCollection[(K, T)] =
this.map(v => (f(v), v))
/**
* Return a new SCollection by applying a function to all elements of this SCollection.
* @group transform
*/
def map[U: Coder](f: T => U): SCollection[U] = this.parDo(Functions.mapFn(f))
/**
* Return the min of this SCollection as defined by the implicit `Ordering[T]`.
* @return
* a new SCollection with the minimum element
* @group transform
*/
// Scala lambda is simpler and more powerful than transforms.Min
def min(implicit ord: Ordering[T]): SCollection[T] =
this.reduce(ord.min)
/**
* Return the max of this SCollection as defined by the implicit `Ordering[T]`.
* @return
* a new SCollection with the maximum element
* @group transform
*/
// Scala lambda is simpler and more powerful than transforms.Max
def max(implicit ord: Ordering[T]): SCollection[T] =
this.reduce(ord.max)
/**
* Return the latest of this SCollection according to its event time.
* @return
* a new SCollection with the latest element
* @group transform
*/
def latest: SCollection[T] =
// widen to ReadableInstant for scala 2.12 implicit ordering
this.withTimestamp.max(Ordering.by(_._2: ReadableInstant)).keys
/**
* Reduce with [[com.twitter.algebird.Semigroup Semigroup]]. This could be more powerful and
* better optimized than [[reduce]] in some cases.
* @group transform
*/
def sum(implicit sg: Semigroup[T]): SCollection[T] = {
SCollection.logger.warn(
"combine/sum does not support default value and may fail in some streaming scenarios. " +
"Consider aggregate/fold instead."
)
this.pApply(Combine.globally(Functions.reduceFn(context, sg)).withoutDefaults())
}
/**
* Return the mean of this SCollection as defined by the implicit `Numeric[T]`.
* @return
* a new SCollection with the mean of elements
* @group transform
*/
def mean(implicit ev: Numeric[T]): SCollection[Double] = this.transform { in =>
val e = ev // defeat closure
in.map[JDouble](e.toDouble)
.pApply(Mean.globally().withoutDefaults())
.asInstanceOf[SCollection[Double]]
}
/**
* Compute the SCollection's data distribution using approximate `N`-tiles.
* @return
* a new SCollection whose single value is an `Iterable` of the approximate `N`-tiles of the
* elements
* @group transform
*/
def quantilesApprox(numQuantiles: Int)(implicit ord: Ordering[T]): SCollection[Iterable[T]] =
this.transform {
_.pApply(ApproximateQuantiles.globally(numQuantiles, ord))
.map((_: JIterable[T]).asScala)
}
/**
* Randomly splits this SCollection with the provided weights.
*
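* A minimal sketch producing an 80/10/10 split (assuming a hypothetical `data: SCollection[String]`):
* {{{
* val splits: Array[SCollection[String]] = data.randomSplit(Array(0.8, 0.1, 0.1))
* val (train, validation, test) = (splits(0), splits(1), splits(2))
* }}}
*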
* @param weights
* weights for splits, will be normalized if they don't sum to 1
* @return
* split SCollections in an array
* @group transform
*/
def randomSplit(weights: Array[Double]): Array[SCollection[T]] = {
val sum = weights.sum
val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _)
val m = TreeMap(normalizedCumWeights.toIndexedSeq.zipWithIndex: _*) // Map[lower bound, split]
val sides = (1 until weights.length).map(_ => SideOutput[T]())
val (head, tail) = this
.withSideOutputs(sides: _*)
.flatMap { (x, c) =>
val i = m.rangeTo(ThreadLocalRandom.current().nextDouble()).last._2
if (i == 0) {
Seq(x) // Main output
} else {
c.output(sides(i - 1), x) // Side output
Nil
}
}
(head +: sides.map(tail(_))).toArray
}
/**
* Randomly splits this SCollection into two parts.
*
* @param weight
* weight for left hand side SCollection, should be in the range `(0, 1)`
* @return
* split SCollections in a Tuple2
* @group transform
*/
def randomSplit(weight: Double): (SCollection[T], SCollection[T]) = {
require(weight > 0.0 && weight < 1.0)
val splits = randomSplit(Array(weight, 1d - weight))
(splits(0), splits(1))
}
/**
* Randomly splits this SCollection into three parts. Note: `0 < weightA + weightB < 1`
*
* @param weightA
* weight for first SCollection, should be in the range `(0, 1)`
* @param weightB
* weight for second SCollection, should be in the range `(0, 1)`
* @return
* split SCollections in a Tuple3
* @group transform
*/
def randomSplit(
weightA: Double,
weightB: Double
): (SCollection[T], SCollection[T], SCollection[T]) = {
require(weightA > 0.0 && weightB > 0.0 && (weightA + weightB) < 1.0)
val splits = randomSplit(Array(weightA, weightB, 1d - (weightA + weightB)))
(splits(0), splits(1), splits(2))
}
/**
* Reduce the elements of this SCollection using the specified commutative and associative binary
* operator.
* @group transform
*/
def reduce(op: (T, T) => T): SCollection[T] =
this.pApply(Combine.globally(Functions.reduceFn(context, op)).withoutDefaults())
/**
* Return a sampled subset of this SCollection containing exactly `sampleSize` items. Involves a
* combine operation that results in shuffling. All the elements of the output should fit into main
* memory of a single worker machine.
*
* @return
* a new SCollection whose single value is an `Iterable` of the samples
* @group transform
*/
// TODO move to implicit
def sample(sampleSize: Int): SCollection[Iterable[T]] =
new SampleSCollectionFunctions(this).sample(sampleSize)
// TODO move to implicit
def sampleWeighted(totalWeight: Long, cost: T => Long): SCollection[Iterable[T]] =
new SampleSCollectionFunctions(this).sampleWeighted(totalWeight, cost)
// TODO move to implicit
def sampleByteSized(totalByteSize: Long): SCollection[Iterable[T]] =
new SampleSCollectionFunctions(this).sampleByteSized(totalByteSize)
/**
* Return a sampled subset of this SCollection. Does not trigger shuffling.
*
* @param withReplacement
* if `true` the same element can be produced more than once, otherwise the same element will be
* sampled only once
* @param fraction
* the sampling fraction
* @group transform
*/
// TODO move to implicit
def sample(withReplacement: Boolean, fraction: Double): SCollection[T] =
new SampleSCollectionFunctions(this).sample(withReplacement, fraction)
/**
* Return an SCollection with the elements from `this` that are not in `other`.
* @group transform
*/
def subtract(that: SCollection[T]): SCollection[T] =
this.transform {
_.map((_, ())).subtractByKey(that).keys
}
/**
* Return a sampled subset of any `num` elements of the SCollection.
* @group transform
*/
def take(num: Long): SCollection[T] = this.pApply(Sample.any(num))
/**
* Return the top k (largest) elements from this SCollection as defined by the specified implicit
* `Ordering[T]`.
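* A minimal sketch (assuming an existing `sc: ScioContext`):
* {{{
* val top3: SCollection[Iterable[Int]] = sc.parallelize(Seq(5, 1, 9, 3, 7)).top(3)
* }}}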
* @return
* a new SCollection whose single value is an `Iterable` of the top k
* @group transform
*/
def top(num: Int)(implicit ord: Ordering[T]): SCollection[Iterable[T]] =
this.transform {
_.pApply(Top.of[T, Ordering[T]](num, ord)).map((l: JIterable[T]) => l.asScala)
}
// =======================================================================
// Hash operations
// =======================================================================
/**
* Return the cross product with another SCollection by replicating `that` to all workers. The
* right side should be tiny and fit in memory.
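* A minimal sketch (assuming an existing `sc: ScioContext`):
* {{{
* val pairs: SCollection[(Int, String)] =
*   sc.parallelize(Seq(1, 2)).cross(sc.parallelize(Seq("a", "b")))
* }}}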
* @group hash
*/
def cross[U](that: SCollection[U]): SCollection[(T, U)] = {
implicit val uCoder = that.coder
this.transform { in =>
val side = that.asListSideInput
in.withSideInputs(side)
.flatMap((t, s) => s(side).map((t, _)))
.toSCollection
}
}
/**
* Look up values in an `SCollection[(T, V)]` for each element `T` in this SCollection by
* replicating `that` to all workers. The right side should be tiny and fit in memory.
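* A minimal sketch (assuming an existing `sc: ScioContext`):
* {{{
* val looked: SCollection[(String, Iterable[Int])] =
*   sc.parallelize(Seq("a", "b")).hashLookup(sc.parallelize(Seq(("a", 1), ("a", 2))))
* }}}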
* @group hash
*/
def hashLookup[V](
that: SCollection[(T, V)]
): SCollection[(T, Iterable[V])] = {
implicit val vCoder = that.valueCoder
this.transform { in =>
val side = that.asMultiMapSingletonSideInput
in.withSideInputs(side)
.map((t, s) => (t, s(side).getOrElse(t, Iterable())))
.toSCollection
}
}
/**
* Print content of an SCollection to `out()`.
* @param out
* where to write the debug information. Default: stdout
* @param prefix
* prefix for each logged entry. Default: empty string
* @param enabled
* whether debugging is enabled. Default: true. It can be useful to set this to sc.isTest to
* avoid debugging when running in production.
* @group debug
*/
def debug(
out: () => PrintStream = () => Console.out,
prefix: String = "",
enabled: Boolean = true
): SCollection[T] =
if (enabled) {
tap(elem => out().println(prefix + elem))
} else {
this
}
/**
* Applies f to each element of this [[SCollection]], and returns the original value.
*
* @group debug
*/
def tap(f: T => Any): SCollection[T] =
pApply(ParDo.of(Functions.mapFn[T, T] { elem => f(elem); elem })).setCoder(internal.getCoder)
// =======================================================================
// Side input operations
// =======================================================================
/**
* Convert this SCollection of a single value per window to a [[SideInput]], to be used with
* [[withSideInputs]].
* @group side
*/
def asSingletonSideInput: SideInput[T] =
new SingletonSideInput[T](this.applyInternal(View.asSingleton()))
/**
* Convert this SCollection of a single value per window to a [[SideInput]] with a default value,
* to be used with [[withSideInputs]].
* @group side
*/
def asSingletonSideInput(defaultValue: T): SideInput[T] =
new SingletonSideInput[T](this.applyInternal(View.asSingleton().withDefaultValue(defaultValue)))
/**
* Convert this SCollection to a [[SideInput]], mapping each window to a `Seq`, to be used with
* [[withSideInputs]].
*
* The resulting `Seq` is required to fit in memory.
* @group side
*/
// j.u.List#asScala returns s.c.mutable.Buffer which has an O(n) .toList method
// returning Seq[T] here to avoid copying
def asListSideInput: SideInput[Seq[T]] =
new ListSideInput[T](this.applyInternal(View.asList()))
/**
* Convert this SCollection to a [[SideInput]], mapping each window to an `Iterable`, to be used
* with [[withSideInputs]].
*
* The values of the `Iterable` for a window are not required to fit in memory, but they may also
* not be effectively cached. If it is known that every window fits in memory, and stronger
* caching is desired, use [[asListSideInput]].
* @group side
*/
def asIterableSideInput: SideInput[Iterable[T]] =
new IterableSideInput[T](this.applyInternal(View.asIterable()))
/**
* Convert this SCollection to a [[SideInput]], mapping each window to a `Set[T]`, to be used with
* [[withSideInputs]].
*
* The resulting [[SideInput]] is a one element singleton which is a `Set` of all elements in the
* SCollection for the given window. The complete Set must fit in the memory of the worker.
*
* @group side
*/
// Find the distinct elements in parallel and then convert to a Set and SingletonSideInput.
// This is preferred over aggregating as we want to map each window to a Set.
def asSetSingletonSideInput: SideInput[Set[T]] =
self
.transform(
_.distinct
.groupBy(_ => ())
.map(_._2.toSet)
)
.asSingletonSideInput(Set.empty[T])
/**
* Convert this SCollection to an [[SCollectionWithSideInput]] with one or more [[SideInput]]s,
* similar to Spark broadcast variables. Call [[SCollectionWithSideInput.toSCollection]] when done
* with side inputs.
*
* {{{
* val s1: SCollection[Int] = // ...
* val s2: SCollection[String] = // ...
* val s3: SCollection[(String, Double)] = // ...
* val s4: SCollection[(String, Double)] = // ...
*
* // Prepare side inputs
* val side1 = s1.asSingletonSideInput
* val side2 = s2.asIterableSideInput
* val side3 = s3.asMapSideInput
* val side4 = s4.asMultiMapSideInput
*
* val p: SCollection[MyRecord] = // ...
* p.withSideInputs(side1, side2, side3, side4).map { (x, s) =>
* // Extract side inputs from context
* val s1: Int = s(side1)
* val s2: Iterable[String] = s(side2)
* val s3: Map[String, Double] = s(side3)
* val s4: Map[String, Iterable[Double]] = s(side4)
* // ...
* }
* }}}
* @group side
*/
def withSideInputs(sides: SideInput[_]*): SCollectionWithSideInput[T] =
new SCollectionWithSideInput[T](this, sides)
// =======================================================================
// Side output operations
// =======================================================================
/**
* Convert this SCollection to an [[SCollectionWithSideOutput]] with one or more [[SideOutput]]s,
* so that a single transform can write to multiple destinations.
*
* {{{
* // Prepare side outputs
* val side1 = SideOutput[String]()
* val side2 = SideOutput[Int]()
*
* val p: SCollection[MyRecord] = // ...
* p.withSideOutputs(side1, side2).map { (x, s) =>
* // Write to side outputs via context
* s.output(side1, "word").output(side2, 1)
* // ...
* }
* }}}
* @group side
*/
def withSideOutputs(sides: SideOutput[_]*): SCollectionWithSideOutput[T] =
new SCollectionWithSideOutput[T](this, sides)
// =======================================================================
// Windowing operations
// =======================================================================
/**
* Convert this SCollection to a [[WindowedSCollection]].
* @group window
*/
def toWindowed: WindowedSCollection[T] =
new WindowedSCollection[T](this)
/**
* Window values with the given function.
* @group window
*/
def withWindowFn[W <: BoundedWindow](
fn: WindowFn[_ <: Any, W],
options: WindowOptions = WindowOptions()
): SCollection[T] = {
var transform = Window.into(fn).asInstanceOf[Window[T]]
if (options.trigger != null) {
transform = transform.triggering(options.trigger)
}
if (options.accumulationMode != null) {
if (options.accumulationMode == AccumulationMode.ACCUMULATING_FIRED_PANES) {
transform = transform.accumulatingFiredPanes()
} else if (options.accumulationMode == AccumulationMode.DISCARDING_FIRED_PANES) {
transform = transform.discardingFiredPanes()
} else {
throw new RuntimeException(s"Unsupported accumulation mode ${options.accumulationMode}")
}
}
if (options.allowedLateness != null) {
transform = if (options.closingBehavior == null) {
transform.withAllowedLateness(options.allowedLateness)
} else {
transform.withAllowedLateness(options.allowedLateness, options.closingBehavior)
}
}
if (options.timestampCombiner != null) {
transform = transform.withTimestampCombiner(options.timestampCombiner)
}
if (options.onTimeBehavior != null) {
transform = transform.withOnTimeBehavior(options.onTimeBehavior)
}
this.pApply(transform)
}
/**
* Window values into fixed windows.
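* A minimal sketch windowing into five-minute fixed windows (assuming a hypothetical timestamped
* `events: SCollection[T]`):
* {{{
* val windowed = events.withFixedWindows(Duration.standardMinutes(5))
* }}}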
* @group window
*/
def withFixedWindows(
duration: Duration,
offset: Duration = Duration.ZERO,
options: WindowOptions = WindowOptions()
): SCollection[T] =
this.withWindowFn(FixedWindows.of(duration).withOffset(offset), options)
/**
* Window values into sliding windows.
* @group window
*/
def withSlidingWindows(
size: Duration,
period: Duration = null,
offset: Duration = Duration.ZERO,
options: WindowOptions = WindowOptions()
): SCollection[T] = {
var transform = SlidingWindows.of(size)
if (period != null) {
transform = transform.every(period)
}
transform = transform.withOffset(offset)
this.withWindowFn(transform, options)
}
/**
* Window values based on sessions.
* @group window
*/
def withSessionWindows(
gapDuration: Duration,
options: WindowOptions = WindowOptions()
): SCollection[T] =
this.withWindowFn(Sessions.withGapDuration(gapDuration), options)
/**
* Group values into a single global window.
* @group window
*/
def withGlobalWindow(options: WindowOptions = WindowOptions()): SCollection[T] =
this.withWindowFn(new GlobalWindows(), options)
/**
* Window values by years.
* @group window
*/
def windowByYears(number: Int, options: WindowOptions = WindowOptions()): SCollection[T] =
this.withWindowFn(CalendarWindows.years(number), options)
/**
* Window values by months.
* @group window
*/
def windowByMonths(number: Int, options: WindowOptions = WindowOptions()): SCollection[T] =
this.withWindowFn(CalendarWindows.months(number), options)
/**
* Window values by weeks.
* @group window
*/
def windowByWeeks(
number: Int,
startDayOfWeek: Int,
options: WindowOptions = WindowOptions()
): SCollection[T] =
this.withWindowFn(CalendarWindows.weeks(number, startDayOfWeek), options)
/**
* Window values by days.
* @group window
*/
def windowByDays(number: Int, options: WindowOptions = WindowOptions()): SCollection[T] =
this.withWindowFn(CalendarWindows.days(number), options)
/**
* Convert values into pairs of (value, pane info).
* @group window
*/
def withPaneInfo: SCollection[(T, PaneInfo)] =
this.parDo(new DoFn[T, (T, PaneInfo)] {
@ProcessElement
private[scio] def processElement(
@Element element: T,
out: OutputReceiver[(T, PaneInfo)],
pane: PaneInfo
): Unit =
out.output((element, pane))
})
/**
* Convert values into pairs of (value, timestamp).
* @group window
*/
def withTimestamp: SCollection[(T, Instant)] =
this.parDo(new DoFn[T, (T, Instant)] {
@ProcessElement
private[scio] def processElement(
@Element element: T,
@Timestamp timestamp: Instant,
out: OutputReceiver[(T, Instant)]
): Unit =
out.output((element, timestamp))
})
/**
* Convert values into pairs of (value, window).
* @tparam W
* window type, must be [[org.apache.beam.sdk.transforms.windowing.BoundedWindow BoundedWindow]]
* or one of its subtypes, e.g.
* [[org.apache.beam.sdk.transforms.windowing.GlobalWindow GlobalWindow]] if this SCollection is
* not windowed or [[org.apache.beam.sdk.transforms.windowing.IntervalWindow IntervalWindow]] if
* it is windowed.
* @group window
*/
def withWindow[W <: BoundedWindow: Coder]: SCollection[(T, W)] =
this
.parDo(new DoFn[T, (T, W)] {
@ProcessElement
private[scio] def processElement(
@Element element: T,
out: OutputReceiver[(T, W)],
window: BoundedWindow
): Unit =
out.output((element, window.asInstanceOf[W]))
})
/**
* Assign timestamps to values, with an optional allowed timestamp skew.
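*
* A minimal sketch using the second tuple field as the event time (assuming a hypothetical
* `events: SCollection[(String, Instant)]`):
* {{{
* val timestamped = events.timestampBy(_._2)
* }}}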
* @group window
*/
def timestampBy(f: T => Instant, allowedTimestampSkew: Duration = Duration.ZERO): SCollection[T] =
this.applyTransform(
WithTimestamps
.of(Functions.serializableFn(f))
.withAllowedTimestampSkew(allowedTimestampSkew): @nowarn("cat=deprecation")
)
// =======================================================================
// Read operations
// =======================================================================
/** @deprecated Use readTextFiles */
@deprecated("Use readTextFiles", "0.14.5")
def readFiles(implicit ev: T <:< String): SCollection[String] =
readFiles(beam.TextIO.readFiles())
/**
* Reads each file, represented as a pattern, in this [[SCollection]].
*
* @return
* each line of the input files.
*/
def readTextFiles(implicit ev: T <:< String): SCollection[String] =
new FileSCollectionFunctions(this.covary_).readTextFiles()
/**
* Reads each file, represented as a pattern, in this [[SCollection]].
*
* @return
* each file fully read as `Array[Byte]`.
*/
def readFilesAsBytes(implicit ev: T <:< String): SCollection[Array[Byte]] =
new FileSCollectionFunctions(this.covary_).readFilesAsBytes()
/**
* Reads each file, represented as a pattern, in this [[SCollection]].
*
* @return
* each file fully read as [[String]].
*/
def readFilesAsString(implicit ev: T <:< String): SCollection[String] =
new FileSCollectionFunctions(this.covary_).readFilesAsString()
/**
* Reads each file, represented as a pattern, in this [[SCollection]].
*
* @see
* [[readFilesAsBytes]], [[readFilesAsString]]
*/
def readFiles[A: Coder](
f: beam.FileIO.ReadableFile => A
)(implicit ev: T <:< String): SCollection[A] =
new FileSCollectionFunctions(this.covary_).readFiles(f)
/**
* Reads each file, represented as a pattern, in this [[SCollection]].
*
* @see
* [[readFilesAsBytes]], [[readFilesAsString]]
*
* @param directoryTreatment
* Controls how to handle directories in the input.
* @param compression
* Reads files using the given [[org.apache.beam.sdk.io.Compression]].
*/
def readFiles[A: Coder](directoryTreatment: DirectoryTreatment, compression: Compression)(
f: beam.FileIO.ReadableFile => A
)(implicit ev: T <:< String): SCollection[A] =
new FileSCollectionFunctions(this.covary_).readFiles(directoryTreatment, compression)(f)
/**
* Reads each file, represented as a pattern, in this [[SCollection]]. Files are split into
* multiple offset ranges and read with the [[FileBasedSource]].
*
* @param desiredBundleSizeBytes
* Desired size of bundles read by the sources.
* @param directoryTreatment
* Controls how to handle directories in the input.
* @param compression
* Reads files using the given [[org.apache.beam.sdk.io.Compression]].
*/
def readFiles[A: Coder](
desiredBundleSizeBytes: Long,
directoryTreatment: DirectoryTreatment,
compression: Compression
)(f: String => FileBasedSource[A])(implicit ev: T <:< String): SCollection[A] =
new FileSCollectionFunctions(this.covary_)
.readFiles(desiredBundleSizeBytes, directoryTreatment, compression)(f)
/**
* Reads each file, represented as a pattern, in this [[SCollection]].
*
* @see
* [[readFilesAsBytes]], [[readFilesAsString]], [[readFiles]]
*
* @param directoryTreatment
* Controls how to handle directories in the input.
* @param compression
* Reads files using the given [[org.apache.beam.sdk.io.Compression]].
*/
def readFiles[A: Coder](
filesTransform: PTransform[_ >: PCollection[beam.FileIO.ReadableFile], PCollection[A]],
directoryTreatment: DirectoryTreatment = DirectoryTreatment.SKIP,
compression: Compression = Compression.AUTO
)(implicit ev: T <:< String): SCollection[A] =
new FileSCollectionFunctions(this.covary_)
.readFiles(filesTransform, directoryTreatment, compression)
/**
* Reads each file, represented as a pattern, in this [[SCollection]]. Files are split into
* multiple offset ranges and read with the [[FileBasedSource]].
*
* @return
* origin file name paired with read line.
*
* @param desiredBundleSizeBytes
* Desired size of bundles read by the sources.
* @param directoryTreatment
* Controls how to handle directories in the input.
* @param compression
* Reads files using the given [[org.apache.beam.sdk.io.Compression]].
*/
def readTextFilesWithPath(
desiredBundleSizeBytes: Long = FileSCollectionFunctions.DefaultBundleSizeBytes,
directoryTreatment: DirectoryTreatment = DirectoryTreatment.SKIP,
compression: Compression = Compression.AUTO
)(implicit ev: T <:< String): SCollection[(String, String)] =
new FileSCollectionFunctions(this.covary_)
.readTextFilesWithPath(desiredBundleSizeBytes, directoryTreatment, compression)
/**
* Reads each file, represented as a pattern, in this [[SCollection]]. Files are split into
* multiple offset ranges and read with the [[FileBasedSource]].
*
* @return
* origin file name paired with read element.
*
* @param desiredBundleSizeBytes
* Desired size of bundles read by the sources.
* @param directoryTreatment
* Controls how to handle directories in the input.
* @param compression
* Reads files using the given [[org.apache.beam.sdk.io.Compression]].
*/
def readFilesWithPath[A: Coder](
desiredBundleSizeBytes: Long = FileSCollectionFunctions.DefaultBundleSizeBytes,
directoryTreatment: DirectoryTreatment = DirectoryTreatment.SKIP,
compression: Compression = Compression.AUTO
)(
f: String => FileBasedSource[A]
)(implicit ev: T <:< String): SCollection[(String, A)] =
new FileSCollectionFunctions(this.covary_)
.readFilesWithPath(desiredBundleSizeBytes, directoryTreatment, compression)(f)
/**
* Pairs each element with the value of the provided [[SideInput]] in the element's window.
*
* Reify as List:
* {{{
* val other: SCollection[Int] = sc.parallelize(Seq(1))
* val coll: SCollection[(Int, Seq[Int])] =
* sc.parallelize(Seq(1, 2))
* .reifySideInputAsValues(other.asListSideInput)
* }}}
*
* Reify as Iterable:
* {{{
* val other: SCollection[Int] = sc.parallelize(Seq(1))
* val coll: SCollection[(Int, Iterable[Int])] =
* sc.parallelize(Seq(1, 2))
* .reifySideInputAsValues(other.asIterableSideInput)
* }}}
*
* Reify as Map:
* {{{
* val other: SCollection[(Int, Int)] = sc.parallelize(Seq((1, 1)))
* val coll: SCollection[(Int, Map[Int, Int])] =
* sc.parallelize(Seq(1, 2))
* .reifySideInputAsValues(other.asMapSideInput)
* }}}
*
* Reify as Multimap:
* {{{
* val other: SCollection[(Int, Int)] = sc.parallelize(Seq((1, 1)))
* val coll: SCollection[(Int, Map[Int, Iterable[Int]])] =
* sc.parallelize(Seq(1, 2))
* .reifySideInputAsValues(other.asMultiMapSideInput)
* }}}
*/
// `U: Coder` context bound is required since `PCollectionView` may be of different type
def reifySideInputAsValues[U: Coder](side: SideInput[U]): SCollection[(T, U)] =
this.transform(_.withSideInputs(side).map((t, s) => (t, s(side))).toSCollection)
/** Returns an [[SCollection]] consisting of a single `Seq[T]` element. */
def reifyAsListInGlobalWindow: SCollection[Seq[T]] =
reifyInGlobalWindow(_.asListSideInput)
/** Returns an [[SCollection]] consisting of a single `Iterable[T]` element. */
def reifyAsIterableInGlobalWindow: SCollection[Iterable[T]] =
reifyInGlobalWindow(_.asIterableSideInput)
/**
* Returns an [[SCollection]] consisting of a single element, containing the value of the given
* side input in the global window.
*
* Reify as List:
* {{{
* val coll: SCollection[Seq[Int]] =
* sc.parallelize(Seq(1, 2)).reifyInGlobalWindow(_.asListSideInput)
* }}}
*
* Can be used to replace patterns like:
* {{{
* val coll: SCollection[Iterable[Int]] = sc.parallelize(Seq(1, 2)).groupBy(_ => ())
* }}}
* where you want to actually get an empty [[Iterable]] even if no data is present.
*/
// `U: Coder` context bound is required since `PCollectionView` may be of different type
private[scio] def reifyInGlobalWindow[U: Coder](
view: SCollection[T] => SideInput[U]
): SCollection[U] =
this.transform(coll =>
context.parallelize[Unit](Seq(())).reifySideInputAsValues(view(coll)).values
)
// =======================================================================
// Write operations
// =======================================================================
/**
* Extract data from this SCollection as a closed [[Tap]]. The Tap will be available once the
* pipeline completes successfully. `.materialize()` must be called before the `ScioContext` is
* run, as its implementation modifies the current pipeline graph.
*
* {{{
* val closedTap = sc.parallelize(1 to 10).materialize
* sc.run().waitUntilDone().tap(closedTap)
* }}}
*
* @group output
*/
def materialize: ClosedTap[T] =
materialize(ScioUtil.getTempFile(context), isCheckpoint = false)
private[scio] def materialize(path: String, isCheckpoint: Boolean): ClosedTap[T] =
if (context.isTest) {
// Do not run assertions on materialized value but still access test context to trigger
// the test checking if we're running inside a JobTest
if (!isCheckpoint) TestDataManager.getOutput(context.testId.get)
saveAsInMemoryTap
} else {
val elemCoder = CoderMaterializer.beam(context, coder)
val arrCoder = ByteArrayCoder.of()
this
.map { e =>
CoderUtils.encodeToByteArray(
arrCoder,
CoderUtils.encodeToByteArray(elemCoder, e),
// encode record length
BCoder.Context.NESTED: @nowarn("cat=deprecation")
)
}
.saveAsBinaryFile(path)
ClosedTap(MaterializeTap[T](path, context))
}
/**
* Save this SCollection as a text file. Note that elements must be of type `String`.
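*
* A minimal sketch (assuming a hypothetical `lines: SCollection[String]` and output path):
* {{{
* lines.saveAsTextFile("gs://my-bucket/output", numShards = 1)
* }}}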
* @group output
*/
def saveAsTextFile(
path: String,
numShards: Int = TextIO.WriteParam.DefaultNumShards,
suffix: String = TextIO.WriteParam.DefaultSuffix,
compression: Compression = TextIO.WriteParam.DefaultCompression,
header: Option[String] = TextIO.WriteParam.DefaultHeader,
footer: Option[String] = TextIO.WriteParam.DefaultFooter,
shardNameTemplate: String = TextIO.WriteParam.DefaultShardNameTemplate,
tempDirectory: String = TextIO.WriteParam.DefaultTempDirectory,
filenamePolicySupplier: FilenamePolicySupplier =
TextIO.WriteParam.DefaultFilenamePolicySupplier,
prefix: String = TextIO.WriteParam.DefaultPrefix
)(implicit ct: ClassTag[T]): ClosedTap[String] = {
val s = if (classOf[String] isAssignableFrom ct.runtimeClass) {
this.asInstanceOf[SCollection[String]]
} else {
this.map(_.toString)
}
s.write(TextIO(path))(
TextIO.WriteParam(
suffix,
numShards,
compression,
header,
footer,
filenamePolicySupplier,
prefix,
shardNameTemplate,
tempDirectory
)
)
}
/**
* Save this SCollection as raw bytes. Note that elements must be of type `Array[Byte]`.
* @group output
*/
def saveAsBinaryFile(
path: String,
numShards: Int = BinaryIO.WriteParam.DefaultNumShards,
prefix: String = BinaryIO.WriteParam.DefaultPrefix,
suffix: String = BinaryIO.WriteParam.DefaultSuffix,
compression: Compression = BinaryIO.WriteParam.DefaultCompression,
header: Array[Byte] = BinaryIO.WriteParam.DefaultHeader,
footer: Array[Byte] = BinaryIO.WriteParam.DefaultFooter,
shardNameTemplate: String = BinaryIO.WriteParam.DefaultShardNameTemplate,
framePrefix: Array[Byte] => Array[Byte] = BinaryIO.WriteParam.DefaultFramePrefix,
frameSuffix: Array[Byte] => Array[Byte] = BinaryIO.WriteParam.DefaultFrameSuffix,
tempDirectory: String = BinaryIO.WriteParam.DefaultTempDirectory,
filenamePolicySupplier: FilenamePolicySupplier =
BinaryIO.WriteParam.DefaultFilenamePolicySupplier
)(implicit ev: T <:< Array[Byte]): ClosedTap[Nothing] =
this
.covary_[Array[Byte]]
.write(BinaryIO(path))(
BinaryIO
.WriteParam(
prefix,
suffix,
numShards,
compression,
header,
footer,
shardNameTemplate,
framePrefix,
frameSuffix,
tempDirectory,
filenamePolicySupplier
)
)
/**
* Creates a Zstd dictionary based on this SCollection targeting a dictionary of size
* `zstdDictSizeBytes` to be trained with approximately `trainingBytesTarget` bytes. The exact
* training size is determined by estimating the average element size with
* `numElementsForSizeEstimation` encoded elements and sampling this SCollection at an appropriate
* rate.
*
* @param path
* The path to which the trained dictionary should be written.
* @param zstdDictSizeBytes
* The size of the dictionary to train in bytes. Recommended dictionary sizes are in hundreds of
* KB. Over 10MB is not recommended and you may hit resource limits if the dictionary size is
* near 20MB.
* @param numElementsForSizeEstimation
* The number of elements of the SCollection to use to estimate the average element size.
* @param trainingBytesTarget
* The target number of bytes on which to train. Memory usage for training can be 10x this.
* `None` to infer from `zstdDictSizeBytes`. Must be able to fit in the memory of a single
* worker.
*/
def saveAsZstdDictionary(
path: String,
zstdDictSizeBytes: Int = ZstdDictIO.WriteParam.DefaultZstdDictSizeBytes,
numElementsForSizeEstimation: Long = ZstdDictIO.WriteParam.DefaultNumElementsForSizeEstimation,
trainingBytesTarget: Option[Int] = ZstdDictIO.WriteParam.DefaultTrainingBytesTarget
): ClosedTap[Nothing] = {
this
.write(ZstdDictIO[T](path))(
ZstdDictIO.WriteParam(
zstdDictSizeBytes,
numElementsForSizeEstimation,
trainingBytesTarget
)
)
}
/**
* Save this SCollection with a custom output transform. The transform should have a unique name.
* @group output
*/
def saveAsCustomOutput[O <: POutput](
name: String,
transform: PTransform[PCollection[T], O]
): ClosedTap[Nothing] = {
if (context.isTest) {
TestDataManager.getOutput(context.testId.get)(CustomIO[T](name))(this)
} else {
this.internal.apply(name, transform)
}
ClosedTap[Nothing](EmptyTap)
}
private[scio] def saveAsInMemoryTap: ClosedTap[T] = {
val tap = new InMemoryTap[T]
InMemorySink.save(tap.id, this)
ClosedTap(tap)
}
/**
* Generic write method for all `ScioIO[T]` implementations. In a test pipeline this will
* evaluate the pre-registered output IO implementation matching the given `ScioIO[T]`;
* otherwise it will invoke [[com.spotify.scio.io.ScioIO[T]#write]] with the write
* configuration passed in.
*
* @param io
* an implementation of `ScioIO[T]` trait
* @param params
* configuration to pass to the underlying write implementation
*/
def write(io: ScioIO[T])(params: io.WriteP): ClosedTap[io.tapT.T] =
io.writeWithContext(this, params)
def write(io: ScioIO[T] { type WriteP = Unit }): ClosedTap[io.tapT.T] =
io.writeWithContext(this, ())
}
private[scio] class SCollectionImpl[T](val internal: PCollection[T], val context: ScioContext)
extends SCollection[T] {}