
/*
* Copyright 2016 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
// scalastyle:off file.size.limit
package com.spotify.scio.values
import java.io.{File, PrintStream}
import java.lang.{Boolean => JBoolean, Double => JDouble, Iterable => JIterable}
import java.net.URI
import java.util.UUID
import com.google.api.services.bigquery.model.{TableReference, TableRow, TableSchema}
import com.google.cloud.dataflow.sdk.coders.{Coder, TableRowJsonCoder}
import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.{CreateDisposition, WriteDisposition}
import com.google.cloud.dataflow.sdk.io.PatchedAvroIO
import com.google.cloud.dataflow.sdk.transforms._
import com.google.cloud.dataflow.sdk.transforms.windowing._
import com.google.cloud.dataflow.sdk.util.WindowingStrategy.AccumulationMode
import com.google.cloud.dataflow.sdk.values._
import com.google.cloud.dataflow.sdk.{io => gio}
import com.google.datastore.v1.Entity
import com.google.protobuf.Message
import com.spotify.scio.ScioContext
import com.spotify.scio.coders.AvroBytesUtil
import com.spotify.scio.io._
import com.spotify.scio.testing._
import com.spotify.scio.util._
import com.spotify.scio.util.random.{BernoulliSampler, PoissonSampler}
import com.twitter.algebird.{Aggregator, Monoid, Semigroup}
import org.apache.avro.Schema
import org.apache.avro.file.CodecFactory
import org.apache.avro.generic.GenericRecord
import org.apache.avro.specific.SpecificRecordBase
import org.joda.time.{Duration, Instant}
import scala.collection.JavaConverters._
import scala.collection.immutable.TreeMap
import scala.concurrent._
import scala.reflect.ClassTag
/** Convenience functions for creating SCollections. */
object SCollection {
/** Create a union of multiple SCollections */
def unionAll[T: ClassTag](scs: Iterable[SCollection[T]]): SCollection[T] = {
val o = PCollectionList
.of(scs.map(_.internal).asJava)
.apply(CallSites.getCurrent, Flatten.pCollections())
new SCollectionImpl(o, scs.head.context)
}
import scala.language.implicitConversions
/** Implicit conversion from SCollection to DoubleSCollectionFunctions */
implicit def makeDoubleSCollectionFunctions(s: SCollection[Double]): DoubleSCollectionFunctions =
new DoubleSCollectionFunctions(s)
/** Implicit conversion from SCollection to DoubleSCollectionFunctions */
implicit def makeDoubleSCollectionFunctions[T](s: SCollection[T])(implicit num: Numeric[T])
: DoubleSCollectionFunctions =
new DoubleSCollectionFunctions(s.map(num.toDouble))
/** Implicit conversion from SCollection to PairSCollectionFunctions */
implicit def makePairSCollectionFunctions[K: ClassTag, V: ClassTag](s: SCollection[(K, V)])
: PairSCollectionFunctions[K, V] =
new PairSCollectionFunctions(s)
}
// scalastyle:off number.of.methods
/**
* A Scala wrapper for [[com.google.cloud.dataflow.sdk.values.PCollection PCollection]].
* Represents an immutable, partitioned collection of elements that can be operated on in
* parallel. This class contains the basic operations available on all SCollections, such as
* `map`, `filter`, and `reduce`. In addition, [[PairSCollectionFunctions]] contains operations
* available only on SCollections of key-value pairs, such as `groupByKey` and `join`;
* [[DoubleSCollectionFunctions]] contains operations available only on SCollections of Doubles.
*
* @groupname collection Collection Operations
* @groupname hash Hash Operations
* @groupname output Output Sinks
* @groupname side Side Input and Output Operations
* @groupname transform Transformations
* @groupname window Windowing Operations
*/
sealed trait SCollection[T] extends PCollectionWrapper[T] {
import TupleFunctions._
// =======================================================================
// Delegations for internal PCollection
// =======================================================================
/** A friendly name for this SCollection. */
def name: String = internal.getName
/** Assign a Coder to this SCollection. */
def setCoder(coder: Coder[T]): SCollection[T] = context.wrap(internal.setCoder(coder))
/** Assign a name to this SCollection. */
def setName(name: String): SCollection[T] = context.wrap(internal.setName(name))
/**
* Apply a [[com.google.cloud.dataflow.sdk.transforms.PTransform PTransform]] and wrap the output
* in an [[SCollection]].
*/
def applyTransform[U: ClassTag](transform: PTransform[_ >: PCollection[T], PCollection[U]])
: SCollection[U] =
this.pApply(transform)
/**
* Apply a [[com.google.cloud.dataflow.sdk.transforms.PTransform PTransform]] with [[PDone]]
* output.
*/
def applyOutputTransform(transform: PTransform[_ >: PCollection[T], PDone]): PDone =
this.applyInternal(transform)
/** Apply a transform. */
private[values] def transform[U: ClassTag](f: SCollection[T] => SCollection[U])
: SCollection[U] = {
val o = internal.apply(CallSites.getCurrent, new PTransform[PCollection[T], PCollection[U]]() {
override def apply(input: PCollection[T]): PCollection[U] = {
f(context.wrap(input)).internal
}
})
context.wrap(o)
}
// =======================================================================
// Collection operations
// =======================================================================
/**
* Convert this SCollection to an [[SCollectionWithFanout]] that uses an intermediate node to
* combine parts of the data to reduce load on the final global combine step.
* @param fanout the number of intermediate keys that will be used
*/
def withFanout(fanout: Int): SCollectionWithFanout[T] =
new SCollectionWithFanout[T](internal, context, fanout)
/**
* Return the union of this SCollection and another one. Any identical elements will appear
* multiple times (use `.distinct()` to eliminate them).
* @group collection
*/
// scalastyle:off method.name
def ++(that: SCollection[T]): SCollection[T] = this.union(that)
// scalastyle:on method.name
/**
* Return the union of this SCollection and another one. Any identical elements will appear
* multiple times (use `.distinct()` to eliminate them).
* @group collection
*/
def union(that: SCollection[T]): SCollection[T] = {
val o = PCollectionList
.of(internal).and(that.internal)
.apply(CallSites.getCurrent, Flatten.pCollections())
context.wrap(o)
}
/**
* Return the intersection of this SCollection and another one. The output will not contain any
* duplicate elements, even if the input SCollections did.
*
* Note that this method performs a shuffle internally.
* @group collection
*/
def intersection(that: SCollection[T]): SCollection[T] = this.transform {
_.map((_, 1)).cogroup(that.map((_, 1))).flatMap { t =>
if (t._2._1.nonEmpty && t._2._2.nonEmpty) Seq(t._1) else Seq.empty
}
}
/**
* Partition this SCollection with the provided function.
*
* @param numPartitions number of output partitions
* @param f function that assigns an output partition to each element, should be in the range
* `[0, numPartitions - 1]`
* @return partitioned SCollections in a Seq
* @group collection
*/
def partition(numPartitions: Int, f: T => Int): Seq[SCollection[T]] =
this.applyInternal(Partition.of[T](numPartitions, Functions.partitionFn[T](numPartitions, f)))
.getAll.asScala.map(p => context.wrap(p))
// =======================================================================
// Transformations
// =======================================================================
/**
* Aggregate the elements using given combine functions and a neutral "zero value". This
* function can return a different result type, U, than the type of this SCollection, T. Thus,
* we need one operation for merging a T into an U and one operation for merging two U's. Both
* of these functions are allowed to modify and return their first argument instead of creating
* a new U to avoid memory allocation.
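*
* A minimal usage sketch (assuming `p` is an `SCollection[Int]`; names are illustrative):
*
* {{{
* // Collect elements into a Set, merging per-worker partial sets
* val sets: SCollection[Set[Int]] = p.aggregate(Set.empty[Int])(_ + _, _ ++ _)
* }}}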
* @group transform
*/
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U,
combOp: (U, U) => U): SCollection[U] =
this.pApply(Combine.globally(Functions.aggregateFn(zeroValue)(seqOp, combOp)))
/**
* Aggregate with [[com.twitter.algebird.Aggregator Aggregator]]. First each item T is mapped to
* A, then we reduce with a semigroup of A, then finally we present the results as U. This could
* be more powerful and better optimized in some cases.
* @group transform
*/
def aggregate[A: ClassTag, U: ClassTag](aggregator: Aggregator[T, A, U])
: SCollection[U] = this.transform { in =>
val a = aggregator // defeat closure
in.map(a.prepare).sum(a.semigroup).map(a.present)
}
/**
* Filter the elements for which the given PartialFunction is defined, and then map.
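*
* A minimal sketch (assuming `p` is an `SCollection[Any]`; names are illustrative):
*
* {{{
* // Keep only the String elements and map them to their lengths
* val lengths: SCollection[Int] = p.collect { case s: String => s.length }
* }}}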
* @group transform
*/
def collect[U: ClassTag](pfn: PartialFunction[T, U]): SCollection[U] = this.transform {
_.filter(pfn.isDefinedAt).map(pfn)
}
/**
* Generic function to combine the elements using a custom set of aggregation functions. Turns
* an SCollection[T] into a result of type SCollection[C], for a "combined type" C. Note that T
* and C can be different -- for example, one might combine an SCollection of type Int into an
* SCollection of type Seq[Int]. Users provide three functions:
*
* - `createCombiner`, which turns a T into a C (e.g., creates a one-element list)
*
* - `mergeValue`, to merge a T into a C (e.g., adds it to the end of a list)
*
* - `mergeCombiners`, to combine two C's into a single one.
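*
* A minimal sketch (assuming `p` is an `SCollection[Int]`; names are illustrative):
*
* {{{
* // Combine Ints into a List[Int]
* val lists: SCollection[List[Int]] =
* p.combine((x: Int) => List(x))((c, x) => x :: c)((c1, c2) => c1 ::: c2)
* }}}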
* @group transform
*/
def combine[C: ClassTag](createCombiner: T => C)
(mergeValue: (C, T) => C)
(mergeCombiners: (C, C) => C): SCollection[C] =
this.pApply(Combine.globally(Functions.combineFn(createCombiner, mergeValue, mergeCombiners)))
/**
* Count the number of elements in the SCollection.
* @return a new SCollection with the count
* @group transform
*/
def count: SCollection[Long] = this.pApply(Count.globally[T]()).asInstanceOf[SCollection[Long]]
/**
* Count approximate number of distinct elements in the SCollection.
* @param sampleSize the number of entries in the statistical sample; the higher this number, the
* more accurate the estimate will be; should be `>= 16`
* @group transform
*/
def countApproxDistinct(sampleSize: Int): SCollection[Long] =
this.pApply(ApproximateUnique.globally[T](sampleSize)).asInstanceOf[SCollection[Long]]
/**
* Count approximate number of distinct elements in the SCollection.
* @param maximumEstimationError the maximum estimation error, which should be in the range
* `[0.01, 0.5]`
* @group transform
*/
def countApproxDistinct(maximumEstimationError: Double = 0.02): SCollection[Long] =
this
.pApply(ApproximateUnique.globally[T](maximumEstimationError))
.asInstanceOf[SCollection[Long]]
/**
* Count of each unique value in this SCollection as an SCollection of (value, count) pairs.
* @group transform
*/
def countByValue: SCollection[(T, Long)] = this.transform {
_.pApply(Count.perElement[T]()).map(kvToTuple).asInstanceOf[SCollection[(T, Long)]]
}
/**
* Return a new SCollection containing the distinct elements in this SCollection.
* @group transform
*/
def distinct: SCollection[T] = this.pApply(RemoveDuplicates.create[T]())
/**
* Return a new SCollection containing only the elements that satisfy a predicate.
* @group transform
*/
def filter(f: T => Boolean): SCollection[T] =
this.pApply(Filter.byPredicate(Functions.serializableFn(f.asInstanceOf[T => JBoolean])))
/**
* Return a new SCollection by first applying a function to all elements of
* this SCollection, and then flattening the results.
* @group transform
*/
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): SCollection[U] =
this.parDo(Functions.flatMapFn(f))
/**
* Aggregate the elements using a given associative function and a neutral "zero value". The
* function op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object
* allocation; however, it should not modify t2.
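*
* For example, a sketch summing an `SCollection[Int]` named `p` (illustrative):
*
* {{{
* val total: SCollection[Int] = p.fold(0)(_ + _)
* }}}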
* @group transform
*/
def fold(zeroValue: T)(op: (T, T) => T): SCollection[T] =
this.pApply(Combine.globally(Functions.aggregateFn(zeroValue)(op, op)))
/**
* Fold with [[com.twitter.algebird.Monoid Monoid]], which defines the associative function and
* "zero value" for T. This could be more powerful and better optimized in some cases.
* @group transform
*/
def fold(implicit mon: Monoid[T]): SCollection[T] =
this.pApply(Combine.globally(Functions.reduceFn(mon)))
/**
* Return an SCollection of grouped items. Each group consists of a key and a sequence of
* elements mapping to that key. The ordering of elements within each group is not guaranteed,
* and may even differ each time the resulting SCollection is evaluated.
*
* Note: This operation may be very expensive. If you are grouping in order to perform an
* aggregation (such as a sum or average) over each key, using
* [[PairSCollectionFunctions.aggregateByKey[U]* PairSCollectionFunctions.aggregateByKey]] or
* [[PairSCollectionFunctions.reduceByKey]] will provide much better performance.
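*
* A minimal sketch (assuming `words` is an `SCollection[String]` of non-empty words; names are
* illustrative):
*
* {{{
* // Group words by their first character
* val byInitial: SCollection[(Char, Iterable[String])] = words.groupBy(_.head)
* }}}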
* @group transform
*/
def groupBy[K: ClassTag](f: T => K): SCollection[(K, Iterable[T])] = this.transform {
_
.pApply(WithKeys.of(Functions.serializableFn(f))).setCoder(this.getKvCoder[K, T])
.pApply(GroupByKey.create[K, T]()).map(kvIterableToTuple)
}
/**
* Create tuples of the elements in this SCollection by applying `f`.
* @group transform
*/
// Scala lambda is simpler than transforms.WithKeys
def keyBy[K: ClassTag](f: T => K): SCollection[(K, T)] = this.map(v => (f(v), v))
/**
* Return a new SCollection by applying a function to all elements of this SCollection.
* @group transform
*/
def map[U: ClassTag](f: T => U): SCollection[U] = this.parDo(Functions.mapFn(f))
/**
* Return the max of this SCollection as defined by the implicit Ordering[T].
* @return a new SCollection with the maximum element
* @group transform
*/
// Scala lambda is simpler and more powerful than transforms.Max
def max(implicit ord: Ordering[T]): SCollection[T] = this.reduce(ord.max)
/**
* Return the mean of this SCollection as defined by the implicit Numeric[T].
* @return a new SCollection with the mean of elements
* @group transform
*/
def mean(implicit ev: Numeric[T]): SCollection[Double] = this.transform { in =>
val e = ev // defeat closure
in
.map(e.toDouble).asInstanceOf[SCollection[JDouble]]
.pApply(Mean.globally()).asInstanceOf[SCollection[Double]]
}
/**
* Return the min of this SCollection as defined by the implicit Ordering[T].
* @return a new SCollection with the minimum element
* @group transform
*/
// Scala lambda is simpler and more powerful than transforms.Min
def min(implicit ord: Ordering[T]): SCollection[T] = this.reduce(ord.min)
/**
* Compute the SCollection's data distribution using approximate `N`-tiles.
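*
* For example, a sketch computing approximate quartiles of an `SCollection[Int]` named `p`
* (illustrative):
*
* {{{
* // 5 quantiles: min, 25th, 50th, 75th percentiles and max
* val quartiles: SCollection[Iterable[Int]] = p.quantilesApprox(5)
* }}}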
* @return a new SCollection whose single value is an Iterable of the approximate `N`-tiles of
* the elements
* @group transform
*/
def quantilesApprox(numQuantiles: Int)
(implicit ord: Ordering[T]): SCollection[Iterable[T]] = this.transform {
_
.pApply(ApproximateQuantiles.globally(numQuantiles, ord))
.map(_.asInstanceOf[JIterable[T]].asScala)
}
/**
* Randomly splits this SCollection with the provided weights.
*
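* For example, a sketch splitting an `SCollection` named `p` into train and test sets
* (illustrative):
*
* {{{
* val Array(train, test) = p.randomSplit(Array(0.9, 0.1))
* }}}
*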
* @param weights weights for splits, will be normalized if they don't sum to 1
* @return split SCollections in an array
* @group transform
*/
def randomSplit(weights: Array[Double]): Array[SCollection[T]] = {
val sum = weights.sum
val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _)
val m = TreeMap(normalizedCumWeights.zipWithIndex: _*) // Map[lower bound, split]
val sides = (1 until weights.length).map(_ => SideOutput[T]())
val (head, tail) = this
.withSideOutputs(sides: _*)
.flatMap { (x, c) =>
val i = m.to(scala.util.Random.nextDouble()).last._2
if (i == 0) {
Seq(x) // Main output
} else {
c.output(sides(i - 1), x) // Side output
Nil
}
}
(head +: sides.map(tail(_))).toArray
}
/**
* Reduce the elements of this SCollection using the specified commutative and associative
* binary operator.
* @group transform
*/
def reduce(op: (T, T) => T): SCollection[T] =
this.pApply(Combine.globally(Functions.reduceFn(op)))
/**
* Return a sampled subset of this SCollection.
* @return a new SCollection whose single value is an Iterable of the
* samples
* @group transform
*/
def sample(sampleSize: Int): SCollection[Iterable[T]] = this.transform {
_.pApply(Sample.fixedSizeGlobally(sampleSize)).map(_.asScala)
}
/**
* Return a sampled subset of this SCollection.
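*
* A minimal sketch (assuming `p` is an `SCollection`; names are illustrative):
*
* {{{
* // Keep roughly 1% of elements, without replacement
* val sampled = p.sample(withReplacement = false, fraction = 0.01)
* }}}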
* @group transform
*/
def sample(withReplacement: Boolean, fraction: Double): SCollection[T] = {
if (withReplacement) {
this.parDo(new PoissonSampler[T](fraction))
} else {
this.parDo(new BernoulliSampler[T](fraction))
}
}
/**
* Return an SCollection with the elements from `this` that are not in `that`.
* @group transform
*/
def subtract(that: SCollection[T]): SCollection[T] = this.transform {
_.map((_, 1)).cogroup(that.map((_, 1))).flatMap { t =>
if (t._2._1.nonEmpty && t._2._2.isEmpty) Seq(t._1) else Seq.empty
}
}
/**
* Reduce with [[com.twitter.algebird.Semigroup Semigroup]]. This could be more powerful and
* better optimized in some cases.
* @group transform
*/
def sum(implicit sg: Semigroup[T]): SCollection[T] =
this.pApply(Combine.globally(Functions.reduceFn(sg)))
/**
* Return a sampled subset of any `num` elements of the SCollection.
* @group transform
*/
def take(num: Long): SCollection[T] = this.pApply(Sample.any(num))
/**
* Return the top k (largest) elements from this SCollection as defined by the specified
* implicit Ordering[T].
* @return a new SCollection whose single value is an Iterable of the top k
* @group transform
*/
def top(num: Int)(implicit ord: Ordering[T]): SCollection[Iterable[T]] = this.transform {
_.pApply(Top.of(num, ord)).map(_.asInstanceOf[JIterable[T]].asScala)
}
// =======================================================================
// Hash operations
// =======================================================================
/**
* Return the cross product with another SCollection by replicating `that` to all workers. The
* right side should be tiny and fit in memory.
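*
* A minimal sketch (assuming `lines: SCollection[String]` and a small
* `params: SCollection[Double]`; names are illustrative):
*
* {{{
* val pairs: SCollection[(String, Double)] = lines.cross(params)
* }}}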
* @group hash
*/
def cross[U: ClassTag](that: SCollection[U]): SCollection[(T, U)] = this.transform { in =>
val side = that.asListSideInput
in
.withSideInputs(side)
.flatMap((t, s) => s(side).map((t, _)))
.toSCollection
}
/**
* Look up values in a SCollection[(T, V)] for each element T in this SCollection by replicating
* `that` to all workers. The right side should be tiny and fit in memory.
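*
* A minimal sketch (assuming `ids: SCollection[String]` and a small
* `names: SCollection[(String, String)]`; names are illustrative):
*
* {{{
* val looked: SCollection[(String, Iterable[String])] = ids.hashLookup(names)
* }}}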
* @group hash
*/
def hashLookup[V: ClassTag](that: SCollection[(T, V)])
: SCollection[(T, Iterable[V])] = this.transform { in =>
val side = that.asMultiMapSideInput
in
.withSideInputs(side)
.map((t, s) => (t, s(side).getOrElse(t, Iterable())))
.toSCollection
}
/**
* Print the contents of an SCollection to `out()`.
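*
* For example (the prefix is illustrative):
*
* {{{
* p.debug(prefix = "DEBUG: ")
* }}}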
* @group debug
*/
def debug(out: () => PrintStream = () => Console.out, prefix: String = ""): SCollection[T] =
this.filter(e => {
// scalastyle:off regex
out().println(s"""$prefix${e.toString}""")
// scalastyle:on regex
// filter that never removes
true
})
// =======================================================================
// Accumulators
// =======================================================================
/**
* Convert this SCollection to an [[SCollectionWithAccumulator]] with one or more
* [[Accumulator]]s, similar to Hadoop counters. Call
* [[SCollectionWithAccumulator.toSCollection]] when done with accumulators.
*
* Note that each accumulator may be used in a single scope only.
*
* Create accumulators with [[ScioContext.maxAccumulator]],
* [[ScioContext.minAccumulator]] or [[ScioContext.sumAccumulator]]. For example:
*
* {{{
* val maxLineLength = sc.maxAccumulator[Int]("maxLineLength")
* val minLineLength = sc.minAccumulator[Int]("minLineLength")
* val emptyLines = sc.sumAccumulator[Long]("emptyLines")
*
* val p: SCollection[String] = // ...
* p
* .withAccumulator(maxLineLength, minLineLength, emptyLines)
* .filter { (l, c) =>
* val t = l.trim
* c.addValue(maxLineLength, t.length).addValue(minLineLength, t.length)
* val b = t.isEmpty
* if (b) c.addValue(emptyLines, 1L)
* !b
* }
* .toSCollection
* }}}
*/
def withAccumulator(acc: Accumulator[_]*): SCollectionWithAccumulator[T] = {
require(acc.forall(this.context.containsAccumulator), "Unregistered accumulator supplied")
new SCollectionWithAccumulator(internal, context, acc)
}
// =======================================================================
// Side input operations
// =======================================================================
/**
* Convert this SCollection of a single value per window to a SideInput, to be used with
* [[SCollection.withSideInputs]].
* @group side
*/
def asSingletonSideInput: SideInput[T] =
new SingletonSideInput[T](this.applyInternal(View.asSingleton()))
/**
* Convert this SCollection to a SideInput, mapping each window to a List, to be used with
* [[SCollection.withSideInputs]].
* @group side
*/
def asListSideInput: SideInput[List[T]] =
new ListSideInput[T](this.applyInternal(View.asList()))
/**
* Convert this SCollection to a SideInput, mapping each window to an Iterable, to be used with
* [[SCollection.withSideInputs]].
*
* The values of the Iterable for a window are not required to fit in memory, but they may also
* not be effectively cached. If it is known that every window fits in memory, and stronger
* caching is desired, use [[asListSideInput]].
* @group side
*/
def asIterableSideInput: SideInput[Iterable[T]] =
new IterableSideInput[T](this.applyInternal(View.asIterable()))
/**
* Convert this SCollection to an [[SCollectionWithSideInput]] with one or more [[SideInput]]s,
* similar to Spark broadcast variables. Call [[SCollectionWithSideInput.toSCollection]] when
* done with side inputs.
*
* Note that the side inputs should be tiny and fit in memory.
*
* {{{
* val s1: SCollection[Int] = // ...
* val s2: SCollection[String] = // ...
* val s3: SCollection[(String, Double)] = // ...
*
* // Prepare side inputs
* val side1 = s1.asSingletonSideInput
* val side2 = s2.asIterableSideInput
* val side3 = s3.asMapSideInput
*
* val p: SCollection[MyRecord] = // ...
* p.withSideInputs(side1, side2, side3).map { (x, s) =>
* // Extract side inputs from context
* val s1: Int = s(side1)
* val s2: Iterable[String] = s(side2)
* val s3: Map[String, Iterable[Double]] = s(side3)
* // ...
* }
* }}}
* @group side
*/
def withSideInputs(sides: SideInput[_]*): SCollectionWithSideInput[T] =
new SCollectionWithSideInput[T](internal, context, sides)
// =======================================================================
// Side output operations
// =======================================================================
/**
* Convert this SCollection to an [[SCollectionWithSideOutput]] with one or more
* [[SideOutput]]s, so that a single transform can write to multiple destinations.
*
* {{{
* // Prepare side outputs
* val side1 = SideOutput[String]()
* val side2 = SideOutput[Int]()
*
* val p: SCollection[MyRecord] = // ...
* p.withSideOutputs(side1, side2).map { (x, s) =>
* // Write to side outputs via context
* s.output(side1, "word").output(side2, 1)
* // ...
* }
* }}}
* @group side
*/
def withSideOutputs(sides: SideOutput[_]*): SCollectionWithSideOutput[T] =
new SCollectionWithSideOutput[T](internal, context, sides)
// =======================================================================
// Windowing operations
// =======================================================================
/**
* Convert this SCollection to a [[WindowedSCollection]].
* @group window
*/
def toWindowed: WindowedSCollection[T] = new WindowedSCollection[T](internal, context)
/**
* Window values with the given function.
* @group window
*/
def withWindowFn[W <: BoundedWindow](fn: WindowFn[AnyRef, W],
options: WindowOptions[W] = WindowOptions())
: SCollection[T] = {
var transform = Window.into(fn).asInstanceOf[Window.Bound[T]]
if (options.trigger != null) {
transform = transform.triggering(options.trigger)
}
if (options.accumulationMode != null) {
if (options.accumulationMode == AccumulationMode.ACCUMULATING_FIRED_PANES) {
transform = transform.accumulatingFiredPanes()
} else if (options.accumulationMode == AccumulationMode.DISCARDING_FIRED_PANES) {
transform = transform.discardingFiredPanes()
} else {
throw new RuntimeException(s"Unsupported accumulation mode ${options.accumulationMode}")
}
}
if (options.allowedLateness != null) {
transform = if (options.closingBehavior == null) {
transform.withAllowedLateness(options.allowedLateness)
} else {
transform.withAllowedLateness(options.allowedLateness, options.closingBehavior)
}
}
if (options.outputTimeFn != null) {
transform = transform.withOutputTimeFn(options.outputTimeFn)
}
this.pApply(transform)
}
/**
* Window values into fixed windows.
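*
* For example, a sketch assigning elements to 10 minute fixed windows (illustrative):
*
* {{{
* val windowed = p.withFixedWindows(Duration.standardMinutes(10))
* }}}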
* @group window
*/
def withFixedWindows(duration: Duration,
offset: Duration = Duration.ZERO,
options: WindowOptions[IntervalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(FixedWindows.of(duration).withOffset(offset), options)
/**
* Window values into sliding windows.
* @group window
*/
def withSlidingWindows(size: Duration,
period: Duration = Duration.millis(1),
offset: Duration = Duration.ZERO,
options: WindowOptions[IntervalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(SlidingWindows.of(size).every(period).withOffset(offset), options)
/**
* Window values based on sessions.
* @group window
*/
def withSessionWindows(gapDuration: Duration,
options: WindowOptions[IntervalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(Sessions.withGapDuration(gapDuration), options)
/**
* Group values into a single global window.
* @group window
*/
def withGlobalWindow(options: WindowOptions[GlobalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(new GlobalWindows(), options)
/**
* Window values into calendar-year windows.
* @group window
*/
def windowByYears(number: Int,
options: WindowOptions[IntervalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(CalendarWindows.years(number), options)
/**
* Window values into calendar-month windows.
* @group window
*/
def windowByMonths(number: Int,
options: WindowOptions[IntervalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(CalendarWindows.months(number), options)
/**
* Window values into calendar-week windows.
* @group window
*/
def windowByWeeks(number: Int, startDayOfWeek: Int,
options: WindowOptions[IntervalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(CalendarWindows.weeks(number, startDayOfWeek), options)
/**
* Window values into calendar-day windows.
* @group window
*/
def windowByDays(number: Int,
options: WindowOptions[IntervalWindow] = WindowOptions()): SCollection[T] =
this.withWindowFn(CalendarWindows.days(number), options)
/**
* Convert values into pairs of (value, pane).
* @group window
*/
def withPaneInfo: SCollection[(T, PaneInfo)] = this.parDo(new DoFn[T, (T, PaneInfo)] {
override def processElement(c: DoFn[T, (T, PaneInfo)]#ProcessContext): Unit =
c.output((c.element(), c.pane()))
})
/**
* Convert values into pairs of (value, timestamp).
* @group window
*/
def withTimestamp: SCollection[(T, Instant)] = this.parDo(new DoFn[T, (T, Instant)] {
override def processElement(c: DoFn[T, (T, Instant)]#ProcessContext): Unit =
c.output((c.element(), c.timestamp()))
})
/**
* Convert values into pairs of (value, window).
* @group window
*/
def withWindow: SCollection[(T, BoundedWindow)] = this.parDo(
new DoFn[T, (T, BoundedWindow)] with DoFn.RequiresWindowAccess {
override def processElement(c: DoFn[T, (T, BoundedWindow)]#ProcessContext): Unit =
c.output((c.element(), c.window()))
})
/**
* Assign timestamps to values, with an optional allowed timestamp skew.
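*
* For example, a sketch assuming elements of a case class with an `eventTime: Instant` field
* (illustrative):
*
* {{{
* val stamped = p.timestampBy(_.eventTime)
* }}}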
* @group window
*/
def timestampBy(f: T => Instant, allowedTimestampSkew: Duration = Duration.ZERO): SCollection[T] =
this.applyTransform(WithTimestamps.of(Functions.serializableFn(f))
.withAllowedTimestampSkew(allowedTimestampSkew))
// =======================================================================
// Write operations
// =======================================================================
/**
* Extract data from this SCollection as a Future. The Future will be completed once the
* pipeline completes successfully.
* @group output
*/
def materialize: Future[Tap[T]] = {
val filename = "scio-materialize-" + UUID.randomUUID().toString
val tmpDir = if (context.options.getTempLocation == null) {
sys.props("java.io.tmpdir")
} else {
context.options.getTempLocation
}
val path = tmpDir + (if (tmpDir.endsWith("/")) "" else "/") + filename
saveAsObjectFile(path)
}
/**
* Save this SCollection as an object file using default serialization.
* @group output
*/
def saveAsObjectFile(path: String, numShards: Int = 0, suffix: String = ".obj",
metadata: Map[String, AnyRef] = Map.empty)
: Future[Tap[T]] = {
if (context.isTest) {
context.testOut(ObjectFileIO(path))(this)
saveAsInMemoryTap
} else {
val elemCoder = this.getCoder[T]
this
.parDo(new DoFn[T, GenericRecord] {
override def processElement(c: DoFn[T, GenericRecord]#ProcessContext): Unit = {
c.output(AvroBytesUtil.encode(elemCoder, c.element()))
}
})
.saveAsAvroFile(path, numShards, AvroBytesUtil.schema, suffix, metadata = metadata)
context.makeFuture(ObjectFileTap[T](path + "/part-*"))
}
}
private def pathWithShards(path: String) = {
if (ScioUtil.isLocalRunner(this.context.pipeline.getOptions) &&
ScioUtil.isLocalUri(new URI(path))) {
// Create output directory when running locally with local file system
val f = new File(path)
if (f.exists()) {
throw new RuntimeException(s"Output directory $path already exists")
}
f.mkdirs()
}
path.replaceAll("\\/+$", "") + "/part"
}
private def avroOut(path: String, numShards: Int,
codec: CodecFactory,
metadata: Map[String, AnyRef]) =
gio.PatchedAvroIO.Write.to(pathWithShards(path))
.withNumShards(numShards)
.withSuffix(".avro")
.withCodec(codec)
.withMetadata(metadata.asJava)
private def textOut(path: String, suffix: String, numShards: Int) =
gio.TextIO.Write.to(pathWithShards(path)).withNumShards(numShards).withSuffix(suffix)
private def tableRowJsonOut(path: String, numShards: Int) =
textOut(path, ".json", numShards).withCoder(TableRowJsonCoder.of())
/**
* Save this SCollection as an Avro file.
* @param schema must not be null if T is of type GenericRecord.
* @group output
*/
def saveAsAvroFile(path: String,
numShards: Int = 0,
schema: Schema = null,
suffix: String = "",
codec: CodecFactory = CodecFactory.deflateCodec(6),
metadata: Map[String, AnyRef] = Map.empty)
: Future[Tap[T]] =
if (context.isTest) {
context.testOut(AvroIO(path))(this)
saveAsInMemoryTap
} else {
val transform = avroOut(path, numShards, codec, metadata).withSuffix(suffix + ".avro")
val cls = ScioUtil.classOf[T]
if (classOf[SpecificRecordBase] isAssignableFrom cls) {
this.applyInternal(transform.withSchema(cls))
} else {
this.applyInternal(transform.withSchema(schema).asInstanceOf[PatchedAvroIO.Write.Bound[T]])
}
context.makeFuture(AvroTap(path + "/part-*", schema))
}
/**
* Save this SCollection as a Protobuf file.
* @group output
*/
def saveAsProtobufFile(path: String, numShards: Int = 0)
(implicit ev: T <:< Message): Future[Tap[T]] = {
import me.lyh.protobuf.generic
val schema = generic.Schema.of[Message](ct.asInstanceOf[ClassTag[Message]]).toJson
val metadata = Map("protobuf.generic.schema" -> schema)
this.saveAsObjectFile(path, numShards, ".protobuf", metadata)
}
/**
* Save this SCollection as a BigQuery table. Note that elements must be of type TableRow.
* @group output
*/
def saveAsBigQuery(table: TableReference, schema: TableSchema,
writeDisposition: WriteDisposition,
createDisposition: CreateDisposition)
(implicit ev: T <:< TableRow): Future[Tap[TableRow]] = {
val tableSpec = gio.BigQueryIO.toTableSpec(table)
if (context.isTest) {
context.testOut(BigQueryIO(tableSpec))(this.asInstanceOf[SCollection[TableRow]])
if (writeDisposition == WriteDisposition.WRITE_APPEND) {
Future.failed(new NotImplementedError("BigQuery future with append not implemented"))
} else {
saveAsInMemoryTap.asInstanceOf[Future[Tap[TableRow]]]
}
} else {
var transform = gio.BigQueryIO.Write.to(table)
if (schema != null) transform = transform.withSchema(schema)
if (createDisposition != null) transform = transform.withCreateDisposition(createDisposition)
if (writeDisposition != null) transform = transform.withWriteDisposition(writeDisposition)
this.asInstanceOf[SCollection[TableRow]].applyInternal(transform)
if (writeDisposition == WriteDisposition.WRITE_APPEND) {
Future.failed(new NotImplementedError("BigQuery future with append not implemented"))
} else {
context.makeFuture(BigQueryTap(table))
}
}
}
/**
* Save this SCollection as a BigQuery table. Note that elements must be of type TableRow.
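*
* A minimal sketch (assuming `rows: SCollection[TableRow]`; the table spec and schema are
* illustrative):
*
* {{{
* val mySchema: TableSchema = // ...
* rows.saveAsBigQuery("my-project:my_dataset.my_table", schema = mySchema)
* }}}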
* @group output
*/
def saveAsBigQuery(tableSpec: String, schema: TableSchema = null,
writeDisposition: WriteDisposition = null,
createDisposition: CreateDisposition = null)
(implicit ev: T <:< TableRow): Future[Tap[TableRow]] =
saveAsBigQuery(
gio.BigQueryIO.parseTableSpec(tableSpec), schema, writeDisposition, createDisposition)
/**
* Save this SCollection as a Datastore dataset. Note that elements must be of type Entity.
* @group output
*/
def saveAsDatastore(projectId: String)(implicit ev: T <:< Entity): Future[Tap[Entity]] = {
if (context.isTest) {
context.testOut(DatastoreIO(projectId))(this.asInstanceOf[SCollection[Entity]])
} else {
this.asInstanceOf[SCollection[Entity]].applyInternal(
gio.datastore.DatastoreIO.v1.write.withProjectId(projectId))
}
Future.failed(new NotImplementedError("Datastore future not implemented"))
}
/**
* Save this SCollection as a Pub/Sub topic.
* @group output
*/
def saveAsPubsub(topic: String)(implicit ev: T <:< String): Future[Tap[String]] = {
if (context.isTest) {
context.testOut(PubsubIO(topic))(this.asInstanceOf[SCollection[String]])
} else {
this.asInstanceOf[SCollection[String]].applyInternal(gio.PubsubIO.Write.topic(topic))
}
Future.failed(new NotImplementedError("Pubsub future not implemented"))
}
/**
* Save this SCollection as a JSON text file. Note that elements must be of type TableRow.
* @group output
*/
def saveAsTableRowJsonFile(path: String, numShards: Int = 0)
(implicit ev: T <:< TableRow): Future[Tap[TableRow]] =
if (context.isTest) {
context.testOut(TableRowJsonIO(path))(this.asInstanceOf[SCollection[TableRow]])
saveAsInMemoryTap.asInstanceOf[Future[Tap[TableRow]]]
} else {
this.asInstanceOf[SCollection[TableRow]].applyInternal(tableRowJsonOut(path, numShards))
context.makeFuture(TableRowJsonTap(path + "/part-*"))
}
/**
* Save this SCollection as a text file. Note that elements must be of type String.
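*
* For example (the output path is illustrative):
*
* {{{
* p.saveAsTextFile("gs://my-bucket/output")
* }}}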
* @group output
*/
def saveAsTextFile(path: String,
suffix: String = ".txt", numShards: Int = 0): Future[Tap[String]] = {
val s = if (classOf[String] isAssignableFrom this.ct.runtimeClass) {
this.asInstanceOf[SCollection[String]]
} else {
this.map(_.toString)
}
if (context.isTest) {
context.testOut(TextIO(path))(s)
s.saveAsInMemoryTap
} else {
s.applyInternal(textOut(path, suffix, numShards))
context.makeFuture(TextTap(path + "/part-*"))
}
}
private[scio] def saveAsInMemoryTap: Future[Tap[T]] = {
val tap = new InMemoryTap[T]
this.applyInternal(gio.Write.to(new InMemorySink[T](tap.id)))
context.makeFuture(tap)
}
}
// scalastyle:on number.of.methods
private[scio] class SCollectionImpl[T: ClassTag](val internal: PCollection[T],
val context: ScioContext)
extends SCollection[T] {
protected val ct: ClassTag[T] = implicitly[ClassTag[T]]
}
// scalastyle:on file.size.limit