// Source listing of com.spotify.scio.values.PairSCollectionFunctions.scala (scio-core_2.11).
// Scio - A Scala API for Apache Beam and Google Cloud Dataflow.
/*
 * Copyright 2016 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.values

import java.lang.{Iterable => JIterable, Long => JLong}
import java.util.{Map => JMap}

import com.google.cloud.dataflow.sdk.transforms._
import com.google.cloud.dataflow.sdk.values.{KV, PCollection, PCollectionView}
import com.spotify.scio.ScioContext
import com.spotify.scio.util._
import com.spotify.scio.util.random.{BernoulliValueSampler, PoissonValueSampler}
import com.twitter.algebird.{Aggregator, _}

import scala.reflect.ClassTag

// scalastyle:off number.of.methods
/**
 * Extra functions available on SCollections of (key, value) pairs through an implicit conversion.
 *
 * @groupname cogroup CoGroup Operations
 * @groupname join Join Operations
 * @groupname per_key Per Key Aggregations
 * @groupname transform Transformations
 * @groupname Ungrouped Other Members
 */
class PairSCollectionFunctions[K, V](val self: SCollection[(K, V)])
                                    (implicit ctKey: ClassTag[K], ctValue: ClassTag[V]) {

  import TupleFunctions._

  // Context of the wrapped collection; used below to wrap raw PCollections back into SCollections.
  private val context: ScioContext = self.context

  // ParDo that turns every Scala (key, value) tuple into an SDK KV.
  private def toKvTransform = {
    val tupleToKv = Functions.mapFn[(K, V), KV[K, V]](pair => KV.of(pair._1, pair._2))
    ParDo.of(tupleToKv)
  }

  private[scio] def toKV: SCollection[KV[K, V]] = {
    // Convert tuples to KVs, then attach the KV coder before wrapping the result.
    val converted = self.applyInternal(toKvTransform)
    val withCoder = converted.setCoder(self.getKvCoder[K, V])
    context.wrap(withCoder)
  }

  private[values] def applyPerKey[UI: ClassTag, UO: ClassTag]
  (t: PTransform[PCollection[KV[K, V]], PCollection[KV[K, UI]]], f: KV[K, UI] => (K, UO))
  : SCollection[(K, UO)] = {
    // One PTransform that chains: tuple -> KV conversion, the per-key transform `t`, and the
    // KV -> tuple conversion `f`.
    val composite = new PTransform[PCollection[(K, V)], PCollection[(K, UO)]]() {
      override def apply(input: PCollection[(K, V)]): PCollection[(K, UO)] = {
        val kvs = input.apply("TupleToKv", toKvTransform).setCoder(self.getKvCoder[K, V])
        val perKey = kvs.apply(t)
        val backToTuple = ParDo.of(Functions.mapFn[KV[K, UI], (K, UO)](f))
        perKey.apply("KvToTuple", backToTuple).setCoder(self.getCoder[(K, UO)])
      }
    }
    val result = self.applyInternal(composite)
    context.wrap(result)
  }

  /**
   * Convert this SCollection to an [[SCollectionWithHotKeyFanout]] that uses an intermediate node
   * to combine "hot" keys partially before performing the full combine.
   * @param hotKeyFanout a function from keys to an integer N, where the key will be spread among
   * N intermediate nodes for partial combining. If N is less than or equal to 1, this key will
   * not be sent through an intermediate node.
   */
  def withHotKeyFanout(hotKeyFanout: K => Int): SCollectionWithHotKeyFanout[K, V] = {
    // Left carries the per-key fanout function.
    new SCollectionWithHotKeyFanout(this, Left(hotKeyFanout))
  }

  /**
   * Convert this SCollection to an [[SCollectionWithHotKeyFanout]] that uses an intermediate node
   * to combine "hot" keys partially before performing the full combine.
   * @param hotKeyFanout constant value for every key
   */
  def withHotKeyFanout(hotKeyFanout: Int): SCollectionWithHotKeyFanout[K, V] = {
    // Right carries the constant fanout applied to every key.
    new SCollectionWithHotKeyFanout(this, Right(hotKeyFanout))
  }

  // =======================================================================
  // CoGroups
  // =======================================================================

  /**
   * For each key k in `this` or `that`, return a resulting SCollection that contains a tuple with
   * the list of values for that key in `this` as well as `that`.
   * @group cogroup
   */
  def cogroup[W: ClassTag](that: SCollection[(K, W)])
  : SCollection[(K, (Iterable[V], Iterable[W]))] = {
    // Two-way co-group, delegated to MultiJoin.
    MultiJoin.cogroup(self, that)
  }

  /**
   * For each key k in `this` or `that1` or `that2`, return a resulting SCollection that contains
   * a tuple with the list of values for that key in `this`, `that1` and `that2`.
   * @group cogroup
   */
  def cogroup[W1: ClassTag, W2: ClassTag]
  (that1: SCollection[(K, W1)], that2: SCollection[(K, W2)])
  : SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2]))] = {
    // Three-way co-group, delegated to MultiJoin.
    MultiJoin.cogroup(self, that1, that2)
  }

  /**
   * For each key k in `this` or `that1` or `that2` or `that3`, return a resulting SCollection
   * that contains a tuple with the list of values for that key in `this`, `that1`, `that2` and
   * `that3`.
   * @group cogroup
   */
  def cogroup[W1: ClassTag, W2: ClassTag, W3: ClassTag]
  (that1: SCollection[(K, W1)], that2: SCollection[(K, W2)], that3: SCollection[(K, W3)])
  : SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))] = {
    // Four-way co-group, delegated to MultiJoin.
    MultiJoin.cogroup(self, that1, that2, that3)
  }

  /**
   * Alias for cogroup.
   * @group cogroup
   */
  def groupWith[W: ClassTag](that: SCollection[(K, W)])
  : SCollection[(K, (Iterable[V], Iterable[W]))] = {
    // Pure alias: forwards to the two-way cogroup.
    cogroup(that)
  }

  /**
   * Alias for cogroup.
   * @group cogroup
   */
  def groupWith[W1: ClassTag, W2: ClassTag]
  (that1: SCollection[(K, W1)], that2: SCollection[(K, W2)])
  : SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2]))] = {
    // Pure alias: forwards to the three-way cogroup.
    cogroup(that1, that2)
  }

  /**
   * Alias for cogroup.
   * @group cogroup
   */
  def groupWith[W1: ClassTag, W2: ClassTag, W3: ClassTag]
  (that1: SCollection[(K, W1)], that2: SCollection[(K, W2)], that3: SCollection[(K, W3)])
  : SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))] = {
    // Pure alias: forwards to the four-way cogroup.
    cogroup(that1, that2, that3)
  }

  // =======================================================================
  // Joins
  // =======================================================================

  /**
   * Perform a full outer join of `this` and `that`. For each element (k, v) in `this`, the
   * resulting SCollection will either contain all pairs (k, (Some(v), Some(w))) for w in `that`,
   * or the pair (k, (Some(v), None)) if no elements in `that` have key k. Similarly, for each
   * element (k, w) in `that`, the resulting SCollection will either contain all pairs (k,
   * (Some(v), Some(w))) for v in `this`, or the pair (k, (None, Some(w))) if no elements in
   * `this` have key k.
   * @group join
   */
  def fullOuterJoin[W: ClassTag](that: SCollection[(K, W)])
  : SCollection[(K, (Option[V], Option[W]))] = {
    // Both sides are optional in the result; delegated to MultiJoin.
    MultiJoin.outer(self, that)
  }

  /**
   * Return an SCollection containing all pairs of elements with matching keys in `this` and
   * `that`. Each pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in
   * `this` and (k, v2) is in `that`.
   * @group join
   */
  def join[W: ClassTag](that: SCollection[(K, W)]): SCollection[(K, (V, W))] = {
    // Inner join, delegated to MultiJoin.
    MultiJoin.apply(self, that)
  }

  /**
   * Perform a left outer join of `this` and `that`. For each element (k, v) in `this`, the
   * resulting SCollection will either contain all pairs (k, (v, Some(w))) for w in `that`, or the
   * pair (k, (v, None)) if no elements in `that` have key k.
   * @group join
   */
  def leftOuterJoin[W: ClassTag](that: SCollection[(K, W)]): SCollection[(K, (V, Option[W]))] = {
    // Only the right side is optional in the result; delegated to MultiJoin.
    MultiJoin.left(self, that)
  }

  /**
   * Perform a right outer join of `this` and `that`. For each element (k, w) in `that`, the
   * resulting SCollection will either contain all pairs (k, (Some(v), w)) for v in `this`, or the
   * pair (k, (None, w)) if no elements in `this` have key k.
   * @group join
   */
  def rightOuterJoin[W: ClassTag](that: SCollection[(K, W)])
  : SCollection[(K, (Option[V], W))] = self.transform { left =>
    // A right outer join is a left outer join driven by `that`, with the value pair flipped back.
    val rightDriven = MultiJoin.left(that, left)
    rightDriven.mapValues(pair => (pair._2, pair._1))
  }

  /* Hash operations */

  /**
   * Perform an inner join by replicating `that` to all workers. The right side should be tiny and
   * fit in memory.
   *
   * @group join
   */
  def hashJoin[W: ClassTag](that: SCollection[(K, W)])
  : SCollection[(K, (V, W))] = self.transform { left =>
    // The right side becomes a multi-map side input, looked up per left element.
    val rightByKey = that.asMultiMapSideInput
    val joined = left.withSideInputs(rightByKey).flatMap[(K, (V, W))] { (pair, ctx) =>
      val key = pair._1
      val leftValue = pair._2
      // Keys missing on the right produce no output (inner join).
      val matches = ctx(rightByKey).getOrElse(key, Iterable()).toSeq
      matches.map(rightValue => (key, (leftValue, rightValue)))
    }
    joined.toSCollection
  }

  /**
   * Perform a left outer join by replicating `that` to all workers. The right side should be tiny
   * and fit in memory.
   *
   * @group join
   */
  def hashLeftJoin[W: ClassTag](that: SCollection[(K, W)])
  : SCollection[(K, (V, Option[W]))] = self.transform { in =>
    // The right side becomes a multi-map side input, looked up per left element.
    val side = that.asMultiMapSideInput
    in.withSideInputs(side).flatMap[(K, (V, Option[W]))] { (kv, s) =>
      val (k, v) = kv
      // Single map lookup instead of `contains` followed by `apply`.
      s(side).get(k) match {
        // Key present on the right: one output per right-hand value.
        case Some(ws) => ws.map(w => (k, (v, Some(w))))
        // Key absent on the right: keep the left element, paired with None.
        case None => Seq((k, (v, None)))
      }
    }.toSCollection
  }

  /**
   * N to 1 skewproof flavor of [[PairSCollectionFunctions.join()]].
   *
   * Perform a skewed join where some keys on the left hand may be hot, i.e. appear more than
   * `hotKeyThreshold` times. Frequency of a key is estimated with `1 - delta` probability, and the
   * estimate is within `eps * N` of the true frequency.
   * `true frequency <= estimate <= true frequency + eps * N`, where N is the total size of
   * the left hand side stream so far.
   *
   * @note Make sure to import [[com.twitter.algebird.CMSHasherImplicits]] before using this join
   * @example {{{
   * // Implicits that enable CMS hashing
   * import com.twitter.algebird.CMSHasherImplicits._
   *
   * val p = logs.skewedJoin(logMetadata, hotKeyThreshold = 8500, eps=0.0005, seed=1)
   * }}}
   *
   * Read more about CMS -> [[com.twitter.algebird.CMSMonoid]]
   * @group join
   * @param hotKeyThreshold a key whose estimated frequency is at least `hotKeyThreshold` is
   *                        considered hot. Some runners have an inefficient GroupByKey
   *                        implementation for groups with more than 10K values, so it is
   *                        recommended to set `hotKeyThreshold` below 10K; keep the upper
   *                        estimation error in mind.
   * @param eps One-sided error bound on the error of each point query, i.e. frequency estimate.
   *            Must lie in (0, 1).
   * @param seed A seed to initialize the random number generator used to create the pairwise
   *             independent hash functions.
   * @param delta A bound on the probability that a query estimate does not lie within some small
   *              interval (an interval that depends on `eps`) around the truth. Must lie in (0, 1).
   * @param sampleFraction left side sample fraction. Default is `1.0` - no sampling.
   * @param withReplacement whether to use sampling with replacement, see [[SCollection.sample()]]
   */
  def skewedJoin[W: ClassTag](that: SCollection[(K, W)],
                              hotKeyThreshold: Long,
                              eps: Double,
                              seed: Int,
                              delta: Double = 1E-10,
                              sampleFraction: Double = 1.0,
                              withReplacement: Boolean = true)(implicit hasher: CMSHasher[K])
  : SCollection[(K, (V, W))] = {
    require(sampleFraction <= 1.0 && sampleFraction > 0.0,
      "Sample fraction has to be between (0.0, 1.0] - default is 1.0")

    import com.twitter.algebird._
    // Count-min sketch aggregator mapping each key to its number of values.
    val sketchAggregator = CMS.aggregator[K](eps, delta, seed)

    // Only left-hand keys feed the sketch. When a fraction below 1.0 is requested the sketch is
    // built from the sample only, and `hotKeyThreshold` is later compared against those sampled
    // counts without rescaling.
    val keysToCount =
      if (sampleFraction >= 1.0) self.keys
      else self.sample(withReplacement, sampleFraction).keys

    val keySketch = keysToCount.aggregate(sketchAggregator)
    self.skewedJoin(that, hotKeyThreshold, keySketch)
  }

  /**
   * N to 1 skewproof flavor of [[PairSCollectionFunctions.join()]].
   *
   * Perform a skewed join where some keys on the left hand may be hot, i.e. appear more than
   * `hotKeyThreshold` times. Frequency of a key is estimated with `1 - delta` probability, and the
   * estimate is within `eps * N` of the true frequency.
   * `true frequency <= estimate <= true frequency + eps * N`, where N is the total size of
   * the left hand side stream so far.
   *
   * @note Make sure to import [[com.twitter.algebird.CMSHasherImplicits]] before using this join
   * @example {{{
   * // Implicits that enable CMS hashing
   * import com.twitter.algebird.CMSHasherImplicits._
   *
   * val keyAggregator = CMS.aggregator[K](eps, delta, seed)
   * val hotKeyCMS = self.keys.aggregate(keyAggregator)
   * val p = logs.skewedJoin(logMetadata, hotKeyThreshold = 8500, cms=hotKeyCMS)
   * }}}
   *
   * Read more about CMS -> [[com.twitter.algebird.CMSMonoid]]
   * @group join
   * @param hotKeyThreshold key with `hotKeyThreshold` values will be considered hot. Some runners
   *                        have inefficient GroupByKey implementation for groups with more than 10K
   *                        values. Thus it is recommended to set `hotKeyThreshold` to below 10K,
   *                        keep upper estimation error in mind.
   * @param cms left hand side key [[com.twitter.algebird.CMSMonoid]]
   */
  def skewedJoin[W: ClassTag](that: SCollection[(K, W)],
                              hotKeyThreshold: Long,
                              cms: SCollection[CMS[K]])
  : SCollection[(K, (V, W))] = {
    val (hotSelf, chillSelf) = (SideOutput[(K, V)](), SideOutput[(K, V)]())
    // scalastyle:off line.size.limit
    // Use asIterableSideInput as workaround for:
    // http://stackoverflow.com/questions/37126729/ismsinkwriter-expects-keys-to-be-written-in-strictly-increasing-order
    // scalastyle:on line.size.limit
    val keyCMS = cms.asIterableSideInput

    // A key is hot when the first sketch in the side input estimates at least `hotKeyThreshold`
    // occurrences. With an empty side input every element goes to the chill output. The side
    // input is resolved once per element.
    val partitionedSelf = self
      .withSideInputs(keyCMS)
      .transformWithSideOutputs(Seq(hotSelf, chillSelf), (e, c) => {
        val hot = c(keyCMS).headOption.exists(_.frequency(e._1).estimate >= hotKeyThreshold)
        if (hot) hotSelf else chillSelf
      })

    // Split the right side with the same sketch and the same rule, so that matching keys end up
    // in matching partitions.
    val (hotThat, chillThat) = (SideOutput[(K, W)](), SideOutput[(K, W)]())
    val partitionedThat = that
      .withSideInputs(keyCMS)
      .transformWithSideOutputs(Seq(hotThat, chillThat), (e, c) => {
        val hot = c(keyCMS).headOption.exists(_.frequency(e._1).estimate >= hotKeyThreshold)
        if (hot) hotThat else chillThat
      })

    // Use hash join for hot keys
    val hotJoined = partitionedSelf(hotSelf).hashJoin(partitionedThat(hotThat))

    // Use regular join for the rest of the keys
    val chillJoined = partitionedSelf(chillSelf).join(partitionedThat(chillThat))

    hotJoined ++ chillJoined
  }

  // =======================================================================
  // Transformations
  // =======================================================================

  /**
   * Aggregate the values of each key, using given combine functions and a neutral "zero value".
   * This function can return a different result type, U, than the type of the values in this
   * SCollection, V. Thus, we need one operation for merging a V into a U and one operation for
   * merging two U's. To avoid memory allocation, both of these functions are allowed to modify
   * and return their first argument instead of creating a new U.
   * @group per_key
   */
  def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U,
                                                combOp: (U, U) => U): SCollection[(K, U)] = {
    // Build the combine function from the zero value and the two merge functions.
    val aggregation = Functions.aggregateFn(zeroValue)(seqOp, combOp)
    applyPerKey(Combine.perKey(aggregation), kvToTuple[K, U])
  }

  /**
   * Aggregate the values of each key with [[com.twitter.algebird.Aggregator Aggregator]]. First
   * each value V is mapped to A, then we reduce with a semigroup of A, then finally we present
   * the results as U. This could be more powerful and better optimized in some cases.
   * @group per_key
   */
  def aggregateByKey[A: ClassTag, U: ClassTag](aggregator: Aggregator[V, A, U])
  : SCollection[(K, U)] = self.transform { in =>
    val agg = aggregator  // defeat closure
    // prepare: V => A, reduce the A's per key with the semigroup, present: A => U
    val prepared = in.mapValues(agg.prepare)
    val reduced = prepared.sumByKey(agg.semigroup)
    reduced.mapValues(agg.present)
  }

  /**
   * For each key, compute the values' data distribution using approximate `N`-tiles.
   * @return a new SCollection whose values are Iterables of the approximate `N`-tiles of
   * the elements.
   * @group per_key
   */
  def approxQuantilesByKey(numQuantiles: Int)(implicit ord: Ordering[V])
  : SCollection[(K, Iterable[V])] = {
    // The implicit Ordering is passed to the SDK transform together with the quantile count.
    applyPerKey(ApproximateQuantiles.perKey(numQuantiles, ord), kvListToTuple[K, V])
  }

  /**
   * Generic function to combine the elements for each key using a custom set of aggregation
   * functions. Turns an SCollection[(K, V)] into a result of type SCollection[(K, C)], for a
   * "combined type" C. Note that V and C can be different -- for example, one might group an
   * SCollection of type (Int, Int) into an SCollection of type (Int, Seq[Int]). Users provide
   * three functions:
   *
   * - `createCombiner`, which turns a V into a C (e.g., creates a one-element list)
   *
   * - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list)
   *
   * - `mergeCombiners`, to combine two C's into a single one.
   * @group per_key
   */
  def combineByKey[C: ClassTag](createCombiner: V => C)
                               (mergeValue: (C, V) => C)
                               (mergeCombiners: (C, C) => C): SCollection[(K, C)] = {
    // Bundle the three user functions into a single combine function.
    val combiner = Functions.combineFn(createCombiner, mergeValue, mergeCombiners)
    applyPerKey(Combine.perKey(combiner), kvToTuple[K, C])
  }

  /**
   * Count approximate number of distinct values for each key in the SCollection.
   * @param sampleSize the number of entries in the statistical sample; the higher this number,
   * the more accurate the estimate will be; should be `>= 16`.
   * @group per_key
   */
  def countApproxDistinctByKey(sampleSize: Int): SCollection[(K, Long)] = {
    val estimator = ApproximateUnique.perKey[K, V](sampleSize)
    val boxedCounts = applyPerKey(estimator, kvToTuple[K, JLong])
    // The transform yields java.lang.Long counts; re-type them as Scala Long.
    boxedCounts.asInstanceOf[SCollection[(K, Long)]]
  }

  /**
   * Count approximate number of distinct values for each key in the SCollection.
   * @param maximumEstimationError the maximum estimation error, which should be in the range
   * `[0.01, 0.5]`.
   * @group per_key
   */
  def countApproxDistinctByKey(maximumEstimationError: Double = 0.02): SCollection[(K, Long)] = {
    val estimator = ApproximateUnique.perKey[K, V](maximumEstimationError)
    val boxedCounts = applyPerKey(estimator, kvToTuple[K, JLong])
    // The transform yields java.lang.Long counts; re-type them as Scala Long.
    boxedCounts.asInstanceOf[SCollection[(K, Long)]]
  }

  /**
   * Count the number of elements for each key.
   * @return a new SCollection of (key, count) pairs
   * @group per_key
   */
  def countByKey: SCollection[(K, Long)] = self.transform { in =>
    // Counting elements per key is the same as counting occurrences of each key.
    in.keys.countByValue
  }

  /**
   * Pass each value in the key-value pair SCollection through a flatMap function without changing
   * the keys.
   * @group transform
   */
  def flatMapValues[U: ClassTag](f: V => TraversableOnce[U]): SCollection[(K, U)] =
    self.flatMap { pair =>
      // Every value produced by `f` is re-paired with the original key.
      val key = pair._1
      f(pair._2).map(value => (key, value))
    }

  /**
   * Merge the values for each key using an associative function and a neutral "zero value" which
   * may be added to the result an arbitrary number of times, and must not change the result
   * (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.).
   * @group per_key
   */
  def foldByKey(zeroValue: V)(op: (V, V) => V): SCollection[(K, V)] = {
    // `op` is passed as both function arguments of the aggregate function.
    val folder = Functions.aggregateFn(zeroValue)(op, op)
    applyPerKey(Combine.perKey(folder), kvToTuple[K, V])
  }

  /**
   * Fold by key with [[com.twitter.algebird.Monoid Monoid]], which defines the associative
   * function and "zero value" for V. This could be more powerful and better optimized in some
   * cases.
   * @group per_key
   */
  def foldByKey(implicit mon: Monoid[V]): SCollection[(K, V)] = {
    // The Monoid supplies the combining operation for the per-key reduction.
    val reducer = Functions.reduceFn(mon)
    applyPerKey(Combine.perKey(reducer), kvToTuple[K, V])
  }

  /**
   * Group the values for each key in the SCollection into a single sequence. The ordering of
   * elements within each group is not guaranteed, and may even differ each time the resulting
   * SCollection is evaluated.
   *
   * Note: This operation may be very expensive. If you are grouping in order to perform an
   * aggregation (such as a sum or average) over each key, using
   * [[PairSCollectionFunctions.aggregateByKey[U]* PairSCollectionFunctions.aggregateByKey]] or
   * [[PairSCollectionFunctions.reduceByKey]] will provide much better performance.
   *
   * Note: As currently implemented, groupByKey must be able to hold all the key-value pairs for
   * any key in memory. If a key has too many values, it can result in an OutOfMemoryError.
   * @group per_key
   */
  def groupByKey: SCollection[(K, Iterable[V])] = {
    val grouping = GroupByKey.create[K, V]()
    applyPerKey(grouping, kvIterableToTuple[K, V])
  }

  /**
   * Return an SCollection with the pairs from `this` whose keys are in `that`.
   * @group per_key
   */
  def intersectByKey(that: SCollection[K]): SCollection[(K, V)] = self.transform { in =>
    // Pair every key of `that` with Unit so it can take part in a co-group.
    val markers = that.map(k => (k, ()))
    in.cogroup(markers).flatMap { grouped =>
      val key = grouped._1
      val ours = grouped._2._1
      val theirs = grouped._2._2
      // Keep our pairs only for keys that occur on both sides.
      if (ours.nonEmpty && theirs.nonEmpty) ours.map(v => (key, v)) else Seq.empty
    }
  }

  /**
   * Return an SCollection with the keys of each tuple.
   * @group transform
   */
  // Scala lambda is simpler and more powerful than transforms.Keys
  def keys: SCollection[K] = self.map(pair => pair._1)

  /**
   * Pass each value in the key-value pair SCollection through a map function without changing the
   * keys.
   * @group transform
   */
  def mapValues[U: ClassTag](f: V => U): SCollection[(K, U)] =
    self.map { pair =>
      // The key is carried over untouched; only the value goes through `f`.
      val mapped = f(pair._2)
      (pair._1, mapped)
    }

  /**
   * Return the max of values for each key as defined by the implicit Ordering[V].
   * @return a new SCollection of (key, maximum value) pairs
   * @group per_key
   */
  // Scala lambda is simpler and more powerful than transforms.Max
  def maxByKey(implicit ord: Ordering[V]): SCollection[(K, V)] =
    reduceByKey((a, b) => ord.max(a, b))

  /**
   * Return the min of values for each key as defined by the implicit Ordering[V].
   * @return a new SCollection of (key, minimum value) pairs
   * @group per_key
   */
  // Scala lambda is simpler and more powerful than transforms.Min
  def minByKey(implicit ord: Ordering[V]): SCollection[(K, V)] =
    reduceByKey((a, b) => ord.min(a, b))

  /**
   * Merge the values for each key using an associative reduce function. This will also perform
   * the merging locally on each mapper before sending results to a reducer, similarly to a
   * "combiner" in MapReduce.
   * @group per_key
   */
  def reduceByKey(op: (V, V) => V): SCollection[(K, V)] = {
    // Wrap the binary function as a combine function and apply it per key.
    val reducer = Functions.reduceFn(op)
    applyPerKey(Combine.perKey(reducer), kvToTuple[K, V])
  }

  /**
   * Return a sampled subset of values for each key of this SCollection.
   * @return a new SCollection of (key, sampled values) pairs
   * @group per_key
   */
  def sampleByKey(sampleSize: Int): SCollection[(K, Iterable[V])] = {
    val sampler = Sample.fixedSizePerKey[K, V](sampleSize)
    applyPerKey(sampler, kvIterableToTuple[K, V])
  }

  /**
   * Return a subset of this SCollection sampled by key (via stratified sampling).
   *
   * Create a sample of this SCollection using variable sampling rates for different keys as
   * specified by `fractions`, a key to sampling rate map, via simple random sampling with one
   * pass over the SCollection, to produce a sample of size that's approximately equal to the sum
   * of math.ceil(numItems * samplingRate) over all key values.
   *
   * @param withReplacement whether to sample with or without replacement
   * @param fractions map of specific keys to sampling rates
   * @return SCollection containing the sampled subset
   * @group per_key
   */
  def sampleByKey(withReplacement: Boolean, fractions: Map[K, Double]): SCollection[(K, V)] = {
    // Poisson sampler when sampling with replacement, Bernoulli sampler otherwise.
    val sampler: DoFn[(K, V), (K, V)] =
      if (withReplacement) new PoissonValueSampler[K, V](fractions)
      else new BernoulliValueSampler[K, V](fractions)
    self.parDo(sampler)
  }

  /**
   * Return an SCollection with the pairs from `this` whose keys are not in `that`.
   * @group per_key
   */
  def subtractByKey(that: SCollection[K]): SCollection[(K, V)] = self.transform { in =>
    // Pair every key of `that` with Unit so it can take part in a co-group.
    val markers = that.map(k => (k, ()))
    in.cogroup(markers).flatMap { grouped =>
      val key = grouped._1
      val ours = grouped._2._1
      val theirs = grouped._2._2
      // Keep our pairs only for keys that never occur in `that`.
      if (ours.nonEmpty && theirs.isEmpty) ours.map(v => (key, v)) else Seq.empty
    }
  }

  /**
   * Reduce by key with [[com.twitter.algebird.Semigroup Semigroup]]. This could be more powerful
   * and better optimized in some cases.
   * @group per_key
   */
  def sumByKey(implicit sg: Semigroup[V]): SCollection[(K, V)] = {
    // The Semigroup supplies the combining operation for the per-key reduction.
    val reducer = Functions.reduceFn(sg)
    applyPerKey(Combine.perKey(reducer), kvToTuple[K, V])
  }

  /**
   * Swap the keys with the values.
   * @group transform
   */
  // Scala lambda is simpler than transforms.KvSwap
  def swap: SCollection[(V, K)] = self.map(pair => pair.swap)

  /**
   * Return the top k (largest) values for each key from this SCollection as defined by the
   * specified implicit Ordering[V].
   * @return a new SCollection of (key, top k) pairs
   * @group per_key
   */
  def topByKey(num: Int)(implicit ord: Ordering[V]): SCollection[(K, Iterable[V])] = {
    // The implicit Ordering is passed to the SDK transform as its comparator.
    val topN = Top.perKey[K, V, Ordering[V]](num, ord)
    applyPerKey(topN, kvListToTuple[K, V])
  }

  /**
   * Return an SCollection with the values of each tuple.
   * @group transform
   */
  // Scala lambda is simpler and more powerful than transforms.Values
  def values: SCollection[V] = self.map(pair => pair._2)

  // =======================================================================
  // Side input operations
  // =======================================================================

  /**
   * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a Map[key,
   * value], to be used with [[SCollection.withSideInputs]]. It is required that each key of the
   * input be associated with a single value.
   */
  def asMapSideInput: SideInput[Map[K, V]] = {
    // Tuples are converted to KVs (with the KV coder set) before the map view is created.
    val toMapView = new PTransform[PCollection[(K, V)], PCollectionView[JMap[K, V]]]() {
      override def apply(input: PCollection[(K, V)]): PCollectionView[JMap[K, V]] = {
        val kvs = input.apply(toKvTransform).setCoder(self.getKvCoder[K, V])
        kvs.apply(View.asMap())
      }
    }
    val view = self.applyInternal(toMapView)
    new MapSideInput[K, V](view)
  }

  /**
   * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a Map[key,
   * Iterable[value]], to be used with [[SCollection.withSideInputs]]. It is not required that the
   * keys in the input collection be unique.
   */
  def asMultiMapSideInput: SideInput[Map[K, Iterable[V]]] = {
    // Tuples are converted to KVs (with the KV coder set) before the multimap view is created.
    val toMultiMapView =
      new PTransform[PCollection[(K, V)], PCollectionView[JMap[K, JIterable[V]]]]() {
        override def apply(input: PCollection[(K, V)]): PCollectionView[JMap[K, JIterable[V]]] = {
          val kvs = input.apply(toKvTransform).setCoder(self.getKvCoder[K, V])
          kvs.apply(View.asMultimap())
        }
      }
    val view = self.applyInternal(toMultiMapView)
    new MultiMapSideInput[K, V](view)
  }

}
// scalastyle:on number.of.methods