/*
 * Copyright 2019 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.values

import com.google.common.hash.Funnel
import com.spotify.scio.ScioContext
import com.spotify.scio.coders.{BeamCoders, Coder}
import com.spotify.scio.estimators.{
  ApproxDistinctCounter,
  ApproximateUniqueCounter,
  ApproximateUniqueCounterByError
}
import com.spotify.scio.hash._
import com.spotify.scio.util._
import com.spotify.scio.util.random.{BernoulliValueSampler, PoissonValueSampler}
import com.twitter.algebird.{Aggregator, Monoid, MonoidAggregator, Semigroup}
import org.apache.beam.sdk.transforms.DoFn.{Element, OutputReceiver, ProcessElement, Timestamp}
import org.apache.beam.sdk.transforms._
import org.apache.beam.sdk.values.{KV, PCollection}
import org.joda.time.{Duration, Instant}
import org.slf4j.LoggerFactory

import java.lang.{Double => JDouble}

import scala.collection.compat._

private object PairSCollectionFunctions {
  private val logger = LoggerFactory.getLogger(this.getClass)
}

/**
 * Extra functions available on SCollections of (key, value) pairs through an implicit conversion.
 *
 * @groupname cogroup
 * CoGroup Operations
 * @groupname join
 * Join Operations
 * @groupname per_key
 * Per Key Aggregations
 * @groupname transform
 * Transformations
 */
class PairSCollectionFunctions[K, V](val self: SCollection[(K, V)]) {
  import TupleFunctions._

  private[this] val context: ScioContext = self.context

  implicit lazy val keyCoder: Coder[K] = BeamCoders.getKeyCoder(self)
  implicit lazy val valueCoder: Coder[V] = BeamCoders.getValueCoder(self)

  private[scio] def toKV: SCollection[KV[K, V]] =
    self.map(kv => KV.of(kv._1, kv._2))

  private[values] def applyPerKey[UI: Coder, UO: Coder](
    t: PTransform[_ >: PCollection[KV[K, V]], PCollection[KV[K, UI]]]
  )(f: KV[K, UI] => (K, UO)): SCollection[(K, UO)] = {
    self.transform(
      _.withName("TupleToKv").toKV
        .applyTransform(t.getName, t)
        .withName("KvToTuple")
        .map(f)
    )
  }

  /**
   * Apply a [[org.apache.beam.sdk.transforms.DoFn DoFn]] that processes [[KV]] s and wrap the
   * output in an [[SCollection]].
   */
  def applyPerKeyDoFn[U: Coder](t: DoFn[KV[K, V], KV[K, U]]): SCollection[(K, U)] =
    this.applyPerKey(ParDo.of(t))(kvToTuple)

  /**
   * Convert this SCollection to an [[SCollectionWithHotKeyFanout]] that uses an intermediate node
   * to combine "hot" keys partially before performing the full combine.
   * @param hotKeyFanout
   *   a function from keys to an integer N, where the key will be spread among N intermediate nodes
   *   for partial combining. If N is less than or equal to 1, this key will not be sent through an
   *   intermediate node.
   */
  def withHotKeyFanout(hotKeyFanout: K => Int): SCollectionWithHotKeyFanout[K, V] =
    new SCollectionWithHotKeyFanout(this, Left(hotKeyFanout))

  /**
   * Convert this SCollection to an [[SCollectionWithHotKeyFanout]] that uses an intermediate node
   * to combine "hot" keys partially before performing the full combine.
   * @param hotKeyFanout
   *   constant value for every key
   */
  def withHotKeyFanout(hotKeyFanout: Int): SCollectionWithHotKeyFanout[K, V] =
    new SCollectionWithHotKeyFanout(this, Right(hotKeyFanout))

  // =======================================================================
  // CoGroups
  // =======================================================================

  /**
   * For each key k in `this` or `rhs`, return a resulting SCollection that contains a tuple with
   * the list of values for that key in `this` as well as `rhs`.
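   *
   * A hypothetical sketch (`sc` is an assumed [[ScioContext]]):
   * @example
   *   {{{
   *   val lhs = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3)))
   *   val rhs = sc.parallelize(Seq(("a", "x"), ("c", "y")))
   *   val grouped = lhs.cogroup(rhs)
   *   // ("a", (Iterable(1, 2), Iterable("x"))), ("b", (Iterable(3), Iterable())),
   *   // ("c", (Iterable(), Iterable("y")))
   *   }}}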
   * @group cogroup
   */
  def cogroup[W](rhs: SCollection[(K, W)]): SCollection[(K, (Iterable[V], Iterable[W]))] =
    ArtisanJoin.cogroup(self.tfName, self, rhs)

  /**
   * For each key k in `this` or `rhs1` or `rhs2`, return a resulting SCollection that contains a
   * tuple with the list of values for that key in `this`, `rhs1` and `rhs2`.
   * @group cogroup
   */
  def cogroup[W1, W2](
    rhs1: SCollection[(K, W1)],
    rhs2: SCollection[(K, W2)]
  ): SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2]))] =
    MultiJoin.withName(self.tfName).cogroup(self, rhs1, rhs2)

  /**
   * For each key k in `this` or `rhs1` or `rhs2` or `rhs3`, return a resulting SCollection that
   * contains a tuple with the list of values for that key in `this`, `rhs1`, `rhs2` and `rhs3`.
   * @group cogroup
   */
  def cogroup[W1, W2, W3](
    rhs1: SCollection[(K, W1)],
    rhs2: SCollection[(K, W2)],
    rhs3: SCollection[(K, W3)]
  ): SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))] =
    MultiJoin.withName(self.tfName).cogroup(self, rhs1, rhs2, rhs3)

  /**
   * Alias for `cogroup`.
   * @group cogroup
   */
  def groupWith[W](rhs: SCollection[(K, W)]): SCollection[(K, (Iterable[V], Iterable[W]))] =
    this.cogroup(rhs)

  /**
   * Alias for `cogroup`.
   * @group cogroup
   */
  def groupWith[W1, W2](
    rhs1: SCollection[(K, W1)],
    rhs2: SCollection[(K, W2)]
  ): SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2]))] =
    this.cogroup(rhs1, rhs2)

  /**
   * Alias for `cogroup`.
   * @group cogroup
   */
  def groupWith[W1, W2, W3](
    rhs1: SCollection[(K, W1)],
    rhs2: SCollection[(K, W2)],
    rhs3: SCollection[(K, W3)]
  ): SCollection[(K, (Iterable[V], Iterable[W1], Iterable[W2], Iterable[W3]))] =
    this.cogroup(rhs1, rhs2, rhs3)

  /**
   * Partition this SCollection using `K.##` into `numPartitions` partitions. Note that `K` should
   * provide a consistent hash code across different JVMs.
   *
   * @param numPartitions
   *   number of output partitions
   * @return
   *   partitioned SCollections in a `Seq`
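   *
   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   // Split into 3 partitions; pairs with equal keys land in the same partition.
   *   val parts: Seq[SCollection[(String, Int)]] = pairs.hashPartitionByKey(3)
   *   }}}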
   * @group collection
   */
  def hashPartitionByKey(numPartitions: Int): Seq[SCollection[(K, V)]] =
    self.partition(
      numPartitions,
      elem => Math.floorMod(ScioUtil.consistentHashCode(elem._1), numPartitions)
    )

  // =======================================================================
  // Joins
  // =======================================================================

  /**
   * Perform a full outer join of `this` and `rhs`. For each element (k, v) in `this`, the resulting
   * SCollection will either contain all pairs (k, (Some(v), Some(w))) for w in `rhs`, or the pair
   * (k, (Some(v), None)) if no elements in `rhs` have key k. Similarly, for each element (k, w) in
   * `rhs`, the resulting SCollection will either contain all pairs (k, (Some(v), Some(w))) for v in
   * `this`, or the pair (k, (None, Some(w))) if no elements in `this` have key k.
   * @group join
   */
  def fullOuterJoin[W](rhs: SCollection[(K, W)]): SCollection[(K, (Option[V], Option[W]))] =
    ArtisanJoin.outer(self.tfName, self, rhs)

  /**
   * Return an SCollection containing all pairs of elements with matching keys in `this` and `rhs`.
   * Each pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in `this` and
   * (k, v2) is in `rhs`.
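   *
   * A hypothetical sketch (`sc` is an assumed [[ScioContext]]):
   * @example
   *   {{{
   *   val lhs = sc.parallelize(Seq(("a", 1), ("b", 2)))
   *   val rhs = sc.parallelize(Seq(("a", "x"), ("a", "y")))
   *   val joined = lhs.join(rhs) // ("a", (1, "x")), ("a", (1, "y"))
   *   }}}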
   * @group join
   */
  def join[W](rhs: SCollection[(K, W)]): SCollection[(K, (V, W))] =
    ArtisanJoin(self.tfName, self, rhs)

  /**
   * Perform a left outer join of `this` and `rhs`. For each element (k, v) in `this`, the resulting
   * SCollection will either contain all pairs (k, (v, Some(w))) for w in `rhs`, or the pair (k, (v,
   * None)) if no elements in `rhs` have key k.
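   *
   * A hypothetical sketch (`sc` is an assumed [[ScioContext]]):
   * @example
   *   {{{
   *   val lhs = sc.parallelize(Seq(("a", 1), ("b", 2)))
   *   val rhs = sc.parallelize(Seq(("a", "x")))
   *   val joined = lhs.leftOuterJoin(rhs) // ("a", (1, Some("x"))), ("b", (2, None))
   *   }}}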
   * @group join
   */
  def leftOuterJoin[W](rhs: SCollection[(K, W)]): SCollection[(K, (V, Option[W]))] =
    ArtisanJoin.left(self.tfName, self, rhs)

  /**
   * Perform a right outer join of `this` and `rhs`. For each element (k, w) in `rhs`, the resulting
   * SCollection will either contain all pairs (k, (Some(v), w)) for v in `this`, or the pair (k,
   * (None, w)) if no elements in `this` have key k.
   * @group join
   */
  def rightOuterJoin[W](rhs: SCollection[(K, W)]): SCollection[(K, (Option[V], W))] =
    ArtisanJoin.right(self.tfName, self, rhs)

  /**
   * Full outer join for cases when the left collection (`this`) is much larger than the right
   * collection (`rhs`), which cannot fit in memory but shares a mostly overlapping set of keys
   * with the left collection, i.e. when the intersection of keys is sparse in the left collection. A
   * Bloom Filter of keys from the right collection (`rhs`) is used to split `this` into 2
   * partitions. Only those with keys in the filter go through the join and the rest are
   * concatenated. This is useful for joining historical aggregates with incremental updates.
   *
   * Import `magnolify.guava.auto._` to get common instances of Guava
   * [[com.google.common.hash.Funnel Funnel]] s.
   *
   * Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   *
   * @group join
   * @param rhsNumKeys
   *   An estimate of the number of keys in the right collection `rhs`. This estimate is used to
   *   find the size and number of BloomFilters that Scio would use to split the left collection
   *   (`this`) into overlap and intersection in a "map" step before an exact join. A value close
   *   to the actual number reduces false positives in intermediate steps, which means less
   *   shuffle.
   * @param fpProb
   *   A fraction in range (0, 1) which would be the accepted false positive probability when
   *   computing the overlap. Note: having fpProb = 0 doesn't mean that Scio would calculate an
   *   exact overlap.
   */
  def sparseFullOuterJoin[W](
    rhs: SCollection[(K, W)],
    rhsNumKeys: Long,
    fpProb: Double = 0.01
  )(implicit funnel: Funnel[K]): SCollection[(K, (Option[V], Option[W]))] = self.transform { me =>
    implicit val wCoder = rhs.valueCoder
    SCollection.unionAll(
      split(me, rhs, rhsNumKeys, fpProb).map { case (lhsUnique, lhsOverlap, rhs) =>
        val unique = lhsUnique.map(kv => (kv._1, (Option(kv._2), Option.empty[W])))
        unique ++ lhsOverlap.fullOuterJoin(rhs)
      }
    )
  }

  /**
   * Inner join for cases when the left collection (`this`) is much larger than the right collection
   * (`rhs`), which cannot fit in memory but shares a mostly overlapping set of keys with the left
   * collection, i.e. when the intersection of keys is sparse in the left collection. A Bloom Filter
   * of keys from the right collection (`rhs`) is used to split `this` into 2 partitions. Only those
   * with keys in the filter go through the join and the rest are filtered out before the join.
   *
   * Import `magnolify.guava.auto._` to get common instances of Guava
   * [[com.google.common.hash.Funnel Funnel]] s.
   *
   * Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   *
   * @group join
   * @param rhsNumKeys
   *   An estimate of the number of keys in the right collection `rhs`. This estimate is used to
   *   find the size and number of BloomFilters that Scio would use to split the left collection
   *   (`this`) into overlap and intersection in a "map" step before an exact join. A value close
   *   to the actual number reduces false positives in intermediate steps, which means less
   *   shuffle.
   * @param fpProb
   *   A fraction in range (0, 1) which would be the accepted false positive probability when
   *   computing the overlap. Note: having fpProb = 0 doesn't mean that Scio would calculate an
   *   exact overlap.
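   *
   * A hypothetical sketch; `bigLhs` and `smallRhs` are assumed SCollections keyed by `String`:
   * @example
   *   {{{
   *   import magnolify.guava.auto._ // provides Funnel[String]
   *   val joined = bigLhs.sparseJoin(smallRhs, rhsNumKeys = 1000000L, fpProb = 0.001)
   *   }}}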
   */
  def sparseJoin[W](
    rhs: SCollection[(K, W)],
    rhsNumKeys: Long,
    fpProb: Double = 0.01
  )(implicit funnel: Funnel[K]): SCollection[(K, (V, W))] =
    self.transform { me =>
      implicit val wCoder = rhs.valueCoder
      SCollection.unionAll(
        split(me, rhs, rhsNumKeys, fpProb).map { case (_, lhsOverlap, rhs) =>
          lhsOverlap.join(rhs)
        }
      )
    }

  /**
   * Left outer join for cases when the left collection (`this`) is much larger than the right
   * collection (`rhs`), which cannot fit in memory but shares a mostly overlapping set of keys
   * with the left collection, i.e. when the intersection of keys is sparse in the left collection. A
   * Bloom Filter of keys from the right collection (`rhs`) is used to split `this` into 2
   * partitions. Only those with keys in the filter go through the join and the rest are
   * concatenated. This is useful for joining historical aggregates with incremental updates.
   *
   * Import `magnolify.guava.auto._` to get common instances of Guava
   * [[com.google.common.hash.Funnel Funnel]] s.
   *
   * Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   *
   * @group join
   * @param rhsNumKeys
   *   An estimate of the number of keys in the right collection `rhs`. This estimate is used to
   *   find the size and number of BloomFilters that Scio would use to split the left collection
   *   (`this`) into overlap and intersection in a "map" step before an exact join. A value close
   *   to the actual number reduces false positives in intermediate steps, which means less
   *   shuffle.
   * @param fpProb
   *   A fraction in range (0, 1) which would be the accepted false positive probability when
   *   computing the overlap. Note: having fpProb = 0 doesn't mean that Scio would calculate an
   *   exact overlap.
   */
  def sparseLeftOuterJoin[W](
    rhs: SCollection[(K, W)],
    rhsNumKeys: Long,
    fpProb: Double = 0.01
  )(implicit funnel: Funnel[K]): SCollection[(K, (V, Option[W]))] =
    self.transform { me =>
      implicit val wCoder = rhs.valueCoder
      SCollection.unionAll(
        split(me, rhs, rhsNumKeys, fpProb).map { case (lhsUnique, lhsOverlap, rhs) =>
          val unique = lhsUnique.map(kv => (kv._1, (kv._2, Option.empty[W])))
          unique ++ lhsOverlap.leftOuterJoin(rhs)
        }
      )
    }

  /**
   * Right outer join for cases when the left collection (`this`) is much larger than the right
   * collection (`rhs`), which cannot fit in memory but shares a mostly overlapping set of keys
   * with the left collection, i.e. when the intersection of keys is sparse in the left collection. A
   * Bloom Filter of keys from the right collection (`rhs`) is used to split `this` into 2
   * partitions. Only those with keys in the filter go through the join and the rest are
   * concatenated. This is useful for joining historical aggregates with incremental updates.
   *
   * Import `magnolify.guava.auto._` to get common instances of Guava
   * [[com.google.common.hash.Funnel Funnel]] s.
   *
   * Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   *
   * @group join
   * @param rhsNumKeys
   *   An estimate of the number of keys in the right collection `rhs`. This estimate is used to
   *   find the size and number of BloomFilters that Scio would use to split the left collection
   *   (`this`) into overlap and intersection in a "map" step before an exact join. A value close
   *   to the actual number reduces false positives in intermediate steps, which means less
   *   shuffle.
   * @param fpProb
   *   A fraction in range (0, 1) which would be the accepted false positive probability when
   *   computing the overlap. Note: having fpProb = 0 doesn't mean that Scio would calculate an
   *   exact overlap.
   */
  def sparseRightOuterJoin[W](
    rhs: SCollection[(K, W)],
    rhsNumKeys: Long,
    fpProb: Double = 0.01
  )(implicit funnel: Funnel[K]): SCollection[(K, (Option[V], W))] =
    self.transform { me =>
      implicit val wCoder = rhs.valueCoder
      SCollection.unionAll(
        split(me, rhs, rhsNumKeys, fpProb).map { case (_, lhsOverlap, rhs) =>
          lhsOverlap.rightOuterJoin(rhs)
        }
      )
    }

  /*
   Internal to PairSCollectionFunctions
   Split up parameter `thisSColl` into
   Seq(
     (KeysUniqueInSelf, KeysOverlappingWith`rhsSColl`, PartOfRHSSColl)
   )
   The number of SCollection tuples in the Seq is based on the number of BloomFilters required to
   maintain the given false positive probability for the split of `thisSColl` into Unique and
   Overlap. This function is used by Sparse Join transforms.
   */
  private def split[W](
    thisSColl: SCollection[(K, V)],
    rhsSColl: SCollection[(K, W)],
    rhsNumKeys: Long,
    fpProb: Double
  )(implicit
    funnel: Funnel[K]
  ): Seq[(SCollection[(K, V)], SCollection[(K, V)], SCollection[(K, W)])] = {
    val rhsBfSIs = BloomFilter.createPartitionedSideInputs(rhsSColl.keys, rhsNumKeys, fpProb)
    val n = rhsBfSIs.size

    val thisParts = thisSColl.hashPartitionByKey(n)
    val rhsParts = rhsSColl.hashPartitionByKey(n)

    thisParts.zip(rhsParts).zip(rhsBfSIs).map { case ((lhs, rhs), bfsi) =>
      val (lhsUnique, lhsOverlap) = (SideOutput[(K, V)](), SideOutput[(K, V)]())
      val partitionedLhs = lhs
        .withSideInputs(bfsi)
        .transformWithSideOutputs(Seq(lhsUnique, lhsOverlap)) { (e, c) =>
          if (c(bfsi).mightContain(e._1)) {
            lhsOverlap
          } else {
            lhsUnique
          }
        }
      (partitionedLhs(lhsUnique), partitionedLhs(lhsOverlap), rhs)
    }
  }

  /**
   * Look up values from `rhs` where `rhs` is much larger, keys from `this` won't fit in memory,
   * and the key set of `this` is sparse in `rhs`. A Bloom Filter of keys in `this` is used to
   * filter out irrelevant keys
   * in `rhs`. This is useful when searching for a limited number of values from one or more very
   * large tables. Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   * @group join
   * @param thisNumKeys
   *   An estimate of the number of keys in `this`. This estimate is used to find the size and
   *   number of BloomFilters that Scio would use to pre-filter `rhs` before doing a co-group.
   *   A value close to the actual number reduces false positives in intermediate steps, which
   *   means less shuffle.
   * @param fpProb
   *   A fraction in range (0, 1) which would be the accepted false positive probability when
   *   discarding elements of `rhs` in the pre-filter step.
   */
  def sparseLookup[A](rhs: SCollection[(K, A)], thisNumKeys: Long, fpProb: Double)(implicit
    funnel: Funnel[K]
  ): SCollection[(K, (V, Iterable[A]))] = self.transform { sColl =>
    implicit val aCoder = rhs.valueCoder
    val selfBfSideInputs = BloomFilter.createPartitionedSideInputs(sColl.keys, thisNumKeys, fpProb)
    val n = selfBfSideInputs.size

    val thisParts = sColl.hashPartitionByKey(n)
    val rhsParts = rhs.hashPartitionByKey(n)

    SCollection.unionAll(
      thisParts
        .zip(selfBfSideInputs)
        .zip(rhsParts)
        .map { case ((lhs, lhsBfSi), rhs1) =>
          lhs
            .cogroup(
              rhs1
                .withSideInputs(lhsBfSi)
                .filter((e, c) => c(lhsBfSi).mightContain(e._1))
                .toSCollection
            )
            .flatMap { case (k, (iV, iA)) => iV.map(v => (k, (v, iA))) }
        }
    )
  }

  /**
   * Look up values from `rhs` where `rhs` is much larger, keys from `this` won't fit in memory,
   * and the key set of `this` is sparse in `rhs`. A Bloom Filter of keys in `this` is used to
   * filter out irrelevant keys
   * in `rhs`. This is useful when searching for a limited number of values from one or more very
   * large tables. Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   * @group join
   * @param thisNumKeys
   *   An estimate of the number of keys in `this`. This estimate is used to find the size and
   *   number of BloomFilters that Scio would use to pre-filter `rhs` before doing a co-group.
   *   A value close to the actual number reduces false positives in intermediate steps, which
   *   means less shuffle.
   */
  def sparseLookup[A](rhs: SCollection[(K, A)], thisNumKeys: Long)(implicit
    funnel: Funnel[K]
  ): SCollection[(K, (V, Iterable[A]))] = sparseLookup(rhs, thisNumKeys, 0.01)

  /**
   * Look up values from `rhs` where `rhs` is much larger, keys from `this` won't fit in memory,
   * and the key set of `this` is sparse in `rhs`. A Bloom Filter of keys in `this` is used to
   * filter out irrelevant keys
   * in `rhs`. This is useful when searching for a limited number of values from one or more very
   * large tables.
   *
   * Import `magnolify.guava.auto._` to get common instances of Guava
   * [[com.google.common.hash.Funnel Funnel]] s.
   *
   * Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   *
   * @group join
   * @param thisNumKeys
   *   An estimate of the number of keys in `this`. This estimate is used to find the size and
   *   number of BloomFilters that Scio would use to pre-filter `rhs1` and `rhs2` before doing a
   *   co-group. A value close to the actual number reduces false positives in intermediate steps,
   *   which means less shuffle.
   * @param fpProb
   *   A fraction in range (0, 1) which would be the accepted false positive probability when
   *   discarding elements of `rhs1` and `rhs2` in the pre-filter step.
   */
  def sparseLookup[A, B](
    rhs1: SCollection[(K, A)],
    rhs2: SCollection[(K, B)],
    thisNumKeys: Long,
    fpProb: Double
  )(implicit funnel: Funnel[K]): SCollection[(K, (V, Iterable[A], Iterable[B]))] = self.transform {
    sColl =>
      implicit val aCoder = rhs1.valueCoder
      implicit val bCoder = rhs2.valueCoder
      val selfBfSideInputs =
        BloomFilter.createPartitionedSideInputs(sColl.keys, thisNumKeys, fpProb)
      val n = selfBfSideInputs.size

      val thisParts = sColl.hashPartitionByKey(n)
      val rhs1Parts = rhs1.hashPartitionByKey(n)
      val rhs2Parts = rhs2.hashPartitionByKey(n)

      SCollection.unionAll(
        thisParts.zip(selfBfSideInputs).zip(rhs1Parts).zip(rhs2Parts).map {
          case (((lhs, lhsBfSi), rhs1), rhs2) =>
            lhs
              .cogroup(
                rhs1
                  .withSideInputs(lhsBfSi)
                  .filter((e, c) => c(lhsBfSi).mightContain(e._1))
                  .toSCollection,
                rhs2
                  .withSideInputs(lhsBfSi)
                  .filter((e, c) => c(lhsBfSi).mightContain(e._1))
                  .toSCollection
              )
              .flatMap { case (k, (iV, iA, iB)) => iV.map(v => (k, (v, iA, iB))) }
        }
      )
  }

  /**
   * Look up values from `rhs` where `rhs` is much larger, keys from `this` won't fit in memory,
   * and the key set of `this` is sparse in `rhs`. A Bloom Filter of keys in `this` is used to
   * filter out irrelevant keys
   * in `rhs`. This is useful when searching for a limited number of values from one or more very
   * large tables.
   *
   * Import `magnolify.guava.auto._` to get common instances of Guava
   * [[com.google.common.hash.Funnel Funnel]] s.
   *
   * Read more about Bloom Filter: [[com.google.common.hash.BloomFilter]].
   *
   * @group join
   * @param thisNumKeys
   *   An estimate of the number of keys in `this`. This estimate is used to find the size and
   *   number of BloomFilters that Scio would use to pre-filter `rhs1` and `rhs2` before doing a
   *   co-group. A value close to the actual number reduces false positives in intermediate steps,
   *   which means less shuffle.
   */
  def sparseLookup[A, B](
    rhs1: SCollection[(K, A)],
    rhs2: SCollection[(K, B)],
    thisNumKeys: Long
  )(implicit funnel: Funnel[K]): SCollection[(K, (V, Iterable[A], Iterable[B]))] =
    sparseLookup(rhs1, rhs2, thisNumKeys, 0.01)

  // =======================================================================
  // Transformations
  // =======================================================================

  /**
   * Aggregate the values of each key, using given combine functions and a neutral "zero value".
   * This function can return a different result type, `U`, than the type of the values in this
   * SCollection, `V`. Thus, we need one operation for merging a `V` into a `U` and one operation
   * for merging two `U`'s. To avoid memory allocation, both of these functions are allowed to
   * modify and return their first argument instead of creating a new `U`.
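   *
   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   // Collect the distinct values of each key into a Set.
   *   val sets: SCollection[(String, Set[Int])] =
   *     pairs.aggregateByKey(Set.empty[Int])(_ + _, _ ++ _)
   *   }}}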
   * @group per_key
   */
  def aggregateByKey[U: Coder](
    zeroValue: => U
  )(seqOp: (U, V) => U, combOp: (U, U) => U): SCollection[(K, U)] =
    this
      .applyPerKey(
        Combine.perKey(Functions.aggregateFn(context, zeroValue)(seqOp, combOp))
      )(kvToTuple)

  /**
   * Aggregate the values of each key with [[com.twitter.algebird.Aggregator Aggregator]]. First
   * each value `V` is mapped to `A`, then we reduce with a
   * [[com.twitter.algebird.Semigroup Semigroup]] of `A`, then finally we present the results as
   * `U`. This could be more powerful and better optimized in some cases.
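   *
   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   import com.twitter.algebird.Aggregator
   *   val maxPerKey = pairs.aggregateByKey(Aggregator.max[Int])
   *   }}}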
   * @group per_key
   */
  def aggregateByKey[A: Coder, U: Coder](aggregator: Aggregator[V, A, U]): SCollection[(K, U)] =
    self.transform { in =>
      val a = aggregator // defeat closure
      in.mapValues(a.prepare)
        .sumByKey(a.semigroup)
        .mapValues(a.present)
    }

  /**
   * Aggregate the values of each key with
   * [[com.twitter.algebird.MonoidAggregator MonoidAggregator]]. First each value `V` is mapped to
   * `A`, then we reduce with a [[com.twitter.algebird.Monoid Monoid]] of `A`, then finally we
   * present the results as `U`. This could be more powerful and better optimized in some cases.
   * @group per_key
   */
  def aggregateByKey[A: Coder, U: Coder](
    aggregator: MonoidAggregator[V, A, U]
  ): SCollection[(K, U)] = self.transform { in =>
    val a = aggregator // defeat closure
    in.mapValues(a.prepare)
      .foldByKey(a.monoid)
      .mapValues(a.present)
  }

  /**
   * For each key, compute the values' data distribution using approximate `N`-tiles.
   * @return
   *   a new SCollection whose values are `Iterable`s of the approximate `N`-tiles of the elements.
   * @group per_key
   */
  def approxQuantilesByKey(
    numQuantiles: Int
  )(implicit ord: Ordering[V]): SCollection[(K, Iterable[V])] =
    this.applyPerKey(ApproximateQuantiles.perKey(numQuantiles, ord))(kvListToTuple)

  /**
   * Generic function to combine the elements for each key using a custom set of aggregation
   * functions. Turns an `SCollection[(K, V)]` into a result of type `SCollection[(K, C)]`, for a
   * "combined type" `C` Note that `V` and `C` can be different -- for example, one might group an
   * SCollection of type `(Int, Int)` into an SCollection of type `(Int, Seq[Int])`. Users provide
   * three functions:
   *
   *   - `createCombiner`, which turns a `V` into a `C` (e.g., creates a one-element list)
   *
   *   - `mergeValue`, to merge a `V` into a `C` (e.g., adds it to the end of a list)
   *
   *   - `mergeCombiners`, to combine two `C`'s into a single one.
   *
   * Both `mergeValue` and `mergeCombiners` are allowed to modify and return their first argument
   * instead of creating a new `C` to avoid memory allocation.
   *
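   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   val lists: SCollection[(String, List[Int])] =
   *     pairs.combineByKey(v => List(v))((c, v) => v :: c)((c1, c2) => c1 ::: c2)
   *   }}}
   *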
   * @group per_key
   */
  def combineByKey[C: Coder](
    createCombiner: V => C
  )(mergeValue: (C, V) => C)(mergeCombiners: (C, C) => C): SCollection[(K, C)] = {
    PairSCollectionFunctions.logger.warn(
      "combineByKey/sumByKey does not support default value and may fail in some streaming " +
        "scenarios. Consider aggregateByKey/foldByKey instead."
    )
    this.applyPerKey(
      Combine.perKey(
        Functions.combineFn(context, createCombiner, mergeValue, mergeCombiners)
      )
    )(kvToTuple)
  }

  /**
   * Count approximate number of distinct values for each key in the SCollection.
   * @param sampleSize
   *   the number of entries in the statistical sample; the higher this number, the more accurate
   *   the estimate will be; should be `>= 16`.
   * @group per_key
   */
  def countApproxDistinctByKey(sampleSize: Int): SCollection[(K, Long)] =
    ApproximateUniqueCounter(sampleSize)
      .estimateDistinctCountPerKey(this.self)

  /**
   * Count approximate number of distinct values for each key in the SCollection.
   * @param maximumEstimationError
   *   the maximum estimation error, which should be in the range `[0.01, 0.5]`.
   * @group per_key
   */
  def countApproxDistinctByKey(maximumEstimationError: Double = 0.02): SCollection[(K, Long)] =
    ApproximateUniqueCounterByError(maximumEstimationError)
      .estimateDistinctCountPerKey(this.self)

  /**
   * Return a new SCollection of (key, value) pairs where the value is the estimated distinct count
   * (as a Long) for each unique key. Accuracy of the estimation depends on the given
   * [[ApproxDistinctCounter]] estimator.
   *
   * @example
   *   {{{
   *   val input: SCollection[(K, V)] = ...
   *   val distinctCount: SCollection[(K, Long)] =
   *       input.approximateDistinctCountPerKey(ApproximateUniqueCounter(sampleSize))
   *   }}}
   *
   * There are two known subclasses of [[ApproxDistinctCounter]] available as HLL++ implementations
   * in the `scio-extra` module:
   *
   *   - [[com.spotify.scio.extra.hll.sketching.SketchingHyperLogLogPlusPlus]]
   *   - [[com.spotify.scio.extra.hll.zetasketch.ZetasketchHll_Counter]]
   *
   * HyperLogLog++: [[https://research.google/pubs/pub40671/ Google HLL++ paper]]
   *
   * @return
   *   a keyed SCollection where the value is a Long holding the approximate distinct count.
   * @group per_key
   */
  def countApproxDistinctByKey(estimator: ApproxDistinctCounter[V]): SCollection[(K, Long)] =
    estimator.estimateDistinctCountPerKey(this.self)

  /**
   * Count the number of elements for each key.
   * @return
   *   a new SCollection of (key, count) pairs
   * @group per_key
   */
  def countByKey: SCollection[(K, Long)] =
    self.transform(_.keys.countByValue)

  /**
   * Return a new SCollection of (key, value) pairs without duplicates based on the keys. The value
   * is taken randomly for each key.
   *
   * @return
   *   a new SCollection of (key, value) pairs
   * @group per_key
   */
  def distinctByKey: SCollection[(K, V)] =
    self.distinctBy(_._1)

  /**
   * Convert values into pairs of (value, timestamp).
   * @group transform
   */
  def withTimestampedValues: SCollection[(K, (V, Instant))] =
    self.parDo(new DoFn[(K, V), (K, (V, Instant))] {
      @ProcessElement
      private[scio] def processElement(
        @Element element: (K, V),
        @Timestamp timestamp: Instant,
        out: OutputReceiver[(K, (V, Instant))]
      ): Unit = {
        val (k, v) = element
        out.output((k, (v, timestamp)))
      }
    })

  /**
   * Return a new SCollection of (key, value) pairs whose values satisfy the predicate.
   * @group transform
   */
  def filterValues(f: V => Boolean): SCollection[(K, V)] =
    self.filter(kv => f(kv._2))

  /**
   * Pass each value in the key-value pair SCollection through a `flatMap` function without changing
   * the keys.
   * @group transform
   */
  def flatMapValues[U: Coder](f: V => TraversableOnce[U]): SCollection[(K, U)] =
    self.flatMap(kv => f(kv._2).iterator.map(v => (kv._1, v)))

  /**
   * Merge the values for each key using an associative function and a neutral "zero value" which
   * may be added to the result an arbitrary number of times, and must not change the result (e.g.,
   * Nil for list concatenation, 0 for addition, or 1 for multiplication). The function op(t1, t2)
   * is allowed to modify t1 and return it as its result value to avoid object allocation; however,
   * it should not modify t2.
   *
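   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   val sums = pairs.foldByKey(0)(_ + _)
   *   }}}
   *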
   * @group per_key
   */
  def foldByKey(zeroValue: => V)(op: (V, V) => V): SCollection[(K, V)] =
    this.applyPerKey(Combine.perKey(Functions.aggregateFn(context, zeroValue)(op, op)))(
      kvToTuple
    )

  /**
   * Fold by key with [[com.twitter.algebird.Monoid Monoid]], which defines the associative function
   * and "zero value" for `V`. This could be more powerful and better optimized in some cases.
   * @group per_key
   */
  def foldByKey(implicit mon: Monoid[V]): SCollection[(K, V)] =
    this.applyPerKey(Combine.perKey(Functions.reduceFn(context, mon)))(kvToTuple)

  /**
   * Group the values for each key in the SCollection into a single sequence. The ordering of
   * elements within each group is not guaranteed, and may even differ each time the resulting
   * SCollection is evaluated.
   *
   * Note: This operation may be very expensive. If you are grouping in order to perform an
   * aggregation (such as a sum or average) over each key, using
   * [[PairSCollectionFunctions.aggregateByKey[U]* PairSCollectionFunctions.aggregateByKey]] or
   * [[PairSCollectionFunctions.reduceByKey]] will provide much better performance.
   *
   * Note: As currently implemented, `groupByKey` must be able to hold all the key-value pairs for
   * any key in memory. If a key has too many values, it can result in an `OutOfMemoryError`.
   * @group per_key
   */
  def groupByKey: SCollection[(K, Iterable[V])] =
    this
      .applyPerKey(GroupByKey.create[K, V]())(kvIterableToTuple)(
        Coder.aggregate,
        Coder.iterableCoder
      )
      .withState(_.copy(postGbkOp = true))

  /**
   * Batches inputs to a desired batch size. Batches will contain only elements of a single key.
   *
   * Elements are buffered until there are batchSize elements buffered, at which point they are
   * emitted to the output [[SCollection]].
   *
   * Windows are preserved (batches contain elements from the same window). Batches may contain
   * elements from more than one bundle.
   *
   * A time limit (in processing time) on how long an incomplete batch of elements is allowed to be
   * buffered can be set. Once a batch is flushed to output, the timer is reset. The provided limit
   * must be a positive duration or zero; a zero buffering duration effectively means no limit.
   *
   * @param batchSize
   *   the desired number of elements per batch
   * @param maxBufferingDuration
   *   the maximum time an incomplete batch may be buffered before being emitted
   *
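   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   // Emit batches of up to 100 values per key, flushing incomplete batches after 30 seconds.
   *   val batches = pairs.batchByKey(100, Duration.standardSeconds(30))
   *   }}}
   *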
   * @group per_key
   */
  def batchByKey(
    batchSize: Long,
    maxBufferingDuration: Duration = Duration.ZERO
  ): SCollection[(K, Iterable[V])] = {
    val groupIntoBatches = GroupIntoBatches
      .ofSize[K, V](batchSize)
      .withMaxBufferingDuration(maxBufferingDuration)
    this.applyPerKey(groupIntoBatches)(kvIterableToTuple)(Coder.aggregate, Coder.iterableCoder)
  }

  /**
   * Batches inputs to a desired byte size. Batches will contain only elements of a single
   * key.
   *
   * The value coder is used to determine the byte size of each element.
   *
   * Elements are buffered until there are an estimated batchByteSize bytes buffered, at which point
   * they are emitted to the output [[SCollection]].
   *
   * Windows are preserved (batches contain elements from the same window). Batches may contain
   * elements from more than one bundle.
   *
   * A time limit (in processing time) on how long an incomplete batch of elements is allowed to be
   * buffered can be set. Once a batch is flushed to output, the timer is reset. The provided limit
   * must be a positive duration or zero; a zero buffering duration effectively means no limit.
   *
   * @param batchByteSize
   *   the desired estimated total byte size per batch
   * @param maxBufferingDuration
   *   the maximum time an incomplete batch may be buffered before being emitted
   *
   * @group per_key
   */
  def batchByteSizedByKey(
    batchByteSize: Long,
    maxBufferingDuration: Duration = Duration.ZERO
  ): SCollection[(K, Iterable[V])] = {
    val groupIntoBatches = GroupIntoBatches
      .ofByteSize[K, V](batchByteSize)
      .withMaxBufferingDuration(maxBufferingDuration)
    this.applyPerKey(groupIntoBatches)(kvIterableToTuple)(Coder.aggregate, Coder.iterableCoder)
  }

  /**
   * Batches inputs to a desired weight. Batches will contain only elements of a single key.
   *
   * The weight of each element is computed from the provided cost function.
   *
   * Elements are buffered until the weight is reached, at which point they are emitted to the
   * output [[SCollection]].
   *
   * Windows are preserved (batches contain elements from the same window). Batches may contain
   * elements from more than one bundle.
   *
   * A time limit (in processing time) on how long an incomplete batch of elements is allowed to be
   * buffered can be set. Once a batch is flushed to output, the timer is reset. The provided limit
   * must be a positive duration or zero; a zero buffering duration effectively means no limit.
   *
   * @param weight
   *   the desired total weight per batch
   * @param cost
   *   a function computing the weight of each element
   * @param maxBufferingDuration
   *   the maximum time an incomplete batch may be buffered before being emitted
   *
   * @group per_key
   */
  def batchWeightedByKey(
    weight: Long,
    cost: V => Long,
    maxBufferingDuration: Duration = Duration.ZERO
  ): SCollection[(K, Iterable[V])] = {
    val weigher = Functions.serializableFn(cost.andThen(_.asInstanceOf[java.lang.Long]))
    val groupIntoBatches = GroupIntoBatches
      .ofByteSize[K, V](weight, weigher)
      .withMaxBufferingDuration(maxBufferingDuration)
    this.applyPerKey(groupIntoBatches)(kvIterableToTuple)(Coder.aggregate, Coder.iterableCoder)
  }

  /**
   * Return an SCollection with the pairs from `this` whose keys are in `rhs`.
   *
   * Unlike [[SCollection.intersection]] this preserves duplicates in `this`.
   *
   * @group per_key
   */
  def intersectByKey(rhs: SCollection[K]): SCollection[(K, V)] = self.transform {
    _.cogroup(rhs.map((_, ()))).flatMap { t =>
      if (t._2._1.nonEmpty && t._2._2.nonEmpty) t._2._1.map((t._1, _))
      else Seq.empty
    }
  }

  /**
   * Return an SCollection with the pairs from `this` whose keys might be present in the
   * [[SideInput]].
   *
   * The `SideInput[ApproxFilter]` can be reused for multiple sparse operations across multiple
   * SCollections.
   *
   * @example
   *   {{{
   *   val si = pairSCollRight.asApproxFilterSideInput(BloomFilter, 1000000)
   *   val filtered1 = pairSColl1.sparseIntersectByKey(si)
   *   val filtered2 = pairSColl2.sparseIntersectByKey(si)
   *   }}}
   * @group per_key
   */
  def sparseIntersectByKey[AF <: ApproxFilter[K]](sideInput: SideInput[AF]): SCollection[(K, V)] =
    self.transform {
      _.withSideInputs(sideInput)
        .filter { case ((k, _), c) =>
          c(sideInput).mightContain(k)
        }
        .toSCollection
    }

  /**
   * Return an SCollection with the pairs from `this` whose keys are in `rhs` when the cardinality
   * of `this` >> `rhs`, but neither can fit in memory (see
   * [[PairHashSCollectionFunctions.hashIntersectByKey]]).
   *
   * Unlike [[SCollection.intersection]] this preserves duplicates in `this`.
   *
   * Import `magnolify.guava.auto._` to get common instances of Guava
   * [[com.google.common.hash.Funnel Funnel]] s.
   *
   * @param rhsNumKeys
   *   An estimate of the number of keys in `rhs`. This estimate is used to find the size and number
   *   of BloomFilters that Scio would use to pre-filter `this` in a "map" step before any join.
   *   A value close to the actual number reduces false positives in the output. When
   *   `computeExact` is set to true, a more accurate estimate of the number of keys in `rhs` means
   *   less shuffle when finding the exact value.
   * @param computeExact
   *   Whether or not to directly pass through bloom filter results (with a small false positive
   *   rate) or perform an additional inner join to confirm exact result set. By default this is set
   *   to false.
   * @param fpProb
   *   A fraction in range (0, 1) which would be the accepted false positive probability for this
   *   transform. By default when `computeExact` is set to `false`, this reflects the probability
   *   that an output element is an incorrect intersect (meaning it may not be present in `rhs`)
   *   When `computeExact` is set to `true`, this fraction is used to find the acceptable false
   *   positive in the intermediate step before computing exact. Note: having fpProb = 0 doesn't
   *   mean an exact computation. This value along with `rhsNumKeys` is used for creating a
   *   BloomFilter.
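   *
   * A hypothetical sketch; `pairs` and `rhsKeys` are assumed SCollections keyed by `String`:
   * @example
   *   {{{
   *   import magnolify.guava.auto._ // provides Funnel[String]
   *   val filtered = pairs.sparseIntersectByKey(rhsKeys, rhsNumKeys = 500000L)
   *   }}}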
   * @group per_key
   */
  def sparseIntersectByKey(
    rhs: SCollection[K],
    rhsNumKeys: Long,
    computeExact: Boolean = false,
    fpProb: Double = 0.01
  )(implicit funnel: Funnel[K]): SCollection[(K, V)] =
    self.transform { me =>
      val rhsBfs = BloomFilter.createPartitionedSideInputs(rhs, rhsNumKeys, fpProb)
      val n = rhsBfs.size
      val thisParts = me.hashPartitionByKey(n)
      val rhsParts = rhs.hashPartition(n)
      SCollection.unionAll(
        thisParts
          .zip(rhsParts)
          .zip(rhsBfs)
          .map { case ((lhs, rhs), rhsBf) =>
            val approxResults = lhs
              .withSideInputs(rhsBf)
              .filter { case (e, c) => c(rhsBf).mightContain(e._1) }
              .toSCollection

            if (computeExact) {
              approxResults
                .intersectByKey(rhs)
            } else {
              approxResults
            }
          }
      )
    }

  /**
   * Return an SCollection with the keys of each tuple.
   * @group transform
   */
  // Scala lambda is simpler and more powerful than transforms.Keys
  def keys: SCollection[K] =
    self.map(_._1)

  /**
   * Pass each key in the key-value pair SCollection through a `map` function without changing the
   * values.
   * @group transform
   */
  def mapKeys[U: Coder](f: K => U): SCollection[(U, V)] =
    self.map(kv => (f(kv._1), kv._2))

  /**
   * Pass each value in the key-value pair SCollection through a `map` function without changing the
   * keys.
   * @group transform
   */
  def mapValues[U: Coder](f: V => U): SCollection[(K, U)] =
    self.map(kv => (kv._1, f(kv._2)))

  /**
   * Return the min of values for each key as defined by the implicit `Ordering[V]`.
   * @return
   *   a new SCollection of (key, minimum value) pairs
   * @group per_key
   */
  // Scala lambda is simpler and more powerful than transforms.Min
  def minByKey(implicit ord: Ordering[V]): SCollection[(K, V)] =
    this.reduceByKey(ord.min)

  /**
   * Return the max of values for each key as defined by the implicit `Ordering[V]`.
   * @return
   *   a new SCollection of (key, maximum value) pairs
   * @group per_key
   */
  // Scala lambda is simpler and more powerful than transforms.Max
  def maxByKey(implicit ord: Ordering[V]): SCollection[(K, V)] =
    this.reduceByKey(ord.max)

  /**
   * Return the latest value for each key according to its event time, or null if there are no
   * elements.
   * @return
   *   a new SCollection of (key, latest value) pairs
   * @group per_key
   */
  def latestByKey: SCollection[(K, V)] =
    self.applyPerKey(Latest.perKey[K, V]())(kvToTuple)

  /**
   * Reduce by key with [[com.twitter.algebird.Semigroup Semigroup]]. This could be more powerful
   * and better optimized than [[reduceByKey]] in some cases.
   * @group per_key
   */
  def sumByKey(implicit sg: Semigroup[V]): SCollection[(K, V)] = {
    PairSCollectionFunctions.logger.warn(
      "combineByKey/sumByKey does not support default value and may fail in some streaming " +
        "scenarios. Consider aggregateByKey/foldByKey instead."
    )
    this.applyPerKey(Combine.perKey(Functions.reduceFn(context, sg)))(kvToTuple)
  }

  /**
   * Return the mean of values for each key as defined by the implicit `Numeric[V]`.
   * @return
   *   a new SCollection of (key, mean value) pairs
   * @group per_key
   */
  def meanByKey(implicit ev: Numeric[V]): SCollection[(K, Double)] =
    self.transform { in =>
      in.mapValues[JDouble](ev.toDouble).applyPerKey(Mean.perKey[K, JDouble]())(kdToTuple)
    }

  /**
   * Merge the values for each key using an associative reduce function. This will also perform the
   * merging locally on each mapper before sending results to a reducer, similarly to a "combiner"
   * in MapReduce.
   * @group per_key
   */
  def reduceByKey(op: (V, V) => V): SCollection[(K, V)] =
    this.applyPerKey(Combine.perKey(Functions.reduceFn(context, op)))(kvToTuple)

  /**
   * Return a sampled subset of values for each key of this SCollection.
   * @return
   *   a new SCollection of (key, sampled values) pairs
   * @group per_key
   */
  def sampleByKey(sampleSize: Int): SCollection[(K, Iterable[V])] =
    this.applyPerKey(Sample.fixedSizePerKey[K, V](sampleSize))(kvIterableToTuple)(
      Coder.aggregate,
      Coder.iterableCoder
    )

  /**
   * Return a subset of this SCollection sampled by key (via stratified sampling).
   *
   * Create a sample of this SCollection using variable sampling rates for different keys as
   * specified by `fractions`, a key to sampling rate map, via simple random sampling with one pass
   * over the SCollection, to produce a sample of size that's approximately equal to the sum of
   * `math.ceil(numItems * samplingRate)` over all key values.
   *
   * @param withReplacement
   *   whether to sample with or without replacement
   * @param fractions
   *   map of specific keys to sampling rates
   * @return
   *   SCollection containing the sampled subset
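   *
   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   // Keep ~10% of the values for key "a" and ~50% for key "b", without replacement.
   *   val sampled = pairs.sampleByKey(withReplacement = false, Map("a" -> 0.1, "b" -> 0.5))
   *   }}}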
   * @group per_key
   */
  def sampleByKey(withReplacement: Boolean, fractions: Map[K, Double]): SCollection[(K, V)] =
    if (withReplacement) {
      self.parDo(new PoissonValueSampler[K, V](fractions))
    } else {
      self.parDo(new BernoulliValueSampler[K, V](fractions))
    }

  /**
   * Return an SCollection with the pairs from `this` whose keys are not in `rhs`.
   * @group per_key
   */
  def subtractByKey(rhs: SCollection[K]): SCollection[(K, V)] = self.transform {
    _.cogroup(rhs.map((_, ()))).flatMap { t =>
      if (t._2._1.nonEmpty && t._2._2.isEmpty) t._2._1.map((t._1, _))
      else Seq.empty
    }
  }

  /**
   * Swap the keys with the values.
   * @group transform
   */
  // Scala lambda is simpler than transforms.KvSwap
  def swap: SCollection[(V, K)] =
    self.map(kv => (kv._2, kv._1))

  /**
   * Return the top `num` (largest) values for each key from this SCollection as defined by the
   * specified implicit `Ordering[V]`.
   *
   * @return
   *   a new SCollection of (key, top `num` values) pairs
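   *
   * A minimal sketch, assuming `pairs` is an existing `SCollection[(String, Int)]`:
   * @example
   *   {{{
   *   val top3 = pairs.topByKey(3) // the 3 largest values for each key
   *   }}}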
   * @group per_key
   */
  def topByKey(num: Int)(implicit ord: Ordering[V]): SCollection[(K, Iterable[V])] =
    this.applyPerKey(Top.perKey[K, V, Ordering[V]](num, ord))(kvListToTuple)

  /**
   * Return an SCollection with the values of each tuple.
   * @group transform
   */
  // Scala lambda is simpler and more powerful than transforms.Values
  def values: SCollection[V] = self.map(_._2)

  /**
   * Return an SCollection having its values flattened.
   * @group transform
   */
  def flattenValues[U: Coder](implicit ev: V <:< TraversableOnce[U]): SCollection[(K, U)] =
    self.flatMapValues(_.asInstanceOf[TraversableOnce[U]])

  // =======================================================================
  // Side input operations
  // =======================================================================

  /**
   * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a `Map[key,
   * value]`, to be used with [[SCollection.withSideInputs]]. It is required that each key of the
   * input be associated with a single value.
   *
   * Note: the underlying map implementation is runner specific and may have performance overhead.
   * Use [[asMapSingletonSideInput]] instead if the resulting map can fit into memory.
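   *
   * A hypothetical sketch; `pairs` is an assumed `SCollection[(String, Int)]` and `names` an
   * assumed `SCollection[String]`:
   * @example
   *   {{{
   *   val si = pairs.asMapSideInput
   *   val looked = names
   *     .withSideInputs(si)
   *     .map { (name, ctx) => (name, ctx(si).getOrElse(name, 0)) }
   *     .toSCollection
   *   }}}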
   */
  def asMapSideInput: SideInput[Map[K, V]] =
    new MapSideInput[K, V](self.transform_(_.toKV.applyInternal(View.asMap())))

  /**
   * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a `Map[key,
   * Iterable[value]]`, to be used with [[SCollection.withSideInputs]]. In contrast to
   * [[asMapSideInput]], it is not required that the keys in the input collection be unique.
   *
   * Note: the underlying map implementation is runner specific and may have performance overhead.
   * Use [[asMultiMapSingletonSideInput]] instead if the resulting map can fit into memory.
   */
  def asMultiMapSideInput: SideInput[Map[K, Iterable[V]]] =
    new MultiMapSideInput[K, V](self.transform_(_.toKV.applyInternal(View.asMultimap())))

  /**
   * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a `Map[key,
   * value]`, to be used with [[SCollection.withSideInputs]]. It is required that each key of the
   * input be associated with a single value.
   *
   * Currently, the resulting map is required to fit into memory. This is preferable to
   * [[asMapSideInput]] if that's the case.
   */
  def asMapSingletonSideInput: SideInput[Map[K, V]] =
    self
      .transform(
        _.groupByKey
          .map { kv =>
            val iter = kv._2.iterator
            val head = iter.next()
            require(iter.isEmpty, s"Multiple values for key ${kv._1}")
            (kv._1, head)
          }
          .groupBy(_ => ())
          .map(_._2.toMap)
      )
      .asSingletonSideInput(Map.empty[K, V])

  /**
   * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a `Map[key,
   * Iterable[value]]`, to be used with [[SCollection.withSideInputs]]. In contrast to
   * [[asMapSingletonSideInput]], it is not required that the keys in the input collection be
   * unique.
   *
   * Currently, the resulting map is required to fit into memory. This is preferable to
   * [[asMultiMapSideInput]] if that's the case.
   */
  def asMultiMapSingletonSideInput: SideInput[Map[K, Iterable[V]]] =
    self
      .transform(
        _.groupByKey
          .groupBy(_ => ())
          .map(_._2.toMap)
      )
      .asSingletonSideInput(Map.empty[K, Iterable[V]])

  /** Returns an [[SCollection]] consisting of a single `Map[K, V]` element. */
  def reifyAsMapInGlobalWindow: SCollection[Map[K, V]] =
    self.reifyInGlobalWindow(_.asMapSideInput)

  /** Returns an [[SCollection]] consisting of a single `Map[K, Iterable[V]]` element. */
  def reifyAsMultiMapInGlobalWindow: SCollection[Map[K, Iterable[V]]] =
    self.reifyInGlobalWindow(_.asMultiMapSideInput)
}



