All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.spotify.scio.values.SCollectionWithHotKeyFanout.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2019 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.values

import com.spotify.scio.ScioContext
import com.spotify.scio.coders.Coder
import com.spotify.scio.util.Functions
import com.spotify.scio.util.TupleFunctions._
import com.twitter.algebird.{Aggregator, Monoid, MonoidAggregator, Semigroup}
import org.apache.beam.sdk.transforms.Combine.PerKeyWithHotKeyFanout
import org.apache.beam.sdk.transforms.Top.TopCombineFn
import org.apache.beam.sdk.transforms.{Combine, Mean, SerializableFunction}
import org.joda.time.ReadableInstant

import java.lang.{Double => JDouble}

/**
 * An enhanced SCollection that uses an intermediate node to combine "hot" keys partially before
 * performing the full combine.
 */
class SCollectionWithHotKeyFanout[K, V] private[values] (
  private val self: PairSCollectionFunctions[K, V],
  private val hotKeyFanout: Either[K => Int, Int]
) extends TransformNameable {

  private[this] val context: ScioContext = self.self.context
  implicit private[this] val valueCoder: Coder[V] = self.valueCoder

  private def withFanout[K0, I, O](
    combine: Combine.PerKey[K0, I, O]
  ): PerKeyWithHotKeyFanout[K0, I, O] =
    this.hotKeyFanout match {
      case Left(f) =>
        combine.withHotKeyFanout(
          Functions
            .serializableFn(f)
            .asInstanceOf[SerializableFunction[K0, java.lang.Integer]]
        )
      case Right(f) =>
        combine.withHotKeyFanout(f)
    }

  override def withName(name: String): this.type = {
    self.self.withName(name)
    this
  }

  /**
   * [[PairSCollectionFunctions.aggregateByKey[U]* PairSCollectionFunctions.aggregateByKey]] with
   * hot key fanout.
   */
  def aggregateByKey[U: Coder](
    zeroValue: U
  )(seqOp: (U, V) => U, combOp: (U, U) => U): SCollection[(K, U)] = {
    val cmb = Combine.perKey[K, V, U](Functions.aggregateFn(context, zeroValue)(seqOp, combOp))
    self.applyPerKey(withFanout(cmb))(kvToTuple)
  }

  /**
   * [[PairSCollectionFunctions.aggregateByKey[A,U]* PairSCollectionFunctions.aggregateByKey]] with
   * hot key fanout.
   */
  def aggregateByKey[A: Coder, U: Coder](aggregator: Aggregator[V, A, U]): SCollection[(K, U)] =
    self.self.transform { in =>
      val a = aggregator // defeat closure
      new SCollectionWithHotKeyFanout(in.mapValues(a.prepare), hotKeyFanout)
        .sumByKey(a.semigroup)
        .mapValues(a.present)
    }

  /**
   * [[PairSCollectionFunctions.aggregateByKey[A,U]* PairSCollectionFunctions.aggregateByKey]] with
   * hot key fanout.
   */
  def aggregateByKey[A: Coder, U: Coder](
    aggregator: MonoidAggregator[V, A, U]
  ): SCollection[(K, U)] = {
    self.self.transform { in =>
      val a = aggregator // defeat closure
      new SCollectionWithHotKeyFanout(in.mapValues(a.prepare), hotKeyFanout)
        .foldByKey(a.monoid)
        .mapValues(a.present)
    }
  }

  /** [[PairSCollectionFunctions.combineByKey]] with hot key fanout. */
  def combineByKey[C: Coder](
    createCombiner: V => C
  )(mergeValue: (C, V) => C)(mergeCombiners: (C, C) => C): SCollection[(K, C)] = {
    SCollection.logger.warn(
      "combineByKey/sumByKey does not support default value and may fail in some streaming " +
        "scenarios. Consider aggregateByKey/foldByKey instead."
    )
    self.applyPerKey(
      withFanout(
        Combine
          .perKey(Functions.combineFn(context, createCombiner, mergeValue, mergeCombiners))
      )
    )(kvToTuple)
  }

  /**
   * [[PairSCollectionFunctions.foldByKey(zeroValue:V)* PairSCollectionFunctions.foldByKey]] with
   * hot key fanout.
   */
  def foldByKey(zeroValue: V)(op: (V, V) => V): SCollection[(K, V)] =
    self.applyPerKey(
      withFanout(Combine.perKey(Functions.aggregateFn(context, zeroValue)(op, op)))
    )(kvToTuple)

  /**
   * [[PairSCollectionFunctions.foldByKey(implicit* PairSCollectionFunctions.foldByKey]] with hot
   * key fanout.
   */
  def foldByKey(implicit mon: Monoid[V]): SCollection[(K, V)] =
    self.applyPerKey(withFanout(Combine.perKey(Functions.reduceFn(context, mon))))(
      kvToTuple
    )

  /** [[PairSCollectionFunctions.reduceByKey]] with hot key fanout. */
  def reduceByKey(op: (V, V) => V): SCollection[(K, V)] =
    self.applyPerKey(withFanout(Combine.perKey(Functions.reduceFn(context, op))))(kvToTuple)

  /** [[SCollection.min]] with hot key fan out. */
  def minByKey(implicit ord: Ordering[V]): SCollection[(K, V)] =
    self.reduceByKey(ord.min)

  /** [[SCollection.max]] with hot key fan out. */
  def maxByKey(implicit ord: Ordering[V]): SCollection[(K, V)] =
    self.reduceByKey(ord.max)

  /** [[SCollection.latest]] with hot key fan out. */
  def latestByKey: SCollection[(K, V)] = {
    self.self.transform { in =>
      new SCollectionWithHotKeyFanout(in.withTimestampedValues, this.hotKeyFanout)
        // widen to ReadableInstant for scala 2.12 implicit ordering
        .maxByKey(Ordering.by(_._2: ReadableInstant))
        .mapValues(_._1)
    }
  }

  /** [[SCollection.mean]] with hot key fan out. */
  def meanByKey(implicit ev: Numeric[V]): SCollection[(K, Double)] = {
    val e = ev // defeat closure
    self.self.transform { in =>
      in.mapValues[JDouble](e.toDouble).applyPerKey(Mean.perKey[K, JDouble]())(kdToTuple)
    }
  }

  /** [[PairSCollectionFunctions.sumByKey]] with hot key fanout. */
  def sumByKey(implicit sg: Semigroup[V]): SCollection[(K, V)] = {
    SCollection.logger.warn(
      "combineByKey/sumByKey does not support default value and may fail in some streaming " +
        "scenarios. Consider aggregateByKey/foldByKey instead."
    )
    self.applyPerKey(withFanout(Combine.perKey(Functions.reduceFn(context, sg))))(kvToTuple)
  }

  /** [[PairSCollectionFunctions.topByKey]] with hot key fanout. */
  def topByKey(num: Int)(implicit ord: Ordering[V]): SCollection[(K, Iterable[V])] =
    self.applyPerKey(withFanout(Combine.perKey(new TopCombineFn[V, Ordering[V]](num, ord))))(
      kvListToTuple
    )
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy