All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.spotify.scio.values.SCollectionWithSideInput.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2016 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.values

import com.google.cloud.dataflow.sdk.transforms.{DoFn, ParDo}
import com.google.cloud.dataflow.sdk.values.{PCollection, TupleTag, TupleTagList}
import com.spotify.scio.ScioContext
import com.spotify.scio.util.FunctionsWithSideInput.SideInputDoFn
import com.spotify.scio.util.{CallSites, ClosureCleaner, FunctionsWithSideInput}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag
import scala.util.Try

/**
 * An enhanced SCollection that provides access to one or more [[SideInput]]s for some transforms.
 * [[SideInput]]s are accessed via the additional [[SideInputContext]] argument.
 *
 * `filter`, `flatMap`, `map` and `keyBy` return another [[SCollectionWithSideInput]] that
 * carries the same side inputs, so they can be chained. Use [[toSCollection]] to get back to a
 * basic [[SCollection]].
 *
 * @param internal the wrapped `PCollection`
 * @param context the [[ScioContext]] used to wrap resulting collections
 * @param sides the side inputs made available to the transforms of this collection
 */
class SCollectionWithSideInput[T: ClassTag] private[values] (val internal: PCollection[T],
                                                             val context: ScioContext,
                                                             sides: Iterable[SideInput[_]])
  extends PCollectionWrapper[T] {

  protected val ct: ClassTag[T] = implicitly[ClassTag[T]]

  // Base transform shared by every method below: a ParDo given the views of all side inputs.
  private val parDo = ParDo.withSideInputs(sides.map(_.view).asJava)

  /**
   * [[SCollection.filter]] with an additional SideInputContext argument.
   *
   * @param f predicate receiving the element and the side input context
   * @return a new collection with the same side inputs, using the coder obtained for `T`
   */
  def filter(f: (T, SideInputContext[T]) => Boolean): SCollectionWithSideInput[T] = {
    val o = this
      .pApply(parDo.of(FunctionsWithSideInput.filterFn(f)))
      .internal.setCoder(this.getCoder[T])
    new SCollectionWithSideInput[T](o, context, sides)
  }

  /**
   * [[SCollection.flatMap]] with an additional SideInputContext argument.
   *
   * @param f function receiving the element and the side input context
   * @return a new collection with the same side inputs, using the coder obtained for `U`
   */
  def flatMap[U: ClassTag](f: (T, SideInputContext[T]) => TraversableOnce[U])
  : SCollectionWithSideInput[U] = {
    val o = this
      .pApply(parDo.of(FunctionsWithSideInput.flatMapFn(f)))
      .internal.setCoder(this.getCoder[U])
    new SCollectionWithSideInput[U](o, context, sides)
  }

  /**
   * [[SCollection.keyBy]] with an additional SideInputContext argument.
   *
   * Implemented on top of [[map]]: each element `x` becomes the pair `(f(x, s), x)`.
   *
   * @param f function computing the key from the element and the side input context
   */
  def keyBy[K: ClassTag](f: (T, SideInputContext[T]) => K): SCollectionWithSideInput[(K, T)] =
    this.map((x, s) => (f(x, s), x))

  /**
   * [[SCollection.map]] with an additional SideInputContext argument.
   *
   * @param f function receiving the element and the side input context
   * @return a new collection with the same side inputs, using the coder obtained for `U`
   */
  def map[U: ClassTag](f: (T, SideInputContext[T]) => U): SCollectionWithSideInput[U] = {
    val o = this
      .pApply(parDo.of(FunctionsWithSideInput.mapFn(f)))
      .internal.setCoder(this.getCoder[U])
    new SCollectionWithSideInput[U](o, context, sides)
  }

  /**
   * Allows multiple outputs from [[SCollectionWithSideInput]]
   *
   * Each element is emitted, unchanged, to the [[SideOutput]] that `f` returns for it. The
   * generated `DoFn` throws an `IllegalStateException` when `f` returns an output whose tag
   * does not match any entry of `sideOutputs`.
   *
   * NOTE(review): the membership check only looks at `sideOutputs`, so an `f` that returns
   * `mainOutput` is rejected too unless that output is also listed in `sideOutputs` -- confirm
   * this is intended.
   *
   * @param sideOutputs the outputs `f` is allowed to return
   * @param f function choosing the destination of an element, with access to the side inputs
   * @param mainOutput output used as the main tag of the underlying `ParDo`; a fresh
   *                   [[SideOutput]] is created when this is `null`
   * @return map of side output to [[SCollection]]
   */
  private[values] def transformWithSideOutputs(sideOutputs: Seq[SideOutput[T]],
                                             f: (T, SideInputContext[T]) => SideOutput[T],
                                             mainOutput: SideOutput[T] = null)
  : Map[SideOutput[T], SCollection[T]] = {
    val _mainTag = Option(mainOutput).getOrElse(SideOutput[T]())
    // Lookup from tag id back to the SideOutput handle, used to key the returned map.
    val tagToSide = sideOutputs.map(e => e.tupleTag.getId -> e).toMap +
      (_mainTag.tupleTag.getId -> _mainTag)

    val sideTags = TupleTagList.of(sideOutputs.map(e =>
      e.tupleTag.asInstanceOf[TupleTag[_]]).asJava)

    def transformWithSideOutputsFn(partitions: Seq[SideOutput[T]],
                                   f: (T, SideInputContext[T]) => SideOutput[T])
    : DoFn[T, T] = new SideInputDoFn[T, T] {
      val g = ClosureCleaner(f) // defeat closure

      override def processElement(c: DoFn[T, T]#ProcessContext): Unit = {
        val elem = c.element()
        val partition = g(elem, sideInputContext(c))
        if (!partitions.exists(_.tupleTag == partition.tupleTag)) {
          // Keep the message on one line and separate the ids so they stay readable in logs.
          val known = partitions.map(_.tupleTag.getId).mkString(", ")
          throw new IllegalStateException(
            s"${partition.tupleTag.getId} is not part of [$known]")
        }

        c.sideOutput(partition.tupleTag, elem)
      }
    }

    val transform = parDo
      .withOutputTags(_mainTag.tupleTag, sideTags)
      .of(transformWithSideOutputsFn(sideOutputs, f))

    val pCollectionWrapper = this.internal.apply(CallSites.getCurrent, transform)
    pCollectionWrapper.getAll.asScala
      .mapValues(context.wrap(_).asInstanceOf[SCollection[T]].setCoder(internal.getCoder))
      // Drop any tag without a known SideOutput; a plain lookup instead of catching a failure.
      .flatMap { case (tt, col) => tagToSide.get(tt.getId).map(_ -> col) }
      .toMap
  }

  /** Convert back to a basic SCollection. */
  def toSCollection: SCollection[T] = context.wrap(internal)

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy