com.spotify.scio.io.ScioIO.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of scio-core_2.13 Show documentation
Show all versions of scio-core_2.13 Show documentation
Scio - A Scala API for Apache Beam and Google Cloud Dataflow
The newest version!
/*
* Copyright 2019 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.scio.io
import com.spotify.scio.ScioContext
import com.spotify.scio.coders.Coder
import com.spotify.scio.testing.TestDataManager
import com.spotify.scio.values.{SCollection, SideOutputCollections}
import scala.annotation.unused
/**
 * Describes the kind of [[Tap]] an IO yields for elements of type `A`.
 *
 * @tparam A
 *   element type of the collection being written
 */
sealed trait TapT[A] extends Serializable {

  /** Element type of the resulting [[Tap]]. */
  type T

  /**
   * Produce the [[Tap]] for `data` on the test code path.
   *
   * @param data
   *   the collection that was written
   * @return
   *   a tap over elements of type `T`
   */
  def saveForTest(data: SCollection[A]): Tap[T]
}
/** Companion holding the `Aux` alias for [[TapT]]. */
object TapT {

  /** A [[TapT]] over `A` whose tap element type `T` is fixed to `T0`. */
  type Aux[A, T0] = TapT[A] { type T = T0 }
}
/**
 * [[TapT]] whose tap element type is `Nothing`; `saveForTest` always yields `EmptyTap`.
 *
 * The constructor is private: instances are obtained through the companion's `apply`.
 */
final class EmptyTapOf[A] private extends TapT[A] {
  override type T = Nothing

  /** Ignores `data` and returns `EmptyTap`. */
  override def saveForTest(data: SCollection[A]): Tap[T] = {
    EmptyTap
  }
}
object EmptyTapOf {

  /** Create an [[EmptyTapOf]] for `A`, exposed as a `TapT.Aux[A, Nothing]`. */
  def apply[A]: TapT.Aux[A, Nothing] =
    new EmptyTapOf[A]
}
/**
 * [[TapT]] whose tap element type is the collection's own element type `A`.
 *
 * The constructor is private: instances are obtained through the companion's `apply`.
 */
final class TapOf[A] private extends TapT[A] {
  override type T = A

  /** Saves `data` via `saveAsInMemoryTap` and returns the underlying [[Tap]] of that result. */
  override def saveForTest(data: SCollection[A]): Tap[T] = {
    val inMemory = data.saveAsInMemoryTap
    inMemory.underlying
  }
}
object TapOf {

  /** Create a [[TapOf]] for `A`, exposed as a `TapT.Aux[A, A]`. */
  def apply[A]: TapT.Aux[A, A] =
    new TapOf[A]
}
/**
 * Base trait for all Read/Write IO classes. Every IO connector must implement this.
 *
 * Implementations must provide the abstract members `tapT`, `read`, `write` and `tap`. Look at the
 * [[com.spotify.scio.io.TextIO]] subclass for a reference implementation. IO connectors can choose
 * to override `readTest` and `writeTest` if custom test logic is necessary.
 *
 * @tparam T
 *   element type read or written by this IO
 */
trait ScioIO[T] {

  /** Type of the parameters accepted when reading. */
  type ReadP

  /** Type of the parameters accepted when writing. */
  type WriteP

  /**
   * Output tap type.
   *
   * This _must_ be a stable value (a `val`, not a `def`) in every implementation, otherwise the
   * return type of write cannot be inferred.
   */
  val tapT: TapT[T]

  /** Identifier for JobTest IO matching. Defaults to this instance's `toString`. */
  def testId: String = this.toString

  /**
   * Read data as an SCollection according to `params`, or return test data when `sc.isTest` is
   * true. The dispatch is evaluated through `sc.requireNotClosed`.
   */
  private[scio] def readWithContext(sc: ScioContext, params: ReadP): SCollection[T] =
    sc.requireNotClosed {
      if (sc.isTest) readTest(sc, params) else read(sc, params)
    }

  /**
   * Called only in a JobTest. Looks up the test input registered for this IO in
   * [[TestDataManager]] and turns it into an SCollection. `params` is unused here.
   */
  protected def readTest(sc: ScioContext, @unused params: ReadP): SCollection[T] = {
    // `.get` assumes the context carries a test id whenever this path is taken
    val testInput = TestDataManager.getInput(sc.testId.get)(this)
    testInput.toSCollection(sc)
  }

  /** Read data according to the read configuration provided in `params` as an SCollection. */
  protected def read(sc: ScioContext, params: ReadP): SCollection[T]

  /**
   * Write `data` according to `params`, or hand it to the test path when `data.context.isTest` is
   * true. The resulting tap is wrapped in a `ClosedTap`.
   */
  private[scio] def writeWithContext(data: SCollection[T], params: WriteP): ClosedTap[tapT.T] = {
    val underlying: Tap[tapT.T] =
      if (data.context.isTest) writeTest(data, params)
      else write(data, params)
    ClosedTap(underlying)
  }

  /**
   * Write `data` out according to write configuration provided in `params`, returning the Tap type.
   */
  protected def write(data: SCollection[T], params: WriteP): Tap[tapT.T]

  /**
   * Called only in a JobTest. Passes `data` to the output registered for this IO in
   * [[TestDataManager]], then returns the tap produced by `tapT.saveForTest`. `params` is unused
   * here.
   */
  protected def writeTest(data: SCollection[T], @unused params: WriteP): Tap[tapT.T] = {
    // `.get` assumes the context carries a test id whenever this path is taken
    val testOutput = TestDataManager.getOutput(data.context.testId.get)(this)
    testOutput(data)
    tapT.saveForTest(data)
  }

  /**
   * Build a [[Tap]] for the data described by the given read parameters.
   *
   * A `Tap` abstracts away the logic of reading the dataset directly as an `Iterator` or
   * re-opening it in another `ScioContext`. Write operations return a `ClosedTap`, whose `Tap` can
   * be opened once the job completes; this can be used for lightweight pipeline orchestration,
   * e.g. WordCountOrchestration.scala.
   */
  def tap(read: ReadP): Tap[tapT.T]
}
/** Type aliases and helpers for [[ScioIO]]. */
object ScioIO {

  /** A [[ScioIO]] with read parameter type `R` whose write parameter type is `Nothing`. */
  type ReadOnly[T, R] = ScioIO[T] { type ReadP = R; type WriteP = Nothing }

  /** A [[ScioIO]] with read parameter type `R` and write parameter type `W`. */
  type Aux[T, R, W] = ScioIO[T] { type ReadP = R; type WriteP = W }

  /**
   * Wrap `io` as a read-only IO.
   *
   * The wrapper reuses `io`'s `tapT` and `testId` and forwards `read` and `tap` to `io`. Its
   * `write` always throws `UnsupportedOperationException`.
   *
   * NOTE(review): `readTest` is not forwarded, so a custom `readTest` defined by `io` is not used
   * by the wrapper — confirm this is intended.
   *
   * @param io
   *   the IO to wrap
   * @return
   *   a [[ReadOnly]] view whose read parameter type is `io.ReadP`
   */
  def ro[T](io: ScioIO[T]): ScioIO.ReadOnly[T, io.ReadP] =
    new ScioIO[T] {
      override type ReadP = io.ReadP
      override type WriteP = Nothing

      override val tapT: TapT.Aux[T, io.tapT.T] = io.tapT

      override def testId: String = io.testId

      override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
        io.read(sc, params)
      }

      // WriteP is Nothing, so no argument can be supplied for `params`
      override protected def write(data: SCollection[T], params: WriteP): Tap[io.tapT.T] = {
        throw new UnsupportedOperationException("read-only IO. This code should be unreachable")
      }

      override def tap(params: ReadP): Tap[io.tapT.T] = {
        io.tap(params)
      }
    }
}
/**
 * Base trait for [[ScioIO]] without business logic, for stubbing mock data with `JobTest`.
 *
 * Both parameter types are `Nothing`, and `read`, `write` and `tap` all throw
 * `UnsupportedOperationException`.
 */
trait TestIO[T] extends ScioIO[T] {
  override type ReadP = Nothing
  override type WriteP = Nothing

  /** Always throws `UnsupportedOperationException`. */
  override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
    throw new UnsupportedOperationException(s"$this is for testing purpose only")
  }

  /** Always throws `UnsupportedOperationException`. */
  override protected def write(data: SCollection[T], params: WriteP): Tap[tapT.T] = {
    throw new UnsupportedOperationException(s"$this is for testing purpose only")
  }

  /** Always throws `UnsupportedOperationException`. */
  override def tap(params: ReadP): Tap[tapT.T] = {
    throw new UnsupportedOperationException(s"$this is for testing purpose only")
  }
}
/**
 * Mix-in for a [[ScioIO]] whose elements have a key.
 *
 * The self-type requires that this trait is only mixed into a `ScioIO[T]`.
 */
trait KeyedIO[T] { self: ScioIO[T] =>

  /** Type of the key. */
  type KeyT

  /** Function that maps an element to its key. */
  def keyBy: T => KeyT

  /** [[Coder]] for the key type. */
  def keyCoder: Coder[KeyT]
}
/**
 * Mix-in for a [[ScioIO]] whose write also produces [[SideOutputCollections]].
 *
 * Implementations provide `writeWithResult` instead of `write`; `write` is final here and always
 * throws `UnsupportedOperationException`.
 */
trait WriteResultIO[T] { self: ScioIO[T] =>

  /**
   * Write `data` and return a `ClosedTap` carrying both the tap and the side outputs.
   *
   * When `data.context.isTest` is true the tap comes from `writeTest` and the side outputs are
   * empty; otherwise both come from `writeWithResult`.
   */
  override private[scio] def writeWithContext(
    data: SCollection[T],
    params: WriteP
  ): ClosedTap[tapT.T] = {
    val context = data.context
    if (context.isTest) {
      val testTap = writeTest(data, params)
      // TODO: add possibility to fill this for testing
      val noOutputs = SideOutputCollections.empty(context)
      ClosedTap(testTap, Some(noOutputs))
    } else {
      writeWithResult(data, params) match {
        case (resultTap, sideOutputs) => ClosedTap(resultTap, Some(sideOutputs))
      }
    }
  }

  /** Not supported for this kind of IO: always throws `UnsupportedOperationException`. */
  final override def write(data: SCollection[T], params: self.WriteP): Tap[self.tapT.T] = {
    throw new UnsupportedOperationException(
      "WriteResultIO cannot be used with #write. Use #writeWithResult instead"
    )
  }

  /**
   * Write `data` out according to `params`.
   *
   * @return
   *   the tap for the written data together with the side outputs of the write
   */
  protected def writeWithResult(
    data: SCollection[T],
    params: self.WriteP
  ): (Tap[self.tapT.T], SideOutputCollections)
}
/**
 * Special version of [[ScioIO]] for use with [[ScioContext.customInput]] and
 * [[SCollection.saveAsCustomOutput]].
 *
 * @param id
 *   identifier of this IO
 */
final case class CustomIO[T](id: String) extends TestIO[T] {

  /** The tap element type is the IO's own element type `T`. */
  override val tapT: TapT.Aux[T, T] = TapOf[T]
}
/**
 * Special version of [[ScioIO]] for use with [[SCollection.readAll]] and
 * [[SCollection.readAllBytes]].
 *
 * @param id
 *   identifier of this IO
 */
final case class ReadIO[T](id: String) extends TestIO[T] {

  /** The tap element type is the IO's own element type `T`. */
  override val tapT: TapT.Aux[T, T] = TapOf[T]
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy