/*
* Copyright 2019 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.scio.bigquery
import com.google.api.services.bigquery.model.TableSchema
import com.spotify.scio.ScioContext
import com.spotify.scio.bigquery.client.BigQuery
import com.spotify.scio.bigquery.types.BigQueryType.HasAnnotation
import com.spotify.scio.coders._
import com.spotify.scio.io._
import com.spotify.scio.util.{FilenamePolicySupplier, Functions, ScioUtil}
import com.spotify.scio.values.{SCollection, SideOutput, SideOutputCollections}
import com.twitter.chill.ClosureCleaner
import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions
import org.apache.beam.sdk.io.Compression
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead.{Method => ReadMethod}
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{
CreateDisposition,
Method => WriteMethod,
WriteDisposition
}
import org.apache.beam.sdk.io.gcp.bigquery._
import org.apache.beam.sdk.io.gcp.{bigquery => beam}
import org.apache.beam.sdk.transforms.SerializableFunction
import org.apache.beam.sdk.values.{PCollection, PCollectionTuple}
import org.joda.time.Duration
import java.util.concurrent.ConcurrentHashMap
import java.util.function
import scala.jdk.CollectionConverters._
import scala.reflect.runtime.universe._
import scala.util.chaining._
private object Reads {
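// One BigQuery client is cached per ScioContext so that repeated reads in the
// same pipeline share credentials and connections.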
private[this] val cache = new ConcurrentHashMap[ScioContext, BigQuery]()
@inline private def client(sc: ScioContext): BigQuery =
cache.computeIfAbsent(
sc,
new function.Function[ScioContext, BigQuery] {
override def apply(context: ScioContext): BigQuery = {
val opts = context.optionsAs[GcpOptions]
BigQuery(opts.getProject, opts.getGcpCredential)
}
}
)
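// Runs the SQL query as a BigQuery job, reads the resulting temporary table,
// and waits for outstanding jobs when the ScioContext closes.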
private[scio] def bqReadQuery[T](sc: ScioContext)(
typedRead: beam.BigQueryIO.TypedRead[T],
sqlQuery: String,
flattenResults: Boolean = false
): SCollection[T] = {
val bigQueryClient = client(sc)
val labels = sc.labels
val read = bigQueryClient.query
.newQueryJob(sqlQuery, flattenResults, labels)
.map { job =>
sc.onClose(_ => bigQueryClient.waitForJobs(job))
typedRead.from(job.table).withoutValidation()
}
sc.applyTransform(read.get)
}
// TODO: support label inheritance like in bqReadQuery
private[scio] def bqReadStorage[T](sc: ScioContext)(
typedRead: beam.BigQueryIO.TypedRead[T],
table: Table,
selectedFields: List[String] = BigQueryStorage.ReadParam.DefaultSelectFields,
rowRestriction: Option[String] = BigQueryStorage.ReadParam.DefaultRowRestriction
): SCollection[T] = {
val read = typedRead
.from(table.spec)
.withMethod(ReadMethod.DIRECT_READ)
.withSelectedFields(selectedFields.asJava)
.pipe(r => rowRestriction.fold(r)(r.withRowRestriction))
sc.applyTransform(read)
}
}
private[bigquery] object Writes {
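// Resolves WriteMethod.DEFAULT into a concrete method: Storage Write API options
// take precedence; otherwise bounded collections use FILE_LOADS and unbounded
// ones use STREAMING_INSERTS. Explicitly chosen methods pass through unchanged.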
def resolveMethod(
method: WriteMethod,
options: BigQueryOptions,
isBounded: PCollection.IsBounded
): WriteMethod = (method, isBounded) match {
case (WriteMethod.DEFAULT, _)
if options.getUseStorageWriteApi && options.getUseStorageWriteApiAtLeastOnce =>
WriteMethod.STORAGE_API_AT_LEAST_ONCE
case (WriteMethod.DEFAULT, _) if options.getUseStorageWriteApi =>
WriteMethod.STORAGE_WRITE_API
case (WriteMethod.DEFAULT, PCollection.IsBounded.BOUNDED) =>
WriteMethod.FILE_LOADS
case (WriteMethod.DEFAULT, PCollection.IsBounded.UNBOUNDED) =>
WriteMethod.STREAMING_INSERTS
case _ =>
method
}
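// Maps a Sharding setting onto the write: Sharding.Auto enables Beam's
// auto-sharding, while a manual shard count becomes file shards for FILE_LOADS
// or write streams for the Storage Write API, and is ignored otherwise.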
def withSharding[T](method: WriteMethod, w: beam.BigQueryIO.Write[T])(
sharding: Sharding
): beam.BigQueryIO.Write[T] = {
import WriteMethod._
(sharding, method) match {
case (Sharding.Auto, _) =>
w.withAutoSharding()
case (Sharding.Manual(numShards), FILE_LOADS) =>
w.withNumFileShards(numShards)
case (Sharding.Manual(numShards), STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE) =>
w.withNumStorageWriteApiStreams(numShards)
case _ =>
w
}
}
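// Success propagation uses the API matching the write method: streaming inserts
// vs. Storage Write API; other methods (e.g. FILE_LOADS) are left unchanged.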
def withSuccessfulInsertsPropagation[T](method: WriteMethod, w: beam.BigQueryIO.Write[T])(
successfulInsertsPropagation: Boolean
): beam.BigQueryIO.Write[T] = {
import WriteMethod._
method match {
case STREAMING_INSERTS =>
w.withSuccessfulInsertsPropagation(successfulInsertsPropagation)
case STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE =>
w.withPropagateSuccessfulStorageApiWrites(successfulInsertsPropagation)
case _ =>
w
}
}
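// Collects the success/failure PCollections exposed by the WriteResult into Scio
// side outputs; which outputs exist depends on the resolved write method and on
// the propagation/extended-error flags.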
def sideOutputs(
data: SCollection[_],
method: WriteMethod,
successfulInsertsPropagation: Boolean,
extendedErrorInfo: Boolean,
result: WriteResult
): SideOutputCollections = {
import WriteMethod._
val sc = data.context
var tuple = PCollectionTuple.empty(sc.pipeline)
// success side output
method match {
case FILE_LOADS =>
tuple = tuple.and(BigQueryIO.SuccessfulTableLoads.tupleTag, result.getSuccessfulTableLoads)
case STREAMING_INSERTS if successfulInsertsPropagation =>
tuple = tuple.and(BigQueryIO.SuccessfulInserts.tupleTag, result.getSuccessfulInserts)
case STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE if successfulInsertsPropagation =>
tuple = tuple.and(
BigQueryIO.SuccessfulStorageApiInserts.tupleTag,
result.getSuccessfulStorageApiInserts
)
case _ =>
()
}
// failure side output
method match {
case STREAMING_INSERTS if extendedErrorInfo =>
tuple = tuple.and(BigQueryIO.FailedInsertsWithErr.tupleTag, result.getFailedInsertsWithErr)
case FILE_LOADS | STREAMING_INSERTS =>
tuple = tuple.and(BigQueryIO.FailedInserts.tupleTag, result.getFailedInserts)
case STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE =>
tuple =
tuple.and(BigQueryIO.FailedStorageApiInserts.tupleTag, result.getFailedStorageApiInserts)
case _ =>
()
}
SideOutputCollections(tuple, sc)
}
trait WriteParam[T] {
def configOverride: beam.BigQueryIO.Write[T] => beam.BigQueryIO.Write[T]
}
trait WriteParamDefaults {
type ConfigOverride[T] = beam.BigQueryIO.Write[T] => beam.BigQueryIO.Write[T]
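// A null default means "leave unset": the write transform only applies an option
// when the corresponding param is non-null (see the Option(...).fold chain in
// BigQueryTypedTable.writeWithResult).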
val DefaultMethod: WriteMethod = WriteMethod.DEFAULT
val DefaultSchema: TableSchema = null
val DefaultWriteDisposition: WriteDisposition = null
val DefaultCreateDisposition: CreateDisposition = null
val DefaultTableDescription: String = null
val DefaultTimePartitioning: TimePartitioning = null
val DefaultClustering: Clustering = null
val DefaultTriggeringFrequency: Duration = null
val DefaultSharding: Sharding = null
val DefaultFailedInsertRetryPolicy: InsertRetryPolicy = null
val DefaultSuccessfulInsertsPropagation: Boolean = false
val DefaultExtendedErrorInfo: Boolean = false
val DefaultConfigOverride: Null = null
}
}
sealed trait BigQueryIO[T] extends ScioIO[T] {
final override val tapT: TapT.Aux[T, T] = TapOf[T]
}
object BigQueryIO {
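// Side outputs exposed by writes with results; the Beam destination and error
// types have no dedicated Scio coders, so they fall back to Kryo.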
implicit lazy val coderTableDestination: Coder[TableDestination] = Coder.kryo
lazy val SuccessfulTableLoads: SideOutput[TableDestination] = SideOutput()
lazy val SuccessfulInserts: SideOutput[TableRow] = SideOutput()
lazy val SuccessfulStorageApiInserts: SideOutput[TableRow] = SideOutput()
implicit lazy val coderBigQueryInsertError: Coder[BigQueryInsertError] = Coder.kryo
implicit lazy val coderBigQueryStorageApiInsertError: Coder[BigQueryStorageApiInsertError] =
Coder.kryo
lazy val FailedInserts: SideOutput[TableRow] = SideOutput()
lazy val FailedInsertsWithErr: SideOutput[BigQueryInsertError] = SideOutput()
lazy val FailedStorageApiInserts: SideOutput[BigQueryStorageApiInsertError] = SideOutput()
private[bigquery] val storageWriteMethod =
Set(WriteMethod.STORAGE_WRITE_API, WriteMethod.STORAGE_API_AT_LEAST_ONCE)
@inline final def apply[T](id: String): BigQueryIO[T] =
new BigQueryIO[T] with TestIO[T] {
override def testId: String = s"BigQueryIO($id)"
}
@inline final def apply[T](source: Source): BigQueryIO[T] =
new BigQueryIO[T] with TestIO[T] {
override def testId: String = source match {
case t: Table => s"BigQueryIO(${t.spec})"
case q: Query => s"BigQueryIO(${q.underlying})"
}
}
@inline final def apply[T](
id: String,
selectedFields: List[String],
rowRestriction: Option[String]
): BigQueryIO[T] =
new BigQueryIO[T] with TestIO[T] {
override def testId: String =
s"BigQueryIO($id, List(${selectedFields.mkString(",")}), $rowRestriction)"
}
}
object BigQueryTypedSelect {
object ReadParam {
val DefaultFlattenResults: Boolean = false
}
final case class ReadParam private (flattenResults: Boolean = ReadParam.DefaultFlattenResults)
}
final case class BigQueryTypedSelect[T: Coder](
reader: beam.BigQueryIO.TypedRead[T],
sqlQuery: Query,
fromTableRow: TableRow => T
) extends BigQueryIO[T] {
override type ReadP = BigQueryTypedSelect.ReadParam
override type WriteP = Nothing // ReadOnly
override def testId: String = s"BigQueryIO(${sqlQuery.underlying})"
override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
val coder = CoderMaterializer.beam(sc, Coder[T])
val rc = reader.withCoder(coder)
Reads.bqReadQuery(sc)(rc, sqlQuery.underlying, params.flattenResults)
}
override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
throw new UnsupportedOperationException("BigQuerySelect is read-only")
override def tap(params: ReadP): Tap[T] = {
val tableReference = BigQuery
.defaultInstance()
.query
.run(sqlQuery.underlying, flattenResults = params.flattenResults)
BigQueryTap(tableReference).map(fromTableRow)
}
}
/**
* Get an SCollection for a BigQuery SELECT query. Both
* [[https://cloud.google.com/bigquery/docs/reference/legacy-sql Legacy SQL]] and
* [[https://cloud.google.com/bigquery/docs/reference/standard-sql/ Standard SQL]] dialects are
* supported. By default the query dialect will be automatically detected. To override this
* behavior, start the query string with `#legacysql` or `#standardsql`.
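*
* For example (an illustrative sketch; the queried table is a public sample):
* {{{
* val rows: SCollection[TableRow] =
*   sc.read(BigQuerySelect("SELECT tornado, month FROM `bigquery-public-data.samples.gsod`"))(
*     BigQuerySelect.ReadParam()
*   )
* }}}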
*/
final case class BigQuerySelect(sqlQuery: Query) extends BigQueryIO[TableRow] {
override type ReadP = BigQuerySelect.ReadParam
override type WriteP = Nothing // ReadOnly
private[this] lazy val underlying =
BigQueryTypedSelect(beam.BigQueryIO.readTableRows(), sqlQuery, identity)(coders.tableRowCoder)
override def testId: String = s"BigQueryIO(${sqlQuery.underlying})"
override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] =
sc.read(underlying)(params)
override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] =
throw new UnsupportedOperationException("BigQuerySelect is read-only")
override def tap(params: ReadP): Tap[TableRow] = underlying.tap(params)
}
object BigQuerySelect {
type ReadParam = BigQueryTypedSelect.ReadParam
val ReadParam = BigQueryTypedSelect.ReadParam
@inline final def apply(sqlQuery: String): BigQuerySelect = new BigQuerySelect(Query(sqlQuery))
}
object BigQueryTypedTable {
/** Defines the format in which data is read from and written to BigQuery. */
sealed abstract class Format[F]
object Format {
case object GenericRecord extends Format[GenericRecord]
case object TableRow extends Format[TableRow]
}
final case class WriteParam[T] private (
method: WriteMethod,
schema: TableSchema,
writeDisposition: WriteDisposition,
createDisposition: CreateDisposition,
tableDescription: String,
timePartitioning: TimePartitioning,
clustering: Clustering,
triggeringFrequency: Duration,
sharding: Sharding,
failedInsertRetryPolicy: InsertRetryPolicy,
successfulInsertsPropagation: Boolean,
extendedErrorInfo: Boolean,
configOverride: WriteParam.ConfigOverride[T]
) extends Writes.WriteParam[T]
object WriteParam extends Writes.WriteParamDefaults {
@inline final def apply[T](
method: WriteMethod = DefaultMethod,
schema: TableSchema = DefaultSchema,
writeDisposition: WriteDisposition = DefaultWriteDisposition,
createDisposition: CreateDisposition = DefaultCreateDisposition,
tableDescription: String = DefaultTableDescription,
timePartitioning: TimePartitioning = DefaultTimePartitioning,
clustering: Clustering = DefaultClustering,
triggeringFrequency: Duration = DefaultTriggeringFrequency,
sharding: Sharding = DefaultSharding,
failedInsertRetryPolicy: InsertRetryPolicy = DefaultFailedInsertRetryPolicy,
successfulInsertsPropagation: Boolean = DefaultSuccessfulInsertsPropagation,
extendedErrorInfo: Boolean = DefaultExtendedErrorInfo,
configOverride: ConfigOverride[T] = DefaultConfigOverride
): WriteParam[T] = new WriteParam(
method,
schema,
writeDisposition,
createDisposition,
tableDescription,
timePartitioning,
clustering,
triggeringFrequency,
sharding,
failedInsertRetryPolicy,
successfulInsertsPropagation,
extendedErrorInfo,
configOverride
)
}
private[this] def tableRow(table: Table): BigQueryTypedTable[TableRow] =
BigQueryTypedTable(
beam.BigQueryIO.readTableRows(),
beam.BigQueryIO.writeTableRows(),
table,
BigQueryUtils.convertGenericRecordToTableRow(_, _)
)(coders.tableRowCoder)
private[this] def genericRecord(
table: Table
)(implicit c: Coder[GenericRecord]): BigQueryTypedTable[GenericRecord] =
BigQueryTypedTable(
_.getRecord(),
identity[GenericRecord],
(genericRecord: GenericRecord, _: TableSchema) => genericRecord,
table
)
/**
* Creates a new instance of [[BigQueryTypedTable]] based on the supplied [[Format]].
*
* NOTE: LogicalType support when using `Format.GenericRecord` has some caveats:
*   - Reading: BigQuery types DATE, TIME, and DATETIME are read as STRING.
*   - Writing: LogicalTypes are supported only for DATE and TIME; DATETIME is not
*     yet supported. See https://issuetracker.google.com/issues/140681683
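*
* For example (an illustrative sketch; the table spec is hypothetical):
* {{{
* val io: BigQueryTypedTable[TableRow] =
*   BigQueryTypedTable(Table.Spec("project-id:dataset.table"), Format.TableRow)
* }}}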
*/
def apply[F: Coder](table: Table, format: Format[F]): BigQueryTypedTable[F] =
format match {
case Format.GenericRecord => genericRecord(table)
case Format.TableRow => tableRow(table)
}
def apply[T: Coder](
readerFn: SchemaAndRecord => T,
writerFn: T => TableRow,
tableRowFn: TableRow => T,
table: Table
): BigQueryTypedTable[T] = {
val rFn = ClosureCleaner.clean(readerFn)
val wFn = ClosureCleaner.clean(writerFn)
val reader = beam.BigQueryIO.read(Functions.serializableFn(rFn))
val writer = beam.BigQueryIO
.write[T]()
.withFormatFunction(Functions.serializableFn(wFn))
val fn: (GenericRecord, TableSchema) => T = (gr, ts) =>
tableRowFn(BigQueryUtils.convertGenericRecordToTableRow(gr, ts))
BigQueryTypedTable(reader, writer, table, fn)
}
def apply[T: Coder](
readerFn: SchemaAndRecord => T,
writerFn: T => GenericRecord,
fn: (GenericRecord, TableSchema) => T,
table: Table
): BigQueryTypedTable[T] = {
val rFn = ClosureCleaner.clean(readerFn)
val wFn = ClosureCleaner.clean(writerFn)
val reader = beam.BigQueryIO.read(rFn(_))
val writer = beam.BigQueryIO
.write[T]()
.useAvroLogicalTypes()
.withAvroFormatFunction(input => wFn(input.getElement()))
.withAvroSchemaFactory { ts =>
BigQueryAvroUtilsWrapper.toGenericAvroSchema("root", ts.getFields())
}
BigQueryTypedTable(reader, writer, table, fn)
}
}
final case class BigQueryTypedTable[T: Coder](
reader: beam.BigQueryIO.TypedRead[T],
writer: beam.BigQueryIO.Write[T],
table: Table,
fn: (GenericRecord, TableSchema) => T
) extends BigQueryIO[T]
with WriteResultIO[T] {
override type ReadP = Unit
override type WriteP = BigQueryTypedTable.WriteParam[T]
override def testId: String = s"BigQueryIO(${table.spec})"
override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
val coder = CoderMaterializer.beam(sc, Coder[T])
val io = reader.from(table.ref).withCoder(coder)
sc.applyTransform(s"Read BQ table ${table.spec}", io)
}
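// Resolves the effective write method, builds the Beam write by applying only
// the non-null params (via the Option(...).fold pipeline), and wires up
// success/failure side outputs.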
override protected def writeWithResult(
data: SCollection[T],
params: WriteP
): (Tap[T], SideOutputCollections) = {
val method = Writes.resolveMethod(
params.method,
data.context.optionsAs[BigQueryOptions],
data.internal.isBounded
)
val transform = writer
.to(table.ref)
.withMethod(params.method)
.pipe(w => Option(params.schema).fold(w)(w.withSchema))
.pipe(w => Option(params.createDisposition).fold(w)(w.withCreateDisposition))
.pipe(w => Option(params.writeDisposition).fold(w)(w.withWriteDisposition))
.pipe(w => Option(params.tableDescription).fold(w)(w.withTableDescription))
.pipe(w => Option(params.timePartitioning).map(_.asJava).fold(w)(w.withTimePartitioning))
.pipe(w => Option(params.clustering).map(_.asJava).fold(w)(w.withClustering))
.pipe(w => Option(params.triggeringFrequency).fold(w)(w.withTriggeringFrequency))
.pipe(w => Option(params.sharding).fold(w)(Writes.withSharding(method, w)))
.pipe(w =>
Writes.withSuccessfulInsertsPropagation(method, w)(params.successfulInsertsPropagation)
)
.pipe(w => if (params.extendedErrorInfo) w.withExtendedErrorInfo() else w)
.pipe(w => Option(params.failedInsertRetryPolicy).fold(w)(w.withFailedInsertRetryPolicy))
.pipe(w => Option(params.configOverride).fold(w)(_(w)))
val wr = data.applyInternal(transform)
val outputs = Writes.sideOutputs(
data,
method,
params.successfulInsertsPropagation,
params.extendedErrorInfo,
wr
)
(tap(()), outputs)
}
override def tap(read: ReadP): Tap[T] = BigQueryTypedTap(table, fn)
}
/** Get an IO for a BigQuery table using the storage API. */
final case class BigQueryStorage(
table: Table,
selectedFields: List[String],
rowRestriction: Option[String]
) extends BigQueryIO[TableRow] {
override type ReadP = Unit
override type WriteP = Nothing // ReadOnly
override def testId: String =
s"BigQueryIO(${table.spec}, List(${selectedFields.mkString(",")}), $rowRestriction)"
override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] = {
val coder = CoderMaterializer.beam(sc, coders.tableRowCoder)
val read = beam.BigQueryIO.readTableRows().withCoder(coder)
Reads.bqReadStorage(sc)(
read,
table,
selectedFields,
rowRestriction
)
}
override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] =
throw new UnsupportedOperationException("BigQueryStorage is read-only")
override def tap(read: ReadP): Tap[TableRow] = {
val readOptions = StorageUtil.tableReadOptions(selectedFields, rowRestriction)
BigQueryStorageTap(table, readOptions)
}
}
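// Example usage (an illustrative sketch; table, fields, and restriction are hypothetical):
//   sc.read(BigQueryStorage(Table.Spec("project-id:dataset.table"), List("a", "b"), Some("a > 0")))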
object BigQueryStorage {
object ReadParam {
val DefaultSelectFields: List[String] = Nil
val DefaultRowRestriction: Option[String] = None
}
}
final case class BigQueryStorageSelect(sqlQuery: Query) extends BigQueryIO[TableRow] {
override type ReadP = Unit
override type WriteP = Nothing // ReadOnly
private[this] lazy val underlying =
BigQueryTypedSelect(
beam.BigQueryIO.readTableRows().withMethod(ReadMethod.DIRECT_READ),
sqlQuery,
identity
)(coders.tableRowCoder)
override def testId: String = s"BigQueryIO(${sqlQuery.underlying})"
override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] =
sc.read(underlying)(BigQueryTypedSelect.ReadParam())
override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] =
throw new UnsupportedOperationException("BigQuerySelect is read-only")
override def tap(params: ReadP): Tap[TableRow] = underlying.tap(BigQueryTypedSelect.ReadParam())
}
/** Get an IO for a BigQuery TableRow JSON file. */
final case class TableRowJsonIO(path: String) extends ScioIO[TableRow] {
override type ReadP = TableRowJsonIO.ReadParam
override type WriteP = TableRowJsonIO.WriteParam
override val tapT: TapT.Aux[TableRow, TableRow] = TapOf[TableRow]
override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] =
sc.read(TextIO(path))(params)
.map(e => ScioUtil.jsonFactory.fromString(e, classOf[TableRow]))
override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] = {
data
.map(ScioUtil.jsonFactory.toString)
.withName("BigQuery write")
.write(TextIO(path))(params)
tap(TableRowJsonIO.ReadParam(params))
}
override def tap(read: ReadP): Tap[TableRow] =
TableRowJsonTap(path, read)
}
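// Example usage (an illustrative sketch; paths are hypothetical, assuming the
// usual `com.spotify.scio.bigquery._` syntax import):
//   val rows = sc.tableRowJsonFile("gs://bucket/input/part-*.json")
//   rows.saveAsTableRowJsonFile("gs://bucket/output")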
object TableRowJsonIO {
type ReadParam = TextIO.ReadParam
val ReadParam = TextIO.ReadParam
type WriteParam = TextIO.WriteParam
object WriteParam {
val DefaultSuffix: String = ".json"
val DefaultNumShards: Int = TextIO.WriteParam.DefaultNumShards
val DefaultCompression: Compression = TextIO.WriteParam.DefaultCompression
val DefaultFilenamePolicySupplier: FilenamePolicySupplier =
TextIO.WriteParam.DefaultFilenamePolicySupplier
val DefaultPrefix: String = TextIO.WriteParam.DefaultPrefix
val DefaultShardNameTemplate: String = TextIO.WriteParam.DefaultShardNameTemplate
val DefaultTempDirectory: String = TextIO.WriteParam.DefaultTempDirectory
def apply(
suffix: String = DefaultSuffix,
numShards: Int = DefaultNumShards,
compression: Compression = DefaultCompression,
filenamePolicySupplier: FilenamePolicySupplier = DefaultFilenamePolicySupplier,
prefix: String = DefaultPrefix,
shardNameTemplate: String = DefaultShardNameTemplate,
tempDirectory: String = DefaultTempDirectory
): WriteParam = {
TextIO.WriteParam(
suffix = suffix,
numShards = numShards,
compression = compression,
filenamePolicySupplier = filenamePolicySupplier,
prefix = prefix,
shardNameTemplate = shardNameTemplate,
tempDirectory = tempDirectory
)
}
}
}
object BigQueryTyped {
import com.spotify.scio.bigquery.{Table => STable}
@annotation.implicitNotFound(
"""
Can't find annotation for type ${T}.
Make sure this class is annotated with BigQueryType.fromStorage, BigQueryType.fromTable or
BigQueryType.fromQuery.
Alternatively, use BigQueryTyped.Storage(""), BigQueryTyped.Table(""), or
BigQueryTyped.Query("") to get a ScioIO instance.
"""
)
sealed trait IO[T <: HasAnnotation] {
type F[_ <: HasAnnotation] <: ScioIO[_]
def impl: F[T]
}
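// Aux pattern: pins the type member F so that implicit resolution can return the
// concrete ScioIO (Table, Select, or Storage) matching T's annotation.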
object IO {
type Aux[T <: HasAnnotation, F0[_ <: HasAnnotation] <: ScioIO[_]] =
IO[T] { type F[A <: HasAnnotation] = F0[A] }
implicit def tableIO[T <: HasAnnotation: TypeTag: Coder](implicit
t: BigQueryType.Table[T]
): Aux[T, Table] =
new IO[T] {
type F[A <: HasAnnotation] = Table[A]
def impl: Table[T] = Table(STable.Spec(t.table))
}
implicit def queryIO[T <: HasAnnotation: TypeTag: Coder](implicit
t: BigQueryType.Query[T]
): Aux[T, Select] =
new IO[T] {
type F[A <: HasAnnotation] = Select[A]
def impl: Select[T] = Select(Query(t.queryRaw))
}
implicit def storageIO[T <: HasAnnotation: TypeTag: Coder](implicit
t: BigQueryType.StorageOptions[T]
): Aux[T, Storage] =
new IO[T] {
type F[A <: HasAnnotation] = Storage[A]
def impl: Storage[T] = Storage(STable.Spec(t.table), Nil, None)
}
}
/**
* Get a typed SCollection for a BigQuery table or a SELECT query.
*
* Note that `T` must be annotated with
* [[com.spotify.scio.bigquery.types.BigQueryType.fromStorage BigQueryType.fromStorage]],
* [[com.spotify.scio.bigquery.types.BigQueryType.fromTable BigQueryType.fromTable]], or
* [[com.spotify.scio.bigquery.types.BigQueryType.fromQuery BigQueryType.fromQuery]]
*
* The source specified in the annotation will be used.
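*
* For example (an illustrative sketch; the table spec is hypothetical and the
* usual `com.spotify.scio.bigquery._` import is assumed):
* {{{
* @BigQueryType.fromTable("project-id:dataset.table")
* class Row
*
* val rows: SCollection[Row] = sc.typedBigQuery[Row]()
* }}}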
*/
@inline final def apply[T <: HasAnnotation](implicit t: IO[T]): t.F[T] =
t.impl
/**
* Get a typed SCollection for a BigQuery SELECT query.
*
* Both [[https://cloud.google.com/bigquery/docs/reference/legacy-sql Legacy SQL]] and
* [[https://cloud.google.com/bigquery/docs/reference/standard-sql/ Standard SQL]] dialects are
* supported. By default the query dialect will be automatically detected. To override this
* behavior, start the query string with `#legacysql` or `#standardsql`.
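*
* For example (an illustrative sketch; the queried table is a public sample):
* {{{
* val rows: SCollection[Row] =
*   sc.read(BigQueryTyped.Select[Row](Query("SELECT tornado, month FROM `bigquery-public-data.samples.gsod`")))
* }}}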
*/
final case class Select[T <: HasAnnotation: TypeTag: Coder](query: Query) extends BigQueryIO[T] {
override type ReadP = Unit
override type WriteP = Nothing // ReadOnly
private[this] lazy val underlying = {
val fromAvro = BigQueryType[T].fromAvro
val fromTableRow = BigQueryType[T].fromTableRow
val reader = beam.BigQueryIO
.read(new SerializableFunction[SchemaAndRecord, T] {
override def apply(input: SchemaAndRecord): T = fromAvro(input.getRecord)
})
BigQueryTypedSelect(reader, query, fromTableRow)
}
override def testId: String = s"BigQueryIO(${query.underlying})"
override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
sc.read(underlying)(BigQueryTypedSelect.ReadParam())
override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
throw new UnsupportedOperationException("Select queries are read-only")
override def tap(params: ReadP): Tap[T] = underlying.tap(BigQueryTypedSelect.ReadParam())
}
object Select {
@inline final def apply[T <: HasAnnotation: TypeTag: Coder](
query: String
): Select[T] = new Select[T](Query(query))
}
/** Get a typed SCollection for a BigQuery table. */
final case class Table[T <: HasAnnotation: TypeTag: Coder](table: STable)
extends BigQueryIO[T]
with WriteResultIO[T] {
override type ReadP = Unit
override type WriteP = Table.WriteParam[T]
private val underlying = BigQueryTypedTable[T](
(i: SchemaAndRecord) => BigQueryType[T].fromAvro(i.getRecord),
BigQueryType[T].toTableRow,
BigQueryType[T].fromTableRow,
table
)
override def testId: String = s"BigQueryIO(${table.spec})"
override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
sc.read(underlying)
override protected def writeWithResult(
data: SCollection[T],
params: WriteP
): (Tap[T], SideOutputCollections) = {
val outputs = data
.withName(s"${data.tfName}$$Write")
.write(underlying)(params)
.outputs
.get
(tap(()), outputs)
}
override def tap(read: ReadP): Tap[T] =
BigQueryTypedTap[T](table, underlying.fn)
}
object Table {
final case class WriteParam[T] private (
method: WriteMethod,
writeDisposition: WriteDisposition,
createDisposition: CreateDisposition,
timePartitioning: TimePartitioning,
clustering: Clustering,
triggeringFrequency: Duration,
sharding: Sharding,
failedInsertRetryPolicy: InsertRetryPolicy,
successfulInsertsPropagation: Boolean,
extendedErrorInfo: Boolean,
configOverride: WriteParam.ConfigOverride[T]
) extends Writes.WriteParam[T]
object WriteParam extends Writes.WriteParamDefaults {
@inline final def apply[T](
method: WriteMethod = DefaultMethod,
writeDisposition: WriteDisposition = DefaultWriteDisposition,
createDisposition: CreateDisposition = DefaultCreateDisposition,
timePartitioning: TimePartitioning = DefaultTimePartitioning,
clustering: Clustering = DefaultClustering,
triggeringFrequency: Duration = DefaultTriggeringFrequency,
sharding: Sharding = DefaultSharding,
failedInsertRetryPolicy: InsertRetryPolicy = DefaultFailedInsertRetryPolicy,
successfulInsertsPropagation: Boolean = DefaultSuccessfulInsertsPropagation,
extendedErrorInfo: Boolean = DefaultExtendedErrorInfo,
configOverride: ConfigOverride[T] = DefaultConfigOverride
): WriteParam[T] = new WriteParam(
method,
writeDisposition,
createDisposition,
timePartitioning,
clustering,
triggeringFrequency,
sharding,
failedInsertRetryPolicy,
successfulInsertsPropagation,
extendedErrorInfo,
configOverride
)
implicit private[Table] def typedTableWriteParam[T: TypeTag, Info](
params: Table.WriteParam[T]
): BigQueryTypedTable.WriteParam[T] =
BigQueryTypedTable.WriteParam(
params.method,
BigQueryType[T].schema,
params.writeDisposition,
params.createDisposition,
BigQueryType[T].tableDescription.orNull,
params.timePartitioning,
params.clustering,
params.triggeringFrequency,
params.sharding,
params.failedInsertRetryPolicy,
params.successfulInsertsPropagation,
params.extendedErrorInfo,
params.configOverride
)
}
}
/** Get a typed SCollection for a BigQuery table using the storage API. */
final case class Storage[T <: HasAnnotation: TypeTag: Coder](
table: STable,
selectedFields: List[String],
rowRestriction: Option[String]
) extends BigQueryIO[T] {
override type ReadP = Unit
override type WriteP = Nothing // ReadOnly
override def testId: String =
s"BigQueryIO(${table.spec}, List(${selectedFields.mkString(",")}), $rowRestriction)"
override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
val coder = CoderMaterializer.beam(sc, Coder[T])
val fromAvro = BigQueryType[T].fromAvro
val reader = beam.BigQueryIO
.read(new SerializableFunction[SchemaAndRecord, T] {
override def apply(input: SchemaAndRecord): T = fromAvro(input.getRecord)
})
.withCoder(coder)
Reads.bqReadStorage(sc)(reader, table, selectedFields, rowRestriction)
}
override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
throw new UnsupportedOperationException("Storage API is read-only")
override def tap(read: ReadP): Tap[T] = {
val fn = BigQueryType[T].fromTableRow
val readOptions = StorageUtil.tableReadOptions(selectedFields, rowRestriction)
BigQueryStorageTap(table, readOptions).map(fn)
}
}
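// Example usage (an illustrative sketch; table and fields are hypothetical):
//   sc.read(BigQueryTyped.Storage[Row](STable.Spec("project-id:dataset.table"), List("a"), None))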
final case class StorageQuery[T <: HasAnnotation: TypeTag: Coder](sqlQuery: Query)
extends BigQueryIO[T] {
override type ReadP = Unit
override type WriteP = Nothing // ReadOnly
private[this] lazy val underlying = {
val fromAvro = BigQueryType[T].fromAvro
val fromTableRow = BigQueryType[T].fromTableRow
val reader = beam.BigQueryIO
.read(new SerializableFunction[SchemaAndRecord, T] {
override def apply(input: SchemaAndRecord): T = fromAvro(input.getRecord)
})
.withMethod(ReadMethod.DIRECT_READ)
BigQueryTypedSelect(reader, sqlQuery, fromTableRow)
}
override def testId: String = s"BigQueryIO($sqlQuery)"
override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
sc.read(underlying)(BigQueryTypedSelect.ReadParam())
override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
throw new UnsupportedOperationException("Storage API is read-only")
override def tap(read: ReadP): Tap[T] = underlying.tap(BigQueryTypedSelect.ReadParam())
}
private[scio] def dynamic[T <: HasAnnotation: TypeTag: Coder](
newSource: Option[Source]
): ScioIO.ReadOnly[T, Unit] = {
val bqt = BigQueryType[T]
newSource match {
// newSource is missing: T's companion object must have either a table or a query.
// The None case exists only for legacy support and should be removed once
// BigQueryScioContext.typedBigQuery is removed.
case None if bqt.isTable =>
val table = STable.Spec(bqt.table.get)
ScioIO.ro[T](Table[T](table))
case None if bqt.isQuery =>
val query = Query(bqt.queryRaw.get)
Select[T](query)
case Some(s: STable) =>
ScioIO.ro(Table[T](s))
case Some(s: Query) =>
Select[T](s)
case _ =>
throw new IllegalArgumentException("Missing table or query field in companion object")
}
}
}