/*
 * Copyright 2019 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.bigquery

import com.google.api.services.bigquery.model.TableSchema
import com.spotify.scio.ScioContext
import com.spotify.scio.bigquery.client.BigQuery
import com.spotify.scio.bigquery.types.BigQueryType.HasAnnotation
import com.spotify.scio.coders._
import com.spotify.scio.io._
import com.spotify.scio.util.{FilenamePolicySupplier, Functions, ScioUtil}
import com.spotify.scio.values.{SCollection, SideOutput, SideOutputCollections}
import com.twitter.chill.ClosureCleaner
import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions
import org.apache.beam.sdk.io.Compression
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead.{Method => ReadMethod}
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{
  CreateDisposition,
  Method => WriteMethod,
  WriteDisposition
}
import org.apache.beam.sdk.io.gcp.bigquery._
import org.apache.beam.sdk.io.gcp.{bigquery => beam}
import org.apache.beam.sdk.transforms.SerializableFunction
import org.apache.beam.sdk.values.{PCollection, PCollectionTuple}
import org.joda.time.Duration

import java.util.concurrent.ConcurrentHashMap
import java.util.function
import scala.jdk.CollectionConverters._
import scala.reflect.runtime.universe._
import scala.util.chaining._

private object Reads {

  // One BigQuery client per ScioContext, created lazily and shared by all reads.
  private[this] val cache = new ConcurrentHashMap[ScioContext, BigQuery]()

  @inline private def client(sc: ScioContext): BigQuery =
    cache.computeIfAbsent(
      sc,
      new function.Function[ScioContext, BigQuery] {
        override def apply(context: ScioContext): BigQuery = {
          val opts = context.optionsAs[GcpOptions]
          BigQuery(opts.getProject, opts.getGcpCredential)
        }
      }
    )

  private[scio] def bqReadQuery[T](sc: ScioContext)(
    typedRead: beam.BigQueryIO.TypedRead[T],
    sqlQuery: String,
    flattenResults: Boolean = false
  ): SCollection[T] = {
    val bigQueryClient = client(sc)
    val labels = sc.labels
    val read = bigQueryClient.query
      .newQueryJob(sqlQuery, flattenResults, labels)
      .map { job =>
        sc.onClose(_ => bigQueryClient.waitForJobs(job))
        typedRead.from(job.table).withoutValidation()
      }

    sc.applyTransform(read.get)
  }

  // TODO: support label inheritance like in bqReadQuery
  private[scio] def bqReadStorage[T](sc: ScioContext)(
    typedRead: beam.BigQueryIO.TypedRead[T],
    table: Table,
    selectedFields: List[String] = BigQueryStorage.ReadParam.DefaultSelectFields,
    rowRestriction: Option[String] = BigQueryStorage.ReadParam.DefaultRowRestriction
  ): SCollection[T] = {
    val read = typedRead
      .from(table.spec)
      .withMethod(ReadMethod.DIRECT_READ)
      .withSelectedFields(selectedFields.asJava)
      .pipe(r => rowRestriction.fold(r)(r.withRowRestriction))

    sc.applyTransform(read)
  }
}

private[bigquery] object Writes {
  def resolveMethod(
    method: WriteMethod,
    options: BigQueryOptions,
    isBounded: PCollection.IsBounded
  ): WriteMethod = (method, isBounded) match {
    case (WriteMethod.DEFAULT, _)
        if options.getUseStorageWriteApi && options.getUseStorageWriteApiAtLeastOnce =>
      WriteMethod.STORAGE_API_AT_LEAST_ONCE
    case (WriteMethod.DEFAULT, _) if options.getUseStorageWriteApi =>
      WriteMethod.STORAGE_WRITE_API
    case (WriteMethod.DEFAULT, PCollection.IsBounded.BOUNDED) =>
      WriteMethod.FILE_LOADS
    case (WriteMethod.DEFAULT, PCollection.IsBounded.UNBOUNDED) =>
      WriteMethod.STREAMING_INSERTS
    case _ =>
      method
  }
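
  // Illustrative resolution, assuming default BigQueryOptions (Storage Write API flags
  // unset):
  //   resolveMethod(DEFAULT, options, BOUNDED)   == FILE_LOADS
  //   resolveMethod(DEFAULT, options, UNBOUNDED) == STREAMING_INSERTS
  // An explicitly chosen method always passes through unchanged.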

  def withSharding[T](method: WriteMethod, w: beam.BigQueryIO.Write[T])(
    sharding: Sharding
  ): beam.BigQueryIO.Write[T] = {
    import WriteMethod._
    (sharding, method) match {
      case (Sharding.Auto, _) =>
        w.withAutoSharding()
      case (Sharding.Manual(numShards), FILE_LOADS) =>
        w.withNumFileShards(numShards)
      case (Sharding.Manual(numShards), STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE) =>
        w.withNumStorageWriteApiStreams(numShards)
      case _ =>
        w
    }
  }

  def withSuccessfulInsertsPropagation[T](method: WriteMethod, w: beam.BigQueryIO.Write[T])(
    successfulInsertsPropagation: Boolean
  ): beam.BigQueryIO.Write[T] = {
    import WriteMethod._
    method match {
      case STREAMING_INSERTS =>
        w.withSuccessfulInsertsPropagation(successfulInsertsPropagation)
      case STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE =>
        w.withPropagateSuccessfulStorageApiWrites(successfulInsertsPropagation)
      case _ =>
        w
    }
  }

  def sideOutputs(
    data: SCollection[_],
    method: WriteMethod,
    successfulInsertsPropagation: Boolean,
    extendedErrorInfo: Boolean,
    result: WriteResult
  ): SideOutputCollections = {
    import WriteMethod._
    val sc = data.context
    var tuple = PCollectionTuple.empty(sc.pipeline)
    // success side output
    method match {
      case FILE_LOADS =>
        tuple = tuple.and(BigQueryIO.SuccessfulTableLoads.tupleTag, result.getSuccessfulTableLoads)
      case STREAMING_INSERTS if successfulInsertsPropagation =>
        tuple = tuple.and(BigQueryIO.SuccessfulInserts.tupleTag, result.getSuccessfulInserts)
      case STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE if successfulInsertsPropagation =>
        tuple = tuple.and(
          BigQueryIO.SuccessfulStorageApiInserts.tupleTag,
          result.getSuccessfulStorageApiInserts
        )
      case _ =>
        ()
    }
    // failure side output
    method match {
      case STREAMING_INSERTS if extendedErrorInfo =>
        tuple = tuple.and(BigQueryIO.FailedInsertsWithErr.tupleTag, result.getFailedInsertsWithErr)
      case FILE_LOADS | STREAMING_INSERTS =>
        tuple = tuple.and(BigQueryIO.FailedInserts.tupleTag, result.getFailedInserts)
      case STORAGE_WRITE_API | STORAGE_API_AT_LEAST_ONCE =>
        tuple =
          tuple.and(BigQueryIO.FailedStorageApiInserts.tupleTag, result.getFailedStorageApiInserts)
      case _ =>
        ()
    }

    SideOutputCollections(tuple, sc)
  }
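
  // For example: FILE_LOADS populates SuccessfulTableLoads and FailedInserts;
  // STREAMING_INSERTS populates SuccessfulInserts (when propagation is enabled) and
  // either FailedInsertsWithErr or FailedInserts; the Storage Write API methods
  // populate SuccessfulStorageApiInserts (when propagation is enabled) and
  // FailedStorageApiInserts.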

  trait WriteParam[T] {
    def configOverride: beam.BigQueryIO.Write[T] => beam.BigQueryIO.Write[T]
  }

  trait WriteParamDefaults {

    type ConfigOverride[T] = beam.BigQueryIO.Write[T] => beam.BigQueryIO.Write[T]

    val DefaultMethod: WriteMethod = WriteMethod.DEFAULT
    val DefaultSchema: TableSchema = null
    val DefaultWriteDisposition: WriteDisposition = null
    val DefaultCreateDisposition: CreateDisposition = null
    val DefaultTableDescription: String = null
    val DefaultTimePartitioning: TimePartitioning = null
    val DefaultClustering: Clustering = null
    val DefaultTriggeringFrequency: Duration = null
    val DefaultSharding: Sharding = null
    val DefaultFailedInsertRetryPolicy: InsertRetryPolicy = null
    val DefaultSuccessfulInsertsPropagation: Boolean = false
    val DefaultExtendedErrorInfo: Boolean = false
    val DefaultConfigOverride: Null = null
  }
}

sealed trait BigQueryIO[T] extends ScioIO[T] {
  final override val tapT: TapT.Aux[T, T] = TapOf[T]
}

object BigQueryIO {
  implicit lazy val coderTableDestination: Coder[TableDestination] = Coder.kryo

  lazy val SuccessfulTableLoads: SideOutput[TableDestination] = SideOutput()
  lazy val SuccessfulInserts: SideOutput[TableRow] = SideOutput()
  lazy val SuccessfulStorageApiInserts: SideOutput[TableRow] = SideOutput()

  implicit lazy val coderBigQueryInsertError: Coder[BigQueryInsertError] = Coder.kryo
  implicit lazy val coderBigQueryStorageApiInsertError: Coder[BigQueryStorageApiInsertError] =
    Coder.kryo

  lazy val FailedInserts: SideOutput[TableRow] = SideOutput()
  lazy val FailedInsertsWithErr: SideOutput[BigQueryInsertError] = SideOutput()
  lazy val FailedStorageApiInserts: SideOutput[BigQueryStorageApiInsertError] = SideOutput()

  private[bigquery] val storageWriteMethod =
    Set(WriteMethod.STORAGE_WRITE_API, WriteMethod.STORAGE_API_AT_LEAST_ONCE)

  @inline final def apply[T](id: String): BigQueryIO[T] =
    new BigQueryIO[T] with TestIO[T] {
      override def testId: String = s"BigQueryIO($id)"
    }

  @inline final def apply[T](source: Source): BigQueryIO[T] =
    new BigQueryIO[T] with TestIO[T] {
      override def testId: String = source match {
        case t: Table => s"BigQueryIO(${t.spec})"
        case q: Query => s"BigQueryIO(${q.underlying})"
      }
    }

  @inline final def apply[T](
    id: String,
    selectedFields: List[String],
    rowRestriction: Option[String]
  ): BigQueryIO[T] =
    new BigQueryIO[T] with TestIO[T] {
      override def testId: String =
        s"BigQueryIO($id, List(${selectedFields.mkString(",")}), $rowRestriction)"
    }
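
  // In `JobTest`, these IOs are matched by `testId`; a hedged sketch (the job, query
  // and rows are hypothetical):
  //   JobTest[MyJob.type]
  //     .input(BigQueryIO[TableRow]("SELECT word FROM `project.dataset.table`"), inputRows)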
}

object BigQueryTypedSelect {
  object ReadParam {
    val DefaultFlattenResults: Boolean = false
  }

  final case class ReadParam private (flattenResults: Boolean = ReadParam.DefaultFlattenResults)
}

final case class BigQueryTypedSelect[T: Coder](
  reader: beam.BigQueryIO.TypedRead[T],
  sqlQuery: Query,
  fromTableRow: TableRow => T
) extends BigQueryIO[T] {
  override type ReadP = BigQueryTypedSelect.ReadParam
  override type WriteP = Nothing // ReadOnly

  override def testId: String = s"BigQueryIO(${sqlQuery.underlying})"

  override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
    val coder = CoderMaterializer.beam(sc, Coder[T])
    val rc = reader.withCoder(coder)
    Reads.bqReadQuery(sc)(rc, sqlQuery.underlying, params.flattenResults)
  }

  override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
    throw new UnsupportedOperationException("BigQueryTypedSelect is read-only")

  override def tap(params: ReadP): Tap[T] = {
    val tableReference = BigQuery
      .defaultInstance()
      .query
      .run(sqlQuery.underlying, flattenResults = params.flattenResults)
    BigQueryTap(tableReference).map(fromTableRow)
  }
}

/**
 * Get an SCollection for a BigQuery SELECT query. Both
 * [[https://cloud.google.com/bigquery/docs/reference/legacy-sql Legacy SQL]] and
 * [[https://cloud.google.com/bigquery/docs/reference/standard-sql/ Standard SQL]] dialects are
 * supported. By default the query dialect will be automatically detected. To override this
 * behavior, start the query string with `#legacysql` or `#standardsql`.
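 *
 * A minimal usage sketch (the query is hypothetical; `bigQuerySelect` is provided by the
 * `com.spotify.scio.bigquery` syntax import):
 * {{{
 * val rows: SCollection[TableRow] =
 *   sc.bigQuerySelect(Query("SELECT word, word_count FROM `bigquery-public-data.samples.shakespeare`"))
 * }}}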
 */
final case class BigQuerySelect(sqlQuery: Query) extends BigQueryIO[TableRow] {
  override type ReadP = BigQuerySelect.ReadParam
  override type WriteP = Nothing // ReadOnly

  private[this] lazy val underlying =
    BigQueryTypedSelect(beam.BigQueryIO.readTableRows(), sqlQuery, identity)(coders.tableRowCoder)

  override def testId: String = s"BigQueryIO(${sqlQuery.underlying})"

  override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] =
    sc.read(underlying)(params)

  override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] =
    throw new UnsupportedOperationException("BigQuerySelect is read-only")

  override def tap(params: ReadP): Tap[TableRow] = underlying.tap(params)
}

object BigQuerySelect {
  type ReadParam = BigQueryTypedSelect.ReadParam
  val ReadParam = BigQueryTypedSelect.ReadParam

  @inline final def apply(sqlQuery: String): BigQuerySelect = new BigQuerySelect(Query(sqlQuery))
}

object BigQueryTypedTable {

  /** Defines the format in which BigQuery data is read and written. */
  sealed abstract class Format[F]
  object Format {
    case object GenericRecord extends Format[GenericRecord]
    case object TableRow extends Format[TableRow]
  }

  final case class WriteParam[T] private (
    method: WriteMethod,
    schema: TableSchema,
    writeDisposition: WriteDisposition,
    createDisposition: CreateDisposition,
    tableDescription: String,
    timePartitioning: TimePartitioning,
    clustering: Clustering,
    triggeringFrequency: Duration,
    sharding: Sharding,
    failedInsertRetryPolicy: InsertRetryPolicy,
    successfulInsertsPropagation: Boolean,
    extendedErrorInfo: Boolean,
    configOverride: WriteParam.ConfigOverride[T]
  ) extends Writes.WriteParam[T]

  object WriteParam extends Writes.WriteParamDefaults {
    @inline final def apply[T](
      method: WriteMethod = DefaultMethod,
      schema: TableSchema = DefaultSchema,
      writeDisposition: WriteDisposition = DefaultWriteDisposition,
      createDisposition: CreateDisposition = DefaultCreateDisposition,
      tableDescription: String = DefaultTableDescription,
      timePartitioning: TimePartitioning = DefaultTimePartitioning,
      clustering: Clustering = DefaultClustering,
      triggeringFrequency: Duration = DefaultTriggeringFrequency,
      sharding: Sharding = DefaultSharding,
      failedInsertRetryPolicy: InsertRetryPolicy = DefaultFailedInsertRetryPolicy,
      successfulInsertsPropagation: Boolean = DefaultSuccessfulInsertsPropagation,
      extendedErrorInfo: Boolean = DefaultExtendedErrorInfo,
      configOverride: ConfigOverride[T] = DefaultConfigOverride
    ): WriteParam[T] = new WriteParam(
      method,
      schema,
      writeDisposition,
      createDisposition,
      tableDescription,
      timePartitioning,
      clustering,
      triggeringFrequency,
      sharding,
      failedInsertRetryPolicy,
      successfulInsertsPropagation,
      extendedErrorInfo,
      configOverride
    )
  }

  private[this] def tableRow(table: Table): BigQueryTypedTable[TableRow] =
    BigQueryTypedTable(
      beam.BigQueryIO.readTableRows(),
      beam.BigQueryIO.writeTableRows(),
      table,
      BigQueryUtils.convertGenericRecordToTableRow(_, _)
    )(coders.tableRowCoder)

  private[this] def genericRecord(
    table: Table
  )(implicit c: Coder[GenericRecord]): BigQueryTypedTable[GenericRecord] =
    BigQueryTypedTable(
      _.getRecord(),
      identity[GenericRecord],
      (genericRecord: GenericRecord, _: TableSchema) => genericRecord,
      table
    )

  /**
   * Creates a new instance of [[BigQueryTypedTable]] based on the supplied [[Format]].
   *
   * NOTE: LogicalType support when using `Format.GenericRecord` has some caveats. Reading: BigQuery
   * types DATE, TIME and DATETIME will be read as STRING. Writing: LogicalTypes are supported only
   * for DATE and TIME; DATETIME is not yet supported. See
   * https://issuetracker.google.com/issues/140681683
   */
  def apply[F: Coder](table: Table, format: Format[F]): BigQueryTypedTable[F] =
    format match {
      case Format.GenericRecord => genericRecord(table)
      case Format.TableRow      => tableRow(table)
    }

  def apply[T: Coder](
    readerFn: SchemaAndRecord => T,
    writerFn: T => TableRow,
    tableRowFn: TableRow => T,
    table: Table
  ): BigQueryTypedTable[T] = {
    val rFn = ClosureCleaner.clean(readerFn)
    val wFn = ClosureCleaner.clean(writerFn)
    val reader = beam.BigQueryIO.read(Functions.serializableFn(rFn))
    val writer = beam.BigQueryIO
      .write[T]()
      .withFormatFunction(Functions.serializableFn(wFn))
    val fn: (GenericRecord, TableSchema) => T = (gr, ts) =>
      tableRowFn(BigQueryUtils.convertGenericRecordToTableRow(gr, ts))

    BigQueryTypedTable(reader, writer, table, fn)
  }

  def apply[T: Coder](
    readerFn: SchemaAndRecord => T,
    writerFn: T => GenericRecord,
    fn: (GenericRecord, TableSchema) => T,
    table: Table
  ): BigQueryTypedTable[T] = {
    val rFn = ClosureCleaner.clean(readerFn)
    val wFn = ClosureCleaner.clean(writerFn)
    val reader = beam.BigQueryIO.read(rFn(_))
    val writer = beam.BigQueryIO
      .write[T]()
      .useAvroLogicalTypes()
      .withAvroFormatFunction(input => wFn(input.getElement()))
      .withAvroSchemaFactory { ts =>
        BigQueryAvroUtilsWrapper.toGenericAvroSchema("root", ts.getFields())
      }

    BigQueryTypedTable(reader, writer, table, fn)
  }
}

final case class BigQueryTypedTable[T: Coder](
  reader: beam.BigQueryIO.TypedRead[T],
  writer: beam.BigQueryIO.Write[T],
  table: Table,
  fn: (GenericRecord, TableSchema) => T
) extends BigQueryIO[T]
    with WriteResultIO[T] {
  override type ReadP = Unit
  override type WriteP = BigQueryTypedTable.WriteParam[T]

  override def testId: String = s"BigQueryIO(${table.spec})"

  override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
    val coder = CoderMaterializer.beam(sc, Coder[T])
    val io = reader.from(table.ref).withCoder(coder)
    sc.applyTransform(s"Read BQ table ${table.spec}", io)
  }

  override protected def writeWithResult(
    data: SCollection[T],
    params: WriteP
  ): (Tap[T], SideOutputCollections) = {
    val method = Writes.resolveMethod(
      params.method,
      data.context.optionsAs[BigQueryOptions],
      data.internal.isBounded
    )

    val transform = writer
      .to(table.ref)
      .withMethod(params.method)
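      // NOTE: Beam resolves WriteMethod.DEFAULT on its own; the `method` resolved above
      // is only used for scio-side sharding, insert propagation and side outputs.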
      .pipe(w => Option(params.schema).fold(w)(w.withSchema))
      .pipe(w => Option(params.createDisposition).fold(w)(w.withCreateDisposition))
      .pipe(w => Option(params.writeDisposition).fold(w)(w.withWriteDisposition))
      .pipe(w => Option(params.tableDescription).fold(w)(w.withTableDescription))
      .pipe(w => Option(params.timePartitioning).map(_.asJava).fold(w)(w.withTimePartitioning))
      .pipe(w => Option(params.clustering).map(_.asJava).fold(w)(w.withClustering))
      .pipe(w => Option(params.triggeringFrequency).fold(w)(w.withTriggeringFrequency))
      .pipe(w => Option(params.sharding).fold(w)(Writes.withSharding(method, w)))
      .pipe(w =>
        Writes.withSuccessfulInsertsPropagation(method, w)(params.successfulInsertsPropagation)
      )
      .pipe(w => if (params.extendedErrorInfo) w.withExtendedErrorInfo() else w)
      .pipe(w => Option(params.failedInsertRetryPolicy).fold(w)(w.withFailedInsertRetryPolicy))
      .pipe(w => Option(params.configOverride).fold(w)(_(w)))

    val wr = data.applyInternal(transform)
    val outputs = Writes.sideOutputs(
      data,
      method,
      params.successfulInsertsPropagation,
      params.extendedErrorInfo,
      wr
    )

    (tap(()), outputs)
  }

  override def tap(read: ReadP): Tap[T] = BigQueryTypedTap(table, fn)
}

/** Get an IO for a BigQuery table using the storage API. */
final case class BigQueryStorage(
  table: Table,
  selectedFields: List[String],
  rowRestriction: Option[String]
) extends BigQueryIO[TableRow] {
  override type ReadP = Unit
  override type WriteP = Nothing // ReadOnly

  override def testId: String =
    s"BigQueryIO(${table.spec}, List(${selectedFields.mkString(",")}), $rowRestriction)"

  override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] = {
    val coder = CoderMaterializer.beam(sc, coders.tableRowCoder)
    val read = beam.BigQueryIO.readTableRows().withCoder(coder)
    Reads.bqReadStorage(sc)(
      read,
      table,
      selectedFields,
      rowRestriction
    )
  }

  override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] =
    throw new UnsupportedOperationException("BigQueryStorage is read-only")

  override def tap(read: ReadP): Tap[TableRow] = {
    val readOptions = StorageUtil.tableReadOptions(selectedFields, rowRestriction)
    BigQueryStorageTap(table, readOptions)
  }
}
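
// A minimal storage-read sketch (table spec, fields and row restriction are hypothetical);
// the single-argument `sc.read` works because ReadP is Unit:
//   sc.read(
//     BigQueryStorage(
//       Table.Spec("project:dataset.table"),
//       selectedFields = List("word", "word_count"),
//       rowRestriction = Some("word_count > 10")
//     )
//   )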

object BigQueryStorage {
  object ReadParam {
    val DefaultSelectFields: List[String] = Nil
    val DefaultRowRestriction: Option[String] = None
  }
}

final case class BigQueryStorageSelect(sqlQuery: Query) extends BigQueryIO[TableRow] {
  override type ReadP = Unit
  override type WriteP = Nothing // ReadOnly

  private[this] lazy val underlying =
    BigQueryTypedSelect(
      beam.BigQueryIO.readTableRows().withMethod(ReadMethod.DIRECT_READ),
      sqlQuery,
      identity
    )(coders.tableRowCoder)

  override def testId: String = s"BigQueryIO(${sqlQuery.underlying})"

  override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] =
    sc.read(underlying)(BigQueryTypedSelect.ReadParam())

  override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] =
    throw new UnsupportedOperationException("BigQueryStorageSelect is read-only")

  override def tap(params: ReadP): Tap[TableRow] = underlying.tap(BigQueryTypedSelect.ReadParam())
}

/** Get an IO for a BigQuery TableRow JSON file. */
final case class TableRowJsonIO(path: String) extends ScioIO[TableRow] {
  override type ReadP = TableRowJsonIO.ReadParam
  override type WriteP = TableRowJsonIO.WriteParam
  override val tapT: TapT.Aux[TableRow, TableRow] = TapOf[TableRow]

  override protected def read(sc: ScioContext, params: ReadP): SCollection[TableRow] =
    sc.read(TextIO(path))(params)
      .map(e => ScioUtil.jsonFactory.fromString(e, classOf[TableRow]))

  override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] = {
    data
      .map(ScioUtil.jsonFactory.toString)
      .withName("BigQuery write")
      .write(TextIO(path))(params)
    tap(TableRowJsonIO.ReadParam(params))
  }

  override def tap(read: ReadP): Tap[TableRow] =
    TableRowJsonTap(path, read)
}
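
// A minimal round-trip sketch (paths are hypothetical; assumes the default
// ReadParam/WriteParam constructors):
//   val rows: SCollection[TableRow] =
//     sc.read(TableRowJsonIO("gs://bucket/in/*.json"))(TableRowJsonIO.ReadParam())
//   rows.write(TableRowJsonIO("gs://bucket/out"))(TableRowJsonIO.WriteParam())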

object TableRowJsonIO {

  type ReadParam = TextIO.ReadParam
  val ReadParam = TextIO.ReadParam

  type WriteParam = TextIO.WriteParam
  object WriteParam {
    val DefaultSuffix: String = ".json"
    val DefaultNumShards: Int = TextIO.WriteParam.DefaultNumShards
    val DefaultCompression: Compression = TextIO.WriteParam.DefaultCompression
    val DefaultFilenamePolicySupplier: FilenamePolicySupplier =
      TextIO.WriteParam.DefaultFilenamePolicySupplier
    val DefaultPrefix: String = TextIO.WriteParam.DefaultPrefix
    val DefaultShardNameTemplate: String = TextIO.WriteParam.DefaultShardNameTemplate
    val DefaultTempDirectory: String = TextIO.WriteParam.DefaultTempDirectory

    def apply(
      suffix: String = DefaultSuffix,
      numShards: Int = DefaultNumShards,
      compression: Compression = DefaultCompression,
      filenamePolicySupplier: FilenamePolicySupplier = DefaultFilenamePolicySupplier,
      prefix: String = DefaultPrefix,
      shardNameTemplate: String = DefaultShardNameTemplate,
      tempDirectory: String = DefaultTempDirectory
    ): WriteParam = {
      TextIO.WriteParam(
        suffix = suffix,
        numShards = numShards,
        compression = compression,
        filenamePolicySupplier = filenamePolicySupplier,
        prefix = prefix,
        shardNameTemplate = shardNameTemplate,
        tempDirectory = tempDirectory
      )
    }
  }

}

object BigQueryTyped {
  import com.spotify.scio.bigquery.{Table => STable}

  @annotation.implicitNotFound(
    """
    Can't find annotation for type ${T}.
    Make sure this class is annotated with BigQueryType.fromStorage, BigQueryType.fromTable or
    BigQueryType.fromQuery.
    Alternatively, use BigQueryTyped.Storage("<table>"), BigQueryTyped.Table("<table>"),
    or BigQueryTyped.Query("<query>") to get a ScioIO instance. """
  )
  sealed trait IO[T <: HasAnnotation] {
    type F[_ <: HasAnnotation] <: ScioIO[_]
    def impl: F[T]
  }

  object IO {
    type Aux[T <: HasAnnotation, F0[_ <: HasAnnotation] <: ScioIO[_]] =
      IO[T] { type F[A <: HasAnnotation] = F0[A] }

    implicit def tableIO[T <: HasAnnotation: TypeTag: Coder](implicit
      t: BigQueryType.Table[T]
    ): Aux[T, Table] = new IO[T] {
      type F[A <: HasAnnotation] = Table[A]
      def impl: Table[T] = Table(STable.Spec(t.table))
    }

    implicit def queryIO[T <: HasAnnotation: TypeTag: Coder](implicit
      t: BigQueryType.Query[T]
    ): Aux[T, Select] = new IO[T] {
      type F[A <: HasAnnotation] = Select[A]
      def impl: Select[T] = Select(Query(t.queryRaw))
    }

    implicit def storageIO[T <: HasAnnotation: TypeTag: Coder](implicit
      t: BigQueryType.StorageOptions[T]
    ): Aux[T, Storage] = new IO[T] {
      type F[A <: HasAnnotation] = Storage[A]
      def impl: Storage[T] = Storage(STable.Spec(t.table), Nil, None)
    }
  }

  /**
   * Get a typed SCollection for a BigQuery table or a SELECT query.
   *
   * Note that `T` must be annotated with
   * [[com.spotify.scio.bigquery.types.BigQueryType.fromStorage BigQueryType.fromStorage]],
   * [[com.spotify.scio.bigquery.types.BigQueryType.fromTable BigQueryType.fromTable]], or
   * [[com.spotify.scio.bigquery.types.BigQueryType.fromQuery BigQueryType.fromQuery]].
   *
   * The source (table or query) specified in the annotation will be used.
   */
  @inline final def apply[T <: HasAnnotation](implicit t: IO[T]): t.F[T] =
    t.impl

  /**
   * Get a typed SCollection for a BigQuery SELECT query.
   *
   * Both [[https://cloud.google.com/bigquery/docs/reference/legacy-sql Legacy SQL]] and
   * [[https://cloud.google.com/bigquery/docs/reference/standard-sql/ Standard SQL]] dialects are
   * supported. By default the query dialect will be automatically detected. To override this
   * behavior, start the query string with `#legacysql` or `#standardsql`.
   */
  final case class Select[T <: HasAnnotation: TypeTag: Coder](query: Query) extends BigQueryIO[T] {
    override type ReadP = Unit
    override type WriteP = Nothing // ReadOnly

    private[this] lazy val underlying = {
      val fromAvro = BigQueryType[T].fromAvro
      val fromTableRow = BigQueryType[T].fromTableRow
      val reader = beam.BigQueryIO
        .read(new SerializableFunction[SchemaAndRecord, T] {
          override def apply(input: SchemaAndRecord): T = fromAvro(input.getRecord)
        })
      BigQueryTypedSelect(reader, query, fromTableRow)
    }

    override def testId: String = s"BigQueryIO(${query.underlying})"

    override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
      sc.read(underlying)(BigQueryTypedSelect.ReadParam())

    override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
      throw new UnsupportedOperationException("Select queries are read-only")

    override def tap(params: ReadP): Tap[T] = underlying.tap(BigQueryTypedSelect.ReadParam())
  }

  object Select {
    @inline final def apply[T <: HasAnnotation: TypeTag: Coder](
      query: String
    ): Select[T] = new Select[T](Query(query))
  }
  /** Get a typed SCollection for a BigQuery table. */
  final case class Table[T <: HasAnnotation: TypeTag: Coder](table: STable)
      extends BigQueryIO[T]
      with WriteResultIO[T] {
    override type ReadP = Unit
    override type WriteP = Table.WriteParam[T]

    private val underlying = BigQueryTypedTable[T](
      (i: SchemaAndRecord) => BigQueryType[T].fromAvro(i.getRecord),
      BigQueryType[T].toTableRow,
      BigQueryType[T].fromTableRow,
      table
    )

    override def testId: String = s"BigQueryIO(${table.spec})"

    override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
      sc.read(underlying)

    override protected def writeWithResult(
      data: SCollection[T],
      params: WriteP
    ): (Tap[T], SideOutputCollections) = {
      val outputs = data
        .withName(s"${data.tfName}$$Write")
        .write(underlying)(params)
        .outputs
        .get
      (tap(()), outputs)
    }

    override def tap(read: ReadP): Tap[T] = BigQueryTypedTap[T](table, underlying.fn)
  }

  object Table {
    final case class WriteParam[T] private (
      method: WriteMethod,
      writeDisposition: WriteDisposition,
      createDisposition: CreateDisposition,
      timePartitioning: TimePartitioning,
      clustering: Clustering,
      triggeringFrequency: Duration,
      sharding: Sharding,
      failedInsertRetryPolicy: InsertRetryPolicy,
      successfulInsertsPropagation: Boolean,
      extendedErrorInfo: Boolean,
      configOverride: WriteParam.ConfigOverride[T]
    ) extends Writes.WriteParam[T]

    object WriteParam extends Writes.WriteParamDefaults {
      @inline final def apply[T](
        method: WriteMethod = DefaultMethod,
        writeDisposition: WriteDisposition = DefaultWriteDisposition,
        createDisposition: CreateDisposition = DefaultCreateDisposition,
        timePartitioning: TimePartitioning = DefaultTimePartitioning,
        clustering: Clustering = DefaultClustering,
        triggeringFrequency: Duration = DefaultTriggeringFrequency,
        sharding: Sharding = DefaultSharding,
        failedInsertRetryPolicy: InsertRetryPolicy = DefaultFailedInsertRetryPolicy,
        successfulInsertsPropagation: Boolean = DefaultSuccessfulInsertsPropagation,
        extendedErrorInfo: Boolean = DefaultExtendedErrorInfo,
        configOverride: ConfigOverride[T] = DefaultConfigOverride
      ): WriteParam[T] = new WriteParam(
        method,
        writeDisposition,
        createDisposition,
        timePartitioning,
        clustering,
        triggeringFrequency,
        sharding,
        failedInsertRetryPolicy,
        successfulInsertsPropagation,
        extendedErrorInfo,
        configOverride
      )

      implicit private[Table] def typedTableWriteParam[T: TypeTag, Info](
        params: Table.WriteParam[T]
      ): BigQueryTypedTable.WriteParam[T] =
        BigQueryTypedTable.WriteParam(
          params.method,
          BigQueryType[T].schema,
          params.writeDisposition,
          params.createDisposition,
          BigQueryType[T].tableDescription.orNull,
          params.timePartitioning,
          params.clustering,
          params.triggeringFrequency,
          params.sharding,
          params.failedInsertRetryPolicy,
          params.successfulInsertsPropagation,
          params.extendedErrorInfo,
          params.configOverride
        )
    }
  }
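
  // A minimal typed-write sketch (the table spec and `Row` case class are hypothetical):
  //   data.write(BigQueryTyped.Table[Row](STable.Spec("project:dataset.table")))(
  //     Table.WriteParam(createDisposition = CreateDisposition.CREATE_IF_NEEDED)
  //   )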
  /** Get a typed SCollection for a BigQuery table using the storage API. */
  final case class Storage[T <: HasAnnotation: TypeTag: Coder](
    table: STable,
    selectedFields: List[String],
    rowRestriction: Option[String]
  ) extends BigQueryIO[T] {
    override type ReadP = Unit
    override type WriteP = Nothing // ReadOnly

    override def testId: String =
      s"BigQueryIO(${table.spec}, List(${selectedFields.mkString(",")}), $rowRestriction)"

    override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = {
      val coder = CoderMaterializer.beam(sc, Coder[T])
      val fromAvro = BigQueryType[T].fromAvro
      val reader = beam.BigQueryIO
        .read(new SerializableFunction[SchemaAndRecord, T] {
          override def apply(input: SchemaAndRecord): T = fromAvro(input.getRecord)
        })
        .withCoder(coder)
      Reads.bqReadStorage(sc)(reader, table, selectedFields, rowRestriction)
    }

    override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
      throw new UnsupportedOperationException("Storage API is read-only")

    override def tap(read: ReadP): Tap[T] = {
      val fn = BigQueryType[T].fromTableRow
      val readOptions = StorageUtil.tableReadOptions(selectedFields, rowRestriction)
      BigQueryStorageTap(table, readOptions).map(fn)
    }
  }

  final case class StorageQuery[T <: HasAnnotation: TypeTag: Coder](sqlQuery: Query)
      extends BigQueryIO[T] {
    override type ReadP = Unit
    override type WriteP = Nothing // ReadOnly

    private[this] lazy val underlying = {
      val fromAvro = BigQueryType[T].fromAvro
      val fromTableRow = BigQueryType[T].fromTableRow
      val reader = beam.BigQueryIO
        .read(new SerializableFunction[SchemaAndRecord, T] {
          override def apply(input: SchemaAndRecord): T = fromAvro(input.getRecord)
        })
        .withMethod(ReadMethod.DIRECT_READ)
      BigQueryTypedSelect(reader, sqlQuery, fromTableRow)
    }

    override def testId: String = s"BigQueryIO($sqlQuery)"

    override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
      sc.read(underlying)(BigQueryTypedSelect.ReadParam())

    override protected def write(data: SCollection[T], params: WriteP): Tap[T] =
      throw new UnsupportedOperationException("Storage API is read-only")

    override def tap(read: ReadP): Tap[T] = underlying.tap(BigQueryTypedSelect.ReadParam())
  }

  private[scio] def dynamic[T <: HasAnnotation: TypeTag: Coder](
    newSource: Option[Source]
  ): ScioIO.ReadOnly[T, Unit] = {
    val bqt = BigQueryType[T]
    newSource match {
      // newSource is missing; T's companion object must have either table or query.
      // The None case is only there for legacy support and should not exist once
      // BigQueryScioContext.typedBigQuery is removed.
      case None if bqt.isTable =>
        val table = STable.Spec(bqt.table.get)
        ScioIO.ro[T](Table[T](table))
      case None if bqt.isQuery =>
        val query = Query(bqt.queryRaw.get)
        Select[T](query)
      case Some(s: STable) =>
        ScioIO.ro(Table[T](s))
      case Some(s: Query) =>
        Select[T](s)
      case _ =>
        throw new IllegalArgumentException(s"Missing table or query field in companion object")
    }
  }
}