// gcp4zio/bq/BQImpl.scala
// Scala interface to Google Cloud API based on ZIO
package gcp4zio
package bq
import com.google.cloud.bigquery._
import gcp4zio.bq.FileType.{CSV, JSON, ORC, PARQUET}
import zio.{Task, ZIO}
import java.util.UUID
import scala.jdk.CollectionConverters._
import zio.stream.{Stream, ZPipeline, ZStream}
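/** BigQuery-backed implementation of the [[BQ]] service, built on a [[com.google.cloud.bigquery.BigQuery]] client */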
@SuppressWarnings(
Array(
"org.wartremover.warts.Var",
"org.wartremover.warts.ToString",
"org.wartremover.warts.Throw",
"org.wartremover.warts.PlatformDefault",
"org.wartremover.warts.AutoUnboxing"
)
)
case class BQImpl(client: BigQuery) extends BQ {
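/** Convert a [[FileType]] into the matching BigQuery [[FormatOptions]]; unrecognised types fall back to Parquet */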
private def getFormatOptions(inputType: FileType): FormatOptions = inputType match {
case PARQUET => FormatOptions.parquet
case ORC => FormatOptions.orc
case CSV(fieldDelimiter, headerPresent, _, _) =>
CsvOptions
.newBuilder()
.setSkipLeadingRows(if (headerPresent) 1 else 0)
.setFieldDelimiter(fieldDelimiter)
.build()
case _ => FormatOptions.parquet
}
/** Execute a SQL query on BigQuery. This API does not return any data, so it can be used to run DML/DDL statements
* @param query
* SQL statement (e.g. INSERT, CREATE) to execute
* @return
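*
* Example (a minimal sketch; the client setup and DDL statement are illustrative):
* {{{
* val bq: BQImpl = BQImpl(BigQueryOptions.getDefaultInstance.getService)
* val job: Task[Job] = bq.executeQuery("CREATE TABLE IF NOT EXISTS my_dataset.tmp (id INT64)")
* }}}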
*/
def executeQuery(query: String): Task[Job] = ZIO.attempt {
val queryConfig: QueryJobConfiguration = QueryJobConfiguration
.newBuilder(query)
.setUseLegacySql(false)
.build()
val jobId = JobId.of(UUID.randomUUID().toString)
val queryJob = client.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build())
// Wait for the query to complete; waitFor returns null if the job no longer exists.
// An InterruptedException here fails the surrounding ZIO.attempt instead of being silently swallowed.
val completedJob = queryJob.waitFor()
if (completedJob == null)
throw new RuntimeException("Job no longer exists")
else if (completedJob.getStatus.getError != null) {
logger.error(completedJob.getStatus.getState.toString)
logger.info(s"EmailId: ${completedJob.getUserEmail}")
throw new RuntimeException(s"Error ${completedJob.getStatus.getError.getMessage}")
} else {
logger.info("Executed query successfully")
completedJob
}
}
/** Run a SQL (SELECT) query on BigQuery and fetch all result rows eagerly into memory
* @param query
* SQL query(SELECT) to execute
* @param fn
* function to convert FieldValueList to Scala Type T
* @tparam T
* Scala Type for output rows
* @return
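*
* Example (a minimal sketch, assuming a `bq: BQImpl` instance; the query and row mapping are illustrative):
* {{{
* val names: Task[Iterable[String]] =
* bq.fetchResults("SELECT name FROM my_dataset.users")(row => row.get("name").getStringValue)
* }}}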
*/
def fetchResults[T](query: String)(fn: FieldValueList => T): Task[Iterable[T]] = ZIO.attempt {
val queryConfig: QueryJobConfiguration = QueryJobConfiguration
.newBuilder(query)
.setUseLegacySql(false)
.build()
val jobId = JobId.of(UUID.randomUUID().toString)
val queryJob = client.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build())
// Wait for the query to complete.
val job = queryJob.waitFor()
val result: TableResult = job.getQueryResults()
result.iterateAll().asScala.map(fn)
}
/** Run a SQL (SELECT) query on BigQuery and fetch result rows as a ZStream, suitable for large result sets
* @param query
* SQL query(SELECT) to execute
* @param fn
* function to convert FieldValueList to Scala Type T
* @tparam T
* Scala Type for output rows
* @return
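*
* Example (a minimal sketch, assuming a `bq: BQImpl` instance; the query and row mapping are illustrative):
* {{{
* val ids: Task[Stream[Throwable, Long]] =
* bq.fetchStreamingResults("SELECT id FROM my_dataset.users")(row => row.get("id").getLongValue)
* }}}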
*/
def fetchStreamingResults[T](query: String)(fn: FieldValueList => T): Task[Stream[Throwable, T]] = ZIO.attempt {
val queryConfig: QueryJobConfiguration = QueryJobConfiguration
.newBuilder(query)
.setUseLegacySql(false)
.build()
val jobId = JobId.of(UUID.randomUUID().toString)
val queryJob = client.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build())
// Wait for the query to complete.
val job = queryJob.waitFor()
val fnTransformer: ZPipeline[Any, Nothing, FieldValueList, T] = ZPipeline.map(fn)
ZStream
.fromJavaStream(
job.getQueryResults().streamAll()
)
.via(fnTransformer)
}
/** Load data into BigQuery from GCS
* @param sourcePath
* Source GCS path from which data is loaded into BigQuery
* @param sourceFormat
* File format of source data in GCS
* @param targetProject
* Target Google Project ID
* @param targetDataset
* Target Dataset name
* @param targetTable
* Target Table name
* @param writeDisposition
* Write Disposition for table
* @param createDisposition
* Create Disposition for table
* @param schema
* Schema of the source files (useful for CSV and JSON, which are not self-describing)
* @return
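*
* Example (a minimal sketch, assuming a `bq: BQImpl` instance; bucket and table names are illustrative):
* {{{
* val loaded: Task[Map[String, Long]] = bq.loadTable(
* "gs://my-bucket/landing/*.parquet",
* FileType.PARQUET,
* None,
* "my_dataset",
* "my_table",
* JobInfo.WriteDisposition.WRITE_TRUNCATE,
* JobInfo.CreateDisposition.CREATE_IF_NEEDED,
* None
* )
* }}}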
*/
override def loadTable(
sourcePath: String,
sourceFormat: FileType,
targetProject: scala.Option[String],
targetDataset: String,
targetTable: String,
writeDisposition: JobInfo.WriteDisposition,
createDisposition: JobInfo.CreateDisposition,
schema: scala.Option[Schema]
): Task[Map[String, Long]] = ZIO.attempt {
val tableId = targetProject match {
case Some(project) => TableId.of(project, targetDataset, targetTable)
case None => TableId.of(targetDataset, targetTable)
}
val jobConfiguration: JobConfiguration = sourceFormat match {
case FileType.BQ =>
QueryJobConfiguration
.newBuilder(sourcePath)
.setUseLegacySql(false)
.setDestinationTable(tableId)
.setWriteDisposition(writeDisposition)
.setCreateDisposition(createDisposition)
.setAllowLargeResults(true)
.build()
case ORC | PARQUET | CSV(_, _, _, _) =>
val builder = LoadJobConfiguration
.builder(tableId, sourcePath)
.setFormatOptions(getFormatOptions(sourceFormat))
.setWriteDisposition(writeDisposition)
.setCreateDisposition(createDisposition)
schema.foreach(s => builder.setSchema(s)) // schema is required for CSV/JSON; ORC/Parquet are self-describing
builder.build()
case _ => throw BQLoadException("Unsupported Input Type")
}
// Create BQ job
val jobId: JobId = JobId.of(UUID.randomUUID().toString)
val job: Job = client.create(JobInfo.newBuilder(jobConfiguration).setJobId(jobId).build())
// Wait for the job to complete
val completedJob = job.waitFor()
val targetTableDef = client.getTable(tableId).getDefinition[StandardTableDefinition]
if (completedJob.getStatus.getError == null) {
logger.info(s"Source path: $sourcePath")
logger.info(s"Destination table: $targetDataset.$targetTable")
logger.info(s"Job State: ${completedJob.getStatus.getState}")
logger.info(s"Loaded rows: ${targetTableDef.getNumRows}")
logger.info(s"Loaded rows size: ${targetTableDef.getNumBytes / 1000000.0} MB")
} else {
throw BQLoadException(
s"Could not load data in $sourceFormat format into table $targetDataset.$targetTable due to error ${completedJob.getStatus.getError.getMessage}"
)
}
Map(targetTable -> targetTableDef.getNumRows)
}
/** Export data from BigQuery to GCS
* @param sourceDataset
* Source Dataset name
* @param sourceTable
* Source Table name
* @param sourceProject
* Source Google Project ID
* @param targetPath
* Target GCS path
* @param targetFormat
* File format for target GCS location
* @param targetFileName
* Filename to use when a single output file is desired in the target location
* @param targetCompressionType
* Compression for destination files
* @return
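*
* Example (a minimal sketch, assuming a `bq: BQImpl` instance; paths are illustrative):
* {{{
* val exported: Task[Unit] = bq.exportTable(
* "my_dataset",
* "my_table",
* None,
* "gs://my-bucket/export",
* FileType.PARQUET,
* None
* )
* }}}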
*/
override def exportTable(
sourceDataset: String,
sourceTable: String,
sourceProject: scala.Option[String],
targetPath: String,
targetFormat: FileType,
targetFileName: scala.Option[String],
targetCompressionType: String = "gzip"
): Task[Unit] = ZIO.attempt {
val tableId = sourceProject match {
case Some(project) => TableId.of(project, sourceDataset, sourceTable)
case None => TableId.of(sourceDataset, sourceTable)
}
val targetFormatStr = targetFormat match {
case CSV(_, _, _, _) => "CSV"
case PARQUET => "PARQUET"
case JSON(_) => "JSON" // without this case, the JSON extract branch below was unreachable
case fmt => throw BQLoadException(s"Unsupported target format $fmt")
}
val targetUri =
targetPath + "/" + targetFileName.getOrElse(s"part-*.${targetFormatStr.toLowerCase}")
val extractJobConfiguration: ExtractJobConfiguration = targetFormat match {
case CSV(delimiter, _, _, _) =>
ExtractJobConfiguration
.newBuilder(tableId, targetUri)
.setFormat("CSV")
.setFieldDelimiter(delimiter)
.build()
case PARQUET =>
ExtractJobConfiguration
.newBuilder(tableId, targetUri)
.setFormat("PARQUET")
.setCompression(targetCompressionType)
.build()
case JSON(_) =>
ExtractJobConfiguration
.newBuilder(tableId, targetUri)
.setFormat("NEWLINE_DELIMITED_JSON") // BigQuery's destinationFormat value for JSON exports
.setCompression(targetCompressionType)
.build()
case _ => throw BQLoadException("Unsupported Destination Format")
}
val job = client.create(JobInfo.of(extractJobConfiguration))
val completedJob = job.waitFor()
if (completedJob.getStatus.getError == null) {
logger.info(s"Source table: $sourceDataset.$sourceTable")
logger.info(s"Destination path: $targetPath")
logger.info(s"Job State: ${completedJob.getStatus.getState}")
} else {
// Previously errors were only logged and the final throw was unreachable; fail the Task instead
logger.error(s"BigQuery was unable to extract due to an error: ${completedJob.getStatus.getError}")
throw BQLoadException(
s"Could not export data from BQ table $sourceDataset.$sourceTable to location $targetPath due to error ${completedJob.getStatus.getError.getMessage}"
)
}
}
/** Execute an arbitrary function against the underlying BigQuery client and return its result
*
* @param f
* function from the BigQuery client to an output value T
* @tparam T
* Output type
* @return
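*
* Example (a minimal sketch, assuming a `bq: BQImpl` instance):
* {{{
* val datasets: Task[Iterable[Dataset]] = bq.execute(_.listDatasets().iterateAll().asScala)
* }}}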
*/
override def execute[T](f: BigQuery => T): Task[T] = ZIO.attempt(f(client))
}