All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.spotify.scio.snowflake.syntax.SCollectionSyntax.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2024 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.snowflake.syntax

import com.spotify.scio.coders.Coder
import com.spotify.scio.io.ClosedTap
import com.spotify.scio.snowflake.{SnowflakeConnectionOptions, SnowflakeIO, SnowflakeTable}
import com.spotify.scio.values.SCollection
import kantan.csv.RowCodec
import org.apache.beam.sdk.io.snowflake.data.SnowflakeTableSchema
import org.apache.beam.sdk.io.snowflake.enums.{CreateDisposition, WriteDisposition}
import org.joda.time.Duration

/**
 * Enhanced version of [[com.spotify.scio.values.SCollection SCollection]] with Snowflake methods.
 */
final class SnowflakeSCollectionOps[T](private val self: SCollection[T]) extends AnyVal {

  /**
   * Save this SCollection as a Snowflake database table. The [[SCollection]] is written to CSV
   * files in a bucket, using the implicit [[kantan.csv.RowCodec]] to encode each element as a CSV
   * row. The bucket is then COPYied to the Snowflake table.
   *
   * @see
   *   ''Writing to Snowflake tables'' in the
   *   [[https://beam.apache.org/documentation/io/built-in/snowflake/ Beam `SnowflakeIO` documentation]]
   * @param connectionOptions
   *   options for configuring a Snowflake integration
   * @param table
   *   name of the Snowflake table to write to
   * @param tableSchema
   *   table schema to be used when creating the table
   * @param createDisposition
   *   disposition to be used during table preparation
   * @param writeDisposition
   *   disposition to be used during the write-to-table phase
   * @param snowPipe
   *   name of the
   *   [[https://docs.snowflake.com/en/user-guide/data-load-snowpipe-intro SnowPipe]] created in the
   *   Snowflake dashboard
   * @param shardNumber
   *   number of shards that are created per window
   * @param flushRowLimit
   *   limit on the number of rows saved to a staged file before it is loaded to Snowflake
   * @param flushTimeLimit
   *   how often staged files are created, and therefore how often they are ingested by Snowflake,
   *   during streaming
   * @param storageIntegrationName
   *   Storage Integration in Snowflake to be used
   * @param stagingBucketName
   *   cloud bucket (currently GCS) used as the temporary location of the CSV files for the COPY
   *   statement
   * @param quotationMark
   *   Snowflake-specific quotations around strings
   * @param configOverride
   *   a `SnowflakeIO.WriteParam.ConfigOverride` forwarded unchanged in the write parameters;
   *   presumably a hook for customising the underlying write configuration (confirm against
   *   `SnowflakeIO.WriteParam`)
   * @param rowCodec
   *   implicit CSV codec for `T`, required in scope by this method
   * @param coder
   *   implicit Scio [[com.spotify.scio.coders.Coder Coder]] for `T`
   * @return
   *   a [[com.spotify.scio.io.ClosedTap ClosedTap]] typed as `ClosedTap[Nothing]`: this is a write
   *   operation, so the tap exposes no elements to read back
   */
  def saveAsSnowflake(
    connectionOptions: SnowflakeConnectionOptions,
    table: String,
    tableSchema: SnowflakeTableSchema = SnowflakeIO.WriteParam.DefaultTableSchema,
    createDisposition: CreateDisposition = SnowflakeIO.WriteParam.DefaultCreateDisposition,
    writeDisposition: WriteDisposition = SnowflakeIO.WriteParam.DefaultWriteDisposition,
    snowPipe: String = SnowflakeIO.WriteParam.DefaultSnowPipe,
    shardNumber: Integer = SnowflakeIO.WriteParam.DefaultShardNumber,
    flushRowLimit: Integer = SnowflakeIO.WriteParam.DefaultFlushRowLimit,
    flushTimeLimit: Duration = SnowflakeIO.WriteParam.DefaultFlushTimeLimit,
    storageIntegrationName: String = SnowflakeIO.WriteParam.DefaultStorageIntegrationName,
    stagingBucketName: String = SnowflakeIO.WriteParam.DefaultStagingBucketName,
    quotationMark: String = SnowflakeIO.WriteParam.DefaultQuotationMark,
    configOverride: SnowflakeIO.WriteParam.ConfigOverride[T] =
      SnowflakeIO.WriteParam.DefaultConfigOverride
  )(implicit rowCodec: RowCodec[T], coder: Coder[T]): ClosedTap[Nothing] = {
    // Bundle every tuning knob into the IO's write-parameter object; all values are forwarded
    // as given, so omitted arguments fall back to the SnowflakeIO.WriteParam defaults.
    val param = SnowflakeIO.WriteParam(
      tableSchema = tableSchema,
      createDisposition = createDisposition,
      writeDisposition = writeDisposition,
      snowPipe = snowPipe,
      shardNumber = shardNumber,
      flushRowLimit = flushRowLimit,
      flushTimeLimit = flushTimeLimit,
      storageIntegrationName = storageIntegrationName,
      stagingBucketName = stagingBucketName,
      quotationMark = quotationMark,
      configOverride = configOverride
    )
    // The connection and target table identify the IO; the write itself is delegated to it.
    self.write(SnowflakeTable[T](connectionOptions, table))(param)
  }
}

/**
 * Syntax mix-in that makes the Snowflake operations of [[SnowflakeSCollectionOps]] available on
 * any [[com.spotify.scio.values.SCollection SCollection]] through an implicit conversion.
 */
trait SCollectionSyntax {

  /** Wraps `sc` in a [[SnowflakeSCollectionOps]] so its Snowflake methods can be called on it. */
  implicit def snowflakeSCollectionOps[T](sc: SCollection[T]): SnowflakeSCollectionOps[T] =
    new SnowflakeSCollectionOps[T](sc)
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy