/*
* Copyright 2024 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.scio.parquet.avro

import com.spotify.scio.parquet.ParquetOutputFile
import org.apache.avro.Schema
import org.apache.beam.sdk.io.FileIO
import org.apache.beam.sdk.io.hadoop.SerializableConfiguration
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.apache.parquet.hadoop.{ParquetOutputFormat, ParquetWriter}
import java.nio.channels.WritableByteChannel
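
/**
 * A Beam `FileIO.Sink` that writes Avro records of type `T` as a Parquet file.
 *
 * A minimal usage sketch, wiring the sink into Beam's `FileIO` (illustrative
 * only; `records`, `schema`, and the output path are hypothetical and not part
 * of this file):
 * {{{
 * import org.apache.avro.generic.GenericRecord
 * import org.apache.hadoop.conf.Configuration
 *
 * records.apply(
 *   FileIO
 *     .write[GenericRecord]()
 *     .via(new ParquetAvroSink[GenericRecord](
 *       schema,
 *       CompressionCodecName.SNAPPY,
 *       new SerializableConfiguration(new Configuration())
 *     ))
 *     .to("gs://bucket/output")
 *     .withSuffix(".parquet")
 * )
 * }}}
 */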
class ParquetAvroSink[T](
  schema: Schema,
  val compression: CompressionCodecName,
  val conf: SerializableConfiguration
) extends FileIO.Sink[T] {
  // Keep the schema as its JSON string so the sink can be serialized to
  // workers; it is re-parsed in open()
  private val schemaString = schema.toString
  private var writer: ParquetWriter[T] = _

  override def open(channel: WritableByteChannel): Unit = {
    val schema = new Schema.Parser().parse(schemaString)
    // Row-group size is taken from the Hadoop conf (see the sketch after this class)
    // https://github.com/apache/parquet-mr/tree/master/parquet-hadoop#class-parquetoutputformat
    val rowGroupSize =
      conf.get.getInt(ParquetOutputFormat.BLOCK_SIZE, ParquetWriter.DEFAULT_BLOCK_SIZE)
    writer = AvroParquetWriter
      .builder[T](new ParquetOutputFile(channel))
      .withSchema(schema)
      .withCompressionCodec(compression)
      .withConf(conf.get)
      .withRowGroupSize(rowGroupSize)
      .build
  }

  override def write(element: T): Unit = writer.write(element)

  // FileIO invokes flush() once, just before closing the channel; closing the
  // writer flushes buffered row groups and writes the Parquet footer
  override def flush(): Unit = writer.close()
}
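
// An illustrative sketch of tuning the row-group size read in open() above,
// via the Hadoop configuration handed to the sink. The 128 MB value, record
// type, and codec are arbitrary examples, not defaults defined in this file:
//
//   import org.apache.avro.generic.GenericRecord
//   import org.apache.hadoop.conf.Configuration
//
//   val hadoopConf = new Configuration()
//   hadoopConf.setInt(ParquetOutputFormat.BLOCK_SIZE, 128 * 1024 * 1024)
//   val sink = new ParquetAvroSink[GenericRecord](
//     schema,
//     CompressionCodecName.GZIP,
//     new SerializableConfiguration(hadoopConf)
//   )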