com.github.mjakubowski84.parquet4s.SingleFileParquetSink.scala

package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.ScalaCompat.Done
import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.{Flow, Keep, Sink}
import org.apache.parquet.hadoop.ParquetWriter as HadoopParquetWriter
import org.apache.parquet.io.OutputFile
import org.apache.parquet.schema.MessageType
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.Future
import scala.util.Try
import scala.util.control.NonFatal

object SingleFileParquetSink {

  trait ToParquet {

    /** Creates a builder of a sink that writes data of the given type
      * @tparam T
      *   Schema type
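      * @example
      *   A minimal sketch of typical usage. `User` and the target path are illustrative, and the
      *   `ParquetStreams.toParquetSingleFile` entry point is assumed to be the public way of
      *   reaching this builder, as in the parquet4s documentation:
      *   {{{
      *   case class User(id: Long, name: String)
      *
      *   val sink: Sink[User, Future[Done]] =
      *     ParquetStreams.toParquetSingleFile
      *       .of[User]
      *       .write(Path("/tmp/users.parquet"))
      *   }}}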
      */
    def of[T: ParquetSchemaResolver: ParquetRecordEncoder]: Builder[T]

    /** Creates a builder of a sink that writes generic records
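      * @example
      *   A rough sketch, assuming the schema is assembled with Parquet's own `Types` builder and
      *   that the sink is reached via the `ParquetStreams.toParquetSingleFile` entry point:
      *   {{{
      *   import org.apache.parquet.schema.LogicalTypeAnnotation
      *   import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
      *   import org.apache.parquet.schema.Types
      *
      *   val schema = Types.buildMessage()
      *     .required(PrimitiveTypeName.INT64).named("id")
      *     .required(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("name")
      *     .named("user")
      *
      *   val sink: Sink[RowParquetRecord, Future[Done]] =
      *     ParquetStreams.toParquetSingleFile
      *       .generic(schema)
      *       .write(Path("/tmp/users.parquet"))
      *   }}}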
      */
    def generic(schema: MessageType): Builder[RowParquetRecord]

    /** Creates a builder of a sink that processes data of a given type using a
      * [[org.apache.parquet.hadoop.ParquetWriter]] built from a provided
      * [[org.apache.parquet.hadoop.ParquetWriter.Builder]].
      * @tparam T
      *   Schema type
      * @tparam B
      *   Type of custom [[org.apache.parquet.hadoop.ParquetWriter.Builder]]
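      * @example
      *   A hedged sketch that plugs in the Avro writer builder; it requires the optional
      *   `parquet-avro` dependency, and `outputFile` (an `OutputFile`) and `avroSchema` (an Avro
      *   `Schema`) are placeholders:
      *   {{{
      *   import org.apache.avro.generic.GenericRecord
      *   import org.apache.parquet.avro.AvroParquetWriter
      *
      *   val sink: Sink[GenericRecord, Future[Done]] =
      *     ParquetStreams.toParquetSingleFile
      *       .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](
      *         AvroParquetWriter.builder[GenericRecord](outputFile).withSchema(avroSchema)
      *       )
      *       .write
      *   }}}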
      */
    @experimental
    def custom[T, B <: HadoopParquetWriter.Builder[T, B]](builder: B): CustomBuilder[T]
  }

  private[parquet4s] object ToParquetImpl extends ToParquet {
    override def of[T: ParquetSchemaResolver: ParquetRecordEncoder]: Builder[T] =
      BuilderImpl()
    override def generic(schema: MessageType): Builder[RowParquetRecord] =
      BuilderImpl()(
        schemaResolver = RowParquetRecord.genericParquetSchemaResolver(schema),
        encoder        = RowParquetRecord.genericParquetRecordEncoder
      )
    override def custom[T, B <: HadoopParquetWriter.Builder[T, B]](builder: B): CustomBuilder[T] =
      CustomBuilderImpl(builder)
  }

  trait Builder[T] {

    /** @param options
      *   writer options
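      * @example
      *   For instance, switching on Snappy compression; a sketch that assumes `Options` exposes a
      *   `compressionCodecName` parameter (as described in the parquet4s documentation), where
      *   `builder` stands for any `Builder[T]` instance:
      *   {{{
      *   import org.apache.parquet.hadoop.metadata.CompressionCodecName
      *
      *   builder.options(ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY))
      *   }}}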
      */
    def options(options: ParquetWriter.Options): Builder[T]

    /** @param path
      *   at which data is supposed to be written
      * @return
      *   final [[com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Sink]]
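      * @example
      *   A sketch of materializing the sink; an implicit `ActorSystem` (Akka or Pekko, depending
      *   on the module in use) and the matching `Source` import are assumed to be in scope, and
      *   `users` is an illustrative collection of `User` values:
      *   {{{
      *   val done: Future[Done] =
      *     Source(users).runWith(
      *       ParquetStreams.toParquetSingleFile.of[User].write(Path("/tmp/users.parquet"))
      *     )
      *   }}}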
      */
    def write(path: Path): Sink[T, Future[Done]]

    /** @param outputFile
      *   to which data is supposed to be written
      * @return
      *   final [[com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Sink]]
      */
    def write(outputFile: OutputFile): Sink[T, Future[Done]]
  }

  private case class BuilderImpl[T](options: ParquetWriter.Options = ParquetWriter.Options())(implicit
      schemaResolver: ParquetSchemaResolver[T],
      encoder: ParquetRecordEncoder[T]
  ) extends Builder[T] {
    override def options(options: ParquetWriter.Options): Builder[T]  = this.copy(options = options)
    override def write(path: Path): Sink[T, Future[Done]]             = write(path.toOutputFile(options))
    override def write(outputFile: OutputFile): Sink[T, Future[Done]] = rowParquetRecordSink(outputFile, options)
  }

  trait CustomBuilder[T] {

    /** @param options
      *   writer options
      */
    def options(options: ParquetWriter.Options): CustomBuilder[T]

    /** @return
      *   final [[com.github.mjakubowski84.parquet4s.ScalaCompat.stream.scaladsl.Sink]]
      */
    def write: Sink[T, Future[Done]]
  }

  private case class CustomBuilderImpl[T, B <: HadoopParquetWriter.Builder[T, B]](
      builder: B,
      options: ParquetWriter.Options = ParquetWriter.Options()
  ) extends CustomBuilder[T] {
    override def options(options: ParquetWriter.Options): CustomBuilder[T] =
      this.copy(options = options)

    override def write: Sink[T, Future[Done]] =
      sink(options.applyTo[T, B](builder).build())
  }

  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

  // Resolves the Parquet schema for T, creates the low-level Hadoop writer for the given output
  // file and prepends a Flow that encodes every element into a RowParquetRecord before handing
  // it to the generic sink below.
  private def rowParquetRecordSink[T: ParquetSchemaResolver](
      outputFile: OutputFile,
      options: ParquetWriter.Options
  )(implicit encoder: ParquetRecordEncoder[T]): Sink[T, Future[Done]] = {
    val valueCodecConfiguration = ValueCodecConfiguration(options)
    val schema                  = ParquetSchemaResolver.resolveSchema[T]
    val writer                  = ParquetWriter.internalWriter(outputFile, schema, encoder, options)

    def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration)

    Flow[T]
      .map(encode)
      .toMat(sink(writer))(Keep.right)
  }

  // Writes each element with the given Hadoop writer while counting records, closes the writer
  // when the stream completes (or, best effort, when it fails) and materializes to a Future[Done].
  private def sink[T](writer: HadoopParquetWriter[T]): Sink[T, Future[Done]] =
    Flow[T]
      .fold(zero = 0) { case (acc, record) => writer.write(record); acc + 1 }
      .map { count =>
        if (logger.isDebugEnabled) logger.debug(s"$count records were successfully written")
        try writer.close()
        catch {
          case _: NullPointerException => // ignore a NullPointerException thrown by a known Parquet bug on close
        }
      }
      .recover { case NonFatal(e) =>
        Try(writer.close())
        throw e
      }
      .toMat(Sink.ignore)(Keep.right)

}



