com.github.mjakubowski84.parquet4s.parquet.writer.scala

package com.github.mjakubowski84.parquet4s.parquet

import cats.effect.{Resource, Sync}
import cats.implicits.*
import com.github.mjakubowski84.parquet4s.{
  ParquetRecordEncoder,
  ParquetSchemaResolver,
  ParquetWriter,
  Path,
  RowParquetRecord,
  ValueCodecConfiguration,
  experimental
}
import fs2.{Chunk, Pipe, Pull, Stream}
import org.apache.parquet.hadoop.ParquetWriter as HadoopParquetWriter
import org.apache.parquet.io.OutputFile
import org.apache.parquet.schema.MessageType
import scala.util.control.NonFatal

private[parquet4s] object writer {

  trait ToParquet[F[_]] {

    /** Creates a builder of pipe that processes data of given type
      * @tparam T
      *   Schema type
      */
    def of[T: ParquetSchemaResolver: ParquetRecordEncoder]: Builder[F, T]

    /** Creates a builder of pipe that processes generic records
      */
    def generic(schema: MessageType): Builder[F, RowParquetRecord]

    /** Creates a builder of pipe that processes data of a given type using a
      * [[org.apache.parquet.hadoop.ParquetWriter]] built from a provided
      * [[org.apache.parquet.hadoop.ParquetWriter.Builder]].
      * @tparam T
      *   Schema type
      * @tparam B
      *   Type of custom [[org.apache.parquet.hadoop.ParquetWriter.Builder]]
      */
    @experimental
    def custom[T, B <: HadoopParquetWriter.Builder[T, B]](builder: B): CustomBuilder[F, T]
  }
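
  // Illustrative usage only, not part of this source: in parquet4s's fs2 module a
  // `ToParquet[F]` is expected to be obtained from the package-level API (e.g. something
  // like `writeSingleFile[F]`); that entry point and the `Data` case class below are
  // assumptions made for the sake of the example.
  //
  //   import cats.effect.IO
  //
  //   case class Data(id: Int, text: String)
  //
  //   val toFile: Pipe[IO, Data, Nothing] =
  //     writeSingleFile[IO]                 // assumed entry point returning ToParquet[IO]
  //       .of[Data]                         // derives schema and encoder for Data
  //       .write(Path("/tmp/data.parquet"))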

  private[parquet4s] class ToParquetImpl[F[_]: Sync] extends ToParquet[F] {
    override def of[T: ParquetSchemaResolver: ParquetRecordEncoder]: Builder[F, T] =
      BuilderImpl()
    override def generic(schema: MessageType): Builder[F, RowParquetRecord] =
      BuilderImpl()(
        schemaResolver = RowParquetRecord.genericParquetSchemaResolver(schema),
        encoder        = RowParquetRecord.genericParquetRecordEncoder,
        sync           = Sync[F]
      )
    override def custom[T, B <: HadoopParquetWriter.Builder[T, B]](builder: B): CustomBuilder[F, T] =
      CustomBuilderImpl(builder = builder, options = ParquetWriter.Options())
  }

  trait Builder[F[_], T] {

    /** @param options
      *   writer options
      */
    def options(options: ParquetWriter.Options): Builder[F, T]

    /** @param path
      *   at which data is supposed to be written
      * @return
      *   final [[fs2.Pipe]]
      */
    def write(path: Path): Pipe[F, T, Nothing]

    /** @param outputFile
      *   to which data is supposed to be written
      * @return
      *   final [[fs2.Pipe]]
      */
    @experimental
    def write(outputFile: OutputFile): Pipe[F, T, Nothing]
  }
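
  // Sketch of configuring a Builder before materialising the pipe. `toParquet` stands for a
  // `Builder[IO, Data]` obtained via `ToParquet#of`, and the SNAPPY option shown here assumes
  // that `ParquetWriter.Options` exposes a `compressionCodecName` field; both are assumptions
  // of this example.
  //
  //   import org.apache.parquet.hadoop.metadata.CompressionCodecName
  //
  //   val sink: Pipe[IO, Data, Nothing] =
  //     toParquet
  //       .options(ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY))
  //       .write(Path("/data/out.parquet"))
  //
  //   Stream.emits(Seq(Data(1, "a"), Data(2, "b"))).through(sink).compile.drain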

  private case class BuilderImpl[F[_], T](options: ParquetWriter.Options = ParquetWriter.Options())(implicit
      schemaResolver: ParquetSchemaResolver[T],
      encoder: ParquetRecordEncoder[T],
      sync: Sync[F]
  ) extends Builder[F, T] {
    override def options(options: ParquetWriter.Options): Builder[F, T] = this.copy(options = options)
    override def write(path: Path): Pipe[F, T, Nothing]                 = write(path.toOutputFile(options))

    override def write(outputFile: OutputFile): Pipe[F, T, Nothing] = rowParquetRecordPipe[F, T](outputFile, options)
  }

  trait CustomBuilder[F[_], T] {

    /** @param options
      *   writer options
      */
    def options(options: ParquetWriter.Options): CustomBuilder[F, T]

    /** @return
      *   final [[fs2.Pipe]]
      */
    def write: Pipe[F, T, Nothing]
  }
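
  // Sketch of the custom path: any Hadoop `ParquetWriter.Builder` can be plugged in. The
  // Avro builder, `avroSchema` and `outputFile` below are assumptions used purely for
  // illustration, as is the `toParquet` instance of `ToParquet[IO]`.
  //
  //   import org.apache.avro.generic.GenericRecord
  //   import org.apache.parquet.avro.AvroParquetWriter
  //
  //   val avroBuilder = AvroParquetWriter.builder[GenericRecord](outputFile).withSchema(avroSchema)
  //
  //   val sink: Pipe[IO, GenericRecord, Nothing] =
  //     toParquet
  //       .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](avroBuilder)
  //       .options(ParquetWriter.Options())
  //       .write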

  private case class CustomBuilderImpl[F[_]: Sync, T, B <: HadoopParquetWriter.Builder[T, B]](
      builder: B,
      options: ParquetWriter.Options
  ) extends CustomBuilder[F, T] {
    override def options(options: ParquetWriter.Options): CustomBuilder[F, T] =
      this.copy(options = options)

    override def write: Pipe[F, T, Nothing] =
      pipe(options.applyTo[T, B](builder).build())
  }

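  /** Thin wrapper over a Hadoop [[HadoopParquetWriter]]: single writes are suspended as
    * blocking effects, and an input stream is drained chunk by chunk via [[fs2.Pull]].
    */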
  private class Writer[T, F[_]](internalWriter: HadoopParquetWriter[T])(implicit F: Sync[F]) extends AutoCloseable {
    def write(elem: T): F[Unit] =
      F.delay(scala.concurrent.blocking(internalWriter.write(elem)))

    def writePull(chunk: Chunk[T]): Pull[F, Nothing, Unit] =
      Pull.eval(chunk.traverse_(write))

    def writeAll(in: Stream[F, T]): Pull[F, Nothing, Unit] =
      in.pull.uncons.flatMap {
        case Some((chunk, tail)) => writePull(chunk) >> writeAll(tail)
        case None                => Pull.done
      }

    def writeAllStream(in: Stream[F, T]): Stream[F, Nothing] = writeAll(in).stream

    override def close(): Unit =
      try internalWriter.close()
      catch {
        case NonFatal(_) => // swallow non-fatal exceptions thrown on close (works around a known Parquet bug)
      }
  }
  private object Writer {
    def apply[T, F[_]](makeParquetWriter: => HadoopParquetWriter[T])(implicit
        F: Sync[F]
    ): Resource[F, Writer[T, F]] =
      Resource.fromAutoCloseable(
        F.blocking(makeParquetWriter).map(new Writer[T, F](_))
      )
  }

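  /** Builds the end-to-end pipe for typed data: each element is encoded into a
    * [[RowParquetRecord]] (encoding failures are raised within `F`), and the resulting
    * records are written with the internal writer resolved from the Parquet schema of `T`.
    */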
  private def rowParquetRecordPipe[F[_], T: ParquetSchemaResolver](
      outputFile: OutputFile,
      options: ParquetWriter.Options
  )(implicit F: Sync[F], encoder: ParquetRecordEncoder[T]): Pipe[F, T, Nothing] = { in =>
    val valueCodecConfiguration = ValueCodecConfiguration(options)
    in
      .evalMapChunk(entity => F.catchNonFatal(ParquetRecordEncoder.encode[T](entity, valueCodecConfiguration)))
      .through(
        pipe[F, RowParquetRecord](
          ParquetWriter
            .internalWriter(
              file           = outputFile,
              schema         = ParquetSchemaResolver.resolveSchema[T],
              metadataWriter = encoder,
              options        = options
            )
        )
      )
  }

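  /** Wraps writer creation in a [[cats.effect.Resource]] so that the underlying
    * [[HadoopParquetWriter]] is closed when the stream completes or fails.
    */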
  private def pipe[F[_]: Sync, T](makeParquetWriter: => HadoopParquetWriter[T]): Pipe[F, T, Nothing] =
    in =>
      Stream
        .resource(Writer[T, F](makeParquetWriter))
        .flatMap(_.writeAllStream(in))
}