package com.github.mjakubowski84.parquet4s.parquet
import cats.effect.*
import cats.effect.syntax.all.*
import cats.implicits.*
import com.github.mjakubowski84.parquet4s.*
import com.github.mjakubowski84.parquet4s.compat.MapCompat
import com.github.mjakubowski84.parquet4s.parquet.logger.Logger
import fs2.{Pipe, Pull, Stream}
import org.apache.parquet.schema.MessageType
import java.util.UUID
import java.util.concurrent.TimeUnit
import scala.concurrent.duration.FiniteDuration
import fs2.Chunk
import cats.data.NonEmptyList
import scala.util.control.NonFatal
import java.net.URLEncoder
import java.nio.charset.StandardCharsets
import cats.effect.std.Dequeue
import org.apache.parquet.hadoop.ParquetWriter as HadoopParquetWriter
object rotatingWriter {
val DefaultMaxCount: Long = org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
val DefaultMaxDuration: FiniteDuration = FiniteDuration(1, TimeUnit.MINUTES)
val DefaultChunkSize = 16
trait ViaParquet[F[_]] {
/** @tparam T
* schema type
* @return
* Builder of a pipe that processes data of the specified schema
*/
def of[T]: TypedBuilder[F, T, T]
/** @return
* Builder of a pipe that processes generic records
*/
def generic: GenericBuilder[F]
/** @return
* Builder of a pipe that processes data using a custom implementation of ParquetWriter
*/
@experimental
def custom[T, B <: HadoopParquetWriter.Builder[T, B]](writerBuilderFactory: Path => B): CustomBuilder[F, T]
}
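/* Usage sketch (illustrative, not part of the library): building a typed rotating-writer pipe via the
 * package-level `viaParquet[F]` entry point of this module. `Data` and the target path are assumptions
 * made for the sake of the example.
 *
 * {{{
 * import cats.effect.IO
 * import com.github.mjakubowski84.parquet4s.Path
 * import com.github.mjakubowski84.parquet4s.parquet.viaParquet
 * import fs2.{Pipe, Stream}
 * import scala.concurrent.duration.*
 *
 * case class Data(id: Int, text: String)
 *
 * val writePipe: Pipe[IO, Data, Data] =
 *   viaParquet[IO]
 *     .of[Data]                // typed builder for schema type Data
 *     .maxCount(4096)          // rotate a partition file after 4096 records...
 *     .maxDuration(30.seconds) // ...or after 30 seconds, whichever comes first
 *     .write(Path("/tmp/out"))
 *
 * // the pipe writes records to Parquet and re-emits the input elements downstream
 * val out: Stream[IO, Data] = Stream.emits(Seq(Data(1, "a"), Data(2, "b"))).through(writePipe)
 * }}}
 */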
private[parquet4s] class ViaParquetImpl[F[_]: Async] extends ViaParquet[F] {
override def of[T]: TypedBuilder[F, T, T] = TypedBuilderImpl[F, T, T](
chunkSize = DefaultChunkSize,
maxCount = DefaultMaxCount,
maxDuration = DefaultMaxDuration,
preWriteTransformation = t => Stream.emit(t),
partitionByOpt = None,
defaultPartition = PartialFunction.empty[ColumnPath, String],
postWriteHandlerOpt = None,
writeOptions = ParquetWriter.Options()
)
override def generic: GenericBuilder[F] = GenericBuilderImpl(
chunkSize = DefaultChunkSize,
maxCount = DefaultMaxCount,
maxDuration = DefaultMaxDuration,
preWriteTransformation = Stream.emit,
partitionByOpt = None,
defaultPartition = PartialFunction.empty[ColumnPath, String],
postWriteHandlerOpt = None,
writeOptions = ParquetWriter.Options()
)
override def custom[T, B <: HadoopParquetWriter.Builder[T, B]](
writerBuilderFactory: Path => B
): CustomBuilder[F, T] =
new CustomBuilderImpl(
chunkSize = DefaultChunkSize,
maxCount = DefaultMaxCount,
maxDuration = DefaultMaxDuration,
partitioning = (path, t) => (path, t),
postWriteHandlerOpt = None,
writeOptions = ParquetWriter.Options(),
writerBuilderFactory = writerBuilderFactory
)
}
trait Builder[F[_], T, W, Self] {
/** @param maxCount
* max number of records to be written before file rotation
*/
def maxCount(maxCount: Long): Self
/** @param maxDuration
* max time after which partition file is rotated
*/
def maxDuration(maxDuration: FiniteDuration): Self
/** @param writeOptions
* writer options used by the flow
*/
def options(writeOptions: ParquetWriter.Options): Self
/** Adds a handler that is invoked after each chunk of records is written. The handler exposes some of the internal
* state of the flow and is intended for lower-level monitoring and control.
*
* If you wish to have the postWriteHandler invoked after each single element is written, then change the chunk size
* via the `chunkSize` property.
*
* @param postWriteHandler
* an effect called after writing a chunk of records, receiving a snapshot of the internal state of the flow as a
* parameter.
*/
def postWriteHandler(postWriteHandler: PostWriteHandler[F, T]): Self
/** For the sake of better performance, the writer processes data in chunks rather than one by one. The default value is `16`.
* @param chunkSize
* default value override
*/
def chunkSize(chunkSize: Int): Self
}
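/* Configuration sketch for the builder options above (illustrative values; `Data`, the path and the codec
 * choice are assumptions; imports as in the sketch under `ViaParquet` above). Setting `chunkSize` to 1 makes
 * the post-write handler run after every single element, at the cost of throughput.
 *
 * {{{
 * import org.apache.parquet.hadoop.metadata.CompressionCodecName
 *
 * viaParquet[IO]
 *   .of[Data]
 *   .maxCount(1024 * 1024)
 *   .maxDuration(1.minute)
 *   .options(ParquetWriter.Options(compressionCodecName = CompressionCodecName.GZIP))
 *   .chunkSize(1)
 *   .postWriteHandler(state => IO(println(s"modified partitions: ${state.modifiedPartitions}")))
 *   .write(Path("/tmp/out"))
 * }}}
 */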
trait GenericBuilder[F[_]]
extends Builder[F, RowParquetRecord, RowParquetRecord, GenericBuilder[F]]
with ParquetRecordPartitioning[F, RowParquetRecord, GenericBuilder[F]] {
/** @param transformation
* function that is called by the stream in order to transform data into the final write format. Identity by default.
*/
def preWriteTransformation(transformation: RowParquetRecord => Stream[F, RowParquetRecord]): GenericBuilder[F]
/** Builds final writer pipe.
*/
def write(basePath: Path, schema: MessageType): Pipe[F, RowParquetRecord, RowParquetRecord]
}
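/* Sketch of the generic variant (illustrative; assumes a `MessageType` schema and a stream of
 * `RowParquetRecord`s prepared elsewhere; other imports as in the sketch under `ViaParquet` above):
 *
 * {{{
 * import com.github.mjakubowski84.parquet4s.{Col, RowParquetRecord}
 * import org.apache.parquet.schema.MessageType
 *
 * val schema: MessageType                   = ???
 * val records: Stream[IO, RowParquetRecord] = ???
 *
 * records.through(
 *   viaParquet[IO]
 *     .generic
 *     .partitionBy(Col("kind"))
 *     .write(Path("/tmp/out"), schema)
 * )
 * }}}
 */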
trait TypedBuilder[F[_], T, W]
extends Builder[F, T, W, TypedBuilder[F, T, W]]
with ParquetRecordPartitioning[F, W, TypedBuilder[F, T, W]] {
/** @param transformation
* function that is called by the stream in order to transform data into the final write format. Identity by default.
* @tparam X
* Schema type
*/
def preWriteTransformation[X](transformation: T => Stream[F, X]): TypedBuilder[F, T, X]
/** Builds final writer pipe.
*/
def write(basePath: Path)(implicit
schemaResolver: ParquetSchemaResolver[W],
encoder: ParquetRecordEncoder[W]
): Pipe[F, T, T]
}
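/* Sketch of a pre-write transformation on the typed builder: each input element of type `In` is expanded
 * into zero or more `Out` rows that are actually written, while the pipe still emits the original `In`
 * elements downstream. `In` and `Out` are illustrative case classes; imports as in the earlier sketches.
 *
 * {{{
 * case class In(id: Int, tags: List[String])
 * case class Out(id: Int, tag: String)
 *
 * val pipe: Pipe[IO, In, In] =
 *   viaParquet[IO]
 *     .of[In]
 *     .preWriteTransformation(in => Stream.emits(in.tags).map(tag => Out(in.id, tag)))
 *     .write(Path("/tmp/out"))
 * }}}
 */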
trait CustomBuilder[F[_], T] extends Builder[F, T, T, CustomBuilder[F, T]] {
/** @param partitioning
* a function which can be used to manipulate the path that the given object T will be written to, as well as the
* content of the object before it is written
* @return
*/
def partitionUsing(partitioning: (Path, T) => (Path, T)): CustomBuilder[F, T]
/** Builds final writer pipe.
*/
def write(basePath: Path): Pipe[F, T, T]
}
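/* Sketch of the custom variant using parquet-avro's `AvroParquetWriter` as the underlying writer builder
 * (the Avro dependency, `avroSchema` and the partitioning key are assumptions of this example; other
 * imports as in the earlier sketches):
 *
 * {{{
 * import org.apache.avro.generic.GenericRecord
 * import org.apache.parquet.avro.AvroParquetWriter
 *
 * val options = ParquetWriter.Options()
 *
 * viaParquet[IO]
 *   .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](path =>
 *     AvroParquetWriter.builder[GenericRecord](path.toOutputFile(options)).withSchema(avroSchema)
 *   )
 *   .options(options)
 *   .partitionUsing((path, record) => (Path(path, "kind=" + record.get("kind")), record))
 *   .write(Path("/tmp/out"))
 * }}}
 */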
private case class GenericBuilderImpl[F[_]: Async](
chunkSize: Int,
maxCount: Long,
maxDuration: FiniteDuration,
preWriteTransformation: RowParquetRecord => Stream[F, RowParquetRecord],
partitionByOpt: Option[NonEmptyList[ColumnPath]],
defaultPartition: PartialFunction[ColumnPath, String],
postWriteHandlerOpt: Option[PostWriteHandler[F, RowParquetRecord]],
writeOptions: ParquetWriter.Options
) extends GenericBuilder[F] {
override def chunkSize(chunkSize: Int): GenericBuilder[F] = this.copy(chunkSize = chunkSize)
override def maxCount(maxCount: Long): GenericBuilder[F] = copy(maxCount = maxCount)
override def maxDuration(maxDuration: FiniteDuration): GenericBuilder[F] = copy(maxDuration = maxDuration)
override def options(writeOptions: ParquetWriter.Options): GenericBuilder[F] = copy(writeOptions = writeOptions)
override def partitionBy(partitionBy: ColumnPath*): GenericBuilder[F] =
copy(partitionByOpt = NonEmptyList.fromList(partitionBy.toList))
override def defaultPartition(defaultPartition: PartialFunction[ColumnPath, String]): GenericBuilder[F] =
copy(defaultPartition = defaultPartition)
override def preWriteTransformation(
transformation: RowParquetRecord => Stream[F, RowParquetRecord]
): GenericBuilder[F] =
copy(preWriteTransformation = transformation)
override def postWriteHandler(postWriteHandler: PostWriteHandler[F, RowParquetRecord]): GenericBuilder[F] =
copy(postWriteHandlerOpt = Option(postWriteHandler))
override def write(basePath: Path, schema: MessageType): Pipe[F, RowParquetRecord, RowParquetRecord] = {
val encode = (record: RowParquetRecord, _: ValueCodecConfiguration) => Sync[F].pure(record)
implicit val resolver: ParquetSchemaResolver[RowParquetRecord] =
RowParquetRecord.genericParquetSchemaResolver(schema)
val finalSchemaF = Sync[F].catchNonFatal {
ParquetSchemaResolver.resolveSchema[RowParquetRecord](toSkip =
partitionByOpt.map(_.toList).getOrElse(Seq.empty)
)
}
rotatingWriter.write[F, RowParquetRecord, RowParquetRecord, RowParquetRecord](
basePath = basePath,
chunkSize = chunkSize,
maxCount = maxCount,
maxDuration = maxDuration,
prewriteTransformation = preWriteTransformation,
encodeAndPartition = (_: Path, record: RowParquetRecord) => encodeAndPartition(record, basePath, encode),
postWriteHandlerOpt = postWriteHandlerOpt,
options = writeOptions,
createWriter = filePath =>
finalSchemaF >>= { finalSchema =>
Sync[F].delay {
scala.concurrent.blocking {
ParquetWriter.internalWriter(
file = filePath.toOutputFile(writeOptions),
schema = finalSchema,
metadataWriter = MetadataWriter.NoOp,
options = writeOptions
)
}
}
}
)
}
}
private case class TypedBuilderImpl[F[_]: Async, T, W](
chunkSize: Int,
maxCount: Long,
maxDuration: FiniteDuration,
preWriteTransformation: T => Stream[F, W],
partitionByOpt: Option[NonEmptyList[ColumnPath]],
defaultPartition: PartialFunction[ColumnPath, String],
postWriteHandlerOpt: Option[PostWriteHandler[F, T]],
writeOptions: ParquetWriter.Options
) extends TypedBuilder[F, T, W] {
override def chunkSize(chunkSize: Int): TypedBuilder[F, T, W] = this.copy(chunkSize = chunkSize)
override def maxCount(maxCount: Long): TypedBuilder[F, T, W] = copy(maxCount = maxCount)
override def maxDuration(maxDuration: FiniteDuration): TypedBuilder[F, T, W] = copy(maxDuration = maxDuration)
override def options(writeOptions: ParquetWriter.Options): TypedBuilder[F, T, W] = copy(writeOptions = writeOptions)
override def partitionBy(partitionBy: ColumnPath*): TypedBuilder[F, T, W] =
copy(partitionByOpt = NonEmptyList.fromList(partitionBy.toList))
override def defaultPartition(defaultPartition: PartialFunction[ColumnPath, String]): TypedBuilder[F, T, W] =
copy(defaultPartition = defaultPartition)
override def preWriteTransformation[X](transformation: T => Stream[F, X]): TypedBuilder[F, T, X] =
TypedBuilderImpl(
chunkSize = chunkSize,
maxCount = maxCount,
maxDuration = maxDuration,
preWriteTransformation = transformation,
partitionByOpt = partitionByOpt,
defaultPartition = defaultPartition,
writeOptions = writeOptions,
postWriteHandlerOpt = postWriteHandlerOpt
)
override def postWriteHandler(postWriteHandler: PostWriteHandler[F, T]): TypedBuilder[F, T, W] =
copy(postWriteHandlerOpt = Option(postWriteHandler))
override def write(
basePath: Path
)(implicit schemaResolver: ParquetSchemaResolver[W], encoder: ParquetRecordEncoder[W]): Pipe[F, T, T] = {
val schemaF =
Sync[F].catchNonFatal(ParquetSchemaResolver.resolveSchema[W](partitionByOpt.map(_.toList).getOrElse(Seq.empty)))
val encode = (obj: W, vcc: ValueCodecConfiguration) => Sync[F].delay(ParquetRecordEncoder.encode[W](obj, vcc))
rotatingWriter.write[F, T, W, RowParquetRecord](
basePath = basePath,
chunkSize = chunkSize,
maxCount = maxCount,
maxDuration = maxDuration,
prewriteTransformation = preWriteTransformation,
encodeAndPartition = (_: Path, obj: W) => encodeAndPartition(obj, basePath, encode),
postWriteHandlerOpt = postWriteHandlerOpt,
options = writeOptions,
createWriter = filePath =>
schemaF >>= { schema =>
Sync[F].delay {
scala.concurrent.blocking {
ParquetWriter.internalWriter(
file = filePath.toOutputFile(writeOptions),
schema = schema,
metadataWriter = MetadataWriter.NoOp,
options = writeOptions
)
}
}
}
)
}
}
private case class CustomBuilderImpl[F[_]: Async, T, B <: HadoopParquetWriter.Builder[T, B]](
chunkSize: Int,
maxCount: Long,
maxDuration: FiniteDuration,
partitioning: (Path, T) => (Path, T),
postWriteHandlerOpt: Option[PostWriteHandler[F, T]],
writeOptions: ParquetWriter.Options,
writerBuilderFactory: Path => B
) extends CustomBuilder[F, T] {
override def chunkSize(chunkSize: Int): CustomBuilder[F, T] = this.copy(chunkSize = chunkSize)
override def maxCount(maxCount: Long): CustomBuilder[F, T] = copy(maxCount = maxCount)
override def maxDuration(maxDuration: FiniteDuration): CustomBuilder[F, T] = copy(maxDuration = maxDuration)
override def options(writeOptions: ParquetWriter.Options): CustomBuilder[F, T] = copy(writeOptions = writeOptions)
override def partitionUsing(partitioning: (Path, T) => (Path, T)): CustomBuilder[F, T] =
copy(partitioning = partitioning)
override def postWriteHandler(postWriteHandler: PostWriteHandler[F, T]): CustomBuilder[F, T] =
copy(postWriteHandlerOpt = Option(postWriteHandler))
override def write(
basePath: Path
): Pipe[F, T, T] =
rotatingWriter.write[F, T, T, T](
basePath = basePath,
chunkSize = chunkSize,
maxCount = maxCount,
maxDuration = maxDuration,
prewriteTransformation = Stream.emit,
encodeAndPartition = (path: Path, obj: T) => Sync[F].catchNonFatal(partitioning(path, obj)),
postWriteHandlerOpt = postWriteHandlerOpt,
options = writeOptions,
createWriter = filePath =>
Sync[F].delay {
scala.concurrent.blocking {
writeOptions.applyTo[T, B](writerBuilderFactory(filePath)).build()
}
}
)
}
trait ParquetRecordPartitioning[F[_], W, Self] {
/** Sets the partition paths that the stream partitions data by. Can be empty. A partition path can be a simple
* string column (e.g. "color") or a path pointing to a nested string field (e.g. "user.address.postcode"). The
* partition path is used to extract data from the entity and to create a tree of subdirectories for partitioned
* files. Using the aforementioned partitions results in the creation of a tree like the following:
* {{{
* ../color=blue
* /user.address.postcode=XY1234/
* /user.address.postcode=AB4321/
* /color=green
* /user.address.postcode=XY1234/
* /user.address.postcode=CV3344/
* /user.address.postcode=GH6732/
* }}}
* Take note:
*   - `partitionBy` must point to a string field.
*   - Partitioning removes the partition fields from the schema. Their data is stored in the name of the
*     subdirectory instead of in the Parquet file.
*   - Partitioning cannot result in an empty schema. If you remove all fields of the message you will get an error.
*   - Partitioned directories can be filtered effectively during reading.
*
* @param partitionBy
* [[ColumnPath]]s to partition by
*/
def partitionBy(partitionBy: ColumnPath*): Self
/** Allows defining default partition values for optional or nullable columns.
* @param defaultPartition
* a partial function which Parquet4s calls to attempt to resolve a partition value when it encounters a null
* column value
*/
def defaultPartition(defaultPartition: PartialFunction[ColumnPath, String]): Self
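/* Partitioning sketch (illustrative column names; `Col` comes from the core parquet4s API and `User` is an
 * assumed case class with `country: String` and an optional `city`; imports as in the earlier sketches):
 *
 * {{{
 * viaParquet[IO]
 *   .of[User]
 *   .partitionBy(Col("country"), Col("city"))
 *   .defaultPartition { case path if path == Col("city") => "unknown" }
 *   .write(Path("/tmp/users"))
 * }}}
 *
 * Records are then written under paths such as `/tmp/users/country=PL/city=Warsaw/`, and records with a
 * null `city` fall back to `city=unknown`.
 */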
protected def partitionByOpt: Option[NonEmptyList[ColumnPath]]
protected def defaultPartition: PartialFunction[ColumnPath, String]
protected def writeOptions: ParquetWriter.Options
private lazy val vcc: ValueCodecConfiguration = ValueCodecConfiguration(writeOptions)
protected def encodeAndPartition(
valueToWrite: W,
basePath: Path,
encode: (W, ValueCodecConfiguration) => F[RowParquetRecord]
)(implicit F: Sync[F]): F[(Path, RowParquetRecord)] =
encode(valueToWrite, vcc) >>= (record => partition(record = record, basePath = basePath))
private def partition(record: RowParquetRecord, basePath: Path)(implicit F: Sync[F]): F[(Path, RowParquetRecord)] =
partitionByOpt.fold(F.pure(basePath -> record)) { partitionBy =>
modifyPartitionedRecord(record, partitionBy.head).flatMap {
case (firstPartitionPath, firstPartitionValue, modifiedRecord) =>
val builder = new StringBuilder()
builder.append(firstPartitionPath.toString)
builder.append("=")
builder.append(URLEncoder.encode(firstPartitionValue, StandardCharsets.UTF_8.name()))
partitionRec(modifiedRecord, partitionBy.tail, basePath, builder)
}
}
private def partitionRec(
record: RowParquetRecord,
partitionBy: List[ColumnPath],
basePath: Path,
builder: StringBuilder
)(implicit F: Sync[F]): F[(Path, RowParquetRecord)] =
partitionBy match {
case columnPath :: rest =>
modifyPartitionedRecord(record, columnPath).flatMap { case (partitionPath, partitionValue, modifiedRecord) =>
builder.append(Path.Separator)
builder.append(partitionPath.toString)
builder.append("=")
builder.append(URLEncoder.encode(partitionValue, StandardCharsets.UTF_8.name()))
partitionRec(modifiedRecord, rest, basePath, builder)
}
case Nil =>
F.pure(Path(basePath, builder.result()) -> record)
}
private def modifyPartitionedRecord(
record: RowParquetRecord,
partitionColumnPath: ColumnPath
)(implicit F: Sync[F]): F[(ColumnPath, String, RowParquetRecord)] =
record.removed(partitionColumnPath) match {
case (Some(BinaryValue(binary)), modifiedRecord) =>
F.catchNonFatal((partitionColumnPath, binary.toStringUsingUTF8, modifiedRecord))
case (Some(NullValue), modifiedRecord) if defaultPartition.isDefinedAt(partitionColumnPath) =>
F.pure((partitionColumnPath, defaultPartition(partitionColumnPath), modifiedRecord))
case (Some(NullValue), _) =>
F.raiseError(new IllegalArgumentException(s"Field '$partitionColumnPath' is null."))
case (None, _) =>
F.raiseError(new IllegalArgumentException(s"Field '$partitionColumnPath' does not exist."))
case _ =>
F.raiseError(new IllegalArgumentException(s"Non-string field '$partitionColumnPath' used for partitioning."))
}
}
type PostWriteHandler[F[_], T] = PostWriteState[F, T] => F[Unit]
/** Represents the state of the writer after processing `processedData`.
* @param processedData
* the processed chunk of input elements
* @param modifiedPartitions
* state of the partitions that have been written to as an effect of processing the elements of type T. More than
* one partition can be modified due to preWriteTransformation. The map values represent the total number of
* writes to a single file (i.e. the number of writes to the partition since the last rotation).
* @param flush
* flushes all writes to the given partition and rotates the file.
* @tparam F
* effect type
* @tparam T
* type of input data
*/
case class PostWriteState[F[_], T](
processedData: Chunk[T],
modifiedPartitions: Map[Path, Long],
flush: Path => F[Unit]
)
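/* Sketch of a post-write handler that forces rotation of any partition that accumulated more than a given
 * number of writes since the last rotation (the threshold, the effect type and `Data` are illustrative;
 * imports as in the earlier sketches):
 *
 * {{{
 * import cats.implicits.*
 *
 * val handler: PostWriteHandler[IO, Data] = state =>
 *   state.modifiedPartitions.toList.traverse_ { case (partition, writesSinceRotation) =>
 *     if (writesSinceRotation >= 100000L) state.flush(partition) else IO.unit
 *   }
 *
 * viaParquet[IO].of[Data].postWriteHandler(handler).write(Path("/tmp/out"))
 * }}}
 */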
sealed private trait WriterEvent[F[_], T, W]
private case class DataEvent[F[_], T, W](data: Stream[F, W], out: T) extends WriterEvent[F, T, W]
private case class RotateEvent[F[_], T, W](partition: Path) extends WriterEvent[F, T, W]
private case class StopEvent[F[_], T, W]() extends WriterEvent[F, T, W]
private object RecordWriter {
private def newFileName(options: ParquetWriter.Options): String = {
val compressionExtension = options.compressionCodecName.getExtension
UUID.randomUUID().toString + compressionExtension + ".parquet"
}
def apply[F[_], T, W, R](
basePath: Path,
createWriter: Path => F[HadoopParquetWriter[R]],
options: ParquetWriter.Options,
eventDequeue: Dequeue[F, WriterEvent[F, T, W]],
maxDuration: FiniteDuration
)(implicit F: Async[F]): F[RecordWriter[F, R]] =
F.uncancelable { _ =>
for {
internalWriter <- createWriter(basePath.append(newFileName(options)))
rotationFiber <- F.delayBy(eventDequeue.offerFront(RotateEvent[F, T, W](basePath)), maxDuration).start
} yield new RecordWriter(internalWriter, rotationFiber)
}
}
private class RecordWriter[F[_], R](
internalWriter: HadoopParquetWriter[R],
rotationFiber: Fiber[F, Throwable, Unit]
)(implicit F: Async[F]) {
var count: Long = 0
var disposed: Boolean = false
def write(record: R): F[Long] = F.delay(scala.concurrent.blocking {
internalWriter.write(record)
count = count + 1
count
})
def dispose: F[Unit] =
F.uncancelable { _ =>
disposed = true
rotationFiber.cancel >> F.delay(scala.concurrent.blocking(internalWriter.close())).recover { case NonFatal(_) =>
() // ignores bug in Parquet
}
}
}
private class RotatingWriter[F[_], T, W, R](
basePath: Path,
options: ParquetWriter.Options,
chunkSize: Int,
maxCount: Long,
maxDuration: FiniteDuration,
encodeAndPartition: (Path, W) => F[(Path, R)],
eventDequeue: Dequeue[F, WriterEvent[F, T, W]],
logger: Logger[F],
postWriteHandlerOpt: Option[PostWriteHandler[F, T]],
createWriter: Path => F[HadoopParquetWriter[R]],
writersRef: Ref[F, Map[Path, RecordWriter[F, R]]]
)(implicit F: Async[F]) {
private def write(chunk: Chunk[W]): F[Map[Path, Long]] =
chunk.foldM(Map.empty[Path, Long]) { case (map, entity) =>
write(entity).map { case (path, count) =>
map.updated(path, count)
}
}
private def write(entity: W): F[(Path, Long)] =
for {
partitioning <- encodeAndPartition(basePath, entity)
(path, partitionedRecord) = partitioning
count <- writersRef.access.flatMap { case (writers, setter) =>
writers.get(path) match {
// it should never happen that a disposed writer is left in the map, but let's be safe
case Some(writer) if !writer.disposed =>
for {
count <- writer.write(partitionedRecord)
_ <-
if (count >= maxCount) {
writer.dispose >> setter(MapCompat.remove(writers, path)).void
} else {
F.unit
}
} yield count
case _ =>
for {
writer <- RecordWriter[F, T, W, R](path, createWriter, options, eventDequeue, maxDuration)
count <- writer.write(partitionedRecord)
isUpdated <-
if (count >= maxCount) {
// the writer is not supposed to be added to the map but disposed of immediately
F.pure(false)
} else {
setter(writers.updated(path, writer))
}
_ <-
if (isUpdated) {
F.unit
} else {
// updating the ref didn't succeed, or the writer is intended to be disposed of immediately
writer.dispose
}
} yield count
}
}
} yield path -> count
private def disposeAll: F[Unit] =
F.uncancelable { _ =>
for {
writers <- writersRef.getAndSet(Map.empty)
_ <- writers.values.toList.traverse_(_.dispose)
} yield ()
}
private def dispose(partition: Path): F[Unit] =
F.uncancelable { _ =>
for {
removedWriterOpt <- writersRef.modify { writers =>
writers.get(partition) match {
case Some(writer) =>
MapCompat.remove(writers, partition) -> Some(writer)
case None =>
writers -> None
}
}
_ <- removedWriterOpt.traverse_(_.dispose)
} yield ()
}
private def rotatePull(partitions: Iterable[Path]): Pull[F, T, Unit] =
partitions
.map(partition => Pull.eval(logger.debug(s"Rotating $partition")) >> Pull.eval(dispose(partition)))
.reduceOption(_ >> _)
.getOrElse(Pull.done)
private def postWriteHandlerPull(
out: Chunk[T],
partitionsState: Map[Path, Long]
): Pull[F, T, List[Path]] =
postWriteHandlerOpt.fold(Pull.eval[F, List[Path]](F.pure(List.empty)))(handler =>
Pull.eval {
for {
partitionsToFlushRef <- Ref.of[F, List[Path]](List.empty)
state = PostWriteState[F, T](
processedData = out,
modifiedPartitions = partitionsState,
flush = partition => partitionsToFlushRef.update(partitions => partition +: partitions)
)
_ <- handler(state)
partitionsToFlush <- partitionsToFlushRef.get
} yield partitionsToFlush
}
)
private def writeEntitiesAndOutputPull(
entityStream: Stream[F, W],
outChunk: Chunk[T]
): Pull[F, T, Map[Path, Long]] =
writeEntityChunksAndOutputPull(
entityChunksStream = entityStream.chunkN(chunkSize),
outChunk = outChunk,
modifiedPartitions = Map.empty
)
private def writeEntityChunksAndOutputPull(
entityChunksStream: Stream[F, Chunk[W]],
outChunk: Chunk[T],
modifiedPartitions: Map[Path, Long]
): Pull[F, T, Map[Path, Long]] =
entityChunksStream.pull.uncons1.flatMap {
case Some((chunk, tail)) =>
Pull.eval(write(chunk)).flatMap { chunkModifiedPartitions =>
writeEntityChunksAndOutputPull(tail, outChunk, modifiedPartitions ++ chunkModifiedPartitions)
}
case None if postWriteHandlerOpt.isEmpty =>
Pull.output(outChunk) >> Pull.pure(modifiedPartitions)
case None =>
postWriteHandlerPull(outChunk, modifiedPartitions).flatMap {
case Nil =>
Pull.output(outChunk) >> Pull.pure(modifiedPartitions)
case partitionsToRotate =>
rotatePull(partitionsToRotate) >> Pull.output(outChunk) >> Pull.pure(modifiedPartitions)
}
}
sealed private trait Acc
private case class DataAcc(data: Stream[F, W], chunk: Chunk[T], pull: Pull[F, T, Unit]) extends Acc
private case class StopAcc(data: Stream[F, W], chunk: Chunk[T], pull: Pull[F, T, Unit]) extends Acc
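// Folds one chunk of incoming events into an accumulator: data events extend the pending data stream and
// the output chunk, a rotation event flushes what has been accumulated so far and schedules a rotation of
// the given partition, and a stop event ends the accumulation. The resulting pull is evaluated and, unless
// a stop was seen, the remaining event chunks are processed recursively.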
private def writeAllEventsPull(in: Stream[F, Chunk[WriterEvent[F, T, W]]]): Pull[F, T, Unit] =
in.pull.uncons1.flatMap {
case Some((eventChunk, tail)) =>
eventChunk.foldLeft[Acc](DataAcc(Stream.empty, Chunk.empty, Pull.done)) {
case (DataAcc(dataStream, outChunk, pull), DataEvent(data, out)) =>
DataAcc(dataStream ++ data, outChunk.appendK(out), pull)
case (DataAcc(dataStream, outChunk, pull), RotateEvent(partition)) =>
DataAcc(
Stream.empty,
Chunk.empty[T],
pull >> writeEntitiesAndOutputPull(dataStream, outChunk) >> rotatePull(Iterable(partition))
)
case (DataAcc(dataStream, outChunk, pull), StopEvent()) =>
StopAcc(dataStream, outChunk, pull)
case (stop: StopAcc, _) =>
stop
} match {
case StopAcc(_, outChunk, pull) if outChunk.isEmpty =>
pull
case StopAcc(dataStream, outChunk, pull) =>
pull >> writeEntitiesAndOutputPull(dataStream, outChunk) >> Pull.done
case DataAcc(_, outChunk, pull) if outChunk.isEmpty =>
pull >> writeAllEventsPull(tail)
case DataAcc(dataStream, outChunk, pull) =>
pull >> writeEntitiesAndOutputPull(dataStream, outChunk) >> writeAllEventsPull(tail)
}
case None =>
Pull.done
}
def writeAllEvents(in: Stream[F, WriterEvent[F, T, W]]): Stream[F, T] =
writeAllEventsPull(in.chunkLimit(chunkSize)).stream.onFinalize(disposeAll)
}
private def write[F[_], T, W, R](
basePath: Path,
chunkSize: Int,
maxCount: Long,
maxDuration: FiniteDuration,
prewriteTransformation: T => Stream[F, W],
encodeAndPartition: (Path, W) => F[(Path, R)],
postWriteHandlerOpt: Option[PostWriteHandler[F, T]],
options: ParquetWriter.Options,
createWriter: Path => F[HadoopParquetWriter[R]]
)(implicit F: Async[F]): Pipe[F, T, T] =
in =>
for {
logger <- Stream.eval(logger[F](this.getClass))
_ <- Stream.eval(io.validateWritePath[F](basePath, options, logger))
eventDequeue <- Stream.eval(Dequeue.unbounded[F, WriterEvent[F, T, W]])
writersRef <- Stream.eval(Ref.of(Map.empty[Path, RecordWriter[F, R]]))
rotatingWriter <- Stream.emit(
new RotatingWriter[F, T, W, R](
basePath = basePath,
options = options,
chunkSize = chunkSize,
maxCount = maxCount,
maxDuration = maxDuration,
encodeAndPartition = encodeAndPartition,
eventDequeue = eventDequeue,
logger = logger,
postWriteHandlerOpt = postWriteHandlerOpt,
createWriter = createWriter,
writersRef = writersRef
)
)
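// Rotation events (enqueued by the per-writer timer fibers) are merged with the incoming data, which is
// mapped to DataEvents and terminated with a StopEvent, into a single event stream consumed below.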
eventStream = Stream(
Stream.fromQueueUnterminated(eventDequeue, limit = chunkSize),
in
.map { inputElement =>
DataEvent[F, T, W](prewriteTransformation(inputElement), inputElement)
}
.append(Stream.emit(StopEvent[F, T, W]()))
).parJoin(maxOpen = 2)
out <- rotatingWriter.writeAllEvents(eventStream)
} yield out
}