// com.github.mjakubowski84.parquet4s.ParquetPartitioningFlow.scala

package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.stage.*
import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.{Attributes, FlowShape, Inlet, Outlet}
import com.github.mjakubowski84.parquet4s.ParquetPartitioningFlow.PostWriteState
import org.apache.parquet.schema.MessageType
import org.slf4j.{Logger, LoggerFactory}

import java.util.UUID
import java.util.concurrent.TimeUnit
import scala.concurrent.duration.FiniteDuration
import scala.collection.concurrent.TrieMap

object ParquetPartitioningFlow {

  val DefaultMaxCount: Long              = org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
  val DefaultMaxDuration: FiniteDuration = FiniteDuration(1, TimeUnit.MINUTES)
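  // Note (added comment): `DEFAULT_BLOCK_SIZE` in parquet-hadoop is 134217728 (128 MiB), so with the
  // defaults a partition file is rotated after ~134 million records or after one minute,
  // whichever comes first.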

  trait ViaParquet {

    /** @tparam T
      *   schema type
      * @return
      *   builder of flow that processes data of given schema
      */
    def of[T]: TypedBuilder[T, T]

    /** @return
      *   builder of flow that processes generic records
      */
    def generic: GenericBuilder
  }
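
  // Illustrative sketch (not part of the original source): both entry points return a builder
  // that ends with `write`, producing a stage that can be wrapped in a Flow. `User` and the
  // base path are hypothetical; in the public API the builder is typically reached through the
  // module's `viaParquet` entry point rather than through this trait directly.
  //
  //   val typedFlow: GraphStage[FlowShape[User, User]] =
  //     ViaParquetImpl
  //       .of[User]
  //       .maxCount(2048L)
  //       .partitionBy(ColumnPath("country"))
  //       .write(Path("/tmp/users")) // needs implicit ParquetSchemaResolver[User] and ParquetRecordEncoder[User]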

  private[parquet4s] object ViaParquetImpl extends ViaParquet {
    override def of[T]: TypedBuilder[T, T] = TypedBuilderImpl(
      maxCount               = DefaultMaxCount,
      maxDuration            = DefaultMaxDuration,
      preWriteTransformation = t => Iterable(t),
      partitionBy            = Seq.empty,
      writeOptions           = ParquetWriter.Options(),
      postWriteHandler       = None
    )
    override def generic: GenericBuilder = GenericBuilderImpl(
      maxCount               = DefaultMaxCount,
      maxDuration            = DefaultMaxDuration,
      preWriteTransformation = record => Iterable(record),
      partitionBy            = Seq.empty,
      options                = ParquetWriter.Options(),
      postWriteHandler       = None
    )
  }
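
  // Illustrative sketch (not part of the original source): `preWriteTransformation` can fan a
  // single input element out into many written records. `Batch` and `Item` are hypothetical types:
  //
  //   case class Batch(items: Seq[Item])
  //
  //   ViaParquetImpl
  //     .of[Batch]
  //     .preWriteTransformation[Item](_.items)
  //     .write(Path("/tmp/items")) // needs implicit ParquetSchemaResolver[Item] and ParquetRecordEncoder[Item]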

  /** Builds an instance of [[ParquetPartitioningFlow]]
    * @tparam T
    *   Type of message that flow accepts
    * @tparam W
    *   Schema of Parquet file that flow writes
    */
  trait Builder[T, W, Self] {

    /** @param maxCount
      *   max number of records to be written before file rotation
      */
    def maxCount(maxCount: Long): Self

    /** @param maxDuration
      *   max time after which a partition file is rotated
      */
    def maxDuration(maxDuration: FiniteDuration): Self

    /** @param options
      *   writer options used by the flow
      */
    def options(options: ParquetWriter.Options): Self

    /** Sets the partition paths that the flow partitions data by. Can be empty. A partition path can be a simple
      * string column (e.g. "color") or a path pointing to a nested string field (e.g. "user.address.postcode"). The
      * partition path is used to extract data from the entity and to create a tree of subdirectories for partitioned
      * files. Using the aforementioned partitions results in the creation of the following (example) tree:
      * {{{
      * ../color=blue
      *       /user.address.postcode=XY1234/
      *       /user.address.postcode=AB4321/
      *   /color=green
      *       /user.address.postcode=XY1234/
      *       /user.address.postcode=CV3344/
      *       /user.address.postcode=GH6732/
      * }}}
      * Take note:
      *   1. PartitionBy must point to a String field.
      *   2. Partitioning removes partition fields from the schema. Data is stored in the name of the subdirectory
      *      instead of in the Parquet file.
      *   3. Partitioning cannot result in an empty schema. If you remove all fields of the message you will get an
      *      error.
      *   4. Partitioned directories can be filtered effectively during reading.
      *
      * @param partitionBy
      *   [[ColumnPath]]s to partition by
      */
    def partitionBy(partitionBy: ColumnPath*): Self

    /** Adds a handler after record writes, exposing some of the internal state of the flow. Intended for lower-level
      * monitoring and control.
      *
      * Please note that the handler is invoked after each input element is processed and not after each write. This
      * is because preWriteTransformation may produce multiple records for a single input element.
      *
      * @param handler
      *   a function called after writing a record, receiving a snapshot of the internal state of the flow as a
      *   parameter.
      */
    def postWriteHandler(handler: PostWriteState[T] => Unit): Self
  }

  trait GenericBuilder extends Builder[RowParquetRecord, RowParquetRecord, GenericBuilder] {

    /** @param transformation
      *   function that is called by the flow in order to transform a record to the final write format. Identity by
      *   default.
      */
    def preWriteTransformation(transformation: RowParquetRecord => Iterable[RowParquetRecord]): GenericBuilder

    /** Builds the final flow.
      */
    def write(basePath: Path, schema: MessageType): GraphStage[FlowShape[RowParquetRecord, RowParquetRecord]]
  }

  trait TypedBuilder[T, W] extends Builder[T, W, TypedBuilder[T, W]] {

    /** @param transformation
      *   function that is called by the flow in order to transform data to the final write format. Identity by
      *   default.
      * @tparam X
      *   Schema type
      */
    def preWriteTransformation[X](transformation: T => Iterable[X]): TypedBuilder[T, X]

    /** Builds the final flow.
      */
    def write(
        basePath: Path
    )(implicit schemaResolver: ParquetSchemaResolver[W], encoder: ParquetRecordEncoder[W]): GraphStage[FlowShape[T, T]]
  }

  private case class GenericBuilderImpl(
      maxCount: Long,
      maxDuration: FiniteDuration,
      preWriteTransformation: RowParquetRecord => Iterable[RowParquetRecord],
      partitionBy: Seq[ColumnPath],
      options: ParquetWriter.Options,
      postWriteHandler: Option[PostWriteState[RowParquetRecord] => Unit]
  ) extends GenericBuilder {

    override def maxCount(maxCount: Long): GenericBuilder                 = copy(maxCount = maxCount)
    override def maxDuration(maxDuration: FiniteDuration): GenericBuilder = copy(maxDuration = maxDuration)
    override def options(options: ParquetWriter.Options): GenericBuilder  = copy(options = options)
    override def partitionBy(partitionBy: ColumnPath*): GenericBuilder    = copy(partitionBy = partitionBy)

    override def preWriteTransformation(
        transformation: RowParquetRecord => Iterable[RowParquetRecord]
    ): GenericBuilder = copy(preWriteTransformation = transformation)

    override def postWriteHandler(handler: PostWriteState[RowParquetRecord] => Unit): GenericBuilder =
      copy(postWriteHandler = Some(handler))

    override def write(
        basePath: Path,
        schema: MessageType
    ): GraphStage[FlowShape[RowParquetRecord, RowParquetRecord]] = {
      implicit val resolver: ParquetSchemaResolver[RowParquetRecord] =
        RowParquetRecord.genericParquetSchemaResolver(schema)
      val finalSchema = ParquetSchemaResolver.resolveSchema[RowParquetRecord](toSkip = partitionBy)
      val encode      = (record: RowParquetRecord, _: ValueCodecConfiguration) => record
      new ParquetPartitioningFlow[RowParquetRecord, RowParquetRecord](
        basePath,
        maxCount,
        maxDuration,
        preWriteTransformation,
        partitionBy,
        encode,
        finalSchema,
        options,
        postWriteHandler
      )
    }
  }
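
  // Illustrative sketch (not part of the original source): the generic builder takes an explicit
  // Parquet `MessageType` instead of deriving one. The schema below is a hypothetical example
  // built with the standard parquet-mr `Types` API:
  //
  //   import org.apache.parquet.schema.{LogicalTypeAnnotation, Types}
  //   import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
  //
  //   val schema: MessageType = Types
  //     .buildMessage()
  //     .optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("color")
  //     .optional(PrimitiveTypeName.INT64).named("count")
  //     .named("example")
  //
  //   val genericFlow: GraphStage[FlowShape[RowParquetRecord, RowParquetRecord]] =
  //     ViaParquetImpl.generic
  //       .partitionBy(ColumnPath("color"))
  //       .write(Path("/tmp/records"), schema)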
  private case class TypedBuilderImpl[T, W](
      maxCount: Long,
      maxDuration: FiniteDuration,
      preWriteTransformation: T => Iterable[W],
      partitionBy: Seq[ColumnPath],
      writeOptions: ParquetWriter.Options,
      postWriteHandler: Option[PostWriteState[T] => Unit]
  ) extends TypedBuilder[T, W] {

    override def maxCount(maxCount: Long): TypedBuilder[T, W]                 = copy(maxCount = maxCount)
    override def maxDuration(maxDuration: FiniteDuration): TypedBuilder[T, W] = copy(maxDuration = maxDuration)
    override def options(options: ParquetWriter.Options): TypedBuilder[T, W]  = copy(writeOptions = options)
    override def partitionBy(partitionBy: ColumnPath*): TypedBuilder[T, W]    = copy(partitionBy = partitionBy)

    override def preWriteTransformation[X](transformation: T => Iterable[X]): TypedBuilder[T, X] =
      TypedBuilderImpl(
        maxCount,
        maxDuration,
        transformation,
        partitionBy,
        writeOptions,
        postWriteHandler
      )

    override def postWriteHandler(handler: PostWriteState[T] => Unit): TypedBuilder[T, W] =
      copy(postWriteHandler = Some(handler))

    override def write(basePath: Path)(implicit
        schemaResolver: ParquetSchemaResolver[W],
        encoder: ParquetRecordEncoder[W]
    ): GraphStage[FlowShape[T, T]] = {
      val schema = ParquetSchemaResolver.resolveSchema[W](toSkip = partitionBy)
      val encode = (obj: W, vcc: ValueCodecConfiguration) => ParquetRecordEncoder.encode(obj, vcc)
      new ParquetPartitioningFlow[T, W](
        basePath,
        maxCount,
        maxDuration,
        preWriteTransformation,
        partitionBy,
        encode,
        schema,
        writeOptions,
        postWriteHandler
      )
    }
  }

  /** Represents the state of the writer after processing `processedData`.
    * @param processedData
    *   Processed input element
    * @param modifiedPartitions
    *   State of the partitions that have been written to as an effect of processing the element T. More than one
    *   partition can be modified due to preWriteTransformation. The map values represent the total number of writes
    *   to a single file (the number of writes to the partition since the last rotation).
    * @param flush
    *   Flushes all writes to the given partition and rotates the file.
    * @tparam T
    *   type of input data
    */
  case class PostWriteState[T](processedData: T, modifiedPartitions: Map[Path, Long], flush: Path => Unit)

}
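
// Illustrative sketch (not part of the original source): a postWriteHandler can inspect the
// per-partition write counts and force an early rotation via `flush`. `builder` and the 10000
// threshold are hypothetical:
//
//   builder.postWriteHandler { state =>
//     state.modifiedPartitions.foreach { case (path, count) =>
//       if (count >= 10000L) state.flush(path)
//     }
//   }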
private class ParquetPartitioningFlow[T, W](
    basePath: Path,
    maxCount: Long,
    maxDuration: FiniteDuration,
    preWriteTransformation: T => Iterable[W],
    partitionBy: Seq[ColumnPath],
    encode: (W, ValueCodecConfiguration) => RowParquetRecord,
    schema: MessageType,
    writeOptions: ParquetWriter.Options,
    postWriteHandler: Option[PostWriteState[T] => Unit]
) extends GraphStage[FlowShape[T, T]]
    with IOOps {

  val in: Inlet[T]            = Inlet[T]("ParquetPartitioningFlow.in")
  val out: Outlet[T]          = Outlet[T]("ParquetPartitioningFlow.out")
  val shape: FlowShape[T, T]  = FlowShape.of(in, out)
  override val logger: Logger = LoggerFactory.getLogger("ParquetPartitioningFlow")
  private val vcc             = ValueCodecConfiguration(writeOptions)

  private class Logic extends TimerGraphStageLogic(shape) with InHandler with OutHandler {
    case class WriterState(
        writer: ParquetWriter.InternalWriter,
        var written: Long
    )

    private val writers = TrieMap.empty[Path, WriterState]

    setHandlers(in, out, this)

    private val compressionExtension: String = writeOptions.compressionCodecName.getExtension
    private def newFileName: String          = UUID.randomUUID().toString + compressionExtension + ".parquet"

    private def partition(record: RowParquetRecord): (Path, RowParquetRecord) =
      partitionBy.foldLeft(basePath -> record) { case ((currentPath, currentRecord), partitionPath) =>
        currentRecord.removed(partitionPath) match {
          case (Some(BinaryValue(binary)), modifiedRecord) =>
            Path(currentPath, s"$partitionPath=${binary.toStringUsingUTF8}") -> modifiedRecord
          case (None, _) =>
            throw new IllegalArgumentException(s"Field '$partitionPath' does not exist.")
          case (Some(NullValue), _) =>
            throw new IllegalArgumentException(s"Field '$partitionPath' is null.")
          case _ =>
            throw new IllegalArgumentException(s"Non-string field '$partitionPath' used for partitioning.")
        }
      }

    private def scheduleNextRotation(path: Path, delay: FiniteDuration): Unit =
      scheduleOnce(timerKey = path, delay = delay)

    private def write(msg: T): Map[Path, Long] = {
      val valuesToWrite              = preWriteTransformation(msg)
      val records                    = valuesToWrite.map(value => encode(value, vcc))
      val pathsAndPartitionedRecords = records.map(partition)

      pathsAndPartitionedRecords.foldLeft(Map.empty[Path, Long]) {
        case (modifiedPartitions, (writerPath, partitionedRecord)) =>
          val state = writers.getOrElseUpdate(
            writerPath, {
              logger.debug("Creating writer to write to [{}]", writerPath)
              val writer = ParquetWriter.internalWriter(
                file           = Path(writerPath, newFileName).toOutputFile(writeOptions),
                schema         = schema,
                metadataWriter = MetadataWriter.NoOp,
                options        = writeOptions
              )
              val state = WriterState(
                writer  = writer,
                written = 0L
              )
              scheduleNextRotation(writerPath, maxDuration)
              state
            }
          )
          state.writer.write(partitionedRecord)
          state.written += 1
          modifiedPartitions.updated(writerPath, state.written)
      }
    }

    private def close(path: Path): Unit =
      writers.remove(path) match {
        case Some(writerState) =>
          cancelTimer(path)
          try writerState.writer.close()
          catch {
            case _: NullPointerException => // ignores bug in Parquet
          }
        case None =>
          logger.debug("Trying to close a writer for a path [{}], no state was found", path)
      }

    override def onTimer(timerKey: Any): Unit =
      timerKey match {
        case path: Path => close(path)
        case _          => // ignore
      }

    override def onPush(): Unit = {
      val msg                = grab(in)
      val modifiedPartitions = write(msg)
      postWriteHandler.foreach(
        _.apply(
          PostWriteState(
            processedData      = msg,
            modifiedPartitions = modifiedPartitions,
            flush              = partition => close(partition)
          )
        )
      )
      modifiedPartitions.foreach {
        case (path, count) if count >= maxCount => close(path)
        case _                                  => // ignore
      }
      push(out, msg)
    }

    override def onPull(): Unit =
      if (!isClosed(in) && !hasBeenPulled(in)) {
        pull(in)
      }

    override def preStart(): Unit =
      validateWritePath(path = basePath, writeOptions = writeOptions)

    override def postStop(): Unit = {
      writers.keySet.foreach { path =>
        close(path)
      }
      super.postStop()
    }
  }

  override def createLogic(inheritedAttributes: Attributes): GraphStageLogic = new Logic()
}
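
// Illustrative sketch (not part of the original source): the stage is materialized like any other
// GraphStage, assuming the Akka or Pekko `scaladsl` re-exported by ScalaCompat; `stage` and
// `elements` are hypothetical:
//
//   val flow: Flow[T, T, NotUsed] = Flow.fromGraph(stage)
//   Source(elements).via(flow).runWith(Sink.ignore)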



