package com.github.mjakubowski84.parquet4s
import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.stage.*
import com.github.mjakubowski84.parquet4s.ScalaCompat.stream.{Attributes, FlowShape, Inlet, Outlet}
import com.github.mjakubowski84.parquet4s.ParquetPartitioningFlow.PostWriteState
import org.apache.parquet.schema.MessageType
import org.slf4j.{Logger, LoggerFactory}
import java.util.UUID
import java.util.concurrent.TimeUnit
import scala.concurrent.duration.FiniteDuration
import scala.collection.concurrent.TrieMap
object ParquetPartitioningFlow {
val DefaultMaxCount: Long = org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE
val DefaultMaxDuration: FiniteDuration = FiniteDuration(1, TimeUnit.MINUTES)
trait ViaParquet {
/** @tparam T
* schema type
* @return
* a builder of a flow that processes data of the given schema
*/
def of[T]: TypedBuilder[T, T]
/** @return
* a builder of a flow that processes generic records
*/
def generic: GenericBuilder
}
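// A minimal usage sketch. Assumption: the flow is obtained through the streams module entry
// point (e.g. ParquetStreams.viaParquet) and User is a hypothetical case class; neither is
// defined in this file. Requires scala.concurrent.duration.* for the duration syntax.
//
//   ParquetStreams.viaParquet
//     .of[User]                             // typed builder deriving the schema from User
//     .maxCount(1024L)                      // rotate a partition file after 1024 writes...
//     .maxDuration(30.seconds)              // ...or after 30 seconds, whichever comes first
//     .partitionBy(ColumnPath("country"))   // partition output by the string field "country"
//     .write(Path("/data/users"))           // yields the GraphStage built by this file,
//                                           // typically wrapped with Flow.fromGraph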
private[parquet4s] object ViaParquetImpl extends ViaParquet {
override def of[T]: TypedBuilder[T, T] = TypedBuilderImpl(
maxCount = DefaultMaxCount,
maxDuration = DefaultMaxDuration,
preWriteTransformation = t => Iterable(t),
partitionBy = Seq.empty,
writeOptions = ParquetWriter.Options(),
postWriteHandler = None
)
override def generic: GenericBuilder = GenericBuilderImpl(
maxCount = DefaultMaxCount,
maxDuration = DefaultMaxDuration,
preWriteTransformation = record => Iterable(record),
partitionBy = Seq.empty,
options = ParquetWriter.Options(),
postWriteHandler = None
)
}
/** Builds an instance of [[ParquetPartitioningFlow]].
* @tparam T
* Type of message that the flow accepts
* @tparam W
* Schema of the Parquet file that the flow writes
*/
trait Builder[T, W, Self] {
/** @param maxCount
* max number of records to be written before file rotation
*/
def maxCount(maxCount: Long): Self
/** @param maxDuration
* max time after which a partition file is rotated
*/
def maxDuration(maxDuration: FiniteDuration): Self
/** @param options
* writer options used by the flow
*/
def options(options: ParquetWriter.Options): Self
/** Sets the partition paths that the flow partitions data by. Can be empty. A partition path can be a simple string
* column (e.g. "color") or a path pointing to a nested string field (e.g. "user.address.postcode"). The partition
* path is used to extract data from the entity and to create a tree of subdirectories for the partitioned files.
* Using the aforementioned partitions results in the creation of a directory tree like the following:
* {{{
* ../color=blue
* /user.address.postcode=XY1234/
* /user.address.postcode=AB4321/
* /color=green
* /user.address.postcode=XY1234/
* /user.address.postcode=CV3344/
* /user.address.postcode=GH6732/
* }}}
* Take note:
*  - partitionBy must point to a string field.
*  - Partitioning removes the partition fields from the schema; the data is stored in the name of the subdirectory
*    instead of in the Parquet file.
*  - Partitioning must not leave the schema empty. If you remove all fields of the message, you will get an error.
*  - Partitioned directories can be filtered effectively during reading.
*
* @param partitionBy
* [[ColumnPath]]s to partition by
*/
def partitionBy(partitionBy: ColumnPath*): Self
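// A sketch matching the directory tree above (the receiver `builder` and the column names are
// illustrative):
//   builder.partitionBy(ColumnPath("color"), ColumnPath("user.address.postcode"))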
/** Adds a handler invoked after records are written, exposing some of the internal state of the flow. Intended for
* lower-level monitoring and control.
*
* Please note that the handler is invoked after each input element is processed, not after each write. This is
* because the pre-write transformation may produce multiple records for a single input element.
*
* @param handler
* a function called after writing a record, receiving a snapshot of the internal state of the flow as a
* parameter.
*/
def postWriteHandler(handler: PostWriteState[T] => Unit): Self
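// A sketch of a handler that forces rotation of a partition once it has accumulated a given
// number of writes (the receiver `builder` and the threshold are illustrative):
//   builder.postWriteHandler { state =>
//     state.modifiedPartitions.foreach { case (partitionPath, writesSinceRotation) =>
//       if (writesSinceRotation >= 10000L) state.flush(partitionPath)
//     }
//   }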
}
trait GenericBuilder extends Builder[RowParquetRecord, RowParquetRecord, GenericBuilder] {
/** @param transformation
* a function called by the flow to transform a record into the final write format. Identity by default.
*/
def preWriteTransformation(transformation: RowParquetRecord => Iterable[RowParquetRecord]): GenericBuilder
/** Builds the final flow.
*/
def write(basePath: Path, schema: MessageType): GraphStage[FlowShape[RowParquetRecord, RowParquetRecord]]
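// A sketch of building the explicit schema required by the generic variant, using the schema
// builder shipped with Apache Parquet (the column name and output path are illustrative):
//   import org.apache.parquet.schema.{LogicalTypeAnnotation, Types}
//   import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
//   val schema: MessageType = Types.buildMessage()
//     .required(BINARY).as(LogicalTypeAnnotation.stringType()).named("color")
//     .named("record")
//   genericBuilder.write(Path("/data/records"), schema)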
}
trait TypedBuilder[T, W] extends Builder[T, W, TypedBuilder[T, W]] {
/** @param transformation
* a function called by the flow to transform data into the final write format. Identity by default.
* @tparam X
* Schema type
*/
def preWriteTransformation[X](transformation: T => Iterable[X]): TypedBuilder[T, X]
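// A sketch of a transformation that writes one row per line of a hypothetical input message
// (Order and OrderLine are illustrative types, assuming order.lines is an Iterable[OrderLine]):
//   val lineBuilder: TypedBuilder[Order, OrderLine] =
//     builder.preWriteTransformation[OrderLine](order => order.lines)
// The resulting flow writes OrderLine rows while still accepting and emitting Order elements.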
/** Builds the final flow.
*/
def write(
basePath: Path
)(implicit schemaResolver: ParquetSchemaResolver[W], encoder: ParquetRecordEncoder[W]): GraphStage[FlowShape[T, T]]
}
private case class GenericBuilderImpl(
maxCount: Long,
maxDuration: FiniteDuration,
preWriteTransformation: RowParquetRecord => Iterable[RowParquetRecord],
partitionBy: Seq[ColumnPath],
options: ParquetWriter.Options,
postWriteHandler: Option[PostWriteState[RowParquetRecord] => Unit]
) extends GenericBuilder {
override def maxCount(maxCount: Long): GenericBuilder = copy(maxCount = maxCount)
override def maxDuration(maxDuration: FiniteDuration): GenericBuilder = copy(maxDuration = maxDuration)
override def options(options: ParquetWriter.Options): GenericBuilder = copy(options = options)
override def partitionBy(partitionBy: ColumnPath*): GenericBuilder = copy(partitionBy = partitionBy)
override def preWriteTransformation(
transformation: RowParquetRecord => Iterable[RowParquetRecord]
): GenericBuilder =
copy(preWriteTransformation = transformation)
override def postWriteHandler(handler: PostWriteState[RowParquetRecord] => Unit): GenericBuilder =
copy(postWriteHandler = Some(handler))
override def write(
basePath: Path,
schema: MessageType
): GraphStage[FlowShape[RowParquetRecord, RowParquetRecord]] = {
implicit val resolver: ParquetSchemaResolver[RowParquetRecord] =
RowParquetRecord.genericParquetSchemaResolver(schema)
val finalSchema = ParquetSchemaResolver.resolveSchema[RowParquetRecord](toSkip = partitionBy)
val encode = (record: RowParquetRecord, _: ValueCodecConfiguration) => record
new ParquetPartitioningFlow[RowParquetRecord, RowParquetRecord](
basePath,
maxCount,
maxDuration,
preWriteTransformation,
partitionBy,
encode,
finalSchema,
options,
postWriteHandler
)
}
}
private case class TypedBuilderImpl[T, W](
maxCount: Long,
maxDuration: FiniteDuration,
preWriteTransformation: T => Iterable[W],
partitionBy: Seq[ColumnPath],
writeOptions: ParquetWriter.Options,
postWriteHandler: Option[PostWriteState[T] => Unit]
) extends TypedBuilder[T, W] {
override def maxCount(maxCount: Long): TypedBuilder[T, W] = copy(maxCount = maxCount)
override def maxDuration(maxDuration: FiniteDuration): TypedBuilder[T, W] = copy(maxDuration = maxDuration)
override def options(options: ParquetWriter.Options): TypedBuilder[T, W] = copy(writeOptions = options)
override def partitionBy(partitionBy: ColumnPath*): TypedBuilder[T, W] = copy(partitionBy = partitionBy)
override def preWriteTransformation[X](transformation: T => Iterable[X]): TypedBuilder[T, X] =
TypedBuilderImpl(
maxCount,
maxDuration,
transformation,
partitionBy,
writeOptions,
postWriteHandler
)
override def postWriteHandler(handler: PostWriteState[T] => Unit): TypedBuilder[T, W] =
copy(postWriteHandler = Some(handler))
override def write(basePath: Path)(implicit
schemaResolver: ParquetSchemaResolver[W],
encoder: ParquetRecordEncoder[W]
): GraphStage[FlowShape[T, T]] = {
val schema = ParquetSchemaResolver.resolveSchema[W](toSkip = partitionBy)
val encode = (obj: W, vcc: ValueCodecConfiguration) => ParquetRecordEncoder.encode(obj, vcc)
new ParquetPartitioningFlow[T, W](
basePath,
maxCount,
maxDuration,
preWriteTransformation,
partitionBy,
encode,
schema,
writeOptions,
postWriteHandler
)
}
}
/** Represents the state of the writer after processing `processedData`.
* @param processedData
* Processed input element
* @param modifiedPartitions
* State of the partitions that have been written as a result of processing the input element. More than one
* partition can be modified due to preWriteTransformation. The map values represent the total number of writes
* to a single file (that is, the number of writes to the partition since the last rotation).
* @param flush
* Flushes all writes to the given partition and rotates the file.
* @tparam T
* type of input data
*/
case class PostWriteState[T](processedData: T, modifiedPartitions: Map[Path, Long], flush: Path => Unit)
}
private class ParquetPartitioningFlow[T, W](
basePath: Path,
maxCount: Long,
maxDuration: FiniteDuration,
preWriteTransformation: T => Iterable[W],
partitionBy: Seq[ColumnPath],
encode: (W, ValueCodecConfiguration) => RowParquetRecord,
schema: MessageType,
writeOptions: ParquetWriter.Options,
postWriteHandler: Option[PostWriteState[T] => Unit]
) extends GraphStage[FlowShape[T, T]]
with IOOps {
val in: Inlet[T] = Inlet[T]("ParquetPartitioningFlow.in")
val out: Outlet[T] = Outlet[T]("ParquetPartitioningFlow.out")
val shape: FlowShape[T, T] = FlowShape.of(in, out)
override val logger: Logger = LoggerFactory.getLogger("ParquetPartitioningFlow")
private val vcc = ValueCodecConfiguration(writeOptions)
private class Logic extends TimerGraphStageLogic(shape) with InHandler with OutHandler {
case class WriterState(
writer: ParquetWriter.InternalWriter,
var written: Long
)
private val writers = TrieMap.empty[Path, WriterState]
setHandlers(in, out, this)
private val compressionExtension: String = writeOptions.compressionCodecName.getExtension
private def newFileName: String = UUID.randomUUID().toString + compressionExtension + ".parquet"
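// Derives the target directory for a record: each configured partition field is removed from
// the record and appended to the path as a "name=value" segment; only non-null string (binary)
// fields are accepted as partition values.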
private def partition(record: RowParquetRecord): (Path, RowParquetRecord) =
partitionBy.foldLeft(basePath -> record) { case ((currentPath, currentRecord), partitionPath) =>
currentRecord.removed(partitionPath) match {
case (Some(BinaryValue(binary)), modifiedRecord) =>
Path(currentPath, s"$partitionPath=${binary.toStringUsingUTF8}") -> modifiedRecord
case (None, _) =>
throw new IllegalArgumentException(s"Field '$partitionPath' does not exist.")
case (Some(NullValue), _) =>
throw new IllegalArgumentException(s"Field '$partitionPath' is null.")
case _ =>
throw new IllegalArgumentException(s"Non-string field '$partitionPath' used for partitioning.")
}
}
private def scheduleNextRotation(path: Path, delay: FiniteDuration): Unit =
scheduleOnce(timerKey = path, delay = delay)
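// Applies the pre-write transformation, encodes the resulting values and appends each record to
// the writer of its partition, creating writers and scheduling their rotation on demand. Returns
// the number of writes per modified partition since its last rotation.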
private def write(msg: T): Map[Path, Long] = {
val valuesToWrite = preWriteTransformation(msg)
val records = valuesToWrite.map(value => encode(value, vcc))
val pathsAndPartitionedRecords = records.map(partition)
pathsAndPartitionedRecords.foldLeft(Map.empty[Path, Long]) {
case (modifiedPartitions, (writerPath, partitionedRecord)) =>
val state = writers.getOrElseUpdate(
writerPath, {
logger.debug("Creating writer to write to [{}]", writerPath)
val writer = ParquetWriter.internalWriter(
file = Path(writerPath, newFileName).toOutputFile(writeOptions),
schema = schema,
metadataWriter = MetadataWriter.NoOp,
options = writeOptions
)
val state = WriterState(
writer = writer,
written = 0L
)
scheduleNextRotation(writerPath, maxDuration)
state
}
)
state.writer.write(partitionedRecord)
state.written += 1
modifiedPartitions.updated(writerPath, state.written)
}
}
private def close(path: Path): Unit =
writers.remove(path) match {
case Some(writerState) =>
cancelTimer(path)
try writerState.writer.close()
catch {
case _: NullPointerException => // ignores bug in Parquet
}
case None =>
logger.debug("Trying to close a writer for a path [{}], no state was found", path)
}
override def onTimer(timerKey: Any): Unit =
timerKey match {
case path: Path => close(path)
case _ => // ignore
}
override def onPush(): Unit = {
val msg = grab(in)
val modifiedPartitions = write(msg)
postWriteHandler.foreach(
_.apply(
PostWriteState(
processedData = msg,
modifiedPartitions = modifiedPartitions,
flush = partition => close(partition)
)
)
)
modifiedPartitions.foreach {
case (path, count) if count >= maxCount =>
close(path)
case _ => // ignore
}
push(out, msg)
}
override def onPull(): Unit =
if (!isClosed(in) && !hasBeenPulled(in)) {
pull(in)
}
override def preStart(): Unit =
validateWritePath(path = basePath, writeOptions = writeOptions)
override def postStop(): Unit = {
writers.keySet.foreach { path =>
close(path)
}
super.postStop()
}
}
override def createLogic(inheritedAttributes: Attributes): GraphStageLogic = new Logic()
}