package com.github.mjakubowski84.parquet4s

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.{FinalizedWriteContext, WriteContext}
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter as HadoopParquetWriter}
import org.apache.parquet.io.OutputFile
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.MessageType
import org.slf4j.LoggerFactory

import java.io.Closeable
import java.util.TimeZone
import scala.annotation.implicitNotFound
import scala.jdk.CollectionConverters.*
import scala.util.Using
import scala.util.control.NonFatal

/** Type class that allows writing data whose schema is represented by type T. Path and options are meant to be set
  * by the implementation of the trait.
  * @tparam T
  *   schema of data to write
  */
trait ParquetWriter[T] extends Closeable {

  /** Appends a chunk of data to the file contents.
    * @param data
    *   data to write
    */
  def write(data: Iterable[T]): Unit

  /** Appends a chunk of data to the file contents.
    * @param data
    *   data to write
    */
  def write(data: T*): Unit

}
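
// A minimal usage sketch of this trait: data can be appended in several chunks before the writer is
// closed. The case class `Data` and the target path are hypothetical; `ParquetWriter.of[T]` and `Path`
// are defined by this library.
//
//   case class Data(id: Int, name: String)
//
//   val writer = ParquetWriter.of[Data].build(Path("data.parquet"))
//   try {
//     writer.write(Data(1, "a"), Data(2, "b")) // varargs overload
//     writer.write(Seq(Data(3, "c")))          // Iterable overload
//   } finally writer.close()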

object ParquetWriter {

  private[parquet4s] type InternalWriter = HadoopParquetWriter[RowParquetRecord]

  @implicitNotFound(
    "Cannot write data of type ${T}. " +
      "Please check if there are implicit ValueEncoder and TypedSchemaDef available for each field and subfield of ${T}."
  )
  @deprecated("2.0.0", "Use builder api by calling 'of[T]' or 'generic'")
  type ParquetWriterFactory[T] = (Path, Options) => ParquetWriter[T]

  private val SignatureMetadata = Map("MadeBy" -> "https://github.com/mjakubowski84/parquet4s")

  private class InternalBuilder(file: OutputFile, schema: MessageType, extraMetadata: MetadataWriter)
      extends HadoopParquetWriter.Builder[RowParquetRecord, InternalBuilder](file) {
    private val logger = LoggerFactory.getLogger(ParquetWriter.this.getClass)

    if (logger.isDebugEnabled) {
      logger.debug(s"""Resolved following schema to write Parquet to "$file":\n$schema""")
    }

    override def self(): InternalBuilder = this

    override def getWriteSupport(conf: Configuration): WriteSupport[RowParquetRecord] =
      new ParquetWriteSupport(schema, SignatureMetadata, extraMetadata)
  }

  /** Configuration of parquet writer. Please have a look at the documentation of Parquet to understand what every
    * configuration entry is responsible for.
    *
    * NOTE! Please be careful when using OVERWRITE mode. All data at the given path (either file or directory) are
    * deleted before writing in the OVERWRITE mode.
    *
    * Apart from options specific to the Parquet file format there are some others:
    * @param hadoopConf
    *   can be used to programmatically set Hadoop's [[org.apache.hadoop.conf.Configuration]]
    * @param timeZone
    *   used when encoding time-based data, local machine's time zone is used by default
    */
  case class Options(
      writeMode: ParquetFileWriter.Mode = ParquetFileWriter.Mode.CREATE,
      compressionCodecName: CompressionCodecName = HadoopParquetWriter.DEFAULT_COMPRESSION_CODEC_NAME,
      dictionaryEncodingEnabled: Boolean = HadoopParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      dictionaryPageSize: Int = HadoopParquetWriter.DEFAULT_PAGE_SIZE,
      maxPaddingSize: Int = HadoopParquetWriter.MAX_PADDING_SIZE_DEFAULT,
      pageSize: Int = HadoopParquetWriter.DEFAULT_PAGE_SIZE,
      rowGroupSize: Long = HadoopParquetWriter.DEFAULT_BLOCK_SIZE,
      validationEnabled: Boolean = HadoopParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      hadoopConf: Configuration = new Configuration(),
      timeZone: TimeZone = TimeZone.getDefault
  ) {
    private[parquet4s] def applyTo[T, B <: HadoopParquetWriter.Builder[T, B]](builder: B): B =
      builder
        .withWriteMode(writeMode)
        .withCompressionCodec(compressionCodecName)
        .withDictionaryEncoding(dictionaryEncodingEnabled)
        .withDictionaryPageSize(dictionaryPageSize)
        .withMaxPaddingSize(maxPaddingSize)
        .withPageSize(pageSize)
        .withRowGroupSize(rowGroupSize)
        .withValidation(validationEnabled)
        .withConf(hadoopConf)
  }

  /** Builder of [[ParquetWriter]].
    * @tparam T
    *   type of documents to write
    */
  trait Builder[T] {

    /** Configuration of writer, see [[ParquetWriter.Options]]
      */
    def options(options: Options): Builder[T]

    /** Builds a writer for writing the output file
      */
    @experimental
    def build(file: OutputFile): ParquetWriter[T]

    def build(path: Path): ParquetWriter[T]

    /** Writes an iterable collection of data as a Parquet output file.
      */
    @experimental
    def writeAndClose(file: OutputFile, data: Iterable[T]): Unit

    /** Writes an iterable collection of data as a Parquet file at the given path.
      */
    def writeAndClose(path: Path, data: Iterable[T]): Unit
  }
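
  // A sketch of combining custom Options with the builder API. The case class `Data` and the output
  // path are hypothetical; Options fields, ParquetFileWriter.Mode and CompressionCodecName are the
  // types referenced above.
  //
  //   case class Data(id: Long, text: String)
  //
  //   val options = ParquetWriter.Options(
  //     writeMode            = ParquetFileWriter.Mode.OVERWRITE, // replaces any data already at the path
  //     compressionCodecName = CompressionCodecName.GZIP,
  //     rowGroupSize         = 64 * 1024 * 1024
  //   )
  //
  //   ParquetWriter
  //     .of[Data]
  //     .options(options)
  //     .writeAndClose(Path("data/data.parquet"), Seq(Data(1L, "a"), Data(2L, "b")))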
  trait CustomBuilder[T] {

    /** @param options
      *   writer options
      */
    def options(options: ParquetWriter.Options): CustomBuilder[T]

    def writeAndClose(data: Iterable[T]): Unit

    def build(): ParquetWriter[T]
  }

  private case class CustomBuilderImpl[T, B <: HadoopParquetWriter.Builder[T, B]](
      builder: B,
      options: Options = Options()
  ) extends CustomBuilder[T] {
    override def options(options: Options): CustomBuilder[T] = this.copy(options = options)

    override def writeAndClose(data: Iterable[T]): Unit = Using.resource(build())(_.write(data))

    override def build(): ParquetWriter[T] =
      new ParquetWriterImpl[T, T](
        internalWriter  = options.applyTo[T, B](builder).build(),
        encode          = identity,
        fileDescription = "custom"
      )
  }

  private case class BuilderImpl[T](options: Options = Options())(implicit
      encoder: ParquetRecordEncoder[T],
      schemaResolver: ParquetSchemaResolver[T]
  ) extends Builder[T] {
    override def options(options: Options): Builder[T] = this.copy(options = options)

    override def build(file: OutputFile): ParquetWriter[T] =
      new ParquetWriterImpl[T, RowParquetRecord](
        internalWriter = internalWriter(
          file           = file,
          schema         = ParquetSchemaResolver.resolveSchema[T],
          metadataWriter = encoder,
          options        = options
        ),
        encode          = ParquetRecordEncoder.encode(_, ValueCodecConfiguration(options)),
        fileDescription = file.toString
      )

    override def build(path: Path): ParquetWriter[T] = build(path.toOutputFile(options))

    override def writeAndClose(file: OutputFile, data: Iterable[T]): Unit =
      Using.resource(build(file))(_.write(data))

    override def writeAndClose(path: Path, data: Iterable[T]): Unit =
      Using.resource(build(path))(_.write(data))
  }

  private[parquet4s] def internalWriter(
      file: OutputFile,
      schema: MessageType,
      metadataWriter: MetadataWriter,
      options: Options
  ): InternalWriter =
    options
      .applyTo[RowParquetRecord, InternalBuilder](new InternalBuilder(file, schema, metadataWriter))
      .build()
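
  // A sketch of plugging a third-party Hadoop writer builder into `custom` (defined further below).
  // This assumes parquet-avro is on the classpath; `avroSchema`, `file` (an OutputFile) and `records`
  // are hypothetical values, not part of this library.
  //
  //   import org.apache.avro.generic.GenericRecord
  //   import org.apache.parquet.avro.AvroParquetWriter
  //
  //   ParquetWriter
  //     .custom[GenericRecord, AvroParquetWriter.Builder[GenericRecord]](
  //       AvroParquetWriter.builder[GenericRecord](file).withSchema(avroSchema)
  //     )
  //     .options(ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY))
  //     .writeAndClose(records)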
  /** Writes an iterable collection of data as Parquet files at the given path. Path can represent a local file or
    * directory, HDFS, AWS S3, Google Storage, Azure, etc. Please refer to Hadoop client documentation or your data
    * provider in order to know how to configure the connection.
    *
    * @param path
    *   [[Path]] where the data will be written to
    * @param data
    *   Collection of T that will be written in Parquet file format
    * @param options
    *   configuration of writer, see [[ParquetWriter.Options]]
    * @param writerFactory
    *   [[ParquetWriterFactory]] that will be used to create an instance of writer
    * @tparam T
    *   type of data, will be used also to resolve the schema of Parquet files
    */
  @deprecated("2.0.0", "Use builder api by calling 'of[T]' or 'generic'")
  def writeAndClose[T](path: Path, data: Iterable[T], options: ParquetWriter.Options = ParquetWriter.Options())(implicit
      writerFactory: ParquetWriterFactory[T]
  ): Unit =
    Using.resource(writerFactory(path, options))(_.write(data))

  @deprecated("2.0.0", "Use builder api by calling 'of[T]' or 'generic'")
  def writer[T](path: Path, options: ParquetWriter.Options = ParquetWriter.Options())(implicit
      writerFactory: ParquetWriterFactory[T]
  ): ParquetWriter[T] =
    writerFactory(path, options)

  /** Default instance of [[ParquetWriterFactory]]
    */
  @deprecated("2.0.0", "Use builder api by calling 'of[T]' or 'generic'")
  implicit def writerFactory[T: ParquetRecordEncoder: ParquetSchemaResolver]: ParquetWriterFactory[T] =
    (path, options) => {
      val vcc = ValueCodecConfiguration(options)
      new ParquetWriterImpl[T, RowParquetRecord](
        internalWriter = internalWriter(
          file           = path.toOutputFile(options),
          schema         = ParquetSchemaResolver.resolveSchema[T],
          metadataWriter = implicitly[ParquetRecordEncoder[T]],
          options        = options
        ),
        encode          = entity => ParquetRecordEncoder.encode(entity, vcc),
        fileDescription = path.toString
      )
    }

  /** Creates [[Builder]] of [[ParquetWriter]] for documents of type T.
    */
  def of[T: ParquetRecordEncoder: ParquetSchemaResolver]: Builder[T] = BuilderImpl()

  /** Creates [[Builder]] of [[ParquetWriter]] for generic records.
    */
  def generic(message: MessageType): Builder[RowParquetRecord] =
    BuilderImpl()(RowParquetRecord.genericParquetRecordEncoder, RowParquetRecord.genericParquetSchemaResolver(message))
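
  // A sketch of writing untyped records via `generic`. The schema string and the output path are
  // hypothetical; MessageTypeParser comes from the Parquet library, RowParquetRecord from this one.
  //
  //   import org.apache.parquet.schema.MessageTypeParser
  //
  //   val schema = MessageTypeParser.parseMessageType(
  //     "message record { required int64 id; optional binary name (UTF8); }"
  //   )
  //
  //   val writer = ParquetWriter.generic(schema).build(Path("generic.parquet"))
  //   // RowParquetRecord instances matching the schema can now be passed to writer.write(...),
  //   // and the writer must be closed when done.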
  @experimental
  def custom[T, B <: HadoopParquetWriter.Builder[T, B]](builder: B): CustomBuilder[T] =
    CustomBuilderImpl(builder)

}

private class ParquetWriterImpl[T, W](
    internalWriter: HadoopParquetWriter[W],
    encode: T => W,
    fileDescription: String
) extends ParquetWriter[T] {
  private val logger = LoggerFactory.getLogger(this.getClass)
  private var closed = false

  override def write(data: Iterable[T]): Unit =
    if (closed) {
      throw new IllegalStateException("Attempted to write with a writer which was already closed")
    } else {
      data.foreach { elem =>
        internalWriter.write(encode(elem))
      }
    }

  override def write(data: T*): Unit = this.write(data)

  override def close(): Unit = synchronized {
    if (closed) {
      logger.warn("Attempted to close a writer which was already closed")
    } else {
      if (logger.isDebugEnabled) {
        logger.debug(s"Finished writing to $fileDescription and closing writer.")
      }
      closed = true
      try internalWriter.close()
      catch {
        case NonFatal(_) => // ignores bug in Parquet
      }
    }
  }
}

private class ParquetWriteSupport(schema: MessageType, metadata: Map[String, String], extraMetadata: MetadataWriter)
    extends WriteSupport[RowParquetRecord] {
  private var consumer: RecordConsumer = _

  override def init(configuration: Configuration): WriteContext = new WriteContext(schema, metadata.asJava)

  override def write(record: RowParquetRecord): Unit = {
    consumer.startMessage()
    record.iterator.foreach {
      case (_, NullValue) => // ignoring nulls
      case (name, value) =>
        val fieldIndex = schema.getFieldIndex(name)
        consumer.startField(name, fieldIndex)
        value.write(schema.getType(fieldIndex), consumer)
        consumer.endField(name, fieldIndex)
    }
    consumer.endMessage()
  }

  override def prepareForWrite(recordConsumer: RecordConsumer): Unit = consumer = recordConsumer

  override def finalizeWrite(): FinalizedWriteContext = new FinalizedWriteContext(extraMetadata.getMetadata().asJava)
}