package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.etl.{CompoundParquetIterable, InMemoryParquetIterable, Join}

import java.io.Closeable

object ParquetIterable {

  private[parquet4s] def apply[T: ParquetRecordDecoder](
      iteratorFactory: () => Iterator[RowParquetRecord] & Closeable,
      valueCodecConfiguration: ValueCodecConfiguration,
      stats: Stats
  ): ParquetIterable[T] =
    new ParquetIterableImpl(
      iteratorFactory         = iteratorFactory,
      valueCodecConfiguration = valueCodecConfiguration,
      stats                   = stats,
      transformations         = Seq.empty,
      decode                  = record => ParquetRecordDecoder.decode(record, valueCodecConfiguration)
    )

  def inMemory[T: ParquetRecordDecoder](
      data: => Iterable[RowParquetRecord],
      valueCodecConfiguration: ValueCodecConfiguration = ValueCodecConfiguration.Default
  ): ParquetIterable[T] =
    new InMemoryParquetIterable[T](
      data                    = data,
      valueCodecConfiguration = valueCodecConfiguration,
      decode                  = record => ParquetRecordDecoder.decode(record, valueCodecConfiguration)
    )
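
  // Usage sketch (illustrative, not part of the original source): `inMemory` is handy for tests or for
  // composing small datasets without touching disk. The case class and the record construction below are
  // hypothetical and assume `ValueImplicits._` is imported for the `.value` syntax:
  //
  //   case class MyData(id: Long)
  //   val records  = Seq(RowParquetRecord("id" -> 1L.value), RowParquetRecord("id" -> 2L.value))
  //   val iterable = ParquetIterable.inMemory[MyData](records)
  //   iterable.foreach(println)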

  implicit class RowParquetRecordIterableOps(iterable: ParquetIterable[RowParquetRecord]) {

    /** Creates a new instance of [[ParquetIterable]] with a new [[ParquetRecordDecoder]] applied. Especially useful
      * for decoding a whole dataset of [[RowParquetRecord]] to the desired class.
      * @tparam U
      *   the type that the dataset shall be decoded to
      */
    def as[U: ParquetRecordDecoder]: ParquetIterable[U] = iterable.changeDecoder[U]
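
    // Usage sketch (illustrative, not part of the original source): decoding a generic dataset into a case
    // class; the `User` class and the file name are hypothetical, while `ParquetReader.generic`, `Path` and
    // `ParquetIterable` come from this package:
    //
    //   case class User(id: Long, name: String)
    //   val generic = ParquetReader.generic.read(Path("users.parquet"))
    //   val typed: ParquetIterable[User] = generic.as[User]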

    /** Joins this dataset with the `right` dataset using a left join. Matches records from this dataset with records
      * of the `right` one whose `onLeft` and `onRight` columns are equal, respectively.
      *
      * Take note that, while this dataset is iterated over lazily, the whole `right` dataset is loaded into memory
      * beforehand. Therefore, it is recommended to run this operation only if the `right` dataset is small enough to
      * fit into memory. Consider using `rightJoin` and swapping the datasets if this dataset is actually smaller than
      * the `right` one.
      * @param right
      *   the dataset to join with
      * @param onLeft
      *   join column in this dataset
      * @param onRight
      *   join column in the `right` dataset
      * @return
      *   [[ParquetIterable]] with joined records
      */
    @experimental
    def leftJoin(
        right: ParquetIterable[RowParquetRecord],
        onLeft: ColumnPath,
        onRight: ColumnPath
    ): ParquetIterable[RowParquetRecord] = Join.left(iterable, right, onLeft, onRight)
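
    // Usage sketch (illustrative, not part of the original source): joining two generic datasets; the file
    // names are hypothetical, `Col` builds a [[ColumnPath]]:
    //
    //   val users     = ParquetReader.generic.read(Path("users.parquet"))
    //   val addresses = ParquetReader.generic.read(Path("addresses.parquet"))
    //   val joined    = users.leftJoin(addresses, onLeft = Col("id"), onRight = Col("userId"))
    //   // every user is emitted; address fields are absent where no match was found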

    /** Joins this dataset with the `right` dataset using a right join. Matches records from this dataset with records
      * of the `right` one whose `onLeft` and `onRight` columns are equal, respectively.
      *
      * Take note that, while this dataset is iterated over lazily, the whole `right` dataset is loaded into memory
      * beforehand. Therefore, it is recommended to run this operation only if the `right` dataset is small enough to
      * fit into memory. Consider using `leftJoin` and swapping the datasets if this dataset is actually smaller than
      * the `right` one.
      * @param right
      *   the dataset to join with
      * @param onLeft
      *   join column in this dataset
      * @param onRight
      *   join column in the `right` dataset
      * @return
      *   [[ParquetIterable]] with joined records
      */
    @experimental
    def rightJoin(
        right: ParquetIterable[RowParquetRecord],
        onLeft: ColumnPath,
        onRight: ColumnPath
    ): ParquetIterable[RowParquetRecord] = Join.right(iterable, right, onLeft, onRight)
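
    // Usage sketch (illustrative): the mirror image of `leftJoin` above; every address is emitted, user
    // fields are absent where no match was found:
    //
    //   val joined = users.rightJoin(addresses, onLeft = Col("id"), onRight = Col("userId"))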

    /** Joins this dataset with the `right` dataset using an inner join. Matches records from this dataset with
      * records of the `right` one whose `onLeft` and `onRight` columns are equal, respectively.
      *
      * Take note that, while this dataset is iterated over lazily, the whole `right` dataset is loaded into memory
      * beforehand. Therefore, it is recommended to run this operation only if the `right` dataset is small enough to
      * fit into memory. Consider swapping the datasets if this dataset is actually smaller than the `right` one.
      * @param right
      *   the dataset to join with
      * @param onLeft
      *   join column in this dataset
      * @param onRight
      *   join column in the `right` dataset
      * @return
      *   [[ParquetIterable]] with joined records
      */
    @experimental
    def innerJoin(
        right: ParquetIterable[RowParquetRecord],
        onLeft: ColumnPath,
        onRight: ColumnPath
    ): ParquetIterable[RowParquetRecord] = Join.inner(iterable, right, onLeft, onRight)
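
    // Usage sketch (illustrative): only matching pairs are emitted:
    //
    //   val joined = users.innerJoin(addresses, onLeft = Col("id"), onRight = Col("userId"))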

    /** Joins this dataset with the `right` dataset using a full join. Matches records from this dataset with records
      * of the `right` one whose `onLeft` and `onRight` columns are equal, respectively.
      *
      * Take note that, while this dataset is iterated over lazily, the whole `right` dataset is loaded into memory
      * beforehand. Therefore, it is recommended to run this operation only if the `right` dataset is small enough to
      * fit into memory. Consider swapping the datasets if this dataset is actually smaller than the `right` one.
      * @param right
      *   the dataset to join with
      * @param onLeft
      *   join column in this dataset
      * @param onRight
      *   join column in the `right` dataset
      * @return
      *   [[ParquetIterable]] with joined records
      */
    @experimental
    def fullJoin(
        right: ParquetIterable[RowParquetRecord],
        onLeft: ColumnPath,
        onRight: ColumnPath
    ): ParquetIterable[RowParquetRecord] = Join.full(iterable, right, onLeft, onRight)
  }
}

/** Allows iterating over Parquet file(s). Remember to call `close()` when you are done.
  *
  * @tparam T
  *   type that represents the schema of the Parquet file
  */
trait ParquetIterable[T] extends Iterable[T] with Closeable {

  /** Returns the minimum value of the underlying dataset at the given path.
    *
    * @param columnPath
    *   [[ColumnPath]]
    * @tparam V
    *   type of data at the given path
    * @return
    *   the minimum value, or [[scala.None]] if there is no matching data or the path is invalid
    */
  def min[V: Ordering: ValueDecoder](columnPath: ColumnPath): Option[V] = stats.min[V](columnPath)

  /** Returns the maximum value of the underlying dataset at the given path.
    *
    * @param columnPath
    *   [[ColumnPath]]
    * @tparam V
    *   type of data at the given path
    * @return
    *   the maximum value, or [[scala.None]] if there is no matching data or the path is invalid
    */
  def max[V: Ordering: ValueDecoder](columnPath: ColumnPath): Option[V] = stats.max[V](columnPath)

  /** Returns the size of the underlying dataset. Any filter is taken into account during the calculation. Tries to
    * leverage Parquet metadata and statistics in order to avoid a full traversal of the dataset.
    */
  override def size: Int = stats.recordCount.toInt
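
  // Usage sketch (illustrative, not part of the original source): `min`, `max` and `size` are backed by
  // [[Stats]], so they can often be answered from Parquet metadata without reading every record. The file
  // name and column below are hypothetical:
  //
  //   val users  = ParquetReader.generic.read(Path("users.parquet"))
  //   val oldest = users.max[Int](Col("age")) // Option[Int]
  //   val count  = users.size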
  /** Appends the `other` dataset to this one. `other` is not read until this dataset has been fully iterated over.
    * @param other
    *   dataset to be concatenated with this one
    */
  @experimental
  def concat(other: ParquetIterable[T]): ParquetIterable[T] = new CompoundParquetIterable(Iterable(this, other))

  /** Writes this dataset to the given path and releases all opened resources.
    *
    * @param path
    *   path to write the dataset to
    * @param options
    *   [[ParquetWriter.Options]]
    */
  @experimental
  def writeAndClose(path: Path, options: ParquetWriter.Options = ParquetWriter.Options())(implicit
      schemaResolver: ParquetSchemaResolver[T],
      encoder: ParquetRecordEncoder[T]
  ): Unit =
    try ParquetWriter.of[T].options(options).writeAndClose(path, this)
    finally this.close()

  private[parquet4s] def appendTransformation(
      transformation: RowParquetRecord => Iterable[RowParquetRecord]
  ): ParquetIterable[T]

  private[parquet4s] def changeDecoder[U: ParquetRecordDecoder]: ParquetIterable[U]

  private[parquet4s] def stats: Stats

  private[parquet4s] def valueCodecConfiguration: ValueCodecConfiguration
}

private[parquet4s] class ParquetIterableImpl[T](
    iteratorFactory: () => Iterator[RowParquetRecord] & Closeable,
    override val valueCodecConfiguration: ValueCodecConfiguration,
    override val stats: Stats,
    transformations: Seq[RowParquetRecord => Iterable[RowParquetRecord]],
    decode: RowParquetRecord => T
) extends ParquetIterable[T] {

  // Iterators opened so far; all of them are closed when `close()` is called.
  private var openCloseables: Set[Closeable] = Set.empty

  override def iterator: Iterator[T] = {
    val iterator = iteratorFactory()
    this.synchronized {
      openCloseables = openCloseables + iterator
    }
    if (transformations.isEmpty) iterator.map(decode)
    else
      iterator.flatMap(record =>
        transformations
          .foldLeft(Seq(record)) { case (iterables, transformation) =>
            iterables.flatMap(transformation)
          }
          .map(decode)
      )
  }

  override def close(): Unit =
    openCloseables.synchronized {
      openCloseables.foreach(_.close())
      openCloseables = Set.empty
    }

  override private[parquet4s] def appendTransformation(
      transformation: RowParquetRecord => Iterable[RowParquetRecord]
  ): ParquetIterable[T] =
    new ParquetIterableImpl[T](
      iteratorFactory         = iteratorFactory,
      valueCodecConfiguration = valueCodecConfiguration,
      stats                   = stats,
      transformations         = transformations :+ transformation,
      decode                  = decode
    )

  override private[parquet4s] def changeDecoder[U: ParquetRecordDecoder]: ParquetIterable[U] =
    new ParquetIterableImpl[U](
      iteratorFactory         = iteratorFactory,
      valueCodecConfiguration = valueCodecConfiguration,
      stats                   = stats,
      transformations         = transformations,
      decode = (record: RowParquetRecord) => ParquetRecordDecoder.decode(record, valueCodecConfiguration)
    )
}
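
// End-to-end usage sketch (illustrative, not part of the original source): reading two generic datasets,
// concatenating them, decoding to a case class, and writing the result. The file names and the `User` class
// are hypothetical; `writeAndClose` releases the resources of the combined iterable when it is done:
//
//   case class User(id: Long, name: String)
//
//   val users2023 = ParquetReader.generic.read(Path("users-2023.parquet"))
//   val users2024 = ParquetReader.generic.read(Path("users-2024.parquet"))
//   users2023.concat(users2024).as[User].writeAndClose(Path("users-all.parquet"))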



