com.github.mjakubowski84.parquet4s.ParquetReader.scala

package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.etl.CompoundParquetIterable
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.hadoop.util.HadoopInputFile
import org.apache.parquet.io.InputFile
import org.apache.parquet.schema.{MessageType, Type}
import org.slf4j.{Logger, LoggerFactory}

import java.io.Closeable
import java.util.TimeZone

object ParquetReader extends IOOps {

  /** Configuration settings that are used during decoding or reading of Parquet files.
    *
    * @param timeZone
    *   set it to the [[java.util.TimeZone]] that was used to encode the time-based data you want to read; the
    *   machine's time zone is used by default
    * @param useHadoopVectoredIo
    *   whether to use Hadoop's vectored I/O feature to vectorise (and parallelise) reading of file chunks
    * @param hadoopConf
    *   use it to programmatically override Hadoop's [[org.apache.hadoop.conf.Configuration]]
    */
  case class Options(
      timeZone: TimeZone           = TimeZone.getDefault,
      useHadoopVectoredIo: Boolean = org.apache.parquet.hadoop.ParquetInputFormat.HADOOP_VECTORED_IO_DEFAULT,
      hadoopConf: Configuration    = new Configuration()
  ) {
    private[parquet4s] def applyTo[T](builder: ParquetIterator.HadoopBuilder[T]): ParquetIterator.HadoopBuilder[T] =
      builder.withConf(hadoopConf)
  }
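
  // Example (sketch, not part of the library): constructing Options with a custom time zone and a
  // programmatically tuned Hadoop Configuration; the property key below is only illustrative.
  //
  //   val hadoopConf = new Configuration()
  //   hadoopConf.set("fs.s3a.path.style.access", "true") // hypothetical S3A setting
  //   val readerOptions = ParquetReader.Options(
  //     timeZone   = TimeZone.getTimeZone("UTC"),
  //     hadoopConf = hadoopConf
  //   )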

  /** Builds an instance of [[ParquetIterable]]
    * @tparam T
    *   type of data generated by the source.
    */
  trait Builder[T] {

    /** @param options
      *   configuration of how Parquet files should be read
      */
    def options(options: ParquetReader.Options): Builder[T]

    /** @param filter
      *   optional before-read filter; no filtering is applied by default; check [[Filter]] for more details
      */
    def filter(filter: Filter): Builder[T]

    /** Attempts to read data as partitioned. Partition names must follow the Hive format. Partition values are set on
      * the corresponding fields of the read records; see the layout sketch below.
      */
    @deprecated(
      message = "Reading always tries to resolve partitions if the provided path is a directory.",
      since   = "2.12.0"
    )
    def partitioned: Builder[T]
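
    // Example (sketch): a Hive-style partitioned layout that is resolved automatically when the provided
    // path is a directory; the partition values ("PL", "2024") are injected into the corresponding fields
    // of every record read from that branch:
    //
    //   file:///data/users/country=PL/year=2024/part-00000.parquet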

    /** @param path
      *   [[Path]] to Parquet files, e.g.: {{{Path("file:///data/users")}}}
      * @param decoder
      *   decodes [[RowParquetRecord]] to your data type
      * @return
      *   final [[ParquetIterable]]
      * @throws scala.IllegalArgumentException
      *   when reading an inconsistent partition directory
      */
    def read(path: Path)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T]

    /** @param inputFile
      *   the InputFile to read from
      * @param decoder
      *   decodes [[RowParquetRecord]] to your data type
      * @return
      *   final [[ParquetIterable]]
      */
    @experimental
    def read(inputFile: InputFile)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T]
  }
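
  // Example (sketch, assuming a case class User(id: Long, name: String) that matches the file schema):
  // the typical builder flow is options -> filter -> read; remember to close the returned iterable.
  //
  //   val users = ParquetReader
  //     .as[User]
  //     .options(ParquetReader.Options(timeZone = TimeZone.getTimeZone("UTC")))
  //     .filter(Col("id") > 100L)
  //     .read(Path("file:///data/users"))
  //   try users.foreach(println)
  //   finally users.close()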

  private case class BuilderImpl[T](
      options: ParquetReader.Options                               = ParquetReader.Options(),
      filter: Filter                                               = Filter.noopFilter,
      projectedSchemaResolverOpt: Option[ParquetSchemaResolver[T]] = None,
      columnProjections: Seq[ColumnProjection]                     = Seq.empty
  ) extends Builder[T] {
    override def options(options: ParquetReader.Options): Builder[T] =
      this.copy(options = options)

    override def filter(filter: Filter): Builder[T] =
      this.copy(filter = filter)

    override def partitioned: Builder[T] = this

    override def read(path: Path)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] =
      read(path.toInputFile(options))

    override def read(inputFile: InputFile)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] = {
      val valueCodecConfiguration = ValueCodecConfiguration(options)

      inputFile match {
        case hadoopInputFile: HadoopInputFile =>
          partitionedIterable(Path(hadoopInputFile.getPath), valueCodecConfiguration, filter, options)
        case _ =>
          singleIterable(
            inputFile               = inputFile,
            valueCodecConfiguration = valueCodecConfiguration,
            projectedSchemaOpt =
              projectedSchemaResolverOpt.map(implicit resolver => ParquetSchemaResolver.resolveSchema[T]),
            filterCompat     = filter.toFilterCompat(valueCodecConfiguration),
            partitionViewOpt = None,
            readerOptions    = options
          )
      }
    }

    private def partitionedIterable(
        path: Path,
        valueCodecConfiguration: ValueCodecConfiguration,
        filter: Filter,
        readerOptions: ParquetReader.Options
    )(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] =
      listPartitionedDirectory(path, readerOptions.hadoopConf, filter, valueCodecConfiguration) match {
        case Left(exception) =>
          throw exception
        case Right(partitionedDirectory) =>
          val projectedSchemaOpt = projectedSchemaResolverOpt.map(implicit resolver =>
            ParquetSchemaResolver.resolveSchema(partitionedDirectory.schema)
          )
          lazy val fallbackFilterCompat = filter.toNonPredicateFilterCompat
          val iterables = partitionedDirectory.paths.map { partitionedPath =>
            singleIterable(
              inputFile               = partitionedPath.inputFile,
              valueCodecConfiguration = valueCodecConfiguration,
              projectedSchemaOpt      = projectedSchemaOpt,
              filterCompat            = partitionedPath.filterPredicateOpt.fold(fallbackFilterCompat)(FilterCompat.get),
              partitionViewOpt        = Option(partitionedPath.view),
              readerOptions           = readerOptions
            ).appendTransformation(setPartitionValues(partitionedPath))
          }
          new CompoundParquetIterable[T](iterables)
      }

    private def singleIterable(
        inputFile: InputFile,
        valueCodecConfiguration: ValueCodecConfiguration,
        projectedSchemaOpt: Option[MessageType],
        filterCompat: FilterCompat.Filter,
        partitionViewOpt: Option[PartitionView],
        readerOptions: ParquetReader.Options
    )(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] = {
      if (logger.isDebugEnabled) {
        logger.debug(s"Creating ParquetIterable for file $inputFile")
      }
      ParquetIterable[T](
        iteratorFactory = ParquetIterator
          .factory(inputFile, projectedSchemaOpt, columnProjections, filterCompat, decoder, readerOptions),
        valueCodecConfiguration = valueCodecConfiguration,
        stats = Stats(inputFile, valueCodecConfiguration, projectedSchemaOpt, filterCompat, partitionViewOpt)
      )
    }

    private def setPartitionValues(partitionedPath: PartitionedPath)(
        record: RowParquetRecord
    ): Iterable[RowParquetRecord] =
      Option(
        partitionedPath.partitions.foldLeft(record) { case (currentRecord, (columnPath, value)) =>
          currentRecord.updated(columnPath, BinaryValue(value))
        }
      )
  }

  @experimental
  trait CustomBuilder[T] {

    /** @param options
      *   configuration of how Parquet files should be read
      */
    def options(options: Options): CustomBuilder[T]

    /** @param filter
      *   optional before-read filter; no filtering is applied by default; check [[Filter]] for more details
      */
    def filter(filter: Filter): CustomBuilder[T]

    /** @return
      *   final closeable [[scala.collection.Iterable]]
      */
    def read: Iterable[T] & Closeable
  }
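
  // Example (sketch, assuming parquet-avro is on the classpath and org.apache.parquet.avro.AvroParquetReader
  // plus org.apache.avro.generic.GenericRecord are imported): plugging a third-party
  // org.apache.parquet.hadoop.ParquetReader.Builder into parquet4s.
  //
  //   val avroBuilder = AvroParquetReader.builder[GenericRecord](
  //     HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path("file:///data/users/part-00000.parquet"), new Configuration())
  //   )
  //   val records = ParquetReader.custom(avroBuilder).filter(Col("id") > 100L).read
  //   try records.foreach(println)
  //   finally records.close()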

  private case class CustomBuilderImpl[T](
      builder: ParquetIterator.HadoopBuilder[T],
      options: Options,
      filter: Filter
  ) extends CustomBuilder[T] {
    override def options(options: Options): CustomBuilder[T] = this.copy(options = options)

    override def filter(filter: Filter): CustomBuilder[T] = this.copy(filter = filter)

    override def read: Iterable[T] & Closeable = {
      val vcc = ValueCodecConfiguration(options)
      closeableIterable(
        iteratorFactory = () => new ParquetIterator[T](options.applyTo(builder).withFilter(filter.toFilterCompat(vcc)))
      )
    }

    private def closeableIterable(iteratorFactory: () => Iterator[T] & Closeable): Iterable[T] & Closeable =
      new Iterable[T] with Closeable {
        private var openCloseables: Set[Closeable] = Set.empty

        override def iterator: Iterator[T] = {
          val iterator = iteratorFactory()
          this.synchronized {
            openCloseables = openCloseables + iterator
          }
          iterator
        }

        override def close(): Unit =
          openCloseables.synchronized {
            openCloseables.foreach(_.close())
            openCloseables = Set.empty
          }
      }
  }

  override val logger: Logger = LoggerFactory.getLogger(this.getClass)

  /** Creates a new [[ParquetIterable]] over data from the given path. The path can represent a local file or
    * directory, HDFS, AWS S3, Google Storage, Azure, etc. Please refer to the Hadoop client documentation or your
    * data provider in order to know how to configure the connection.
    *
    * @note
    *   Remember to call `close()` on the iterable in order to free resources!
    *
    * @param path
    *   [[Path]] to Parquet files, e.g.: {{{Path("file:///data/users")}}}
    * @param options
    *   configuration of how Parquet files should be read
    * @param filter
    *   optional before-read filtering; no filtering is applied by default; check [[Filter]] for more details
    * @tparam T
    *   type of data that represents the schema of the Parquet file, e.g.:
    *   {{{case class MyData(id: Long, name: String, created: java.sql.Timestamp)}}}
    */
  @deprecated(
    message = "use builder API by calling 'as[T]', 'projectedAs[T]', 'generic' or 'projectedGeneric'",
    since   = "2.0.0"
  )
  def read[T: ParquetRecordDecoder: ParquetSchemaResolver](
      path: Path,
      options: Options = Options(),
      filter: Filter   = Filter.noopFilter
  ): ParquetIterable[T] =
    projectedAs[T].options(options).filter(filter).read(path)

  /** Creates [[Builder]] of Parquet reader for documents of type T. */
  def as[T]: Builder[T] = BuilderImpl()

  /** Creates [[Builder]] of Parquet reader for projected documents of type T. Due to projection, the reader does not
    * attempt to read all existing columns of the file but applies the enforced projection schema.
    */
  def projectedAs[T: ParquetSchemaResolver]: Builder[T] = BuilderImpl(
    projectedSchemaResolverOpt = Option(implicitly[ParquetSchemaResolver[T]])
  )

  /** Creates [[Builder]] of Parquet reader returning generic records. */
  def generic: Builder[RowParquetRecord] = BuilderImpl()

  /** Creates [[Builder]] of Parquet reader returning projected generic records. Due to projection, the reader does
    * not attempt to read all existing columns of the file but applies the enforced projection schema.
    */
  def projectedGeneric(projectedSchema: MessageType): Builder[RowParquetRecord] = BuilderImpl(
    projectedSchemaResolverOpt = Option(RowParquetRecord.genericParquetSchemaResolver(projectedSchema))
  )

  // format: off
  /** Creates [[Builder]] of Parquet reader returning projected generic records. Due to projection, the reader does
    * not attempt to read all existing columns of the file but applies the enforced projection schema. Besides simple
    * projection one can use aliases and extract values from nested fields - in a way similar to SQL.
    *
    * @example
    *   {{{
    *   projectedGeneric(
    *     Col("foo").as[Int], // selects Int column "foo"
    *     Col("bar.baz").as[String], // selects String field "bar.baz", creates column "baz" with the value of "baz"
    *     Col("bar.baz").as[String].alias("bar_baz") // selects String field "bar.baz", creates column "bar_baz" with the value of "baz"
    *   )
    *   }}}
    * @param col
    *   first column projection
    * @param cols
    *   next column projections
    */
  // format: on
  def projectedGeneric(col: TypedColumnPath[?], cols: TypedColumnPath[?]*): Builder[RowParquetRecord] = {
    val (fields, columnProjections) = (col +: cols.toVector).zipWithIndex
      .foldLeft((Vector.empty[Type], Vector.empty[ColumnProjection])) {
        case ((fields, projections), (columnPath, ordinal)) =>
          val updatedFields      = fields :+ columnPath.toType
          val updatedProjections = projections :+ ColumnProjection(columnPath, ordinal)
          updatedFields -> updatedProjections
      }
    BuilderImpl(
      projectedSchemaResolverOpt = Option(RowParquetRecord.genericParquetSchemaResolver(Message.merge(fields))),
      columnProjections          = columnProjections
    )
  }

  /** Creates [[CustomBuilder]] for reading Parquet data using a custom internal implementation.
    * @param builder
    *   custom internal implementation
    * @tparam T
    *   type of read data
    */
  @experimental
  def custom[T](builder: org.apache.parquet.hadoop.ParquetReader.Builder[T]): CustomBuilder[T] =
    CustomBuilderImpl(
      builder = builder,
      options = Options(),
      filter  = Filter.noopFilter
    )
}
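
// Example (sketch): projecting and aliasing nested fields with the typed-column variant of projectedGeneric;
// each selected path becomes a top-level column of the resulting generic records.
//
//   val cities = ParquetReader
//     .projectedGeneric(
//       Col("id").as[Long],
//       Col("address.city").as[String].alias("city")
//     )
//     .read(Path("file:///data/users"))
//   try cities.foreach(println)
//   finally cities.close()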