package com.github.mjakubowski84.parquet4s
import com.github.mjakubowski84.parquet4s.etl.CompoundParquetIterable
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.hadoop.util.HadoopInputFile
import org.apache.parquet.io.InputFile
import org.apache.parquet.schema.{MessageType, Type}
import org.slf4j.{Logger, LoggerFactory}
import java.io.Closeable
import java.util.TimeZone
object ParquetReader extends IOOps {
/** Configuration settings that are used while reading and decoding Parquet files.
*
* @param timeZone
* set it to [[java.util.TimeZone]] which was used to encode time-based data that you want to read; machine's time
* zone is used by default
* @param useHadoopVectoredIo
* whether to use Hadoop's vectored I/O feature to vectorise (and parallelise) the reading of file chunks
* @param hadoopConf
* use it to programmatically override Hadoop's [[org.apache.hadoop.conf.Configuration]]
*/
case class Options(
timeZone: TimeZone = TimeZone.getDefault,
useHadoopVectoredIo: Boolean = org.apache.parquet.hadoop.ParquetInputFormat.HADOOP_VECTORED_IO_DEFAULT,
hadoopConf: Configuration = new Configuration()
) {
private[parquet4s] def applyTo[T](builder: ParquetIterator.HadoopBuilder[T]): ParquetIterator.HadoopBuilder[T] =
builder.withConf(hadoopConf)
}
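// Sketch of constructing custom Options (hedged; the UTC time zone and the Hadoop property below are
// illustrative choices, not defaults of this library):
//   val conf = new Configuration()
//   conf.set("fs.s3a.endpoint", "http://localhost:9000") // hypothetical S3-compatible endpoint
//   val options = ParquetReader.Options(timeZone = TimeZone.getTimeZone("UTC"), hadoopConf = conf)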
/** Builds an instance of [[ParquetIterable]]
* @tparam T
* type of data generated by the source.
*/
trait Builder[T] {
/** @param options
* configuration of how Parquet files should be read
*/
def options(options: ParquetReader.Options): Builder[T]
/** @param filter
* optional before-read filter; no filtering is applied by default; check [[Filter]] for more details
*/
def filter(filter: Filter): Builder[T]
/** Attempts to read data as partitioned. Partition names must follow the Hive format. Partition values will be set
* in the corresponding fields of the read records.
*/
@deprecated(
message = "Reading always tries to resolve partitions if the provided path is a directory.",
since = "2.12.0"
)
def partitioned: Builder[T]
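// Illustrative Hive-style partitioned layout that partition resolution expects (the paths are made up):
//   file:///data/users/country=PL/part-00000.parquet
//   file:///data/users/country=US/part-00000.parquet
// Values of "country" are injected into the read records as regular fields.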
/** @param path
* [[Path]] to Parquet files, e.g.: {{{Path("file:///data/users")}}}
* @param decoder
* decodes [[RowParquetRecord]] to your data type
* @return
* final [[ParquetIterable]]
* @throws scala.IllegalArgumentException
* when reading an inconsistent partition directory
*/
def read(path: Path)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T]
/** @param inputFile
* the InputFile to read from
* @param decoder
* decodes [[RowParquetRecord]] to your data type
* @return
* final [[ParquetIterable]]
*/
@experimental
def read(inputFile: InputFile)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T]
}
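// Typical builder usage (a sketch; the User case class, the path and the "age" column are illustrative
// assumptions, not part of this file):
//   case class User(id: Long, name: String, age: Int)
//   val users = ParquetReader
//     .as[User]
//     .filter(Col("age") >= 18)
//     .read(Path("file:///data/users"))
//   try users.foreach(println)
//   finally users.close()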
private case class BuilderImpl[T](
options: ParquetReader.Options = ParquetReader.Options(),
filter: Filter = Filter.noopFilter,
projectedSchemaResolverOpt: Option[ParquetSchemaResolver[T]] = None,
columnProjections: Seq[ColumnProjection] = Seq.empty
) extends Builder[T] {
override def options(options: ParquetReader.Options): Builder[T] =
this.copy(options = options)
override def filter(filter: Filter): Builder[T] =
this.copy(filter = filter)
override def partitioned: Builder[T] = this
override def read(path: Path)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] =
read(path.toInputFile(options))
override def read(inputFile: InputFile)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] = {
val valueCodecConfiguration = ValueCodecConfiguration(options)
inputFile match {
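// A HadoopInputFile exposes a concrete path, so partition discovery can be attempted on it;
// any other InputFile is read as a single, non-partitioned source.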
case hadoopInputFile: HadoopInputFile =>
partitionedIterable(Path(hadoopInputFile.getPath), valueCodecConfiguration, filter, options)
case _ =>
singleIterable(
inputFile = inputFile,
valueCodecConfiguration = valueCodecConfiguration,
projectedSchemaOpt =
projectedSchemaResolverOpt.map(implicit resolver => ParquetSchemaResolver.resolveSchema[T]),
filterCompat = filter.toFilterCompat(valueCodecConfiguration),
partitionViewOpt = None,
readerOptions = options
)
}
}
private def partitionedIterable(
path: Path,
valueCodecConfiguration: ValueCodecConfiguration,
filter: Filter,
readerOptions: ParquetReader.Options
)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] =
listPartitionedDirectory(path, readerOptions.hadoopConf, filter, valueCodecConfiguration) match {
case Left(exception) =>
throw exception
case Right(partitionedDirectory) =>
val projectedSchemaOpt = projectedSchemaResolverOpt.map(implicit resolver =>
ParquetSchemaResolver.resolveSchema(partitionedDirectory.schema)
)
lazy val fallbackFilterCompat = filter.toNonPredicateFilterCompat
val iterables = partitionedDirectory.paths.map { partitionedPath =>
singleIterable(
inputFile = partitionedPath.inputFile,
valueCodecConfiguration = valueCodecConfiguration,
projectedSchemaOpt = projectedSchemaOpt,
filterCompat = partitionedPath.filterPredicateOpt.fold(fallbackFilterCompat)(FilterCompat.get),
partitionViewOpt = Option(partitionedPath.view),
readerOptions = readerOptions
).appendTransformation(setPartitionValues(partitionedPath))
}
new CompoundParquetIterable[T](iterables)
}
private def singleIterable(
inputFile: InputFile,
valueCodecConfiguration: ValueCodecConfiguration,
projectedSchemaOpt: Option[MessageType],
filterCompat: FilterCompat.Filter,
partitionViewOpt: Option[PartitionView],
readerOptions: ParquetReader.Options
)(implicit decoder: ParquetRecordDecoder[T]): ParquetIterable[T] = {
if (logger.isDebugEnabled) {
logger.debug(s"Creating ParquetIterable for file $inputFile")
}
ParquetIterable[T](
iteratorFactory = ParquetIterator
.factory(inputFile, projectedSchemaOpt, columnProjections, filterCompat, decoder, readerOptions),
valueCodecConfiguration = valueCodecConfiguration,
stats = Stats(inputFile, valueCodecConfiguration, projectedSchemaOpt, filterCompat, partitionViewOpt)
)
}
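// Partition values taken from the directory names are appended to every record as BinaryValue fields,
// so partition columns can be decoded like regular columns.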
private def setPartitionValues(partitionedPath: PartitionedPath)(
record: RowParquetRecord
): Iterable[RowParquetRecord] =
Option(
partitionedPath.partitions.foldLeft(record) { case (currentRecord, (columnPath, value)) =>
currentRecord.updated(columnPath, BinaryValue(value))
}
)
}
@experimental
trait CustomBuilder[T] {
/** @param options
* configuration of how Parquet files should be read
*/
def options(options: Options): CustomBuilder[T]
/** @param filter
* optional before-read filter; no filtering is applied by default; check [[Filter]] for more details
*/
def filter(filter: Filter): CustomBuilder[T]
/** @return
* final closeable [[scala.collection.Iterable]]
*/
def read: Iterable[T] & Closeable
}
private case class CustomBuilderImpl[T](
builder: ParquetIterator.HadoopBuilder[T],
options: Options,
filter: Filter
) extends CustomBuilder[T] {
override def options(options: Options): CustomBuilder[T] = this.copy(options = options)
override def filter(filter: Filter): CustomBuilder[T] = this.copy(filter = filter)
override def read: Iterable[T] & Closeable = {
val vcc = ValueCodecConfiguration(options)
closeableIterable(
iteratorFactory = () => new ParquetIterator[T](options.applyTo(builder).withFilter(filter.toFilterCompat(vcc)))
)
}
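// closeableIterable tracks every iterator it hands out so that a single close() on the returned iterable
// closes all still-open underlying readers.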
private def closeableIterable(iteratorFactory: () => Iterator[T] & Closeable): Iterable[T] & Closeable =
new Iterable[T] with Closeable {
private var openCloseables: Set[Closeable] = Set.empty
override def iterator: Iterator[T] = {
val iterator = iteratorFactory()
this.synchronized {
openCloseables = openCloseables + iterator
}
iterator
}
override def close(): Unit =
openCloseables.synchronized {
openCloseables.foreach(_.close())
openCloseables = Set.empty
}
}
}
override val logger: Logger = LoggerFactory.getLogger(this.getClass)
/** Creates a new [[ParquetIterable]] over data from the given path. The path can represent a local file or directory,
* HDFS, AWS S3, Google Storage, Azure, etc. Please refer to the Hadoop client documentation or your data provider's
* documentation to learn how to configure the connection.
*
* @note
* Remember to call `close()` on iterable in order to free resources!
*
* @param path
* [[Path]] to Parquet files, e.g.: {{{Path("file:///data/users")}}}
* @param options
* configuration of how Parquet files should be read
* @param filter
* optional before-read filtering; no filtering is applied by default; check [[Filter]] for more details
* @tparam T
* type of data that represents the schema of the Parquet file, e.g.:
* {{{case class MyData(id: Long, name: String, created: java.sql.Timestamp)}}}
*/
@deprecated("2.0.0", "use builder API by calling 'as[T]', 'projectedAs[T]', 'generic' or 'projectedGeneric'")
def read[T: ParquetRecordDecoder: ParquetSchemaResolver](
path: Path,
options: Options = Options(),
filter: Filter = Filter.noopFilter
): ParquetIterable[T] =
projectedAs[T].options(options).filter(filter).read(path)
/** Creates [[Builder]] of Parquet reader for documents of type T.
*/
def as[T]: Builder[T] = BuilderImpl()
/** Creates [[Builder]] of Parquet reader for projected documents of type T. Due to projection, the reader
* does not attempt to read all existing columns of the file but applies the enforced projection schema.
*/
def projectedAs[T: ParquetSchemaResolver]: Builder[T] = BuilderImpl(
projectedSchemaResolverOpt = Option(implicitly[ParquetSchemaResolver[T]])
)
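// Projection sketch (hedged; UserName is an illustrative subset of the full file schema):
//   case class UserName(name: String)
//   val names = ParquetReader.projectedAs[UserName].read(Path("file:///data/users"))
//   try names.foreach(println)
//   finally names.close()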
/** Creates [[Builder]] of Parquet reader returning generic records.
*/
def generic: Builder[RowParquetRecord] = BuilderImpl()
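// Generic-record sketch (hedged; the path is illustrative):
//   val records = ParquetReader.generic.read(Path("file:///data/users"))
//   try records.foreach(println)
//   finally records.close()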
/** Creates [[Builder]] of Parquet reader returning projected generic records. Due to projection, the reader does
* not attempt to read all existing columns of the file but applies the enforced projection schema.
*/
def projectedGeneric(projectedSchema: MessageType): Builder[RowParquetRecord] = BuilderImpl(
projectedSchemaResolverOpt = Option(RowParquetRecord.genericParquetSchemaResolver(projectedSchema))
)
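// MessageType projection sketch (hedged; the schema is built with parquet-mr's Types builder and the
// "name" field is illustrative):
//   import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types}
//   val projection = Types.buildMessage()
//     .optional(PrimitiveType.PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("name")
//     .named("Projection")
//   val records = ParquetReader.projectedGeneric(projection).read(Path("file:///data/users"))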
// format: off
/** Creates [[Builder]] of Parquet reader returning projected generic records. Due to projection, the reader does
* not attempt to read all existing columns of the file but applies the enforced projection schema. Besides simple
* projection, one can use aliases and extract values from nested fields, in a way similar to SQL.
*
* @example
*
*projectedGeneric(
* Col("foo").as[Int], // selects Int column "foo"
* Col("bar.baz".as[String]), // selects String field "bar.baz", creates column "baz" wih a value of "baz"
* Col("bar.baz".as[String].alias("bar_baz")) // selects String field "bar.baz", creates column "bar_baz" wih a value of "baz"
*)
*
* @param col
* first column projection
* @param cols
* next column projections
*/
// format: on
def projectedGeneric(col: TypedColumnPath[?], cols: TypedColumnPath[?]*): Builder[RowParquetRecord] = {
val (fields, columnProjections) =
(col +: cols.toVector).zipWithIndex
.foldLeft((Vector.empty[Type], Vector.empty[ColumnProjection])) {
case ((fields, projections), (columnPath, ordinal)) =>
val updatedFields = fields :+ columnPath.toType
val updatedProjections = projections :+ ColumnProjection(columnPath, ordinal)
updatedFields -> updatedProjections
}
BuilderImpl(
projectedSchemaResolverOpt = Option(RowParquetRecord.genericParquetSchemaResolver(Message.merge(fields))),
columnProjections = columnProjections
)
}
/** Creates [[CustomBuilder]] for reading Parquet data using a custom internal implementation.
* @param builder
* custom internal implementation
* @tparam T
* type of read data
*/
@experimental
def custom[T](builder: org.apache.parquet.hadoop.ParquetReader.Builder[T]): CustomBuilder[T] = CustomBuilderImpl(
builder = builder,
options = Options(),
filter = Filter.noopFilter
)
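// Usage sketch for the custom reader API (hedged; AvroParquetReader and GenericRecord come from
// parquet-avro/avro and serve only as an example of an org.apache.parquet.hadoop.ParquetReader.Builder):
//   val inputFile = HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path("file:///data/users"), new Configuration())
//   val records = ParquetReader
//     .custom(org.apache.parquet.avro.AvroParquetReader.builder[org.apache.avro.generic.GenericRecord](inputFile))
//     .filter(Col("age") >= 18)
//     .read
//   try records.foreach(println)
//   finally records.close()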
}