/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.utils.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.{ CompressionCodec => HadoopCompressionCodec }
import org.apache.spark.Partition
import org.bdgenomics.utils.parquet.io.{ ByteAccess, FileLocator }
import org.bdgenomics.utils.parquet.rdd.{ CompressionCodecEnum, ParquetColumnChunk, ParquetColumnDescriptor, ParquetRowGroup, ParquetSchemaType, TypePath }
import parquet.column.ColumnDescriptor
import parquet.column.page.{ PageReadStore, PageReader }
import parquet.filter.UnboundRecordFilter
import parquet.io.ColumnIOFactory
import parquet.io.api.RecordMaterializer
/**
*
 * ParquetPartition is a Partition implementation, used in our RDD implementations, that
 * corresponds to a single row group within a single Parquet file.
*
* @param locator The locator of the Parquet file
 * @param index The index of this partition, assigned (and kept unique) by the enclosing RDD
 * @param rowGroup The metadata of the row group in the Parquet file to which this partition corresponds
 * @param requestedSchema The schema corresponding to the user's requested projection
* @param actualSchema The actual schema of the objects in this Parquet file
*/
class ParquetPartition(val locator: FileLocator,
val index: Int,
val rowGroup: ParquetRowGroup,
val requestedSchema: ParquetSchemaType,
val actualSchema: ParquetSchemaType)
extends Partition {
def materializeRecords[T](io: ByteAccess,
recordMaterializer: RecordMaterializer[T],
filter: UnboundRecordFilter): Iterator[T] =
ParquetPartition.materializeRecords(io, recordMaterializer, filter, rowGroup, requestedSchema, actualSchema)
}
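
// A minimal usage sketch (hypothetical, not part of this file): an RDD's compute() method
// might consume a ParquetPartition roughly as follows, assuming the FileLocator exposes the
// file's contents through a `bytes: ByteAccess` accessor and that a RecordMaterializer[T]
// has been built for the requested schema:
//
//   val partition = split.asInstanceOf[ParquetPartition]
//   val io: ByteAccess = partition.locator.bytes
//   val records: Iterator[T] = partition.materializeRecords(io, materializer, filter = null)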
/**
 * PartitionPageReadStore maps the ParquetColumnDescriptor values of a single row group to
 * their corresponding PageReader values.
*
* @param chunkMap For a particular row group in a Parquet file, this is a map relating the descriptors of the columns
* to the corresponding readers for their values.
* @param rowGroup The metadata for the row group itself.
*/
class PartitionPageReadStore(chunkMap: Map[ParquetColumnDescriptor, PageReader], rowGroup: ParquetRowGroup)
extends PageReadStore {
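  // Wrap the incoming parquet ColumnDescriptor in our ParquetColumnDescriptor so the map
  // lookup uses our key type's equality (presumably by column path) rather than the
  // identity of the parquet-level descriptor.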
override def getPageReader(cd: ColumnDescriptor): PageReader =
chunkMap.getOrElse(new ParquetColumnDescriptor(cd),
      throw new NoSuchElementException("Could not find column %s among columns %s".format(
        cd.getPath.mkString("."),
        chunkMap.keys.map(_.path.mkString(".")).mkString(", "))))
override def getRowCount: Long = rowGroup.rowCount
}
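
// For illustration (a hypothetical sketch): given a store built for one row group, parquet's
// record-assembly machinery asks it for one PageReader per projected column, e.g.
//
//   val store = new PartitionPageReadStore(chunkMap, rowGroup)
//   val pages: PageReader = store.getPageReader(columnDescriptor)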
object ParquetPartition {
/**
* This is our core implementation that materializes records out of a row group,
* applying the corresponding filters and projections.
*
* @param io The bytes (reified as a ByteAccess value) for the Parquet file
* @param recordMaterializer a record materializer for the records we are requesting back
 * @param filter The filter dictating which records are materialized; may be null, in which case no filtering is applied
* @param rowGroup The metadata for the row group we are materializing records from
* @param requestedSchema The projection requested by the user
* @param actualSchema The schema of the records in the file
* @tparam T The type of the records desired by the user
* @return An iterator which materializes only those records passing the filter from the row group
*/
def materializeRecords[T](io: ByteAccess,
recordMaterializer: RecordMaterializer[T],
filter: UnboundRecordFilter,
rowGroup: ParquetRowGroup,
requestedSchema: ParquetSchemaType,
actualSchema: ParquetSchemaType): Iterator[T] = {
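    // Apply the projection: keep only the column chunks whose paths appear in the requested
    // schema, so that unprojected columns are never read or decompressed.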
val requestedPaths = requestedSchema.paths()
val requestedColumnChunks: Seq[ParquetColumnChunk] = rowGroup.columnChunks.filter {
cc => requestedPaths.contains(TypePath(cc.columnDescriptor.path))
}
val config: Configuration = new Configuration()
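    // Note: this assumes every column chunk in the row group uses the same compression
    // codec, since only the first chunk's codec is consulted.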
val decompressor: Option[HadoopCompressionCodec] =
CompressionCodecEnum.getHadoopCodec(rowGroup.columnChunks.head.compressionCodec, config)
val chunkMap = requestedColumnChunks
.map(cc => (cc.columnDescriptor, cc.readAllPages(decompressor, io)))
.toMap
val pageReadStore = new PartitionPageReadStore(chunkMap, rowGroup)
val columnIOFactory: ColumnIOFactory = new ColumnIOFactory
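    // Build the column I/O tree that assembles records, mapping the on-disk (actual) schema
    // onto the user's requested projection.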
val columnIO = columnIOFactory.getColumnIO(requestedSchema.convertToParquet(), actualSchema.convertToParquet())
    // A null filter means "no filtering"; wrap it in an Option to dispatch to the
    // appropriate getRecordReader overload.
    val reader = Option(filter) match {
      case Some(f) => columnIO.getRecordReader[T](pageReadStore, recordMaterializer, f)
      case None    => columnIO.getRecordReader[T](pageReadStore, recordMaterializer)
    }
    // Wrap the parquet RecordReader in a Scala Iterator. Iteration ends either when the
    // row group's record count is exhausted or when the reader stops producing records.
    new Iterator[T] {
      var recordsRead = 0
      val totalRecords = rowGroup.rowCount
      var nextT: Option[T] = Option(reader.read())

      override def hasNext: Boolean = nextT.isDefined

      override def next(): T = {
        // Throw (rather than silently returning null) when the iterator is exhausted,
        // honoring the Iterator contract.
        val ret = nextT.getOrElse(throw new NoSuchElementException("next() called on exhausted iterator"))
        recordsRead += 1
        nextT = if (recordsRead >= totalRecords) None else Option(reader.read())
        ret
      }
    }
}
}