// org.bdgenomics.utils.parquet.ParquetCommon.scala
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.{ ByteArrayInputStream, InputStream }
import java.nio.charset.Charset
import org.bdgenomics.utils.parquet.rdd.CompressionCodecEnum.CompressionCodec
import parquet.column.page.PageReader
import parquet.format.{ ColumnChunk, RowGroup, FileMetaData }
import parquet.format.converter.ParquetMetadataConverter
import parquet.org.apache.thrift.protocol.TCompactProtocol
import parquet.org.apache.thrift.transport.TIOStreamTransport
import parquet.schema.MessageType
import parquet.schema.PrimitiveType.PrimitiveTypeName
import scala.collection.JavaConversions._
import scala.math._
package parquet.format.converter {
/**
* Parquet keeps the methods that convert a Thrift FileMetaData schema into a MessageType
* private, protected, or package-private, so we have to expose them through a wrapper defined
* _within the parquet.format.converter package_.
*/
class MyConverter extends ParquetMetadataConverter with Serializable {
def convert(md: FileMetaData): MessageType = super.fromParquetSchema(md.getSchema)
}
}
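/*
* How the wrapper above gets used (see ParquetCommon.parseMessageType below): given the Thrift
* FileMetaData decoded from a file's footer, MyConverter rebuilds parquet's MessageType schema.
* A minimal sketch, assuming `thriftMetadata` is a FileMetaData you have already decoded:
*
*   val converter = new parquet.format.converter.MyConverter()
*   val messageType: parquet.schema.MessageType = converter.convert(thriftMetadata)
*/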
package org.bdgenomics.utils.parquet.rdd {
import parquet.format.converter.MyConverter
import java.nio.ByteBuffer
import parquet.column.page.{ DictionaryPage, Page }
import parquet.format.{ PageType, PageHeader }
import parquet.bytes.BytesInput
import parquet.column.ColumnDescriptor
import org.apache.hadoop.io.compress.{ CompressionCodec => HadoopCompressionCodec, CodecPool }
import org.apache.hadoop.conf.Configuration
import org.bdgenomics.utils.parquet.io.ByteAccess
object ParquetCommon {
val MAGIC: Array[Byte] = "PAR1".getBytes(Charset.forName("ASCII"))
val converter = new MyConverter()
def parseMessageType(metadata: FileMetaData): MessageType = converter.convert(metadata)
// Parquet writes the footer length as a 4-byte little-endian int; reverse the slice before handing it to ByteBuffer (which is big-endian by default).
def readInt(buffer: Array[Byte], offset: Int): Int =
ByteBuffer.wrap(buffer.slice(offset, offset + 4).reverse).getInt
def readFileMetadata(io: ByteAccess): FileMetaData = {
// The last 8 bytes of a Parquet file are a 4-byte footer length followed by the "PAR1" magic.
val magicAndLength: Array[Byte] = io.readFully(io.length() - MAGIC.length - 4, MAGIC.length + 4)
require(magicAndLength.slice(4, magicAndLength.length).sameElements(MAGIC),
"File does not appear to be a Parquet file")
val footerLength = readInt(magicAndLength, 0)
assert(io.length() >= 0)
assert(footerLength >= 0, "footerLength %d should be non-negative".format(footerLength))
assert(io.length() - MAGIC.length - 4 - footerLength >= 0)
// The footer itself is a Thrift compact-protocol FileMetaData struct that sits immediately before the length/magic trailer.
val footerBytes = io.readFully(io.length() - MAGIC.length - 4 - footerLength, footerLength)
val footerInputStream = new ByteArrayInputStream(footerBytes)
val protocol = new TCompactProtocol(new TIOStreamTransport(footerInputStream))
val fileMetadata = new FileMetaData()
fileMetadata.read(protocol)
fileMetadata
}
def readFooter(io: ByteAccess): Footer = {
new Footer(readFileMetadata(io))
}
}
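/*
* Usage sketch for ParquetCommon, assuming `bytes` holds an entire Parquet file and that a
* ByteAccess implementation wrapping an in-memory array is available (the ByteArrayByteAccess
* name below is illustrative; substitute whichever ByteAccess implementation you actually have):
*
*   val io: ByteAccess = new ByteArrayByteAccess(bytes)      // hypothetical implementation
*   val metadata = ParquetCommon.readFileMetadata(io)        // Thrift FileMetaData from the footer
*   val schema = ParquetCommon.parseMessageType(metadata)    // parquet MessageType schema
*   val footer = ParquetCommon.readFooter(io)                // row-group and column-chunk metadata
*/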
/**
* Metadata for a single Parquet row group.
*
* @param byteSize Total (compressed) size of the row group, in bytes.
* @param rowCount Total number of rows in the row group.
* @param columnChunks Metadata for each column chunk in the row group.
*/
case class ParquetRowGroup(byteSize: Long, rowCount: Long, columnChunks: Seq[ParquetColumnChunk]) {
def this(rg: RowGroup, schema: MessageType) = this(rg.getTotal_byte_size, rg.getNum_rows, rg.getColumns.map(cc => new ParquetColumnChunk(cc, schema)))
}
object CompressionCodecEnum extends Enumeration {
val GZIP, SNAPPY, LZO, UNCOMPRESSED = Value
type CompressionCodec = CompressionCodecEnum.Value
def apply(codec: parquet.format.CompressionCodec): CompressionCodec =
codec match {
case parquet.format.CompressionCodec.GZIP => GZIP
case parquet.format.CompressionCodec.SNAPPY => SNAPPY
case parquet.format.CompressionCodec.LZO => LZO
case parquet.format.CompressionCodec.UNCOMPRESSED => UNCOMPRESSED
}
def getHadoopCodec(parquetCodec: CompressionCodec, conf: Configuration): Option[HadoopCompressionCodec] = {
val codec = parquetCodec match {
case GZIP => Some(new org.apache.hadoop.io.compress.GzipCodec())
case SNAPPY => Some(new parquet.hadoop.codec.SnappyCodec())
case LZO =>
throw new UnsupportedOperationException("Cannot initialize an LZO Codec.")
case UNCOMPRESSED => None
}
codec.foreach {
cd =>
cd.setConf(conf)
}
codec
}
}
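/*
* Usage sketch for the codec mapping: GZIP and SNAPPY map to concrete Hadoop codecs,
* UNCOMPRESSED maps to None, and LZO throws an UnsupportedOperationException.
*
*   val conf = new Configuration()
*   val parquetCodec = CompressionCodecEnum(parquet.format.CompressionCodec.SNAPPY)
*   val hadoopCodec: Option[HadoopCompressionCodec] =
*     CompressionCodecEnum.getHadoopCodec(parquetCodec, conf)
*/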
case class ParquetColumnDescriptor(path: Seq[String],
typeName: PrimitiveTypeName,
typeLength: Int,
maxRep: Int,
maxDef: Int) {
def this(descriptor: ColumnDescriptor) =
this(descriptor.getPath, descriptor.getType, descriptor.getTypeLength, descriptor.getMaxRepetitionLevel, descriptor.getMaxDefinitionLevel)
def this(cc: ColumnChunk, schema: MessageType) =
this(schema
.getColumns
.find(_.getPath.sameElements(cc.getMeta_data.getPath_in_schema))
.get)
def columnDescriptor(): ColumnDescriptor =
new ColumnDescriptor(path.toArray, typeName, typeLength, maxRep, maxDef)
override def toString: String = "{path=%s|typeName=%s|typeLength=%d|maxRep=%d|maxDef=%d}".format(
path.mkString("."), typeName, typeLength, maxRep, maxDef)
}
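/*
* Round-trip sketch: ParquetColumnDescriptor is a serializable mirror of parquet's
* ColumnDescriptor, so a descriptor can be captured from a MessageType and rebuilt later via
* columnDescriptor(). Here `schema` is assumed to be a MessageType from ParquetCommon.parseMessageType:
*
*   val original: ColumnDescriptor = schema.getColumns.get(0)
*   val mirrored = new ParquetColumnDescriptor(original)
*   val rebuilt: ColumnDescriptor = mirrored.columnDescriptor()
*/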
/**
* Metadata for a single column chunk within a row group.
*
* @param numValues The number of values in this column chunk.
* @param dataPageOffset The offset, relative to the entire file, in bytes, of the first data page within this column chunk.
* @param indexPageOffset The offset, relative to the entire file, in bytes, of the (optional) index page.
* @param dictionaryPageOffset The offset, relative to the entire file, in bytes, of the (optional) dictionary page.
* @param uncompressedSize The total uncompressed size, in bytes, of the column chunk.
* @param compressedSize The total compressed size, in bytes, of the column chunk.
* @param columnDescriptor The column descriptor for this column chunk; describes path and type information.
* @param compressionCodec The compression codec used on this column chunk.
*/
case class ParquetColumnChunk(numValues: Long,
dataPageOffset: Long,
indexPageOffset: Option[Long],
dictionaryPageOffset: Option[Long],
uncompressedSize: Long,
compressedSize: Long,
columnDescriptor: ParquetColumnDescriptor,
compressionCodec: CompressionCodec) {
def this(cc: ColumnChunk, schema: MessageType) =
this(
cc.getMeta_data.getNum_values,
cc.getMeta_data.getData_page_offset,
if (cc.getMeta_data.isSetIndex_page_offset) Some(cc.getMeta_data.getIndex_page_offset) else None,
if (cc.getMeta_data.isSetDictionary_page_offset) Some(cc.getMeta_data.getDictionary_page_offset) else None,
cc.getMeta_data.getTotal_uncompressed_size,
cc.getMeta_data.getTotal_compressed_size,
new ParquetColumnDescriptor(cc, schema),
CompressionCodecEnum(cc.getMeta_data.getCodec))
def readPageHeader(is: InputStream): PageHeader = parquet.format.Util.readPageHeader(is)
def readDataPage(bytesInput: BytesInput, pageHeader: PageHeader, parquetMetadataConverter: ParquetMetadataConverter): Page =
new Page(
bytesInput,
pageHeader.data_page_header.num_values,
pageHeader.uncompressed_page_size,
parquetMetadataConverter.getEncoding(pageHeader.data_page_header.repetition_level_encoding),
parquetMetadataConverter.getEncoding(pageHeader.data_page_header.definition_level_encoding),
parquetMetadataConverter.getEncoding(pageHeader.data_page_header.encoding))
def readDictionaryPage(bytesInput: BytesInput, pageHeader: PageHeader, parquetMetadataConverter: ParquetMetadataConverter): DictionaryPage =
new DictionaryPage(
bytesInput,
pageHeader.uncompressed_page_size,
pageHeader.dictionary_page_header.num_values,
parquetMetadataConverter.getEncoding(pageHeader.dictionary_page_header.encoding))
private def decompress(codec: HadoopCompressionCodec, input: InputStream, compressed: Int, uncompressed: Int): BytesInput = {
val decompressor = CodecPool.getDecompressor(codec)
decompressor.reset()
val bytesArray = new Array[Byte](compressed)
// InputStream.read may return fewer bytes than requested, so loop until the compressed chunk is fully buffered or the stream is exhausted.
var toRead = compressed
var read = 1
while (read > 0 && toRead > 0) {
assert(compressed - toRead >= 0, "offset %d-%d=%d was negative".format(compressed, toRead, compressed - toRead))
assert(toRead >= 0, "len %d was negative".format(toRead))
assert(toRead <= bytesArray.length - (compressed - toRead), "remaining length %d exceeds the buffer bounds".format(toRead))
read = input.read(bytesArray, compressed - toRead, toRead)
toRead = toRead - read
}
val bytes = new ByteArrayInputStream(bytesArray)
assert(codec != null, "codec was null")
assert(decompressor != null, "decompressor was null")
assert(bytes != null, "bytes was null")
BytesInput.from(codec.createInputStream(bytes, decompressor), uncompressed)
}
def readAllPages(codec: Option[HadoopCompressionCodec], io: ByteAccess): PageReader = {
val parquetMetadataConverter = ParquetCommon.converter
// A column chunk's pages begin at the lowest of its dictionary, index, and data page offsets and span compressedSize bytes.
val chunkBytes: Array[Byte] =
io.readFully(min(min(dictionaryPageOffset.getOrElse(Long.MaxValue), indexPageOffset.getOrElse(Long.MaxValue)),
dataPageOffset),
compressedSize.toInt)
val input = new ByteArrayInputStream(chunkBytes)
var dictPage: Option[DictionaryPage] = None
var dataPages: Seq[Page] = Seq()
var valuesRead: Long = 0
while (valuesRead < numValues) {
val header = readPageHeader(input)
val bytesInput = codec match {
case None => BytesInput.from(input, header.compressed_page_size)
case Some(c) =>
decompress(c, input, header.compressed_page_size, header.uncompressed_page_size)
}
header.`type` match {
case PageType.DICTIONARY_PAGE =>
dictPage = Some(readDictionaryPage(bytesInput, header, parquetMetadataConverter))
case PageType.DATA_PAGE =>
val dp = readDataPage(bytesInput, header, parquetMetadataConverter)
dataPages = dataPages :+ dp
valuesRead += dp.getValueCount
case _ =>
}
}
if (valuesRead != numValues) {
// corrupt chunk: the decoded pages don't account for all the values promised by the chunk metadata
}
new PageReader {
override def readPage: Page = {
if (dataPages.isEmpty) null
else {
val ret = dataPages.head
dataPages = dataPages.tail
ret
}
}
override def readDictionaryPage: DictionaryPage = {
dictPage match {
case None => null
case Some(dp) => dp
}
}
def getTotalValueCount: Long = numValues
}
}
}
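/*
* Usage sketch for readAllPages, assuming `chunk` is a ParquetColumnChunk taken from a
* ParquetRowGroup, `io` is the same ByteAccess used to read the footer, and `conf` is a Hadoop
* Configuration:
*
*   val codec = CompressionCodecEnum.getHadoopCodec(chunk.compressionCodec, conf)
*   val pages: PageReader = chunk.readAllPages(codec, io)
*   val dictionary = Option(pages.readDictionaryPage)   // null when the chunk has no dictionary
*   var page = pages.readPage
*   while (page != null) {                              // readPage returns null once all data pages are consumed
*     // hand the page to a parquet column reader, or inspect page.getValueCount, etc.
*     page = pages.readPage
*   }
*/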
/**
* Row-group metadata decoded from a Parquet file's footer.
*
* @param rowGroups The metadata for each row group in the file.
*/
case class Footer(rowGroups: Seq[ParquetRowGroup]) {
def this(metadata: FileMetaData) = this(Footer.parseRowGroups(metadata))
}
object Footer {
def parseRowGroups(metadata: FileMetaData): Seq[ParquetRowGroup] = {
val schema = ParquetCommon.parseMessageType(metadata)
metadata.getRow_groups.map(rg => new ParquetRowGroup(rg, schema))
}
}
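/*
* End-to-end sketch tying this file together, assuming `io` is a ByteAccess over a Parquet file
* and `conf` is a Hadoop Configuration:
*
*   val footer = ParquetCommon.readFooter(io)
*   for (rowGroup <- footer.rowGroups; chunk <- rowGroup.columnChunks) {
*     val codec = CompressionCodecEnum.getHadoopCodec(chunk.compressionCodec, conf)
*     val pageReader = chunk.readAllPages(codec, io)
*     println("%s: %d values".format(chunk.columnDescriptor, pageReader.getTotalValueCount))
*   }
*/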
}