org.apache.comet.parquet.FileReader Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.comet.parquet;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.CRC32;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.Preconditions;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.crypto.AesCipher;
import org.apache.parquet.crypto.FileDecryptionProperties;
import org.apache.parquet.crypto.InternalColumnDecryptionSetup;
import org.apache.parquet.crypto.InternalFileDecryptor;
import org.apache.parquet.crypto.ModuleCipherFactory;
import org.apache.parquet.crypto.ParquetCryptoRuntimeException;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.DataPageHeaderV2;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.FileCryptoMetaData;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.counters.BenchmarkCounter;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.internal.filter2.columnindex.RowRanges;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.spark.sql.execution.metric.SQLMetric;
import static org.apache.parquet.hadoop.ParquetFileWriter.EFMAGIC;
import static org.apache.parquet.hadoop.ParquetFileWriter.MAGIC;
import static org.apache.comet.parquet.RowGroupFilter.FilterLevel.BLOOMFILTER;
import static org.apache.comet.parquet.RowGroupFilter.FilterLevel.DICTIONARY;
import static org.apache.comet.parquet.RowGroupFilter.FilterLevel.STATISTICS;
/**
* A Parquet file reader. Mostly followed {@code ParquetFileReader} in {@code parquet-mr}, but with
* customizations & optimizations for Comet.
*/
public class FileReader implements Closeable {
private static final Logger LOG = LoggerFactory.getLogger(FileReader.class);
private final ParquetMetadataConverter converter;
private final SeekableInputStream f;
private final InputFile file;
private final Map metrics;
private final Map paths = new HashMap<>();
private final FileMetaData fileMetaData; // may be null
private final List blocks;
private final List blockIndexStores;
private final List blockRowRanges;
private final CRC32 crc;
private final ParquetMetadata footer;
/**
* Read configurations come from two options: - options: these are options defined & specified
* from 'parquet-mr' library - cometOptions: these are Comet-specific options, for the features
* introduced in Comet's Parquet implementation
*/
private final ParquetReadOptions options;
private final ReadOptions cometOptions;
private int currentBlock = 0;
private RowGroupReader currentRowGroup = null;
private InternalFileDecryptor fileDecryptor;
public FileReader(InputFile file, ParquetReadOptions options, ReadOptions cometOptions)
throws IOException {
this(file, null, options, cometOptions, null);
}
public FileReader(
InputFile file,
ParquetReadOptions options,
ReadOptions cometOptions,
Map metrics)
throws IOException {
this(file, null, options, cometOptions, metrics);
}
public FileReader(
InputFile file,
ParquetMetadata footer,
ParquetReadOptions options,
ReadOptions cometOptions,
Map metrics)
throws IOException {
this.converter = new ParquetMetadataConverter(options);
this.file = file;
this.f = file.newStream();
this.options = options;
this.cometOptions = cometOptions;
this.metrics = metrics;
if (footer == null) {
try {
footer = readFooter(file, options, f, converter);
} catch (Exception e) {
// In case that reading footer throws an exception in the constructor, the new stream
// should be closed. Otherwise, there's no way to close this outside.
f.close();
throw e;
}
}
this.footer = footer;
this.fileMetaData = footer.getFileMetaData();
this.fileDecryptor = fileMetaData.getFileDecryptor(); // must be called before filterRowGroups!
if (null != fileDecryptor && fileDecryptor.plaintextFile()) {
this.fileDecryptor = null; // Plaintext file. No need in decryptor
}
this.blocks = filterRowGroups(footer.getBlocks());
this.blockIndexStores = listWithNulls(this.blocks.size());
this.blockRowRanges = listWithNulls(this.blocks.size());
for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
paths.put(ColumnPath.get(col.getPath()), col);
}
this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
/** Returns the footer of the Parquet file being read. */
public ParquetMetadata getFooter() {
return this.footer;
}
/** Returns the metadata of the Parquet file being read. */
public FileMetaData getFileMetaData() {
return this.fileMetaData;
}
/** Returns the input stream of the Parquet file being read. */
public SeekableInputStream getInputStream() {
return this.f;
}
/** Returns the Parquet options for reading the file. */
public ParquetReadOptions getOptions() {
return this.options;
}
/** Returns all the row groups of this reader (after applying row group filtering). */
public List getRowGroups() {
return blocks;
}
/** Sets the projected columns to be read later via {@link #readNextRowGroup()} */
public void setRequestedSchema(List projection) {
paths.clear();
for (ColumnDescriptor col : projection) {
paths.put(ColumnPath.get(col.getPath()), col);
}
}
/**
* Gets the total number of records across all row groups (after applying row group filtering).
*/
public long getRecordCount() {
long total = 0;
for (BlockMetaData block : blocks) {
total += block.getRowCount();
}
return total;
}
/**
* Gets the total number of records across all row groups (after applying both row group filtering
* and page-level column index filtering).
*/
public long getFilteredRecordCount() {
if (!options.useColumnIndexFilter()
|| !FilterCompat.isFilteringRequired(options.getRecordFilter())) {
return getRecordCount();
}
long total = 0;
for (int i = 0, n = blocks.size(); i < n; ++i) {
total += getRowRanges(i).rowCount();
}
return total;
}
/** Skips the next row group. Returns false if there's no row group to skip. Otherwise, true. */
public boolean skipNextRowGroup() {
return advanceToNextBlock();
}
/**
* Returns the next row group to read (after applying row group filtering), or null if there's no
* more row group.
*/
public PageReadStore readNextRowGroup() throws IOException {
if (currentBlock == blocks.size()) {
return null;
}
BlockMetaData block = blocks.get(currentBlock);
if (block.getRowCount() == 0) {
throw new RuntimeException("Illegal row group of 0 rows");
}
this.currentRowGroup = new RowGroupReader(block.getRowCount());
// prepare the list of consecutive parts to read them in one scan
List allParts = new ArrayList<>();
ConsecutivePartList currentParts = null;
for (ColumnChunkMetaData mc : block.getColumns()) {
ColumnPath pathKey = mc.getPath();
ColumnDescriptor columnDescriptor = paths.get(pathKey);
if (columnDescriptor != null) {
BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
long startingPos = mc.getStartingPos();
boolean mergeRanges = cometOptions.isIOMergeRangesEnabled();
int mergeRangeDelta = cometOptions.getIOMergeRangesDelta();
// start a new list if -
// it is the first part or
// the part is consecutive or
// the part is not consecutive but within the merge range
if (currentParts == null
|| (!mergeRanges && currentParts.endPos() != startingPos)
|| (mergeRanges && startingPos - currentParts.endPos() > mergeRangeDelta)) {
currentParts = new ConsecutivePartList(startingPos);
allParts.add(currentParts);
}
// if we are in a consecutive part list and there is a gap in between the parts,
// we treat the gap as a skippable chunk
long delta = startingPos - currentParts.endPos();
if (mergeRanges && delta > 0 && delta <= mergeRangeDelta) {
// add a chunk that will be skipped because it has no column descriptor
currentParts.addChunk(new ChunkDescriptor(null, null, startingPos, delta));
}
currentParts.addChunk(
new ChunkDescriptor(columnDescriptor, mc, startingPos, mc.getTotalSize()));
}
}
// actually read all the chunks
return readChunks(block, allParts, new ChunkListBuilder());
}
/**
* Returns the next row group to read (after applying both row group filtering and page level
* column index filtering), or null if there's no more row group.
*/
public PageReadStore readNextFilteredRowGroup() throws IOException {
if (currentBlock == blocks.size()) {
return null;
}
if (!options.useColumnIndexFilter()
|| !FilterCompat.isFilteringRequired(options.getRecordFilter())) {
return readNextRowGroup();
}
BlockMetaData block = blocks.get(currentBlock);
if (block.getRowCount() == 0) {
throw new RuntimeException("Illegal row group of 0 rows");
}
ColumnIndexStore ciStore = getColumnIndexReader(currentBlock);
RowRanges rowRanges = getRowRanges(currentBlock);
long rowCount = rowRanges.rowCount();
if (rowCount == 0) {
// There are no matching rows -> skipping this row-group
advanceToNextBlock();
return readNextFilteredRowGroup();
}
if (rowCount == block.getRowCount()) {
// All rows are matching -> fall back to the non-filtering path
return readNextRowGroup();
}
this.currentRowGroup = new RowGroupReader(rowRanges);
// prepare the list of consecutive parts to read them in one scan
ChunkListBuilder builder = new ChunkListBuilder();
List allParts = new ArrayList<>();
ConsecutivePartList currentParts = null;
for (ColumnChunkMetaData mc : block.getColumns()) {
ColumnPath pathKey = mc.getPath();
ColumnDescriptor columnDescriptor = paths.get(pathKey);
if (columnDescriptor != null) {
OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
IndexFilter indexFilter = new IndexFilter(rowRanges, offsetIndex, block.getRowCount());
OffsetIndex filteredOffsetIndex = indexFilter.filterOffsetIndex();
for (IndexFilter.OffsetRange range :
indexFilter.calculateOffsetRanges(filteredOffsetIndex, mc)) {
BenchmarkCounter.incrementTotalBytes(range.length);
long startingPos = range.offset;
// first part or not consecutive => new list
if (currentParts == null || currentParts.endPos() != startingPos) {
currentParts = new ConsecutivePartList(startingPos);
allParts.add(currentParts);
}
ChunkDescriptor chunkDescriptor =
new ChunkDescriptor(columnDescriptor, mc, startingPos, range.length);
currentParts.addChunk(chunkDescriptor);
builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
}
}
}
// actually read all the chunks
return readChunks(block, allParts, builder);
}
// Visible for testing
ColumnIndexReader getColumnIndexReader(int blockIndex) {
ColumnIndexReader ciStore = blockIndexStores.get(blockIndex);
if (ciStore == null) {
ciStore = ColumnIndexReader.create(blocks.get(blockIndex), paths.keySet(), fileDecryptor, f);
blockIndexStores.set(blockIndex, ciStore);
}
return ciStore;
}
private PageReadStore readChunks(
BlockMetaData block, List allParts, ChunkListBuilder builder)
throws IOException {
if (shouldReadParallel()) {
readAllPartsParallel(allParts, builder);
} else {
for (ConsecutivePartList consecutiveChunks : allParts) {
consecutiveChunks.readAll(f, builder);
}
}
for (Chunk chunk : builder.build()) {
readChunkPages(chunk, block);
}
advanceToNextBlock();
return currentRowGroup;
}
private boolean shouldReadParallel() {
if (file instanceof CometInputFile) {
URI uri = ((CometInputFile) file).getPath().toUri();
return shouldReadParallel(cometOptions, uri.getScheme());
}
return false;
}
static boolean shouldReadParallel(ReadOptions options, String scheme) {
return options.isParallelIOEnabled() && shouldReadParallelForScheme(scheme);
}
private static boolean shouldReadParallelForScheme(String scheme) {
if (scheme == null) {
return false;
}
switch (scheme) {
case "s3a":
// Only enable parallel read for S3, so far.
return true;
default:
return false;
}
}
static class ReadRange {
long offset = 0;
long length = 0;
List buffers = new ArrayList<>();
@Override
public String toString() {
return "ReadRange{"
+ "offset="
+ offset
+ ", length="
+ length
+ ", numBuffers="
+ buffers.size()
+ '}';
}
}
List getReadRanges(List allParts, int nBuffers) {
int nThreads = cometOptions.parallelIOThreadPoolSize();
long buffersPerThread = nBuffers / nThreads + 1;
boolean adjustSkew = cometOptions.adjustReadRangesSkew();
List allRanges = new ArrayList<>();
for (ConsecutivePartList consecutiveChunk : allParts) {
ReadRange readRange = null;
long offset = consecutiveChunk.offset;
for (int i = 0; i < consecutiveChunk.buffers.size(); i++) {
if ((adjustSkew && (i % buffersPerThread == 0)) || i == 0) {
readRange = new ReadRange();
allRanges.add(readRange);
readRange.offset = offset;
}
ByteBuffer b = consecutiveChunk.buffers.get(i);
readRange.length += b.capacity();
readRange.buffers.add(b);
offset += b.capacity();
}
}
if (LOG.isDebugEnabled()) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < allRanges.size(); i++) {
sb.append(allRanges.get(i).toString());
if (i < allRanges.size() - 1) {
sb.append(",");
}
}
LOG.debug("Read Ranges: {}", sb);
}
return allRanges;
}
private void readAllRangesParallel(List allRanges) {
int nThreads = cometOptions.parallelIOThreadPoolSize();
ExecutorService threadPool = CometFileReaderThreadPool.getOrCreateThreadPool(nThreads);
List> futures = new ArrayList<>();
for (ReadRange readRange : allRanges) {
futures.add(
threadPool.submit(
() -> {
SeekableInputStream inputStream = null;
try {
if (file instanceof CometInputFile) {
// limit the max read ahead to length of the range
inputStream =
(((CometInputFile) file).newStream(readRange.offset, readRange.length));
LOG.debug(
"Opened new input file: {}, at offset: {}",
((CometInputFile) file).getPath().getName(),
readRange.offset);
} else {
inputStream = file.newStream();
}
long curPos = readRange.offset;
for (ByteBuffer buffer : readRange.buffers) {
inputStream.seek(curPos);
LOG.debug(
"Thread: {} Offset: {} Size: {}",
Thread.currentThread().getId(),
curPos,
buffer.capacity());
inputStream.readFully(buffer);
buffer.flip();
curPos += buffer.capacity();
} // for
} finally {
if (inputStream != null) {
inputStream.close();
}
}
return null;
}));
}
for (Future future : futures) {
try {
future.get();
} catch (InterruptedException | ExecutionException e) {
throw new RuntimeException(e);
}
}
}
/**
* Read all the consecutive part list objects in parallel.
*
* @param allParts all consecutive parts
* @param builder chunk list builder
*/
public void readAllPartsParallel(List allParts, ChunkListBuilder builder)
throws IOException {
int nBuffers = 0;
for (ConsecutivePartList consecutiveChunks : allParts) {
consecutiveChunks.allocateReadBuffers();
nBuffers += consecutiveChunks.buffers.size();
}
List allRanges = getReadRanges(allParts, nBuffers);
long startNs = System.nanoTime();
readAllRangesParallel(allRanges);
for (ConsecutivePartList consecutiveChunks : allParts) {
consecutiveChunks.setReadMetrics(startNs);
ByteBufferInputStream stream;
stream = ByteBufferInputStream.wrap(consecutiveChunks.buffers);
// report in a counter the data we just scanned
BenchmarkCounter.incrementBytesRead(consecutiveChunks.length);
for (int i = 0; i < consecutiveChunks.chunks.size(); i++) {
ChunkDescriptor descriptor = consecutiveChunks.chunks.get(i);
if (descriptor.col != null) {
builder.add(descriptor, stream.sliceBuffers(descriptor.size));
} else {
stream.skipFully(descriptor.size);
}
}
}
}
private void readChunkPages(Chunk chunk, BlockMetaData block) throws IOException {
if (fileDecryptor == null || fileDecryptor.plaintextFile()) {
currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
return;
}
// Encrypted file
ColumnPath columnPath = ColumnPath.get(chunk.descriptor.col.getPath());
InternalColumnDecryptionSetup columnDecryptionSetup = fileDecryptor.getColumnSetup(columnPath);
if (!columnDecryptionSetup.isEncrypted()) { // plaintext column
currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
} else { // encrypted column
currentRowGroup.addColumn(
chunk.descriptor.col,
chunk.readAllPages(
columnDecryptionSetup.getMetaDataDecryptor(),
columnDecryptionSetup.getDataDecryptor(),
fileDecryptor.getFileAAD(),
block.getOrdinal(),
columnDecryptionSetup.getOrdinal()));
}
}
private boolean advanceToNextBlock() {
if (currentBlock == blocks.size()) {
return false;
}
// update the current block and instantiate a dictionary reader for it
++currentBlock;
return true;
}
public long[] getRowIndices() {
long[] rowIndices = new long[blocks.size() * 2];
for (int i = 0, n = blocks.size(); i < n; i++) {
BlockMetaData block = blocks.get(i);
rowIndices[i * 2] = getRowIndexOffset(block);
rowIndices[i * 2 + 1] = block.getRowCount();
}
return rowIndices;
}
// Uses reflection to get row index offset from a Parquet block metadata.
//
// The reason reflection is used here is that some Spark versions still depend on a
// Parquet version where the method `getRowIndexOffset` is not public.
private long getRowIndexOffset(BlockMetaData metaData) {
try {
Method method = BlockMetaData.class.getMethod("getRowIndexOffset");
method.setAccessible(true);
return (long) method.invoke(metaData);
} catch (Exception e) {
throw new RuntimeException("Error when calling getRowIndexOffset", e);
}
}
private RowRanges getRowRanges(int blockIndex) {
Preconditions.checkState(
FilterCompat.isFilteringRequired(options.getRecordFilter()),
"Should not be invoked if filter is null or NOOP");
RowRanges rowRanges = blockRowRanges.get(blockIndex);
if (rowRanges == null) {
rowRanges =
ColumnIndexFilter.calculateRowRanges(
options.getRecordFilter(),
getColumnIndexReader(blockIndex),
paths.keySet(),
blocks.get(blockIndex).getRowCount());
blockRowRanges.set(blockIndex, rowRanges);
}
return rowRanges;
}
private static ParquetMetadata readFooter(
InputFile file,
ParquetReadOptions options,
SeekableInputStream f,
ParquetMetadataConverter converter)
throws IOException {
long fileLen = file.getLength();
String filePath = file.toString();
LOG.debug("File length {}", fileLen);
int FOOTER_LENGTH_SIZE = 4;
// MAGIC + data + footer + footerIndex + MAGIC
if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) {
throw new RuntimeException(
filePath + " is not a Parquet file (length is too low: " + fileLen + ")");
}
// Read footer length and magic string - with a single seek
byte[] magic = new byte[MAGIC.length];
long fileMetadataLengthIndex = fileLen - magic.length - FOOTER_LENGTH_SIZE;
LOG.debug("reading footer index at {}", fileMetadataLengthIndex);
f.seek(fileMetadataLengthIndex);
int fileMetadataLength = BytesUtils.readIntLittleEndian(f);
f.readFully(magic);
boolean encryptedFooterMode;
if (Arrays.equals(MAGIC, magic)) {
encryptedFooterMode = false;
} else if (Arrays.equals(EFMAGIC, magic)) {
encryptedFooterMode = true;
} else {
throw new RuntimeException(
filePath
+ " is not a Parquet file. Expected magic number "
+ "at tail, but found "
+ Arrays.toString(magic));
}
long fileMetadataIndex = fileMetadataLengthIndex - fileMetadataLength;
LOG.debug("read footer length: {}, footer index: {}", fileMetadataLength, fileMetadataIndex);
if (fileMetadataIndex < magic.length || fileMetadataIndex >= fileMetadataLengthIndex) {
throw new RuntimeException(
"corrupted file: the footer index is not within the file: " + fileMetadataIndex);
}
f.seek(fileMetadataIndex);
FileDecryptionProperties fileDecryptionProperties = options.getDecryptionProperties();
InternalFileDecryptor fileDecryptor = null;
if (null != fileDecryptionProperties) {
fileDecryptor = new InternalFileDecryptor(fileDecryptionProperties);
}
// Read all the footer bytes in one time to avoid multiple read operations,
// since it can be pretty time consuming for a single read operation in HDFS.
byte[] footerBytes = new byte[fileMetadataLength];
f.readFully(footerBytes);
ByteBuffer footerBytesBuffer = ByteBuffer.wrap(footerBytes);
LOG.debug("Finished to read all footer bytes.");
InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer);
// Regular file, or encrypted file with plaintext footer
if (!encryptedFooterMode) {
return converter.readParquetMetadata(
footerBytesStream, options.getMetadataFilter(), fileDecryptor, false, fileMetadataLength);
}
// Encrypted file with encrypted footer
if (fileDecryptor == null) {
throw new ParquetCryptoRuntimeException(
"Trying to read file with encrypted footer. " + "No keys available");
}
FileCryptoMetaData fileCryptoMetaData = Util.readFileCryptoMetaData(footerBytesStream);
fileDecryptor.setFileCryptoMetaData(
fileCryptoMetaData.getEncryption_algorithm(), true, fileCryptoMetaData.getKey_metadata());
// footer length is required only for signed plaintext footers
return converter.readParquetMetadata(
footerBytesStream, options.getMetadataFilter(), fileDecryptor, true, 0);
}
private List filterRowGroups(List blocks) {
FilterCompat.Filter recordFilter = options.getRecordFilter();
if (FilterCompat.isFilteringRequired(recordFilter)) {
// set up data filters based on configured levels
List levels = new ArrayList<>();
if (options.useStatsFilter()) {
levels.add(STATISTICS);
}
if (options.useDictionaryFilter()) {
levels.add(DICTIONARY);
}
if (options.useBloomFilter()) {
levels.add(BLOOMFILTER);
}
return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
}
return blocks;
}
private static List listWithNulls(int size) {
return Stream.generate(() -> (T) null).limit(size).collect(Collectors.toList());
}
public void closeStream() throws IOException {
if (f != null) {
f.close();
}
}
@Override
public void close() throws IOException {
try {
if (f != null) {
f.close();
}
} finally {
options.getCodecFactory().release();
}
}
/**
* Builder to concatenate the buffers of the discontinuous parts for the same column. These parts
* are generated as a result of the column-index based filtering when some pages might be skipped
* at reading.
*/
private class ChunkListBuilder {
private class ChunkData {
final List buffers = new ArrayList<>();
OffsetIndex offsetIndex;
}
private final Map map = new HashMap<>();
void add(ChunkDescriptor descriptor, List buffers) {
ChunkListBuilder.ChunkData data = map.get(descriptor);
if (data == null) {
data = new ChunkData();
map.put(descriptor, data);
}
data.buffers.addAll(buffers);
}
void setOffsetIndex(ChunkDescriptor descriptor, OffsetIndex offsetIndex) {
ChunkData data = map.get(descriptor);
if (data == null) {
data = new ChunkData();
map.put(descriptor, data);
}
data.offsetIndex = offsetIndex;
}
List build() {
List chunks = new ArrayList<>();
for (Map.Entry entry : map.entrySet()) {
ChunkDescriptor descriptor = entry.getKey();
ChunkData data = entry.getValue();
chunks.add(new Chunk(descriptor, data.buffers, data.offsetIndex));
}
return chunks;
}
}
/** The data for a column chunk */
private class Chunk {
private final ChunkDescriptor descriptor;
private final ByteBufferInputStream stream;
final OffsetIndex offsetIndex;
/**
* @param descriptor descriptor for the chunk
* @param buffers ByteBuffers that contain the chunk
* @param offsetIndex the offset index for this column; might be null
*/
Chunk(ChunkDescriptor descriptor, List buffers, OffsetIndex offsetIndex) {
this.descriptor = descriptor;
this.stream = ByteBufferInputStream.wrap(buffers);
this.offsetIndex = offsetIndex;
}
protected PageHeader readPageHeader(BlockCipher.Decryptor blockDecryptor, byte[] pageHeaderAAD)
throws IOException {
return Util.readPageHeader(stream, blockDecryptor, pageHeaderAAD);
}
/**
* Calculate checksum of input bytes, throw decoding exception if it does not match the provided
* reference crc
*/
private void verifyCrc(int referenceCrc, byte[] bytes, String exceptionMsg) {
crc.reset();
crc.update(bytes);
if (crc.getValue() != ((long) referenceCrc & 0xffffffffL)) {
throw new ParquetDecodingException(exceptionMsg);
}
}
private ColumnPageReader readAllPages() throws IOException {
return readAllPages(null, null, null, -1, -1);
}
private ColumnPageReader readAllPages(
BlockCipher.Decryptor headerBlockDecryptor,
BlockCipher.Decryptor pageBlockDecryptor,
byte[] aadPrefix,
int rowGroupOrdinal,
int columnOrdinal)
throws IOException {
List pagesInChunk = new ArrayList<>();
DictionaryPage dictionaryPage = null;
PrimitiveType type =
fileMetaData.getSchema().getType(descriptor.col.getPath()).asPrimitiveType();
long valuesCountReadSoFar = 0;
int dataPageCountReadSoFar = 0;
byte[] dataPageHeaderAAD = null;
if (null != headerBlockDecryptor) {
dataPageHeaderAAD =
AesCipher.createModuleAAD(
aadPrefix,
ModuleCipherFactory.ModuleType.DataPageHeader,
rowGroupOrdinal,
columnOrdinal,
getPageOrdinal(dataPageCountReadSoFar));
}
while (hasMorePages(valuesCountReadSoFar, dataPageCountReadSoFar)) {
byte[] pageHeaderAAD = dataPageHeaderAAD;
if (null != headerBlockDecryptor) {
// Important: this verifies file integrity (makes sure dictionary page had not been
// removed)
if (null == dictionaryPage && descriptor.metadata.hasDictionaryPage()) {
pageHeaderAAD =
AesCipher.createModuleAAD(
aadPrefix,
ModuleCipherFactory.ModuleType.DictionaryPageHeader,
rowGroupOrdinal,
columnOrdinal,
-1);
} else {
int pageOrdinal = getPageOrdinal(dataPageCountReadSoFar);
AesCipher.quickUpdatePageAAD(dataPageHeaderAAD, pageOrdinal);
}
}
PageHeader pageHeader = readPageHeader(headerBlockDecryptor, pageHeaderAAD);
int uncompressedPageSize = pageHeader.getUncompressed_page_size();
int compressedPageSize = pageHeader.getCompressed_page_size();
final BytesInput pageBytes;
switch (pageHeader.type) {
case DICTIONARY_PAGE:
// there is only one dictionary page per column chunk
if (dictionaryPage != null) {
throw new ParquetDecodingException(
"more than one dictionary page in column " + descriptor.col);
}
pageBytes = this.readAsBytesInput(compressedPageSize);
if (options.usePageChecksumVerification() && pageHeader.isSetCrc()) {
verifyCrc(
pageHeader.getCrc(),
pageBytes.toByteArray(),
"could not verify dictionary page integrity, CRC checksum verification failed");
}
DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
dictionaryPage =
new DictionaryPage(
pageBytes,
uncompressedPageSize,
dicHeader.getNum_values(),
converter.getEncoding(dicHeader.getEncoding()));
// Copy crc to new page, used for testing
if (pageHeader.isSetCrc()) {
dictionaryPage.setCrc(pageHeader.getCrc());
}
break;
case DATA_PAGE:
DataPageHeader dataHeaderV1 = pageHeader.getData_page_header();
pageBytes = this.readAsBytesInput(compressedPageSize);
if (options.usePageChecksumVerification() && pageHeader.isSetCrc()) {
verifyCrc(
pageHeader.getCrc(),
pageBytes.toByteArray(),
"could not verify page integrity, CRC checksum verification failed");
}
DataPageV1 dataPageV1 =
new DataPageV1(
pageBytes,
dataHeaderV1.getNum_values(),
uncompressedPageSize,
converter.fromParquetStatistics(
getFileMetaData().getCreatedBy(), dataHeaderV1.getStatistics(), type),
converter.getEncoding(dataHeaderV1.getRepetition_level_encoding()),
converter.getEncoding(dataHeaderV1.getDefinition_level_encoding()),
converter.getEncoding(dataHeaderV1.getEncoding()));
// Copy crc to new page, used for testing
if (pageHeader.isSetCrc()) {
dataPageV1.setCrc(pageHeader.getCrc());
}
pagesInChunk.add(dataPageV1);
valuesCountReadSoFar += dataHeaderV1.getNum_values();
++dataPageCountReadSoFar;
break;
case DATA_PAGE_V2:
DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2();
int dataSize =
compressedPageSize
- dataHeaderV2.getRepetition_levels_byte_length()
- dataHeaderV2.getDefinition_levels_byte_length();
pagesInChunk.add(
new DataPageV2(
dataHeaderV2.getNum_rows(),
dataHeaderV2.getNum_nulls(),
dataHeaderV2.getNum_values(),
this.readAsBytesInput(dataHeaderV2.getRepetition_levels_byte_length()),
this.readAsBytesInput(dataHeaderV2.getDefinition_levels_byte_length()),
converter.getEncoding(dataHeaderV2.getEncoding()),
this.readAsBytesInput(dataSize),
uncompressedPageSize,
converter.fromParquetStatistics(
getFileMetaData().getCreatedBy(), dataHeaderV2.getStatistics(), type),
dataHeaderV2.isIs_compressed()));
valuesCountReadSoFar += dataHeaderV2.getNum_values();
++dataPageCountReadSoFar;
break;
default:
LOG.debug(
"skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
stream.skipFully(compressedPageSize);
break;
}
}
if (offsetIndex == null && valuesCountReadSoFar != descriptor.metadata.getValueCount()) {
// Would be nice to have a CorruptParquetFileException or something as a subclass?
throw new IOException(
"Expected "
+ descriptor.metadata.getValueCount()
+ " values in column chunk at "
+ file
+ " offset "
+ descriptor.metadata.getFirstDataPageOffset()
+ " but got "
+ valuesCountReadSoFar
+ " values instead over "
+ pagesInChunk.size()
+ " pages ending at file offset "
+ (descriptor.fileOffset + stream.position()));
}
CompressionCodecFactory.BytesInputDecompressor decompressor =
options.getCodecFactory().getDecompressor(descriptor.metadata.getCodec());
return new ColumnPageReader(
decompressor,
pagesInChunk,
dictionaryPage,
offsetIndex,
blocks.get(currentBlock).getRowCount(),
pageBlockDecryptor,
aadPrefix,
rowGroupOrdinal,
columnOrdinal);
}
private boolean hasMorePages(long valuesCountReadSoFar, int dataPageCountReadSoFar) {
return offsetIndex == null
? valuesCountReadSoFar < descriptor.metadata.getValueCount()
: dataPageCountReadSoFar < offsetIndex.getPageCount();
}
private int getPageOrdinal(int dataPageCountReadSoFar) {
if (null == offsetIndex) {
return dataPageCountReadSoFar;
}
return offsetIndex.getPageOrdinal(dataPageCountReadSoFar);
}
/**
* @param size the size of the page
* @return the page
* @throws IOException if there is an error while reading from the file stream
*/
public BytesInput readAsBytesInput(int size) throws IOException {
return BytesInput.from(stream.sliceBuffers(size));
}
}
/**
* Describes a list of consecutive parts to be read at once. A consecutive part may contain whole
* column chunks or only parts of them (some pages).
*/
private class ConsecutivePartList {
private final long offset;
private final List chunks = new ArrayList<>();
private long length;
private final SQLMetric fileReadTimeMetric;
private final SQLMetric fileReadSizeMetric;
private final SQLMetric readThroughput;
List buffers;
/**
* Constructor
*
* @param offset where the first chunk starts
*/
ConsecutivePartList(long offset) {
if (metrics != null) {
this.fileReadTimeMetric = metrics.get("ParquetInputFileReadTime");
this.fileReadSizeMetric = metrics.get("ParquetInputFileReadSize");
this.readThroughput = metrics.get("ParquetInputFileReadThroughput");
} else {
this.fileReadTimeMetric = null;
this.fileReadSizeMetric = null;
this.readThroughput = null;
}
this.offset = offset;
}
/**
* Adds a chunk to the list. It must be consecutive to the previous chunk.
*
* @param descriptor a chunk descriptor
*/
public void addChunk(ChunkDescriptor descriptor) {
chunks.add(descriptor);
length += descriptor.size;
}
private void allocateReadBuffers() {
int fullAllocations = Math.toIntExact(length / options.getMaxAllocationSize());
int lastAllocationSize = Math.toIntExact(length % options.getMaxAllocationSize());
int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);
this.buffers = new ArrayList<>(numAllocations);
for (int i = 0; i < fullAllocations; i += 1) {
this.buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize()));
}
if (lastAllocationSize > 0) {
this.buffers.add(options.getAllocator().allocate(lastAllocationSize));
}
}
/**
* @param f file to read the chunks from
* @param builder used to build chunk list to read the pages for the different columns
* @throws IOException if there is an error while reading from the stream
*/
public void readAll(SeekableInputStream f, ChunkListBuilder builder) throws IOException {
f.seek(offset);
allocateReadBuffers();
long startNs = System.nanoTime();
for (ByteBuffer buffer : buffers) {
f.readFully(buffer);
buffer.flip();
}
setReadMetrics(startNs);
// report in a counter the data we just scanned
BenchmarkCounter.incrementBytesRead(length);
ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers);
for (int i = 0; i < chunks.size(); i++) {
ChunkDescriptor descriptor = chunks.get(i);
if (descriptor.col != null) {
builder.add(descriptor, stream.sliceBuffers(descriptor.size));
} else {
stream.skipFully(descriptor.size);
}
}
}
private void setReadMetrics(long startNs) {
long totalFileReadTimeNs = System.nanoTime() - startNs;
double sizeInMb = ((double) length) / (1024 * 1024);
double timeInSec = ((double) totalFileReadTimeNs) / 1000_0000_0000L;
double throughput = sizeInMb / timeInSec;
LOG.debug(
"Comet: File Read stats: Length: {} MB, Time: {} secs, throughput: {} MB/sec ",
sizeInMb,
timeInSec,
throughput);
if (fileReadTimeMetric != null) {
fileReadTimeMetric.add(totalFileReadTimeNs);
}
if (fileReadSizeMetric != null) {
fileReadSizeMetric.add(length);
}
if (readThroughput != null) {
readThroughput.set(throughput);
}
}
/**
* End position of the last byte of these chunks
*
* @return the position following the last byte of these chunks
*/
public long endPos() {
return offset + length;
}
}
/** Information needed to read a column chunk or a part of it. */
private static class ChunkDescriptor {
private final ColumnDescriptor col;
private final ColumnChunkMetaData metadata;
private final long fileOffset;
private final long size;
/**
* @param col column this chunk is part of
* @param metadata metadata for the column
* @param fileOffset offset in the file where this chunk starts
* @param size size of the chunk
*/
ChunkDescriptor(
ColumnDescriptor col, ColumnChunkMetaData metadata, long fileOffset, long size) {
this.col = col;
this.metadata = metadata;
this.fileOffset = fileOffset;
this.size = size;
}
@Override
public int hashCode() {
return col.hashCode();
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
} else if (obj instanceof ChunkDescriptor) {
return col.equals(((ChunkDescriptor) obj).col);
} else {
return false;
}
}
}
}