Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.parquet.hadoop.ParquetFileReader Maven / Gradle / Ivy
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;
import static org.apache.parquet.bytes.BytesUtils.readIntLittleEndian;
import static org.apache.parquet.filter2.compat.RowGroupFilter.FilterLevel.DICTIONARY;
import static org.apache.parquet.filter2.compat.RowGroupFilter.FilterLevel.STATISTICS;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;
import static org.apache.parquet.hadoop.ColumnIndexFilterUtils.calculateOffsetRanges;
import static org.apache.parquet.hadoop.ColumnIndexFilterUtils.filterOffsetIndex;
import static org.apache.parquet.hadoop.ParquetFileWriter.MAGIC;
import static org.apache.parquet.hadoop.ParquetFileWriter.PARQUET_COMMON_METADATA_FILE;
import static org.apache.parquet.hadoop.ParquetFileWriter.PARQUET_METADATA_FILE;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.DataPageHeaderV2;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter.MetadataFilter;
import org.apache.parquet.hadoop.ColumnChunkPageReadStore.ColumnChunkPageReader;
import org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.hadoop.util.HiddenFileFilter;
import org.apache.parquet.hadoop.util.counters.BenchmarkCounter;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.internal.filter2.columnindex.RowRanges;
import org.apache.parquet.internal.hadoop.metadata.IndexReference;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.yetus.audience.InterfaceAudience.Private;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Internal implementation of the Parquet file reader as a block container
 */
public class ParquetFileReader implements Closeable {
private static final Logger LOG = LoggerFactory.getLogger(ParquetFileReader.class);
/**
 * Configuration key read via {@code configuration.getInt(PARQUET_READ_PARALLELISM, 5)} to size
 * the thread pool used when reading footers in parallel.
 */
public static String PARQUET_READ_PARALLELISM = "parquet.metadata.read.parallelism";
private final ParquetMetadataConverter converter;
* for files provided, check if there's a summary file.
* If a summary file is found it is used otherwise the file footer is used.
* @param configuration the hadoop conf to connect to the file system;
* @param partFiles the part files to read
* @return the footers for those files using the summary file if possible.
* @throws IOException if there is an exception while reading footers
* @deprecated metadata files are not recommended and will be removed in 2.0.0
public static List readAllFootersInParallelUsingSummaryFiles(Configuration configuration, List partFiles) throws IOException {
return readAllFootersInParallelUsingSummaryFiles(configuration, partFiles, false);
private static MetadataFilter filter(boolean skipRowGroups) {
return skipRowGroups ? SKIP_ROW_GROUPS : NO_FILTER;
* for files provided, check if there's a summary file.
* If a summary file is found it is used otherwise the file footer is used.
* @param configuration the hadoop conf to connect to the file system;
* @param partFiles the part files to read
* @param skipRowGroups to skipRowGroups in the footers
* @return the footers for those files using the summary file if possible.
* @throws IOException if there is an exception while reading footers
* @deprecated metadata files are not recommended and will be removed in 2.0.0
public static List readAllFootersInParallelUsingSummaryFiles(
final Configuration configuration,
final Collection partFiles,
final boolean skipRowGroups) throws IOException {
// figure out list of all parents to part files
Set parents = new HashSet();
for (FileStatus part : partFiles) {
// read corresponding summary files if they exist
List>> summaries = new ArrayList>>();
for (final Path path : parents) {
summaries.add(new Callable>() {
public Map call() throws Exception {
ParquetMetadata mergedMetadata = readSummaryMetadata(configuration, path, skipRowGroups);
if (mergedMetadata != null) {
final List footers;
if (skipRowGroups) {
footers = new ArrayList();
for (FileStatus f : partFiles) {
footers.add(new Footer(f.getPath(), mergedMetadata));
} else {
footers = footersFromSummaryFile(path, mergedMetadata);
Map map = new HashMap();
for (Footer footer : footers) {
// the folder may have been moved
footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata());
map.put(footer.getFile(), footer);
return map;
} else {
return Collections.emptyMap();
Map cache = new HashMap();
try {
List> footersFromSummaries = runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), summaries);
for (Map footers : footersFromSummaries) {
} catch (ExecutionException e) {
throw new IOException("Error reading summaries", e);
// keep only footers for files actually requested and read file footer if not found in summaries
List result = new ArrayList(partFiles.size());
List toRead = new ArrayList();
for (FileStatus part : partFiles) {
Footer f = cache.get(part.getPath());
if (f != null) {
} else {
if (toRead.size() > 0) {
// read the footers of the files that did not have a summary file"reading another {} footers", toRead.size());
result.addAll(readAllFootersInParallel(configuration, toRead, skipRowGroups));
return result;
private static List runAllInParallel(int parallelism, List> toRun) throws ExecutionException {"Initiating action with parallelism: {}", parallelism);
ExecutorService threadPool = Executors.newFixedThreadPool(parallelism);
try {
List> futures = new ArrayList>();
for (Callable callable : toRun) {
List result = new ArrayList(toRun.size());
for (Future future : futures) {
try {
} catch (InterruptedException e) {
throw new RuntimeException("The thread was interrupted", e);
return result;
} finally {
* @param configuration the conf to access the File System
* @param partFiles the files to read
* @return the footers
* @throws IOException if an exception was raised while reading footers
* @deprecated metadata files are not recommended and will be removed in 2.0.0
public static List readAllFootersInParallel(final Configuration configuration, List partFiles) throws IOException {
return readAllFootersInParallel(configuration, partFiles, false);
* read all the footers of the files provided
* (not using summary files)
* @param configuration the conf to access the File System
* @param partFiles the files to read
* @param skipRowGroups to skip the rowGroup info
* @return the footers
* @throws IOException if there is an exception while reading footers
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static List readAllFootersInParallel(final Configuration configuration, List partFiles, final boolean skipRowGroups) throws IOException {
List> footers = new ArrayList>();
for (final FileStatus currentFile : partFiles) {
footers.add(new Callable() {
public Footer call() throws Exception {
try {
return new Footer(currentFile.getPath(), readFooter(configuration, currentFile, filter(skipRowGroups)));
} catch (IOException e) {
throw new IOException("Could not read footer for file " + currentFile, e);
try {
return runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), footers);
} catch (ExecutionException e) {
throw new IOException("Could not read footer: " + e.getMessage(), e.getCause());
* Read the footers of all the files under that path (recursively)
* not using summary files.
* @param configuration a configuration
* @param fileStatus a file status to recursively list
* @param skipRowGroups whether to skip reading row group metadata
* @return a list of footers
* @throws IOException if an exception is thrown while reading the footers
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static List readAllFootersInParallel(Configuration configuration, FileStatus fileStatus, boolean skipRowGroups) throws IOException {
List statuses = listFiles(configuration, fileStatus);
return readAllFootersInParallel(configuration, statuses, skipRowGroups);
* Read the footers of all the files under that path (recursively)
* not using summary files.
* rowGroups are not skipped
* @param configuration the configuration to access the FS
* @param fileStatus the root dir
* @return all the footers
* @throws IOException if an exception is thrown while reading the footers
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static List readAllFootersInParallel(Configuration configuration, FileStatus fileStatus) throws IOException {
return readAllFootersInParallel(configuration, fileStatus, false);
* @param configuration a configuration
* @param path a file path
* @return a list of footers
* @throws IOException if an exception is thrown while reading the footers
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static List readFooters(Configuration configuration, Path path) throws IOException {
return readFooters(configuration, status(configuration, path));
private static FileStatus status(Configuration configuration, Path path) throws IOException {
return path.getFileSystem(configuration).getFileStatus(path);
* this always returns the row groups
* @param configuration a configuration
* @param pathStatus a file status to read footers from
* @return a list of footers
* @throws IOException if an exception is thrown while reading the footers
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static List readFooters(Configuration configuration, FileStatus pathStatus) throws IOException {
return readFooters(configuration, pathStatus, false);
* Read the footers of all the files under that path (recursively)
* using summary files if possible
* @param configuration the configuration to access the FS
* @param pathStatus the root dir
* @param skipRowGroups whether to skip reading row group metadata
* @return all the footers
* @throws IOException if an exception is thrown while reading the footers
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static List readFooters(Configuration configuration, FileStatus pathStatus, boolean skipRowGroups) throws IOException {
List files = listFiles(configuration, pathStatus);
return readAllFootersInParallelUsingSummaryFiles(configuration, files, skipRowGroups);
private static List listFiles(Configuration conf, FileStatus fileStatus) throws IOException {
if (fileStatus.isDir()) {
FileSystem fs = fileStatus.getPath().getFileSystem(conf);
FileStatus[] list = fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE);
List result = new ArrayList();
for (FileStatus sub : list) {
result.addAll(listFiles(conf, sub));
return result;
} else {
return Arrays.asList(fileStatus);
* Specifically reads a given summary file
* @param configuration a configuration
* @param summaryStatus file status for a summary file
* @return the metadata translated for each file
* @throws IOException if an exception is thrown while reading the summary file
* @deprecated metadata files are not recommended and will be removed in 2.0.0
public static List readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
final Path parent = summaryStatus.getPath().getParent();
ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus, filter(false));
return footersFromSummaryFile(parent, mergedFooters);
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath, boolean skipRowGroups) throws IOException {
Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
FileSystem fileSystem = basePath.getFileSystem(configuration);
if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
// reading the summary file that does not contain the row groups"reading summary file: {}", commonMetaDataFile);
return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
} else if (fileSystem.exists(metadataFile)) {"reading summary file: {}", metadataFile);
return readFooter(configuration, metadataFile, filter(skipRowGroups));
} else {
return null;
static List footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
Map footers = new HashMap();
List blocks = mergedFooters.getBlocks();
for (BlockMetaData block : blocks) {
String path = block.getPath();
Path fullPath = new Path(parent, path);
ParquetMetadata current = footers.get(fullPath);
if (current == null) {
current = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList());
footers.put(fullPath, current);
List result = new ArrayList();
for (Entry entry : footers.entrySet()) {
result.add(new Footer(entry.getKey(), entry.getValue()));
return result;
* Reads the meta data block in the footer of the file
* @param configuration a configuration
* @param file the parquet File
* @return the metadata blocks in the footer
* @throws IOException if an error occurs while reading the file
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static final ParquetMetadata readFooter(Configuration configuration, Path file) throws IOException {
return readFooter(configuration, file, NO_FILTER);
* Reads the meta data in the footer of the file.
* Skipping row groups (or not) based on the provided filter
* @param configuration a configuration
* @param file the Parquet File
* @param filter the filter to apply to row groups
* @return the metadata with row groups filtered.
* @throws IOException if an error occurs while reading the file
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static ParquetMetadata readFooter(Configuration configuration, Path file, MetadataFilter filter) throws IOException {
return readFooter(HadoopInputFile.fromPath(file, configuration), filter);
* @param configuration a configuration
* @param file the Parquet File
* @return the metadata with row groups.
* @throws IOException if an error occurs while reading the file
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file) throws IOException {
return readFooter(configuration, file, NO_FILTER);
* Reads the meta data block in the footer of the file
* @param configuration a configuration
* @param file the parquet File
* @param filter the filter to apply to row groups
* @return the metadata blocks in the footer
* @throws IOException if an error occurs while reading the file
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file, MetadataFilter filter) throws IOException {
return readFooter(HadoopInputFile.fromStatus(file, configuration), filter);
* Reads the meta data block in the footer of the file using provided input stream
* @param file a {@link InputFile} to read
* @param filter the filter to apply to row groups
* @return the metadata blocks in the footer
* @throws IOException if an error occurs while reading the file
* @deprecated will be removed in 2.0.0;
* use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
ParquetReadOptions options;
if (file instanceof HadoopInputFile) {
options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
} else {
options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
try (SeekableInputStream in = file.newStream()) {
return readFooter(file, options, in);
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f) throws IOException {
ParquetMetadataConverter converter = new ParquetMetadataConverter(options);
return readFooter(file, options, f, converter);
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
long fileLen = file.getLength();
String filePath = file.toString();
LOG.debug("File length {}", fileLen);
if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
throw new RuntimeException(filePath + " is not a Parquet file (too small length: " + fileLen + ")");
long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length;
LOG.debug("reading footer index at {}", footerLengthIndex);;
int footerLength = readIntLittleEndian(f);
byte[] magic = new byte[MAGIC.length];
if (!Arrays.equals(MAGIC, magic)) {
throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
long footerIndex = footerLengthIndex - footerLength;
LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex);
if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex);
return converter.readParquetMetadata(f, options.getMetadataFilter());
* @param conf a configuration
* @param file a file path to open
* @return a parquet file reader
* @throws IOException if there is an error while opening the file
* @deprecated will be removed in 2.0.0; use {@link #open(InputFile)}
public static ParquetFileReader open(Configuration conf, Path file) throws IOException {
return new ParquetFileReader(HadoopInputFile.fromPath(file, conf),
* @param conf a configuration
* @param file a file path to open
* @param filter a metadata filter
* @return a parquet file reader
* @throws IOException if there is an error while opening the file
* @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter) throws IOException {
return open(HadoopInputFile.fromPath(file, conf),
* @param conf a configuration
* @param file a file path to open
* @param footer a footer for the file if already loaded
* @return a parquet file reader
* @throws IOException if there is an error while opening the file
* @deprecated will be removed in 2.0.0
public static ParquetFileReader open(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
return new ParquetFileReader(conf, file, footer);
* Open a {@link InputFile file}.
* @param file an input file
* @return an open ParquetFileReader
* @throws IOException if there is an error while opening the file
public static ParquetFileReader open(InputFile file) throws IOException {
return new ParquetFileReader(file, ParquetReadOptions.builder().build());
* Open a {@link InputFile file} with {@link ParquetReadOptions options}.
* @param file an input file
* @param options parquet read options
* @return an open ParquetFileReader
* @throws IOException if there is an error while opening the file
public static ParquetFileReader open(InputFile file, ParquetReadOptions options) throws IOException {
return new ParquetFileReader(file, options);
private final InputFile file;
private final SeekableInputStream f;
private final ParquetReadOptions options;
// columns requested for reading, keyed by their column path
private final Map<ColumnPath, ColumnDescriptor> paths = new HashMap<>();
private final FileMetaData fileMetaData; // may be null
private final List<BlockMetaData> blocks;
// per-block caches, filled lazily; entries are null until first requested
private final List<ColumnIndexStore> blockIndexStores;
private final List<RowRanges> blockRowRanges;

// not final. in some cases, this may be lazily loaded for backward-compat.
private ParquetMetadata footer;

private int currentBlock = 0;
private ColumnChunkPageReadStore currentRowGroup = null;
private DictionaryPageReader nextDictionaryReader = null;
/**
 * Creates a reader for the given blocks and columns without file-level metadata.
 * @param configuration the Hadoop conf
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration configuration, Path filePath, List<BlockMetaData> blocks,
    List<ColumnDescriptor> columns) throws IOException {
  this(configuration, null, filePath, blocks, columns);
}
/**
 * Creates a reader for the given blocks and columns. The footer is not read here.
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
/**
 * Creates a reader for the file at the given path, applying the given metadata filter.
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param filter a {@link MetadataFilter} for selecting row groups
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter) throws IOException {
  this(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
/**
 * Creates a reader from an already loaded footer; all columns of the file schema are requested.
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
/**
 * Opens the file, reads its footer and requests all columns of the file schema.
 * @param file an input file
 * @param options parquet read options
 * @throws IOException if the file can not be opened or its footer can not be read
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // In case that reading footer throws an exception in the constructor, the new stream
    // should be closed. Otherwise, there's no way to close this outside.
    f.close();
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
private static List listWithNulls(int size) {
return Stream.generate(() -> (T) null).limit(size).collect(Collectors.toCollection(ArrayList::new));
public ParquetMetadata getFooter() {
if (footer == null) {
try {
// don't read the row groups because this.blocks is always set
this.footer = readFooter(file, options, f, converter);
} catch (IOException e) {
throw new ParquetDecodingException("Unable to read file footer", e);
return footer;
public FileMetaData getFileMetaData() {
if (fileMetaData != null) {
return fileMetaData;
return getFooter().getFileMetaData();
public long getRecordCount() {
long total = 0;
for (BlockMetaData block : blocks) {
total += block.getRowCount();
return total;
long getFilteredRecordCount() {
if (!options.useColumnIndexFilter()) {
return getRecordCount();
long total = 0;
for (int i = 0, n = blocks.size(); i < n; ++i) {
total += getRowRanges(i).rowCount();
return total;
* @return the path for this file
* @deprecated will be removed in 2.0.0; use {@link #getFile()} instead
public Path getPath() {
return new Path(file.toString());
public String getFile() {
return file.toString();
private List filterRowGroups(List blocks) throws IOException {
// set up data filters based on configured levels
List levels = new ArrayList<>();
if (options.useStatsFilter()) {
if (options.useDictionaryFilter()) {
FilterCompat.Filter recordFilter = options.getRecordFilter();
if (recordFilter != null) {
return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
return blocks;
public List getRowGroups() {
return blocks;
public void setRequestedSchema(MessageType projection) {
for (ColumnDescriptor col : projection.getColumns()) {
paths.put(ColumnPath.get(col.getPath()), col);
public void appendTo(ParquetFileWriter writer) throws IOException {
writer.appendRowGroups(f, blocks, true);
* Reads all the columns requested from the row group at the current file position.
* @throws IOException if an error occurs while reading
* @return the PageReadStore which can provide PageReaders for each column.
public PageReadStore readNextRowGroup() throws IOException {
if (currentBlock == blocks.size()) {
return null;
BlockMetaData block = blocks.get(currentBlock);
if (block.getRowCount() == 0) {
throw new RuntimeException("Illegal row group of 0 rows");
this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
// prepare the list of consecutive parts to read them in one scan
List allParts = new ArrayList();
ConsecutivePartList currentParts = null;
for (ColumnChunkMetaData mc : block.getColumns()) {
ColumnPath pathKey = mc.getPath();
ColumnDescriptor columnDescriptor = paths.get(pathKey);
if (columnDescriptor != null) {
long startingPos = mc.getStartingPos();
// first part or not consecutive => new list
if (currentParts == null || currentParts.endPos() != startingPos) {
currentParts = new ConsecutivePartList(startingPos);
currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize()));
// actually read all the chunks
ChunkListBuilder builder = new ChunkListBuilder();
for (ConsecutivePartList consecutiveChunks : allParts) {
consecutiveChunks.readAll(f, builder);
for (Chunk chunk : {
currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
// avoid re-reading bytes the dictionary reader is used after this call
if (nextDictionaryReader != null) {
return currentRowGroup;
* Reads all the columns requested from the row group at the current file position. It may skip specific pages based
* on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different
* columns row synchronization might be required. See the documentation of the class SynchronizingColumnReader for
* details.
* @return the PageReadStore which can provide PageReaders for each column
* @throws IOException
* if any I/O error occurs while reading
// Reads the next row group, restricting the read to the byte ranges selected by the column-index
// based row ranges when column index filtering is enabled.
// NOTE(review): this block appears truncated by extraction — closing braces and several statements
// are not present (see the notes below); restore them from the project's source before relying on it.
public PageReadStore readNextFilteredRowGroup() throws IOException {
// no row group left to read
if (currentBlock == blocks.size()) {
return null;
// filtering disabled in the options -> plain row-group read
if (!options.useColumnIndexFilter()) {
return readNextRowGroup();
BlockMetaData block = blocks.get(currentBlock);
if (block.getRowCount() == 0) {
throw new RuntimeException("Illegal row group of 0 rows");
ColumnIndexStore ciStore = getColumnIndexStore(currentBlock);
RowRanges rowRanges = getRowRanges(currentBlock);
long rowCount = rowRanges.rowCount();
if (rowCount == 0) {
// There are no matching rows -> skipping this row-group
// NOTE(review): no statement advancing currentBlock is visible before this recursive call; as
// written it would recurse on the same block — confirm an advance step belongs here
return readNextFilteredRowGroup();
if (rowCount == block.getRowCount()) {
// All rows are matching -> fall back to the non-filtering path
return readNextRowGroup();
this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges);
// prepare the list of consecutive parts to read them in one scan
ChunkListBuilder builder = new ChunkListBuilder();
List allParts = new ArrayList();
ConsecutivePartList currentParts = null;
for (ColumnChunkMetaData mc : block.getColumns()) {
ColumnPath pathKey = mc.getPath();
// only columns present in `paths` are read; others are skipped by the null check below
ColumnDescriptor columnDescriptor = paths.get(pathKey);
if (columnDescriptor != null) {
OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
// NOTE(review): the remaining argument(s) and closing of this call are missing here
OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges,
for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
long startingPos = range.getOffset();
// first part or not consecutive => new list
if (currentParts == null || currentParts.endPos() != startingPos) {
currentParts = new ConsecutivePartList(startingPos);
// NOTE(review): nothing visible adds currentParts to allParts or the chunk descriptor to
// currentParts — presumably dropped lines; verify
// the range length is narrowed to int for the descriptor size
ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos,
(int) range.getLength());
builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
// actually read all the chunks
for (ConsecutivePartList consecutiveChunks : allParts) {
consecutiveChunks.readAll(f, builder);
// NOTE(review): the iterable of this loop is missing; presumably the chunks produced by the builder
for (Chunk chunk : {
currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
// avoid re-reading bytes the dictionary reader is used after this call
// NOTE(review): the body of this `if` is not present here
if (nextDictionaryReader != null) {
return currentRowGroup;
private ColumnIndexStore getColumnIndexStore(int blockIndex) {
ColumnIndexStore ciStore = blockIndexStores.get(blockIndex);
if (ciStore == null) {
ciStore = ColumnIndexStoreImpl.create(this, blocks.get(blockIndex), paths.keySet());
blockIndexStores.set(blockIndex, ciStore);
return ciStore;
private RowRanges getRowRanges(int blockIndex) {
RowRanges rowRanges = blockRowRanges.get(blockIndex);
if (rowRanges == null) {
rowRanges = ColumnIndexFilter.calculateRowRanges(options.getRecordFilter(), getColumnIndexStore(blockIndex),
paths.keySet(), blocks.get(blockIndex).getRowCount());
blockRowRanges.set(blockIndex, rowRanges);
return rowRanges;
/**
 * Skips the next row group by delegating to {@code advanceToNextBlock()}.
 *
 * @return the value returned by {@code advanceToNextBlock()}
 */
public boolean skipNextRowGroup() {
return advanceToNextBlock();
private boolean advanceToNextBlock() {
if (currentBlock == blocks.size()) {
return false;
// update the current block and instantiate a dictionary reader for it
this.nextDictionaryReader = null;
return true;
* Returns a {@link DictionaryPageReadStore} for the row group that would be
* returned by calling {@link #readNextRowGroup()} or skipped by calling
* {@link #skipNextRowGroup()}.
* @return a DictionaryPageReadStore for the next row group
public DictionaryPageReadStore getNextDictionaryReader() {
if (nextDictionaryReader == null && currentBlock < blocks.size()) {
this.nextDictionaryReader = getDictionaryReader(blocks.get(currentBlock));
return nextDictionaryReader;
/**
 * Creates a new {@link DictionaryPageReader} for this file reader and the given row group.
 *
 * @param block metadata of the row group the reader is created for
 * @return a new DictionaryPageReader
 */
public DictionaryPageReader getDictionaryReader(BlockMetaData block) {
return new DictionaryPageReader(this, block);
* Reads and decompresses a dictionary page for the given column chunk.
* Returns null if the given column chunk has no dictionary page.
* @param meta a column's ColumnChunkMetaData to read the dictionary from
* @return an uncompressed DictionaryPage or null
* @throws IOException if there is an error while reading the dictionary
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
// a chunk that lists neither dictionary encoding is treated as having no dictionary page
if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
!meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
return null;
// TODO: this should use getDictionaryPageOffset() but it isn't reliable.
// NOTE(review): the body of this `if` is empty here (only a stray `;`); presumably it should
// reposition `f` to meta.getStartingPos() before the header is read — confirm
if (f.getPos() != meta.getStartingPos()) {;
PageHeader pageHeader = Util.readPageHeader(f);
// the first page of the chunk is not a dictionary page
if (!pageHeader.isSetDictionary_page_header()) {
return null; // TODO: should this complain?
DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
// the decompressor is selected by the codec recorded in the chunk metadata
BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());
// NOTE(review): the remaining constructor arguments and the closing of this call are missing here
return new DictionaryPage(
decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
// Builds a DictionaryPage carrying the still-compressed dictionary bytes described by pageHeader.
private DictionaryPage readCompressedDictionary(
PageHeader pageHeader, SeekableInputStream fin) throws IOException {
DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header();
int uncompressedPageSize = pageHeader.getUncompressed_page_size();
int compressedPageSize = pageHeader.getCompressed_page_size();
// buffer sized to hold the compressed page
byte [] dictPageBytes = new byte[compressedPageSize];
// NOTE(review): no statement filling dictPageBytes from `fin` is visible here; as written the
// page would wrap a zero-filled array — presumably a read call was dropped, verify
BytesInput bin = BytesInput.from(dictPageBytes);
// NOTE(review): the last constructor argument(s) and the closing of this call are missing here
return new DictionaryPage(
bin, uncompressedPageSize, dictHeader.getNum_values(),
* @param column
* the column chunk which the column index is to be returned for
* @return the column index for the specified column chunk or {@code null} if there is no index
* @throws IOException
* if any I/O error occurs during reading the file
public ColumnIndex readColumnIndex(ColumnChunkMetaData column) throws IOException {
IndexReference ref = column.getColumnIndexReference();
// no reference recorded for this chunk -> there is no column index to read
if (ref == null) {
return null;
// NOTE(review): `ref` is only null-checked; nothing visible positions `f` at the index before
// reading, so the read starts at the current stream position — presumably a seek was dropped
return ParquetMetadataConverter.fromParquetColumnIndex(column.getPrimitiveType(), Util.readColumnIndex(f));
* @param column
* the column chunk which the offset index is to be returned for
* @return the offset index for the specified column chunk or {@code null} if there is no index
* @throws IOException
* if any I/O error occurs during reading the file
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
IndexReference ref = column.getOffsetIndexReference();
// no reference recorded for this chunk -> there is no offset index to read
if (ref == null) {
return null;
// NOTE(review): `ref` is only null-checked; nothing visible positions `f` at the index before
// reading, so the read starts at the current stream position — presumably a seek was dropped
return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
/**
 * Closes this reader.
 *
 * @throws IOException declared by the signature; no throwing statement is visible in this block
 */
public void close() throws IOException {
try {
// NOTE(review): the body of this `if` is missing; presumably it closes `f` — confirm
if (f != null) {
// NOTE(review): the `finally` body is missing as well; whatever cleanup belongs here is not shown
} finally {
* Builder to concatenate the buffers of the discontinuous parts for the same column. These parts are generated as a
* result of the column-index based filtering when some pages might be skipped at reading.
private class ChunkListBuilder {
private class ChunkData {
final List buffers = new ArrayList<>();
OffsetIndex offsetIndex;
private final Map map = new HashMap<>();
private ChunkDescriptor lastDescriptor;
private SeekableInputStream f;
void add(ChunkDescriptor descriptor, List buffers, SeekableInputStream f) {
ChunkData data = map.get(descriptor);
if (data == null) {
data = new ChunkData();
map.put(descriptor, data);
lastDescriptor = descriptor;
this.f = f;
void setOffsetIndex(ChunkDescriptor descriptor, OffsetIndex offsetIndex) {
ChunkData data = map.get(descriptor);
if (data == null) {
data = new ChunkData();
map.put(descriptor, data);
data.offsetIndex = offsetIndex;
List build() {
List chunks = new ArrayList<>();
for (Entry entry : map.entrySet()) {
ChunkDescriptor descriptor = entry.getKey();
ChunkData data = entry.getValue();
if (descriptor.equals(lastDescriptor)) {
// because of a bug, the last chunk might be larger than descriptor.size
chunks.add(new WorkaroundChunk(lastDescriptor, data.buffers, f, data.offsetIndex));
} else {
chunks.add(new Chunk(descriptor, data.buffers, data.offsetIndex));
return chunks;
* The data for a column chunk
// Holds the bytes of one column chunk (as a ByteBufferInputStream) together with its descriptor
// and optional offset index, and decodes the pages from it.
// NOTE(review): this class appears truncated by extraction — closing braces, most `case` labels and
// the argument lists of the page constructors are not present; restore before relying on it.
private class Chunk {
protected final ChunkDescriptor descriptor;
protected final ByteBufferInputStream stream;
final OffsetIndex offsetIndex;
* @param descriptor descriptor for the chunk
* @param buffers ByteBuffers that contain the chunk
* @param offsetIndex the offset index for this column; might be null
public Chunk(ChunkDescriptor descriptor, List buffers, OffsetIndex offsetIndex) {
// NOTE(review): the assignment target is missing before `= ByteBufferInputStream.wrap(buffers)`;
// presumably `this.stream` — confirm
this.descriptor = descriptor; = ByteBufferInputStream.wrap(buffers);
this.offsetIndex = offsetIndex;
// reads the next page header from the chunk's stream; WorkaroundChunk overrides this
protected PageHeader readPageHeader() throws IOException {
return Util.readPageHeader(stream);
* Read all of the pages in a given column chunk.
* @return the list of pages
public ColumnChunkPageReader readAllPages() throws IOException {
List pagesInChunk = new ArrayList();
DictionaryPage dictionaryPage = null;
// NOTE(review): the rest of this statement is missing
PrimitiveType type = getFileMetaData().getSchema()
long valuesCountReadSoFar = 0;
int dataPageCountReadSoFar = 0;
// the stop condition depends on whether an offset index is available, see hasMorePages
while (hasMorePages(valuesCountReadSoFar, dataPageCountReadSoFar)) {
PageHeader pageHeader = readPageHeader();
int uncompressedPageSize = pageHeader.getUncompressed_page_size();
int compressedPageSize = pageHeader.getCompressed_page_size();
// NOTE(review): only the DATA_PAGE_V2 label survives below; the labels for the dictionary-page,
// V1 data-page and default branches are presumably missing
switch (pageHeader.type) {
// there is only one dictionary page per column chunk
if (dictionaryPage != null) {
throw new ParquetDecodingException("more than one dictionary page in column " + descriptor.col);
DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
dictionaryPage =
new DictionaryPage(
DataPageHeader dataHeaderV1 = pageHeader.getData_page_header();
new DataPageV1(
valuesCountReadSoFar += dataHeaderV1.getNum_values();
case DATA_PAGE_V2:
DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2();
// size of the values section: the compressed page size minus the two level sections
int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length();
new DataPageV2(
valuesCountReadSoFar += dataHeaderV2.getNum_values();
LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
// the value-count consistency check is applied only when no offset index is in use
if (offsetIndex == null && valuesCountReadSoFar != descriptor.metadata.getValueCount()) {
// Would be nice to have a CorruptParquetFileException or something as a subclass?
throw new IOException(
"Expected " + descriptor.metadata.getValueCount() + " values in column chunk at " +
getPath() + " offset " + descriptor.metadata.getFirstDataPageOffset() +
" but got " + valuesCountReadSoFar + " values instead over " + pagesInChunk.size()
+ " pages ending at file offset " + (descriptor.fileOffset + stream.position()));
BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(descriptor.metadata.getCodec());
// NOTE(review): the remaining constructor arguments and the closing of this call are missing
return new ColumnChunkPageReader(decompressor, pagesInChunk, dictionaryPage, offsetIndex,
// without an offset index: continue until the chunk's declared value count has been read;
// with one: continue until as many data pages as the offset index lists have been read
// NOTE(review): dataPageCountReadSoFar is never incremented in the visible code — presumably
// dropped lines; verify
private boolean hasMorePages(long valuesCountReadSoFar, int dataPageCountReadSoFar) {
return offsetIndex == null ? valuesCountReadSoFar < descriptor.metadata.getValueCount()
: dataPageCountReadSoFar < offsetIndex.getPageCount();
* @param size the size of the page
* @return the page
* @throws IOException if there is an error while reading from the file stream
// returns the next `size` bytes of the chunk's stream as a BytesInput built from sliced buffers
public BytesInput readAsBytesInput(int size) throws IOException {
return BytesInput.from(stream.sliceBuffers(size));
* deals with a now fixed bug where compressedLength was missing a few bytes.
// Chunk variant that can read past the end of its buffers from the file stream `f`, to cope with
// files whose recorded chunk length is a few bytes short (see the comments in the methods).
// NOTE(review): this class appears truncated by extraction — closing braces and some statements
// are missing, and two lines have string fragments fused onto them.
private class WorkaroundChunk extends Chunk {
private final SeekableInputStream f;
* @param descriptor the descriptor of the chunk
* @param f the file stream positioned at the end of this chunk
private WorkaroundChunk(ChunkDescriptor descriptor, List buffers, SeekableInputStream f, OffsetIndex offsetIndex) {
super(descriptor, buffers, offsetIndex);
this.f = f;
// tries the header from the buffered bytes first; on failure retries across buffer + file stream
protected PageHeader readPageHeader() throws IOException {
PageHeader pageHeader;
stream.mark(8192); // headers should not be larger than 8k
try {
pageHeader = Util.readPageHeader(stream);
} catch (IOException e) {
// this is to workaround a bug where the compressedLength
// of the chunk is missing the size of the header of the dictionary
// to allow reading older files (using dictionary) we need this.
// usually 13 to 19 bytes are missing
// if the last page is smaller than this, the page header itself is truncated in the buffer.
stream.reset(); // resetting the buffer to the position before we got the error"completing the column chunk to read the page header");
pageHeader = Util.readPageHeader(new SequenceInputStream(stream, f)); // trying again from the buffer + remainder of the stream.
return pageHeader;
// when more bytes are requested than the buffers hold, the shortfall is handled here;
// otherwise the call is delegated to Chunk.readAsBytesInput
public BytesInput readAsBytesInput(int size) throws IOException {
int available = stream.available();
if (size > available) {
// this is to workaround a bug where the compressedLength
// of the chunk is missing the size of the header of the dictionary
// to allow reading older files (using dictionary) we need this.
// usually 13 to 19 bytes are missing
// NOTE(review): a string fragment is fused onto the next line; presumably the tail of a log call
int missingBytes = size - available;"completed the column chunk with {} bytes", missingBytes);
List buffers = new ArrayList<>();
ByteBuffer lastBuffer = ByteBuffer.allocate(missingBytes);
// NOTE(review): nothing visible fills `lastBuffer` or adds anything to `buffers`, so as written
// an empty list is returned — presumably dropped lines; verify
return BytesInput.from(buffers);
return super.readAsBytesInput(size);
* Information needed to read a column chunk or a part of it.
private static class ChunkDescriptor {
private final ColumnDescriptor col;
private final ColumnChunkMetaData metadata;
private final long fileOffset;
private final int size;
* @param col column this chunk is part of
* @param metadata metadata for the column
* @param fileOffset offset in the file where this chunk starts
* @param size size of the chunk
private ChunkDescriptor(
ColumnDescriptor col,
ColumnChunkMetaData metadata,
long fileOffset,
int size) {
this.col = col;
this.metadata = metadata;
this.fileOffset = fileOffset;
this.size = size;
public int hashCode() {
return col.hashCode();
public boolean equals(Object obj) {
if (this == obj) {
return true;
} else if (obj instanceof ChunkDescriptor) {
return col.equals(((ChunkDescriptor) obj).col);
} else {
return false;
* Describes a list of consecutive parts to be read at once. A consecutive part may contain whole column chunks or
* only parts of them (some pages).
private class ConsecutivePartList {
private final long offset;
private int length;
private final List chunks = new ArrayList();
* @param offset where the first chunk starts
ConsecutivePartList(long offset) {
this.offset = offset;
* adds a chunk to the list.
* It must be consecutive to the previous chunk
* @param descriptor a chunk descriptor
public void addChunk(ChunkDescriptor descriptor) {
length += descriptor.size;
* @param f file to read the chunks from
* @param builder used to build chunk list to read the pages for the different columns
* @throws IOException if there is an error while reading from the stream
public void readAll(SeekableInputStream f, ChunkListBuilder builder) throws IOException {
List buffers = readBlocks(f, offset, length);
// report in a counter the data we just scanned
ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers);
for (int i = 0; i < chunks.size(); i++) {
ChunkDescriptor descriptor = chunks.get(i);
builder.add(descriptor, stream.sliceBuffers(descriptor.size), f);
* @return the position following the last byte of these chunks
public long endPos() {
return offset + length;
* @param f file to read the blocks from
* @return the ByteBuffer blocks
* @throws IOException if there is an error while reading from the stream
// Plans the buffers for `length` bytes in pieces no larger than options.getMaxAllocationSize().
// NOTE(review): this block appears truncated by extraction — the stray `;` after the opening brace
// and the three empty bodies below suggest that the positioning of `f`, the buffer allocations and
// the read loop body were dropped; as written the returned list is empty.
List readBlocks(SeekableInputStream f, long offset, int length) throws IOException {;
// number of maximum-size buffers, plus one smaller buffer for any remainder
int fullAllocations = length / options.getMaxAllocationSize();
int lastAllocationSize = length % options.getMaxAllocationSize();
int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);
List buffers = new ArrayList<>(numAllocations);
// NOTE(review): loop body missing; presumably adds one maximum-size buffer per iteration
for (int i = 0; i < fullAllocations; i++) {
// NOTE(review): body missing; presumably adds the remainder-size buffer
if (lastAllocationSize > 0) {
// NOTE(review): loop body missing; presumably fills each buffer from `f`
for (ByteBuffer buffer : buffers) {
return buffers;
/**
 * Reads the pages of a single column of the given row group, if the row group contains a chunk
 * for the column's path.
 *
 * @param blockIndex index of the row group in {@code blocks}
 * @param columnDescriptor the column to read
 * @return the result of {@code readChunk} for the matching chunk, wrapped in an Optional
 */
Optional readColumnInBlock(int blockIndex, ColumnDescriptor columnDescriptor) {
BlockMetaData block = blocks.get(blockIndex);
if (block.getRowCount() == 0) {
throw new RuntimeException("Illegal row group of 0 rows");
Optional mc = findColumnByPath(block, columnDescriptor.getPath());
// NOTE(review): the start of this expression is missing (the receiver and the opening of the
// first mapping lambda, presumably over `mc`); the descriptor covers the whole chunk, from its
// starting position with its total size narrowed to int
return -> new ChunkDescriptor(columnDescriptor, column, column.getStartingPos(), (int) column.getTotalSize()))
.map(chunk -> readChunk(f, chunk));
private ColumnChunkPageReader readChunk(SeekableInputStream f, ChunkDescriptor descriptor) {
try {
List buffers = readBlocks(f, descriptor.fileOffset, descriptor.size);
ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers);
Chunk chunk = new WorkaroundChunk(descriptor, stream.sliceBuffers(descriptor.size), f, null);
return chunk.readAllPages();
} catch (IOException e) {
throw new RuntimeException(e);
private Optional findColumnByPath(BlockMetaData block, String[] path) {
for (ColumnChunkMetaData column : block.getColumns()) {
if (Arrays.equals(column.getPath().toArray(), path)) {
return Optional.of(column);
return Optional.empty();
/**
 * @return the number of entries in {@code blocks}
 */
public int blocksCount() {
return blocks.size();
/**
 * @param blockIndex index into {@code blocks}
 * @return the metadata stored at that index
 */
public BlockMetaData getBlockMetaData(int blockIndex) {
return blocks.get(blockIndex);