/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.parquet.reader;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import io.trino.parquet.DataPage;
import io.trino.parquet.DataPageV1;
import io.trino.parquet.DataPageV2;
import io.trino.parquet.DictionaryPage;
import io.trino.parquet.Page;
import io.trino.parquet.ParquetDataSourceId;
import io.trino.parquet.metadata.ColumnChunkMetadata;
import jakarta.annotation.Nullable;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.format.CompressionCodec;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;

import java.io.IOException;
import java.util.Iterator;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static io.trino.parquet.ParquetCompressionUtils.decompress;
import static io.trino.parquet.ParquetReaderUtils.isOnlyDictionaryEncodingPages;
import static java.util.Objects.requireNonNull;

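/**
 * Reads dictionary and data pages from a single Parquet column chunk,
 * decompressing page contents when the chunk uses a compression codec.
 *
 * <p>A minimal usage sketch (illustrative only; actual callers inside Trino's
 * column readers drive this differently), assuming a reader obtained from
 * {@link #createPageReader}:
 * <pre>{@code
 * DictionaryPage dictionary = pageReader.readDictionaryPage(); // may be null; must precede data pages
 * DataPage page;
 * while ((page = pageReader.readPage()) != null) {
 *     // decode values from the uncompressed page
 * }
 * }</pre>
 */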
public final class PageReader
{
    private final ParquetDataSourceId dataSourceId;
    private final CompressionCodec codec;
    private final boolean hasOnlyDictionaryEncodedPages;
    private final boolean hasNoNulls;
    private final PeekingIterator<Page> compressedPages;

    private boolean dictionaryAlreadyRead;
    private int dataPageReadCount;

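    /**
     * Creates a {@code PageReader} for the given column chunk. The no-nulls and
     * dictionary-only-encoding hints are derived from the column chunk metadata.
     */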
    public static PageReader createPageReader(
            ParquetDataSourceId dataSourceId,
            ChunkedInputStream columnChunk,
            ColumnChunkMetadata metadata,
            ColumnDescriptor columnDescriptor,
            @Nullable OffsetIndex offsetIndex,
            Optional<String> fileCreatedBy)
    {
        // Parquet schema may specify a column definition as OPTIONAL even though there are no nulls in the actual data.
        // Row-group column statistics can be used to identify such cases and switch to faster non-nullable read
        // paths in FlatColumnReader.
        Statistics<?> columnStatistics = metadata.getStatistics();
        boolean hasNoNulls = columnStatistics != null && columnStatistics.getNumNulls() == 0;
        boolean hasOnlyDictionaryEncodedPages = isOnlyDictionaryEncodingPages(metadata);
        ParquetColumnChunkIterator compressedPages = new ParquetColumnChunkIterator(
                dataSourceId,
                fileCreatedBy,
                columnDescriptor,
                metadata,
                columnChunk,
                offsetIndex);

        return new PageReader(
                dataSourceId,
                metadata.getCodec().getParquetCompressionCodec(),
                compressedPages,
                hasOnlyDictionaryEncodedPages,
                hasNoNulls);
    }

    @VisibleForTesting
    public PageReader(
            ParquetDataSourceId dataSourceId,
            CompressionCodec codec,
            Iterator<? extends Page> compressedPages,
            boolean hasOnlyDictionaryEncodedPages,
            boolean hasNoNulls)
    {
        this.dataSourceId = requireNonNull(dataSourceId, "dataSourceId is null");
        this.codec = codec;
        this.compressedPages = Iterators.peekingIterator(compressedPages);
        this.hasOnlyDictionaryEncodedPages = hasOnlyDictionaryEncodedPages;
        this.hasNoNulls = hasNoNulls;
    }

    public boolean hasNoNulls()
    {
        return hasNoNulls;
    }

    public boolean hasOnlyDictionaryEncodedPages()
    {
        return hasOnlyDictionaryEncodedPages;
    }

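    /**
     * Returns the next data page with its contents decompressed, or {@code null}
     * when the column chunk has no more pages.
     */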
    public DataPage readPage()
    {
        if (!compressedPages.hasNext()) {
            return null;
        }
        Page compressedPage = compressedPages.next();
        checkState(compressedPage instanceof DataPage, "Found page %s instead of a DataPage", compressedPage);
        dataPageReadCount++;
        try {
            if (compressedPage instanceof DataPageV1 dataPageV1) {
                if (!arePagesCompressed()) {
                    return dataPageV1;
                }
                return new DataPageV1(
                        decompress(dataSourceId, codec, dataPageV1.getSlice(), dataPageV1.getUncompressedSize()),
                        dataPageV1.getValueCount(),
                        dataPageV1.getUncompressedSize(),
                        dataPageV1.getFirstRowIndex(),
                        dataPageV1.getRepetitionLevelEncoding(),
                        dataPageV1.getDefinitionLevelEncoding(),
                        dataPageV1.getValueEncoding());
            }
            DataPageV2 dataPageV2 = (DataPageV2) compressedPage;
            if (!dataPageV2.isCompressed()) {
                return dataPageV2;
            }
            int uncompressedSize = dataPageV2.getUncompressedSize()
                    - dataPageV2.getDefinitionLevels().length()
                    - dataPageV2.getRepetitionLevels().length();
            return new DataPageV2(
                    dataPageV2.getRowCount(),
                    dataPageV2.getNullCount(),
                    dataPageV2.getValueCount(),
                    dataPageV2.getRepetitionLevels(),
                    dataPageV2.getDefinitionLevels(),
                    dataPageV2.getDataEncoding(),
                    decompress(dataSourceId, codec, dataPageV2.getSlice(), uncompressedSize),
                    dataPageV2.getUncompressedSize(),
                    dataPageV2.getFirstRowIndex(),
                    dataPageV2.getStatistics(),
                    false);
        }
        catch (IOException e) {
            throw new RuntimeException("Could not decompress page", e);
        }
    }

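    /**
     * Returns the decompressed dictionary page, or {@code null} if the column chunk
     * does not start with one. May be called at most once, and only before any data
     * page has been read.
     */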
    public DictionaryPage readDictionaryPage()
    {
        checkState(!dictionaryAlreadyRead, "Dictionary was already read");
        checkState(dataPageReadCount == 0, "Dictionary has to be read first but %s data pages were read already", dataPageReadCount);
        dictionaryAlreadyRead = true;
        if (!(compressedPages.peek() instanceof DictionaryPage)) {
            return null;
        }
        try {
            DictionaryPage compressedDictionaryPage = (DictionaryPage) compressedPages.next();
            return new DictionaryPage(
                    decompress(dataSourceId, codec, compressedDictionaryPage.getSlice(), compressedDictionaryPage.getUncompressedSize()),
                    compressedDictionaryPage.getDictionarySize(),
                    compressedDictionaryPage.getEncoding());
        }
        catch (IOException e) {
            throw new RuntimeException("Error reading dictionary page", e);
        }
    }

    public boolean hasNext()
    {
        return compressedPages.hasNext();
    }

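    /**
     * Peeks at the next data page without advancing the iterator; the dictionary
     * page must already have been read. The returned page is still compressed.
     */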
    public DataPage getNextPage()
    {
        verifyDictionaryPageRead();

        return (DataPage) compressedPages.peek();
    }

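    /**
     * Skips over the next page without decompressing it; the dictionary page must
     * already have been read.
     */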
    public void skipNextPage()
    {
        verifyDictionaryPageRead();
        compressedPages.next();
    }

    public boolean arePagesCompressed()
    {
        return codec != CompressionCodec.UNCOMPRESSED;
    }

    private void verifyDictionaryPageRead()
    {
        checkArgument(dictionaryAlreadyRead, "Dictionary has to be read first");
    }
}