io.trino.parquet.reader.TrinoColumnIndexStore
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.parquet.reader;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
import io.trino.parquet.ChunkedInputStream;
import io.trino.parquet.DiskRange;
import io.trino.parquet.ParquetDataSource;
import io.trino.parquet.ParquetReaderOptions;
import io.trino.parquet.metadata.BlockMetadata;
import io.trino.parquet.metadata.ColumnChunkMetadata;
import io.trino.parquet.metadata.IndexReference;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import jakarta.annotation.Nullable;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.format.Util;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.schema.PrimitiveType;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiFunction;

import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext;
import static io.trino.parquet.ParquetMetadataConverter.fromParquetColumnIndex;
import static io.trino.parquet.ParquetMetadataConverter.fromParquetOffsetIndex;
import static java.util.Objects.requireNonNull;

/**
 * Internal implementation of {@link ColumnIndexStore}.
 * Similar to org.apache.parquet.hadoop.ColumnIndexStoreImpl, which is not accessible outside its package.
 */
public class TrinoColumnIndexStore
        implements ColumnIndexStore
{
    private final ParquetDataSource dataSource;
    private final List<ColumnIndexMetadata> columnIndexReferences;
    private final List<ColumnIndexMetadata> offsetIndexReferences;

    @Nullable
    private Map<ColumnPath, ColumnIndex> columnIndexStore;
    @Nullable
    private Map<ColumnPath, OffsetIndex> offsetIndexStore;

    /**
     * Creates a column index store which lazily reads column/offset indexes for the given columns.
     *
     * @param columnsRead the set of columns used for projection
     * @param columnsFiltered the set of columns used for filtering
     */
    public TrinoColumnIndexStore(
            ParquetDataSource dataSource,
            BlockMetadata block,
            Set<ColumnPath> columnsRead,
            Set<ColumnPath> columnsFiltered)
    {
        this.dataSource = requireNonNull(dataSource, "dataSource is null");
        requireNonNull(block, "block is null");
        requireNonNull(columnsRead, "columnsRead is null");
        requireNonNull(columnsFiltered, "columnsFiltered is null");

        ImmutableList.Builder<ColumnIndexMetadata> columnIndexBuilder = ImmutableList.builderWithExpectedSize(columnsFiltered.size());
        ImmutableList.Builder<ColumnIndexMetadata> offsetIndexBuilder = ImmutableList.builderWithExpectedSize(columnsRead.size());
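        // Keep index references only for the columns that are actually filtered on or read,
        // so the lazy loads below fetch the minimum number of index ranges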
        for (ColumnChunkMetadata column : block.columns()) {
            ColumnPath path = column.getPath();
            if (column.getColumnIndexReference() != null && columnsFiltered.contains(path)) {
                columnIndexBuilder.add(new ColumnIndexMetadata(
                        column.getColumnIndexReference(),
                        path,
                        column.getPrimitiveType()));
            }
            if (column.getOffsetIndexReference() != null && columnsRead.contains(path)) {
                offsetIndexBuilder.add(new ColumnIndexMetadata(
                        column.getOffsetIndexReference(),
                        path,
                        column.getPrimitiveType()));
            }
        }
        this.columnIndexReferences = columnIndexBuilder.build();
        this.offsetIndexReferences = offsetIndexBuilder.build();
    }

    @Override
    public ColumnIndex getColumnIndex(ColumnPath column)
    {
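        // Lazily load on first access: one planned read fetches the column indexes of all filtered columns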
        if (columnIndexStore == null) {
            columnIndexStore = loadIndexes(dataSource, columnIndexReferences, (inputStream, columnMetadata) -> {
                try {
                    return fromParquetColumnIndex(columnMetadata.getPrimitiveType(), Util.readColumnIndex(inputStream));
                }
                catch (IOException e) {
                    throw new RuntimeException(e);
                }
            });
        }

        return columnIndexStore.get(column);
    }

    @Override
    public OffsetIndex getOffsetIndex(ColumnPath column)
    {
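        // Lazily load on first access: one planned read fetches the offset indexes of all read columns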
        if (offsetIndexStore == null) {
            offsetIndexStore = loadIndexes(dataSource, offsetIndexReferences, (inputStream, columnMetadata) -> {
                try {
                    return fromParquetOffsetIndex(Util.readOffsetIndex(inputStream));
                }
                catch (IOException e) {
                    throw new RuntimeException(e);
                }
            });
        }

        return offsetIndexStore.get(column);
    }

    public static Optional<ColumnIndexStore> getColumnIndexStore(
            ParquetDataSource dataSource,
            BlockMetadata blockMetadata,
            Map<List<String>, ColumnDescriptor> descriptorsByPath,
            TupleDomain<ColumnDescriptor> parquetTupleDomain,
            ParquetReaderOptions options)
    {
        if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) {
            return Optional.empty();
        }

        boolean hasColumnIndex = false;
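        // Page-level filtering needs both a column index and its matching offset index,
        // so bail out unless at least one column chunk carries both references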
        for (ColumnChunkMetadata column : blockMetadata.columns()) {
            if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) {
                hasColumnIndex = true;
                break;
            }
        }

        if (!hasColumnIndex) {
            return Optional.empty();
        }

        Set<ColumnPath> columnsReadPaths = new HashSet<>(descriptorsByPath.size());
        for (List<String> path : descriptorsByPath.keySet()) {
            columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0])));
        }

        Map<ColumnDescriptor, Domain> parquetDomains = parquetTupleDomain.getDomains()
                .orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains"));
        Set<ColumnPath> columnsFilteredPaths = parquetDomains.keySet().stream()
                .map(column -> ColumnPath.get(column.getPath()))
                .collect(toImmutableSet());

        return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths));
    }

    private static <T> Map<ColumnPath, T> loadIndexes(
            ParquetDataSource dataSource,
            List<ColumnIndexMetadata> indexMetadata,
            BiFunction<InputStream, ColumnIndexMetadata, T> deserializer)
    {
        // Merge multiple small reads of the file for indexes stored close to each other
        ListMultimap<ColumnPath, DiskRange> ranges = ArrayListMultimap.create(indexMetadata.size(), 1);
        for (ColumnIndexMetadata column : indexMetadata) {
            ranges.put(column.getPath(), column.getDiskRange());
        }

        Map<ColumnPath, ChunkedInputStream> columnInputStreams = dataSource.planRead(ranges, newSimpleAggregatedMemoryContext());
        try {
            return indexMetadata.stream()
                    .collect(toImmutableMap(
                            ColumnIndexMetadata::getPath,
                            column -> deserializer.apply(columnInputStreams.get(column.getPath()), column)));
        }
        finally {
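            // Release the buffered index bytes even if deserialization failed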
            columnInputStreams.values().forEach(ChunkedInputStream::close);
        }
    }

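    /**
     * Immutable holder pairing a column path and its primitive type with the on-disk location of an index.
     */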
    private static class ColumnIndexMetadata
    {
        private final DiskRange diskRange;
        private final ColumnPath path;
        private final PrimitiveType primitiveType;

        private ColumnIndexMetadata(IndexReference indexReference, ColumnPath path, PrimitiveType primitiveType)
        {
            requireNonNull(indexReference, "indexReference is null");
            this.diskRange = new DiskRange(indexReference.getOffset(), indexReference.getLength());
            this.path = requireNonNull(path, "path is null");
            this.primitiveType = requireNonNull(primitiveType, "primitiveType is null");
        }

        private DiskRange getDiskRange()
        {
            return diskRange;
        }

        private ColumnPath getPath()
        {
            return path;
        }

        private PrimitiveType getPrimitiveType()
        {
            return primitiveType;
        }
    }
}
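
For reference, here is a minimal usage sketch (not part of the Trino sources). It assumes the caller already holds the dataSource, blockMetadata, descriptorsByPath, parquetTupleDomain and options objects that the surrounding Parquet reader normally provides; the example class and method names are hypothetical.

import io.trino.parquet.ParquetDataSource;
import io.trino.parquet.ParquetReaderOptions;
import io.trino.parquet.metadata.BlockMetadata;
import io.trino.parquet.reader.TrinoColumnIndexStore;
import io.trino.spi.predicate.TupleDomain;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;

import java.util.List;
import java.util.Map;
import java.util.Optional;

class ColumnIndexStoreExample
{
    // Hypothetical helper: every parameter is assumed to come from the enclosing reader
    static Optional<OffsetIndex> offsetIndexFor(
            ParquetDataSource dataSource,
            BlockMetadata blockMetadata,
            Map<List<String>, ColumnDescriptor> descriptorsByPath,
            TupleDomain<ColumnDescriptor> parquetTupleDomain,
            ParquetReaderOptions options,
            ColumnPath column)
    {
        // Empty unless column indexes are enabled, the predicate is non-trivial,
        // and the row group carries both index kinds for at least one column
        Optional<ColumnIndexStore> store = TrinoColumnIndexStore.getColumnIndexStore(
                dataSource, blockMetadata, descriptorsByPath, parquetTupleDomain, options);

        // The offset indexes are read from disk on this first access and cached for later calls
        return store.map(s -> s.getOffsetIndex(column));
    }
}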



