/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.parquet.reader;

import io.airlift.log.Logger;
import io.airlift.slice.Slice;
import io.trino.parquet.DictionaryPage;
import io.trino.parquet.ParquetEncoding;
import io.trino.parquet.PrimitiveField;
import io.trino.parquet.reader.decoders.ValueDecoder;
import io.trino.parquet.reader.flat.ColumnAdapter;
import io.trino.parquet.reader.flat.DictionaryDecoder;
import io.trino.parquet.reader.flat.RowRangesIterator;
import io.trino.spi.block.Block;
import io.trino.spi.block.DictionaryBlock;
import io.trino.spi.type.AbstractVariableWidthType;
import io.trino.spi.type.DateType;
import io.trino.spi.type.Type;
import jakarta.annotation.Nullable;
import org.apache.parquet.io.ParquetDecodingException;

import java.util.Optional;
import java.util.OptionalLong;

import static io.trino.parquet.ParquetEncoding.PLAIN_DICTIONARY;
import static io.trino.parquet.ParquetEncoding.RLE_DICTIONARY;
import static io.trino.parquet.reader.decoders.ValueDecoder.ValueDecodersProvider;
import static io.trino.parquet.reader.flat.DictionaryDecoder.DictionaryDecoderProvider;
import static io.trino.parquet.reader.flat.RowRangesIterator.createRowRangesIterator;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;

public abstract class AbstractColumnReader<BufferType>
implements ColumnReader
{
private static final Logger log = Logger.get(AbstractColumnReader.class);
protected final PrimitiveField field;
protected final ValueDecodersProvider<BufferType> decodersProvider;
protected final ColumnAdapter<BufferType> columnAdapter;
private final DictionaryDecoderProvider<BufferType> dictionaryDecoderProvider;
protected PageReader pageReader;
protected RowRangesIterator rowRanges;
@Nullable
protected DictionaryDecoder<BufferType> dictionaryDecoder;
private boolean produceDictionaryBlock;
public AbstractColumnReader(
PrimitiveField field,
ValueDecodersProvider<BufferType> decodersProvider,
DictionaryDecoderProvider<BufferType> dictionaryDecoderProvider,
ColumnAdapter<BufferType> columnAdapter)
{
this.field = requireNonNull(field, "field is null");
this.decodersProvider = requireNonNull(decodersProvider, "decoders is null");
this.dictionaryDecoderProvider = requireNonNull(dictionaryDecoderProvider, "dictionaryDecoderProvider is null");
this.columnAdapter = requireNonNull(columnAdapter, "columnAdapter is null");
}
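// Binds this reader to a new column chunk. The dictionary page, if present, is decoded eagerly so
// that subsequent data pages can reference it, and the reader records whether the chunk is
// eligible for producing DictionaryBlocks rather than flat blocks.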
@Override
public void setPageReader(PageReader pageReader, Optional<FilteredRowRanges> rowRanges)
{
this.pageReader = requireNonNull(pageReader, "pageReader");
// The dictionary page must be placed at the first position of the column chunk
// if it is partly or completely dictionary encoded. At most one dictionary page
// can be placed in a column chunk.
DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
// For dictionary based encodings - https://github.com/apache/parquet-format/blob/master/Encodings.md
if (dictionaryPage != null) {
log.debug("field %s, readDictionaryPage %s", field, dictionaryPage);
dictionaryDecoder = dictionaryDecoderProvider.create(dictionaryPage, isNonNull());
produceDictionaryBlock = shouldProduceDictionaryBlock(rowRanges);
}
this.rowRanges = createRowRangesIterator(rowRanges);
}
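// Reports whether this reader handles a column that cannot contain nulls. The flag is passed to
// the dictionary decoder provider above; presumably it determines whether the dictionary needs to
// reserve an extra id for null values.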
protected abstract boolean isNonNull();
protected boolean produceDictionaryBlock()
{
return produceDictionaryBlock;
}
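// Selects the decoder for a single data page: dictionary-encoded pages reuse the column-chunk-wide
// dictionary decoder created in setPageReader, while any other encoding gets a fresh decoder from
// the provider. In both cases the decoder is initialized to read from the given page data.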
protected ValueDecoder<BufferType> createValueDecoder(ValueDecodersProvider<BufferType> decodersProvider, ParquetEncoding encoding, Slice data)
{
ValueDecoder<BufferType> valueDecoder;
if (encoding == PLAIN_DICTIONARY || encoding == RLE_DICTIONARY) {
if (dictionaryDecoder == null) {
throw new ParquetDecodingException(format("Dictionary is missing for %s", field));
}
valueDecoder = dictionaryDecoder;
}
else {
valueDecoder = decodersProvider.create(encoding);
}
valueDecoder.init(new SimpleSliceInputStream(data));
return valueDecoder;
}
protected static void throwEndOfBatchException(int remainingInBatch)
{
throw new ParquetDecodingException(format("Corrupted Parquet file: extra %d values to be consumed when scanning current batch", remainingInBatch));
}
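// Spreads a dense array of dictionary ids (one per non-null value) into a destination chunk with
// one slot per position: null positions receive nullId, non-null positions consume the next id
// from source. For example, with isNull = [false, true, false] and source = [7, 9] the chunk
// becomes [7, nullId, 9].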
protected static void unpackDictionaryNullId(
int[] source,
int[] destination,
boolean[] isNull,
int destOffset,
int chunkSize,
int nullId)
{
int srcOffset = 0;
for (int i = destOffset; i < destOffset + chunkSize; i++) {
if (isNull[i]) {
destination[i] = nullId;
}
else {
destination[i] = source[srcOffset++];
}
}
}
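// Wraps the decoded dictionary ids and the dictionary itself into a Trino DictionaryBlock, so the
// actual values are looked up lazily instead of being materialized for every position.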
protected static ColumnChunk createDictionaryBlock(int[] dictionaryIds, Block dictionary, int[] definitions, int[] repetitions)
{
int positionsCount = dictionaryIds.length;
return new ColumnChunk(
DictionaryBlock.create(positionsCount, dictionary, dictionaryIds),
definitions,
repetitions,
OptionalLong.of(getMaxDictionaryBlockSize(dictionary, positionsCount)));
}
private boolean shouldProduceDictionaryBlock(Optional<FilteredRowRanges> filteredRowRanges)
{
// A Parquet writer may fall back to a non-dictionary encoding after starting with dictionary encoding if
// 1. the size of the dictionary exceeds a threshold (1MB for parquet-mr by default), or
// 2. the number of dictionary entries exceeds a threshold (Integer.MAX_VALUE for parquet-mr by default).
// Trino dictionary blocks are produced only when the entire column chunk is dictionary encoded.
if (pageReader.hasOnlyDictionaryEncodedPages()) {
if (!shouldProduceDictionaryForType(field.getType())) {
return false;
}
requireNonNull(dictionaryDecoder, "dictionaryDecoder is null");
// Filtering of parquet pages using column indexes may result in the total number of values read from the
// column chunk being lower than the size of the dictionary
return filteredRowRanges.map(rowRanges -> rowRanges.getRowCount() > dictionaryDecoder.getDictionarySize())
.orElse(true);
}
return false;
}
static boolean shouldProduceDictionaryForType(Type type)
{
// TODO: DictionaryBlocks are currently restricted to variable width and date types where dictionary processing is most beneficial.
// Dictionary processing for other data types can be enabled after validating improvements on benchmarks.
return type instanceof AbstractVariableWidthType || type instanceof DateType;
}
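// Rough upper bound: the ids contribute Integer.BYTES per position and the dictionary contributes
// at most the fraction of its entries that a batch of this size could reference. For example, a
// 1024-row batch over a 2048-entry, 65536-byte dictionary is estimated at
// 1024 * 4 + 65536 * (1024.0 / 2048) = 36864 bytes.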
private static long getMaxDictionaryBlockSize(Block dictionary, long batchSize)
{
// An approximate upper bound on size of DictionaryBlock is derived here instead of using
// DictionaryBlock#getSizeInBytes directly because that method is expensive
double maxDictionaryFractionUsed = Math.min((double) batchSize / dictionary.getPositionCount(), 1.0);
return (long) (batchSize * Integer.BYTES + dictionary.getSizeInBytes() * maxDictionaryFractionUsed);
}
}