
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc;
import com.facebook.presto.common.RuntimeStats;
import com.facebook.presto.orc.checkpoint.InvalidCheckpointException;
import com.facebook.presto.orc.checkpoint.StreamCheckpoint;
import com.facebook.presto.orc.metadata.ColumnEncoding;
import com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import com.facebook.presto.orc.metadata.DwrfSequenceEncoding;
import com.facebook.presto.orc.metadata.MetadataReader;
import com.facebook.presto.orc.metadata.OrcType;
import com.facebook.presto.orc.metadata.OrcType.OrcTypeKind;
import com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion;
import com.facebook.presto.orc.metadata.RowGroupIndex;
import com.facebook.presto.orc.metadata.Stream;
import com.facebook.presto.orc.metadata.Stream.StreamKind;
import com.facebook.presto.orc.metadata.StripeEncryptionGroup;
import com.facebook.presto.orc.metadata.StripeFooter;
import com.facebook.presto.orc.metadata.StripeInformation;
import com.facebook.presto.orc.metadata.statistics.ColumnStatistics;
import com.facebook.presto.orc.metadata.statistics.HiveBloomFilter;
import com.facebook.presto.orc.stream.InputStreamSource;
import com.facebook.presto.orc.stream.InputStreamSources;
import com.facebook.presto.orc.stream.OrcInputStream;
import com.facebook.presto.orc.stream.SharedBuffer;
import com.facebook.presto.orc.stream.ValueInputStream;
import com.facebook.presto.orc.stream.ValueInputStreamSource;
import com.facebook.presto.orc.stream.ValueStreams;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import io.airlift.slice.Slice;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
import static com.facebook.presto.orc.NoopOrcLocalMemoryContext.NOOP_ORC_LOCAL_MEMORY_CONTEXT;
import static com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint;
import static com.facebook.presto.orc.checkpoint.Checkpoints.getStreamCheckpoints;
import static com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY;
import static com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2;
import static com.facebook.presto.orc.metadata.DwrfMetadataReader.toStripeEncryptionGroup;
import static com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.STRUCT;
import static com.facebook.presto.orc.metadata.Stream.StreamArea.INDEX;
import static com.facebook.presto.orc.metadata.Stream.StreamKind.BLOOM_FILTER;
import static com.facebook.presto.orc.metadata.Stream.StreamKind.DICTIONARY_DATA;
import static com.facebook.presto.orc.metadata.Stream.StreamKind.LENGTH;
import static com.facebook.presto.orc.metadata.Stream.StreamKind.ROW_INDEX;
import static com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics;
import static com.facebook.presto.orc.stream.CheckpointInputStreamSource.createCheckpointStreamSource;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.Iterables.getOnlyElement;
import static java.lang.Math.multiplyExact;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;
public class StripeReader
{
    private final OrcDataSource orcDataSource;
    private final Optional<OrcDecompressor> decompressor;
    private final List<OrcType> types;
    private final HiveWriterVersion hiveWriterVersion;
    private final Set<Integer> includedOrcColumns;
    private final int rowsInRowGroup;
    private final OrcPredicate predicate;
    private final MetadataReader metadataReader;
    private final Optional<OrcWriteValidation> writeValidation;
    private final StripeMetadataSource stripeMetadataSource;
    private final boolean cacheable;
    private final Multimap<Integer, Integer> dwrfEncryptionGroupColumns;
    private final RuntimeStats runtimeStats;
    private final Optional<OrcFileIntrospector> fileIntrospector;
public StripeReader(
OrcDataSource orcDataSource,
            Optional<OrcDecompressor> decompressor,
            List<OrcType> types,
            Set<Integer> includedOrcColumns,
            int rowsInRowGroup,
            OrcPredicate predicate,
            HiveWriterVersion hiveWriterVersion,
            MetadataReader metadataReader,
            Optional<OrcWriteValidation> writeValidation,
            StripeMetadataSource stripeMetadataSource,
            boolean cacheable,
            Map<Integer, Integer> dwrfEncryptionGroupMap,
            RuntimeStats runtimeStats,
            Optional<OrcFileIntrospector> fileIntrospector)
{
this.orcDataSource = requireNonNull(orcDataSource, "orcDataSource is null");
this.decompressor = requireNonNull(decompressor, "decompressor is null");
this.types = ImmutableList.copyOf(requireNonNull(types, "types is null"));
        this.includedOrcColumns = requireNonNull(includedOrcColumns, "includedOrcColumns is null");
this.rowsInRowGroup = rowsInRowGroup;
this.predicate = requireNonNull(predicate, "predicate is null");
this.hiveWriterVersion = requireNonNull(hiveWriterVersion, "hiveWriterVersion is null");
this.metadataReader = requireNonNull(metadataReader, "metadataReader is null");
this.writeValidation = requireNonNull(writeValidation, "writeValidation is null");
this.stripeMetadataSource = requireNonNull(stripeMetadataSource, "stripeMetadataSource is null");
this.cacheable = cacheable;
this.dwrfEncryptionGroupColumns = invertEncryptionGroupMap(requireNonNull(dwrfEncryptionGroupMap, "dwrfEncryptionGroupMap is null"));
this.runtimeStats = requireNonNull(runtimeStats, "runtimeStats is null");
this.fileIntrospector = requireNonNull(fileIntrospector, "fileIntrospector is null");
}
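    /**
     * Inverts the node-to-encryption-group map into a group-to-nodes multimap.
     * Node ids that do not belong to any encryption group are collected under
     * the sentinel group id -1.
     */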
    private Multimap<Integer, Integer> invertEncryptionGroupMap(Map<Integer, Integer> dwrfEncryptionGroupMap)
    {
        ImmutableMultimap.Builder<Integer, Integer> invertedMapBuilder = ImmutableMultimap.builder();
        for (Entry<Integer, Integer> entry : dwrfEncryptionGroupMap.entrySet()) {
invertedMapBuilder.put(entry.getValue(), entry.getKey());
}
for (int i = 0; i < types.size(); i++) {
if (!dwrfEncryptionGroupMap.containsKey(i)) {
invertedMapBuilder.put(-1, i);
}
}
return invertedMapBuilder.build();
}
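    /**
     * Reads the stripe footer, prunes streams down to the included columns, selects the
     * row groups that match the predicate, and assembles the value and dictionary stream
     * sources for the stripe. Returns null when every row group is eliminated by the
     * predicate.
     *
     * <p>A minimal usage sketch, assuming a configured {@code StripeReader reader}, the
     * file footer, a memory context, optional decryptors, and a shared decompression
     * buffer are already in hand (all of these names are illustrative):
     * <pre>{@code
     * for (StripeInformation stripeInfo : footer.getStripes()) {
     *     Stripe stripe = reader.readStripe(stripeInfo, memoryUsage, decryptors, sharedBuffer);
     *     if (stripe == null) {
     *         continue; // all row groups were eliminated by the predicate
     *     }
     *     for (RowGroup rowGroup : stripe.getRowGroups()) {
     *         // create column readers from rowGroup.getStreamSources()
     *     }
     * }
     * }</pre>
     */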
public Stripe readStripe(
StripeInformation stripe,
OrcAggregatedMemoryContext systemMemoryUsage,
            Optional<DwrfEncryptionInfo> decryptors,
SharedBuffer sharedDecompressionBuffer)
throws IOException
{
StripeId stripeId = new StripeId(orcDataSource.getId(), stripe.getOffset());
// read the stripe footer
StripeFooter stripeFooter = readStripeFooter(stripeId, stripe, systemMemoryUsage);
fileIntrospector.ifPresent(introspector -> introspector.onStripeFooter(stripe, stripeFooter));
// get streams for selected columns
        List<List<Stream>> allStreams = new ArrayList<>();
        allStreams.add(stripeFooter.getStreams());
        Map<StreamId, Stream> includedStreams = new HashMap<>();
        boolean hasRowGroupDictionary = addIncludedStreams(stripeFooter.getColumnEncodings(), stripeFooter.getStreams(), includedStreams);
        Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
        Map<Integer, ColumnEncoding> stripeFooterEncodings = stripeFooter.getColumnEncodings();
columnEncodings.putAll(stripeFooterEncodings);
// included columns may be encrypted
if (decryptors.isPresent()) {
            List<Slice> encryptedEncryptionGroups = stripeFooter.getStripeEncryptionGroups();
for (Integer groupId : decryptors.get().getEncryptorGroupIds()) {
StripeEncryptionGroup stripeEncryptionGroup = getStripeEncryptionGroup(decryptors.get().getEncryptorByGroupId(groupId), encryptedEncryptionGroups.get(groupId), dwrfEncryptionGroupColumns.get(groupId), systemMemoryUsage);
allStreams.add(stripeEncryptionGroup.getStreams());
columnEncodings.putAll(stripeEncryptionGroup.getColumnEncodings());
boolean encryptedHasRowGroupDictionary = addIncludedStreams(stripeEncryptionGroup.getColumnEncodings(), stripeEncryptionGroup.getStreams(), includedStreams);
hasRowGroupDictionary = encryptedHasRowGroupDictionary || hasRowGroupDictionary;
}
}
// handle stripes with more than one row group or a dictionary
boolean invalidCheckPoint = false;
if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
// determine ranges of the stripe to read
            Map<StreamId, DiskRange> diskRanges = getDiskRanges(allStreams);
diskRanges = Maps.filterKeys(diskRanges, Predicates.in(includedStreams.keySet()));
// read the file regions
            Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);
            // read the row index for each column
            Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(includedStreams, streamsData, stripeId);
fileIntrospector.ifPresent(introspector -> introspector.onRowGroupIndexes(stripe, columnIndexes));
if (writeValidation.isPresent()) {
writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
}
// select the row groups matching the tuple domain
            Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
// if all row groups are skipped, return null
if (selectedRowGroups.isEmpty()) {
// set accounted memory usage to zero
systemMemoryUsage.close();
return null;
}
// value streams
            Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);
// build the dictionary streams
InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);
// build the row groups
try {
                List<RowGroup> rowGroups = createRowGroups(
stripe.getNumberOfRows(),
includedStreams,
valueStreams,
columnIndexes,
selectedRowGroups,
columnEncodings);
return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
}
catch (InvalidCheckpointException e) {
// The ORC file contains a corrupt checkpoint stream
// If the file does not have a row group dictionary, treat the stripe as a single row group. Otherwise,
// we must fail because the length of the row group dictionary is contained in the checkpoint stream.
if (hasRowGroupDictionary) {
throw new OrcCorruptionException(e, orcDataSource.getId(), "Checkpoints are corrupt");
}
invalidCheckPoint = true;
}
}
// stripe only has one row group and no dictionary
        ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
        for (Entry<StreamId, DiskRange> entry : getDiskRanges(allStreams).entrySet()) {
StreamId streamId = entry.getKey();
if (includedStreams.containsKey(streamId)) {
diskRangesBuilder.put(entry);
}
}
        ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();
// read the file regions
        Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);
        long totalBytes = 0;
        ImmutableMap.Builder<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.builder();
        for (Entry<StreamId, Stream> entry : includedStreams.entrySet()) {
            if (entry.getKey().getStreamKind() == ROW_INDEX) {
                List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, streamsData.get(entry.getKey()), null);
checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
if (columnStatistics.hasMinAverageValueSizeInBytes()) {
totalBytes += columnStatistics.getTotalValueSizeInBytes();
}
}
if (fileIntrospector.isPresent()) {
columnIndexes.put(entry.getKey(), rowGroupIndexes);
}
}
}
fileIntrospector.ifPresent(introspector -> introspector.onRowGroupIndexes(stripe, columnIndexes.build()));
// value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);
// build the dictionary streams
InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);
// build the row group
        ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
        for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
}
RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), totalBytes, new InputStreamSources(builder.build()));
return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
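    /**
     * Parses a StripeEncryptionGroup from its encrypted serialized form, decrypting with
     * the supplied decryptor (and decompressing when a decompressor is configured).
     */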
    private StripeEncryptionGroup getStripeEncryptionGroup(DwrfDataEncryptor decryptor, Slice encryptedGroup, Collection<Integer> columns, OrcAggregatedMemoryContext systemMemoryUsage)
throws IOException
{
OrcInputStream orcInputStream = new OrcInputStream(
orcDataSource.getId(),
// Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
encryptedGroup.getInput(),
decompressor,
Optional.of(decryptor),
systemMemoryUsage,
encryptedGroup.length());
return toStripeEncryptionGroup(orcDataSource.getId(), orcInputStream, types);
}
    /**
     * Adds the streams belonging to includedOrcColumns to the includedStreams map,
     * and returns whether any of those streams is backed by a row group dictionary.
     */
    private boolean addIncludedStreams(Map<Integer, ColumnEncoding> columnEncodings, List<Stream> streams, Map<StreamId, Stream> includedStreams)
{
boolean hasRowGroupDictionary = false;
for (Stream stream : streams) {
if (includedOrcColumns.contains(stream.getColumn())) {
includedStreams.put(new StreamId(stream), stream);
if (stream.getStreamKind() == StreamKind.IN_DICTIONARY) {
ColumnEncoding columnEncoding = columnEncodings.get(stream.getColumn());
if (columnEncoding.getColumnEncodingKind() == DICTIONARY) {
hasRowGroupDictionary = true;
}
                    Optional<SortedMap<Integer, DwrfSequenceEncoding>> additionalSequenceEncodings = columnEncoding.getAdditionalSequenceEncodings();
if (additionalSequenceEncodings.isPresent()
&& additionalSequenceEncodings.get().values().stream()
.map(DwrfSequenceEncoding::getValueEncoding)
.anyMatch(encoding -> encoding.getColumnEncodingKind() == DICTIONARY)) {
hasRowGroupDictionary = true;
}
}
}
}
return hasRowGroupDictionary;
}
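    /**
     * Fetches the given disk ranges from the data source and wraps each one in an
     * OrcInputStream that handles decompression and, when needed, DWRF decryption.
     */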
    private Map<StreamId, OrcInputStream> readDiskRanges(
            StripeId stripeId,
            Map<StreamId, DiskRange> diskRanges,
            OrcAggregatedMemoryContext systemMemoryUsage,
            Optional<DwrfEncryptionInfo> decryptors,
SharedBuffer sharedDecompressionBuffer)
throws IOException
{
//
// Note: this code does not use the Java 8 stream APIs to avoid any extra object allocation
//
// read ranges
        Map<StreamId, OrcDataSourceInput> streamsData = stripeMetadataSource.getInputs(orcDataSource, stripeId, diskRanges, cacheable);
        // transform streams to OrcInputStream
        ImmutableMap.Builder<StreamId, OrcInputStream> streamsBuilder = ImmutableMap.builder();
        for (Entry<StreamId, OrcDataSourceInput> entry : streamsData.entrySet()) {
            OrcDataSourceInput sourceInput = entry.getValue();
            Optional<DwrfDataEncryptor> dwrfDecryptor = createDwrfDecryptor(entry.getKey(), decryptors);
streamsBuilder.put(entry.getKey(), new OrcInputStream(
orcDataSource.getId(),
sharedDecompressionBuffer,
sourceInput.getInput(),
decompressor,
dwrfDecryptor,
systemMemoryUsage,
sourceInput.getRetainedSizeInBytes()));
}
return streamsBuilder.build();
}
    private Optional<DwrfDataEncryptor> createDwrfDecryptor(StreamId id, Optional<DwrfEncryptionInfo> decryptors)
{
if (!decryptors.isPresent()) {
return Optional.empty();
}
return decryptors.get().getEncryptorByNodeId(id.getColumn());
}
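    /**
     * Creates a typed ValueInputStream for every non-index, non-empty data stream,
     * based on the column's type and encoding.
     */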
    private Map<StreamId, ValueInputStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, Map<Integer, ColumnEncoding> columnEncodings)
    {
        ImmutableMap.Builder<StreamId, ValueInputStream<?>> valueStreams = ImmutableMap.builder();
        for (Entry<StreamId, Stream> entry : streams.entrySet()) {
StreamId streamId = entry.getKey();
Stream stream = entry.getValue();
ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn())
.getColumnEncoding(stream.getSequence())
.getColumnEncodingKind();
// skip index and empty streams
if (isIndexStream(stream) || stream.getLength() == 0) {
continue;
}
OrcInputStream inputStream = streamsData.get(streamId);
OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind();
valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts()));
}
return valueStreams.build();
}
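    /**
     * Builds stream sources for dictionary streams (DICTIONARY_DATA streams, and LENGTH
     * streams of dictionary-encoded columns). Each source is positioned at the dictionary
     * stream checkpoint, since the dictionary is shared by all row groups in the stripe.
     */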
    public InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, Map<Integer, ColumnEncoding> columnEncodings)
    {
        ImmutableMap.Builder<StreamId, InputStreamSource<?>> dictionaryStreamBuilder = ImmutableMap.builder();
        for (Entry<StreamId, Stream> entry : streams.entrySet()) {
StreamId streamId = entry.getKey();
Stream stream = entry.getValue();
int column = stream.getColumn();
// only process dictionary streams
ColumnEncodingKind columnEncoding = columnEncodings.get(column)
.getColumnEncoding(stream.getSequence())
.getColumnEncodingKind();
if (!isDictionary(stream, columnEncoding)) {
continue;
}
// skip streams without data
            ValueInputStream<?> valueStream = valueStreams.get(streamId);
if (valueStream == null) {
continue;
}
OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind();
StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding);
            InputStreamSource<?> streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint);
dictionaryStreamBuilder.put(streamId, streamSource);
}
return new InputStreamSources(dictionaryStreamBuilder.build());
}
    private List<RowGroup> createRowGroups(
            long rowsInStripe,
            Map<StreamId, Stream> streams,
            Map<StreamId, ValueInputStream<?>> valueStreams,
            Map<StreamId, List<RowGroupIndex>> columnIndexes,
            Set<Integer> selectedRowGroups,
            Map<Integer, ColumnEncoding> encodings)
            throws InvalidCheckpointException
    {
        ImmutableList.Builder<RowGroup> rowGroupBuilder = ImmutableList.builder();
        for (int rowGroupId : selectedRowGroups) {
            Map<StreamId, StreamCheckpoint> checkpoints = getStreamCheckpoints(includedOrcColumns, types, decompressor.isPresent(), rowGroupId, encodings, streams, columnIndexes);
rowGroupBuilder.add(createRowGroup(rowGroupId, rowsInStripe, rowsInRowGroup, columnIndexes, valueStreams, checkpoints));
}
return rowGroupBuilder.build();
}
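    /**
     * Creates a RowGroup whose streams are positioned at the given checkpoints. The row
     * offset and row count are derived from the group id, the rows per row group, and the
     * number of rows remaining in the stripe.
     */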
@VisibleForTesting
    static RowGroup createRowGroup(int groupId, long rowsInStripe, long rowsInRowGroup, Map<StreamId, List<RowGroupIndex>> columnIndexes, Map<StreamId, ValueInputStream<?>> valueStreams, Map<StreamId, StreamCheckpoint> checkpoints)
{
long totalRowGroupBytes = columnIndexes
.values()
.stream()
.mapToLong(e -> e.get(groupId)
.getColumnStatistics()
.getTotalValueSizeInBytes())
.sum();
long rowOffset = multiplyExact(groupId, rowsInRowGroup);
int rowCount = toIntExact(Math.min(rowsInStripe - rowOffset, rowsInRowGroup));
        ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
        for (Entry<StreamId, StreamCheckpoint> entry : checkpoints.entrySet()) {
StreamId streamId = entry.getKey();
StreamCheckpoint checkpoint = entry.getValue();
// skip streams without data
            ValueInputStream<?> valueStream = valueStreams.get(streamId);
if (valueStream == null) {
continue;
}
builder.put(streamId, createCheckpointStreamSource(valueStream, checkpoint));
}
InputStreamSources rowGroupStreams = new InputStreamSources(builder.build());
return new RowGroup(groupId, rowOffset, rowCount, totalRowGroupBytes, rowGroupStreams);
}
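    /**
     * Reads and parses the stripe footer, which is stored immediately after the index
     * and data sections of the stripe.
     */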
public StripeFooter readStripeFooter(StripeId stripeId, StripeInformation stripe, OrcAggregatedMemoryContext systemMemoryUsage)
throws IOException
{
long footerOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
int footerLength = toIntExact(stripe.getFooterLength());
// read the footer
Slice footerSlice = stripeMetadataSource.getStripeFooterSlice(orcDataSource, stripeId, footerOffset, footerLength, cacheable);
try (InputStream inputStream = new OrcInputStream(
orcDataSource.getId(),
// Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
footerSlice.getInput(),
decompressor,
Optional.empty(),
systemMemoryUsage,
footerLength)) {
return metadataReader.readStripeFooter(orcDataSource.getId(), types, inputStream);
}
}
static boolean isIndexStream(Stream stream)
{
return stream.getStreamKind().getStreamArea() == INDEX;
}
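    /**
     * Reads the bloom filter indexes, keyed by column, from every BLOOM_FILTER stream.
     */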
    private Map<Integer, List<HiveBloomFilter>> readBloomFilterIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData)
            throws IOException
    {
        ImmutableMap.Builder<Integer, List<HiveBloomFilter>> bloomFilters = ImmutableMap.builder();
        for (Entry<StreamId, Stream> entry : streams.entrySet()) {
Stream stream = entry.getValue();
if (stream.getStreamKind() == BLOOM_FILTER) {
OrcInputStream inputStream = streamsData.get(entry.getKey());
bloomFilters.put(entry.getKey().getColumn(), metadataReader.readBloomFilterIndexes(inputStream));
}
// TODO: add support for BLOOM_FILTER_UTF8
}
return bloomFilters.build();
}
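    /**
     * Reads the row group indexes from every ROW_INDEX stream, attaching the column's
     * bloom filters (when present) to the returned statistics.
     */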
    private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, StripeId stripeId)
            throws IOException
    {
        // read the bloom filter for each column
        Map<Integer, List<HiveBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);
        ImmutableMap.Builder<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.builder();
        for (Entry<StreamId, Stream> entry : streams.entrySet()) {
StreamId streamId = entry.getKey();
Stream stream = entry.getValue();
if (stream.getStreamKind() == ROW_INDEX) {
OrcInputStream inputStream = streamsData.get(streamId);
                List<HiveBloomFilter> bloomFilters = bloomFilterIndexes.get(streamId.getColumn());
                List<RowGroupIndex> rowGroupIndexes = stripeMetadataSource.getRowIndexes(metadataReader, hiveWriterVersion, stripeId, streamId, inputStream, bloomFilters, runtimeStats);
columnIndexes.put(entry.getKey(), rowGroupIndexes);
}
}
return columnIndexes.build();
}
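    /**
     * Evaluates the predicate against the statistics of each row group and returns the
     * ids of the row groups that cannot be eliminated.
     */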
    private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes)
    {
        long rowsInStripe = stripe.getNumberOfRows();
        int groupsInStripe = ceil(rowsInStripe, rowsInRowGroup);
        ImmutableSet.Builder<Integer> selectedRowGroups = ImmutableSet.builder();
        long remainingRows = rowsInStripe;
        for (int rowGroup = 0; rowGroup < groupsInStripe; ++rowGroup) {
            int rows = toIntExact(Math.min(remainingRows, rowsInRowGroup));
            Map<Integer, ColumnStatistics> statistics = getRowGroupStatistics(types.get(0), columnIndexes, rowGroup);
if (predicate.matches(rows, statistics)) {
selectedRowGroups.add(rowGroup);
}
remainingRows -= rows;
}
return selectedRowGroups.build();
}
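    /**
     * Collects the per-column statistics of one row group, keyed by the ordinal of the
     * field in the root struct. Statistics from multiple streams of the same column are
     * merged, which can happen when a map is represented as a struct (DWRF only).
     */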
    private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup)
{
requireNonNull(rootStructType, "rootStructType is null");
checkArgument(rootStructType.getOrcTypeKind() == STRUCT);
requireNonNull(columnIndexes, "columnIndexes is null");
checkArgument(rowGroup >= 0, "rowGroup is negative");
        Map<Integer, List<ColumnStatistics>> groupedColumnStatistics = new HashMap<>();
        for (Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) {
if (!entry.getValue().isEmpty() && entry.getValue().get(rowGroup) != null) {
groupedColumnStatistics.computeIfAbsent(entry.getKey().getColumn(), key -> new ArrayList<>())
.add(entry.getValue().get(rowGroup).getColumnStatistics());
}
}
        ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
        for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
            List<ColumnStatistics> columnStatistics = groupedColumnStatistics.get(rootStructType.getFieldTypeIndex(ordinal));
if (columnStatistics != null) {
if (columnStatistics.size() == 1) {
statistics.put(ordinal, getOnlyElement(columnStatistics));
}
else {
// Merge statistics from different streams
// This can happen if map is represented as struct (DWRF only)
statistics.put(ordinal, mergeColumnStatistics(columnStatistics));
}
}
}
return statistics.build();
}
private static boolean isDictionary(Stream stream, ColumnEncodingKind columnEncoding)
{
return stream.getStreamKind() == DICTIONARY_DATA || (stream.getStreamKind() == LENGTH && (columnEncoding == DICTIONARY || columnEncoding == DICTIONARY_V2));
}
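    /**
     * Computes the disk range of every non-empty stream. Streams are laid out
     * back-to-back within the stripe, so each stream starts where the previous one
     * ended unless it carries an explicit offset.
     */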
@VisibleForTesting
    public static Map<StreamId, DiskRange> getDiskRanges(List<List<Stream>> streams)
    {
        ImmutableMap.Builder<StreamId, DiskRange> streamDiskRanges = ImmutableMap.builder();
        for (List<Stream> groupStreams : streams) {
long stripeOffset = 0;
for (Stream stream : groupStreams) {
int streamLength = toIntExact(stream.getLength());
if (stream.getOffset().isPresent()) {
stripeOffset = stream.getOffset().get();
}
// ignore zero byte streams
if (streamLength > 0) {
streamDiskRanges.put(new StreamId(stream), new DiskRange(stripeOffset, streamLength));
}
stripeOffset += streamLength;
}
}
return streamDiskRanges.build();
}
/**
* Ceiling of integer division
*/
private static int ceil(long dividend, int divisor)
{
long ceil = ((dividend + divisor) - 1) / divisor;
return toIntExact(ceil);
}
public static class StripeId
{
private final OrcDataSourceId sourceId;
private final long offset;
public StripeId(OrcDataSourceId sourceId, long offset)
{
this.sourceId = requireNonNull(sourceId, "sourceId is null");
this.offset = offset;
}
public OrcDataSourceId getSourceId()
{
return sourceId;
}
public long getOffset()
{
return offset;
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
StripeId stripeId = (StripeId) o;
return offset == stripeId.offset &&
Objects.equals(sourceId, stripeId.sourceId);
}
@Override
public int hashCode()
{
return Objects.hash(sourceId, offset);
}
@Override
public String toString()
{
return toStringHelper(this)
.add("sourceId", sourceId)
.add("offset", offset)
.toString();
}
}
public static class StripeStreamId
{
private final StripeId stripeId;
        // StripeStreamId is used as a cache key. Multiple StripeStreamId instances share the
        // same StripeId but have unique StreamId values. Storing a reference to a StreamId
        // would double the number of objects: on some installations, StreamId accounts for 8%
        // of all objects, all of them cache keys. Inlining the StreamId fields here is somewhat
        // hacky, but it eliminates those objects, reducing GC pressure and per-object overhead.
        // This is analogous to using a primitive int instead of a boxed Integer.
        // Many StreamId values map to the same StripeId, so StripeId is not expanded the same way.
private final int column;
private final int sequence;
private final StreamKind streamKind;
public StripeStreamId(StripeId stripeId, StreamId streamId)
{
this.stripeId = requireNonNull(stripeId, "stripeId is null");
requireNonNull(streamId, "streamId is null");
this.column = streamId.getColumn();
this.sequence = streamId.getSequence();
this.streamKind = streamId.getStreamKind();
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
StripeStreamId other = (StripeStreamId) o;
return Objects.equals(stripeId, other.stripeId) &&
column == other.column && sequence == other.sequence && streamKind == other.streamKind;
}
@Override
public int hashCode()
{
return Objects.hash(stripeId, column, sequence, streamKind);
}
@Override
public String toString()
{
return toStringHelper(this)
.add("stripeId", stripeId)
.add("column", column)
.add("sequence", sequence)
.add("streamKind", streamKind)
.toString();
}
}
}