All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.facebook.presto.orc.reader.MapFlatBatchStreamReader Maven / Gradle / Ivy

The newest version!
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc.reader;

import com.facebook.presto.common.block.Block;
import com.facebook.presto.common.block.BlockBuilder;
import com.facebook.presto.common.block.DictionaryBlock;
import com.facebook.presto.common.block.VariableWidthBlockBuilder;
import com.facebook.presto.common.type.BigintType;
import com.facebook.presto.common.type.IntegerType;
import com.facebook.presto.common.type.MapType;
import com.facebook.presto.common.type.SmallintType;
import com.facebook.presto.common.type.TinyintType;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.orc.OrcAggregatedMemoryContext;
import com.facebook.presto.orc.OrcCorruptionException;
import com.facebook.presto.orc.OrcRecordReaderOptions;
import com.facebook.presto.orc.StreamDescriptor;
import com.facebook.presto.orc.Stripe;
import com.facebook.presto.orc.metadata.ColumnEncoding;
import com.facebook.presto.orc.metadata.DwrfSequenceEncoding;
import com.facebook.presto.orc.metadata.OrcType;
import com.facebook.presto.orc.stream.BooleanInputStream;
import com.facebook.presto.orc.stream.InputStreamSource;
import com.facebook.presto.orc.stream.InputStreamSources;
import com.google.common.io.Closer;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import org.joda.time.DateTimeZone;
import org.openjdk.jol.info.ClassLayout;

import javax.annotation.Nullable;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.SortedMap;

import static com.facebook.presto.orc.metadata.Stream.StreamKind.IN_MAP;
import static com.facebook.presto.orc.metadata.Stream.StreamKind.PRESENT;
import static com.facebook.presto.orc.reader.ReaderUtils.verifyStreamType;
import static com.facebook.presto.orc.stream.MissingInputStreamSource.getBooleanMissingStreamSource;
import static com.google.common.base.MoreObjects.toStringHelper;
import static java.util.Objects.requireNonNull;

/**
 * Flat Maps are a layout of maps supported in DWRF.
 * 

* Values associated with different keys are stored in separate streams rather than having a single set of value streams for the map. *

* There is a ColumnEncoding associated with the value streams for a given key. All the ColumnEncodings for values have the same * columnId, and use a sequenceId to distinguish them. The ColumnEncoding also contains the key it is associated with as metadata. *

* Note that the ColumnEncoding with sequenceId 0 for the values has no data associated with it, only statistics. Similarly there * is a ColumnEncoding for the key stream which has no data associated with it, only statistics, so it is not used in this class. */ public class MapFlatBatchStreamReader implements BatchStreamReader { private static final int INSTANCE_SIZE = ClassLayout.parseClass(MapFlatBatchStreamReader.class).instanceSize(); private final MapType type; private final StreamDescriptor streamDescriptor; private final DateTimeZone hiveStorageTimeZone; // This is the StreamDescriptor for the value stream with sequence ID 0, it is used to derive StreamDescriptors for the // value streams with other sequence IDs private final StreamDescriptor baseValueStreamDescriptor; private final OrcType.OrcTypeKind keyOrcType; private final List> inMapStreamSources = new ArrayList<>(); private final List inMapStreams = new ArrayList<>(); private final List valueStreamReaders = new ArrayList<>(); private final List valueStreamDescriptors = new ArrayList<>(); private Block keyBlockTemplate; private int readOffset; private int nextBatchSize; private InputStreamSource presentStreamSource = getBooleanMissingStreamSource(); @Nullable private BooleanInputStream presentStream; private boolean rowGroupOpen; private OrcAggregatedMemoryContext systemMemoryContext; private final OrcRecordReaderOptions options; public MapFlatBatchStreamReader(Type type, StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone, OrcRecordReaderOptions options, OrcAggregatedMemoryContext systemMemoryContext) throws OrcCorruptionException { requireNonNull(type, "type is null"); verifyStreamType(streamDescriptor, type, MapType.class::isInstance); this.type = (MapType) type; this.streamDescriptor = requireNonNull(streamDescriptor, "stream is null"); this.hiveStorageTimeZone = requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"); this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null"); this.keyOrcType = streamDescriptor.getNestedStreams().get(0).getOrcTypeKind(); this.baseValueStreamDescriptor = streamDescriptor.getNestedStreams().get(1); this.options = requireNonNull(options); } @Override public void prepareNextRead(int batchSize) { readOffset += nextBatchSize; nextBatchSize = batchSize; } @Override public Block readBlock() throws IOException { if (!rowGroupOpen) { openRowGroup(); } if (readOffset > 0) { if (presentStream != null) { // skip ahead the present bit reader, but count the set bits // and use this as the skip size for the data reader readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { for (int i = 0; i < valueStreamReaders.size(); i++) { int valueReadOffset = inMapStreams.get(i).countBitsSet(readOffset); valueStreamReaders.get(i).prepareNextRead(valueReadOffset); } } } boolean[][] inMapVectors = new boolean[inMapStreamSources.size()][]; boolean[] nullVector = null; int totalMapEntries = 0; if (presentStream == null) { for (int keyIndex = 0; keyIndex < inMapStreams.size(); keyIndex++) { inMapVectors[keyIndex] = new boolean[nextBatchSize]; totalMapEntries += inMapStreams.get(keyIndex).getSetBits(nextBatchSize, inMapVectors[keyIndex]); } } else { nullVector = new boolean[nextBatchSize]; int nullValues = presentStream.getUnsetBits(nextBatchSize, nullVector); if (nullValues != nextBatchSize) { for (int i = 0; i < inMapStreams.size(); i++) { inMapVectors[i] = new boolean[nextBatchSize]; totalMapEntries += inMapStreams.get(i).getSetBits(nextBatchSize, inMapVectors[i], nullVector); } } } MapType mapType = (MapType) type; Type valueType = mapType.getValueType(); Block[] valueBlocks = new Block[valueStreamReaders.size()]; if (totalMapEntries > 0) { for (int keyIndex = 0; keyIndex < valueStreamReaders.size(); keyIndex++) { int mapsContainingKey = 0; for (int mapIndex = 0; mapIndex < nextBatchSize; mapIndex++) { if (inMapVectors[keyIndex][mapIndex]) { mapsContainingKey++; } } if (mapsContainingKey > 0) { BatchStreamReader streamReader = valueStreamReaders.get(keyIndex); streamReader.prepareNextRead(mapsContainingKey); valueBlocks[keyIndex] = streamReader.readBlock(); } else { valueBlocks[keyIndex] = valueType.createBlockBuilder(null, 0).build(); } } } int[] valueBlockPositions = new int[inMapVectors.length]; BlockBuilder valueBlockBuilder = valueType.createBlockBuilder(null, totalMapEntries); int[] keyIds = new int[totalMapEntries]; int keyIdsIndex = 0; int[] mapOffsets = new int[nextBatchSize + 1]; mapOffsets[0] = 0; for (int mapIndex = 0; mapIndex < nextBatchSize; mapIndex++) { int mapLength = 0; if (totalMapEntries > 0) { for (int keyIndex = 0; keyIndex < inMapVectors.length; keyIndex++) { if (inMapVectors[keyIndex][mapIndex]) { mapLength++; valueType.appendTo(valueBlocks[keyIndex], valueBlockPositions[keyIndex], valueBlockBuilder); keyIds[keyIdsIndex++] = keyIndex; valueBlockPositions[keyIndex]++; } } } mapOffsets[mapIndex + 1] = mapOffsets[mapIndex] + mapLength; } Block block = mapType.createBlockFromKeyValue(nextBatchSize, Optional.ofNullable(nullVector), mapOffsets, new DictionaryBlock(keyBlockTemplate, keyIds), valueBlockBuilder); readOffset = 0; nextBatchSize = 0; return block; } private void openRowGroup() throws IOException { presentStream = presentStreamSource.openStream(); for (int i = 0; i < inMapStreamSources.size(); i++) { BooleanInputStream inMapStream = requireNonNull(inMapStreamSources.get(i).openStream(), "missing inMapStream at position " + i); inMapStreams.add(inMapStream); } rowGroupOpen = true; } @Override public void startStripe(Stripe stripe) throws IOException { presentStreamSource = getBooleanMissingStreamSource(); inMapStreamSources.clear(); valueStreamDescriptors.clear(); valueStreamReaders.clear(); ColumnEncoding encoding = stripe.getColumnEncodings().get(baseValueStreamDescriptor.getStreamId()); SortedMap additionalSequenceEncodings = Collections.emptySortedMap(); // encoding or encoding.getAdditionalSequenceEncodings() may not be present when every map is empty or null if (encoding != null && encoding.getAdditionalSequenceEncodings().isPresent()) { additionalSequenceEncodings = encoding.getAdditionalSequenceEncodings().get(); } // The ColumnEncoding with sequence ID 0 doesn't have any data associated with it for (int sequence : additionalSequenceEncodings.keySet()) { inMapStreamSources.add(getBooleanMissingStreamSource()); StreamDescriptor valueStreamDescriptor = baseValueStreamDescriptor.duplicate(sequence); valueStreamDescriptors.add(valueStreamDescriptor); BatchStreamReader valueStreamReader = BatchStreamReaders.createStreamReader(type.getValueType(), valueStreamDescriptor, hiveStorageTimeZone, options, systemMemoryContext); valueStreamReader.startStripe(stripe); valueStreamReaders.add(valueStreamReader); } keyBlockTemplate = getKeyBlockTemplate(additionalSequenceEncodings.values()); readOffset = 0; nextBatchSize = 0; presentStream = null; rowGroupOpen = false; } private Block getKeyBlockTemplate(Collection sequenceEncodings) { switch (keyOrcType) { case BYTE: case SHORT: case INT: case LONG: return getIntegerKeyBlockTemplate(sequenceEncodings); case STRING: case BINARY: return getSliceKeysBlockTemplate(sequenceEncodings); default: throw new IllegalArgumentException("Unsupported flat map key type: " + keyOrcType); } } private Block getIntegerKeyBlockTemplate(Collection sequenceEncodings) { Type keyType; switch (keyOrcType) { case BYTE: keyType = TinyintType.TINYINT; break; case SHORT: keyType = SmallintType.SMALLINT; break; case INT: keyType = IntegerType.INTEGER; break; case LONG: keyType = BigintType.BIGINT; break; default: throw new IllegalArgumentException("Unsupported flat map key type: " + keyOrcType); } BlockBuilder blockBuilder = keyType.createBlockBuilder(null, sequenceEncodings.size()); for (DwrfSequenceEncoding sequenceEncoding : sequenceEncodings) { keyType.writeLong(blockBuilder, sequenceEncoding.getKey().getIntKey()); } return blockBuilder.build(); } private Block getSliceKeysBlockTemplate(Collection sequenceEncodings) { int bytes = 0; for (DwrfSequenceEncoding sequenceEncoding : sequenceEncodings) { bytes += sequenceEncoding.getKey().getBytesKey().size(); } VariableWidthBlockBuilder builder = new VariableWidthBlockBuilder(null, sequenceEncodings.size(), bytes); for (DwrfSequenceEncoding sequenceEncoding : sequenceEncodings) { Slice key = Slices.wrappedBuffer(sequenceEncoding.getKey().getBytesKey().toByteArray()); builder.writeBytes(key, 0, key.length()); builder.closeEntry(); } return builder.build(); } @Override public void startRowGroup(InputStreamSources dataStreamSources) throws IOException { presentStreamSource = dataStreamSources.getInputStreamSource(streamDescriptor, PRESENT, BooleanInputStream.class); for (int i = 0; i < valueStreamDescriptors.size(); i++) { InputStreamSource inMapStreamSource = dataStreamSources.getInputStreamSource(valueStreamDescriptors.get(i), IN_MAP, BooleanInputStream.class); inMapStreamSources.set(i, inMapStreamSource); } readOffset = 0; nextBatchSize = 0; presentStream = null; inMapStreams.clear(); rowGroupOpen = false; for (BatchStreamReader valueStreamReader : valueStreamReaders) { valueStreamReader.startRowGroup(dataStreamSources); } } @Override public String toString() { return toStringHelper(this) .addValue(streamDescriptor) .toString(); } @Override public void close() { try (Closer closer = Closer.create()) { for (BatchStreamReader valueStreamReader : valueStreamReaders) { closer.register(valueStreamReader::close); } } catch (IOException e) { throw new UncheckedIOException(e); } } @Override public long getRetainedSizeInBytes() { long retainedSize = INSTANCE_SIZE; for (BatchStreamReader valueStreamReader : valueStreamReaders) { retainedSize += valueStreamReader.getRetainedSizeInBytes(); } return retainedSize; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy