/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc;

import com.facebook.presto.orc.metadata.ColumnEncoding;
import com.facebook.presto.orc.metadata.DwrfSequenceEncoding;
import com.facebook.presto.orc.metadata.OrcType;
import com.facebook.presto.orc.metadata.Stream;
import com.facebook.presto.orc.proto.DwrfProto;
import com.facebook.presto.orc.stream.StreamDataOutput;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2LongMap;
import it.unimi.dsi.fastutil.ints.Int2LongOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;

import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;

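/**
 * Accumulates stream sizes per ORC type tree node and, when flat maps with map
 * statistics are enabled, per flat map key, so the writer can report storage
 * sizes in column statistics.
 *
 * <p>A typical call sequence (illustrative; the surrounding writer code is assumed):
 * <pre>{@code
 * StreamSizeHelper helper = new StreamSizeHelper(orcTypes, flatMapNodes, true);
 * helper.collectStreamSizes(stripeStreams, columnEncodings); // once per stripe
 * Int2LongMap rolledUpSizes = helper.getNodeSizes();
 * }</pre>
 */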
public class StreamSizeHelper
{
    private final List<OrcType> orcTypes;

    // flag indicating whether to collect flat map key size stats or not
    private final boolean collectKeyStats;

    // contains the nodes of the flattened (flat map) columns; it does not contain sub-nodes
    private final Set<Integer> flatMapNodes;

    // contains the mapping of all map value nodes to the top-level (column) map
    // node for flattened nodes
    private final Int2IntMap flatMapNodeTrees;

    // contains self node sizes (not rolled up)
    private final long[] nodeSizes;

    // key sizes by flat map node
    private final Int2ObjectMap<Object2LongMap<DwrfProto.KeyInfo>> keySizes = new Int2ObjectOpenHashMap<>();

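    /**
     * @param orcTypes all ORC types of the file, indexed by node
     * @param flatMapNodes nodes of the columns written as flat maps
     * @param mapStatisticsEnabled whether flat map key statistics should be collected
     */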
    public StreamSizeHelper(List<OrcType> orcTypes, Set<Integer> flatMapNodes, boolean mapStatisticsEnabled)
    {
        this.orcTypes = requireNonNull(orcTypes, "orcTypes is null");
        this.flatMapNodes = requireNonNull(flatMapNodes, "flatMapNodes is null");
        this.collectKeyStats = mapStatisticsEnabled && !flatMapNodes.isEmpty();
        this.nodeSizes = new long[orcTypes.size()];
        this.flatMapNodeTrees = buildFlattenedNodeTrees();
    }

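    // Builds a mapping from every node in a flat map's value subtree to its
    // top-level map node, so stream sizes of value sub-nodes can be attributed
    // to the enclosing flat map column.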
    private Int2IntMap buildFlattenedNodeTrees()
    {
        Int2IntMap flattenedNodeTrees = new Int2IntOpenHashMap();
        if (!collectKeyStats) {
            return flattenedNodeTrees;
        }

        // map all map value nodes to their top-level map node
        for (Integer mapNode : flatMapNodes) {
            OrcType mapType = orcTypes.get(mapNode);
            checkArgument(mapType.getOrcTypeKind() == OrcType.OrcTypeKind.MAP, "flat map node %s must be a map, but was %s", mapNode, mapType.getOrcTypeKind());
            checkArgument(mapType.getFieldCount() == 2, "flat map node %s must have exactly 2 sub-fields but had %s", mapNode, mapType.getFieldCount());
            int mapValueNode = mapType.getFieldTypeIndex(1);
            IntList deepValueNodes = collectDeepTreeNodes(orcTypes, mapValueNode);
            deepValueNodes.intStream().forEach(valueNode -> flattenedNodeTrees.put(valueNode, mapNode.intValue()));
        }

        return flattenedNodeTrees;
    }

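    /**
     * Adds the sizes of the given streams (typically one stripe's worth) to the per-node
     * totals and, if key statistics are enabled, attributes flat map value stream sizes
     * to their map keys using the supplied column encodings.
     */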
    public void collectStreamSizes(Iterable<StreamDataOutput> streamDataOutputs, Map<Integer, ColumnEncoding> columnEncodings)
    {
        // collect node sizes first
        for (StreamDataOutput streamDataOutput : streamDataOutputs) {
            requireNonNull(streamDataOutput, "streamDataOutput is null");
            Stream stream = streamDataOutput.getStream();
            int node = stream.getColumn();
            nodeSizes[node] += streamDataOutput.size();
        }

        // collect map key sizes only if flat maps and map statistics are enabled
        if (collectKeyStats) {
            // flatMapNodeSizes contains total stream sizes by flat map node and sequence;
            // all value sub-nodes are mapped to the flat map node
            Int2ObjectMap<Int2LongMap> flatMapNodeSizes = new Int2ObjectOpenHashMap<>();

            // collect stream sizes aggregated by flat map node and sequence
            for (StreamDataOutput streamDataOutput : streamDataOutputs) {
                Stream stream = streamDataOutput.getStream();
                int node = stream.getColumn();

                // check if this node belongs to the flat map tree
                int flatMapNode = flatMapNodeTrees.getOrDefault(node, -1);
                if (flatMapNode != -1) {
                    Int2LongMap sequenceToSize = flatMapNodeSizes.computeIfAbsent(flatMapNode, Int2LongOpenHashMap::new);
                    sequenceToSize.mergeLong(stream.getSequence(), stream.getLength(), Long::sum);
                }
            }

            // merge stripe level sizes into the file level sizes
            for (int flatMapNode : flatMapNodeSizes.keySet()) {
                int flatMapValueNode = orcTypes.get(flatMapNode).getFieldTypeIndex(1);
                ColumnEncoding columnEncoding = columnEncodings.get(flatMapValueNode);
                checkArgument(columnEncoding != null, "columnEncoding for flat map node %s is null", flatMapNode);
                checkArgument(columnEncoding.getAdditionalSequenceEncodings().isPresent(), "columnEncoding for flat map node %s does not have keys", flatMapNode);

                SortedMap<Integer, DwrfSequenceEncoding> sequenceToKey = columnEncoding.getAdditionalSequenceEncodings().get();
                Int2LongMap sequenceToSize = flatMapNodeSizes.get(flatMapNode);
                Object2LongMap<DwrfProto.KeyInfo> keyToSize = keySizes.computeIfAbsent(flatMapNode, (ignore) -> new Object2LongOpenHashMap<>());

                // attribute each sequence's size to its map key for the map column statistics
                for (Map.Entry<Integer, DwrfSequenceEncoding> entry : sequenceToKey.entrySet()) {
                    int sequence = entry.getKey();
                    DwrfProto.KeyInfo key = entry.getValue().getKey();
                    long size = sequenceToSize.getOrDefault(sequence, 0);
                    keyToSize.mergeLong(key, size, Long::sum);
                }
            }
        }
    }

    /**
     * Returns flat map key sizes by flat map node.
     */
    public Int2ObjectMap<Object2LongMap<DwrfProto.KeyInfo>> getMapKeySizes()
    {
        return keySizes;
    }

    /**
     * Returns rolled up node sizes.
     */
    public Int2LongMap getNodeSizes()
    {
        Int2LongMap result = new Int2LongOpenHashMap(nodeSizes.length);
        rollupNodeSizes(result, 0);
        return result;
    }

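    // Recursively sums each node's own size with the sizes of all its sub-nodes
    // and records the rolled-up total for every node in the result map.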
    private long rollupNodeSizes(Int2LongMap result, int node)
    {
        long size = nodeSizes[node];
        List<Integer> subFieldIndexes = orcTypes.get(node).getFieldTypeIndexes();
        for (Integer subNode : subFieldIndexes) {
            size += rollupNodeSizes(result, subNode);
        }
        result.put(node, size);
        return size;
    }

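    // Collects the start node and all of its transitive sub-nodes by scanning
    // the growing result list breadth-first.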
    private static IntList collectDeepTreeNodes(List<OrcType> orcTypes, int startNode)
    {
        IntList result = new IntArrayList();
        result.add(startNode);

        for (int i = 0; i < result.size(); i++) {
            int node = result.getInt(i);
            OrcType orcType = orcTypes.get(node);
            for (int j = 0; j < orcType.getFieldCount(); j++) {
                result.add(orcType.getFieldTypeIndex(j));
            }
        }

        return result;
    }
}