All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.facebook.presto.orc.writer.StreamOrderingLayout Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc.writer;

import com.facebook.presto.orc.DwrfStreamOrderingConfig;
import com.facebook.presto.orc.metadata.ColumnEncoding;
import com.facebook.presto.orc.metadata.DwrfSequenceEncoding;
import com.facebook.presto.orc.metadata.Stream;
import com.facebook.presto.orc.proto.DwrfProto;
import com.facebook.presto.orc.stream.StreamDataOutput;
import com.google.common.collect.ImmutableMap;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkState;
import static java.util.Objects.requireNonNull;

public class StreamOrderingLayout
        implements StreamLayout
{
    private static final Comparator IN_GROUP_COMPARATOR = (streamDataA, streamDataB) -> {
        Stream streamA = streamDataA.getStream();
        Stream streamB = streamDataB.getStream();
        int nodeA = streamA.getColumn();
        int nodeB = streamB.getColumn();

        // order by the node in asc order
        if (nodeA != nodeB) {
            return Integer.compare(nodeA, nodeB);
        }

        // order streams of the same node by the stream kind
        return Integer.compare(streamA.getStreamKind().ordinal(), streamB.getStreamKind().ordinal());
    };

    private final DwrfStreamOrderingConfig config;
    private final StreamLayout nonStreamOrderingLayout;

    public StreamOrderingLayout(
            DwrfStreamOrderingConfig config,
            StreamLayout layout)
    {
        this.config = requireNonNull(config, "config cannot be null");
        this.nonStreamOrderingLayout = requireNonNull(layout, "layout cannot be null");
    }

    private static class StreamMetadata
    {
        //  -> List>
        private final Map> sequenceToStreams;
        //  -> SequenceId
        private final Map keyToSequence;

        public StreamMetadata(Map> sequenceToStreams, Map keyToSequence)
        {
            this.sequenceToStreams = requireNonNull(sequenceToStreams, "sequenceToStreams cannot be null");
            this.keyToSequence = requireNonNull(keyToSequence, "keyToSequence cannot be null");
        }
    }

    private static class ColumnKeyInfo
    {
        private final int column;
        private final DwrfProto.KeyInfo key;

        public ColumnKeyInfo(int column, DwrfProto.KeyInfo key)
        {
            this.column = column;
            this.key = requireNonNull(key, "key cannot be null");
        }

        @Override
        public boolean equals(Object obj)
        {
            if (obj == null) {
                return false;
            }
            if (!(obj instanceof ColumnKeyInfo)) {
                return false;
            }
            ColumnKeyInfo input = (ColumnKeyInfo) obj;
            return this.column == input.column && this.key.equals(input.key);
        }

        @Override
        public int hashCode()
        {
            return Objects.hash(column, key);
        }
    }

    private StreamMetadata getStreamMetadata(
            Map nodeIdToColumn,
            Map nodeIdToColumnEncodings,
            DwrfStreamOrderingConfig config)
    {
        ImmutableMap.Builder keyToSequenceBuilder = ImmutableMap.builder();
        ImmutableMap.Builder> sequenceToStreamsBuilder = ImmutableMap.builder();
        Map> columnToKeySet = config.getStreamOrdering();
        // Adding a set to track which of the columns in the reorder list are already visited
        // For complex maps (complex values for the value)
        // there could be multiple nodeId(s) mapping to a single column ID
        // For example,  if the flat map column is map> with node ids (1: <2, 3<4>>) and column id 0
        // There will be multiple entries in the nodeIdToColumnEncodings for each of the values
        // 1 -> DWRF_MAP_FLAT encoding
        // 3 -> DIRECT encoding  + SortedMap (sequence encodings)
        // 4 -> DIRECT encoding  + SortedMap (sequence encodings)
        Set columnsVisited = new HashSet<>(columnToKeySet.size());

        // iterate through all the encodings and if the encoding has additional sequence encodings put it in the map
        for (Map.Entry entry : nodeIdToColumnEncodings.entrySet()) {
            int nodeId = entry.getKey();

            // skip the root node, because it doesn't have a column
            if (nodeId == 0) {
                continue;
            }

            int column = nodeIdToColumn.get(nodeId);
            if (entry.getValue().getAdditionalSequenceEncodings().isPresent() && columnToKeySet.containsKey(column) && !columnsVisited.contains(column)) {
                // add entries only if stream ordering contains the column ID
                Set keysPerColumn = columnToKeySet.get(column);
                for (Map.Entry sequenceToEncoding : entry.getValue().getAdditionalSequenceEncodings().get().entrySet()) {
                    Integer sequence = sequenceToEncoding.getKey();
                    DwrfProto.KeyInfo key = sequenceToEncoding.getValue().getKey();
                    // add the stream only if it is present in the stream ordering config
                    if (keysPerColumn.contains(key)) {
                        keyToSequenceBuilder.put(new ColumnKeyInfo(column, key), sequence);
                        sequenceToStreamsBuilder.put(new ColumnSequenceKey(column, sequence), new ArrayList<>());
                    }
                }
                columnsVisited.add(column);
            }
        }
        return new StreamMetadata(sequenceToStreamsBuilder.build(), keyToSequenceBuilder.build());
    }

    @Override
    public void reorder(
            List dataStreams,
            Map nodeIdToColumn,
            Map nodeIdToColumnEncodings)
    {
        List nonReorderStreams = new ArrayList<>();
        StreamMetadata metadata = getStreamMetadata(nodeIdToColumn, nodeIdToColumnEncodings, config);
        Map> sequenceToStreams = metadata.sequenceToStreams;
        for (StreamDataOutput dataOutput : dataStreams) {
            int nodeId = dataOutput.getStream().getColumn();
            int sequence = dataOutput.getStream().getSequence();
            int column = nodeIdToColumn.get(nodeId);
            // only if sequence ID > 0, we do a look up in sequenceToStreams
            if (sequence > 0) {
                List streams = sequenceToStreams.get(new ColumnSequenceKey(column, sequence));
                if (streams == null) {
                    nonReorderStreams.add(dataOutput);
                }
                else {
                    streams.add(dataOutput);
                }
            }
            else {
                nonReorderStreams.add(dataOutput);
            }
        }

        // reorder everything in the input order
        List orderedStreams = new ArrayList<>();
        Map keyToSequence = metadata.keyToSequence;
        for (Map.Entry> columnToKeys : config.getStreamOrdering().entrySet()) {
            int column = columnToKeys.getKey();
            for (DwrfProto.KeyInfo key : columnToKeys.getValue()) {
                ColumnKeyInfo columnKeyInfo = new ColumnKeyInfo(column, key);
                Integer sequence = keyToSequence.get(columnKeyInfo);
                if (sequence != null) {
                    ColumnSequenceKey columnSequenceInfo = new ColumnSequenceKey(column, sequence);
                    List groupedDataStreams = sequenceToStreams.get(columnSequenceInfo);
                    checkState(groupedDataStreams != null, "list of streams for a sequence cannot be null");
                    checkState(groupedDataStreams.size() > 0, "There should be at least one stream for a sequence");

                    // order grouped streams
                    groupedDataStreams.sort(IN_GROUP_COMPARATOR);
                    orderedStreams.addAll(groupedDataStreams);
                }
            }
        }

        // do actual reordering
        nonStreamOrderingLayout.reorder(nonReorderStreams, nodeIdToColumn, nodeIdToColumnEncodings);

        // add all the streams
        checkState(orderedStreams.size() + nonReorderStreams.size() == dataStreams.size(),
                "Number of ordered + non ordered streams should be equal to total number of data streams " +
                "orderedStreams: %s, nonReorderStreams: %s, dataStreams: %s",
                orderedStreams.size(),
                nonReorderStreams.size(),
                dataStreams.size());
        dataStreams.clear();
        dataStreams.addAll(orderedStreams);
        dataStreams.addAll(nonReorderStreams);
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .add("config", config)
                .add("nonStreamOrderingLayout", nonStreamOrderingLayout)
                .toString();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy