
// com.facebook.presto.orc.writer.StreamOrderingLayout Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc.writer;
import com.facebook.presto.orc.DwrfStreamOrderingConfig;
import com.facebook.presto.orc.metadata.ColumnEncoding;
import com.facebook.presto.orc.metadata.DwrfSequenceEncoding;
import com.facebook.presto.orc.metadata.Stream;
import com.facebook.presto.orc.proto.DwrfProto;
import com.facebook.presto.orc.stream.StreamDataOutput;
import com.google.common.collect.ImmutableMap;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkState;
import static java.util.Objects.requireNonNull;
public class StreamOrderingLayout
implements StreamLayout
{
private static final Comparator IN_GROUP_COMPARATOR = (streamDataA, streamDataB) -> {
Stream streamA = streamDataA.getStream();
Stream streamB = streamDataB.getStream();
int nodeA = streamA.getColumn();
int nodeB = streamB.getColumn();
// order by the node in asc order
if (nodeA != nodeB) {
return Integer.compare(nodeA, nodeB);
}
// order streams of the same node by the stream kind
return Integer.compare(streamA.getStreamKind().ordinal(), streamB.getStreamKind().ordinal());
};
private final DwrfStreamOrderingConfig config;
private final StreamLayout nonStreamOrderingLayout;
public StreamOrderingLayout(
DwrfStreamOrderingConfig config,
StreamLayout layout)
{
this.config = requireNonNull(config, "config cannot be null");
this.nonStreamOrderingLayout = requireNonNull(layout, "layout cannot be null");
}
private static class StreamMetadata
{
// -> List>
private final Map> sequenceToStreams;
// -> SequenceId
private final Map keyToSequence;
public StreamMetadata(Map> sequenceToStreams, Map keyToSequence)
{
this.sequenceToStreams = requireNonNull(sequenceToStreams, "sequenceToStreams cannot be null");
this.keyToSequence = requireNonNull(keyToSequence, "keyToSequence cannot be null");
}
}
private static class ColumnKeyInfo
{
private final int column;
private final DwrfProto.KeyInfo key;
public ColumnKeyInfo(int column, DwrfProto.KeyInfo key)
{
this.column = column;
this.key = requireNonNull(key, "key cannot be null");
}
@Override
public boolean equals(Object obj)
{
if (obj == null) {
return false;
}
if (!(obj instanceof ColumnKeyInfo)) {
return false;
}
ColumnKeyInfo input = (ColumnKeyInfo) obj;
return this.column == input.column && this.key.equals(input.key);
}
@Override
public int hashCode()
{
return Objects.hash(column, key);
}
}
private StreamMetadata getStreamMetadata(
Map nodeIdToColumn,
Map nodeIdToColumnEncodings,
DwrfStreamOrderingConfig config)
{
ImmutableMap.Builder keyToSequenceBuilder = ImmutableMap.builder();
ImmutableMap.Builder> sequenceToStreamsBuilder = ImmutableMap.builder();
Map> columnToKeySet = config.getStreamOrdering();
// Adding a set to track which of the columns in the reorder list are already visited
// For complex maps (complex values for the value)
// there could be multiple nodeId(s) mapping to a single column ID
// For example, if the flat map column is map> with node ids (1: <2, 3<4>>) and column id 0
// There will be multiple entries in the nodeIdToColumnEncodings for each of the values
// 1 -> DWRF_MAP_FLAT encoding
// 3 -> DIRECT encoding + SortedMap (sequence encodings)
// 4 -> DIRECT encoding + SortedMap (sequence encodings)
Set columnsVisited = new HashSet<>(columnToKeySet.size());
// iterate through all the encodings and if the encoding has additional sequence encodings put it in the map
for (Map.Entry entry : nodeIdToColumnEncodings.entrySet()) {
int nodeId = entry.getKey();
// skip the root node, because it doesn't have a column
if (nodeId == 0) {
continue;
}
int column = nodeIdToColumn.get(nodeId);
if (entry.getValue().getAdditionalSequenceEncodings().isPresent() && columnToKeySet.containsKey(column) && !columnsVisited.contains(column)) {
// add entries only if stream ordering contains the column ID
Set keysPerColumn = columnToKeySet.get(column);
for (Map.Entry sequenceToEncoding : entry.getValue().getAdditionalSequenceEncodings().get().entrySet()) {
Integer sequence = sequenceToEncoding.getKey();
DwrfProto.KeyInfo key = sequenceToEncoding.getValue().getKey();
// add the stream only if it is present in the stream ordering config
if (keysPerColumn.contains(key)) {
keyToSequenceBuilder.put(new ColumnKeyInfo(column, key), sequence);
sequenceToStreamsBuilder.put(new ColumnSequenceKey(column, sequence), new ArrayList<>());
}
}
columnsVisited.add(column);
}
}
return new StreamMetadata(sequenceToStreamsBuilder.build(), keyToSequenceBuilder.build());
}
@Override
public void reorder(
List dataStreams,
Map nodeIdToColumn,
Map nodeIdToColumnEncodings)
{
List nonReorderStreams = new ArrayList<>();
StreamMetadata metadata = getStreamMetadata(nodeIdToColumn, nodeIdToColumnEncodings, config);
Map> sequenceToStreams = metadata.sequenceToStreams;
for (StreamDataOutput dataOutput : dataStreams) {
int nodeId = dataOutput.getStream().getColumn();
int sequence = dataOutput.getStream().getSequence();
int column = nodeIdToColumn.get(nodeId);
// only if sequence ID > 0, we do a look up in sequenceToStreams
if (sequence > 0) {
List streams = sequenceToStreams.get(new ColumnSequenceKey(column, sequence));
if (streams == null) {
nonReorderStreams.add(dataOutput);
}
else {
streams.add(dataOutput);
}
}
else {
nonReorderStreams.add(dataOutput);
}
}
// reorder everything in the input order
List orderedStreams = new ArrayList<>();
Map keyToSequence = metadata.keyToSequence;
for (Map.Entry> columnToKeys : config.getStreamOrdering().entrySet()) {
int column = columnToKeys.getKey();
for (DwrfProto.KeyInfo key : columnToKeys.getValue()) {
ColumnKeyInfo columnKeyInfo = new ColumnKeyInfo(column, key);
Integer sequence = keyToSequence.get(columnKeyInfo);
if (sequence != null) {
ColumnSequenceKey columnSequenceInfo = new ColumnSequenceKey(column, sequence);
List groupedDataStreams = sequenceToStreams.get(columnSequenceInfo);
checkState(groupedDataStreams != null, "list of streams for a sequence cannot be null");
checkState(groupedDataStreams.size() > 0, "There should be at least one stream for a sequence");
// order grouped streams
groupedDataStreams.sort(IN_GROUP_COMPARATOR);
orderedStreams.addAll(groupedDataStreams);
}
}
}
// do actual reordering
nonStreamOrderingLayout.reorder(nonReorderStreams, nodeIdToColumn, nodeIdToColumnEncodings);
// add all the streams
checkState(orderedStreams.size() + nonReorderStreams.size() == dataStreams.size(),
"Number of ordered + non ordered streams should be equal to total number of data streams " +
"orderedStreams: %s, nonReorderStreams: %s, dataStreams: %s",
orderedStreams.size(),
nonReorderStreams.size(),
dataStreams.size());
dataStreams.clear();
dataStreams.addAll(orderedStreams);
dataStreams.addAll(nonReorderStreams);
}
@Override
public String toString()
{
return toStringHelper(this)
.add("config", config)
.add("nonStreamOrderingLayout", nonStreamOrderingLayout)
.toString();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy