All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.facebook.presto.iceberg.IcebergOrcFileWriter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.iceberg;

import com.facebook.presto.common.io.DataSink;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.hive.OrcFileWriter;
import com.facebook.presto.orc.DwrfEncryptionProvider;
import com.facebook.presto.orc.DwrfWriterEncryption;
import com.facebook.presto.orc.NoOpOrcWriterStats;
import com.facebook.presto.orc.OrcDataSource;
import com.facebook.presto.orc.OrcEncoding;
import com.facebook.presto.orc.OrcWriteValidation;
import com.facebook.presto.orc.OrcWriterOptions;
import com.facebook.presto.orc.metadata.CompressionKind;
import com.facebook.presto.orc.metadata.OrcType;
import com.facebook.presto.orc.metadata.statistics.ColumnStatistics;
import com.facebook.presto.orc.metadata.statistics.DateStatistics;
import com.facebook.presto.orc.metadata.statistics.DecimalStatistics;
import com.facebook.presto.orc.metadata.statistics.DoubleStatistics;
import com.facebook.presto.orc.metadata.statistics.IntegerStatistics;
import com.facebook.presto.orc.metadata.statistics.StringStatistics;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import io.airlift.slice.Slice;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Conversions;

import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.time.ZoneId;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.function.Supplier;

import static com.facebook.presto.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY;
import static com.google.common.base.Verify.verify;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;
import static org.apache.iceberg.types.Types.DecimalType;
import static org.apache.iceberg.types.Types.NestedField;

public class IcebergOrcFileWriter
        extends OrcFileWriter
        implements IcebergFileWriter
{
    private final Schema icebergSchema;
    private final List orcColumn;

    public IcebergOrcFileWriter(
            Schema icebergSchema,
            DataSink dataSink,
            Callable rollbackAction,
            OrcEncoding orcEncoding,
            List columnNames,
            List fileColumnTypes,
            List fileColumnOrcTypes,
            CompressionKind compression,
            OrcWriterOptions options,
            int[] fileInputColumnIndexes,
            Map metadata,
            ZoneId hiveStorageTimeZone,
            Optional> validationInputFactory,
            OrcWriteValidation.OrcWriteValidationMode validationMode,
            NoOpOrcWriterStats stats,
            DwrfEncryptionProvider dwrfEncryptionProvider,
            Optional dwrfWriterEncryption)
    {
        super(dataSink, rollbackAction, orcEncoding, columnNames, fileColumnTypes, Optional.ofNullable(fileColumnOrcTypes), compression, options, fileInputColumnIndexes, metadata, hiveStorageTimeZone, validationInputFactory, validationMode, stats, dwrfEncryptionProvider, dwrfWriterEncryption);
        this.icebergSchema = requireNonNull(icebergSchema, "icebergSchema is null");
        this.orcColumn = fileColumnOrcTypes;
    }

    @Override
    public Metrics getMetrics()
    {
        return computeMetrics(icebergSchema, orcColumn, orcWriter.getFileRowCount(), orcWriter.getFileStats());
    }

    private static Metrics computeMetrics(Schema icebergSchema, List orcRowTypes, long fileRowCount, List columnStatistics)
    {
        if (columnStatistics.isEmpty()) {
            return new Metrics(fileRowCount, null, null, null, null, null, null);
        }
        // Columns that are descendants of LIST or MAP types are excluded because:
        // 1. Their stats are not used by Apache Iceberg to filter out data files
        // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
        // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
        Set excludedColumns = getExcludedColumns(orcRowTypes);

        ImmutableMap.Builder valueCountsBuilder = ImmutableMap.builder();
        ImmutableMap.Builder nullCountsBuilder = ImmutableMap.builder();
        ImmutableMap.Builder lowerBoundsBuilder = ImmutableMap.builder();
        ImmutableMap.Builder upperBoundsBuilder = ImmutableMap.builder();

        // OrcColumnId(0) is the root column that represents file-level schema
        for (int i = 1; i < orcRowTypes.size(); i++) {
            if (excludedColumns.contains(i)) {
                continue;
            }
            OrcType orcColumn = orcRowTypes.get(i);
            ColumnStatistics orcColumnStats = columnStatistics.get(i);
            int icebergId = getIcebergId(orcColumn);
            NestedField icebergField = icebergSchema.findField(icebergId);
            verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
            valueCountsBuilder.put(icebergId, fileRowCount);
            if (orcColumnStats.hasNumberOfValues()) {
                nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
            }
            toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
                lowerBoundsBuilder.put(icebergId, minMax.getMin());
                upperBoundsBuilder.put(icebergId, minMax.getMax());
            });
        }
        Map valueCounts = valueCountsBuilder.build();
        Map nullCounts = nullCountsBuilder.build();
        Map lowerBounds = lowerBoundsBuilder.build();
        Map upperBounds = upperBoundsBuilder.build();
        return new Metrics(
                fileRowCount,
                null, // TODO: Add column size accounting to ORC column writers
                valueCounts.isEmpty() ? null : valueCounts,
                nullCounts.isEmpty() ? null : nullCounts,
                null,
                lowerBounds.isEmpty() ? null : lowerBounds,
                upperBounds.isEmpty() ? null : upperBounds);
    }

    private static Set getExcludedColumns(List orcRowTypes)
    {
        ImmutableSet.Builder excludedColumns = ImmutableSet.builder();
        populateExcludedColumns(orcRowTypes, 0, false, excludedColumns);
        return excludedColumns.build();
    }

    private static void populateExcludedColumns(List orcRowTypes, int orcColumnId, boolean exclude, ImmutableSet.Builder excludedColumns)
    {
        if (exclude) {
            excludedColumns.add(orcColumnId);
        }
        OrcType orcColumn = orcRowTypes.get(orcColumnId);
        switch (orcColumn.getOrcTypeKind()) {
            case LIST:
            case MAP:
                for (Integer child : orcColumn.getFieldTypeIndexes()) {
                    populateExcludedColumns(orcRowTypes, child, true, excludedColumns);
                }
                return;
            case STRUCT:
                for (Integer child : orcColumn.getFieldTypeIndexes()) {
                    populateExcludedColumns(orcRowTypes, child, exclude, excludedColumns);
                }
                return;
        }
    }

    private static int getIcebergId(OrcType orcColumn)
    {
        String icebergId = orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY);
        verify(icebergId != null, "ORC column %s doesn't have an associated Iceberg ID", orcColumn);
        return Integer.parseInt(icebergId);
    }

    private static Optional toIcebergMinMax(ColumnStatistics orcColumnStats, org.apache.iceberg.types.Type icebergType)
    {
        IntegerStatistics integerStatistics = orcColumnStats.getIntegerStatistics();
        if (integerStatistics != null) {
            Object min = integerStatistics.getMin();
            Object max = integerStatistics.getMax();
            if (min == null || max == null) {
                return Optional.empty();
            }
            if (icebergType.typeId() == org.apache.iceberg.types.Type.TypeID.INTEGER) {
                min = toIntExact((Long) min);
                max = toIntExact((Long) max);
            }
            return Optional.of(new IcebergMinMax(icebergType, min, max));
        }
        DoubleStatistics doubleStatistics = orcColumnStats.getDoubleStatistics();
        if (doubleStatistics != null) {
            Object min = doubleStatistics.getMin();
            Object max = doubleStatistics.getMax();
            if (min == null || max == null) {
                return Optional.empty();
            }
            if (icebergType.typeId() == org.apache.iceberg.types.Type.TypeID.FLOAT) {
                min = ((Double) min).floatValue();
                max = ((Double) max).floatValue();
            }
            return Optional.of(new IcebergMinMax(icebergType, min, max));
        }
        StringStatistics stringStatistics = orcColumnStats.getStringStatistics();
        if (stringStatistics != null) {
            Slice min = stringStatistics.getMin();
            Slice max = stringStatistics.getMax();
            if (min == null || max == null) {
                return Optional.empty();
            }
            return Optional.of(new IcebergMinMax(icebergType, min.toStringUtf8(), max.toStringUtf8()));
        }
        DateStatistics dateStatistics = orcColumnStats.getDateStatistics();
        if (dateStatistics != null) {
            Integer min = dateStatistics.getMin();
            Integer max = dateStatistics.getMax();
            if (min == null || max == null) {
                return Optional.empty();
            }
            return Optional.of(new IcebergMinMax(icebergType, min, max));
        }
        DecimalStatistics decimalStatistics = orcColumnStats.getDecimalStatistics();
        if (decimalStatistics != null) {
            BigDecimal min = decimalStatistics.getMin();
            BigDecimal max = decimalStatistics.getMax();
            if (min == null || max == null) {
                return Optional.empty();
            }
            min = min.setScale(((DecimalType) icebergType).scale());
            max = max.setScale(((DecimalType) icebergType).scale());
            return Optional.of(new IcebergMinMax(icebergType, min, max));
        }
        return Optional.empty();
    }

    private static class IcebergMinMax
    {
        private ByteBuffer min;
        private ByteBuffer max;

        private IcebergMinMax(org.apache.iceberg.types.Type type, Object min, Object max)
        {
            this.min = Conversions.toByteBuffer(type, min);
            this.max = Conversions.toByteBuffer(type, max);
        }

        public ByteBuffer getMin()
        {
            return min;
        }

        public ByteBuffer getMax()
        {
            return max;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy