/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.iceberg;

import com.facebook.presto.common.io.DataSink;
import com.facebook.presto.common.io.OutputStreamDataSink;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.common.type.TypeManager;
import com.facebook.presto.hive.FileFormatDataSourceStats;
import com.facebook.presto.hive.HdfsContext;
import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.hive.HiveDwrfEncryptionProvider;
import com.facebook.presto.hive.NodeVersion;
import com.facebook.presto.hive.OrcFileWriterConfig;
import com.facebook.presto.hive.orc.HdfsOrcDataSource;
import com.facebook.presto.orc.DefaultOrcWriterFlushPolicy;
import com.facebook.presto.orc.DwrfEncryptionProvider;
import com.facebook.presto.orc.NoOpOrcWriterStats;
import com.facebook.presto.orc.OrcDataSource;
import com.facebook.presto.orc.OrcDataSourceId;
import com.facebook.presto.parquet.writer.ParquetWriterOptions;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.PrestoException;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import javax.inject.Inject;

import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.Callable;
import java.util.function.Supplier;
import java.util.stream.IntStream;

import static com.facebook.presto.hive.HiveCommonSessionProperties.getOrcMaxBufferSize;
import static com.facebook.presto.hive.HiveCommonSessionProperties.getOrcMaxMergeDistance;
import static com.facebook.presto.hive.HiveCommonSessionProperties.getOrcOptimizedWriterValidateMode;
import static com.facebook.presto.hive.HiveCommonSessionProperties.getOrcStreamBufferSize;
import static com.facebook.presto.hive.HiveCommonSessionProperties.isOrcOptimizedWriterValidate;
import static com.facebook.presto.hive.metastore.MetastoreUtil.PRESTO_QUERY_ID_NAME;
import static com.facebook.presto.hive.metastore.MetastoreUtil.PRESTO_VERSION_NAME;
import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR;
import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getCompressionCodec;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getOrcOptimizedWriterMaxStripeRows;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getOrcOptimizedWriterMaxStripeSize;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getOrcOptimizedWriterMinStripeSize;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStringStatisticsLimit;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getParquetWriterBlockSize;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getParquetWriterPageSize;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getParquetWriterVersion;
import static com.facebook.presto.iceberg.TypeConverter.toOrcType;
import static com.facebook.presto.iceberg.TypeConverter.toPrestoType;
import static com.facebook.presto.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap;
import static com.facebook.presto.orc.NoOpOrcWriterStats.NOOP_WRITER_STATS;
import static com.facebook.presto.orc.OrcEncoding.ORC;
import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.time.ZoneOffset.UTC;
import static java.util.Objects.requireNonNull;
import static org.apache.iceberg.parquet.ParquetSchemaUtil.convert;

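/**
 * Factory for {@link IcebergFileWriter} instances that write Iceberg data files.
 * Supports the PARQUET and ORC file formats; writer behavior is configured from
 * Iceberg and Hive session properties.
 */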
public class IcebergFileWriterFactory
{
    private final HdfsEnvironment hdfsEnvironment;
    private final TypeManager typeManager;
    private final FileFormatDataSourceStats readStats;
    private final NodeVersion nodeVersion;
    private final NoOpOrcWriterStats orcWriterStats = NOOP_WRITER_STATS;
    private final OrcFileWriterConfig orcFileWriterConfig;
    private final DwrfEncryptionProvider dwrfEncryptionProvider;

    @Inject
    public IcebergFileWriterFactory(
            HdfsEnvironment hdfsEnvironment,
            TypeManager typeManager,
            FileFormatDataSourceStats readStats,
            NodeVersion nodeVersion,
            OrcFileWriterConfig orcFileWriterConfig,
            HiveDwrfEncryptionProvider dwrfEncryptionProvider)
    {
        this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
        this.typeManager = requireNonNull(typeManager, "typeManager is null");
        this.readStats = requireNonNull(readStats, "readStats is null");
        this.nodeVersion = requireNonNull(nodeVersion, "nodeVersion is null");
        this.orcFileWriterConfig = requireNonNull(orcFileWriterConfig, "orcFileWriterConfig is null");
        this.dwrfEncryptionProvider = requireNonNull(dwrfEncryptionProvider, "DwrfEncryptionProvider is null").toDwrfEncryptionProvider();
    }

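    /**
     * Creates a writer for the given output path, dispatching on the requested
     * {@link FileFormat}. Formats other than PARQUET and ORC are rejected.
     */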
    public IcebergFileWriter createFileWriter(
            Path outputPath,
            Schema icebergSchema,
            JobConf jobConf,
            ConnectorSession session,
            HdfsContext hdfsContext,
            FileFormat fileFormat,
            MetricsConfig metricsConfig)
    {
        switch (fileFormat) {
            case PARQUET:
                return createParquetWriter(outputPath, icebergSchema, jobConf, session, hdfsContext, metricsConfig);
            case ORC:
                return createOrcWriter(outputPath, icebergSchema, jobConf, session);
        }
        throw new PrestoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
    }

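    // Builds a Parquet writer; column names and Presto types are derived from the Iceberg schema.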
    private IcebergFileWriter createParquetWriter(
            Path outputPath,
            Schema icebergSchema,
            JobConf jobConf,
            ConnectorSession session,
            HdfsContext hdfsContext,
            MetricsConfig metricsConfig)
    {
        List<String> fileColumnNames = icebergSchema.columns().stream()
                .map(Types.NestedField::name)
                .collect(toImmutableList());
        List<Type> fileColumnTypes = icebergSchema.columns().stream()
                .map(column -> toPrestoType(column.type(), typeManager))
                .collect(toImmutableList());

        try {
            FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), outputPath, jobConf);

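            // On rollback, delete the partially written file (non-recursive delete).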
            Callable<Void> rollbackAction = () -> {
                fileSystem.delete(outputPath, false);
                return null;
            };

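            // Parquet page size, block size, and writer version are taken from the session.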
            ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
                    .setMaxPageSize(getParquetWriterPageSize(session))
                    .setMaxBlockSize(getParquetWriterBlockSize(session))
                    .setWriterVersion(getParquetWriterVersion(session))
                    .build();

            return new IcebergParquetFileWriter(
                    hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.create(outputPath)),
                    rollbackAction,
                    fileColumnNames,
                    fileColumnTypes,
                    convert(icebergSchema, "table"),
                    makeTypeMap(fileColumnTypes, fileColumnNames),
                    parquetWriterOptions,
                    IntStream.range(0, fileColumnNames.size()).toArray(),
                    getCompressionCodec(session).getParquetCompressionCodec().get(),
                    outputPath,
                    hdfsEnvironment,
                    hdfsContext,
                    metricsConfig);
        }
        catch (IOException e) {
            throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
        }
    }

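    // Builds an ORC writer; stripe sizing, dictionary memory, and string statistics
    // limits come from Iceberg session properties.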
    private IcebergFileWriter createOrcWriter(
            Path outputPath,
            Schema icebergSchema,
            JobConf jobConf,
            ConnectorSession session)
    {
        try {
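            // File system access goes through the HDFS environment and runs as the session user.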
            FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), outputPath, jobConf);
            DataSink orcDataSink = hdfsEnvironment.doAs(session.getUser(), () -> new OutputStreamDataSink(fileSystem.create(outputPath)));
            Callable<Void> rollbackAction = () -> {
                hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.delete(outputPath, false));
                return null;
            };

            List<Types.NestedField> columnFields = icebergSchema.columns();
            List<String> fileColumnNames = columnFields.stream()
                    .map(Types.NestedField::name)
                    .collect(toImmutableList());
            List<Type> fileColumnTypes = columnFields.stream()
                    .map(Types.NestedField::type)
                    .map(type -> toPrestoType(type, typeManager))
                    .collect(toImmutableList());

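            // When ORC writer validation is enabled, supply a factory that re-opens the
            // written file so the writer can verify its contents after closing.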
            Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
            if (isOrcOptimizedWriterValidate(session)) {
                validationInputFactory = Optional.of(() -> {
                    try {
                        return new HdfsOrcDataSource(
                                new OrcDataSourceId(outputPath.toString()),
                                hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.getFileStatus(outputPath).getLen()),
                                getOrcMaxMergeDistance(session),
                                getOrcMaxBufferSize(session),
                                getOrcStreamBufferSize(session),
                                false,
                                hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.open(outputPath)),
                                readStats);
                    }
                    catch (IOException e) {
                        throw new PrestoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
                    }
                });
            }

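            // File metadata records the Presto version and query id; the writer is
            // configured with the UTC time zone.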
            return new IcebergOrcFileWriter(
                    icebergSchema,
                    orcDataSink,
                    rollbackAction,
                    ORC,
                    fileColumnNames,
                    fileColumnTypes,
                    toOrcType(icebergSchema),
                    getCompressionCodec(session).getOrcCompressionKind(),
                    orcFileWriterConfig
                            .toOrcWriterOptionsBuilder()
                            .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                    .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                                    .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                                    .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                                    .build())
                            .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                            .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session))
                            .build(),
                    IntStream.range(0, fileColumnNames.size()).toArray(),
                    ImmutableMap.<String, String>builder()
                            .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                            .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                            .build(),
                    UTC,
                    validationInputFactory,
                    getOrcOptimizedWriterValidateMode(session),
                    orcWriterStats,
                    dwrfEncryptionProvider,
                    Optional.empty());
        }
        catch (IOException e) {
            throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
        }
    }
}