/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.parquet;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Files;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.avro.AvroSchemaUtil;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.deletes.PositionDeleteWriter;
import org.apache.iceberg.encryption.EncryptionKeyMetadata;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.hadoop.HadoopOutputFile;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.DeleteSchemaUtil;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.parquet.ParquetValueWriters.PositionDeleteStructWriter;
import org.apache.iceberg.parquet.ParquetValueWriters.StructWriter;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.util.ArrayUtil;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroWriteSupport;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;

import static org.apache.iceberg.TableProperties.DELETE_PARQUET_COMPRESSION;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_COMPRESSION_LEVEL;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_DICT_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX;
import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL;
import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES;
import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT;

public class Parquet {
  private Parquet() {
  }

  private static final Collection<String> READ_PROPERTIES_TO_REMOVE = Sets.newHashSet(
      "parquet.read.filter", "parquet.private.read.filter.predicate", "parquet.read.support.class");

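  /**
   * Returns a builder for writing Parquet data to {@code file}.
   *
   * <p>A minimal usage sketch, assuming a caller-provided {@link OutputFile} {@code out}, an
   * Iceberg {@link Schema} {@code schema}, and the generic writer function from the
   * {@code iceberg-data} module (shown for illustration only):
   *
   * <pre>{@code
   * FileAppender<Record> appender = Parquet.write(out)
   *     .schema(schema)
   *     .createWriterFunc(GenericParquetWriter::buildWriter)
   *     .overwrite()
   *     .build();
   * }</pre>
   *
   * @param file the destination {@link OutputFile}
   * @return a {@link WriteBuilder} for the file
   */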
  public static WriteBuilder write(OutputFile file) {
    return new WriteBuilder(file);
  }

  public static class WriteBuilder {
    private final OutputFile file;
    private final Configuration conf;
    private final Map<String, String> metadata = Maps.newLinkedHashMap();
    private final Map<String, String> config = Maps.newLinkedHashMap();
    private Schema schema = null;
    private String name = "table";
    private WriteSupport<?> writeSupport = null;
    private Function<MessageType, ParquetValueWriter<?>> createWriterFunc = null;
    private MetricsConfig metricsConfig = MetricsConfig.getDefault();
    private ParquetFileWriter.Mode writeMode = ParquetFileWriter.Mode.CREATE;
    private WriterVersion writerVersion = WriterVersion.PARQUET_1_0;
    private Function<Map<String, String>, Context> createContextFunc = Context::dataContext;

    private WriteBuilder(OutputFile file) {
      this.file = file;
      if (file instanceof HadoopOutputFile) {
        this.conf = new Configuration(((HadoopOutputFile) file).getConf());
      } else {
        this.conf = new Configuration();
      }
    }

    public WriteBuilder forTable(Table table) {
      schema(table.schema());
      setAll(table.properties());
      metricsConfig(MetricsConfig.forTable(table));
      return this;
    }

    public WriteBuilder schema(Schema newSchema) {
      this.schema = newSchema;
      return this;
    }

    public WriteBuilder named(String newName) {
      this.name = newName;
      return this;
    }

    public WriteBuilder writeSupport(WriteSupport<?> newWriteSupport) {
      this.writeSupport = newWriteSupport;
      return this;
    }

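    /**
     * Sets a writer configuration property, typically an Iceberg table property that this builder
     * maps down to the underlying Parquet writer.
     *
     * <p>A hedged example, enabling a column bloom filter (the column name {@code id} is
     * illustrative):
     *
     * <pre>{@code
     * builder.set("write.parquet.bloom-filter-enabled.column.id", "true");
     * }</pre>
     *
     * @param property a property name
     * @param value a property value
     * @return this builder for method chaining
     */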
    public WriteBuilder set(String property, String value) {
      config.put(property, value);
      return this;
    }

    public WriteBuilder setAll(Map<String, String> properties) {
      config.putAll(properties);
      return this;
    }

    public WriteBuilder meta(String property, String value) {
      metadata.put(property, value);
      return this;
    }

    public WriteBuilder createWriterFunc(Function<MessageType, ParquetValueWriter<?>> newCreateWriterFunc) {
      this.createWriterFunc = newCreateWriterFunc;
      return this;
    }

    public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
      this.metricsConfig = newMetricsConfig;
      return this;
    }

    public WriteBuilder overwrite() {
      return overwrite(true);
    }

    public WriteBuilder overwrite(boolean enabled) {
      this.writeMode = enabled ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE;
      return this;
    }

    public WriteBuilder writerVersion(WriterVersion version) {
      this.writerVersion = version;
      return this;
    }

    @SuppressWarnings("unchecked")
    private <T> WriteSupport<T> getWriteSupport(MessageType type) {
      if (writeSupport != null) {
        return (WriteSupport<T>) writeSupport;
      } else {
        return new AvroWriteSupport<>(
            type,
            ParquetAvro.parquetAvroSchema(AvroSchemaUtil.convert(schema, name)),
            ParquetAvro.DEFAULT_MODEL);
      }
    }

    /*
     * Sets the writer version. Default value is PARQUET_1_0 (v1).
     */
    @VisibleForTesting
    WriteBuilder withWriterVersion(WriterVersion version) {
      this.writerVersion = version;
      return this;
    }

    // supposed to always be a private method used strictly by data and delete write builders
    private WriteBuilder createContextFunc(Function<Map<String, String>, Context> newCreateContextFunc) {
      this.createContextFunc = newCreateContextFunc;
      return this;
    }

    public <D> FileAppender<D> build() throws IOException {
      Preconditions.checkNotNull(schema, "Schema is required");
      Preconditions.checkNotNull(name, "Table name is required and cannot be null");

      // add the Iceberg schema to keyValueMetadata
      meta("iceberg.schema", SchemaParser.toJson(schema));

      // Map Iceberg properties to pass down to the Parquet writer
      Context context = createContextFunc.apply(config);

      int rowGroupSize = context.rowGroupSize();
      int pageSize = context.pageSize();
      int dictionaryPageSize = context.dictionaryPageSize();
      String compressionLevel = context.compressionLevel();
      CompressionCodecName codec = context.codec();
      int rowGroupCheckMinRecordCount = context.rowGroupCheckMinRecordCount();
      int rowGroupCheckMaxRecordCount = context.rowGroupCheckMaxRecordCount();
      int bloomFilterMaxBytes = context.bloomFilterMaxBytes();
      Map<String, String> columnBloomFilterEnabled = context.columnBloomFilterEnabled();

      if (compressionLevel != null) {
        switch (codec) {
          case GZIP:
            config.put("zlib.compress.level", compressionLevel);
            break;
          case BROTLI:
            config.put("compression.brotli.quality", compressionLevel);
            break;
          case ZSTD:
            // keep "io.compression.codec.zstd.level" for backwards compatibility
            config.put("io.compression.codec.zstd.level", compressionLevel);
            config.put("parquet.compression.codec.zstd.level", compressionLevel);
            break;
          default:
            // compression level is not supported; ignore it
        }
      }

      set("parquet.avro.write-old-list-structure", "false");
      MessageType type = ParquetSchemaUtil.convert(schema, name);

      if (createWriterFunc != null) {
        Preconditions.checkArgument(writeSupport == null,
            "Cannot write with both write support and Parquet value writer");

        for (Map.Entry<String, String> entry : config.entrySet()) {
          conf.set(entry.getKey(), entry.getValue());
        }

        ParquetProperties.Builder propsBuilder = ParquetProperties.builder()
            .withWriterVersion(writerVersion)
            .withPageSize(pageSize)
            .withDictionaryPageSize(dictionaryPageSize)
            .withMinRowCountForPageSizeCheck(rowGroupCheckMinRecordCount)
            .withMaxRowCountForPageSizeCheck(rowGroupCheckMaxRecordCount)
            .withMaxBloomFilterBytes(bloomFilterMaxBytes);

        for (Map.Entry<String, String> entry : columnBloomFilterEnabled.entrySet()) {
          String colPath = entry.getKey();
          String bloomEnabled = entry.getValue();
          propsBuilder.withBloomFilterEnabled(colPath, Boolean.valueOf(bloomEnabled));
        }

        ParquetProperties parquetProperties = propsBuilder.build();

        return new org.apache.iceberg.parquet.ParquetWriter<>(
            conf, file, schema, rowGroupSize, metadata, createWriterFunc, codec,
            parquetProperties, metricsConfig, writeMode);
      } else {
        ParquetWriteBuilder<D> parquetWriteBuilder = new ParquetWriteBuilder<D>(ParquetIO.file(file))
            .withWriterVersion(writerVersion)
            .setType(type)
            .setConfig(config)
            .setKeyValueMetadata(metadata)
            .setWriteSupport(getWriteSupport(type))
            .withCompressionCodec(codec)
            .withWriteMode(writeMode)
            .withRowGroupSize(rowGroupSize)
            .withPageSize(pageSize)
            .withDictionaryPageSize(dictionaryPageSize);

        for (Map.Entry<String, String> entry : columnBloomFilterEnabled.entrySet()) {
          String colPath = entry.getKey();
          String bloomEnabled = entry.getValue();
          parquetWriteBuilder.withBloomFilterEnabled(colPath, Boolean.valueOf(bloomEnabled));
        }

        return new ParquetWriteAdapter<>(parquetWriteBuilder.build(), metricsConfig);
      }
    }

    private static class Context {
      private final int rowGroupSize;
      private final int pageSize;
      private final int dictionaryPageSize;
      private final CompressionCodecName codec;
      private final String compressionLevel;
      private final int rowGroupCheckMinRecordCount;
      private final int rowGroupCheckMaxRecordCount;
      private final int bloomFilterMaxBytes;
      private final Map<String, String> columnBloomFilterEnabled;

      private Context(int rowGroupSize, int pageSize, int dictionaryPageSize,
                      CompressionCodecName codec, String compressionLevel,
                      int rowGroupCheckMinRecordCount, int rowGroupCheckMaxRecordCount,
                      int bloomFilterMaxBytes,
                      Map<String, String> columnBloomFilterEnabled) {
        this.rowGroupSize = rowGroupSize;
        this.pageSize = pageSize;
        this.dictionaryPageSize = dictionaryPageSize;
        this.codec = codec;
        this.compressionLevel = compressionLevel;
        this.rowGroupCheckMinRecordCount = rowGroupCheckMinRecordCount;
        this.rowGroupCheckMaxRecordCount = rowGroupCheckMaxRecordCount;
        this.bloomFilterMaxBytes = bloomFilterMaxBytes;
        this.columnBloomFilterEnabled = columnBloomFilterEnabled;
      }

      static Context dataContext(Map<String, String> config) {
        int rowGroupSize = PropertyUtil.propertyAsInt(config,
            PARQUET_ROW_GROUP_SIZE_BYTES, PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT);
        Preconditions.checkArgument(rowGroupSize > 0, "Row group size must be > 0");

        int pageSize = PropertyUtil.propertyAsInt(config,
            PARQUET_PAGE_SIZE_BYTES, PARQUET_PAGE_SIZE_BYTES_DEFAULT);
        Preconditions.checkArgument(pageSize > 0, "Page size must be > 0");

        int dictionaryPageSize = PropertyUtil.propertyAsInt(config,
            PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT);
        Preconditions.checkArgument(dictionaryPageSize > 0, "Dictionary page size must be > 0");

        String codecAsString = config.getOrDefault(PARQUET_COMPRESSION, PARQUET_COMPRESSION_DEFAULT);
        CompressionCodecName codec = toCodec(codecAsString);

        String compressionLevel = config.getOrDefault(PARQUET_COMPRESSION_LEVEL, PARQUET_COMPRESSION_LEVEL_DEFAULT);

        int rowGroupCheckMinRecordCount = PropertyUtil.propertyAsInt(config,
            PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT);
        Preconditions.checkArgument(rowGroupCheckMinRecordCount > 0,
            "Row group check minimal record count must be > 0");

        int rowGroupCheckMaxRecordCount = PropertyUtil.propertyAsInt(config,
            PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT);
        Preconditions.checkArgument(rowGroupCheckMaxRecordCount > 0,
            "Row group check maximum record count must be > 0");
        Preconditions.checkArgument(rowGroupCheckMaxRecordCount >= rowGroupCheckMinRecordCount,
            "Row group check maximum record count must be >= minimal record count");

        int bloomFilterMaxBytes = PropertyUtil.propertyAsInt(config, PARQUET_BLOOM_FILTER_MAX_BYTES,
            PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT);
        Preconditions.checkArgument(bloomFilterMaxBytes > 0, "Bloom filter max bytes must be > 0");

        Map<String, String> columnBloomFilterEnabled =
            PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX);

        return new Context(rowGroupSize, pageSize, dictionaryPageSize, codec, compressionLevel,
            rowGroupCheckMinRecordCount, rowGroupCheckMaxRecordCount, bloomFilterMaxBytes,
            columnBloomFilterEnabled);
      }

      static Context deleteContext(Map<String, String> config) {
        // default delete config using data config
        Context dataContext = dataContext(config);

        int rowGroupSize = PropertyUtil.propertyAsInt(config,
            DELETE_PARQUET_ROW_GROUP_SIZE_BYTES, dataContext.rowGroupSize());
        Preconditions.checkArgument(rowGroupSize > 0, "Row group size must be > 0");

        int pageSize = PropertyUtil.propertyAsInt(config,
            DELETE_PARQUET_PAGE_SIZE_BYTES, dataContext.pageSize());
        Preconditions.checkArgument(pageSize > 0, "Page size must be > 0");

        int dictionaryPageSize = PropertyUtil.propertyAsInt(config,
            DELETE_PARQUET_DICT_SIZE_BYTES, dataContext.dictionaryPageSize());
        Preconditions.checkArgument(dictionaryPageSize > 0, "Dictionary page size must be > 0");

        String codecAsString = config.get(DELETE_PARQUET_COMPRESSION);
        CompressionCodecName codec = codecAsString != null ? toCodec(codecAsString) : dataContext.codec();

        String compressionLevel = config.getOrDefault(DELETE_PARQUET_COMPRESSION_LEVEL, dataContext.compressionLevel());

        int rowGroupCheckMinRecordCount = PropertyUtil.propertyAsInt(config,
            DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, dataContext.rowGroupCheckMinRecordCount());
        Preconditions.checkArgument(rowGroupCheckMinRecordCount > 0,
            "Row group check minimal record count must be > 0");

        int rowGroupCheckMaxRecordCount = PropertyUtil.propertyAsInt(config,
            DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, dataContext.rowGroupCheckMaxRecordCount());
        Preconditions.checkArgument(rowGroupCheckMaxRecordCount > 0,
            "Row group check maximum record count must be > 0");
        Preconditions.checkArgument(rowGroupCheckMaxRecordCount >= rowGroupCheckMinRecordCount,
            "Row group check maximum record count must be >= minimal record count");

        int bloomFilterMaxBytes = PropertyUtil.propertyAsInt(config, PARQUET_BLOOM_FILTER_MAX_BYTES,
            PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT);
        Preconditions.checkArgument(bloomFilterMaxBytes > 0, "Bloom filter max bytes must be > 0");

        Map<String, String> columnBloomFilterEnabled =
            PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX);

        return new Context(rowGroupSize, pageSize, dictionaryPageSize, codec, compressionLevel,
            rowGroupCheckMinRecordCount, rowGroupCheckMaxRecordCount, bloomFilterMaxBytes,
            columnBloomFilterEnabled);
      }

      private static CompressionCodecName toCodec(String codecAsString) {
        try {
          return CompressionCodecName.valueOf(codecAsString.toUpperCase(Locale.ENGLISH));
        } catch (IllegalArgumentException e) {
          throw new IllegalArgumentException("Unsupported compression codec: " + codecAsString);
        }
      }

      int rowGroupSize() {
        return rowGroupSize;
      }

      int pageSize() {
        return pageSize;
      }

      int dictionaryPageSize() {
        return dictionaryPageSize;
      }

      CompressionCodecName codec() {
        return codec;
      }

      String compressionLevel() {
        return compressionLevel;
      }

      int rowGroupCheckMinRecordCount() {
        return rowGroupCheckMinRecordCount;
      }

      int rowGroupCheckMaxRecordCount() {
        return rowGroupCheckMaxRecordCount;
      }

      int bloomFilterMaxBytes() {
        return bloomFilterMaxBytes;
      }

      Map<String, String> columnBloomFilterEnabled() {
        return columnBloomFilterEnabled;
      }
    }
  }

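  /**
   * Returns a builder for writing Iceberg data files to {@code file}.
   *
   * <p>A usage sketch, assuming an unpartitioned {@link Table} {@code table}, a caller-provided
   * {@link OutputFile} {@code out}, and the generic writer function from {@code iceberg-data}
   * (illustrative only; partitioned tables also need {@code withPartition(...)}):
   *
   * <pre>{@code
   * DataWriter<Record> writer = Parquet.writeData(out)
   *     .forTable(table)
   *     .createWriterFunc(GenericParquetWriter::buildWriter)
   *     .overwrite()
   *     .build();
   * }</pre>
   *
   * @param file the destination {@link OutputFile}
   * @return a {@link DataWriteBuilder} for the file
   */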
  public static DataWriteBuilder writeData(OutputFile file) {
    return new DataWriteBuilder(file);
  }

  public static class DataWriteBuilder {
    private final WriteBuilder appenderBuilder;
    private final String location;
    private PartitionSpec spec = null;
    private StructLike partition = null;
    private EncryptionKeyMetadata keyMetadata = null;
    private SortOrder sortOrder = null;

    private DataWriteBuilder(OutputFile file) {
      this.appenderBuilder = write(file);
      this.location = file.location();
    }

    public DataWriteBuilder forTable(Table table) {
      schema(table.schema());
      withSpec(table.spec());
      setAll(table.properties());
      metricsConfig(MetricsConfig.forTable(table));
      return this;
    }

    public DataWriteBuilder schema(Schema newSchema) {
      appenderBuilder.schema(newSchema);
      return this;
    }

    public DataWriteBuilder set(String property, String value) {
      appenderBuilder.set(property, value);
      return this;
    }

    public DataWriteBuilder setAll(Map properties) {
      appenderBuilder.setAll(properties);
      return this;
    }

    public DataWriteBuilder meta(String property, String value) {
      appenderBuilder.meta(property, value);
      return this;
    }

    public DataWriteBuilder overwrite() {
      return overwrite(true);
    }

    public DataWriteBuilder overwrite(boolean enabled) {
      appenderBuilder.overwrite(enabled);
      return this;
    }

    public DataWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
      appenderBuilder.metricsConfig(newMetricsConfig);
      return this;
    }

    public DataWriteBuilder createWriterFunc(Function<MessageType, ParquetValueWriter<?>> newCreateWriterFunc) {
      appenderBuilder.createWriterFunc(newCreateWriterFunc);
      return this;
    }

    public DataWriteBuilder withSpec(PartitionSpec newSpec) {
      this.spec = newSpec;
      return this;
    }

    public DataWriteBuilder withPartition(StructLike newPartition) {
      this.partition = newPartition;
      return this;
    }

    public DataWriteBuilder withKeyMetadata(EncryptionKeyMetadata metadata) {
      this.keyMetadata = metadata;
      return this;
    }

    public DataWriteBuilder withSortOrder(SortOrder newSortOrder) {
      this.sortOrder = newSortOrder;
      return this;
    }

    public <T> DataWriter<T> build() throws IOException {
      Preconditions.checkArgument(spec != null, "Cannot create data writer without spec");
      Preconditions.checkArgument(spec.isUnpartitioned() || partition != null,
          "Partition must not be null when creating data writer for partitioned spec");

      FileAppender<T> fileAppender = appenderBuilder.build();
      return new DataWriter<>(fileAppender, FileFormat.PARQUET, location, spec, partition, keyMetadata, sortOrder);
    }
  }

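  /**
   * Returns a builder for writing equality or position delete files to {@code file}.
   *
   * <p>A minimal position-delete sketch, assuming an unpartitioned spec and a caller-provided
   * {@link OutputFile} {@code out}; no row data is written, so no writer function is needed:
   *
   * <pre>{@code
   * PositionDeleteWriter<Record> posDeletes = Parquet.writeDeletes(out)
   *     .withSpec(PartitionSpec.unpartitioned())
   *     .overwrite()
   *     .buildPositionWriter();
   * }</pre>
   *
   * @param file the destination {@link OutputFile}
   * @return a {@link DeleteWriteBuilder} for the file
   */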
  public static DeleteWriteBuilder writeDeletes(OutputFile file) {
    return new DeleteWriteBuilder(file);
  }

  public static class DeleteWriteBuilder {
    private final WriteBuilder appenderBuilder;
    private final String location;
    private Function<MessageType, ParquetValueWriter<?>> createWriterFunc = null;
    private Schema rowSchema = null;
    private PartitionSpec spec = null;
    private StructLike partition = null;
    private EncryptionKeyMetadata keyMetadata = null;
    private int[] equalityFieldIds = null;
    private SortOrder sortOrder;
    private Function<CharSequence, ?> pathTransformFunc = Function.identity();

    private DeleteWriteBuilder(OutputFile file) {
      this.appenderBuilder = write(file);
      this.location = file.location();
    }

    public DeleteWriteBuilder forTable(Table table) {
      rowSchema(table.schema());
      withSpec(table.spec());
      setAll(table.properties());
      metricsConfig(MetricsConfig.forTable(table));
      return this;
    }

    public DeleteWriteBuilder set(String property, String value) {
      appenderBuilder.set(property, value);
      return this;
    }

    public DeleteWriteBuilder setAll(Map properties) {
      appenderBuilder.setAll(properties);
      return this;
    }

    public DeleteWriteBuilder meta(String property, String value) {
      appenderBuilder.meta(property, value);
      return this;
    }

    public DeleteWriteBuilder overwrite() {
      return overwrite(true);
    }

    public DeleteWriteBuilder overwrite(boolean enabled) {
      appenderBuilder.overwrite(enabled);
      return this;
    }

    public DeleteWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
      appenderBuilder.metricsConfig(newMetricsConfig);
      return this;
    }

    public DeleteWriteBuilder createWriterFunc(Function<MessageType, ParquetValueWriter<?>> newCreateWriterFunc) {
      this.createWriterFunc = newCreateWriterFunc;
      return this;
    }

    public DeleteWriteBuilder rowSchema(Schema newSchema) {
      this.rowSchema = newSchema;
      return this;
    }

    public DeleteWriteBuilder withSpec(PartitionSpec newSpec) {
      this.spec = newSpec;
      return this;
    }

    public DeleteWriteBuilder withPartition(StructLike key) {
      this.partition = key;
      return this;
    }

    public DeleteWriteBuilder withKeyMetadata(EncryptionKeyMetadata metadata) {
      this.keyMetadata = metadata;
      return this;
    }

    public DeleteWriteBuilder equalityFieldIds(List<Integer> fieldIds) {
      this.equalityFieldIds = ArrayUtil.toIntArray(fieldIds);
      return this;
    }

    public DeleteWriteBuilder equalityFieldIds(int... fieldIds) {
      this.equalityFieldIds = fieldIds;
      return this;
    }

    public DeleteWriteBuilder transformPaths(Function<CharSequence, ?> newPathTransformFunc) {
      this.pathTransformFunc = newPathTransformFunc;
      return this;
    }

    public DeleteWriteBuilder withSortOrder(SortOrder newSortOrder) {
      this.sortOrder = newSortOrder;
      return this;
    }

    public <T> EqualityDeleteWriter<T> buildEqualityWriter() throws IOException {
      Preconditions.checkState(rowSchema != null, "Cannot create equality delete file without a schema");
      Preconditions.checkState(equalityFieldIds != null, "Cannot create equality delete file without delete field ids");
      Preconditions.checkState(createWriterFunc != null,
          "Cannot create equality delete file unless createWriterFunc is set");
      Preconditions.checkArgument(spec != null,
          "Spec must not be null when creating equality delete writer");
      Preconditions.checkArgument(spec.isUnpartitioned() || partition != null,
          "Partition must not be null for partitioned writes");

      meta("delete-type", "equality");
      meta("delete-field-ids", IntStream.of(equalityFieldIds)
          .mapToObj(Objects::toString)
          .collect(Collectors.joining(", ")));

      // the appender uses the row schema without extra columns
      appenderBuilder.schema(rowSchema);
      appenderBuilder.createWriterFunc(createWriterFunc);
      appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);

      return new EqualityDeleteWriter<>(
          appenderBuilder.build(), FileFormat.PARQUET, location, spec, partition, keyMetadata,
          sortOrder, equalityFieldIds);
    }

    public <T> PositionDeleteWriter<T> buildPositionWriter() throws IOException {
      Preconditions.checkState(equalityFieldIds == null, "Cannot create position delete file using delete field ids");
      Preconditions.checkArgument(spec != null, "Spec must not be null when creating position delete writer");
      Preconditions.checkArgument(spec.isUnpartitioned() || partition != null,
          "Partition must not be null for partitioned writes");
      Preconditions.checkArgument(rowSchema == null || createWriterFunc != null,
          "Create function should be provided if we write row data");

      meta("delete-type", "position");

      if (rowSchema != null && createWriterFunc != null) {
        // the appender uses the row schema wrapped with position fields
        appenderBuilder.schema(DeleteSchemaUtil.posDeleteSchema(rowSchema));

        appenderBuilder.createWriterFunc(parquetSchema -> {
          ParquetValueWriter<?> writer = createWriterFunc.apply(parquetSchema);
          if (writer instanceof StructWriter) {
            return new PositionDeleteStructWriter<T>((StructWriter<?>) writer, pathTransformFunc);
          } else {
            throw new UnsupportedOperationException("Cannot wrap writer for position deletes: " + writer.getClass());
          }
        });

      } else {
        appenderBuilder.schema(DeleteSchemaUtil.pathPosSchema());

        // We ignore 'createWriterFunc' and 'rowSchema' even if provided, since we do not write row data itself
        appenderBuilder.createWriterFunc(parquetSchema ->
            new PositionDeleteStructWriter<T>((StructWriter<?>) GenericParquetWriter.buildWriter(parquetSchema),
                Function.identity()));
      }

      appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);

      return new PositionDeleteWriter<>(
          appenderBuilder.build(), FileFormat.PARQUET, location, spec, partition, keyMetadata);
    }
  }

  private static class ParquetWriteBuilder<T> extends ParquetWriter.Builder<T, ParquetWriteBuilder<T>> {
    private Map<String, String> keyValueMetadata = Maps.newHashMap();
    private Map<String, String> config = Maps.newHashMap();
    private MessageType type;
    private WriteSupport<T> writeSupport;

    private ParquetWriteBuilder(org.apache.parquet.io.OutputFile path) {
      super(path);
    }

    @Override
    protected ParquetWriteBuilder<T> self() {
      return this;
    }

    public ParquetWriteBuilder<T> setKeyValueMetadata(Map<String, String> keyValueMetadata) {
      this.keyValueMetadata = keyValueMetadata;
      return self();
    }

    public ParquetWriteBuilder<T> setConfig(Map<String, String> config) {
      this.config = config;
      return self();
    }

    public ParquetWriteBuilder<T> setType(MessageType type) {
      this.type = type;
      return self();
    }

    public ParquetWriteBuilder<T> setWriteSupport(WriteSupport<T> writeSupport) {
      this.writeSupport = writeSupport;
      return self();
    }

    @Override
    protected WriteSupport<T> getWriteSupport(Configuration configuration) {
      for (Map.Entry<String, String> entry : config.entrySet()) {
        configuration.set(entry.getKey(), entry.getValue());
      }
      return new ParquetWriteSupport<>(type, keyValueMetadata, writeSupport);
    }
  }

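  /**
   * Returns a builder for reading data from a Parquet {@code file}.
   *
   * <p>A usage sketch, assuming a caller-provided {@link InputFile} {@code in}, a projection
   * {@link Schema} {@code projection}, and the generic reader function from {@code iceberg-data}
   * (illustrative only):
   *
   * <pre>{@code
   * try (CloseableIterable<Record> rows = Parquet.read(in)
   *     .project(projection)
   *     .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(projection, fileSchema))
   *     .build()) {
   *   for (Record record : rows) {
   *     // process each record
   *   }
   * }
   * }</pre>
   *
   * @param file the source {@link InputFile}
   * @return a {@link ReadBuilder} for the file
   */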
  public static ReadBuilder read(InputFile file) {
    return new ReadBuilder(file);
  }

  public static class ReadBuilder {
    private final InputFile file;
    private final Map<String, String> properties = Maps.newHashMap();
    private Long start = null;
    private Long length = null;
    private Schema schema = null;
    private Expression filter = null;
    private ReadSupport<?> readSupport = null;
    private Function<MessageType, VectorizedReader<?>> batchedReaderFunc = null;
    private Function<MessageType, ParquetValueReader<?>> readerFunc = null;
    private boolean filterRecords = true;
    private boolean caseSensitive = true;
    private boolean callInit = false;
    private boolean reuseContainers = false;
    private int maxRecordsPerBatch = 10000;
    private NameMapping nameMapping = null;

    private ReadBuilder(InputFile file) {
      this.file = file;
    }

    /**
     * Restricts the read to the given range: [start, start + length).
     *
     * @param newStart the start position for this read
     * @param newLength the length of the range this read should scan
     * @return this builder for method chaining
     */
    public ReadBuilder split(long newStart, long newLength) {
      this.start = newStart;
      this.length = newLength;
      return this;
    }

    public ReadBuilder project(Schema newSchema) {
      this.schema = newSchema;
      return this;
    }

    public ReadBuilder caseInsensitive() {
      return caseSensitive(false);
    }

    public ReadBuilder caseSensitive(boolean newCaseSensitive) {
      this.caseSensitive = newCaseSensitive;
      return this;
    }

    public ReadBuilder filterRecords(boolean newFilterRecords) {
      this.filterRecords = newFilterRecords;
      return this;
    }

    public ReadBuilder filter(Expression newFilter) {
      this.filter = newFilter;
      return this;
    }

    public ReadBuilder readSupport(ReadSupport<?> newFilterSupport) {
      this.readSupport = newFilterSupport;
      return this;
    }

    public ReadBuilder createReaderFunc(Function<MessageType, ParquetValueReader<?>> newReaderFunction) {
      Preconditions.checkArgument(this.batchedReaderFunc == null,
          "Reader function cannot be set since the batched version is already set");
      this.readerFunc = newReaderFunction;
      return this;
    }

    public ReadBuilder createBatchedReaderFunc(Function<MessageType, VectorizedReader<?>> func) {
      Preconditions.checkArgument(this.readerFunc == null,
          "Batched reader function cannot be set since the non-batched version is already set");
      this.batchedReaderFunc = func;
      return this;
    }

    public ReadBuilder set(String key, String value) {
      properties.put(key, value);
      return this;
    }

    public ReadBuilder callInit() {
      this.callInit = true;
      return this;
    }

    public ReadBuilder reuseContainers() {
      this.reuseContainers = true;
      return this;
    }

    public ReadBuilder recordsPerBatch(int numRowsPerBatch) {
      this.maxRecordsPerBatch = numRowsPerBatch;
      return this;
    }

    public ReadBuilder withNameMapping(NameMapping newNameMapping) {
      this.nameMapping = newNameMapping;
      return this;
    }

    @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity"})
    public <D> CloseableIterable<D> build() {
      if (readerFunc != null || batchedReaderFunc != null) {
        ParquetReadOptions.Builder optionsBuilder;
        if (file instanceof HadoopInputFile) {
          // remove read properties already set that may conflict with this read
          Configuration conf = new Configuration(((HadoopInputFile) file).getConf());
          for (String property : READ_PROPERTIES_TO_REMOVE) {
            conf.unset(property);
          }
          optionsBuilder = HadoopReadOptions.builder(conf);
          // page size not used by decompressors
          optionsBuilder.withCodecFactory(new ParquetCodecFactory(conf, 0));
        } else {
          optionsBuilder = ParquetReadOptions.builder();
          // page size not used by decompressors
          optionsBuilder.withCodecFactory(new ParquetCodecFactory(new Configuration(), 0));
        }

        for (Map.Entry<String, String> entry : properties.entrySet()) {
          optionsBuilder.set(entry.getKey(), entry.getValue());
        }

        if (start != null) {
          optionsBuilder.withRange(start, start + length);
        }

        ParquetReadOptions options = optionsBuilder.build();

        if (batchedReaderFunc != null) {
          return new VectorizedParquetReader<>(file, schema, options, batchedReaderFunc, nameMapping, filter,
              reuseContainers, caseSensitive, maxRecordsPerBatch);
        } else {
          return new org.apache.iceberg.parquet.ParquetReader<>(
              file, schema, options, readerFunc, nameMapping, filter, reuseContainers, caseSensitive);
        }
      }

      ParquetReadBuilder<D> builder = new ParquetReadBuilder<>(ParquetIO.file(file));

      builder.project(schema);

      if (readSupport != null) {
        builder.readSupport((ReadSupport<D>) readSupport);
      } else {
        builder.readSupport(new AvroReadSupport<>(ParquetAvro.DEFAULT_MODEL));
      }

      // default options for readers
      builder.set("parquet.strict.typing", "false") // allow type promotion
          .set("parquet.avro.compatible", "false") // use the new RecordReader with Utf8 support
          .set("parquet.avro.add-list-element-records", "false"); // assume that lists use a 3-level schema

      for (Map.Entry<String, String> entry : properties.entrySet()) {
        builder.set(entry.getKey(), entry.getValue());
      }

      if (filter != null) {
        // TODO: should not need to get the schema to push down before opening the file.
        // Parquet should allow setting a filter inside its read support
        MessageType type;
        try (ParquetFileReader schemaReader = ParquetFileReader.open(ParquetIO.file(file))) {
          type = schemaReader.getFileMetaData().getSchema();
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
        Schema fileSchema = ParquetSchemaUtil.convert(type);
        builder.useStatsFilter()
            .useDictionaryFilter()
            .useRecordFilter(filterRecords)
            .useBloomFilter()
            .withFilter(ParquetFilters.convert(fileSchema, filter, caseSensitive));
      } else {
        // turn off filtering
        builder.useStatsFilter(false)
            .useDictionaryFilter(false)
            .useBloomFilter(false)
            .useRecordFilter(false);
      }

      if (callInit) {
        builder.callInit();
      }

      if (start != null) {
        builder.withFileRange(start, start + length);
      }

      if (nameMapping != null) {
        builder.withNameMapping(nameMapping);
      }

      return new ParquetIterable<>(builder);
    }
  }

  private static class ParquetReadBuilder<T> extends ParquetReader.Builder<T> {
    private Schema schema = null;
    private ReadSupport<T> readSupport = null;
    private boolean callInit = false;
    private NameMapping nameMapping = null;

    private ParquetReadBuilder(org.apache.parquet.io.InputFile file) {
      super(file);
    }

    public ParquetReadBuilder<T> project(Schema newSchema) {
      this.schema = newSchema;
      return this;
    }

    public ParquetReadBuilder<T> withNameMapping(NameMapping newNameMapping) {
      this.nameMapping = newNameMapping;
      return this;
    }

    public ParquetReadBuilder<T> readSupport(ReadSupport<T> newReadSupport) {
      this.readSupport = newReadSupport;
      return this;
    }

    public ParquetReadBuilder<T> callInit() {
      this.callInit = true;
      return this;
    }

    @Override
    protected ReadSupport<T> getReadSupport() {
      return new ParquetReadSupport<>(schema, readSupport, callInit, nameMapping);
    }
  }

  /**
   * Combines several files into one
   *
   * @param inputFiles   an {@link Iterable} of Parquet files. The order of iteration determines the order in which
   *                     the content of the files is read and written to the {@code outputFile}
   * @param outputFile   the output parquet file containing all the data from {@code inputFiles}
   * @param rowGroupSize the row group size to use when writing the {@code outputFile}
   * @param schema       the schema of the data
   * @param metadata     extraMetadata to write at the footer of the {@code outputFile}
   */
  public static void concat(Iterable<File> inputFiles, File outputFile, int rowGroupSize, Schema schema,
                            Map metadata) throws IOException {
    OutputFile file = Files.localOutput(outputFile);
    ParquetFileWriter writer = new ParquetFileWriter(
            ParquetIO.file(file), ParquetSchemaUtil.convert(schema, "table"),
            ParquetFileWriter.Mode.CREATE, rowGroupSize, 0);
    writer.start();
    for (File inputFile : inputFiles) {
      writer.appendFile(ParquetIO.file(Files.localInput(inputFile)));
    }
    writer.end(metadata);
  }
}