org.apache.iceberg.spark.source.SparkAppenderFactory Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-spark3 Show documentation
A table format for huge analytic datasets
There is a newer version: 0.13.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.spark.source;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Map;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.deletes.PositionDeleteWriter;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.DeleteSchemaUtil;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.FileAppenderFactory;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.data.SparkAvroWriter;
import org.apache.iceberg.spark.data.SparkOrcWriter;
import org.apache.iceberg.spark.data.SparkParquetWriters;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

class SparkAppenderFactory implements FileAppenderFactory {
  private final Map properties;
  private final Schema writeSchema;
  private final StructType dsSchema;
  private final PartitionSpec spec;
  private final int[] equalityFieldIds;
  private final Schema eqDeleteRowSchema;
  private final Schema posDeleteRowSchema;

  private StructType eqDeleteSparkType = null;
  private StructType posDeleteSparkType = null;

  SparkAppenderFactory(Map properties, Schema writeSchema, StructType dsSchema, PartitionSpec spec,
                       int[] equalityFieldIds, Schema eqDeleteRowSchema, Schema posDeleteRowSchema) {
    this.properties = properties;
    this.writeSchema = writeSchema;
    this.dsSchema = dsSchema;
    this.spec = spec;
    this.equalityFieldIds = equalityFieldIds;
    this.eqDeleteRowSchema = eqDeleteRowSchema;
    this.posDeleteRowSchema = posDeleteRowSchema;
  }

  static Builder builderFor(Table table, Schema writeSchema, StructType dsSchema) {
    return new Builder(table, writeSchema, dsSchema);
  }

  static class Builder {
    private final Table table;
    private final Schema writeSchema;
    private final StructType dsSchema;
    private PartitionSpec spec;
    private int[] equalityFieldIds;
    private Schema eqDeleteRowSchema;
    private Schema posDeleteRowSchema;


    Builder(Table table, Schema writeSchema, StructType dsSchema) {
      this.table = table;
      this.spec = table.spec();
      this.writeSchema = writeSchema;
      this.dsSchema = dsSchema;
    }

    Builder spec(PartitionSpec newSpec) {
      this.spec = newSpec;
      return this;
    }

    Builder equalityFieldIds(int[] newEqualityFieldIds) {
      this.equalityFieldIds = newEqualityFieldIds;
      return this;
    }

    Builder eqDeleteRowSchema(Schema newEqDeleteRowSchema) {
      this.eqDeleteRowSchema = newEqDeleteRowSchema;
      return this;
    }

    Builder posDelRowSchema(Schema newPosDelRowSchema) {
      this.posDeleteRowSchema = newPosDelRowSchema;
      return this;
    }

    SparkAppenderFactory build() {
      Preconditions.checkNotNull(table, "Table must not be null");
      Preconditions.checkNotNull(writeSchema, "Write Schema must not be null");
      Preconditions.checkNotNull(dsSchema, "DS Schema must not be null");
      if (equalityFieldIds != null) {
        Preconditions.checkNotNull(eqDeleteRowSchema, "Equality Field Ids and Equality Delete Row Schema" +
            " must be set together");
      }
      if (eqDeleteRowSchema != null) {
        Preconditions.checkNotNull(equalityFieldIds, "Equality Field Ids and Equality Delete Row Schema" +
            " must be set together");
      }

      return new SparkAppenderFactory(table.properties(), writeSchema, dsSchema, spec, equalityFieldIds,
          eqDeleteRowSchema, posDeleteRowSchema);
    }
  }

  private StructType lazyEqDeleteSparkType() {
    if (eqDeleteSparkType == null) {
      Preconditions.checkNotNull(eqDeleteRowSchema, "Equality delete row schema shouldn't be null");
      this.eqDeleteSparkType = SparkSchemaUtil.convert(eqDeleteRowSchema);
    }
    return eqDeleteSparkType;
  }

  private StructType lazyPosDeleteSparkType() {
    if (posDeleteSparkType == null) {
      Preconditions.checkNotNull(posDeleteRowSchema, "Position delete row schema shouldn't be null");
      this.posDeleteSparkType = SparkSchemaUtil.convert(posDeleteRowSchema);
    }
    return posDeleteSparkType;
  }

  @Override
  public FileAppender newAppender(OutputFile file, FileFormat fileFormat) {
    MetricsConfig metricsConfig = MetricsConfig.fromProperties(properties);
    try {
      switch (fileFormat) {
        case PARQUET:
          return Parquet.write(file)
              .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(dsSchema, msgType))
              .setAll(properties)
              .metricsConfig(metricsConfig)
              .schema(writeSchema)
              .overwrite()
              .build();

        case AVRO:
          return Avro.write(file)
              .createWriterFunc(ignored -> new SparkAvroWriter(dsSchema))
              .setAll(properties)
              .schema(writeSchema)
              .overwrite()
              .build();

        case ORC:
          return ORC.write(file)
              .createWriterFunc(SparkOrcWriter::new)
              .setAll(properties)
              .metricsConfig(metricsConfig)
              .schema(writeSchema)
              .overwrite()
              .build();

        default:
          throw new UnsupportedOperationException("Cannot write unknown format: " + fileFormat);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  @Override
  public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
    return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format,
        file.encryptingOutputFile().location(), spec, partition, file.keyMetadata());
  }

  @Override
  public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, FileFormat format,
                                                             StructLike partition) {
    Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0,
        "Equality field ids shouldn't be null or empty when creating equality-delete writer");
    Preconditions.checkNotNull(eqDeleteRowSchema,
        "Equality delete row schema shouldn't be null when creating equality-delete writer");

    try {
      switch (format) {
        case PARQUET:
          return Parquet.writeDeletes(file.encryptingOutputFile())
              .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType))
              .overwrite()
              .rowSchema(eqDeleteRowSchema)
              .withSpec(spec)
              .withPartition(partition)
              .equalityFieldIds(equalityFieldIds)
              .withKeyMetadata(file.keyMetadata())
              .buildEqualityWriter();

        case AVRO:
          return Avro.writeDeletes(file.encryptingOutputFile())
              .createWriterFunc(ignored -> new SparkAvroWriter(lazyEqDeleteSparkType()))
              .overwrite()
              .rowSchema(eqDeleteRowSchema)
              .withSpec(spec)
              .withPartition(partition)
              .equalityFieldIds(equalityFieldIds)
              .withKeyMetadata(file.keyMetadata())
              .buildEqualityWriter();

        case ORC:
          return ORC.writeDeletes(file.encryptingOutputFile())
              .createWriterFunc(SparkOrcWriter::new)
              .overwrite()
              .rowSchema(eqDeleteRowSchema)
              .withSpec(spec)
              .withPartition(partition)
              .equalityFieldIds(equalityFieldIds)
              .withKeyMetadata(file.keyMetadata())
              .buildEqualityWriter();

        default:
          throw new UnsupportedOperationException(
              "Cannot write equality-deletes for unsupported file format: " + format);
      }
    } catch (IOException e) {
      throw new UncheckedIOException("Failed to create new equality delete writer", e);
    }
  }

  @Override
  public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, FileFormat format,
                                                              StructLike partition) {
    try {
      switch (format) {
        case PARQUET:
          StructType sparkPosDeleteSchema =
              SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema));
          return Parquet.writeDeletes(file.encryptingOutputFile())
              .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType))
              .overwrite()
              .rowSchema(posDeleteRowSchema)
              .withSpec(spec)
              .withPartition(partition)
              .withKeyMetadata(file.keyMetadata())
              .transformPaths(path -> UTF8String.fromString(path.toString()))
              .buildPositionWriter();

        case AVRO:
          return Avro.writeDeletes(file.encryptingOutputFile())
              .createWriterFunc(ignored -> new SparkAvroWriter(lazyPosDeleteSparkType()))
              .overwrite()
              .rowSchema(posDeleteRowSchema)
              .withSpec(spec)
              .withPartition(partition)
              .withKeyMetadata(file.keyMetadata())
              .buildPositionWriter();

        case ORC:
          return ORC.writeDeletes(file.encryptingOutputFile())
              .createWriterFunc(SparkOrcWriter::new)
              .overwrite()
              .rowSchema(posDeleteRowSchema)
              .withSpec(spec)
              .withPartition(partition)
              .withKeyMetadata(file.keyMetadata())
              .transformPaths(path -> UTF8String.fromString(path.toString()))
              .buildPositionWriter();

        default:
          throw new UnsupportedOperationException(
              "Cannot write pos-deletes for unsupported file format: " + format);
      }

    } catch (IOException e) {
      throw new UncheckedIOException("Failed to create new equality delete writer", e);
    }
  }
}