
org.apache.iceberg.spark.source.SparkPositionDeletesRewrite

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark.source;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.PositionDeletesTable;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.deletes.DeleteGranularity;
import org.apache.iceberg.deletes.PositionDelete;
import org.apache.iceberg.io.ClusteredPositionDeleteWriter;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.OutputFileFactory;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.PositionDeletesRewriteCoordinator;
import org.apache.iceberg.spark.ScanTaskSetManager;
import org.apache.iceberg.spark.SparkWriteConf;
import org.apache.iceberg.util.DeleteFileSet;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.write.BatchWrite;
import org.apache.spark.sql.connector.write.DataWriter;
import org.apache.spark.sql.connector.write.DataWriterFactory;
import org.apache.spark.sql.connector.write.LogicalWriteInfo;
import org.apache.spark.sql.connector.write.PhysicalWriteInfo;
import org.apache.spark.sql.connector.write.Write;
import org.apache.spark.sql.connector.write.WriterCommitMessage;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * {@link Write} class for rewriting position delete files from Spark. Responsible for creating
 * {@link SparkPositionDeletesRewrite.PositionDeleteBatchWrite}
 *
 * <p>This class is meant to be used for an action to rewrite position delete files. Hence, it
 * assumes all position deletes to rewrite have come from {@link ScanTaskSetManager} and that all
 * have the same partition spec id and partition values.
 */
public class SparkPositionDeletesRewrite implements Write {

  private final JavaSparkContext sparkContext;
  private final Table table;
  private final String queryId;
  private final FileFormat format;
  private final long targetFileSize;
  private final DeleteGranularity deleteGranularity;
  private final Schema writeSchema;
  private final StructType dsSchema;
  private final String fileSetId;
  private final int specId;
  private final StructLike partition;
  private final Map<String, String> writeProperties;

  /**
   * Constructs a {@link SparkPositionDeletesRewrite}.
   *
   * @param spark Spark session
   * @param table instance of {@link PositionDeletesTable}
   * @param writeConf Spark write config
   * @param writeInfo Spark write info
   * @param writeSchema Iceberg output schema
   * @param dsSchema schema of original incoming position deletes dataset
   * @param specId spec id of position deletes
   * @param partition partition value of position deletes
   */
  SparkPositionDeletesRewrite(
      SparkSession spark,
      Table table,
      SparkWriteConf writeConf,
      LogicalWriteInfo writeInfo,
      Schema writeSchema,
      StructType dsSchema,
      int specId,
      StructLike partition) {
    this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
    this.table = table;
    this.queryId = writeInfo.queryId();
    this.format = writeConf.deleteFileFormat();
    this.targetFileSize = writeConf.targetDeleteFileSize();
    this.deleteGranularity = writeConf.deleteGranularity();
    this.writeSchema = writeSchema;
    this.dsSchema = dsSchema;
    this.fileSetId = writeConf.rewrittenFileSetId();
    this.specId = specId;
    this.partition = partition;
    this.writeProperties = writeConf.writeProperties();
  }

  @Override
  public BatchWrite toBatch() {
    return new PositionDeleteBatchWrite();
  }
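
  // Note (added for readability): this Write is created for the position_deletes metadata table
  // when the Spark write config carries a rewritten file-set ID (the constructor reads
  // writeConf.rewrittenFileSetId()). In practice it is driven by a rewrite action, typically
  // RewritePositionDeleteFilesSparkAction, which registers the position delete tasks to rewrite
  // with ScanTaskSetManager under that same ID; the action class is not referenced from this
  // file, so naming it here is an inference from usage, not part of this source.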

  /** {@link BatchWrite} class for rewriting position deletes files from Spark */
  class PositionDeleteBatchWrite implements BatchWrite {

    @Override
    public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
      // broadcast the table metadata as the writer factory will be sent to executors
      Broadcast<Table> tableBroadcast =
          sparkContext.broadcast(SerializableTableWithSize.copyOf(table));
      return new PositionDeletesWriterFactory(
          tableBroadcast,
          queryId,
          format,
          targetFileSize,
          deleteGranularity,
          writeSchema,
          dsSchema,
          specId,
          partition,
          writeProperties);
    }

    @Override
    public boolean useCommitCoordinator() {
      return false;
    }

    @Override
    public void commit(WriterCommitMessage[] messages) {
      PositionDeletesRewriteCoordinator coordinator = PositionDeletesRewriteCoordinator.get();
      coordinator.stageRewrite(table, fileSetId, DeleteFileSet.of(files(messages)));
    }

    @Override
    public void abort(WriterCommitMessage[] messages) {
      SparkCleanupUtil.deleteFiles("job abort", table.io(), files(messages));
    }

    private List<DeleteFile> files(WriterCommitMessage[] messages) {
      List<DeleteFile> files = Lists.newArrayList();

      for (WriterCommitMessage message : messages) {
        if (message != null) {
          DeleteTaskCommit taskCommit = (DeleteTaskCommit) message;
          files.addAll(Arrays.asList(taskCommit.files()));
        }
      }

      return files;
    }
  }
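
  // Note (added for readability): commit() above does not create a table snapshot. The rewritten
  // delete files are only staged with PositionDeletesRewriteCoordinator under the file-set ID;
  // the driver-side rewrite action later retrieves the staged files and performs the actual
  // metadata commit against the base table. Roughly (the retrieval method name below is an
  // assumption for illustration, not something used in this file):
  //
  //   PositionDeletesRewriteCoordinator coordinator = PositionDeletesRewriteCoordinator.get();
  //   Set<DeleteFile> rewritten = coordinator.fetchNewFiles(table, fileSetId);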

  /**
   * Writer factory for position deletes metadata table. Responsible for creating {@link
   * DeleteWriter}.
   *
   * <p>This writer is meant to be used for an action to rewrite delete files. Hence, it makes an
   * assumption that all incoming deletes belong to the same partition, and that incoming dataset
   * is from {@link ScanTaskSetManager}.
   */
  static class PositionDeletesWriterFactory implements DataWriterFactory {
    private final Broadcast<Table> tableBroadcast;
    private final String queryId;
    private final FileFormat format;
    private final Long targetFileSize;
    private final DeleteGranularity deleteGranularity;
    private final Schema writeSchema;
    private final StructType dsSchema;
    private final int specId;
    private final StructLike partition;
    private final Map<String, String> writeProperties;
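
    // Note (added for readability): instances of this factory are serialized and sent to
    // executors, so the table is carried through the broadcast above (SerializableTableWithSize)
    // and re-materialized with tableBroadcast.value() in createWriter() below rather than being
    // captured directly.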
    PositionDeletesWriterFactory(
        Broadcast<Table> tableBroadcast,
        String queryId,
        FileFormat format,
        long targetFileSize,
        DeleteGranularity deleteGranularity,
        Schema writeSchema,
        StructType dsSchema,
        int specId,
        StructLike partition,
        Map<String, String> writeProperties) {
      this.tableBroadcast = tableBroadcast;
      this.queryId = queryId;
      this.format = format;
      this.targetFileSize = targetFileSize;
      this.deleteGranularity = deleteGranularity;
      this.writeSchema = writeSchema;
      this.dsSchema = dsSchema;
      this.specId = specId;
      this.partition = partition;
      this.writeProperties = writeProperties;
    }

    @Override
    public DataWriter<InternalRow> createWriter(int partitionId, long taskId) {
      Table table = tableBroadcast.value();

      OutputFileFactory deleteFileFactory =
          OutputFileFactory.builderFor(table, partitionId, taskId)
              .format(format)
              .operationId(queryId)
              .suffix("deletes")
              .build();

      Schema positionDeleteRowSchema = positionDeleteRowSchema();
      StructType deleteSparkType = deleteSparkType();
      StructType deleteSparkTypeWithoutRow = deleteSparkTypeWithoutRow();

      SparkFileWriterFactory writerFactoryWithRow =
          SparkFileWriterFactory.builderFor(table)
              .deleteFileFormat(format)
              .positionDeleteRowSchema(positionDeleteRowSchema)
              .positionDeleteSparkType(deleteSparkType)
              .writeProperties(writeProperties)
              .build();
      SparkFileWriterFactory writerFactoryWithoutRow =
          SparkFileWriterFactory.builderFor(table)
              .deleteFileFormat(format)
              .positionDeleteSparkType(deleteSparkTypeWithoutRow)
              .writeProperties(writeProperties)
              .build();

      return new DeleteWriter(
          table,
          writerFactoryWithRow,
          writerFactoryWithoutRow,
          deleteFileFactory,
          targetFileSize,
          deleteGranularity,
          dsSchema,
          specId,
          partition);
    }

    private Schema positionDeleteRowSchema() {
      return new Schema(
          writeSchema
              .findField(MetadataColumns.DELETE_FILE_ROW_FIELD_NAME)
              .type()
              .asStructType()
              .fields());
    }

    private StructType deleteSparkType() {
      return new StructType(
          new StructField[] {
            dsSchema.apply(MetadataColumns.DELETE_FILE_PATH.name()),
            dsSchema.apply(MetadataColumns.DELETE_FILE_POS.name()),
            dsSchema.apply(MetadataColumns.DELETE_FILE_ROW_FIELD_NAME)
          });
    }

    private StructType deleteSparkTypeWithoutRow() {
      return new StructType(
          new StructField[] {
            dsSchema.apply(MetadataColumns.DELETE_FILE_PATH.name()),
            dsSchema.apply(MetadataColumns.DELETE_FILE_POS.name()),
          });
    }
  }
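
  // Note (added for readability): createWriter() above builds two SparkFileWriterFactory
  // instances because Iceberg position delete files either carry a required 'row' column or omit
  // it entirely. The expected input columns are resolved by the metadata column names
  // (delete file path, position, and the optional deleted-row struct); DeleteWriter below routes
  // each incoming record to the matching factory based on whether its 'row' struct is null.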

  /**
   * Writer for position deletes metadata table.
   *
   * <p>Iceberg specifies delete files schema as having either 'row' as a required field, or omits
   * 'row' altogether. This is to ensure accuracy of delete file statistics on 'row' column. Hence,
   * this writer, if receiving source position deletes with null and non-null rows, redirects rows
   * with null 'row' to one file writer, and non-null 'row' to another file writer.
   *
   * <p>This writer is meant to be used for an action to rewrite delete files. Hence, it makes an
   * assumption that all incoming deletes belong to the same partition.
   */
  private static class DeleteWriter implements DataWriter<InternalRow> {
    private final SparkFileWriterFactory writerFactoryWithRow;
    private final SparkFileWriterFactory writerFactoryWithoutRow;
    private final OutputFileFactory deleteFileFactory;
    private final long targetFileSize;
    private final DeleteGranularity deleteGranularity;
    private final PositionDelete<InternalRow> positionDelete;
    private final FileIO io;
    private final PartitionSpec spec;
    private final int fileOrdinal;
    private final int positionOrdinal;
    private final int rowOrdinal;
    private final int rowSize;
    private final StructLike partition;

    private ClusteredPositionDeleteWriter<InternalRow> writerWithRow;
    private ClusteredPositionDeleteWriter<InternalRow> writerWithoutRow;
    private boolean closed = false;

    /**
     * Constructs a {@link DeleteWriter}.
     *
     * @param table position deletes metadata table
     * @param writerFactoryWithRow writer factory for deletes with non-null 'row'
     * @param writerFactoryWithoutRow writer factory for deletes with null 'row'
     * @param deleteFileFactory delete file factory
     * @param targetFileSize target file size
     * @param deleteGranularity granularity of the produced delete files
     * @param dsSchema schema of incoming dataset of position deletes
     * @param specId partition spec id of incoming position deletes. All incoming partition deletes
     *     are required to have the same spec id.
     * @param partition partition value of incoming position delete. All incoming partition deletes
     *     are required to have the same partition.
     */
    DeleteWriter(
        Table table,
        SparkFileWriterFactory writerFactoryWithRow,
        SparkFileWriterFactory writerFactoryWithoutRow,
        OutputFileFactory deleteFileFactory,
        long targetFileSize,
        DeleteGranularity deleteGranularity,
        StructType dsSchema,
        int specId,
        StructLike partition) {
      this.deleteFileFactory = deleteFileFactory;
      this.targetFileSize = targetFileSize;
      this.deleteGranularity = deleteGranularity;
      this.writerFactoryWithRow = writerFactoryWithRow;
      this.writerFactoryWithoutRow = writerFactoryWithoutRow;
      this.positionDelete = PositionDelete.create();
      this.io = table.io();
      this.spec = table.specs().get(specId);
      this.partition = partition;

      this.fileOrdinal = dsSchema.fieldIndex(MetadataColumns.DELETE_FILE_PATH.name());
      this.positionOrdinal = dsSchema.fieldIndex(MetadataColumns.DELETE_FILE_POS.name());

      this.rowOrdinal = dsSchema.fieldIndex(MetadataColumns.DELETE_FILE_ROW_FIELD_NAME);
      DataType type = dsSchema.apply(MetadataColumns.DELETE_FILE_ROW_FIELD_NAME).dataType();
      Preconditions.checkArgument(
          type instanceof StructType, "Expected row as struct type but was %s", type);
      this.rowSize = ((StructType) type).size();
    }

    @Override
    public void write(InternalRow record) throws IOException {
      String file = record.getString(fileOrdinal);
      long position = record.getLong(positionOrdinal);
      InternalRow row = record.getStruct(rowOrdinal, rowSize);
      if (row != null) {
        positionDelete.set(file, position, row);
        lazyWriterWithRow().write(positionDelete, spec, partition);
      } else {
        positionDelete.set(file, position, null);
        lazyWriterWithoutRow().write(positionDelete, spec, partition);
      }
    }

    @Override
    public WriterCommitMessage commit() throws IOException {
      close();
      return new DeleteTaskCommit(allDeleteFiles());
    }

    @Override
    public void abort() throws IOException {
      close();
      SparkCleanupUtil.deleteTaskFiles(io, allDeleteFiles());
    }

    @Override
    public void close() throws IOException {
      if (!closed) {
        if (writerWithRow != null) {
          writerWithRow.close();
        }
        if (writerWithoutRow != null) {
          writerWithoutRow.close();
        }
        this.closed = true;
      }
    }

    private ClusteredPositionDeleteWriter<InternalRow> lazyWriterWithRow() {
      if (writerWithRow == null) {
        this.writerWithRow =
            new ClusteredPositionDeleteWriter<>(
                writerFactoryWithRow, deleteFileFactory, io, targetFileSize, deleteGranularity);
      }
      return writerWithRow;
    }

    private ClusteredPositionDeleteWriter<InternalRow> lazyWriterWithoutRow() {
      if (writerWithoutRow == null) {
        this.writerWithoutRow =
            new ClusteredPositionDeleteWriter<>(
                writerFactoryWithoutRow, deleteFileFactory, io, targetFileSize, deleteGranularity);
      }
      return writerWithoutRow;
    }

    private List<DeleteFile> allDeleteFiles() {
      List<DeleteFile> allDeleteFiles = Lists.newArrayList();
      if (writerWithRow != null) {
        allDeleteFiles.addAll(writerWithRow.result().deleteFiles());
      }
      if (writerWithoutRow != null) {
        allDeleteFiles.addAll(writerWithoutRow.result().deleteFiles());
      }
      return allDeleteFiles;
    }
  }

  public static class DeleteTaskCommit implements WriterCommitMessage {
    private final DeleteFile[] taskFiles;

    DeleteTaskCommit(List<DeleteFile> deleteFiles) {
      this.taskFiles = deleteFiles.toArray(new DeleteFile[0]);
    }

    DeleteFile[] files() {
      return taskFiles;
    }
  }
}
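
// Overall flow (summary added for readability; the orchestration lives outside this file): a
// rewrite action registers the position delete tasks to rewrite with ScanTaskSetManager, runs a
// Spark write of the corresponding position deletes back through this Write under the same
// file-set ID, and then commits the files staged by PositionDeleteBatchWrite. Illustrative only:
//
//   SparkPositionDeletesRewrite write = ...;        // built by the corresponding write builder
//   BatchWrite batchWrite = write.toBatch();        // driver side
//   DataWriterFactory factory = batchWrite.createBatchWriterFactory(info);
//   // executors: factory.createWriter(partitionId, taskId) -> DeleteWriter.write(...) / commit()
//   // driver: batchWrite.commit(messages) -> coordinator.stageRewrite(table, fileSetId, files)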