/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.netease.arctic.spark.writer;

import com.netease.arctic.ams.api.BlockableOperation;
import com.netease.arctic.ams.api.OperationConflictException;
import com.netease.arctic.catalog.ArcticCatalog;
import com.netease.arctic.hive.utils.HiveTableUtil;
import com.netease.arctic.spark.io.TaskWriters;
import com.netease.arctic.table.UnkeyedTable;
import com.netease.arctic.table.blocker.Blocker;
import com.netease.arctic.table.blocker.TableBlockerManager;
import com.netease.arctic.shade.org.apache.iceberg.AppendFiles;
import com.netease.arctic.shade.org.apache.iceberg.DataFile;
import com.netease.arctic.shade.org.apache.iceberg.DeleteFile;
import com.netease.arctic.shade.org.apache.iceberg.OverwriteFiles;
import com.netease.arctic.shade.org.apache.iceberg.ReplacePartitions;
import com.netease.arctic.shade.org.apache.iceberg.RowDelta;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Expression;
import com.netease.arctic.shade.org.apache.iceberg.io.TaskWriter;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Lists;
import com.netease.arctic.shade.org.apache.iceberg.util.PropertyUtil;
import com.netease.arctic.shade.org.apache.iceberg.util.Tasks;
import com.netease.arctic.shade.org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.write.BatchWrite;
import org.apache.spark.sql.connector.write.DataWriter;
import org.apache.spark.sql.connector.write.DataWriterFactory;
import org.apache.spark.sql.connector.write.LogicalWriteInfo;
import org.apache.spark.sql.connector.write.PhysicalWriteInfo;
import org.apache.spark.sql.connector.write.WriterCommitMessage;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static com.netease.arctic.hive.op.UpdateHiveFiles.DELETE_UNTRACKED_HIVE_FILE;
import static com.netease.arctic.spark.writer.WriteTaskCommit.files;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS;
import static com.netease.arctic.shade.org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT;

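/**
 * Spark DataSource V2 {@link BatchWrite} implementations for unkeyed Arctic tables.
 * Each write mode (append, dynamic overwrite, overwrite by filter, upsert and merge-into)
 * acquires a table blocker before writing and releases it after commit or abort.
 */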
public class UnkeyedSparkBatchWrite implements ArcticSparkWriteBuilder.ArcticWrite {

  private final UnkeyedTable table;
  private final StructType dsSchema;
  private final String hiveSubdirectory = HiveTableUtil.newHiveSubdirectory();

  private final boolean orderedWriter;

  private final ArcticCatalog catalog;

  public UnkeyedSparkBatchWrite(UnkeyedTable table, LogicalWriteInfo info, ArcticCatalog catalog) {
    this.table = table;
    this.dsSchema = info.schema();
    this.orderedWriter = Boolean.parseBoolean(info.options().getOrDefault(
        "writer.distributed-and-ordered", "false"
    ));
    this.catalog = catalog;
  }

  @Override
  public BatchWrite asBatchAppend() {
    return new AppendWrite();
  }

  @Override
  public BatchWrite asDynamicOverwrite() {
    return new DynamicOverwrite();
  }

  @Override
  public BatchWrite asOverwriteByFilter(Expression overwriteExpr) {
    return new OverwriteByFilter(overwriteExpr);
  }

  @Override
  public BatchWrite asUpsertWrite() {
    return new UpsertWrite();
  }

  @Override
  public BatchWrite asMergeBatchWrite() {
    return new MergeIntoWrite();
  }

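  /**
   * Base class providing shared blocker handling and abort cleanup: on abort, the files
   * recorded in the commit messages are deleted with retries driven by the table's
   * commit retry properties.
   */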
  private abstract class BaseBatchWrite implements BatchWrite {

    protected TableBlockerManager tableBlockerManager;
    protected Blocker block;

    @Override
    public void abort(WriterCommitMessage[] messages) {
      try {
        Map<String, String> props = table.properties();
        Tasks.foreach(files(messages))
            .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT))
            .exponentialBackoff(
                PropertyUtil.propertyAsInt(props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT),
                PropertyUtil.propertyAsInt(props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT),
                PropertyUtil.propertyAsInt(props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT),
                2.0 /* exponential */)
            .throwFailureWhenFinished()
            .run(file -> {
              table.io().deleteFile(file.path().toString());
            });
      } finally {
        tableBlockerManager.release(block);
      }
    }

    public void checkBlocker(TableBlockerManager tableBlockerManager) {
      List<String> blockerIds = tableBlockerManager.getBlockers()
          .stream().map(Blocker::blockerId).collect(Collectors.toList());
      if (!blockerIds.contains(block.blockerId())) {
        throw new IllegalStateException("block is not in blockerManager");
      }
    }

    public void getBlocker() {
      this.tableBlockerManager = catalog.getTableBlockerManager(table.id());
      ArrayList<BlockableOperation> operations = Lists.newArrayList();
      operations.add(BlockableOperation.BATCH_WRITE);
      operations.add(BlockableOperation.OPTIMIZE);
      try {
        this.block = tableBlockerManager.block(operations);
      } catch (OperationConflictException e) {
        throw new IllegalStateException("failed to block table " + table.id() + " with " + operations, e);
      }
    }
  }

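  /** Appends the written data files to the table via {@link AppendFiles}. */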
  private class AppendWrite extends BaseBatchWrite {

    @Override
    public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
      getBlocker();
      return new WriterFactory(table, dsSchema, false, null, orderedWriter);
    }

    @Override
    public void commit(WriterCommitMessage[] messages) {
      checkBlocker(tableBlockerManager);
      AppendFiles appendFiles = table.newAppend();
      for (DataFile file : files(messages)) {
        appendFiles.appendFile(file);
      }
      appendFiles.commit();
      tableBlockerManager.release(block);
    }
  }

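  /** Replaces the partitions touched by the written data files via {@link ReplacePartitions}. */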
  private class DynamicOverwrite extends BaseBatchWrite {

    @Override
    public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
      getBlocker();
      return new WriterFactory(table, dsSchema, true, hiveSubdirectory, orderedWriter);
    }

    @Override
    public void commit(WriterCommitMessage[] messages) {
      checkBlocker(tableBlockerManager);
      ReplacePartitions replacePartitions = table.newReplacePartitions();
      for (DataFile file : files(messages)) {
        replacePartitions.addFile(file);
      }
      replacePartitions.commit();
      tableBlockerManager.release(block);
    }
  }

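  /** Overwrites the rows matching the given filter expression via {@link OverwriteFiles}. */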
  private class OverwriteByFilter extends BaseBatchWrite {
    private final Expression overwriteExpr;

    private OverwriteByFilter(Expression overwriteExpr) {
      this.overwriteExpr = overwriteExpr;
    }

    @Override
    public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
      getBlocker();
      return new WriterFactory(table, dsSchema, true, hiveSubdirectory, orderedWriter);
    }

    @Override
    public void commit(WriterCommitMessage[] messages) {
      checkBlocker(tableBlockerManager);
      OverwriteFiles overwriteFiles = table.newOverwrite();
      overwriteFiles.overwriteByRowFilter(overwriteExpr);
      overwriteFiles.set(DELETE_UNTRACKED_HIVE_FILE, "true");
      for (DataFile file : files(messages)) {
        overwriteFiles.addFile(file);
      }
      overwriteFiles.commit();
      tableBlockerManager.release(block);
    }
  }

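  /** Commits upsert results as a {@link RowDelta} of delete files and data files. */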
  private class UpsertWrite extends BaseBatchWrite {
    @Override
    public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
      getBlocker();
      return new DeltaUpsertWriteFactory(table, dsSchema, orderedWriter);
    }

    @Override
    public void commit(WriterCommitMessage[] messages) {
      checkBlocker(tableBlockerManager);
      RowDelta rowDelta = table.newRowDelta();
      if (WriteTaskCommit.deleteFiles(messages).iterator().hasNext()) {
        for (DeleteFile file : WriteTaskCommit.deleteFiles(messages)) {
          rowDelta.addDeletes(file);
        }
      }
      if (WriteTaskCommit.files(messages).iterator().hasNext()) {
        for (DataFile file : WriteTaskCommit.files(messages)) {
          rowDelta.addRows(file);
        }
      }
      rowDelta.commit();
      tableBlockerManager.release(block);
    }
  }

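  /** Serializable factory that creates base data writers on the executors. */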
  private static class WriterFactory implements DataWriterFactory, Serializable {
    protected final UnkeyedTable table;
    protected final StructType dsSchema;

    protected final String hiveSubdirectory;

    protected final boolean isOverwrite;
    protected final boolean orderedWriter;

    WriterFactory(
        UnkeyedTable table,
        StructType dsSchema,
        boolean isOverwrite,
        String hiveSubdirectory,
        boolean orderedWrite) {
      this.table = table;
      this.dsSchema = dsSchema;
      this.isOverwrite = isOverwrite;
      this.hiveSubdirectory = hiveSubdirectory;
      this.orderedWriter = orderedWrite;
    }

    @Override
    public DataWriter<InternalRow> createWriter(int partitionId, long taskId) {
      TaskWriters builder = TaskWriters.of(table)
          .withPartitionId(partitionId)
          .withTaskId(taskId)
          .withOrderedWriter(orderedWriter)
          .withDataSourceSchema(dsSchema)
          .withHiveSubdirectory(hiveSubdirectory);

      TaskWriter<InternalRow> writer = builder.newBaseWriter(this.isOverwrite);
      return new SimpleInternalRowDataWriter(writer);
    }
  }

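  /**
   * Factory for upsert writes; filters out the metadata columns ({@code _file}, {@code _pos},
   * {@code _arctic_upsert_op}) before creating an unkeyed upsert writer.
   */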
  private static class DeltaUpsertWriteFactory extends WriterFactory {

    DeltaUpsertWriteFactory(UnkeyedTable table, StructType dsSchema, boolean orderedWriter) {
      super(table, dsSchema, false, null, orderedWriter);
    }

    @Override
    public DataWriter<InternalRow> createWriter(int partitionId, long taskId) {
      StructType schema = new StructType(Arrays.stream(dsSchema.fields()).filter(f -> !f.name().equals("_file") &&
          !f.name().equals("_pos") && !f.name().equals("_arctic_upsert_op")).toArray(StructField[]::new));
      TaskWriter<InternalRow> internalRowUnkeyedUpsertSparkWriter = TaskWriters.of(table)
          .withPartitionId(partitionId)
          .withTaskId(taskId)
          .withDataSourceSchema(schema)
          .newUnkeyedUpsertWriter();

      return new SimpleUnkeyedUpsertDataWriter(internalRowUnkeyedUpsertSparkWriter, dsSchema);
    }
  }

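  /**
   * Factory for merge-into writes; filters out the metadata columns and wraps the task
   * writer in a row-level merge writer.
   */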
  private static class MergeWriteFactory extends WriterFactory {

    MergeWriteFactory(UnkeyedTable table, StructType dsSchema, boolean orderedWrite) {
      super(table, dsSchema, false, null, orderedWrite);
    }

    @Override
    public RowLevelWriter<InternalRow> createWriter(int partitionId, long taskId) {
      StructType schema = new StructType(Arrays.stream(dsSchema.fields()).filter(f -> !f.name().equals("_file") &&
          !f.name().equals("_pos") && !f.name().equals("_arctic_upsert_op")).toArray(StructField[]::new));
      TaskWriter<InternalRow> writer = TaskWriters.of(table)
          .withPartitionId(partitionId)
          .withTaskId(taskId)
          .withDataSourceSchema(schema)
          .newUnkeyedUpsertWriter();
      return new SimpleMergeRowDataWriter(writer, dsSchema, table.isKeyedTable());
    }
  }

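  /** Commits merge-into results as a {@link RowDelta} of delete files and data files. */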
  private class MergeIntoWrite extends BaseBatchWrite {
    @Override
    public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
      getBlocker();
      return new MergeWriteFactory(table, dsSchema, orderedWriter);
    }

    @Override
    public void commit(WriterCommitMessage[] messages) {
      checkBlocker(tableBlockerManager);
      RowDelta rowDelta = table.newRowDelta();
      if (WriteTaskCommit.deleteFiles(messages).iterator().hasNext()) {
        for (DeleteFile file : WriteTaskCommit.deleteFiles(messages)) {
          rowDelta.addDeletes(file);
        }
      }
      if (WriteTaskCommit.files(messages).iterator().hasNext()) {
        for (DataFile file : WriteTaskCommit.files(messages)) {
          rowDelta.addRows(file);
        }
      }
      rowDelta.commit();
      tableBlockerManager.release(block);
    }
  }
}