org.apache.iceberg.spark.actions.RewriteDataFilesSparkAction
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.actions;
import java.io.IOException;
import java.math.RoundingMode;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.RewriteJobOrder;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.FileRewriter;
import org.apache.iceberg.actions.ImmutableRewriteDataFiles;
import org.apache.iceberg.actions.ImmutableRewriteDataFiles.Result.Builder;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.actions.RewriteDataFilesCommitManager;
import org.apache.iceberg.actions.RewriteFileGroup;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.exceptions.CommitFailedException;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Queues;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.relocated.com.google.common.math.IntMath;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.StructLikeMap;
import org.apache.iceberg.util.Tasks;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.internal.SQLConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
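/*
 * Usage sketch (illustrative, not part of this file): the action is normally obtained through
 * the SparkActions entry point rather than constructed directly. The catalog and table setup
 * below are assumed; the fluent methods and the option key come from the public
 * RewriteDataFiles API.
 *
 *   Table table = catalog.loadTable(TableIdentifier.of("db", "events")); // assumed setup
 *   RewriteDataFiles.Result result =
 *       SparkActions.get(spark)
 *           .rewriteDataFiles(table)
 *           .filter(Expressions.equal("day", "2024-01-01"))
 *           .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, String.valueOf(512L * 1024 * 1024))
 *           .binPack()
 *           .execute();
 */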
public class RewriteDataFilesSparkAction
    extends BaseSnapshotUpdateSparkAction<RewriteDataFilesSparkAction> implements RewriteDataFiles {
private static final Logger LOG = LoggerFactory.getLogger(RewriteDataFilesSparkAction.class);
  private static final Set<String> VALID_OPTIONS =
ImmutableSet.of(
MAX_CONCURRENT_FILE_GROUP_REWRITES,
MAX_FILE_GROUP_SIZE_BYTES,
PARTIAL_PROGRESS_ENABLED,
PARTIAL_PROGRESS_MAX_COMMITS,
PARTIAL_PROGRESS_MAX_FAILED_COMMITS,
TARGET_FILE_SIZE_BYTES,
USE_STARTING_SEQUENCE_NUMBER,
REWRITE_JOB_ORDER,
OUTPUT_SPEC_ID,
REMOVE_DANGLING_DELETES);
private static final RewriteDataFilesSparkAction.Result EMPTY_RESULT =
ImmutableRewriteDataFiles.Result.builder().rewriteResults(ImmutableList.of()).build();
private final Table table;
private Expression filter = Expressions.alwaysTrue();
private int maxConcurrentFileGroupRewrites;
private int maxCommits;
private int maxFailedCommits;
private boolean partialProgressEnabled;
private boolean removeDanglingDeletes;
private boolean useStartingSequenceNumber;
private RewriteJobOrder rewriteJobOrder;
  private FileRewriter<FileScanTask, DataFile> rewriter = null;
RewriteDataFilesSparkAction(SparkSession spark, Table table) {
super(spark.cloneSession());
// Disable Adaptive Query Execution as this may change the output partitioning of our write
spark().conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false);
this.table = table;
}
@Override
protected RewriteDataFilesSparkAction self() {
return this;
}
@Override
public RewriteDataFilesSparkAction binPack() {
Preconditions.checkArgument(
rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)");
this.rewriter = new SparkBinPackDataRewriter(spark(), table);
return this;
}
@Override
public RewriteDataFilesSparkAction sort(SortOrder sortOrder) {
Preconditions.checkArgument(
rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)");
this.rewriter = new SparkSortDataRewriter(spark(), table, sortOrder);
return this;
}
@Override
public RewriteDataFilesSparkAction sort() {
Preconditions.checkArgument(
rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)");
this.rewriter = new SparkSortDataRewriter(spark(), table);
return this;
}
@Override
public RewriteDataFilesSparkAction zOrder(String... columnNames) {
Preconditions.checkArgument(
rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)");
this.rewriter = new SparkZOrderDataRewriter(spark(), table, Arrays.asList(columnNames));
return this;
}
@Override
public RewriteDataFilesSparkAction filter(Expression expression) {
filter = Expressions.and(filter, expression);
return this;
}
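  // Successive filter(...) calls are AND-ed together via Expressions.and. For example
  // (illustrative), these two calls:
  //   action.filter(Expressions.greaterThanOrEqual("ts", "2024-01-01"))
  //         .filter(Expressions.lessThan("ts", "2024-02-01"));
  // restrict the rewrite to files that may contain rows with ts in [2024-01-01, 2024-02-01).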
@Override
public RewriteDataFiles.Result execute() {
if (table.currentSnapshot() == null) {
return EMPTY_RESULT;
}
long startingSnapshotId = table.currentSnapshot().snapshotId();
// Default to BinPack if no strategy selected
if (this.rewriter == null) {
this.rewriter = new SparkBinPackDataRewriter(spark(), table);
}
validateAndInitOptions();
    StructLikeMap<List<List<FileScanTask>>> fileGroupsByPartition =
planFileGroups(startingSnapshotId);
RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition);
if (ctx.totalGroupCount() == 0) {
LOG.info("Nothing found to rewrite in {}", table.name());
return EMPTY_RESULT;
}
    Stream<RewriteFileGroup> groupStream = toGroupStream(ctx, fileGroupsByPartition);
Builder resultBuilder =
partialProgressEnabled
? doExecuteWithPartialProgress(ctx, groupStream, commitManager(startingSnapshotId))
: doExecute(ctx, groupStream, commitManager(startingSnapshotId));
if (removeDanglingDeletes) {
RemoveDanglingDeletesSparkAction action =
new RemoveDanglingDeletesSparkAction(spark(), table);
int removedCount = Iterables.size(action.execute().removedDeleteFiles());
resultBuilder.removedDeleteFilesCount(removedCount);
}
return resultBuilder.build();
}
  StructLikeMap<List<List<FileScanTask>>> planFileGroups(long startingSnapshotId) {
    CloseableIterable<FileScanTask> fileScanTasks =
table
.newScan()
.useSnapshot(startingSnapshotId)
.filter(filter)
.ignoreResiduals()
.planFiles();
try {
StructType partitionType = table.spec().partitionType();
      StructLikeMap<List<FileScanTask>> filesByPartition =
groupByPartition(partitionType, fileScanTasks);
return fileGroupsByPartition(filesByPartition);
} finally {
try {
fileScanTasks.close();
} catch (IOException io) {
LOG.error("Cannot properly close file iterable while planning for rewrite", io);
}
}
}
  private StructLikeMap<List<FileScanTask>> groupByPartition(
      StructType partitionType, Iterable<FileScanTask> tasks) {
    StructLikeMap<List<FileScanTask>> filesByPartition = StructLikeMap.create(partitionType);
StructLike emptyStruct = GenericRecord.create(partitionType);
for (FileScanTask task : tasks) {
      // If a task uses an incompatible partition spec, the data inside could contain values
      // which belong to multiple partitions in the current spec. Treating all such files as
      // un-partitioned and grouping them together helps to minimize the number of new files produced.
StructLike taskPartition =
task.file().specId() == table.spec().specId() ? task.file().partition() : emptyStruct;
      List<FileScanTask> files = filesByPartition.get(taskPartition);
if (files == null) {
files = Lists.newArrayList();
}
files.add(task);
filesByPartition.put(taskPartition, files);
}
return filesByPartition;
}
  private StructLikeMap<List<List<FileScanTask>>> fileGroupsByPartition(
      StructLikeMap<List<FileScanTask>> filesByPartition) {
return filesByPartition.transformValues(this::planFileGroups);
}
  private List<List<FileScanTask>> planFileGroups(List<FileScanTask> tasks) {
return ImmutableList.copyOf(rewriter.planFileGroups(tasks));
}
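  // The resulting structure is partition -> list of file groups -> list of scan tasks. For a
  // table partitioned by day (illustrative shape, not real data):
  //   {day=2024-01-01} -> [[task1, task2], [task3]]   // two rewrite groups in this partition
  //   {day=2024-01-02} -> [[task4]]                   // one rewrite group
  // Each inner list becomes one RewriteFileGroup and is handled by a single rewriter.rewrite(...) call.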
@VisibleForTesting
RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup fileGroup) {
String desc = jobDesc(fileGroup, ctx);
    Set<DataFile> addedFiles =
withJobGroupInfo(
newJobGroupInfo("REWRITE-DATA-FILES", desc),
() -> rewriter.rewrite(fileGroup.fileScans()));
fileGroup.setOutputFiles(addedFiles);
LOG.info("Rewrite Files Ready to be Committed - {}", desc);
return fileGroup;
}
private ExecutorService rewriteService() {
return MoreExecutors.getExitingExecutorService(
(ThreadPoolExecutor)
Executors.newFixedThreadPool(
maxConcurrentFileGroupRewrites,
new ThreadFactoryBuilder().setNameFormat("Rewrite-Service-%d").build()));
}
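  // The pool size comes from MAX_CONCURRENT_FILE_GROUP_REWRITES, so at most that many file
  // groups are rewritten in parallel. For example (illustrative):
  //   action.option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "4")
  // runs up to four group rewrites concurrently; getExitingExecutorService lets the JVM exit
  // even if worker threads are still alive at shutdown.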
@VisibleForTesting
RewriteDataFilesCommitManager commitManager(long startingSnapshotId) {
return new RewriteDataFilesCommitManager(
table, startingSnapshotId, useStartingSequenceNumber, commitSummary());
}
private Builder doExecute(
RewriteExecutionContext ctx,
      Stream<RewriteFileGroup> groupStream,
RewriteDataFilesCommitManager commitManager) {
ExecutorService rewriteService = rewriteService();
    ConcurrentLinkedQueue<RewriteFileGroup> rewrittenGroups = Queues.newConcurrentLinkedQueue();
    Tasks.Builder<RewriteFileGroup> rewriteTaskBuilder =
Tasks.foreach(groupStream)
.executeWith(rewriteService)
.stopOnFailure()
.noRetry()
.onFailure(
(fileGroup, exception) -> {
LOG.warn(
"Failure during rewrite process for group {}", fileGroup.info(), exception);
});
try {
rewriteTaskBuilder.run(
fileGroup -> {
rewrittenGroups.add(rewriteFiles(ctx, fileGroup));
});
} catch (Exception e) {
// At least one rewrite group failed, clean up all completed rewrites
LOG.error(
"Cannot complete rewrite, {} is not enabled and one of the file set groups failed to "
+ "be rewritten. This error occurred during the writing of new files, not during the commit process. This "
+ "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. Enabling "
+ "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished "
+ "being written.",
PARTIAL_PROGRESS_ENABLED,
PARTIAL_PROGRESS_ENABLED,
rewrittenGroups.size(),
e);
Tasks.foreach(rewrittenGroups)
.suppressFailureWhenFinished()
.run(commitManager::abortFileGroup);
throw e;
} finally {
rewriteService.shutdown();
}
try {
commitManager.commitOrClean(Sets.newHashSet(rewrittenGroups));
} catch (ValidationException | CommitFailedException e) {
String errorMessage =
String.format(
"Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that "
+ "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of "
+ "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. "
+ "Separate smaller rewrite commits can succeed independently while any commits that conflict with "
+ "another Iceberg operation will be ignored. This mode will create additional snapshots in the table "
+ "history, one for each commit.",
PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS);
throw new RuntimeException(errorMessage, e);
}
    List<FileGroupRewriteResult> rewriteResults =
rewrittenGroups.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList());
return ImmutableRewriteDataFiles.Result.builder().rewriteResults(rewriteResults);
}
private Builder doExecuteWithPartialProgress(
RewriteExecutionContext ctx,
      Stream<RewriteFileGroup> groupStream,
RewriteDataFilesCommitManager commitManager) {
ExecutorService rewriteService = rewriteService();
// start commit service
int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING);
RewriteDataFilesCommitManager.CommitService commitService =
commitManager.service(groupsPerCommit);
commitService.start();
    Collection<FileGroupFailureResult> rewriteFailures = new ConcurrentLinkedQueue<>();
// start rewrite tasks
Tasks.foreach(groupStream)
.suppressFailureWhenFinished()
.executeWith(rewriteService)
.noRetry()
.onFailure(
(fileGroup, exception) -> {
LOG.error("Failure during rewrite group {}", fileGroup.info(), exception);
rewriteFailures.add(
ImmutableRewriteDataFiles.FileGroupFailureResult.builder()
.info(fileGroup.info())
.dataFilesCount(fileGroup.numFiles())
.build());
})
.run(fileGroup -> commitService.offer(rewriteFiles(ctx, fileGroup)));
rewriteService.shutdown();
// stop commit service
commitService.close();
int failedCommits = maxCommits - commitService.succeededCommits();
if (failedCommits > 0 && failedCommits <= maxFailedCommits) {
LOG.warn(
"{} is true but {} rewrite commits failed. Check the logs to determine why the individual "
+ "commits failed. If this is persistent it may help to increase {} which will split the rewrite operation "
+ "into smaller commits.",
PARTIAL_PROGRESS_ENABLED,
failedCommits,
PARTIAL_PROGRESS_MAX_COMMITS);
} else if (failedCommits > maxFailedCommits) {
String errorMessage =
String.format(
"%s is true but %d rewrite commits failed. This is more than the maximum allowed failures of %d. "
+ "Check the logs to determine why the individual commits failed. If this is persistent it may help to "
+ "increase %s which will split the rewrite operation into smaller commits.",
PARTIAL_PROGRESS_ENABLED,
failedCommits,
maxFailedCommits,
PARTIAL_PROGRESS_MAX_COMMITS);
throw new RuntimeException(errorMessage);
}
return ImmutableRewriteDataFiles.Result.builder()
.rewriteResults(toRewriteResults(commitService.results()))
.rewriteFailures(rewriteFailures);
}
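  // A hedged sketch of enabling this partial-progress path (option keys are constants on the
  // RewriteDataFiles interface; the values shown are illustrative):
  //   RewriteDataFiles.Result result =
  //       SparkActions.get(spark)
  //           .rewriteDataFiles(table)
  //           .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true")
  //           .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10")
  //           .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_FAILED_COMMITS, "3")
  //           .execute();
  // With 100 planned groups and 10 max commits, groupsPerCommit is ceil(100 / 10) = 10, so the
  // commit service batches roughly ten rewritten groups per snapshot commit.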
  Stream<RewriteFileGroup> toGroupStream(
      RewriteExecutionContext ctx, Map<StructLike, List<List<FileScanTask>>> groupsByPartition) {
return groupsByPartition.entrySet().stream()
.filter(e -> !e.getValue().isEmpty())
.flatMap(
e -> {
StructLike partition = e.getKey();
              List<List<FileScanTask>> scanGroups = e.getValue();
return scanGroups.stream().map(tasks -> newRewriteGroup(ctx, partition, tasks));
})
.sorted(RewriteFileGroup.comparator(rewriteJobOrder));
}
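  // REWRITE_JOB_ORDER controls the sort above. For example (illustrative):
  //   action.option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName())
  // processes the largest file groups first, which can surface the biggest space savings early.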
private RewriteFileGroup newRewriteGroup(
      RewriteExecutionContext ctx, StructLike partition, List<FileScanTask> tasks) {
int globalIndex = ctx.currentGlobalIndex();
int partitionIndex = ctx.currentPartitionIndex(partition);
FileGroupInfo info =
ImmutableRewriteDataFiles.FileGroupInfo.builder()
.globalIndex(globalIndex)
.partitionIndex(partitionIndex)
.partition(partition)
.build();
return new RewriteFileGroup(info, tasks);
}
  private Iterable<FileGroupRewriteResult> toRewriteResults(List<RewriteFileGroup> commitResults) {
return commitResults.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList());
}
void validateAndInitOptions() {
    Set<String> validOptions = Sets.newHashSet(rewriter.validOptions());
validOptions.addAll(VALID_OPTIONS);
    Set<String> invalidKeys = Sets.newHashSet(options().keySet());
invalidKeys.removeAll(validOptions);
Preconditions.checkArgument(
invalidKeys.isEmpty(),
"Cannot use options %s, they are not supported by the action or the rewriter %s",
invalidKeys,
rewriter.description());
rewriter.init(options());
maxConcurrentFileGroupRewrites =
PropertyUtil.propertyAsInt(
options(),
MAX_CONCURRENT_FILE_GROUP_REWRITES,
MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT);
maxCommits =
PropertyUtil.propertyAsInt(
options(), PARTIAL_PROGRESS_MAX_COMMITS, PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT);
maxFailedCommits =
PropertyUtil.propertyAsInt(options(), PARTIAL_PROGRESS_MAX_FAILED_COMMITS, maxCommits);
partialProgressEnabled =
PropertyUtil.propertyAsBoolean(
options(), PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED_DEFAULT);
useStartingSequenceNumber =
PropertyUtil.propertyAsBoolean(
options(), USE_STARTING_SEQUENCE_NUMBER, USE_STARTING_SEQUENCE_NUMBER_DEFAULT);
removeDanglingDeletes =
PropertyUtil.propertyAsBoolean(
options(), REMOVE_DANGLING_DELETES, REMOVE_DANGLING_DELETES_DEFAULT);
rewriteJobOrder =
RewriteJobOrder.fromName(
PropertyUtil.propertyAsString(options(), REWRITE_JOB_ORDER, REWRITE_JOB_ORDER_DEFAULT));
Preconditions.checkArgument(
maxConcurrentFileGroupRewrites >= 1,
"Cannot set %s to %s, the value must be positive.",
MAX_CONCURRENT_FILE_GROUP_REWRITES,
maxConcurrentFileGroupRewrites);
Preconditions.checkArgument(
!partialProgressEnabled || maxCommits > 0,
"Cannot set %s to %s, the value must be positive when %s is true",
PARTIAL_PROGRESS_MAX_COMMITS,
maxCommits,
PARTIAL_PROGRESS_ENABLED);
}
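  // Unknown options fail fast here, before any rewrite work starts. For example (illustrative),
  // assuming the default bin-pack rewriter:
  //   action.option("min-input-files", "5");  // accepted: a valid bin-pack rewriter option
  //   action.option("not-an-option", "x");    // rejected: throws IllegalArgumentException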
private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) {
StructLike partition = group.info().partition();
if (partition.size() > 0) {
return String.format(
"Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s",
group.rewrittenFiles().size(),
rewriter.description(),
group.info().globalIndex(),
ctx.totalGroupCount(),
partition,
group.info().partitionIndex(),
ctx.groupsInPartition(partition),
table.name());
} else {
return String.format(
"Rewriting %d files (%s, file group %d/%d) in %s",
group.rewrittenFiles().size(),
rewriter.description(),
group.info().globalIndex(),
ctx.totalGroupCount(),
table.name());
}
}
@VisibleForTesting
static class RewriteExecutionContext {
    private final StructLikeMap<Integer> numGroupsByPartition;
private final int totalGroupCount;
    private final Map<StructLike, Integer> partitionIndexMap;
private final AtomicInteger groupIndex;
    RewriteExecutionContext(StructLikeMap<List<List<FileScanTask>>> fileGroupsByPartition) {
this.numGroupsByPartition = fileGroupsByPartition.transformValues(List::size);
this.totalGroupCount = numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0);
this.partitionIndexMap = Maps.newConcurrentMap();
this.groupIndex = new AtomicInteger(1);
}
public int currentGlobalIndex() {
return groupIndex.getAndIncrement();
}
public int currentPartitionIndex(StructLike partition) {
return partitionIndexMap.merge(partition, 1, Integer::sum);
}
public int groupsInPartition(StructLike partition) {
return numGroupsByPartition.get(partition);
}
public int totalGroupCount() {
return totalGroupCount;
}
}
}