org.apache.iceberg.actions.RewriteDataFiles Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-api Show documentation
A table format for huge analytic datasets
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.actions;

import java.util.List;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.expressions.Expression;

/**
 * An action for rewriting data files according to a rewrite strategy.
 * Generally used for optimizing the sizing and layout of data files within a table.
 * <p>
 * The supertype is parameterized with this interface and with {@link Result}, so no raw types are used.
 */
public interface RewriteDataFiles extends SnapshotUpdate<RewriteDataFiles, RewriteDataFiles.Result> {

  /**
   * Enable committing groups of files (see max-file-group-size-bytes) prior to the entire rewrite completing.
   * This will produce additional commits but allow for progress even if some groups fail to commit. This setting
   * will not change the correctness of the rewrite operation as file groups can be compacted independently.
   * <p>
   * The default is false, which produces a single commit when the entire job has completed.
   */
  String PARTIAL_PROGRESS_ENABLED = "partial-progress.enabled";
  boolean PARTIAL_PROGRESS_ENABLED_DEFAULT = false;

  /**
   * The maximum number of Iceberg commits that this rewrite is allowed to produce if partial progress is enabled.
   * This setting has no effect if partial progress is disabled.
   */
  String PARTIAL_PROGRESS_MAX_COMMITS = "partial-progress.max-commits";
  int PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT = 10;

  /**
   * The entire rewrite operation is broken down into pieces based on partitioning and within partitions based
   * on size into groups. These sub-units of the rewrite are referred to as file groups. The largest amount of data that
   * should be compacted in a single group is controlled by {@link #MAX_FILE_GROUP_SIZE_BYTES}. This helps with
   * breaking down the rewriting of very large partitions which may not be rewritable otherwise due to the resource
   * constraints of the cluster. For example a sort based rewrite may not scale to terabyte sized partitions, those
   * partitions need to be worked on in small subsections to avoid exhaustion of resources.
   * <p>
   * When grouping files, the underlying rewrite strategy will use this value to limit the files which
   * will be included in a single file group. A group will be processed by a single framework "action". For example,
   * in Spark this means that each group would be rewritten in its own Spark action. A group will never contain files
   * for multiple output partitions.
   */
  String MAX_FILE_GROUP_SIZE_BYTES = "max-file-group-size-bytes";
  long MAX_FILE_GROUP_SIZE_BYTES_DEFAULT = 1024L * 1024L * 1024L * 100L; // 100 Gigabytes

  /**
   * The max number of file groups to be simultaneously rewritten by the rewrite strategy. The structure and
   * contents of the group is determined by the rewrite strategy. Each file group will be rewritten
   * independently and asynchronously.
   */
  String MAX_CONCURRENT_FILE_GROUP_REWRITES = "max-concurrent-file-group-rewrites";
  int MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT = 1;

  /**
   * The output file size that this rewrite strategy will attempt to generate when rewriting files. By default this
   * will use the "write.target-file-size-bytes" value in the table properties of the table being updated.
   */
  String TARGET_FILE_SIZE_BYTES = "target-file-size-bytes";

  /**
   * If the compaction should use the sequence number of the snapshot at compaction start time for new data files,
   * instead of using the sequence number of the newly produced snapshot.
   * <p>
   * This avoids commit conflicts with updates that add newer equality deletes at a higher sequence number.
   * <p>
   * Defaults to true.
   */
  String USE_STARTING_SEQUENCE_NUMBER = "use-starting-sequence-number";
  boolean USE_STARTING_SEQUENCE_NUMBER_DEFAULT = true;

  /**
   * Choose BINPACK as a strategy for this rewrite operation.
   * <p>
   * The default implementation changes nothing and simply returns this action.
   *
   * @return this for method chaining
   */
  default RewriteDataFiles binPack() {
    return this;
  }

  /**
   * Choose SORT as a strategy for this rewrite operation using the table's sortOrder.
   *
   * @return this for method chaining
   * @throws UnsupportedOperationException always, unless an implementation overrides this default
   */
  default RewriteDataFiles sort() {
    throw new UnsupportedOperationException("SORT Rewrite Strategy not implemented for this framework");
  }

  /**
   * Choose SORT as a strategy for this rewrite operation and manually specify the sortOrder to use.
   *
   * @param sortOrder user defined sortOrder
   * @return this for method chaining
   * @throws UnsupportedOperationException always, unless an implementation overrides this default
   */
  default RewriteDataFiles sort(SortOrder sortOrder) {
    throw new UnsupportedOperationException("SORT Rewrite Strategy not implemented for this framework");
  }

  /**
   * A user provided filter for determining which files will be considered by the rewrite strategy. This will be used
   * in addition to whatever rules the rewrite strategy generates. For example this would be used for providing a
   * restriction to only run rewrite on a specific partition.
   *
   * @param expression An iceberg expression used to determine which files will be considered for rewriting
   * @return this for chaining
   */
  RewriteDataFiles filter(Expression expression);

  /**
   * A map of file group information to the results of rewriting that file group. If the results are null then
   * that particular file group failed. We should only have failed groups if partial progress is enabled otherwise we
   * will report a total failure for the job.
   */
  interface Result {
    /**
     * Returns the per-file-group results of this rewrite.
     * <p>
     * The element type is declared explicitly so the aggregating default methods below can use
     * typed method references on the stream.
     *
     * @return one {@link FileGroupRewriteResult} per reported file group
     */
    List<FileGroupRewriteResult> rewriteResults();

    /**
     * Returns the sum of {@link FileGroupRewriteResult#addedDataFilesCount()} over {@link #rewriteResults()}.
     *
     * @return total number of added data files across all reported file groups
     */
    default int addedDataFilesCount() {
      return rewriteResults().stream().mapToInt(FileGroupRewriteResult::addedDataFilesCount).sum();
    }

    /**
     * Returns the sum of {@link FileGroupRewriteResult#rewrittenDataFilesCount()} over {@link #rewriteResults()}.
     *
     * @return total number of rewritten data files across all reported file groups
     */
    default int rewrittenDataFilesCount() {
      return rewriteResults().stream().mapToInt(FileGroupRewriteResult::rewrittenDataFilesCount).sum();
    }
  }

  /**
   * For a particular file group, the number of files which are newly created and the number of files
   * which were formerly part of the table but have been rewritten.
   */
  interface FileGroupRewriteResult {
    /**
     * Returns the description of the file group this result belongs to.
     */
    FileGroupInfo info();

    /**
     * Returns the number of data files newly created for this file group.
     */
    int addedDataFilesCount();

    /**
     * Returns the number of data files of this file group that were formerly part of the table and have been
     * rewritten.
     */
    int rewrittenDataFilesCount();
  }

  /**
   * A description of a file group, when it was processed, and within which partition. For use
   * tracking rewrite operations and for returning results.
   */
  interface FileGroupInfo {

    /**
     * Returns which file group this is out of the total set of file groups for this rewrite.
     */
    int globalIndex();

    /**
     * Returns which file group this is out of the set of file groups for this partition.
     */
    int partitionIndex();

    /**
     * Returns which partition this file group contains files from.
     */
    StructLike partition();
  }
}