All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.RemoveSnapshots Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS;
import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT;
import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS;
import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT;
import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES;
import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT;
import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS;
import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT;
import static org.apache.iceberg.TableProperties.GC_ENABLED;
import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT;
import static org.apache.iceberg.TableProperties.MAX_REF_AGE_MS;
import static org.apache.iceberg.TableProperties.MAX_REF_AGE_MS_DEFAULT;
import static org.apache.iceberg.TableProperties.MAX_SNAPSHOT_AGE_MS;
import static org.apache.iceberg.TableProperties.MAX_SNAPSHOT_AGE_MS_DEFAULT;
import static org.apache.iceberg.TableProperties.MIN_SNAPSHOTS_TO_KEEP;
import static org.apache.iceberg.TableProperties.MIN_SNAPSHOTS_TO_KEEP_DEFAULT;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.function.Consumer;
import org.apache.iceberg.exceptions.CommitFailedException;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.util.DateTimeUtil;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.iceberg.util.Tasks;
import org.apache.iceberg.util.ThreadPools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@SuppressWarnings("UnnecessaryAnonymousClass")
class RemoveSnapshots implements ExpireSnapshots {
  private static final Logger LOG = LoggerFactory.getLogger(RemoveSnapshots.class);

  // Creates an executor service that runs each task in the thread that invokes execute/submit.
  private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE =
      MoreExecutors.newDirectExecutorService();

  private final Consumer defaultDelete =
      new Consumer() {
        @Override
        public void accept(String file) {
          ops.io().deleteFile(file);
        }
      };

  private final TableOperations ops;
  private final Set idsToRemove = Sets.newHashSet();
  private final long now;
  private final long defaultMaxRefAgeMs;
  private boolean cleanExpiredFiles = true;
  private TableMetadata base;
  private long defaultExpireOlderThan;
  private int defaultMinNumSnapshots;
  private Consumer deleteFunc = defaultDelete;
  private ExecutorService deleteExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE;
  private ExecutorService planExecutorService = ThreadPools.getWorkerPool();
  private Boolean incrementalCleanup;
  private boolean specifiedSnapshotId = false;

  RemoveSnapshots(TableOperations ops) {
    this.ops = ops;
    this.base = ops.current();
    ValidationException.check(
        PropertyUtil.propertyAsBoolean(base.properties(), GC_ENABLED, GC_ENABLED_DEFAULT),
        "Cannot expire snapshots: GC is disabled (deleting files may corrupt other tables)");

    long defaultMaxSnapshotAgeMs =
        PropertyUtil.propertyAsLong(
            base.properties(), MAX_SNAPSHOT_AGE_MS, MAX_SNAPSHOT_AGE_MS_DEFAULT);

    this.now = System.currentTimeMillis();
    this.defaultExpireOlderThan = now - defaultMaxSnapshotAgeMs;
    this.defaultMinNumSnapshots =
        PropertyUtil.propertyAsInt(
            base.properties(), MIN_SNAPSHOTS_TO_KEEP, MIN_SNAPSHOTS_TO_KEEP_DEFAULT);

    this.defaultMaxRefAgeMs =
        PropertyUtil.propertyAsLong(base.properties(), MAX_REF_AGE_MS, MAX_REF_AGE_MS_DEFAULT);
  }

  @Override
  public ExpireSnapshots cleanExpiredFiles(boolean clean) {
    this.cleanExpiredFiles = clean;
    return this;
  }

  @Override
  public ExpireSnapshots expireSnapshotId(long expireSnapshotId) {
    LOG.info("Expiring snapshot with id: {}", expireSnapshotId);
    idsToRemove.add(expireSnapshotId);
    specifiedSnapshotId = true;
    return this;
  }

  @Override
  public ExpireSnapshots expireOlderThan(long timestampMillis) {
    LOG.info(
        "Expiring snapshots older than: {} ({})",
        DateTimeUtil.formatTimestampMillis(timestampMillis),
        timestampMillis);
    this.defaultExpireOlderThan = timestampMillis;
    return this;
  }

  @Override
  public ExpireSnapshots retainLast(int numSnapshots) {
    Preconditions.checkArgument(
        1 <= numSnapshots,
        "Number of snapshots to retain must be at least 1, cannot be: %s",
        numSnapshots);
    this.defaultMinNumSnapshots = numSnapshots;
    return this;
  }

  @Override
  public ExpireSnapshots deleteWith(Consumer newDeleteFunc) {
    this.deleteFunc = newDeleteFunc;
    return this;
  }

  @Override
  public ExpireSnapshots executeDeleteWith(ExecutorService executorService) {
    this.deleteExecutorService = executorService;
    return this;
  }

  @Override
  public ExpireSnapshots planWith(ExecutorService executorService) {
    this.planExecutorService = executorService;
    return this;
  }

  @Override
  public List apply() {
    TableMetadata updated = internalApply();
    List removed = Lists.newArrayList(base.snapshots());
    removed.removeAll(updated.snapshots());

    return removed;
  }

  private TableMetadata internalApply() {
    this.base = ops.refresh();
    if (base.snapshots().isEmpty()) {
      return base;
    }

    Set idsToRetain = Sets.newHashSet();
    // Identify refs that should be removed
    Map retainedRefs = computeRetainedRefs(base.refs());
    Map> retainedIdToRefs = Maps.newHashMap();
    for (Map.Entry retainedRefEntry : retainedRefs.entrySet()) {
      long snapshotId = retainedRefEntry.getValue().snapshotId();
      retainedIdToRefs.putIfAbsent(snapshotId, Lists.newArrayList());
      retainedIdToRefs.get(snapshotId).add(retainedRefEntry.getKey());
      idsToRetain.add(snapshotId);
    }

    for (long idToRemove : idsToRemove) {
      List refsForId = retainedIdToRefs.get(idToRemove);
      Preconditions.checkArgument(
          refsForId == null,
          "Cannot expire %s. Still referenced by refs: %s",
          idToRemove,
          refsForId);
    }

    idsToRetain.addAll(computeAllBranchSnapshotsToRetain(retainedRefs.values()));
    idsToRetain.addAll(unreferencedSnapshotsToRetain(retainedRefs.values()));

    TableMetadata.Builder updatedMetaBuilder = TableMetadata.buildFrom(base);

    base.refs().keySet().stream()
        .filter(ref -> !retainedRefs.containsKey(ref))
        .forEach(updatedMetaBuilder::removeRef);

    base.snapshots().stream()
        .map(Snapshot::snapshotId)
        .filter(snapshot -> !idsToRetain.contains(snapshot))
        .forEach(idsToRemove::add);
    updatedMetaBuilder.removeSnapshots(idsToRemove);

    return updatedMetaBuilder.build();
  }

  private Map computeRetainedRefs(Map refs) {
    Map retainedRefs = Maps.newHashMap();
    for (Map.Entry refEntry : refs.entrySet()) {
      String name = refEntry.getKey();
      SnapshotRef ref = refEntry.getValue();
      if (name.equals(SnapshotRef.MAIN_BRANCH)) {
        retainedRefs.put(name, ref);
        continue;
      }

      Snapshot snapshot = base.snapshot(ref.snapshotId());
      long maxRefAgeMs = ref.maxRefAgeMs() != null ? ref.maxRefAgeMs() : defaultMaxRefAgeMs;
      if (snapshot != null) {
        long refAgeMs = now - snapshot.timestampMillis();
        if (refAgeMs <= maxRefAgeMs) {
          retainedRefs.put(name, ref);
        }
      } else {
        LOG.warn("Removing invalid ref {}: snapshot {} does not exist", name, ref.snapshotId());
      }
    }

    return retainedRefs;
  }

  private Set computeAllBranchSnapshotsToRetain(Collection refs) {
    Set branchSnapshotsToRetain = Sets.newHashSet();
    for (SnapshotRef ref : refs) {
      if (ref.isBranch()) {
        long expireSnapshotsOlderThan =
            ref.maxSnapshotAgeMs() != null ? now - ref.maxSnapshotAgeMs() : defaultExpireOlderThan;
        int minSnapshotsToKeep =
            ref.minSnapshotsToKeep() != null ? ref.minSnapshotsToKeep() : defaultMinNumSnapshots;
        branchSnapshotsToRetain.addAll(
            computeBranchSnapshotsToRetain(
                ref.snapshotId(), expireSnapshotsOlderThan, minSnapshotsToKeep));
      }
    }

    return branchSnapshotsToRetain;
  }

  private Set computeBranchSnapshotsToRetain(
      long snapshot, long expireSnapshotsOlderThan, int minSnapshotsToKeep) {
    Set idsToRetain = Sets.newHashSet();
    for (Snapshot ancestor : SnapshotUtil.ancestorsOf(snapshot, base::snapshot)) {
      if (idsToRetain.size() < minSnapshotsToKeep
          || ancestor.timestampMillis() >= expireSnapshotsOlderThan) {
        idsToRetain.add(ancestor.snapshotId());
      } else {
        return idsToRetain;
      }
    }

    return idsToRetain;
  }

  private Set unreferencedSnapshotsToRetain(Collection refs) {
    Set referencedSnapshots = Sets.newHashSet();
    for (SnapshotRef ref : refs) {
      if (ref.isBranch()) {
        for (Snapshot snapshot : SnapshotUtil.ancestorsOf(ref.snapshotId(), base::snapshot)) {
          referencedSnapshots.add(snapshot.snapshotId());
        }
      } else {
        referencedSnapshots.add(ref.snapshotId());
      }
    }

    Set snapshotsToRetain = Sets.newHashSet();
    for (Snapshot snapshot : base.snapshots()) {
      if (!referencedSnapshots.contains(snapshot.snapshotId())
          && // unreferenced
          snapshot.timestampMillis() >= defaultExpireOlderThan) { // not old enough to expire
        snapshotsToRetain.add(snapshot.snapshotId());
      }
    }

    return snapshotsToRetain;
  }

  @Override
  public void commit() {
    Tasks.foreach(ops)
        .retry(base.propertyAsInt(COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT))
        .exponentialBackoff(
            base.propertyAsInt(COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT),
            base.propertyAsInt(COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT),
            base.propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT),
            2.0 /* exponential */)
        .onlyRetryOn(CommitFailedException.class)
        .run(
            item -> {
              TableMetadata updated = internalApply();
              ops.commit(base, updated);
            });
    LOG.info("Committed snapshot changes");

    if (cleanExpiredFiles) {
      cleanExpiredSnapshots();
    }
  }

  ExpireSnapshots withIncrementalCleanup(boolean useIncrementalCleanup) {
    this.incrementalCleanup = useIncrementalCleanup;
    return this;
  }

  private void cleanExpiredSnapshots() {
    TableMetadata current = ops.refresh();

    if (specifiedSnapshotId) {
      if (incrementalCleanup != null && incrementalCleanup) {
        throw new UnsupportedOperationException(
            "Cannot clean files incrementally when snapshot IDs are specified");
      }

      incrementalCleanup = false;
    }

    if (incrementalCleanup == null) {
      incrementalCleanup = current.refs().size() == 1;
    }

    LOG.info(
        "Cleaning up expired files (local, {})", incrementalCleanup ? "incremental" : "reachable");

    FileCleanupStrategy cleanupStrategy =
        incrementalCleanup
            ? new IncrementalFileCleanup(
                ops.io(), deleteExecutorService, planExecutorService, deleteFunc)
            : new ReachableFileCleanup(
                ops.io(), deleteExecutorService, planExecutorService, deleteFunc);

    cleanupStrategy.cleanFiles(base, current);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy