All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.iceberg.ReachableFileCleanup Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import java.io.IOException;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.util.Tasks;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * File cleanup strategy for snapshot expiration which determines, via an in-memory reference set,
 * metadata and data files that are not reachable given the previous and current table states.
 *
 * <p>The strategy compares the snapshots present before and after expiration, treats the manifests
 * of the removed snapshots as deletion candidates, drops every candidate that is still referenced
 * by a retained snapshot, and finally deletes the data files, manifests, manifest lists and
 * statistics files that are left over.
 */
class ReachableFileCleanup extends FileCleanupStrategy {

  private static final Logger LOG = LoggerFactory.getLogger(ReachableFileCleanup.class);

  /**
   * Creates the strategy; all arguments are forwarded unchanged to {@link FileCleanupStrategy}.
   *
   * @param fileIO file IO used to read manifests
   * @param deleteExecutorService executor handed to the parent strategy for deletes
   * @param planExecutorService executor used to read manifest lists and manifests in parallel
   * @param deleteFunc callback handed to the parent strategy for deleting a single path
   */
  ReachableFileCleanup(
      FileIO fileIO,
      ExecutorService deleteExecutorService,
      ExecutorService planExecutorService,
      Consumer<String> deleteFunc) {
    super(fileIO, deleteExecutorService, planExecutorService, deleteFunc);
  }

  /**
   * Deletes the files that were reachable from {@code beforeExpiration} but are no longer reachable
   * from {@code afterExpiration}.
   *
   * @param beforeExpiration table metadata before snapshots were expired
   * @param afterExpiration table metadata after snapshots were expired
   */
  @Override
  public void cleanFiles(TableMetadata beforeExpiration, TableMetadata afterExpiration) {
    Set<String> manifestListsToDelete = Sets.newHashSet();

    Set<Snapshot> snapshotsBeforeExpiration = Sets.newHashSet(beforeExpiration.snapshots());
    Set<Snapshot> snapshotsAfterExpiration = Sets.newHashSet(afterExpiration.snapshots());
    Set<Snapshot> expiredSnapshots = Sets.newHashSet();
    for (Snapshot snapshot : snapshotsBeforeExpiration) {
      // A snapshot is expired when it existed before but is absent afterwards.
      if (!snapshotsAfterExpiration.contains(snapshot)) {
        expiredSnapshots.add(snapshot);
        String manifestListLocation = snapshot.manifestListLocation();
        if (manifestListLocation != null) {
          manifestListsToDelete.add(manifestListLocation);
        }
      }
    }

    // Every manifest of an expired snapshot starts out as a candidate for deletion.
    Set<ManifestFile> deletionCandidates = readManifests(expiredSnapshots);

    if (!deletionCandidates.isEmpty()) {
      // Filled by the callback below from worker threads, hence the concurrent set.
      Set<ManifestFile> currentManifests = ConcurrentHashMap.newKeySet();
      Set<ManifestFile> manifestsToDelete =
          pruneReferencedManifests(
              snapshotsAfterExpiration, deletionCandidates, currentManifests::add);

      if (!manifestsToDelete.isEmpty()) {
        Set<String> dataFilesToDelete = findFilesToDelete(manifestsToDelete, currentManifests);
        deleteFiles(dataFilesToDelete, "data");
        Set<String> manifestPathsToDelete =
            manifestsToDelete.stream().map(ManifestFile::path).collect(Collectors.toSet());
        deleteFiles(manifestPathsToDelete, "manifest");
      }
    }

    deleteFiles(manifestListsToDelete, "manifest list");

    if (hasAnyStatisticsFiles(beforeExpiration)) {
      deleteFiles(
          expiredStatisticsFilesLocations(beforeExpiration, afterExpiration), "statistics files");
    }
  }

  /**
   * Removes from the deletion candidates every manifest that is listed by one of the given
   * snapshots.
   *
   * <p>Each manifest visited while candidates still remain is also reported (as a copy) to {@code
   * currentManifestCallback}. Once the candidate set becomes empty the remaining manifests of a
   * snapshot are skipped, so the callback does not necessarily see every manifest in that case.
   *
   * @param snapshots snapshots retained after expiration
   * @param deletionCandidates manifests referenced by the expired snapshots; not modified
   * @param currentManifestCallback receives manifests that are referenced by {@code snapshots}
   * @return the candidates that no snapshot in {@code snapshots} references
   */
  private Set<ManifestFile> pruneReferencedManifests(
      Set<Snapshot> snapshots,
      Set<ManifestFile> deletionCandidates,
      Consumer<ManifestFile> currentManifestCallback) {
    // Work on a concurrent copy because the tasks below run on planExecutorService.
    Set<ManifestFile> candidateSet = ConcurrentHashMap.newKeySet();
    candidateSet.addAll(deletionCandidates);
    Tasks.foreach(snapshots)
        .retry(3)
        .stopOnFailure()
        .throwFailureWhenFinished()
        .executeWith(planExecutorService)
        .onFailure(
            (snapshot, exc) ->
                LOG.warn(
                    "Failed to determine manifests for snapshot {}", snapshot.snapshotId(), exc))
        .run(
            snapshot -> {
              try (CloseableIterable<ManifestFile> manifestFiles = readManifests(snapshot)) {
                for (ManifestFile manifestFile : manifestFiles) {
                  candidateSet.remove(manifestFile);
                  if (candidateSet.isEmpty()) {
                    // Nothing left to delete, so there is no need to keep reading.
                    return;
                  }

                  // NOTE(review): copied before being retained, presumably because the iterable
                  // may reuse instances -- confirm against readManifests(Snapshot).
                  currentManifestCallback.accept(manifestFile.copy());
                }
              } catch (IOException e) {
                throw new RuntimeIOException(
                    e, "Failed to close manifest list: %s", snapshot.manifestListLocation());
              }
            });

    return candidateSet;
  }

  /**
   * Collects copies of all manifests listed by the given snapshots.
   *
   * @param snapshots snapshots whose manifests should be read
   * @return the set of manifests read from {@code snapshots}
   */
  private Set<ManifestFile> readManifests(Set<Snapshot> snapshots) {
    Set<ManifestFile> manifestFiles = ConcurrentHashMap.newKeySet();
    Tasks.foreach(snapshots)
        .retry(3)
        .stopOnFailure()
        .throwFailureWhenFinished()
        .executeWith(planExecutorService)
        .onFailure(
            (snapshot, exc) ->
                LOG.warn(
                    "Failed to determine manifests for snapshot {}", snapshot.snapshotId(), exc))
        .run(
            snapshot -> {
              try (CloseableIterable<ManifestFile> manifests = readManifests(snapshot)) {
                for (ManifestFile manifestFile : manifests) {
                  manifestFiles.add(manifestFile.copy());
                }
              } catch (IOException e) {
                throw new RuntimeIOException(
                    e, "Failed to close manifest list: %s", snapshot.manifestListLocation());
              }
            });

    return manifestFiles;
  }

  /**
   * Determines the data file paths to delete: every path read from {@code manifestFilesToDelete}
   * that is not also read from {@code currentManifestFiles}.
   *
   * <p>If reading the current manifests fails, an empty set is returned so that no data file is
   * deleted on incomplete information.
   *
   * @param manifestFilesToDelete manifests that are no longer referenced
   * @param currentManifestFiles manifests that are still referenced
   * @return paths that appear only in the manifests to delete, or an empty set on failure
   */
  private Set<String> findFilesToDelete(
      Set<ManifestFile> manifestFilesToDelete, Set<ManifestFile> currentManifestFiles) {
    Set<String> filesToDelete = ConcurrentHashMap.newKeySet();

    // Failures are suppressed here: a manifest that cannot be read contributes no candidates.
    Tasks.foreach(manifestFilesToDelete)
        .retry(3)
        .suppressFailureWhenFinished()
        .executeWith(planExecutorService)
        .onFailure(
            (item, exc) ->
                LOG.warn(
                    "Failed to determine live files in manifest {}. Retrying", item.path(), exc))
        .run(
            manifest -> {
              try (CloseableIterable<String> paths = ManifestFiles.readPaths(manifest, fileIO)) {
                paths.forEach(filesToDelete::add);
              } catch (IOException e) {
                throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest);
              }
            });

    if (filesToDelete.isEmpty()) {
      return filesToDelete;
    }

    try {
      // Failures are rethrown here and handled below, because a live manifest that could not be
      // read would leave its files in the deletion set.
      Tasks.foreach(currentManifestFiles)
          .retry(3)
          .stopOnFailure()
          .throwFailureWhenFinished()
          .executeWith(planExecutorService)
          .onFailure(
              (item, exc) ->
                  LOG.warn(
                      "Failed to determine live files in manifest {}. Retrying", item.path(), exc))
          .run(
              manifest -> {
                if (filesToDelete.isEmpty()) {
                  return;
                }

                // Remove all the live files from the candidate deletion set
                try (CloseableIterable<String> paths = ManifestFiles.readPaths(manifest, fileIO)) {
                  paths.forEach(filesToDelete::remove);
                } catch (IOException e) {
                  throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest);
                }
              });

    } catch (Throwable e) {
      // Deliberately broad: on any failure fall back to deleting no data files at all.
      LOG.warn("Failed to list all reachable files", e);
      return Sets.newHashSet();
    }

    return filesToDelete;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy