// org.apache.iceberg.ReachableFileCleanup (Maven / Gradle / Ivy)
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of iceberg-core. Show documentation.
// A table format for huge analytic datasets.
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import java.io.IOException;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.util.Tasks;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * File cleanup strategy for snapshot expiration which determines, via an in-memory reference set,
 * metadata and data files that are not reachable given the previous and current table states.
 *
 * <p>The strategy compares the snapshots present before and after expiration. Manifest lists of
 * expired snapshots are always deleted. Manifests and the files they reference are deleted only
 * when no snapshot that survives expiration still refers to them.
 *
 * <p>Members such as {@code fileIO}, {@code planExecutorService}, {@code readManifests(Snapshot)},
 * {@code deleteFiles}, {@code hasAnyStatisticsFiles} and {@code expiredStatisticsFilesLocations}
 * are not declared in this class; they are presumably inherited from {@link FileCleanupStrategy}.
 */
class ReachableFileCleanup extends FileCleanupStrategy {
  private static final Logger LOG = LoggerFactory.getLogger(ReachableFileCleanup.class);

  /**
   * Creates the strategy. Every argument is forwarded unchanged to the superclass constructor.
   *
   * @param fileIO file IO passed to the superclass
   * @param deleteExecutorService executor passed to the superclass (presumably used for deletes)
   * @param planExecutorService executor passed to the superclass; this class runs its
   *     manifest-reading tasks on {@code planExecutorService}
   * @param deleteFunc callback passed to the superclass (presumably invoked per path to delete)
   */
  ReachableFileCleanup(
      FileIO fileIO,
      ExecutorService deleteExecutorService,
      ExecutorService planExecutorService,
      Consumer<String> deleteFunc) {
    super(fileIO, deleteExecutorService, planExecutorService, deleteFunc);
  }

  /**
   * Deletes the files that became unreachable between the two table states.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Collect snapshots present before but not after expiration, and their manifest lists.
   *   <li>Read the manifests of those expired snapshots as deletion candidates.
   *   <li>Drop every candidate that a surviving snapshot still references.
   *   <li>For the remaining manifests, delete the files they list that no surviving manifest
   *       lists, then delete the manifests themselves.
   *   <li>Delete the expired manifest lists and, if the old metadata has statistics files, the
   *       expired statistics files.
   * </ol>
   *
   * @param beforeExpiration table metadata before snapshots were expired
   * @param afterExpiration table metadata after snapshots were expired
   */
  @Override
  public void cleanFiles(TableMetadata beforeExpiration, TableMetadata afterExpiration) {
    Set<String> manifestListsToDelete = Sets.newHashSet();
    Set<Snapshot> snapshotsBeforeExpiration = Sets.newHashSet(beforeExpiration.snapshots());
    Set<Snapshot> snapshotsAfterExpiration = Sets.newHashSet(afterExpiration.snapshots());
    Set<Snapshot> expiredSnapshots = Sets.newHashSet();
    for (Snapshot snapshot : snapshotsBeforeExpiration) {
      if (!snapshotsAfterExpiration.contains(snapshot)) {
        expiredSnapshots.add(snapshot);
        // A snapshot may have no manifest list location; only record the ones that do.
        if (snapshot.manifestListLocation() != null) {
          manifestListsToDelete.add(snapshot.manifestListLocation());
        }
      }
    }

    Set<ManifestFile> deletionCandidates = readManifests(expiredSnapshots);

    if (!deletionCandidates.isEmpty()) {
      // Filled concurrently by the pruning tasks, hence a concurrent set.
      Set<ManifestFile> currentManifests = ConcurrentHashMap.newKeySet();
      Set<ManifestFile> manifestsToDelete =
          pruneReferencedManifests(
              snapshotsAfterExpiration, deletionCandidates, currentManifests::add);

      if (!manifestsToDelete.isEmpty()) {
        Set<String> dataFilesToDelete = findFilesToDelete(manifestsToDelete, currentManifests);
        deleteFiles(dataFilesToDelete, "data");
        Set<String> manifestPathsToDelete =
            manifestsToDelete.stream().map(ManifestFile::path).collect(Collectors.toSet());
        deleteFiles(manifestPathsToDelete, "manifest");
      }
    }

    deleteFiles(manifestListsToDelete, "manifest list");

    if (hasAnyStatisticsFiles(beforeExpiration)) {
      deleteFiles(
          expiredStatisticsFilesLocations(beforeExpiration, afterExpiration), "statistics files");
    }
  }

  /**
   * Removes from the candidates every manifest that one of the given snapshots still references.
   *
   * <p>Each snapshot is processed as a task on {@code planExecutorService}. A task stops early
   * once the candidate set is empty, so the callback is only guaranteed to have seen every
   * manifest of every snapshot when the returned set is non-empty.
   *
   * @param snapshots snapshots whose manifests must be kept
   * @param deletionCandidates manifests that may be deleted; not modified by this method
   * @param currentManifestCallback receives a copy of each manifest read from {@code snapshots}
   *     that was visited while candidates remained; may be called from several threads
   * @return the candidates that none of the snapshots reference (matched via set membership)
   */
  private Set<ManifestFile> pruneReferencedManifests(
      Set<Snapshot> snapshots,
      Set<ManifestFile> deletionCandidates,
      Consumer<ManifestFile> currentManifestCallback) {
    // Work on a concurrent copy so parallel tasks can remove entries safely.
    Set<ManifestFile> candidateSet = ConcurrentHashMap.newKeySet();
    candidateSet.addAll(deletionCandidates);

    Tasks.foreach(snapshots)
        .retry(3)
        .stopOnFailure()
        .throwFailureWhenFinished()
        .executeWith(planExecutorService)
        .onFailure(
            (snapshot, exc) ->
                LOG.warn(
                    "Failed to determine manifests for snapshot {}", snapshot.snapshotId(), exc))
        .run(
            snapshot -> {
              try (CloseableIterable<ManifestFile> manifestFiles = readManifests(snapshot)) {
                for (ManifestFile manifestFile : manifestFiles) {
                  candidateSet.remove(manifestFile);
                  // Nothing left to delete: the current manifests are no longer needed.
                  if (candidateSet.isEmpty()) {
                    return;
                  }

                  currentManifestCallback.accept(manifestFile.copy());
                }
              } catch (IOException e) {
                throw new RuntimeIOException(
                    e, "Failed to close manifest list: %s", snapshot.manifestListLocation());
              }
            });

    return candidateSet;
  }

  /**
   * Reads the manifests of all given snapshots into one set.
   *
   * <p>Each snapshot is processed as a task on {@code planExecutorService}; the task chain is
   * configured with {@code stopOnFailure()} and {@code throwFailureWhenFinished()}.
   *
   * @param snapshots snapshots whose manifests should be collected
   * @return copies of every manifest read from the snapshots
   */
  private Set<ManifestFile> readManifests(Set<Snapshot> snapshots) {
    Set<ManifestFile> manifestFiles = ConcurrentHashMap.newKeySet();

    Tasks.foreach(snapshots)
        .retry(3)
        .stopOnFailure()
        .throwFailureWhenFinished()
        .executeWith(planExecutorService)
        .onFailure(
            (snapshot, exc) ->
                LOG.warn(
                    "Failed to determine manifests for snapshot {}", snapshot.snapshotId(), exc))
        .run(
            snapshot -> {
              try (CloseableIterable<ManifestFile> manifests = readManifests(snapshot)) {
                for (ManifestFile manifestFile : manifests) {
                  // Store a copy rather than the instance handed out by the iterable.
                  manifestFiles.add(manifestFile.copy());
                }
              } catch (IOException e) {
                throw new RuntimeIOException(
                    e, "Failed to close manifest list: %s", snapshot.manifestListLocation());
              }
            });

    return manifestFiles;
  }

  /**
   * Helper to determine data files to delete.
   *
   * <p>First gathers every path listed by the manifests being deleted, then removes every path
   * that a current manifest still lists. The first pass is configured with {@code
   * suppressFailureWhenFinished()}. If the second pass throws, an empty set is returned so that
   * nothing is deleted on incomplete information.
   *
   * @param manifestFilesToDelete manifests that are about to be deleted
   * @param currentManifestFiles manifests still referenced after expiration
   * @return paths listed only by {@code manifestFilesToDelete}; empty if the live-file scan fails
   */
  private Set<String> findFilesToDelete(
      Set<ManifestFile> manifestFilesToDelete, Set<ManifestFile> currentManifestFiles) {
    Set<String> filesToDelete = ConcurrentHashMap.newKeySet();

    Tasks.foreach(manifestFilesToDelete)
        .retry(3)
        .suppressFailureWhenFinished()
        .executeWith(planExecutorService)
        .onFailure(
            (item, exc) ->
                LOG.warn(
                    "Failed to determine live files in manifest {}. Retrying", item.path(), exc))
        .run(
            manifest -> {
              try (CloseableIterable<String> paths = ManifestFiles.readPaths(manifest, fileIO)) {
                paths.forEach(filesToDelete::add);
              } catch (IOException e) {
                throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest);
              }
            });

    if (filesToDelete.isEmpty()) {
      return filesToDelete;
    }

    try {
      Tasks.foreach(currentManifestFiles)
          .retry(3)
          .stopOnFailure()
          .throwFailureWhenFinished()
          .executeWith(planExecutorService)
          .onFailure(
              (item, exc) ->
                  LOG.warn(
                      "Failed to determine live files in manifest {}. Retrying", item.path(), exc))
          .run(
              manifest -> {
                // Every candidate is already known to be live; skip reading this manifest.
                if (filesToDelete.isEmpty()) {
                  return;
                }

                // Remove all the live files from the candidate deletion set
                try (CloseableIterable<String> paths = ManifestFiles.readPaths(manifest, fileIO)) {
                  paths.forEach(filesToDelete::remove);
                } catch (IOException e) {
                  throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest);
                }
              });

    } catch (Throwable e) {
      // Deliberately broad: without a complete live-file scan, deleting anything is unsafe.
      LOG.warn("Failed to list all reachable files", e);
      return Sets.newHashSet();
    }

    return filesToDelete;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy