// Artifact listing: org.dinky.shaded.paimon.operation.FileDeletionBase (Maven / Gradle / Ivy)
// The newest version.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dinky.shaded.paimon.operation;
import org.dinky.shaded.paimon.Snapshot;
import org.dinky.shaded.paimon.data.BinaryRow;
import org.dinky.shaded.paimon.fs.FileIO;
import org.dinky.shaded.paimon.fs.Path;
import org.dinky.shaded.paimon.index.IndexFileHandler;
import org.dinky.shaded.paimon.index.IndexFileMeta;
import org.dinky.shaded.paimon.manifest.IndexManifestEntry;
import org.dinky.shaded.paimon.manifest.ManifestEntry;
import org.dinky.shaded.paimon.manifest.ManifestFile;
import org.dinky.shaded.paimon.manifest.ManifestFileMeta;
import org.dinky.shaded.paimon.manifest.ManifestList;
import org.dinky.shaded.paimon.utils.FileStorePathFactory;
import org.dinky.shaded.paimon.utils.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Collectors;
/**
* Base class for file deletion including methods for clean data files, manifest files and empty
* data directories.
*/
public abstract class FileDeletionBase {
private static final Logger LOG = LoggerFactory.getLogger(FileDeletionBase.class);
protected final FileIO fileIO;
protected final FileStorePathFactory pathFactory;
protected final ManifestFile manifestFile;
protected final ManifestList manifestList;
protected final IndexFileHandler indexFileHandler;
protected final Map> deletionBuckets;
protected final Executor ioExecutor;
public FileDeletionBase(
FileIO fileIO,
FileStorePathFactory pathFactory,
ManifestFile manifestFile,
ManifestList manifestList,
IndexFileHandler indexFileHandler) {
this.fileIO = fileIO;
this.pathFactory = pathFactory;
this.manifestFile = manifestFile;
this.manifestList = manifestList;
this.indexFileHandler = indexFileHandler;
this.deletionBuckets = new HashMap<>();
this.ioExecutor = FileUtils.COMMON_IO_FORK_JOIN_POOL;
}
/**
* Clean data files that will not be used anymore in the snapshot.
*
* @param snapshot {@link Snapshot} that will be cleaned
* @param skipper if the test result of a data file is true, it will be skipped when deleting;
* else it will be deleted
*/
public abstract void cleanUnusedDataFiles(Snapshot snapshot, Predicate skipper);
/**
* Clean metadata files that will not be used anymore of a snapshot, including data manifests,
* index manifests and manifest lists.
*
* @param snapshot {@link Snapshot} that will be cleaned
* @param skippingSet manifests that should not be deleted
*/
public abstract void cleanUnusedManifests(Snapshot snapshot, Set skippingSet);
/** Try to delete data directories that may be empty after data file deletion. */
public void cleanDataDirectories() {
if (deletionBuckets.isEmpty()) {
return;
}
// All directory paths are deduplicated and sorted by hierarchy level
Map> deduplicate = new HashMap<>();
for (Map.Entry> entry : deletionBuckets.entrySet()) {
List toDeleteEmptyDirectory = new ArrayList<>();
// try to delete bucket directories
for (Integer bucket : entry.getValue()) {
toDeleteEmptyDirectory.add(pathFactory.bucketPath(entry.getKey(), bucket));
}
deleteFiles(toDeleteEmptyDirectory, this::tryDeleteEmptyDirectory);
List hierarchicalPaths = pathFactory.getHierarchicalPartitionPath(entry.getKey());
int hierarchies = hierarchicalPaths.size();
if (hierarchies == 0) {
continue;
}
if (tryDeleteEmptyDirectory(hierarchicalPaths.get(hierarchies - 1))) {
// deduplicate high level partition directories
for (int hierarchy = 0; hierarchy < hierarchies - 1; hierarchy++) {
Path path = hierarchicalPaths.get(hierarchy);
deduplicate.computeIfAbsent(hierarchy, i -> new HashSet<>()).add(path);
}
}
}
// from deepest to shallowest
for (int hierarchy = deduplicate.size() - 1; hierarchy >= 0; hierarchy--) {
deduplicate.get(hierarchy).forEach(this::tryDeleteEmptyDirectory);
}
deletionBuckets.clear();
}
protected void recordDeletionBuckets(ManifestEntry entry) {
deletionBuckets
.computeIfAbsent(entry.partition(), p -> new HashSet<>())
.add(entry.bucket());
}
protected void cleanUnusedManifests(
Snapshot snapshot, Set skippingSet, boolean deleteChangelog) {
// clean base and delta manifests
List toDeleteManifests = new ArrayList<>();
List toExpireManifests = new ArrayList<>();
toExpireManifests.addAll(tryReadManifestList(snapshot.baseManifestList()));
toExpireManifests.addAll(tryReadManifestList(snapshot.deltaManifestList()));
for (ManifestFileMeta manifest : toExpireManifests) {
String fileName = manifest.fileName();
if (!skippingSet.contains(fileName)) {
toDeleteManifests.add(fileName);
// to avoid other snapshots trying to delete again
skippingSet.add(fileName);
}
}
deleteFiles(toDeleteManifests, manifestFile::delete);
toDeleteManifests.clear();
if (!skippingSet.contains(snapshot.baseManifestList())) {
toDeleteManifests.add(snapshot.baseManifestList());
}
if (!skippingSet.contains(snapshot.deltaManifestList())) {
toDeleteManifests.add(snapshot.deltaManifestList());
}
deleteFiles(toDeleteManifests, manifestList::delete);
// clean changelog manifests
if (deleteChangelog && snapshot.changelogManifestList() != null) {
deleteFiles(
tryReadManifestList(snapshot.changelogManifestList()),
manifest -> manifestFile.delete(manifest.fileName()));
manifestList.delete(snapshot.changelogManifestList());
}
// clean index manifests
String indexManifest = snapshot.indexManifest();
// check exists, it may have been deleted by other snapshots
if (indexManifest != null && indexFileHandler.existsManifest(indexManifest)) {
List indexManifestEntries =
indexFileHandler.readManifest(indexManifest);
indexManifestEntries.removeIf(
entry -> skippingSet.contains(entry.indexFile().fileName()));
deleteFiles(indexManifestEntries, indexFileHandler::deleteIndexFile);
if (!skippingSet.contains(indexManifest)) {
indexFileHandler.deleteManifest(indexManifest);
}
}
}
/**
* It is possible that a job was killed during expiration and some manifest files have been
* deleted, so if the clean methods need to get manifests of a snapshot to be cleaned, we should
* try to read manifests and return empty list if failed instead of calling {@link
* Snapshot#dataManifests} directly.
*/
protected List tryReadManifestList(String manifestListName) {
try {
return manifestList.read(manifestListName);
} catch (Exception e) {
LOG.warn("Failed to read manifest list file " + manifestListName, e);
return Collections.emptyList();
}
}
protected List tryReadDataManifests(Snapshot snapshot) {
List manifestFileMetas = tryReadManifestList(snapshot.baseManifestList());
manifestFileMetas.addAll(tryReadManifestList(snapshot.deltaManifestList()));
return readManifestFileNames(manifestFileMetas);
}
protected List readManifestFileNames(List manifestFileMetas) {
return manifestFileMetas.stream()
.map(ManifestFileMeta::fileName)
.collect(Collectors.toCollection(LinkedList::new));
}
/**
* NOTE: This method is used for building data file skipping set. If failed to read some
* manifests, it will throw exception which callers must handle.
*/
protected void addMergedDataFiles(
Map>> dataFiles, Snapshot snapshot)
throws IOException {
for (ManifestEntry entry : readMergedDataFiles(snapshot)) {
dataFiles
.computeIfAbsent(entry.partition(), p -> new HashMap<>())
.computeIfAbsent(entry.bucket(), b -> new HashSet<>())
.add(entry.file().fileName());
}
}
protected Collection readMergedDataFiles(Snapshot snapshot) throws IOException {
// read data manifests
List files = tryReadDataManifests(snapshot);
// read and merge manifest entries
Map map = new HashMap<>();
for (String manifest : files) {
List entries;
entries = manifestFile.readWithIOException(manifest);
ManifestEntry.mergeEntries(entries, map);
}
return map.values();
}
protected boolean containsDataFile(
Map>> dataFiles, ManifestEntry testee) {
Map> buckets = dataFiles.get(testee.partition());
if (buckets != null) {
Set fileNames = buckets.get(testee.bucket());
if (fileNames != null) {
return fileNames.contains(testee.file().fileName());
}
}
return false;
}
/** Changelogs were not checked. Let the subclass determine whether to delete them. */
public Set manifestSkippingSet(Snapshot skippingSnapshot) {
return manifestSkippingSet(Collections.singletonList(skippingSnapshot));
}
public Set manifestSkippingSet(List skippingSnapshots) {
Set skippingSet = new HashSet<>();
for (Snapshot skippingSnapshot : skippingSnapshots) {
// data manifests
skippingSet.add(skippingSnapshot.baseManifestList());
skippingSet.add(skippingSnapshot.deltaManifestList());
skippingSnapshot.dataManifests(manifestList).stream()
.map(ManifestFileMeta::fileName)
.forEach(skippingSet::add);
// index manifests
String indexManifest = skippingSnapshot.indexManifest();
if (indexManifest != null) {
skippingSet.add(indexManifest);
indexFileHandler.readManifest(indexManifest).stream()
.map(IndexManifestEntry::indexFile)
.map(IndexFileMeta::fileName)
.forEach(skippingSet::add);
}
}
return skippingSet;
}
private boolean tryDeleteEmptyDirectory(Path path) {
try {
fileIO.delete(path, false);
return true;
} catch (IOException e) {
LOG.debug("Failed to delete directory '{}'. Check whether it is empty.", path);
return false;
}
}
protected void deleteFiles(Collection files, Consumer deletion) {
if (files.isEmpty()) {
return;
}
List> deletionFutures = new ArrayList<>(files.size());
for (T file : files) {
deletionFutures.add(
CompletableFuture.runAsync(() -> deletion.accept(file), ioExecutor));
}
try {
CompletableFuture.allOf(deletionFutures.toArray(new CompletableFuture[0])).get();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}