
org.dinky.shaded.paimon.operation.OrphanFilesClean

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dinky.shaded.paimon.operation;

import org.dinky.shaded.paimon.FileStore;
import org.dinky.shaded.paimon.Snapshot;
import org.dinky.shaded.paimon.fs.FileIO;
import org.dinky.shaded.paimon.fs.FileStatus;
import org.dinky.shaded.paimon.fs.Path;
import org.dinky.shaded.paimon.index.IndexFileHandler;
import org.dinky.shaded.paimon.index.IndexFileMeta;
import org.dinky.shaded.paimon.manifest.IndexManifestEntry;
import org.dinky.shaded.paimon.manifest.ManifestEntry;
import org.dinky.shaded.paimon.manifest.ManifestFile;
import org.dinky.shaded.paimon.manifest.ManifestFileMeta;
import org.dinky.shaded.paimon.manifest.ManifestList;
import org.dinky.shaded.paimon.table.FileStoreTable;
import org.dinky.shaded.paimon.utils.DateTimeUtils;
import org.dinky.shaded.paimon.utils.FileUtils;
import org.dinky.shaded.paimon.utils.SnapshotManager;
import org.dinky.shaded.paimon.utils.TagManager;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import static org.dinky.shaded.paimon.io.DataFilePathFactory.BUCKET_PATH_PREFIX;

/**
 * To remove the data files and metadata files that are not used by the table (so-called "orphan
 * files").
 *
 * <p>It will ignore exceptions when listing all files because it's OK to not delete unread files.
 *
 * <p>To avoid deleting newly written files, it only deletes orphan files older than 1 day by
 * default. The interval can be modified by {@link #olderThan}.
 *
 * <p>To avoid conflicting with snapshot expiration, tag deletion and rollback, it will skip the
 * snapshot/tag when catching {@link FileNotFoundException} in the process of listing used files.
 *
 * <p>To avoid deleting files that are used but not read by mistake, it will stop the removing
 * process when it fails to read used files.
 */
public class OrphanFilesClean {

    private static final Logger LOG = LoggerFactory.getLogger(OrphanFilesClean.class);

    private static final int READ_FILE_RETRY_NUM = 3;
    private static final int READ_FILE_RETRY_INTERVAL = 5;

    private final SnapshotManager snapshotManager;
    private final TagManager tagManager;
    private final FileIO fileIO;
    private final Path location;
    private final int partitionKeysNum;
    private final ManifestList manifestList;
    private final ManifestFile manifestFile;
    private final IndexFileHandler indexFileHandler;

    // an estimated value of how many files were deleted
    private int deletedFilesNum = 0;
    private long olderThanMillis = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(1);

    public OrphanFilesClean(FileStoreTable table) {
        this.snapshotManager = table.snapshotManager();
        this.tagManager = table.tagManager();
        this.fileIO = table.fileIO();
        this.location = table.location();
        this.partitionKeysNum = table.partitionKeys().size();

        FileStore<?> store = table.store();
        this.manifestList = store.manifestListFactory().create();
        this.manifestFile = store.manifestFileFactory().create();
        this.indexFileHandler = store.newIndexFileHandler();
    }

    public OrphanFilesClean olderThan(String timestamp) {
        // The FileStatus#getModificationTime returns milliseconds
        this.olderThanMillis =
                DateTimeUtils.parseTimestampData(timestamp, 3, DateTimeUtils.LOCAL_TZ)
                        .getMillisecond();
        return this;
    }

    public int clean() throws IOException, ExecutionException, InterruptedException {
        if (snapshotManager.earliestSnapshotId() == null) {
            LOG.info("No snapshot found, skip removing.");
            return 0;
        }

        // specially handle the snapshot directory
        List<Path> nonSnapshotFiles = snapshotManager.tryGetNonSnapshotFiles(this::oldEnough);
        nonSnapshotFiles.forEach(this::deleteFileOrDirQuietly);
        deletedFilesNum += nonSnapshotFiles.size();

        Set<String> usedFiles = getUsedFiles();
        Map<String, Path> candidates = getCandidateDeletingFiles();
        Set<String> deleted = new HashSet<>(candidates.keySet());
        deleted.removeAll(usedFiles);

        for (String file : deleted) {
            Path path = candidates.get(file);
            deleteFileOrDirQuietly(path);
        }
        deletedFilesNum += deleted.size();

        return deletedFilesNum;
    }

    /** Get all the files used by snapshots and tags. */
    private Set<String> getUsedFiles()
            throws IOException, ExecutionException, InterruptedException {
        // safely get all snapshots to be read
        Set<Snapshot> readSnapshots = new HashSet<>(snapshotManager.safelyGetAllSnapshots());
        SortedMap<Snapshot, List<String>> tags = tagManager.tags();
        List<Snapshot> taggedSnapshots = new ArrayList<>(tags.keySet());
        readSnapshots.addAll(taggedSnapshots);

        return FileUtils.COMMON_IO_FORK_JOIN_POOL
                .submit(
                        () ->
                                readSnapshots
                                        .parallelStream()
                                        .flatMap(
                                                snapshot ->
                                                        getUsedFilesForSnapshot(snapshot)
                                                                .stream())
                                        .collect(Collectors.toSet()))
                .get();
    }

    /**
     * Get all the candidate deleting files in the specified directories and filter them by
     * olderThanMillis.
     */
    private Map<String, Path> getCandidateDeletingFiles() {
        List<Path> fileDirs = listPaimonFileDirs();
        try {
            return FileUtils.COMMON_IO_FORK_JOIN_POOL
                    .submit(
                            () ->
                                    fileDirs.parallelStream()
                                            .flatMap(p -> tryBestListingDirs(p).stream())
                                            .filter(this::oldEnough)
                                            .map(FileStatus::getPath)
                                            .collect(
                                                    Collectors.toMap(
                                                            Path::getName, Function.identity())))
                    .get();
        } catch (ExecutionException | InterruptedException e) {
            LOG.debug("Failed to get candidate deleting files.", e);
            return Collections.emptyMap();
        }
    }

    /**
     * If null is returned when reading some files, the snapshot/tag is being deleted, so just
     * return an empty result.
     */
    private List<String> getUsedFilesForSnapshot(Snapshot snapshot) {
        List<String> files = new ArrayList<>();
        addManifestList(files, snapshot);

        try {
            // try to read manifests
            List<ManifestFileMeta> manifestFileMetas =
                    retryReadingFiles(() -> readAllManifestsWithIOException(snapshot));
            if (manifestFileMetas == null) {
                return Collections.emptyList();
            }
            List<String> manifestFileName =
                    manifestFileMetas.stream()
                            .map(ManifestFileMeta::fileName)
                            .collect(Collectors.toList());
            files.addAll(manifestFileName);

            // try to read data files
            List<String> dataFiles = retryReadingDataFiles(manifestFileName);
            if (dataFiles == null) {
                return Collections.emptyList();
            }
            files.addAll(dataFiles);

            // try to read index files
            String indexManifest = snapshot.indexManifest();
            if (indexManifest != null && indexFileHandler.existsManifest(indexManifest)) {
                files.add(indexManifest);

                List<IndexManifestEntry> indexManifestEntries =
                        retryReadingFiles(
                                () ->
                                        indexFileHandler.readManifestWithIOException(
                                                indexManifest));
                if (indexManifestEntries == null) {
                    return Collections.emptyList();
                }

                indexManifestEntries.stream()
                        .map(IndexManifestEntry::indexFile)
                        .map(IndexFileMeta::fileName)
                        .forEach(files::add);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        return files;
    }

    private void addManifestList(List<String> used, Snapshot snapshot) {
        used.add(snapshot.baseManifestList());
        used.add(snapshot.deltaManifestList());
        String changelogManifestList = snapshot.changelogManifestList();
        if (changelogManifestList != null) {
            used.add(changelogManifestList);
        }
    }

    /**
     * Retry reading files when {@link IOException} was thrown by the reader. If the exception is
     * {@link FileNotFoundException}, return null. Finally, if the retry limit is reached, rethrow
     * the IOException.
     */
    @Nullable
    private <T> T retryReadingFiles(ReaderWithIOException<T> reader) throws IOException {
        int retryNumber = 0;
        IOException caught = null;
        while (retryNumber++ < READ_FILE_RETRY_NUM) {
            try {
                return reader.read();
            } catch (FileNotFoundException e) {
                return null;
            } catch (IOException e) {
                caught = e;
            }
            try {
                TimeUnit.MILLISECONDS.sleep(READ_FILE_RETRY_INTERVAL);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }

        throw caught;
    }

    private List<ManifestFileMeta> readAllManifestsWithIOException(Snapshot snapshot)
            throws IOException {
        List<ManifestFileMeta> result = new ArrayList<>();

        result.addAll(manifestList.readWithIOException(snapshot.baseManifestList()));
        result.addAll(manifestList.readWithIOException(snapshot.deltaManifestList()));

        String changelogManifestList = snapshot.changelogManifestList();
        if (changelogManifestList != null) {
            result.addAll(manifestList.readWithIOException(changelogManifestList));
        }

        return result;
    }

    @Nullable
    private List<String> retryReadingDataFiles(List<String> manifestNames) throws IOException {
        List<String> dataFiles = new ArrayList<>();
        for (String manifestName : manifestNames) {
            List<ManifestEntry> manifestEntries =
                    retryReadingFiles(() -> manifestFile.readWithIOException(manifestName));
            if (manifestEntries == null) {
                return null;
            }

            manifestEntries.stream()
                    .map(ManifestEntry::file)
                    .forEach(
                            f -> {
                                dataFiles.add(f.fileName());
                                dataFiles.addAll(f.extraFiles());
                            });
        }
        return dataFiles;
    }

    /** List directories that contain data files and manifest files. */
    private List<Path> listPaimonFileDirs() {
        List<Path> paimonFileDirs = new ArrayList<>();

        paimonFileDirs.add(new Path(location, "manifest"));
        paimonFileDirs.add(new Path(location, "index"));
        paimonFileDirs.addAll(listAndCleanDataDirs(location, partitionKeysNum));

        return paimonFileDirs;
    }

    /**
     * If listing a directory fails, just return an empty result because it's OK to not delete
     * those files.
     */
    private List<FileStatus> tryBestListingDirs(Path dir) {
        try {
            if (!fileIO.exists(dir)) {
                return Collections.emptyList();
            }

            List<FileStatus> status =
                    retryReadingFiles(
                            () -> {
                                FileStatus[] s = fileIO.listStatus(dir);
                                return s == null ? Collections.emptyList() : Arrays.asList(s);
                            });
            return status == null ? Collections.emptyList() : status;
        } catch (IOException e) {
            LOG.debug("Failed to list directory {}, skip it.", dir, e);
            return Collections.emptyList();
        }
    }

    private boolean oldEnough(FileStatus status) {
        return status.getModificationTime() < olderThanMillis;
    }

    /**
     * List directories that contain data files and may clean non-Paimon data dirs/files. The
     * argument level is used to control the recursion depth.
     */
    private List<Path> listAndCleanDataDirs(Path dir, int level) {
        List<FileStatus> dirs = tryBestListingDirs(dir);

        if (level == 0) {
            // return bucket paths
            return filterAndCleanDataDirs(
                    dirs,
                    p -> p.getName().startsWith(BUCKET_PATH_PREFIX),
                    // if buckets are under partition, we can do clean
                    partitionKeysNum -> partitionKeysNum != 0);
        }

        List<Path> partitionPaths =
                filterAndCleanDataDirs(
                        dirs,
                        p -> p.getName().contains("="),
                        // if partitions are under a parent partition, we can do clean
                        partitionKeysNum -> level != partitionKeysNum);

        // dive into the next partition level
        try {
            return FileUtils.COMMON_IO_FORK_JOIN_POOL
                    .submit(
                            () ->
                                    partitionPaths
                                            .parallelStream()
                                            .flatMap(
                                                    p ->
                                                            listAndCleanDataDirs(p, level - 1)
                                                                    .stream())
                                            .collect(Collectors.toList()))
                    .get();
        } catch (ExecutionException | InterruptedException e) {
            LOG.debug("Failed to list partition directory {}", dir, e);
            return Collections.emptyList();
        }
    }

    private List<Path> filterAndCleanDataDirs(
            List<FileStatus> statuses, Predicate<Path> filter, Predicate<Integer> cleanCondition) {
        List<Path> filtered = new ArrayList<>();
        List<FileStatus> mayBeClean = new ArrayList<>();

        for (FileStatus status : statuses) {
            Path path = status.getPath();
            if (filter.test(path)) {
                filtered.add(path);
            } else {
                mayBeClean.add(status);
            }
        }

        if (cleanCondition.test(partitionKeysNum)) {
            mayBeClean.stream()
                    .filter(this::oldEnough)
                    .map(FileStatus::getPath)
                    .forEach(
                            p -> {
                                deleteFileOrDirQuietly(p);
                                deletedFilesNum++;
                            });
        }

        return filtered;
    }

    private void deleteFileOrDirQuietly(Path path) {
        try {
            if (fileIO.isDir(path)) {
                fileIO.deleteDirectoryQuietly(path);
            } else {
                fileIO.deleteQuietly(path);
            }
        } catch (IOException ignored) {
        }
    }

    /** A helper functional interface for method {@link #retryReadingFiles}. */
    @FunctionalInterface
    private interface ReaderWithIOException<T> {

        T read() throws IOException;
    }
}
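
Below is a minimal usage sketch for this class; it is not part of the shaded source above. It assumes the caller already holds a FileStoreTable instance (for example, one loaded through a Paimon catalog), and the example class and package names are hypothetical. The timestamp string is parsed by DateTimeUtils.parseTimestampData with millisecond precision, so a "yyyy-MM-dd HH:mm:ss" literal works.

// Hypothetical example class; only OrphanFilesClean and FileStoreTable come from the listing above.
package org.dinky.shaded.paimon.operation.example;

import org.dinky.shaded.paimon.operation.OrphanFilesClean;
import org.dinky.shaded.paimon.table.FileStoreTable;

public class OrphanFilesCleanExample {

    /**
     * Deletes orphan files of the given table that were last modified before the given
     * timestamp (e.g. "2023-12-31 23:59:59") and returns an estimate of how many files were
     * removed. Without the olderThan call, the default threshold is "now minus 1 day".
     */
    public static int cleanOrphans(FileStoreTable table, String olderThanTimestamp)
            throws Exception {
        return new OrphanFilesClean(table)
                .olderThan(olderThanTimestamp)
                .clean();
    }
}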




