/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.clean;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieSavepointMetadata;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.CleanFileInfo;
import org.apache.hudi.common.model.CompactionOperation;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.table.view.SyncableFileSystemView;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieSavepointException;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.table.HoodieTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.common.table.timeline.InstantComparison.GREATER_THAN;
import static org.apache.hudi.common.table.timeline.InstantComparison.GREATER_THAN_OR_EQUALS;
import static org.apache.hudi.common.table.timeline.InstantComparison.LESSER_THAN;
import static org.apache.hudi.common.table.timeline.InstantComparison.compareTimestamps;
/**
 * Cleaner is responsible for garbage collecting older files in a given partition path, such that:
 *
 * 1) It provides sufficient time for existing queries running on older versions to complete.
 *
 * 2) It bounds the growth of the files in the file system.
 */
public class CleanPlanner<T, I, K, O> implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(CleanPlanner.class);
public static final Integer CLEAN_PLAN_VERSION_1 = CleanPlanV1MigrationHandler.VERSION;
public static final Integer CLEAN_PLAN_VERSION_2 = CleanPlanV2MigrationHandler.VERSION;
public static final Integer LATEST_CLEAN_PLAN_VERSION = CLEAN_PLAN_VERSION_2;
private transient HoodieTimeline commitTimeline;
private final Map<HoodieFileGroupId, CompactionOperation> fgIdToPendingCompactionOperations;
private final Map<HoodieFileGroupId, CompactionOperation> fgIdToPendingLogCompactionOperations;
private final HoodieTable<T, I, K, O> hoodieTable;
private final HoodieWriteConfig config;
private transient HoodieEngineContext context;
private final List<String> savepointedTimestamps;
private Option<HoodieInstant> earliestCommitToRetain = Option.empty();
public CleanPlanner(HoodieEngineContext context, HoodieTable<T, I, K, O> hoodieTable, HoodieWriteConfig config) {
this.context = context;
this.hoodieTable = hoodieTable;
this.config = config;
SyncableFileSystemView fileSystemView = (SyncableFileSystemView) hoodieTable.getSliceView();
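// Cache pending compaction and log compaction operations keyed by file group id,
// so that cleaning can later skip file slices still required by those operations.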
this.fgIdToPendingCompactionOperations = fileSystemView
.getPendingCompactionOperations()
.map(entry -> Pair.of(
new HoodieFileGroupId(entry.getValue().getPartitionPath(), entry.getValue().getFileId()),
entry.getValue()))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
this.fgIdToPendingLogCompactionOperations = fileSystemView.getPendingLogCompactionOperations()
.map(entry -> Pair.of(new HoodieFileGroupId(entry.getValue().getPartitionPath(), entry.getValue().getFileId()), entry.getValue()))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
// collect savepointed timestamps to assist with incremental cleaning. For non-partitioned and metadata table, we may not need this.
this.savepointedTimestamps = hoodieTable.isMetadataTable() ? Collections.emptyList() : (hoodieTable.isPartitioned() ? new ArrayList<>(hoodieTable.getSavepointTimestamps())
: Collections.emptyList());
}
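/**
 * Lazily fetches and caches the completed commits timeline; the field is transient,
 * so it is re-fetched from the table after deserialization.
 */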
private HoodieTimeline getCommitTimeline() {
if (commitTimeline == null) {
commitTimeline = hoodieTable.getCompletedCommitsTimeline();
}
return commitTimeline;
}
/**
* @return list of savepointed timestamps in active timeline as of this clean planning.
*/
List<String> getSavepointedTimestamps() {
return this.savepointedTimestamps;
}
/**
 * Gets the list of data file names savepointed by the given savepoint.
 *
 * @param savepointTime timestamp of the savepoint of interest
 */
public Stream<String> getSavepointedDataFiles(String savepointTime) {
HoodieSavepointMetadata metadata = getSavepointMetadata(savepointTime);
return metadata.getPartitionMetadata().values().stream().flatMap(s -> s.getSavepointDataFile().stream());
}
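/**
 * Reads the savepoint metadata for the given savepoint timestamp from the active timeline.
 *
 * @throws HoodieSavepointException if the savepoint does not exist or its metadata cannot be read.
 */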
private HoodieSavepointMetadata getSavepointMetadata(String savepointTimestamp) {
if (!hoodieTable.getSavepointTimestamps().contains(savepointTimestamp)) {
throw new HoodieSavepointException(
"Could not get data files for savepoint " + savepointTimestamp + ". No such savepoint.");
}
HoodieInstant instant = hoodieTable.getMetaClient().createNewInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.SAVEPOINT_ACTION, savepointTimestamp);
try {
return TimelineMetadataUtils.deserializeHoodieSavepointMetadata(
hoodieTable.getActiveTimeline().getInstantDetails(instant).get());
} catch (IOException e) {
throw new HoodieSavepointException("Could not get savepointed data files for savepoint " + savepointTimestamp, e);
}
}
/**
 * Returns the list of partitions where clean operations need to be performed.
 *
 * @param earliestRetainedInstant New instant to be retained after this cleanup operation
 * @return list of partitions to scan for cleaning
 * @throws IOException when underlying file-system throws this exception
 */
public List<String> getPartitionPathsToClean(Option<HoodieInstant> earliestRetainedInstant) throws IOException {
switch (config.getCleanerPolicy()) {
case KEEP_LATEST_COMMITS:
case KEEP_LATEST_BY_HOURS:
return getPartitionPathsForCleanByCommits(earliestRetainedInstant);
case KEEP_LATEST_FILE_VERSIONS:
return getPartitionPathsForFullCleaning();
default:
throw new IllegalStateException("Unknown Cleaner Policy");
}
}
/**
 * Returns partition paths for cleaning by commits mode.
 *
 * @param instantToRetain earliest instant to retain
 * @return list of partitions
 * @throws IOException when underlying file-system throws this exception
 */
private List<String> getPartitionPathsForCleanByCommits(Option<HoodieInstant> instantToRetain) throws IOException {
if (!instantToRetain.isPresent()) {
LOG.info("No earliest commit to retain. No need to scan partitions !!");
return Collections.emptyList();
}
if (config.incrementalCleanerModeEnabled()) {
Option<HoodieInstant> lastClean = hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant();
if (lastClean.isPresent()) {
if (hoodieTable.getActiveTimeline().isEmpty(lastClean.get())) {
hoodieTable.getActiveTimeline().deleteEmptyInstantIfExists(lastClean.get());
} else {
HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils
.deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get());
if ((cleanMetadata.getEarliestCommitToRetain() != null)
&& !cleanMetadata.getEarliestCommitToRetain().trim().isEmpty()
&& !hoodieTable.getActiveTimeline().getCommitsTimeline().isBeforeTimelineStarts(cleanMetadata.getEarliestCommitToRetain())) {
return getPartitionPathsForIncrementalCleaning(cleanMetadata, instantToRetain);
}
}
}
}
return getPartitionPathsForFullCleaning();
}
/**
 * Uses incremental mode to find the partition paths that have changed since the last clean.
 *
 * @param cleanMetadata      metadata of the last completed clean
 * @param newInstantToRetain earliest instant to retain for this clean
 * @return list of partition paths touched by the commits in between
 */
private List<String> getPartitionPathsForIncrementalCleaning(HoodieCleanMetadata cleanMetadata,
    Option<HoodieInstant> newInstantToRetain) {
boolean isAnySavepointDeleted = isAnySavepointDeleted(cleanMetadata);
if (isAnySavepointDeleted) {
LOG.info("Since savepoints have been removed compared to previous clean, triggering clean planning for all partitions");
return getPartitionPathsForFullCleaning();
} else {
LOG.info(
"Incremental Cleaning mode is enabled. Looking up partition-paths that have changed "
+ "since last clean at {}. New Instant to retain {}.",
cleanMetadata.getEarliestCommitToRetain(),
newInstantToRetain);
return hoodieTable.getCompletedCommitsTimeline().getInstantsAsStream()
.filter(instant -> compareTimestamps(instant.requestedTime(), GREATER_THAN_OR_EQUALS,
cleanMetadata.getEarliestCommitToRetain()) && compareTimestamps(instant.requestedTime(),
LESSER_THAN, newInstantToRetain.get().requestedTime()))
.flatMap(this::getPartitionsForInstants).distinct().collect(Collectors.toList());
}
}
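/**
 * Returns true if any savepoint tracked by the previous clean has since been deleted. A removed savepoint may
 * free up files that were previously protected, so the planner falls back to scanning all partitions.
 */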
private boolean isAnySavepointDeleted(HoodieCleanMetadata cleanMetadata) {
List<String> savepointedTimestampsFromLastClean = cleanMetadata.getExtraMetadata() == null ? Collections.emptyList()
    : Arrays.stream(cleanMetadata.getExtraMetadata().getOrDefault(CleanerUtils.SAVEPOINTED_TIMESTAMPS, StringUtils.EMPTY_STRING).split(","))
    .filter(timestamp -> !StringUtils.isNullOrEmpty(timestamp)).collect(Collectors.toList());
if (savepointedTimestampsFromLastClean.isEmpty()) {
return false;
}
// check whether any savepoint present during the previous clean has since been removed
List<String> removedSavepointedTimestamps = new ArrayList<>(savepointedTimestampsFromLastClean);
removedSavepointedTimestamps.removeAll(savepointedTimestamps);
return !removedSavepointedTimestamps.isEmpty();
}
/**
* Fetch partitions updated as part of a HoodieInstant.
* @param instant {@link HoodieInstant} of interest.
* @return partitions that were part of {@link HoodieInstant} given.
*/
private Stream<String> getPartitionsForInstants(HoodieInstant instant) {
try {
if (HoodieTimeline.REPLACE_COMMIT_ACTION.equals(instant.getAction())) {
HoodieReplaceCommitMetadata replaceCommitMetadata = HoodieReplaceCommitMetadata.fromBytes(
hoodieTable.getActiveTimeline().getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class);
return Stream.concat(replaceCommitMetadata.getPartitionToReplaceFileIds().keySet().stream(), replaceCommitMetadata.getPartitionToWriteStats().keySet().stream());
} else {
HoodieCommitMetadata commitMetadata = hoodieTable.getMetaClient()
.getCommitMetadataSerDe().deserialize(instant, hoodieTable.getActiveTimeline().getInstantDetails(instant).get(),
HoodieCommitMetadata.class);
return commitMetadata.getPartitionToWriteStats().keySet().stream();
}
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
}
/**
 * Scans and lists all partitions for cleaning.
 *
 * @return all partition paths for the dataset.
 */
private List<String> getPartitionPathsForFullCleaning() {
// Go to brute force mode of scanning all partitions
try {
return hoodieTable.getMetadataTable().getAllPartitionPaths();
} catch (IOException ioe) {
throw new HoodieIOException("Fetching all partitions failed ", ioe);
}
}
/**
 * Verifies whether the file slice exists in savepointedFiles, checking both the base file and the log files.
 */
private boolean isFileSliceExistInSavepointedFiles(FileSlice fs, List<String> savepointedFiles) {
if (fs.getBaseFile().isPresent() && savepointedFiles.contains(fs.getBaseFile().get().getFileName())) {
return true;
}
for (HoodieLogFile hoodieLogFile : fs.getLogFiles().collect(Collectors.toList())) {
if (savepointedFiles.contains(hoodieLogFile.getFileName())) {
return true;
}
}
return false;
}
/**
 * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. This
 * policy is useful if you are simply interested in querying the table and don't want too many versions for a
 * single file (i.e., run it with versionsRetained = 1).
 */
private Pair<Boolean, List<CleanFileInfo>> getFilesToCleanKeepingLatestVersions(String partitionPath) {
LOG.info(
"Cleaning {}, retaining latest {} file versions.",
partitionPath,
config.getCleanerFileVersionsRetained());
List<CleanFileInfo> deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints
List<String> savepointedFiles = hoodieTable.getSavepointTimestamps().stream()
    .flatMap(this::getSavepointedDataFiles)
    .collect(Collectors.toList());
// In this scenario, we will assume that once replaced, a file group automatically becomes fully eligible for cleaning.
// In other words, the file versions only apply to the active file groups.
deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, Option.empty()));
boolean toDeletePartition = false;
List<HoodieFileGroup> fileGroups = hoodieTable.getHoodieView().getAllFileGroupsStateless(partitionPath).collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) {
int keepVersions = config.getCleanerFileVersionsRetained();
// do not cleanup slice required for pending compaction
Iterator<FileSlice> fileSliceIterator =
    fileGroup.getAllFileSlices()
        .filter(fs -> !isFileSliceNeededForPendingMajorOrMinorCompaction(fs))
        .iterator();
if (isFileGroupInPendingMajorOrMinorCompaction(fileGroup)) {
// We have already saved the last version of file-groups for pending compaction Id
keepVersions--;
}
while (fileSliceIterator.hasNext() && keepVersions > 0) {
// Skip this most recent version
fileSliceIterator.next();
keepVersions--;
}
// Delete the remaining files
while (fileSliceIterator.hasNext()) {
FileSlice nextSlice = fileSliceIterator.next();
if (isFileSliceExistInSavepointedFiles(nextSlice, savepointedFiles)) {
// do not clean up a savepoint data file
continue;
}
deletePaths.addAll(getCleanFileInfoForSlice(nextSlice));
}
}
// if there are no valid file groups
// and no pending data files under the partition [IMPORTANT],
// mark it to be deleted
if (fileGroups.isEmpty() && !hasPendingFiles(partitionPath)) {
toDeletePartition = true;
}
return Pair.of(toDeletePartition, deletePaths);
}
private Pair<Boolean, List<CleanFileInfo>> getFilesToCleanKeepingLatestCommits(String partitionPath, Option<HoodieInstant> earliestCommitToRetain) {
return getFilesToCleanKeepingLatestCommits(partitionPath, config.getCleanerCommitsRetained(), earliestCommitToRetain, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
}
/**
 * Selects the versions of files for cleaning, such that it:
 *
 * - Leaves the latest version of each file untouched.
 * - For older versions, it leaves all the commits untouched which have occurred in the last
 *   config.getCleanerCommitsRetained() commits.
 * - It leaves ONE commit before this window. We assume that
 *   max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained().
 *   This is 5 hours by default (assuming ingestion is running every 30 minutes). This is essential to retain the
 *   file used by a query that runs for the maximum allowed time.
 *
 * This provides the effect of having a lookback into all changes that happened in the last X commits. (e.g., if you
 * retain 10 commits, and the commit batch time is 30 mins, then you have 5 hrs of lookback.)
 *
 * This policy is the default.
 *
 * @return A {@link Pair} whose left is a boolean indicating whether the partition itself needs to be deleted,
 *         and right is a list of {@link CleanFileInfo} about the files in the partition that need to be deleted.
 */
private Pair<Boolean, List<CleanFileInfo>> getFilesToCleanKeepingLatestCommits(String partitionPath,
    int commitsRetained, Option<HoodieInstant> earliestCommitToRetain, HoodieCleaningPolicy policy) {
if (policy != HoodieCleaningPolicy.KEEP_LATEST_COMMITS && policy != HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) {
throw new IllegalArgumentException("getFilesToCleanKeepingLatestCommits can only be used for KEEP_LATEST_COMMITS or KEEP_LATEST_BY_HOURS");
}
LOG.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. ");
List deletePaths = new ArrayList<>();
// Collect all the datafiles savepointed by all the savepoints
List savepointedFiles = hoodieTable.getSavepointTimestamps().stream()
.flatMap(this::getSavepointedDataFiles)
.collect(Collectors.toList());
// determine if we have enough commits, to start cleaning.
boolean toDeletePartition = false;
if (getCommitTimeline().countInstants() > commitsRetained) {
HoodieInstant earliestInstant = earliestCommitToRetain.get();
// all replaced file groups before earliestCommitToRetain are eligible to clean
deletePaths.addAll(getReplacedFilesEligibleToClean(savepointedFiles, partitionPath, earliestCommitToRetain));
// add active files
List<HoodieFileGroup> fileGroups = hoodieTable.getHoodieView().getAllFileGroupsStateless(partitionPath).collect(Collectors.toList());
for (HoodieFileGroup fileGroup : fileGroups) {
List<FileSlice> fileSliceList = fileGroup.getAllFileSlices().collect(Collectors.toList());
if (fileSliceList.isEmpty()) {
continue;
}
String lastVersion = fileSliceList.get(0).getBaseInstantTime();
String lastVersionBeforeEarliestCommitToRetain =
getLatestVersionBeforeCommit(fileSliceList, earliestInstant);
// Ensure there are more than 1 version of the file (we only clean old files from updates)
// i.e., always spare the last commit.
for (FileSlice aSlice : fileSliceList) {
Option<HoodieBaseFile> aFile = aSlice.getBaseFile();
String fileCommitTime = aSlice.getBaseInstantTime();
if (isFileSliceExistInSavepointedFiles(aSlice, savepointedFiles)) {
// do not clean up a savepoint data file
continue;
}
// Do not delete the latest commit and also the last commit before the earliest commit we
// are retaining
// The window of commit retain == max query run time. So a query could be running which
// still uses this file.
if (fileCommitTime.equals(lastVersion) || fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain)) {
// move on to the next file
continue;
}
// Always keep the last commit
if (!isFileSliceNeededForPendingMajorOrMinorCompaction(aSlice)
&& compareTimestamps(earliestInstant.requestedTime(), GREATER_THAN, fileCommitTime)) {
// this is a commit, that should be cleaned.
aFile.ifPresent(hoodieDataFile -> {
deletePaths.add(new CleanFileInfo(hoodieDataFile.getPath(), false));
if (hoodieDataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) {
deletePaths.add(new CleanFileInfo(hoodieDataFile.getBootstrapBaseFile().get().getPath(), true));
}
});
// clean the log files for the commits, which contain cdc log files in cdc scenario
// and normal log files for mor tables.
deletePaths.addAll(aSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false))
.collect(Collectors.toList()));
}
}
}
// if there are no valid file groups
// and no pending data files under the partition [IMPORTANT],
// and no subsequent replace commit after the earliest retained commit
// mark it to be deleted
if (fileGroups.isEmpty()
&& !hasPendingFiles(partitionPath)
&& noSubsequentReplaceCommit(earliestInstant.requestedTime(), partitionPath)) {
toDeletePartition = true;
}
}
return Pair.of(toDeletePartition, deletePaths);
}
/**
 * Returns whether there are uncommitted data files under the given partition.
 * The pending files are generated by inflight instants and may be ready to commit;
 * the partition cannot be deleted as a whole if any pending file exists.
 *
 * IMPORTANT: {@code fsView.getAllFileGroups} does not return pending file groups for the metadata table,
 * so file listing must be used instead.
 */
private boolean hasPendingFiles(String partitionPath) {
try {
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(hoodieTable.getMetaClient(), hoodieTable.getActiveTimeline());
StoragePath fullPartitionPath = new StoragePath(hoodieTable.getMetaClient().getBasePath(), partitionPath);
fsView.addFilesToView(partitionPath, FSUtils.getAllDataFilesInPartition(
hoodieTable.getStorage(), fullPartitionPath));
// use #getAllFileGroups(partitionPath) instead of #getAllFileGroups() to exclude the replaced file groups.
return fsView.getAllFileGroups(partitionPath).findAny().isPresent();
} catch (Exception ex) {
// if any exception throws, assume there are existing pending files
LOG.warn("Error while checking the pending files under partition: " + partitionPath + ", assumes the files exist", ex);
return true;
}
}
/**
 * This method finds the files to be cleaned based on the number of hours. If {@code config.getCleanerHoursRetained()} is set to 5,
 * all the files with a commit time older than 5 hours will be removed. The latest file slice of any file group is always retained.
 * This policy gives users much more flexibility in retaining data for running incremental queries, compared to the
 * KEEP_LATEST_COMMITS cleaning policy. The default number of hours is 5.
 *
 * @param partitionPath partition path to check
 * @param earliestCommitToRetain earliest commit to retain
 * @return pair of a flag indicating whether the partition itself can be deleted and the list of files to clean
 */
private Pair<Boolean, List<CleanFileInfo>> getFilesToCleanKeepingLatestHours(String partitionPath, Option<HoodieInstant> earliestCommitToRetain) {
return getFilesToCleanKeepingLatestCommits(partitionPath, 0, earliestCommitToRetain, HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS);
}
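/**
 * Collects clean file info for replaced file groups in the partition that are eligible for cleaning: all groups
 * replaced before the earliest commit to retain (or every replaced group when no such commit is given), excluding
 * any file slices protected by savepoints.
 */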
private List<CleanFileInfo> getReplacedFilesEligibleToClean(List<String> savepointedFiles, String partitionPath, Option<HoodieInstant> earliestCommitToRetain) {
  final Stream<HoodieFileGroup> replacedGroups;
if (earliestCommitToRetain.isPresent()) {
replacedGroups = hoodieTable.getHoodieView().getReplacedFileGroupsBefore(earliestCommitToRetain.get().requestedTime(), partitionPath);
} else {
replacedGroups = hoodieTable.getHoodieView().getAllReplacedFileGroups(partitionPath);
}
return replacedGroups.flatMap(HoodieFileGroup::getAllFileSlices)
// do not delete savepointed files (archival will make sure corresponding replacecommit file is not deleted)
.filter(slice -> !isFileSliceExistInSavepointedFiles(slice, savepointedFiles))
.flatMap(slice -> getCleanFileInfoForSlice(slice).stream())
.collect(Collectors.toList());
}
/**
 * Gets the latest version earlier than instantTime. This version file could still be used by queries.
 */
private String getLatestVersionBeforeCommit(List<FileSlice> fileSliceList, HoodieInstant instantTime) {
for (FileSlice file : fileSliceList) {
String fileCommitTime = file.getBaseInstantTime();
if (compareTimestamps(instantTime.requestedTime(), GREATER_THAN, fileCommitTime)) {
// fileList is sorted on the reverse, so the first commit we find <= instantTime is the
// one we want
return fileCommitTime;
}
}
// There is no version of this file which is <= instantTime
return null;
}
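/**
 * Builds the list of {@link CleanFileInfo} for a file slice: the base file, its bootstrap base file
 * (when bootstrap base file cleaning is enabled) and all log files of the slice.
 */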
private List<CleanFileInfo> getCleanFileInfoForSlice(FileSlice nextSlice) {
  List<CleanFileInfo> cleanPaths = new ArrayList<>();
if (nextSlice.getBaseFile().isPresent()) {
HoodieBaseFile dataFile = nextSlice.getBaseFile().get();
cleanPaths.add(new CleanFileInfo(dataFile.getPath(), false));
if (dataFile.getBootstrapBaseFile().isPresent() && config.shouldCleanBootstrapBaseFile()) {
cleanPaths.add(new CleanFileInfo(dataFile.getBootstrapBaseFile().get().getPath(), true));
}
}
// clean the log files for the commits, which contain cdc log files in cdc scenario
// and normal log files for mor tables.
cleanPaths.addAll(
nextSlice.getLogFiles().map(lf -> new CleanFileInfo(lf.getPath().toString(), false))
.collect(Collectors.toList()));
return cleanPaths;
}
/**
 * Returns the files to be cleaned for the given partitionPath based on the cleaning policy.
 */
public Pair<Boolean, List<CleanFileInfo>> getDeletePaths(String partitionPath, Option<HoodieInstant> earliestCommitToRetain) {
HoodieCleaningPolicy policy = config.getCleanerPolicy();
Pair> deletePaths;
if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) {
deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath, earliestCommitToRetain);
} else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) {
deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath);
} else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) {
deletePaths = getFilesToCleanKeepingLatestHours(partitionPath, earliestCommitToRetain);
} else {
throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name());
}
LOG.info("{} patterns used to delete in partition path: {}", deletePaths.getValue().size(), partitionPath);
if (deletePaths.getKey()) {
  LOG.info("Partition {} to be deleted", partitionPath);
}
return deletePaths;
}
/**
* Returns the earliest commit to retain based on cleaning policy.
*/
public Option<HoodieInstant> getEarliestCommitToRetain() {
if (!earliestCommitToRetain.isPresent()) {
earliestCommitToRetain = CleanerUtils.getEarliestCommitToRetain(
hoodieTable.getMetaClient().getActiveTimeline().getCommitsAndCompactionTimeline(),
config.getCleanerPolicy(),
config.getCleanerCommitsRetained(),
Instant.now(),
config.getCleanerHoursRetained(),
hoodieTable.getMetaClient().getTableConfig().getTimelineTimezone());
}
return earliestCommitToRetain;
}
/**
* Returns the last completed commit timestamp before clean.
*/
public String getLastCompletedCommitTimestamp() {
return getCommitTimeline().lastInstant().map(HoodieInstant::requestedTime).orElse("");
}
/**
 * Determines whether the file slice needs to be preserved for a pending compaction or log compaction.
 *
 * @param fileSlice File slice
 * @return true if the file slice needs to be preserved, false otherwise.
 */
private boolean isFileSliceNeededForPendingMajorOrMinorCompaction(FileSlice fileSlice) {
return isFileSliceNeededForPendingCompaction(fileSlice) || isFileSliceNeededForPendingLogCompaction(fileSlice);
}
/**
 * Determines whether the file slice needs to be preserved for a pending compaction.
 *
 * @param fileSlice File Slice
 * @return true if the file slice needs to be preserved, false otherwise.
 */
private boolean isFileSliceNeededForPendingCompaction(FileSlice fileSlice) {
CompactionOperation op = fgIdToPendingCompactionOperations.get(fileSlice.getFileGroupId());
if (null != op) {
// If file slice's instant time is newer or same as that of operation, do not clean
return compareTimestamps(fileSlice.getBaseInstantTime(), GREATER_THAN_OR_EQUALS, op.getBaseInstantTime()
);
}
return false;
}
/**
 * Determines whether the file slice needs to be preserved for a pending log compaction.
 *
 * @param fileSlice File Slice
 * @return true if the file slice needs to be preserved, false otherwise.
 */
private boolean isFileSliceNeededForPendingLogCompaction(FileSlice fileSlice) {
CompactionOperation op = fgIdToPendingLogCompactionOperations.get(fileSlice.getFileGroupId());
if (null != op) {
// If file slice's instant time is newer or same as that of operation, do not clean
return compareTimestamps(fileSlice.getBaseInstantTime(), GREATER_THAN_OR_EQUALS, op.getBaseInstantTime()
);
}
return false;
}
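/**
 * Returns true if the file group has a pending compaction or log compaction operation scheduled against it.
 */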
private boolean isFileGroupInPendingMajorOrMinorCompaction(HoodieFileGroup fg) {
return fgIdToPendingCompactionOperations.containsKey(fg.getFileGroupId())
|| fgIdToPendingLogCompactionOperations.containsKey(fg.getFileGroupId());
}
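/**
 * Returns true if no file groups in the given partition were replaced at or after the given commit time.
 */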
private boolean noSubsequentReplaceCommit(String earliestCommitToRetain, String partitionPath) {
return !hoodieTable.getHoodieView().getReplacedFileGroupsAfterOrOn(earliestCommitToRetain, partitionPath).findAny().isPresent();
}
}