Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
org.apache.hudi.common.util.CompactionUtils Maven / Gradle / Ivy
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.hudi.common.util;
import org.apache.hudi.avro.model.HoodieCompactionOperation;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.CompactionOperation;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.table.timeline.versioning.compaction.CompactionPlanMigrator;
import org.apache.hudi.common.table.timeline.versioning.compaction.CompactionV1MigrationHandler;
import org.apache.hudi.common.table.timeline.versioning.compaction.CompactionV2MigrationHandler;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
* Helper class to generate compaction plan from FileGroup/FileSlice abstraction.
public class CompactionUtils {
// Compaction metadata version constants, aliased from the VERSION fields of the V1 and V2
// compaction migration handlers.
public static final Integer COMPACTION_METADATA_VERSION_1 = CompactionV1MigrationHandler.VERSION;
public static final Integer COMPACTION_METADATA_VERSION_2 = CompactionV2MigrationHandler.VERSION;
* Generate compaction operation from file-slice.
* @param partitionPath Partition path
* @param fileSlice File Slice
* @param metricsCaptureFunction Metrics Capture function
* @return Compaction Operation
// Builds an Avro HoodieCompactionOperation for one file slice of the given partition.
// NOTE(review): this listing appears truncated by text extraction — the generic type arguments of
// metricsCaptureFunction are missing, and the body of the base-file branch, the closing braces and
// the return statement are not visible. Restore them from the upstream source before compiling.
public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, FileSlice fileSlice,
Option, Map>> metricsCaptureFunction) {
HoodieCompactionOperation.Builder builder = HoodieCompactionOperation.newBuilder();
// Records getPath().getName() of every log file in the slice (presumably the bare file name
// rather than the full path — confirm against the path type in use).
builder.setDeltaFilePaths(fileSlice.getLogFiles().map(lf -> lf.getPath().getName()).collect(Collectors.toList()));
// Branch taken only when the slice has a base file; what it sets is not visible in this listing.
if (fileSlice.getBaseFile().isPresent()) {
// Metrics are attached only when a capture function was supplied; it receives the
// (partitionPath, fileSlice) pair.
if (metricsCaptureFunction.isPresent()) {
builder.setMetrics(metricsCaptureFunction.get().apply(Pair.of(partitionPath, fileSlice)));
* Generate compaction plan from file-slices.
* @param partitionFileSlicePairs list of partition file-slice pairs
* @param extraMetadata Extra Metadata
* @param metricsCaptureFunction Metrics Capture function
// Builds a HoodieCompactionPlan; each (partition path, file slice) pair is converted into a
// compaction operation through buildFromFileSlice with the same metrics capture function.
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped, and the head of the stream pipeline (before ".map"), any use of extraMetadata and the
// return statement are not visible.
public static HoodieCompactionPlan buildFromFileSlices(List> partitionFileSlicePairs,
Option> extraMetadata,
Option, Map>> metricsCaptureFunction) {
HoodieCompactionPlan.Builder builder = HoodieCompactionPlan.newBuilder();
// Pair key = partition path, pair value = file slice.
.map(pfPair -> buildFromFileSlice(pfPair.getKey(), pfPair.getValue(), metricsCaptureFunction))
* Build Avro generated Compaction operation payload from compaction operation POJO for serialization.
// Converts a CompactionOperation POJO into its Avro HoodieCompactionOperation form, copying the
// file id and base instant time.
// NOTE(review): the rest of the builder chain (and the final build call) is not visible in this
// listing; it appears truncated by text extraction.
public static HoodieCompactionOperation buildHoodieCompactionOperation(CompactionOperation op) {
return HoodieCompactionOperation.newBuilder().setFileId(op.getFileId()).setBaseInstantTime(op.getBaseInstantTime())
// The data file path is set to null when the operation carries no data file name.
.setDataFilePath(op.getDataFileName().isPresent() ? op.getDataFileName().get() : null)
* Build Compaction operation payload from Avro version for using in Spark executors.
* @param hc HoodieCompactionOperation
// Thin wrapper: delegates the Avro-to-POJO conversion to
// CompactionOperation.convertFromAvroRecordInstance and returns its result unchanged.
public static CompactionOperation buildCompactionOperation(HoodieCompactionOperation hc) {
return CompactionOperation.convertFromAvroRecordInstance(hc);
* Get all pending compaction plans along with their instants.
* @param metaClient Hoodie Meta Client
// Returns the compaction plans of all pending compaction instants by delegating to
// getCompactionPlansByTimeline with a pending-compaction timeline filter and a factory for
// compaction requested instants.
// NOTE(review): generic type arguments (return type and both Function declarations) appear to have
// been stripped from this listing.
public static List> getAllPendingCompactionPlans(
HoodieTableMetaClient metaClient) {
// This function returns pending compaction timeline.
Function getFilteredTimelineByActionType =
(hoodieTableMetaClient) -> hoodieTableMetaClient.getActiveTimeline().filterPendingCompactionTimeline();
// Hoodie requested instant supplier
Function requestedInstantSupplier = metaClient.getInstantGenerator()::getCompactionRequestedInstant;
return getCompactionPlansByTimeline(metaClient, getFilteredTimelineByActionType, requestedInstantSupplier);
* Get all pending logcompaction plans along with their instants.
* @param metaClient Hoodie Meta Client
// Log-compaction counterpart of getAllPendingCompactionPlans: delegates to
// getCompactionPlansByTimeline with a pending-log-compaction timeline filter and a factory for
// log compaction requested instants.
// NOTE(review): generic type arguments (return type and both Function declarations) appear to have
// been stripped from this listing.
public static List> getAllPendingLogCompactionPlans(
HoodieTableMetaClient metaClient) {
// This function returns pending logcompaction timeline.
Function filteredTimelineSupplier =
(hoodieTableMetaClient) -> hoodieTableMetaClient.getActiveTimeline().filterPendingLogCompactionTimeline();
// Hoodie requested instant supplier
Function requestedInstantSupplier = metaClient.getInstantGenerator()::getLogCompactionRequestedInstant;
return getCompactionPlansByTimeline(metaClient, filteredTimelineSupplier, requestedInstantSupplier);
* Util method to get compaction plans by action_type(COMPACT or LOG_COMPACT)
* @param metaClient HoodieTable's metaclient
* @param filteredTimelineSupplier gives a timeline object, this can be either filtered to return pending compactions or log compaction instants.
* @param requestedInstantWrapper function that gives a requested Hoodie instant.
* @return List of pairs of HoodieInstant and its corresponding compaction plan.
* Note here the compaction plan can be related to a compaction instant or log compaction instant.
// Shared implementation for the compaction and log compaction variants.
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped, and the start of the stream pipeline (before ".map") and its terminal operation are
// not visible.
private static List> getCompactionPlansByTimeline(
HoodieTableMetaClient metaClient, Function filteredTimelineSupplier,
Function requestedInstantWrapper) {
// Instants of whichever filtered timeline the caller's supplier produces for this meta client.
List filteredInstants = filteredTimelineSupplier.apply(metaClient).getInstants();
// Each instant is paired with the plan read for the requested instant that
// requestedInstantWrapper builds from the instant's requested time.
.map(instant -> Pair.of(instant, getCompactionPlan(metaClient, requestedInstantWrapper.apply(instant.requestedTime()))))
* This method will serve only Compaction instants
* because we use same HoodieCompactionPlan for both the operations.
// Reads the compaction plan for the given instant time: builds a compaction requested instant via
// the meta client's instant generator, then delegates to the HoodieInstant overload.
public static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, String compactionInstant) {
HoodieInstant compactionRequestedInstant = metaClient.getInstantGenerator().getCompactionRequestedInstant(compactionInstant);
return getCompactionPlan(metaClient, compactionRequestedInstant);
* This method will serve only log compaction instants,
* because we use same HoodieCompactionPlan for both the operations.
// Reads the plan for the given log compaction instant time: builds a log compaction requested
// instant via the meta client's instant generator, then delegates to the shared
// getCompactionPlan(metaClient, HoodieInstant) helper.
public static HoodieCompactionPlan getLogCompactionPlan(HoodieTableMetaClient metaClient, String logCompactionInstant) {
HoodieInstant logCompactionRequestedInstant = metaClient.getInstantGenerator().getLogCompactionRequestedInstant(logCompactionInstant);
return getCompactionPlan(metaClient, logCompactionRequestedInstant);
* Util method to fetch both compaction and log compaction plan from requestedInstant.
// Reads the plan bytes for the requested instant from the active timeline
// (readCompactionPlanAsBytes) and hands them to the overload that deserializes plan content.
private static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, HoodieInstant requestedInstant) {
return getCompactionPlan(metaClient, metaClient.getActiveTimeline().readCompactionPlanAsBytes(requestedInstant));
* Util method to deserialize a compaction or log compaction plan from its serialized content and upgrade it to the latest plan version.
// Deserializes the given plan content and upgrades it to the latest plan version using a
// CompactionPlanMigrator created for this meta client. An IOException is rethrown wrapped in a
// HoodieException.
// NOTE(review): the generic type argument of planContent appears stripped from this listing, and
// no import for java.io.IOException is visible in this view.
public static HoodieCompactionPlan getCompactionPlan(HoodieTableMetaClient metaClient, Option planContent) {
CompactionPlanMigrator migrator = new CompactionPlanMigrator(metaClient);
try {
// NOTE(review): planContent.get() is called without a presence check — confirm that callers
// never pass an empty Option.
HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planContent.get());
// The plan's own recorded version is passed as the version to upgrade from.
return migrator.upgradeToLatest(compactionPlan, compactionPlan.getVersion());
} catch (IOException e) {
throw new HoodieException(e);
* Get all PartitionPath + file-ids with pending Compaction operations and their target compaction instant time.
* @param metaClient Hoodie Table Meta Client
// Returns the pending compaction operations by passing the pending compaction plans (with their
// instants) to getAllPendingCompactionOperationsInPendingCompactionPlans.
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped and the right-hand side of the pendingCompactionPlanWithInstants assignment is not
// visible.
public static Map> getAllPendingCompactionOperations(
HoodieTableMetaClient metaClient) {
List> pendingCompactionPlanWithInstants =
return getAllPendingCompactionOperationsInPendingCompactionPlans(pendingCompactionPlanWithInstants);
* Get all partition + file Ids with pending Log Compaction operations and their target log compaction instant time.
// Returns the pending log compaction operations by passing the pending log compaction plans (with
// their instants) to getAllPendingCompactionOperationsInPendingCompactionPlans.
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped and the right-hand side of the pendingLogCompactionPlanWithInstants assignment is not
// visible.
public static Map> getAllPendingLogCompactionOperations(
HoodieTableMetaClient metaClient) {
List> pendingLogCompactionPlanWithInstants =
return getAllPendingCompactionOperationsInPendingCompactionPlans(pendingLogCompactionPlanWithInstants);
* Get all partition + file Ids with pending compaction or log compaction operations, and their target instant time, from the given pending plans.
// Flattens the given (instant, plan) pairs into a map keyed by file group id, using
// getPendingCompactionOperations for each pair. Throws IllegalStateException when the same key
// shows up with two operations that are not equal.
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped, the head of the stream pipeline is missing (note the stray "->" after the map
// declaration), and closing braces are not visible.
public static Map> getAllPendingCompactionOperationsInPendingCompactionPlans(
List> pendingLogCompactionPlanWithInstants) {
Map> fgIdToPendingCompactionsWithInstantMap = new HashMap<>(); ->
getPendingCompactionOperations(instantPlanPair.getKey(), instantPlanPair.getValue())).forEach(pair -> {
// Defensive check to ensure a single-fileId does not have more than one pending log compaction with different
// file slices. If we find a full duplicate we assume it is caused by eventual nature of the move operation
// on some DFSs.
if (fgIdToPendingCompactionsWithInstantMap.containsKey(pair.getKey())) {
// pair.getValue() is itself a pair; its value is the operation being compared.
HoodieCompactionOperation operation = pair.getValue().getValue();
HoodieCompactionOperation anotherOperation = fgIdToPendingCompactionsWithInstantMap.get(pair.getKey()).getValue();
if (!operation.equals(anotherOperation)) {
String msg = "Hudi File Id (" + pair.getKey() + ") has more than 1 pending operation. Instants: "
+ pair.getValue() + ", " + fgIdToPendingCompactionsWithInstantMap.get(pair.getKey());
throw new IllegalStateException(msg);
fgIdToPendingCompactionsWithInstantMap.put(pair.getKey(), pair.getValue());
return fgIdToPendingCompactionsWithInstantMap;
* Get pending compaction operations for both major and minor compaction.
// Streams the operations of one plan as pairs of (file group id built from the operation's
// partition path and file id, (instant requested time, operation)). Returns an empty stream when
// the plan's operation list is null.
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped and the expression before "->" on the first return line is missing.
public static Stream>> getPendingCompactionOperations(
HoodieInstant instant, HoodieCompactionPlan compactionPlan) {
List ops = compactionPlan.getOperations();
if (null != ops) {
return -> Pair.of(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()),
Pair.of(instant.requestedTime(), op)));
} else {
return Stream.empty();
* Return all pending compaction instant times.
* @return the instants of the pending compaction timeline
// Returns the instants of the active timeline's pending compaction timeline.
// NOTE(review): despite the "InstantTimes" name, the value returned is the result of
// getInstants(); the List element type appears stripped from this listing.
public static List getPendingCompactionInstantTimes(HoodieTableMetaClient metaClient) {
return metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants();
* Returns a pair of (timeline containing the completed delta commits after the latest completed
* compaction commit, the completed compaction commit instant), if the latest completed
* compaction commit is present; a pair of (timeline containing all the completed delta commits,
* the first delta commit instant), if there is no completed compaction commit.
* @param activeTimeline Active timeline of a table.
* @return Pair of timeline containing delta commits and an instant.
// Same result as getDeltaCommitsSinceLatestCompaction, except that the timeline in the left slot
// of the pair is narrowed to completed instants; the right slot (the instant) is passed through.
// NOTE(review): generic type arguments of the return type appear stripped from this listing.
public static Option> getCompletedDeltaCommitsSinceLatestCompaction(
HoodieActiveTimeline activeTimeline) {
return getDeltaCommitsSinceLatestCompaction(activeTimeline)
.map(pair -> Pair.of(pair.getLeft().filterCompletedInstants(), pair.getRight()));
* Returns a pair of (timeline containing the delta commits after the latest completed
* compaction commit, the completed compaction commit instant), if the latest completed
* compaction commit is present; a pair of (timeline containing all the delta commits,
* the first delta commit instant), if there is no completed compaction commit.
* @param activeTimeline Active timeline of a table.
* @return Pair of timeline containing delta commits and an instant.
// Pairs a delta-commit timeline with a reference instant:
//  - when the commit timeline has a completed instant, the delta commits found by
//    findInstantsModifiedAfterByCompletionTime for that instant's requested time, plus that instant;
//  - otherwise, when delta commits exist, all delta commits plus the first delta commit;
//  - otherwise an empty Option.
// NOTE(review): generic type arguments appear stripped and closing braces are not visible in this
// listing.
public static Option> getDeltaCommitsSinceLatestCompaction(
HoodieActiveTimeline activeTimeline) {
// Last completed instant on the commit timeline, treated here as the latest compaction.
Option lastCompaction = activeTimeline.getCommitTimeline().filterCompletedInstants().lastInstant();
HoodieTimeline deltaCommits = activeTimeline.getDeltaCommitTimeline();
final HoodieInstant latestInstant;
if (lastCompaction.isPresent()) {
latestInstant = lastCompaction.get();
// timeline containing the delta commits after the latest completed compaction commit,
// and the completed compaction commit instant
return Option.of(Pair.of(deltaCommits.findInstantsModifiedAfterByCompletionTime(latestInstant.requestedTime()), latestInstant));
} else {
if (deltaCommits.countInstants() > 0) {
latestInstant = deltaCommits.firstInstant().get();
// timeline containing all the delta commits, and the first delta commit instant
return Option.of(Pair.of(deltaCommits, latestInstant));
} else {
return Option.empty();
// Variant of getDeltaCommitsSinceLatestCompaction in which an instant found on the all-commits
// timeline (lastRequestCompaction) replaces the one found on the commit timeline whenever it is
// present. Returns the delta commits after that instant paired with it; or, when neither instant
// exists but delta commits do, the delta commits from the first one onward paired with the first
// delta commit; otherwise an empty Option.
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped, the two timeline expressions below are cut off after the first call, and closing
// braces are not visible. The filters applied to each timeline therefore cannot be confirmed here.
public static Option> getDeltaCommitsSinceLatestCompactionRequest(
HoodieActiveTimeline activeTimeline) {
Option lastCompaction = activeTimeline.getCommitTimeline()
Option lastRequestCompaction = activeTimeline.getAllCommitsTimeline()
// A present lastRequestCompaction takes precedence over the instant from the commit timeline.
if (lastRequestCompaction.isPresent()) {
lastCompaction = lastRequestCompaction;
HoodieTimeline deltaCommits = activeTimeline.getDeltaCommitTimeline();
HoodieInstant latestInstant;
if (lastCompaction.isPresent()) {
latestInstant = lastCompaction.get();
// timeline containing the delta commits after the selected compaction instant (which may be
// the lastRequestCompaction override rather than a completed one), and that instant
return Option.of(Pair.of(deltaCommits.findInstantsAfter(
latestInstant.requestedTime(), Integer.MAX_VALUE), lastCompaction.get()));
} else {
if (deltaCommits.countInstants() > 0) {
latestInstant = deltaCommits.firstInstant().get();
// timeline containing all the delta commits, and the first delta commit instant
return Option.of(Pair.of(deltaCommits.findInstantsAfterOrEquals(
latestInstant.requestedTime(), Integer.MAX_VALUE), latestInstant));
} else {
return Option.empty();
* Gets the earliest instant to retain for MOR compaction.
* If there is no completed compaction,
* num delta commits >= maxDeltaCommits
* If there is a completed compaction,
* num delta commits after latest completed compaction >= maxDeltaCommits
* @param activeTimeline Active timeline of a table.
* @param maxDeltaCommits Maximum number of delta commits that trigger the compaction plan
* (the quoted configuration key was lost from this listing).
* @return the earliest instant to keep for MOR compaction.
// Picks the earliest instant to retain from a (delta-commit timeline, instant) pair:
//  - fewer delta commits than maxDeltaCommits: the instant in the right slot of the pair;
//  - otherwise: the last of the first (numDeltaCommits - maxDeltaCommits + 1) delta commits;
//  - Option.empty() on the remaining path (exact nesting is not visible here).
// NOTE(review): this listing appears truncated by text extraction — generic type arguments are
// stripped, the right-hand side of the deltaCommitsInfoOption assignment is missing, and closing
// braces are not visible.
public static Option getEarliestInstantToRetainForCompaction(
HoodieActiveTimeline activeTimeline, int maxDeltaCommits) {
Option> deltaCommitsInfoOption =
if (deltaCommitsInfoOption.isPresent()) {
Pair deltaCommitsInfo = deltaCommitsInfoOption.get();
HoodieTimeline deltaCommitTimeline = deltaCommitsInfo.getLeft();
int numDeltaCommits = deltaCommitTimeline.countInstants();
if (numDeltaCommits < maxDeltaCommits) {
return Option.of(deltaCommitsInfo.getRight());
} else {
// delta commits with the last one to keep
List instants = deltaCommitTimeline.getInstantsAsStream()
// The "+ 1" makes the limit include the instant to keep, so it ends up last in the list.
.limit(numDeltaCommits - maxDeltaCommits + 1).collect(Collectors.toList());
return Option.of(instants.get(instants.size() - 1));
return Option.empty();