All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.client.utils.ArchivalUtils Maven / Gradle / Ivy

The newest version!
/*
 *
 *  * Licensed to the Apache Software Foundation (ASF) under one
 *  * or more contributor license agreements.  See the NOTICE file
 *  * distributed with this work for additional information
 *  * regarding copyright ownership.  The ASF licenses this file
 *  * to you under the Apache License, Version 2.0 (the
 *  * "License"); you may not use this file except in compliance
 *  * with the License.  You may obtain a copy of the License at
 *  *
 *  *      http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS,
 *  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  * See the License for the specific language governing permissions and
 *  * limitations under the License.
 *
 */

package org.apache.hudi.client.utils;

import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.ParseException;
import java.time.Instant;

import static org.apache.hudi.common.table.timeline.TimelineUtils.NOT_PARSABLE_TIMESTAMPS;
import static org.apache.hudi.common.table.timeline.TimelineUtils.parseDateFromInstantTime;
import static org.apache.hudi.config.HoodieArchivalConfig.MAX_COMMITS_TO_KEEP;
import static org.apache.hudi.config.HoodieArchivalConfig.MIN_COMMITS_TO_KEEP;
import static org.apache.hudi.config.HoodieCleanConfig.CLEANER_COMMITS_RETAINED;
import static org.apache.hudi.config.HoodieCleanConfig.CLEANER_FILE_VERSIONS_RETAINED;
import static org.apache.hudi.config.HoodieCleanConfig.CLEANER_HOURS_RETAINED;
import static org.apache.hudi.config.HoodieCleanConfig.CLEANER_POLICY;

/**
 * Provides utilities for timeline archival service.
 */
public class ArchivalUtils {

  private static final Logger LOG = LoggerFactory.getLogger(ArchivalUtils.class);

  /**
   *  getMinAndMaxInstantsToKeep is used by archival service to find the
   *  min instants and max instants to keep in the active timeline
   * @param table table implementation extending org.apache.hudi.table.HoodieTable
   * @param metaClient meta client
   * @return Pair containing min instants and max instants to keep.
   */
  public static Pair getMinAndMaxInstantsToKeep(HoodieTable table, HoodieTableMetaClient metaClient) {
    HoodieWriteConfig config = table.getConfig();
    HoodieTimeline completedCommitsTimeline = table.getCompletedCommitsTimeline();
    Option latestCommit = completedCommitsTimeline.lastInstant();
    HoodieCleaningPolicy cleanerPolicy = config.getCleanerPolicy();
    int cleanerCommitsRetained = config.getCleanerCommitsRetained();
    int cleanerHoursRetained = config.getCleanerHoursRetained();
    int maxInstantsToKeep;
    int minInstantsToKeep;
    Option earliestCommitToRetain = getEarliestCommitToRetain(metaClient, latestCommit, cleanerPolicy, cleanerCommitsRetained, cleanerHoursRetained);
    int configuredMinInstantsToKeep = config.getMinCommitsToKeep();
    int configuredMaxInstantsToKeep = config.getMaxCommitsToKeep();
    if (earliestCommitToRetain.isPresent()) {
      int minInstantsToKeepBasedOnCleaning =
          completedCommitsTimeline.findInstantsAfter(earliestCommitToRetain.get().requestedTime())
              .countInstants() + 2;
      if (configuredMinInstantsToKeep < minInstantsToKeepBasedOnCleaning) {
        maxInstantsToKeep = minInstantsToKeepBasedOnCleaning
            + configuredMaxInstantsToKeep - configuredMinInstantsToKeep;
        minInstantsToKeep = minInstantsToKeepBasedOnCleaning;
        LOG.warn("The configured archival configs {}={} is more aggressive than the cleaning "
                + "configs as the earliest commit to retain is {}. Adjusted the archival configs "
                + "to be {}={} and {}={}",
            MIN_COMMITS_TO_KEEP.key(), configuredMinInstantsToKeep, earliestCommitToRetain.get(),
            MIN_COMMITS_TO_KEEP.key(), minInstantsToKeep,
            MAX_COMMITS_TO_KEEP.key(), maxInstantsToKeep);
        switch (cleanerPolicy) {
          case KEEP_LATEST_COMMITS:
            LOG.warn("Cleaning configs: {}=KEEP_LATEST_COMMITS {}={}", CLEANER_POLICY.key(),
                CLEANER_COMMITS_RETAINED.key(), cleanerCommitsRetained);
            break;
          case KEEP_LATEST_BY_HOURS:
            LOG.warn("Cleaning configs: {}=KEEP_LATEST_BY_HOURS {}={}", CLEANER_POLICY.key(),
                CLEANER_HOURS_RETAINED.key(), cleanerHoursRetained);
            break;
          case KEEP_LATEST_FILE_VERSIONS:
            LOG.warn("Cleaning configs: {}=CLEANER_FILE_VERSIONS_RETAINED {}={}", CLEANER_POLICY.key(),
                CLEANER_FILE_VERSIONS_RETAINED.key(), config.getCleanerFileVersionsRetained());
            break;
          default:
            break;
        }
      } else {
        maxInstantsToKeep = configuredMaxInstantsToKeep;
        minInstantsToKeep = configuredMinInstantsToKeep;
      }
    } else {
      maxInstantsToKeep = configuredMaxInstantsToKeep;
      minInstantsToKeep = configuredMinInstantsToKeep;
    }
    return Pair.of(minInstantsToKeep, maxInstantsToKeep);
  }

  private static Option getEarliestCommitToRetain(
      HoodieTableMetaClient metaClient,
      Option latestCommit,
      HoodieCleaningPolicy cleanerPolicy,
      int cleanerCommitsRetained, int cleanerHoursRetained) {
    Option earliestCommitToRetain = Option.empty();
    try {
      earliestCommitToRetain = CleanerUtils.getEarliestCommitToRetain(
          metaClient.getActiveTimeline().getCommitsTimeline(),
          cleanerPolicy,
          cleanerCommitsRetained,
          latestCommit.isPresent()
              ? parseDateFromInstantTime(latestCommit.get().requestedTime()).toInstant()
              : Instant.now(),
          cleanerHoursRetained,
          metaClient.getTableConfig().getTimelineTimezone());
    } catch (ParseException e) {
      if (NOT_PARSABLE_TIMESTAMPS.stream().noneMatch(ts -> latestCommit.get().requestedTime().startsWith(ts))) {
        LOG.warn("Error parsing instant time: " + latestCommit.get().requestedTime());
      }
    }
    return earliestCommitToRetain;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy