/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.client;

import org.apache.hudi.avro.model.HoodieArchivedMetaEntry;
import org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan;
import org.apache.hudi.client.transaction.TransactionManager;
import org.apache.hudi.client.utils.MetadataConversionUtils;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.fs.StorageSchemes;
import org.apache.hudi.common.model.HoodieArchivedLogFile;
import org.apache.hudi.common.model.HoodieAvroIndexedRecord;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.ClusteringUtils;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCommitException;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.compact.CompactionTriggerStrategy;
import org.apache.hudi.table.marker.WriteMarkers;
import org.apache.hudi.table.marker.WriteMarkersFactory;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.hudi.client.utils.ArchivalUtils.getMinAndMaxInstantsToKeep;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.compareTimestamps;

/**
 * Archiver to bound the growth of files under .hoodie meta path.
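 *
 * <p>Illustrative usage (a minimal sketch; {@code writeConfig}, {@code hoodieTable} and
 * {@code engineContext} are assumed to be constructed elsewhere by the caller):
 * <pre>{@code
 *   HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, hoodieTable);
 *   // acquire the table lock while archiving to guard against concurrent writers
 *   archiver.archiveIfRequired(engineContext, true);
 * }</pre>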
 */
public class HoodieTimelineArchiver<T extends HoodieAvroPayload, I, K, O> {

  private static final Logger LOG = LoggerFactory.getLogger(HoodieTimelineArchiver.class);

  private final Path archiveFilePath;
  private final HoodieWriteConfig config;
  private Writer writer;
  private final int maxInstantsToKeep;
  private final int minInstantsToKeep;
  private final HoodieTable<T, I, K, O> table;
  private final HoodieTableMetaClient metaClient;
  private final TransactionManager txnManager;

  public HoodieTimelineArchiver(HoodieWriteConfig config, HoodieTable<T, I, K, O> table) {
    this.config = config;
    this.table = table;
    this.metaClient = table.getMetaClient();
    this.archiveFilePath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath());
    this.txnManager = new TransactionManager(config, table.getMetaClient().getFs());
    Pair<Integer, Integer> minAndMaxInstants = getMinAndMaxInstantsToKeep(table, metaClient);
    this.minInstantsToKeep = minAndMaxInstants.getLeft();
    this.maxInstantsToKeep = minAndMaxInstants.getRight();
  }

  private Writer openWriter() {
    try {
      if (this.writer == null) {
        return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
            .withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
            .withFs(metaClient.getFs()).overBaseCommit("").build();
      } else {
        return this.writer;
      }
    } catch (IOException e) {
      throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
    }
  }

  public Writer reOpenWriter() {
    try {
      if (this.writer != null) {
        this.writer.close();
        this.writer = null;
      }
      this.writer = openWriter();
      return writer;
    } catch (IOException e) {
      throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
    }
  }

  private void close() {
    try {
      if (this.writer != null) {
        this.writer.close();
      }
    } catch (IOException e) {
      throw new HoodieException("Unable to close HoodieLogFormat writer", e);
    }
  }

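  /**
   * Convenience overload of {@link #archiveIfRequired(HoodieEngineContext, boolean)} that
   * archives without acquiring the transaction lock.
   */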
  public boolean archiveIfRequired(HoodieEngineContext context) throws IOException {
    return archiveIfRequired(context, false);
  }

  /**
   * Check if commits need to be archived. If yes, archive commits.
   */
  public boolean archiveIfRequired(HoodieEngineContext context, boolean acquireLock) throws IOException {
    try {
      if (acquireLock) {
        // there is no owner or instant time per se for archival.
        txnManager.beginTransaction(Option.empty(), Option.empty());
      }
      List<HoodieInstant> instantsToArchive = getInstantsToArchive().collect(Collectors.toList());
      verifyLastMergeArchiveFilesIfNecessary(context);
      boolean success = true;
      if (!instantsToArchive.isEmpty()) {
        this.writer = openWriter();
        LOG.info("Archiving instants " + instantsToArchive);
        archive(context, instantsToArchive);
        LOG.info("Deleting archived instants " + instantsToArchive);
        success = deleteArchivedInstants(instantsToArchive, context);
      } else {
        LOG.info("No Instants to archive");
      }

      if (shouldMergeSmallArchiveFiles()) {
        mergeArchiveFilesIfNecessary(context);
      }
      return success;
    } finally {
      close();
      if (acquireLock) {
        txnManager.endTransaction(Option.empty());
      }
    }
  }

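  /**
   * Small archive files are merged only when merging is enabled in the config and the underlying
   * storage scheme does not support append; on appendable storage the archiver simply keeps
   * extending the current archive log file.
   */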
  public boolean shouldMergeSmallArchiveFiles() {
    return config.getArchiveMergeEnable() && !StorageSchemes.isAppendSupported(metaClient.getFs().getScheme());
  }

  /**
   * Here Hoodie can merge the small archive files into a new larger one.
   * Only used for filesystems that do not support the append operation.
   * The whole merge-small-archive-files operation has four stages:
   * 1. Build the merge plan with the merge candidates/merged file name info.
   * 2. Do the merge.
   * 3. Delete all the candidates.
   * 4. Delete the merge plan.
   *
   * @param context HoodieEngineContext
   * @throws IOException
   */
  private void mergeArchiveFilesIfNecessary(HoodieEngineContext context) throws IOException {
    Path planPath = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME);
    // Flush any remaining content and open a new writer
    reOpenWriter();
    // List all archive files
    FileStatus[] fsStatuses = metaClient.getFs().globStatus(
        new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
    // Sort files by version suffix in reverse (implies reverse chronological order)
    Arrays.sort(fsStatuses, new HoodieArchivedTimeline.ArchiveFileVersionComparator());

    int archiveMergeFilesBatchSize = config.getArchiveMergeFilesBatchSize();
    long smallFileLimitBytes = config.getArchiveMergeSmallFileLimitBytes();

    List<FileStatus> mergeCandidate = getMergeCandidates(smallFileLimitBytes, fsStatuses);

    if (mergeCandidate.size() >= archiveMergeFilesBatchSize) {
      List<String> candidateFiles = mergeCandidate.stream().map(fs -> fs.getPath().toString()).collect(Collectors.toList());
      // before merging the archive files, build the merge plan
      String logFileName = computeLogFileName();
      buildArchiveMergePlan(candidateFiles, planPath, logFileName);
      // merge archive files
      mergeArchiveFiles(mergeCandidate);
      // after merge, delete the small archive files.
      deleteFilesParallelize(metaClient, candidateFiles, context, true);
      LOG.info("Successfully deleted the replaced small archive files.");
      // finally, delete the archive merge plan, which marks the merge of small archive files as successful.
      metaClient.getFs().delete(planPath, false);
      LOG.info("Successfully merged the small archive files.");
    }
  }

  /**
   * Find the index of the latest 'huge archive file' as a break point and only check/merge the newer archive files.
   * We need to keep the original order of the archive files, which matters when loading archived instants with a time filter, see
   * {@link HoodieArchivedTimeline} loadInstants(TimeRangeFilter filter, boolean loadInstantDetails, Function commitsFilter).
   *
   * @param smallFileLimitBytes small file size limit in bytes
   * @param fsStatuses          archive file statuses, sorted by version suffix in reverse order
   * @return merge candidates
   */
  private List<FileStatus> getMergeCandidates(long smallFileLimitBytes, FileStatus[] fsStatuses) {
    int index = 0;
    for (; index < fsStatuses.length; index++) {
      if (fsStatuses[index].getLen() > smallFileLimitBytes) {
        break;
      }
    }
    return Arrays.stream(fsStatuses).limit(index).collect(Collectors.toList());
  }

  /**
   * Get the name of the final archive file to write, for storage schemes that do not support append.
   */
  private String computeLogFileName() throws IOException {
    String logWriteToken = writer.getLogFile().getLogWriteToken();
    HoodieLogFile hoodieLogFile = writer.getLogFile().rollOver(metaClient.getFs(), logWriteToken);
    return hoodieLogFile.getFileName();
  }

  /**
   * Check for, and resolve, any failed and unfinished merge-small-archive-files operation.
   *
   * @param context HoodieEngineContext used to parallelize the deletion of small archive files if necessary.
   * @throws IOException
   */
  private void verifyLastMergeArchiveFilesIfNecessary(HoodieEngineContext context) throws IOException {
    if (shouldMergeSmallArchiveFiles()) {
      Path planPath = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME);
      HoodieWrapperFileSystem fs = metaClient.getFs();
      // If the plan exists, the last merge of small archive files failed.
      // We need to revert or complete the last action.
      if (fs.exists(planPath)) {
        HoodieMergeArchiveFilePlan plan = null;
        try {
          plan = TimelineMetadataUtils.deserializeAvroMetadata(FileIOUtils.readDataFromPath(fs, planPath).get(), HoodieMergeArchiveFilePlan.class);
        } catch (IOException e) {
          LOG.warn("Parsing merge archive plan failed.", e);
          // A partially readable plan file means the last merge action failed while writing the plan file.
          fs.delete(planPath);
          return;
        }
        Path mergedArchiveFile = new Path(metaClient.getArchivePath(), plan.getMergedArchiveFileName());
        List<Path> candidates = plan.getCandidate().stream().map(Path::new).collect(Collectors.toList());
        if (candidateAllExists(candidates)) {
          // The last merge action failed while writing the merged archive file,
          // but none of the small archive files have been deleted yet.
          // Revert the last action by deleting mergedArchiveFile if it exists.
          if (fs.exists(mergedArchiveFile)) {
            fs.delete(mergedArchiveFile, false);
          }
        } else {
          // The last merge action failed while deleting the small archive files,
          // but the merged archive file is complete.
          // Try to complete the last action.
          if (fs.exists(mergedArchiveFile)) {
            deleteFilesParallelize(metaClient, plan.getCandidate(), context, true);
          }
        }

        fs.delete(planPath);
      }
    }
  }

  /**
   * If all the candidate small archive files still exist, the last merge operation failed while writing the merged archive file.
   * If at least one candidate small archive file is missing, the merged archive file was fully written and the last operation
   * failed while deleting the small archive files.
   */
  private boolean candidateAllExists(List<Path> candidates) throws IOException {
    for (Path archiveFile : candidates) {
      if (!metaClient.getFs().exists(archiveFile)) {
        // candidate is deleted
        return false;
      }
    }
    return true;
  }

  public void buildArchiveMergePlan(List<String> compactCandidate, Path planPath, String compactedArchiveFileName) throws IOException {
    LOG.info("Start to build archive merge plan.");
    HoodieMergeArchiveFilePlan plan = HoodieMergeArchiveFilePlan.newBuilder()
        .setCandidate(compactCandidate)
        .setMergedArchiveFileName(compactedArchiveFileName)
        .build();
    Option<byte[]> content = TimelineMetadataUtils.serializeAvroMetadata(plan, HoodieMergeArchiveFilePlan.class);
    // persist the merge archive files plan.
    FileIOUtils.createFileInPath(metaClient.getFs(), planPath, content);
    LOG.info("Successfully built the archive merge plan.");
  }

  public void mergeArchiveFiles(List<FileStatus> compactCandidate) throws IOException {
    LOG.info("Starting to merge small archive files.");
    Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
    try {
      List<IndexedRecord> records = new ArrayList<>();
      for (FileStatus fs : compactCandidate) {
        // Read the archived file
        try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(),
            new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) {
          // Read the avro blocks
          while (reader.hasNext()) {
            HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
            blk.getRecordIterator(HoodieRecordType.AVRO).forEachRemaining(r -> records.add((IndexedRecord) r.getData()));
            if (records.size() >= this.config.getCommitArchivalBatchSize()) {
              writeToFile(wrapperSchema, records);
            }
          }
        }
      }
      writeToFile(wrapperSchema, records);
    } catch (Exception e) {
      throw new HoodieCommitException("Failed to merge small archive files", e);
    } finally {
      writer.close();
    }
    LOG.info("Successfully merged the small archive files.");
  }

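  /**
   * Deletes the given paths in parallel via the engine context. When {@code ignoreFailed} is true,
   * individual delete failures are logged and ignored instead of failing the whole operation.
   */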
  private Map<String, Boolean> deleteFilesParallelize(HoodieTableMetaClient metaClient, List<String> paths, HoodieEngineContext context, boolean ignoreFailed) {

    return FSUtils.parallelizeFilesProcess(context,
        metaClient.getFs(),
        config.getArchiveDeleteParallelism(),
        pairOfSubPathAndConf -> {
          Path file = new Path(pairOfSubPathAndConf.getKey());
          try {
            FileSystem fs = metaClient.getFs();
            if (fs.exists(file)) {
              return fs.delete(file, false);
            }
            return true;
          } catch (IOException e) {
            if (!ignoreFailed) {
              throw new HoodieIOException("Failed to delete : " + file, e);
            } else {
              LOG.warn("Ignore failed deleting : " + file);
              return true;
            }
          }
        },
        paths);
  }

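  /**
   * Collects the completed CLEAN and ROLLBACK instants that are eligible for archival, grouped by
   * action, retaining at least the configured minimum number of instants per action on the active timeline.
   */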
  private Stream<HoodieInstant> getCleanInstantsToArchive() {
    HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
        .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION, HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants();
    return cleanAndRollbackTimeline.getInstantsAsStream()
        .collect(Collectors.groupingBy(HoodieInstant::getAction)).values().stream()
        .map(hoodieInstants -> {
          if (hoodieInstants.size() > this.maxInstantsToKeep) {
            return hoodieInstants.subList(0, hoodieInstants.size() - this.minInstantsToKeep);
          } else {
            return new ArrayList<HoodieInstant>();
          }
        }).flatMap(Collection::stream);
  }

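  /**
   * Collects the completed commit instants that can be archived, bounded by the oldest pending
   * instant, savepoints, and the oldest instants that pending compaction/clustering still need
   * to see on the active timeline.
   */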
  private Stream<HoodieInstant> getCommitInstantsToArchive() throws IOException {
    // TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
    // with logic above to avoid Stream.concat
    HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();

    // Get the oldest inflight instant and a completed commit before this inflight instant.
    Option<HoodieInstant> oldestPendingInstant = table.getActiveTimeline()
        .getWriteTimeline()
        .filter(instant -> !instant.isCompleted())
        .firstInstant();

    // The oldest commit to retain is the greatest completed commit that is less than the oldest pending instant.
    // When an inflight instant is the lowest commit on the timeline, the oldest commit to retain
    // will be equal to that oldest inflight commit.
    Option<HoodieInstant> oldestCommitToRetain;
    if (oldestPendingInstant.isPresent()) {
      Option<HoodieInstant> completedCommitBeforeOldestPendingInstant =
          Option.fromJavaOptional(commitTimeline.getReverseOrderedInstants()
              .filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(),
                  LESSER_THAN, oldestPendingInstant.get().getTimestamp())).findFirst());
      // Check if the completed instant is higher than the oldest inflight instant;
      // in that case, update oldestCommitToRetain to the oldest inflight commit time.
      if (!completedCommitBeforeOldestPendingInstant.isPresent()
          || HoodieTimeline.compareTimestamps(oldestPendingInstant.get().getTimestamp(),
          LESSER_THAN, completedCommitBeforeOldestPendingInstant.get().getTimestamp())) {
        oldestCommitToRetain = oldestPendingInstant;
      } else {
        oldestCommitToRetain = completedCommitBeforeOldestPendingInstant;
      }
    } else {
      oldestCommitToRetain = Option.empty();
    }

    // NOTE: We cannot have any holes in the commit timeline.
    // We cannot archive any commits which are made after the first savepoint present,
    // unless HoodieArchivalConfig#ARCHIVE_BEYOND_SAVEPOINT is enabled.
    Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
    Set<String> savepointTimestamps = table.getSavepointTimestamps();
    if (!commitTimeline.empty() && commitTimeline.countInstants() > maxInstantsToKeep) {
      // For Merge-On-Read table, inline or async compaction is enabled
      // We need to make sure that there are enough delta commits in the active timeline
      // to trigger compaction scheduling, when the trigger strategy of compaction is
      // NUM_COMMITS or NUM_AND_TIME.
      Option<HoodieInstant> oldestInstantToRetainForCompaction =
          (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ
              && (config.getInlineCompactTriggerStrategy() == CompactionTriggerStrategy.NUM_COMMITS
              || config.getInlineCompactTriggerStrategy() == CompactionTriggerStrategy.NUM_AND_TIME))
              ? CompactionUtils.getOldestInstantToRetainForCompaction(
              table.getActiveTimeline(), config.getInlineCompactDeltaCommitMax())
              : Option.empty();

      // The clustering commit instant cannot be archived unless we ensure that the replaced files have been cleaned;
      // without the replaced-files metadata on the timeline, the fs view would expose duplicates to readers.
      // Meanwhile, when inline or async clustering is enabled, we need to ensure that there is a commit in the active timeline
      // to check whether the file slice generated in pending clustering after archive isn't committed.
      Option<HoodieInstant> oldestInstantToRetainForClustering =
          ClusteringUtils.getOldestInstantToRetainForClustering(table.getActiveTimeline(), table.getMetaClient());

      // Build the stream of commit instants that are eligible for archival
      Stream<HoodieInstant> instantToArchiveStream = commitTimeline.getInstantsAsStream()
          .filter(s -> {
            if (config.shouldArchiveBeyondSavepoint()) {
              // skip savepoint commits and proceed further
              return !savepointTimestamps.contains(s.getTimestamp());
            } else {
              // if no savepoint present, then don't filter
              // stop at first savepoint commit
              return !(firstSavepoint.isPresent() && compareTimestamps(firstSavepoint.get().getTimestamp(), LESSER_THAN_OR_EQUALS, s.getTimestamp()));
            }
          }).filter(s -> {
            // oldestCommitToRetain is the highest completed commit instant that is less than the oldest inflight instant.
            // By filtering out any commit >= oldestCommitToRetain, we can ensure there are no gaps in the timeline
            // when inflight commits are present.
            return oldestCommitToRetain
                .map(instant -> compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp()))
                .orElse(true);
          }).filter(s ->
              oldestInstantToRetainForCompaction.map(instantToRetain ->
                      compareTimestamps(s.getTimestamp(), LESSER_THAN, instantToRetain.getTimestamp()))
                  .orElse(true)
          ).filter(s ->
              oldestInstantToRetainForClustering.map(instantToRetain ->
                      HoodieTimeline.compareTimestamps(s.getTimestamp(), LESSER_THAN, instantToRetain.getTimestamp()))
                  .orElse(true)
          );
      return instantToArchiveStream.limit(commitTimeline.countInstants() - minInstantsToKeep);
    } else {
      return Stream.empty();
    }
  }

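  /**
   * Combines the clean/rollback and commit candidates, further limits them based on the metadata
   * table state (if enabled), and expands each candidate to also cover its requested/inflight files.
   */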
  private Stream<HoodieInstant> getInstantsToArchive() throws IOException {
    Stream<HoodieInstant> instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive());
    if (config.isMetaserverEnabled()) {
      return Stream.empty();
    }

    // For archiving and cleaning instants, we need to include intermediate state files if they exist
    HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
    Map<Pair<String, String>, List<HoodieInstant>> groupByTsAction = rawActiveTimeline.getInstantsAsStream()
        .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(),
            HoodieInstant.getComparableAction(i.getAction()))));

    // If metadata table is enabled, do not archive instants which are more recent than the last compaction on the
    // metadata table.
    if (table.getMetaClient().getTableConfig().isMetadataTableAvailable()) {
      try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(), config.getBasePath())) {
        Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
        if (!latestCompactionTime.isPresent()) {
          LOG.info("Not archiving as there is no compaction yet on the metadata table");
          instants = Stream.empty();
        } else {
          LOG.info("Limiting archiving of instants to latest compaction on metadata table at " + latestCompactionTime.get());
          instants = instants.filter(instant -> compareTimestamps(instant.getTimestamp(), LESSER_THAN,
              latestCompactionTime.get()));
        }
      } catch (Exception e) {
        throw new HoodieException("Error limiting instant archival based on metadata table", e);
      }
    }

    if (table.isMetadataTable()) {
      HoodieTableMetaClient dataMetaClient = HoodieTableMetaClient.builder()
          .setBasePath(HoodieTableMetadata.getDatasetBasePath(config.getBasePath()))
          .setConf(metaClient.getHadoopConf())
          .build();
      Option<HoodieInstant> qualifiedEarliestInstant =
          TimelineUtils.getEarliestInstantForMetadataArchival(
              dataMetaClient.getActiveTimeline(), config.shouldArchiveBeyondSavepoint());

      // Do not archive the instants after the earliest commit (COMMIT, DELTA_COMMIT, and
      // REPLACE_COMMIT only, considering non-savepoint commit only if enabling archive
      // beyond savepoint) and the earliest inflight instant (all actions).
      // This is required by metadata table, see HoodieTableMetadataUtil#processRollbackMetadata
      // for details.
      // Note that we cannot blindly use the earliest instant of all actions, because CLEAN and
      // ROLLBACK instants are archived separately apart from commits (check
      // HoodieTimelineArchiver#getCleanInstantsToArchive).  If we do so, a very old completed
      // CLEAN or ROLLBACK instant can block the archive of metadata table timeline and causes
      // the active timeline of metadata table to be extremely long, leading to performance issues
      // for loading the timeline.
      if (qualifiedEarliestInstant.isPresent()) {
        instants = instants.filter(instant ->
            compareTimestamps(
                instant.getTimestamp(),
                HoodieTimeline.LESSER_THAN,
                qualifiedEarliestInstant.get().getTimestamp()));
      }
    }

    return instants.flatMap(hoodieInstant -> {
      List<HoodieInstant> instantsToStream = groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(),
          HoodieInstant.getComparableAction(hoodieInstant.getAction())));
      if (instantsToStream != null) {
        return instantsToStream.stream();
      } else {
        // if a concurrent writer archived the instant
        return Stream.empty();
      }
    });
  }

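  /**
   * Deletes the active timeline files of the archived instants, removing pending (requested/inflight)
   * files before completed ones, so that other timeline observers never see a pending file whose
   * completed counterpart has already been removed.
   */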
  private boolean deleteArchivedInstants(List<HoodieInstant> archivedInstants, HoodieEngineContext context) throws IOException {
    LOG.info("Deleting instants " + archivedInstants);

    List<HoodieInstant> pendingInstants = new ArrayList<>();
    List<HoodieInstant> completedInstants = new ArrayList<>();

    for (HoodieInstant instant : archivedInstants) {
      if (instant.isCompleted()) {
        completedInstants.add(instant);
      } else {
        pendingInstants.add(instant);
      }
    }

    context.setJobStatus(this.getClass().getSimpleName(), "Delete archived instants: " + config.getTableName());
    // Delete the metadata files
    // in HoodieInstant.State sequence: requested -> inflight -> completed.
    // This order matters: if a COMPLETED metadata file were removed first,
    // other monitors on the timeline (such as the compaction or clustering services) would
    // mistakenly recognize the remaining requested/inflight file as a pending operation,
    // leading to all kinds of subtle bugs.
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    if (!pendingInstants.isEmpty()) {
      context.foreach(
          pendingInstants,
          instant -> activeTimeline.deleteInstantFileIfExists(instant),
          Math.min(pendingInstants.size(), config.getArchiveDeleteParallelism())
      );
    }
    if (!completedInstants.isEmpty()) {
      context.foreach(
          completedInstants,
          instant -> activeTimeline.deleteInstantFileIfExists(instant),
          Math.min(completedInstants.size(), config.getArchiveDeleteParallelism())
      );
    }

    return true;
  }

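  /**
   * Serializes the given instants into archived metadata entries and appends them to the archive
   * log in batches of the configured commit archival batch size.
   */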
  public void archive(HoodieEngineContext context, List<HoodieInstant> instants) throws HoodieCommitException {
    try {
      Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
      LOG.info("Wrapper schema " + wrapperSchema.toString());
      List<IndexedRecord> records = new ArrayList<>();
      for (HoodieInstant hoodieInstant : instants) {
        try {
          deleteAnyLeftOverMarkers(context, hoodieInstant);
          records.add(convertToAvroRecord(hoodieInstant));
          if (records.size() >= this.config.getCommitArchivalBatchSize()) {
            writeToFile(wrapperSchema, records);
          }
        } catch (Exception e) {
          LOG.error("Failed to archive commits, .commit file: " + hoodieInstant.getFileName(), e);
          if (this.config.isFailOnTimelineArchivingEnabled()) {
            throw e;
          }
        }
      }
      writeToFile(wrapperSchema, records);
    } catch (Exception e) {
      throw new HoodieCommitException("Failed to archive commits", e);
    }
  }

  private void deleteAnyLeftOverMarkers(HoodieEngineContext context, HoodieInstant instant) {
    WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), table, instant.getTimestamp());
    if (writeMarkers.deleteMarkerDir(context, config.getMarkersDeleteParallelism())) {
      LOG.info("Cleaned up left over marker directory for instant :" + instant);
    }
  }

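  /**
   * Flushes the buffered records, if any, into a single Avro data block appended to the archive
   * log file, then clears the buffer.
   */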
  private void writeToFile(Schema wrapperSchema, List<IndexedRecord> records) throws Exception {
    if (records.size() > 0) {
      Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString());
      final String keyField = table.getMetaClient().getTableConfig().getRecordKeyFieldProp();
      List<HoodieRecord> indexRecords = records.stream().map(HoodieAvroIndexedRecord::new).collect(Collectors.toList());
      HoodieAvroDataBlock block = new HoodieAvroDataBlock(indexRecords, header, keyField);
      writer.appendBlock(block);
      records.clear();
    }
  }

  private IndexedRecord convertToAvroRecord(HoodieInstant hoodieInstant)
      throws IOException {
    return MetadataConversionUtils.createMetaWrapper(hoodieInstant, metaClient);
  }
}