/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.avro.model.HoodieArchivedMetaEntry;
import org.apache.hudi.client.utils.MetadataConversionUtils;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieArchivedLogFile;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCommitException;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.table.marker.WriteMarkers;
import org.apache.hudi.table.marker.WriteMarkersFactory;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS;

/**
 * Archiver to bound the growth of files under .hoodie meta path.
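 * <p>
 * Typical usage, as a minimal sketch (the write config, table and engine context below are the
 * caller's own instances, named here only for illustration):
 * <pre>{@code
 *   HoodieTimelineArchiveLog archiver = new HoodieTimelineArchiveLog(writeConfig, hoodieTable);
 *   boolean archived = archiver.archiveIfRequired(engineContext);
 * }</pre>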
 */
public class HoodieTimelineArchiveLog {

  private static final Logger LOG = LogManager.getLogger(HoodieTimelineArchiveLog.class);

  private final Path archiveFilePath;
  private final HoodieWriteConfig config;
  private Writer writer;
  private final int maxInstantsToKeep;
  private final int minInstantsToKeep;
  private final HoodieTable table;
  private final HoodieTableMetaClient metaClient;

  public HoodieTimelineArchiveLog(HoodieWriteConfig config, HoodieTable table) {
    this.config = config;
    this.table = table;
    this.metaClient = table.getMetaClient();
    this.archiveFilePath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath());
    this.maxInstantsToKeep = config.getMaxCommitsToKeep();
    this.minInstantsToKeep = config.getMinCommitsToKeep();
  }

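  /**
   * Lazily builds a {@link HoodieLogFormat} writer on the archive log path, or returns the
   * already-open writer if one exists.
   */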
  private Writer openWriter() {
    try {
      if (this.writer == null) {
        return HoodieLogFormat.newWriterBuilder().onParentPath(archiveFilePath.getParent())
            .withFileId(archiveFilePath.getName()).withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
            .withFs(metaClient.getFs()).overBaseCommit("").build();
      } else {
        return this.writer;
      }
    } catch (IOException e) {
      throw new HoodieException("Unable to initialize HoodieLogFormat writer", e);
    }
  }

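  /**
   * Closes the underlying archive log writer, if one was opened.
   */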
  private void close() {
    try {
      if (this.writer != null) {
        this.writer.close();
      }
    } catch (IOException e) {
      throw new HoodieException("Unable to close HoodieLogFormat writer", e);
    }
  }

  /**
   * Check if commits need to be archived. If yes, archive commits.
   */
  public boolean archiveIfRequired(HoodieEngineContext context) throws IOException {
    try {
      List<HoodieInstant> instantsToArchive = getInstantsToArchive().collect(Collectors.toList());

      boolean success = true;
      if (!instantsToArchive.isEmpty()) {
        this.writer = openWriter();
        LOG.info("Archiving instants " + instantsToArchive);
        archive(context, instantsToArchive);
        LOG.info("Deleting archived instants " + instantsToArchive);
        success = deleteArchivedInstants(instantsToArchive, context);
      } else {
        LOG.info("No Instants to archive");
      }

      return success;
    } finally {
      close();
    }
  }

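  /**
   * Returns the completed clean and rollback instants that are eligible for archival. For each
   * action, archival kicks in only once more than {@code maxInstantsToKeep} instants have
   * accumulated, and the most recent {@code minInstantsToKeep} instants are always retained.
   */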
  private Stream<HoodieInstant> getCleanInstantsToArchive() {
    HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()
        .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION, HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants();
    return cleanAndRollbackTimeline.getInstants()
        .collect(Collectors.groupingBy(HoodieInstant::getAction)).values().stream()
        .map(hoodieInstants -> {
          if (hoodieInstants.size() > this.maxInstantsToKeep) {
            return hoodieInstants.subList(0, hoodieInstants.size() - this.minInstantsToKeep);
          } else {
            return new ArrayList<HoodieInstant>();
          }
        }).flatMap(Collection::stream);
  }

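  /**
   * Returns the completed commit instants that are eligible for archival. Archival only kicks in
   * once the completed commit timeline exceeds {@code maxInstantsToKeep}, and at least
   * {@code minInstantsToKeep} instants are left on the active timeline. Instants at or after the
   * first savepoint, the oldest pending compaction/replace commit, or (under the lazy
   * failed-writes cleaning policy) the oldest inflight commit are never archived.
   */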
  private Stream<HoodieInstant> getCommitInstantsToArchive() {
    // TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
    // with logic above to avoid Stream.concat
    HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();

    Option<HoodieInstant> oldestPendingCompactionAndReplaceInstant = table.getActiveTimeline()
        .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMPACTION_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION))
        .filter(s -> !s.isCompleted())
        .firstInstant();

    Option<HoodieInstant> oldestInflightCommitInstant =
        table.getActiveTimeline()
            .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
            .filterInflights().firstInstant();

    // We cannot have any holes in the commit timeline, so we cannot archive any commits
    // made after the first savepoint present.
    Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
    if (!commitTimeline.empty() && commitTimeline.countInstants() > maxInstantsToKeep) {
      // Collect the completed commit instants that are candidates for archival
      Stream<HoodieInstant> instantToArchiveStream = commitTimeline.getInstants()
          .filter(s -> {
            // if no savepoint present, then don't filter
            return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(), LESSER_THAN_OR_EQUALS, s.getTimestamp()));
          }).filter(s -> {
            // Ensure commits at or after the oldest pending compaction or replace commit are retained
            return oldestPendingCompactionAndReplaceInstant
                .map(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp()))
                .orElse(true);
          });
      // We need this to ensure that when multiple writers are performing conflict resolution, eligible instants don't
      // get archived, i.e., instants after the oldestInflight are retained on the timeline
      if (config.getFailedWritesCleanPolicy() == HoodieFailedWritesCleaningPolicy.LAZY) {
        instantToArchiveStream = instantToArchiveStream.filter(s -> oldestInflightCommitInstant.map(instant ->
            HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp()))
            .orElse(true));
      }
      return instantToArchiveStream.limit(commitTimeline.countInstants() - minInstantsToKeep);
    } else {
      return Stream.empty();
    }
  }

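  /**
   * Combines the clean/rollback and commit candidates and expands each instant into all of its
   * timeline state files (requested, inflight and completed). When the metadata table is enabled,
   * archival is additionally capped at the latest compaction instant on the metadata table.
   */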
  private Stream<HoodieInstant> getInstantsToArchive() {
    Stream<HoodieInstant> instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive());

    // For archiving and cleaning instants, we need to include intermediate state files if they exist
    HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
    Map<Pair<String, String>, List<HoodieInstant>> groupByTsAction = rawActiveTimeline.getInstants()
        .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(),
            HoodieInstant.getComparableAction(i.getAction()))));

    // If metadata table is enabled, do not archive instants which are more recent than the last compaction on the
    // metadata table.
    if (config.isMetadataTableEnabled()) {
      try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(),
          config.getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) {
        Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
        if (!latestCompactionTime.isPresent()) {
          LOG.info("Not archiving as there is no compaction yet on the metadata table");
          instants = Stream.empty();
        } else {
          LOG.info("Limiting archiving of instants to latest compaction on metadata table at " + latestCompactionTime.get());
          instants = instants.filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN,
              latestCompactionTime.get()));
        }
      } catch (Exception e) {
        throw new HoodieException("Error limiting instant archival based on metadata table", e);
      }
    }

    return instants.flatMap(hoodieInstant ->
        groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(),
            HoodieInstant.getComparableAction(hoodieInstant.getAction()))).stream());
  }

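  /**
   * Deletes the active-timeline files of the archived instants in parallel, then purges auxiliary
   * metadata older than (or equal to) the latest archived commit.
   */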
  private boolean deleteArchivedInstants(List<HoodieInstant> archivedInstants, HoodieEngineContext context) throws IOException {
    LOG.info("Deleting instants " + archivedInstants);
    boolean success = true;
    List<String> instantFiles = archivedInstants.stream().map(archivedInstant ->
        new Path(metaClient.getMetaPath(), archivedInstant.getFileName())
    ).map(Path::toString).collect(Collectors.toList());

    context.setJobStatus(this.getClass().getSimpleName(), "Delete archived instants");
    Map<String, Boolean> resultDeleteInstantFiles = FSUtils.parallelizeFilesProcess(context,
        metaClient.getFs(),
        config.getArchiveDeleteParallelism(),
        pairOfSubPathAndConf -> {
          Path commitFile = new Path(pairOfSubPathAndConf.getKey());
          try {
            FileSystem fs = commitFile.getFileSystem(pairOfSubPathAndConf.getValue().get());
            if (fs.exists(commitFile)) {
              return fs.delete(commitFile, false);
            }
            return true;
          } catch (IOException e) {
            throw new HoodieIOException("Failed to delete archived instant " + commitFile, e);
          }
        },
        instantFiles);

    for (Map.Entry<String, Boolean> result : resultDeleteInstantFiles.entrySet()) {
      LOG.info("Archived and deleted instant file " + result.getKey() + " : " + result.getValue());
      success &= result.getValue();
    }

    // Remove older meta-data from auxiliary path too
    Option<HoodieInstant> latestCommitted = Option.fromJavaOptional(archivedInstants.stream().filter(i -> i.isCompleted() && (i.getAction().equals(HoodieTimeline.COMMIT_ACTION)
        || (i.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)))).max(Comparator.comparing(HoodieInstant::getTimestamp)));
    LOG.info("Latest Committed Instant=" + latestCommitted);
    if (latestCommitted.isPresent()) {
      success &= deleteAllInstantsOlderOrEqualsInAuxMetaFolder(latestCommitted.get());
    }
    return success;
  }

  /**
   * Remove older instants from auxiliary meta folder.
   *
   * @param thresholdInstant Hoodie Instant
   * @return true if all eligible files are deleted successfully
   * @throws IOException in case of error
   */
  private boolean deleteAllInstantsOlderOrEqualsInAuxMetaFolder(HoodieInstant thresholdInstant) throws IOException {
    List<HoodieInstant> instants = null;
    boolean success = true;
    try {
      instants =
          metaClient.scanHoodieInstantsFromFileSystem(
              new Path(metaClient.getMetaAuxiliaryPath()),
              HoodieActiveTimeline.VALID_EXTENSIONS_IN_ACTIVE_TIMELINE,
              false);
    } catch (FileNotFoundException e) {
      /*
       * On some file systems, deleting all files in a directory can automatically remove the
       * directory itself. GCS is one example, as it doesn't have real directories and
       * subdirectories. When a client removes all the files from a "folder" on GCS, it has to
       * create a special "/" placeholder to keep the folder around. If this doesn't happen
       * (timeout, misconfigured client, ...), the folder will be deleted, and in this case we
       * should not break when the aux folder is not found.
       * GCS information: https://cloud.google.com/storage/docs/gsutil/addlhelp/HowSubdirectoriesWork
       */
      LOG.warn("Aux path not found. Skipping: " + metaClient.getMetaAuxiliaryPath());
      return true;
    }

    List<HoodieInstant> instantsToBeDeleted =
        instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(),
            LESSER_THAN_OR_EQUALS, thresholdInstant.getTimestamp())).collect(Collectors.toList());

    for (HoodieInstant deleteInstant : instantsToBeDeleted) {
      LOG.info("Deleting instant " + deleteInstant + " in auxiliary meta path " + metaClient.getMetaAuxiliaryPath());
      Path metaFile = new Path(metaClient.getMetaAuxiliaryPath(), deleteInstant.getFileName());
      if (metaClient.getFs().exists(metaFile)) {
        success &= metaClient.getFs().delete(metaFile, false);
        LOG.info("Deleted instant file in auxiliary meta path : " + metaFile);
      }
    }
    return success;
  }

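  /**
   * Serializes each instant into an Avro {@link HoodieArchivedMetaEntry} and appends the records
   * to the archive log in batches of {@code config.getCommitArchivalBatchSize()}. Empty clean
   * instants and empty completed rollback instants are deleted outright instead of being archived.
   */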
  public void archive(HoodieEngineContext context, List<HoodieInstant> instants) throws HoodieCommitException {
    try {
      Schema wrapperSchema = HoodieArchivedMetaEntry.getClassSchema();
      LOG.info("Wrapper schema " + wrapperSchema.toString());
      List<IndexedRecord> records = new ArrayList<>();
      for (HoodieInstant hoodieInstant : instants) {
        try {
          if (table.getActiveTimeline().isEmpty(hoodieInstant)
                  && (
                          hoodieInstant.getAction().equals(HoodieTimeline.CLEAN_ACTION)
                          || (hoodieInstant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION) && hoodieInstant.isCompleted())
                     )
          ) {
            table.getActiveTimeline().deleteEmptyInstantIfExists(hoodieInstant);
          } else {
            deleteAnyLeftOverMarkers(context, hoodieInstant);
            records.add(convertToAvroRecord(hoodieInstant));
            if (records.size() >= this.config.getCommitArchivalBatchSize()) {
              writeToFile(wrapperSchema, records);
            }
          }
        } catch (Exception e) {
          LOG.error("Failed to archive commits, .commit file: " + hoodieInstant.getFileName(), e);
          if (this.config.isFailOnTimelineArchivingEnabled()) {
            throw e;
          }
        }
      }
      writeToFile(wrapperSchema, records);
    } catch (Exception e) {
      throw new HoodieCommitException("Failed to archive commits", e);
    }
  }

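  /**
   * Removes any marker directory left behind by the instant being archived.
   */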
  private void deleteAnyLeftOverMarkers(HoodieEngineContext context, HoodieInstant instant) {
    WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), table, instant.getTimestamp());
    if (writeMarkers.deleteMarkerDir(context, config.getMarkersDeleteParallelism())) {
      LOG.info("Cleaned up leftover marker directory for instant: " + instant);
    }
  }

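  /**
   * Flushes the buffered records to the archive log as a single {@link HoodieAvroDataBlock} and
   * clears the buffer; a no-op when the buffer is empty.
   */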
  private void writeToFile(Schema wrapperSchema, List<IndexedRecord> records) throws Exception {
    if (records.size() > 0) {
      Map<HeaderMetadataType, String> header = new HashMap<>();
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, wrapperSchema.toString());
      final String keyField = table.getMetaClient().getTableConfig().getRecordKeyFieldProp();
      HoodieAvroDataBlock block = new HoodieAvroDataBlock(records, header, keyField);
      writer.appendBlock(block);
      records.clear();
    }
  }

  private IndexedRecord convertToAvroRecord(HoodieInstant hoodieInstant)
      throws IOException {
    return MetadataConversionUtils.createMetaWrapper(hoodieInstant, metaClient);
  }
}