All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.table.repair.RepairUtils Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.table.repair;

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.DELTA_COMMIT_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION;

/**
 * Utils for table repair tool.
 */
public final class RepairUtils {
  /**
   * Tags the instant time of each base or log file from the input file paths.
   *
   * @param basePath          Base path of the table.
   * @param allPaths          A {@link List} of file paths to tag.
   * @return A {@link Map} of instant time in {@link String} to a {@link List} of relative file paths.
   */
  public static Map> tagInstantsOfBaseAndLogFiles(
      String basePath, List allPaths) {
    // Instant time -> Set of base and log file paths
    Map> instantToFilesMap = new HashMap<>();
    allPaths.forEach(path -> {
      String instantTime = FSUtils.getCommitTime(path.getName());
      instantToFilesMap.computeIfAbsent(instantTime, k -> new ArrayList<>());
      instantToFilesMap.get(instantTime).add(
          FSUtils.getRelativePartitionPath(new Path(basePath), path));
    });
    return instantToFilesMap;
  }

  /**
   * Gets the base and log file paths written for a given instant from the timeline.
   * This reads the details of the instant metadata.
   *
   * @param timeline {@link HoodieTimeline} instance, can be active or archived timeline.
   * @param instant  Instant for lookup.
   * @return A {@link Option} of {@link Set} of relative file paths to base path
   * if the instant action is supported; empty {@link Option} otherwise.
   * @throws IOException if reading instant details fail.
   */
  public static Option> getBaseAndLogFilePathsFromTimeline(
      HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    if (!instant.isCompleted()) {
      throw new HoodieException("Cannot get base and log file paths from "
          + "instant not completed: " + instant.getTimestamp());
    }

    switch (instant.getAction()) {
      case COMMIT_ACTION:
      case DELTA_COMMIT_ACTION:
        final HoodieCommitMetadata commitMetadata =
            HoodieCommitMetadata.fromBytes(
                timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
        return Option.of(commitMetadata.getPartitionToWriteStats().values().stream().flatMap(List::stream)
            .map(HoodieWriteStat::getPath).collect(Collectors.toSet()));
      case REPLACE_COMMIT_ACTION:
        final HoodieReplaceCommitMetadata replaceCommitMetadata =
            HoodieReplaceCommitMetadata.fromBytes(
                timeline.getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class);
        return Option.of(replaceCommitMetadata.getPartitionToWriteStats().values().stream().flatMap(List::stream)
            .map(HoodieWriteStat::getPath).collect(Collectors.toSet()));
      default:
        return Option.empty();
    }
  }

  /**
   * Finds the dangling files to remove for a given instant to repair.
   *
   * @param instantToRepair       Instant timestamp to repair.
   * @param baseAndLogFilesFromFs A {@link List} of base and log files based on the file system.
   * @param activeTimeline        {@link HoodieActiveTimeline} instance.
   * @param archivedTimeline      {@link HoodieArchivedTimeline} instance.
   * @return A {@link List} of relative file paths to base path for removing.
   */
  public static List findInstantFilesToRemove(
      String instantToRepair, List baseAndLogFilesFromFs,
      HoodieActiveTimeline activeTimeline, HoodieArchivedTimeline archivedTimeline) {
    // Skips the instant if it is requested or inflight in active timeline
    if (activeTimeline.filter(instant -> instant.getTimestamp().equals(instantToRepair)
        && !instant.isCompleted()).getInstants().findAny().isPresent()) {
      return Collections.emptyList();
    }

    try {
      boolean doesInstantExist = false;
      Option> filesFromTimeline = Option.empty();
      Option instantOption = activeTimeline.filterCompletedInstants().filter(
          instant -> instant.getTimestamp().equals(instantToRepair)).firstInstant();
      if (instantOption.isPresent()) {
        // Completed instant in active timeline
        doesInstantExist = true;
        filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline(
            activeTimeline, instantOption.get());
      } else {
        instantOption = archivedTimeline.filterCompletedInstants().filter(
            instant -> instant.getTimestamp().equals(instantToRepair)).firstInstant();
        if (instantOption.isPresent()) {
          // Completed instant in archived timeline
          doesInstantExist = true;
          filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline(
              archivedTimeline, instantOption.get());
        }
      }

      if (doesInstantExist) {
        if (!filesFromTimeline.isPresent() || filesFromTimeline.get().isEmpty()) {
          // Skips if no instant details
          return Collections.emptyList();
        }
        // Excludes committed base and log files from timeline
        Set filesToRemove = new HashSet<>(baseAndLogFilesFromFs);
        filesToRemove.removeAll(filesFromTimeline.get());
        return new ArrayList<>(filesToRemove);
      } else {
        // The instant does not exist in the whole timeline (neither completed nor requested/inflight),
        // this means the files from this instant are dangling, which should be removed
        return baseAndLogFilesFromFs;
      }
    } catch (IOException e) {
      // In case of failure, does not remove any files for the instant
      return Collections.emptyList();
    }
  }

  /**
   * Serializable path filter class for Spark job.
   */
  public interface SerializablePathFilter extends PathFilter, Serializable {
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy