org.apache.hudi.utilities.HoodieRepairTool

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.utilities;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.HoodieStorageUtils;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.table.repair.RepairUtils;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * A tool with spark-submit to repair a Hudi table by finding and deleting dangling
 * base and log files.
 * <p>
 * You can run this tool with the following command:
 * ```
 * spark-submit \
 * --class org.apache.hudi.utilities.HoodieRepairTool \
 * --driver-memory 4g \
 * --executor-memory 1g \
 * --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
 * --conf spark.sql.catalogImplementation=hive \
 * --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
 * $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
 * --mode dry_run \
 * --base-path base_path \
 * --assume-date-partitioning
 * ```
 * <p>
 * You can specify the running mode of the tool through `--mode`.
 * There are three modes of the {@link HoodieRepairTool}:
 * - REPAIR ("repair"): repairs the table by removing dangling data and log files not belonging to any commit.
 * The removed files are going to be backed up at the backup path provided, in case recovery is needed.
 * In this mode, backup path is required through `--backup-path`. You can also provide a range for repairing
 * only the instants within the range, through `--start-instant-time` and `--end-instant-time`. You can also
 * specify only one of them. If no range is provided, all instants are going to be repaired.
 * <p>
 * Example command:
 * ```
 * spark-submit \
 * --class org.apache.hudi.utilities.HoodieRepairTool \
 * --driver-memory 4g \
 * --executor-memory 1g \
 * --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
 * --conf spark.sql.catalogImplementation=hive \
 * --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
 * $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
 * --mode repair \
 * --base-path base_path \
 * --backup-path backup_path \
 * --start-instant-time ts1 \
 * --end-instant-time ts2 \
 * --assume-date-partitioning
 * ```
 * <p>
 * - DRY_RUN ("dry_run"): only looks for dangling data and log files. You can also provide a range for looking
 * at only the instants within the range, through `--start-instant-time` and `--end-instant-time`. You can also
 * specify only one of them. If no range is provided, all instants are going to be scanned.
 * <p>
 * Example command:
 * ```
 * spark-submit \
 * --class org.apache.hudi.utilities.HoodieRepairTool \
 * --driver-memory 4g \
 * --executor-memory 1g \
 * --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
 * --conf spark.sql.catalogImplementation=hive \
 * --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
 * $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
 * --mode dry_run \
 * --base-path base_path \
 * --start-instant-time ts1 \
 * --end-instant-time ts2 \
 * --assume-date-partitioning
 * ```
 * <p>
 * - UNDO ("undo"): undoes the repair by copying back the files from backup directory to the table base path.
 * In this mode, backup path is required through `--backup-path`.
 * <p>
 * Example command:
 * ```
 * spark-submit \
 * --class org.apache.hudi.utilities.HoodieRepairTool \
 * --driver-memory 4g \
 * --executor-memory 1g \
 * --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
 * --conf spark.sql.catalogImplementation=hive \
 * --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
 * $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
 * --mode undo \
 * --base-path base_path \
 * --backup-path backup_path
 * ```
 */
public class HoodieRepairTool {

  private static final Logger LOG = LoggerFactory.getLogger(HoodieRepairTool.class);
  private static final String BACKUP_DIR_PREFIX = "hoodie_repair_backup_";

  // Repair config
  private final Config cfg;
  // Properties with source, hoodie client, key generator etc.
  private TypedProperties props;
  // Spark context
  private final HoodieEngineContext context;
  private final HoodieTableMetaClient metaClient;
  private final HoodieTableMetadata tableMetadata;

  public HoodieRepairTool(JavaSparkContext jsc, Config cfg) {
    if (cfg.propsFilePath != null) {
      cfg.propsFilePath = HadoopFSUtils.addSchemeIfLocalPath(cfg.propsFilePath).toString();
    }
    this.context = new HoodieSparkEngineContext(jsc);
    this.cfg = cfg;
    this.props = cfg.propsFilePath == null
        ? UtilHelpers.buildProperties(cfg.configs)
        : readConfigFromFileSystem(jsc, cfg);
    this.metaClient = HoodieTableMetaClient.builder()
        .setConf(HadoopFSUtils.getStorageConfWithCopy(jsc.hadoopConfiguration())).setBasePath(cfg.basePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
    this.tableMetadata = new FileSystemBackedTableMetadata(
        context, metaClient.getTableConfig(), metaClient.getStorage(), cfg.basePath);
  }

  public boolean run() {
    Option<String> startingInstantOption = Option.ofNullable(cfg.startingInstantTime);
    Option<String> endingInstantOption = Option.ofNullable(cfg.endingInstantTime);

    if (startingInstantOption.isPresent() && endingInstantOption.isPresent()) {
      LOG.info(String.format("Start repairing completed instants between %s and %s (inclusive)",
          startingInstantOption.get(), endingInstantOption.get()));
    } else if (startingInstantOption.isPresent()) {
      LOG.info(String.format("Start repairing completed instants from %s (inclusive)",
          startingInstantOption.get()));
    } else if (endingInstantOption.isPresent()) {
      LOG.info(String.format("Start repairing completed instants till %s (inclusive)",
          endingInstantOption.get()));
    } else {
      LOG.info("Start repairing all completed instants");
    }

    try {
      Mode mode = Mode.valueOf(cfg.runningMode.toUpperCase());
      switch (mode) {
        case REPAIR:
          LOG.info(" ****** The repair tool is in REPAIR mode, dangling data and log files "
              + "not belonging to any commit are going to be DELETED from the table ******");
          if (checkBackupPathForRepair() < 0) {
            LOG.error("Backup path check failed.");
            return false;
          }
          return doRepair(startingInstantOption, endingInstantOption, false);
        case DRY_RUN:
          LOG.info(" ****** The repair tool is in DRY_RUN mode, "
              + "only LOOKING FOR dangling data and log files from the table ******");
          return doRepair(startingInstantOption, endingInstantOption, true);
        case UNDO:
          if (checkBackupPathAgainstBasePath() < 0) {
            LOG.error("Backup path check failed.");
            return false;
          }
          return undoRepair();
        default:
          LOG.info("Unsupported running mode [" + cfg.runningMode + "], quit the job directly");
          return false;
      }
    } catch (IOException e) {
      throw new HoodieIOException("Unable to repair table in " + cfg.basePath, e);
    }
  }
  public static void main(String[] args) {
    final Config cfg = new Config();
    JCommander cmd = new JCommander(cfg, null, args);
    if (cfg.help || args.length == 0) {
      cmd.usage();
      System.exit(1);
    }
    final JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-table-repair", cfg.sparkMaster, cfg.sparkMemory);
    try {
      new HoodieRepairTool(jsc, cfg).run();
    } catch (Throwable throwable) {
      LOG.error("Fail to run table repair for " + cfg.basePath, throwable);
    } finally {
      jsc.stop();
    }
  }

  /**
   * Copies the list of files from source base path to destination base path.
   * The destination file path (base + relative) should not already exist.
   *
   * @param context           {@link HoodieEngineContext} instance.
   * @param relativeFilePaths A {@link List} of relative file paths for copying.
   * @param sourceBasePath    Source base path.
   * @param destBasePath      Destination base path.
   * @return {@code true} if all successful; {@code false} otherwise.
   */
  static boolean copyFiles(
      HoodieEngineContext context, List<String> relativeFilePaths, String sourceBasePath,
      String destBasePath) {
    StorageConfiguration<?> conf = context.getStorageConf();
    List<Boolean> allResults = context.parallelize(relativeFilePaths)
        .mapPartitions(iterator -> {
          List<Boolean> results = new ArrayList<>();
          HoodieStorage storage = HoodieStorageUtils.getStorage(destBasePath, conf);
          iterator.forEachRemaining(filePath -> {
            boolean success = false;
            StoragePath sourcePath = new StoragePath(sourceBasePath, filePath);
            StoragePath destPath = new StoragePath(destBasePath, filePath);
            try {
              if (!storage.exists(destPath)) {
                FileIOUtils.copy(storage, sourcePath, destPath);
                success = true;
              }
            } catch (IOException e) {
              // Copy failed
              LOG.error(String.format("Copying file fails: source [%s], destination [%s]",
                  sourcePath, destPath));
            } finally {
              results.add(success);
            }
          });
          return results.iterator();
        }, true)
        .collectAsList();
    return allResults.stream().reduce((r1, r2) -> r1 && r2).orElse(false);
  }

  /**
   * Lists all Hoodie files from the table base path.
   *
   * @param context       {@link HoodieEngineContext} instance.
   * @param basePathStr   Table base path.
   * @param expectedLevel Expected level in the directory hierarchy to include the file status.
   * @param parallelism   Parallelism for the file listing.
   * @return A list of absolute file paths of all Hoodie files.
   * @throws IOException upon errors.
   */
  static List<String> listFilesFromBasePath(
      HoodieEngineContext context, String basePathStr, int expectedLevel, int parallelism) {
    FileSystem fs = HadoopFSUtils.getFs(basePathStr, context.getStorageConf());
    Path basePath = new Path(basePathStr);
    return HadoopFSUtils.getFileStatusAtLevel(
        context, fs, basePath, expectedLevel, parallelism).stream()
        .filter(fileStatus -> {
          if (!fileStatus.isFile()) {
            return false;
          }
          return HadoopFSUtils.isDataFile(fileStatus.getPath());
        })
        .map(fileStatus -> fileStatus.getPath().toString())
        .collect(Collectors.toList());
  }

  /**
   * Deletes files from table base path.
   *
   * @param context           {@link HoodieEngineContext} instance.
   * @param basePath          Base path of the table.
   * @param relativeFilePaths A {@link List} of relative file paths for deleting.
   */
  static boolean deleteFiles(
      HoodieEngineContext context, String basePath, List<String> relativeFilePaths) {
    StorageConfiguration<?> conf = context.getStorageConf();
    return context.parallelize(relativeFilePaths)
        .mapPartitions(iterator -> {
          FileSystem fs = HadoopFSUtils.getFs(basePath, conf);
          List<Boolean> results = new ArrayList<>();
          iterator.forEachRemaining(relativeFilePath -> {
            boolean success = false;
            try {
              success = fs.delete(new Path(basePath, relativeFilePath), false);
            } catch (IOException e) {
              LOG.warn("Failed to delete file " + relativeFilePath);
            } finally {
              results.add(success);
            }
          });
          return results.iterator();
        }, true)
        .collectAsList()
        .stream().reduce((a, b) -> a && b)
        .orElse(true);
  }

  /**
   * Does repair, either in REPAIR or DRY_RUN mode.
   *
   * @param startingInstantOption {@link Option} of starting instant for scanning, can be empty.
   * @param endingInstantOption   {@link Option} of ending instant for scanning, can be empty.
   * @param isDryRun              Is dry run.
   * @throws IOException upon errors.
   */
  boolean doRepair(
      Option<String> startingInstantOption, Option<String> endingInstantOption, boolean isDryRun) throws IOException {
    // Scans all partitions to find base and log files in the base path
    List<StoragePath> allFilesInPartitions =
        HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
    // Buckets the files based on instant time
    // instant time -> relative paths of base and log files to base path
    Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(
        metaClient.getBasePath().toString(), allFilesInPartitions);
    List<String> instantTimesToRepair = instantToFilesMap.keySet().stream()
        .filter(instant -> (!startingInstantOption.isPresent()
            || instant.compareTo(startingInstantOption.get()) >= 0)
            && (!endingInstantOption.isPresent()
            || instant.compareTo(endingInstantOption.get()) <= 0)
        ).collect(Collectors.toList());

    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline();
    // This assumes that the archived timeline only has completed instants so this is safe
    archivedTimeline.loadCompletedInstantDetailsInMemory();

    List<ImmutablePair<String, List<String>>> instantFilesToRemove = context.parallelize(instantTimesToRepair)
        .map(instantToRepair ->
            new ImmutablePair<>(instantToRepair, RepairUtils.findInstantFilesToRemove(instantToRepair,
                instantToFilesMap.get(instantToRepair), activeTimeline, archivedTimeline)))
        .collectAsList();
    List<ImmutablePair<String, List<String>>> instantsWithDanglingFiles =
        instantFilesToRemove.stream().filter(e -> !e.getValue().isEmpty()).collect(Collectors.toList());
    printRepairInfo(instantTimesToRepair, instantsWithDanglingFiles);
    if (!isDryRun) {
      List<String> relativeFilePathsToDelete = instantsWithDanglingFiles.stream()
          .flatMap(e -> e.getValue().stream())
          .collect(Collectors.toList());
      if (relativeFilePathsToDelete.size() > 0) {
        if (!backupFiles(relativeFilePathsToDelete)) {
          LOG.error("Error backing up dangling files. Exiting...");
          return false;
        }
        return deleteFiles(context, cfg.basePath, relativeFilePathsToDelete);
      }
      LOG.info(String.format("Table repair on %s is successful", cfg.basePath));
    }
    return true;
  }

  /**
   * Undoes repair for UNDO mode.
   *
   * @throws IOException upon errors.
   */
  boolean undoRepair() throws IOException {
    HoodieStorage storage = metaClient.getStorage();
    String backupPathStr = cfg.backupPath;
    StoragePath backupPath = new StoragePath(backupPathStr);
    if (!storage.exists(backupPath)) {
      LOG.error("Cannot find backup path: " + backupPath);
      return false;
    }

    List<String> allPartitionPaths = tableMetadata.getAllPartitionPaths();

    if (allPartitionPaths.isEmpty()) {
      LOG.error("Cannot get one partition path since there is no partition available");
      return false;
    }

    int partitionLevels = getExpectedLevelBasedOnPartitionPath(allPartitionPaths.get(0));

    List<String> relativeFilePaths = listFilesFromBasePath(
        context, backupPathStr, partitionLevels, cfg.parallelism).stream()
        .map(filePath ->
            HadoopFSUtils.getRelativePartitionPath(new Path(backupPathStr), new Path(filePath)))
        .collect(Collectors.toList());
    return restoreFiles(relativeFilePaths);
  }

  int getExpectedLevelBasedOnPartitionPath(String partitionPath) {
    if (StringUtils.isNullOrEmpty(partitionPath)) {
      return 0;
    }
    String[] partitionParts = partitionPath.split("/");
    return partitionParts.length;
  }

  /**
   * Verifies the backup path for repair.
   * If there is no backup path configured, creates a new one in temp folder.
   * If the backup path already has files, throws an error to the user.
   * If the backup path is within the table base path, throws an error too.
   *
   * @return {@code 0} if successful; {@code -1} otherwise.
   * @throws IOException upon errors.
   */
  int checkBackupPathForRepair() throws IOException {
    if (cfg.backupPath == null) {
      SecureRandom random = new SecureRandom();
      long randomLong = random.nextLong();
      cfg.backupPath = "/tmp/" + BACKUP_DIR_PREFIX + randomLong;
    }
    StoragePath backupPath = new StoragePath(cfg.backupPath);
    if (metaClient.getStorage().exists(backupPath)
        && metaClient.getStorage().listDirectEntries(backupPath).size() > 0) {
      LOG.error(String.format("Cannot use backup path %s: it is not empty", cfg.backupPath));
      return -1;
    }

    return checkBackupPathAgainstBasePath();
  }

  /**
   * Verifies the backup path against table base path.
   * If the backup path is within the table base path, throws an error.
   *
   * @return {@code 0} if successful; {@code -1} otherwise.
   */
  int checkBackupPathAgainstBasePath() {
    if (cfg.backupPath == null) {
      LOG.error("Backup path is not configured");
      return -1;
    }

    if (cfg.backupPath.contains(cfg.basePath)) {
      LOG.error(String.format("Cannot use backup path %s: it resides in the base path %s",
          cfg.backupPath, cfg.basePath));
      return -1;
    }
    return 0;
  }

  /**
   * Backs up dangling files from table base path to backup path.
   *
   * @param relativeFilePaths A {@link List} of relative file paths for backup.
   * @return {@code true} if all successful; {@code false} otherwise.
   */
  boolean backupFiles(List<String> relativeFilePaths) {
    return copyFiles(context, relativeFilePaths, cfg.basePath, cfg.backupPath);
  }

  /**
   * Restores dangling files from backup path to table base path.
   *
   * @param relativeFilePaths A {@link List} of relative file paths for restoring.
   * @return {@code true} if all successful; {@code false} otherwise.
   */
  boolean restoreFiles(List<String> relativeFilePaths) {
    return copyFiles(context, relativeFilePaths, cfg.backupPath, cfg.basePath);
  }

  /**
   * Prints the repair info.
   *
   * @param instantTimesToRepair      A list of instant times in consideration for repair.
   * @param instantsWithDanglingFiles A list of instants with dangling files.
   */
  private void printRepairInfo(
      List<String> instantTimesToRepair, List<ImmutablePair<String, List<String>>> instantsWithDanglingFiles) {
    int numInstantsToRepair = instantsWithDanglingFiles.size();
    LOG.warn("Number of instants verified based on the base and log files: " + instantTimesToRepair.size());
    LOG.warn("Instant timestamps: " + instantTimesToRepair);
    LOG.warn("Number of instants to repair: " + numInstantsToRepair);
    if (numInstantsToRepair > 0) {
      instantsWithDanglingFiles.forEach(e -> LOG.warn(" ** Removing files: " + e.getValue()));
    }
  }

  /**
   * Reads config from the file system.
   *
   * @param jsc {@link JavaSparkContext} instance.
   * @param cfg {@link Config} instance.
   * @return the {@link TypedProperties} instance.
   */
  private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
    return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs)
        .getProps(true);
  }

  public static class Config implements Serializable {
    @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true)
    public String basePath = null;
    @Parameter(names = {"--mode", "-m"}, description = "Set job mode: Set \"repair\" means repairing the table "
        + "by removing dangling data and log files not belonging to any commit; "
        + "Set \"dry_run\" means only looking for dangling data and log files; "
        + "Set \"undo\" means undoing the repair by copying back the files from backup directory", required = true)
    public String runningMode = null;
    @Parameter(names = {"--start-instant-time", "-si"}, description = "Starting Instant time "
        + "for repair (inclusive)", required = false)
    public String startingInstantTime = null;
    @Parameter(names = {"--end-instant-time", "-ei"}, description = "Ending Instant time "
        + "for repair (inclusive)", required = false)
    public String endingInstantTime = null;
    @Parameter(names = {"--backup-path", "-bp"}, description = "Backup path for storing dangling data "
        + "and log files from the table", required = false)
    public String backupPath = null;
    @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for repair", required = false)
    public int parallelism = 2;
    @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false)
    public String sparkMaster = null;
    @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false)
    public String sparkMemory = "1g";
    @Parameter(names = {"--assume-date-partitioning", "-dp"}, description = "whether the partition path "
        + "is date with three levels", required = false)
    public Boolean assumeDatePartitioning = false;
    @Parameter(names = {"--help", "-h"}, help = true)
    public Boolean help = false;
    @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
        + "hoodie client for table repair")
    public String propsFilePath = null;
    @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
        + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
        splitter = IdentitySplitter.class)
    public List<String> configs = new ArrayList<>();
  }

  public enum Mode {
    // Repairs the table by removing dangling data and log files not belonging to any commit
    REPAIR,
    // Dry run by only looking for dangling data and log files
    DRY_RUN,
    // Undoes the repair by copying back the files from backup directory
    UNDO
  }
}
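
Besides spark-submit, the tool can also be driven programmatically, since the constructor, Config, and run() are public. Below is a minimal sketch (not part of the original source) that runs a dry run from a local Spark context; the application name, local master, and paths are placeholders for illustration only.

// Sketch only: invokes HoodieRepairTool in dry_run mode from Java code.
// Assumes the hudi-utilities bundle and Spark are on the classpath.
import org.apache.hudi.utilities.HoodieRepairTool;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class RepairToolDryRunExample {
  public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf()
        .setAppName("hudi-table-repair-example")   // placeholder app name
        .setMaster("local[2]")                     // placeholder master for local testing
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    try {
      HoodieRepairTool.Config cfg = new HoodieRepairTool.Config();
      cfg.basePath = "/tmp/hudi_table";            // placeholder table base path
      cfg.runningMode = "dry_run";                 // only report dangling files, delete nothing
      boolean succeeded = new HoodieRepairTool(jsc, cfg).run();
      System.out.println("Dry run finished, success = " + succeeded);
    } finally {
      jsc.stop();
    }
  }
}

A dry run like this only lists the dangling base and log files; switching runningMode to "repair" additionally requires a backupPath outside the table base path, as enforced by checkBackupPathForRepair() above.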




