/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.utilities;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.HoodieStorageUtils;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.table.repair.RepairUtils;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* A tool, run with spark-submit, to repair a Hudi table by finding and deleting
* dangling base and log files.
*
* You can run this tool with the following command:
* ```
* spark-submit \
* --class org.apache.hudi.utilities.HoodieRepairTool \
* --driver-memory 4g \
* --executor-memory 1g \
* --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
* --conf spark.sql.catalogImplementation=hive \
* --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
* $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
* --mode dry_run \
* --base-path base_path \
* --assume-date-partitioning
* ```
*
* You can specify the running mode of the tool through `--mode`.
* There are three modes of the {@link HoodieRepairTool}:
* - REPAIR ("repair"): repairs the table by removing dangling data and log files not belonging to any commit.
* The removed files are going to be backed up at the backup path provided, in case recovery is needed.
* In this mode, backup path is required through `--backup-path`. You can also provide a range for repairing
* only the instants within the range, through `--start-instant-time` and `--end-instant-time`. You can also
* specify only one of them. If no range is provided, all instants are going to be repaired.
*
* Example command:
* ```
* spark-submit \
* --class org.apache.hudi.utilities.HoodieRepairTool \
* --driver-memory 4g \
* --executor-memory 1g \
* --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
* --conf spark.sql.catalogImplementation=hive \
* --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
* $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
* --mode repair \
* --base-path base_path \
* --backup-path backup_path \
* --start-instant-time ts1 \
* --end-instant-time ts2 \
* --assume-date-partitioning
* ```
*
* - DRY_RUN ("dry_run"): only looks for dangling data and log files. You can also provide a range for looking
* at only the instants within the range, through `--start-instant-time` and `--end-instant-time`. You can also
* specify only one of them. If no range is provided, all instants are going to be scanned.
*
* Example command:
* ```
* spark-submit \
* --class org.apache.hudi.utilities.HoodieRepairTool \
* --driver-memory 4g \
* --executor-memory 1g \
* --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
* --conf spark.sql.catalogImplementation=hive \
* --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
* $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
* --mode dry_run \
* --base-path base_path \
* --start-instant-time ts1 \
* --end-instant-time ts2 \
* --assume-date-partitioning
* ```
*
* - UNDO ("undo"): undoes the repair by copying back the files from backup directory to the table base path.
* In this mode, backup path is required through `--backup-path`.
*
* Example command:
* ```
* spark-submit \
* --class org.apache.hudi.utilities.HoodieRepairTool \
* --driver-memory 4g \
* --executor-memory 1g \
* --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
* --conf spark.sql.catalogImplementation=hive \
* --conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
* $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.12-0.11.0-SNAPSHOT.jar \
* --mode undo \
* --base-path base_path \
* --backup-path backup_path
* ```
*/
public class HoodieRepairTool {
private static final Logger LOG = LoggerFactory.getLogger(HoodieRepairTool.class);
private static final String BACKUP_DIR_PREFIX = "hoodie_repair_backup_";
// Repair config
private final Config cfg;
// Properties with source, hoodie client, key generator etc.
private TypedProperties props;
// Spark context
private final HoodieEngineContext context;
private final HoodieTableMetaClient metaClient;
private final HoodieTableMetadata tableMetadata;
public HoodieRepairTool(JavaSparkContext jsc, Config cfg) {
if (cfg.propsFilePath != null) {
cfg.propsFilePath = HadoopFSUtils.addSchemeIfLocalPath(cfg.propsFilePath).toString();
}
this.context = new HoodieSparkEngineContext(jsc);
this.cfg = cfg;
this.props = cfg.propsFilePath == null
? UtilHelpers.buildProperties(cfg.configs)
: readConfigFromFileSystem(jsc, cfg);
this.metaClient = HoodieTableMetaClient.builder()
.setConf(HadoopFSUtils.getStorageConfWithCopy(jsc.hadoopConfiguration())).setBasePath(cfg.basePath)
.setLoadActiveTimelineOnLoad(true)
.build();
this.tableMetadata = new FileSystemBackedTableMetadata(
context, metaClient.getTableConfig(), metaClient.getStorage(), cfg.basePath);
}
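// A minimal sketch of driving the tool programmatically instead of via
// spark-submit (the table path and Spark master below are hypothetical):
//
//   JavaSparkContext jsc =
//       UtilHelpers.buildSparkContext("hudi-table-repair", "local[2]", "1g");
//   HoodieRepairTool.Config config = new HoodieRepairTool.Config();
//   config.basePath = "file:///tmp/hudi_table"; // assumed table location
//   config.runningMode = "dry_run";             // scan only, delete nothing
//   boolean succeeded = new HoodieRepairTool(jsc, config).run();
//   jsc.stop();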
public boolean run() {
Option<String> startingInstantOption = Option.ofNullable(cfg.startingInstantTime);
Option<String> endingInstantOption = Option.ofNullable(cfg.endingInstantTime);
if (startingInstantOption.isPresent() && endingInstantOption.isPresent()) {
LOG.info(String.format("Start repairing completed instants between %s and %s (inclusive)",
startingInstantOption.get(), endingInstantOption.get()));
} else if (startingInstantOption.isPresent()) {
LOG.info(String.format("Start repairing completed instants from %s (inclusive)",
startingInstantOption.get()));
} else if (endingInstantOption.isPresent()) {
LOG.info(String.format("Start repairing completed instants till %s (inclusive)",
endingInstantOption.get()));
} else {
LOG.info("Start repairing all completed instants");
}
try {
Mode mode = Mode.valueOf(cfg.runningMode.toUpperCase());
switch (mode) {
case REPAIR:
LOG.info(" ****** The repair tool is in REPAIR mode, dangling data and logs files "
+ "not belonging to any commit are going to be DELETED from the table ******");
if (checkBackupPathForRepair() < 0) {
LOG.error("Backup path check failed.");
return false;
}
return doRepair(startingInstantOption, endingInstantOption, false);
case DRY_RUN:
LOG.info(" ****** The repair tool is in DRY_RUN mode, "
+ "only LOOKING FOR dangling data and log files from the table ******");
return doRepair(startingInstantOption, endingInstantOption, true);
case UNDO:
if (checkBackupPathAgainstBasePath() < 0) {
LOG.error("Backup path check failed.");
return false;
}
return undoRepair();
default:
LOG.info("Unsupported running mode [" + cfg.runningMode + "], quit the job directly");
return false;
}
} catch (IOException e) {
throw new HoodieIOException("Unable to repair table in " + cfg.basePath, e);
}
}
public static void main(String[] args) {
final Config cfg = new Config();
JCommander cmd = new JCommander(cfg, null, args);
if (cfg.help || args.length == 0) {
cmd.usage();
System.exit(1);
}
final JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-table-repair", cfg.sparkMaster, cfg.sparkMemory);
try {
new HoodieRepairTool(jsc, cfg).run();
} catch (Throwable throwable) {
LOG.error("Fail to run table repair for " + cfg.basePath, throwable);
} finally {
jsc.stop();
}
}
/**
* Copies the list of files from source base path to destination base path.
* The destination file path (base + relative) should not already exist.
*
* @param context {@link HoodieEngineContext} instance.
* @param relativeFilePaths A {@link List} of relative file paths for copying.
* @param sourceBasePath Source base path.
* @param destBasePath Destination base path.
* @return {@code true} if all successful; {@code false} otherwise.
*/
static boolean copyFiles(
HoodieEngineContext context, List<String> relativeFilePaths, String sourceBasePath,
String destBasePath) {
StorageConfiguration<?> conf = context.getStorageConf();
List<Boolean> allResults = context.parallelize(relativeFilePaths)
.mapPartitions(iterator -> {
List<Boolean> results = new ArrayList<>();
HoodieStorage storage = HoodieStorageUtils.getStorage(destBasePath, conf);
iterator.forEachRemaining(filePath -> {
boolean success = false;
StoragePath sourcePath = new StoragePath(sourceBasePath, filePath);
StoragePath destPath = new StoragePath(destBasePath, filePath);
try {
if (!storage.exists(destPath)) {
FileIOUtils.copy(storage, sourcePath, destPath);
success = true;
}
} catch (IOException e) {
// Copy failed; leave success as false
LOG.error(String.format("Failed to copy file: source [%s], destination [%s]",
sourcePath, destPath), e);
} finally {
results.add(success);
}
});
return results.iterator();
}, true)
.collectAsList();
return allResults.stream().reduce((r1, r2) -> r1 && r2).orElse(false);
}
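// Note on copyFiles semantics: a destination file that already exists is
// skipped rather than overwritten, and a skipped or failed copy contributes
// `false` to the reduced result, so the method only returns true when every
// path in the list was freshly copied. An empty input list also yields false.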
/**
* Lists all Hoodie files from the table base path.
*
* @param context {@link HoodieEngineContext} instance.
* @param basePathStr Table base path.
* @param expectedLevel Expected level in the directory hierarchy to include the file status.
* @param parallelism Parallelism for the file listing.
* @return A list of absolute file paths of all Hoodie files.
*/
static List<String> listFilesFromBasePath(
HoodieEngineContext context, String basePathStr, int expectedLevel, int parallelism) {
FileSystem fs = HadoopFSUtils.getFs(basePathStr, context.getStorageConf());
Path basePath = new Path(basePathStr);
return HadoopFSUtils.getFileStatusAtLevel(
context, fs, basePath, expectedLevel, parallelism).stream()
.filter(fileStatus -> {
if (!fileStatus.isFile()) {
return false;
}
return HadoopFSUtils.isDataFile(fileStatus.getPath());
})
.map(fileStatus -> fileStatus.getPath().toString())
.collect(Collectors.toList());
}
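// For example (hypothetical layout), a table date-partitioned as
// <base>/yyyy/mm/dd/<file> keeps its data files three directory levels below
// the base path, so callers pass expectedLevel = 3 to pick up exactly those files.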
/**
* Deletes files from table base path.
*
* @param context {@link HoodieEngineContext} instance.
* @param basePath Base path of the table.
* @param relativeFilePaths A {@link List} of relative file paths for deleting.
* @return {@code true} if all deletions are successful; {@code false} otherwise.
*/
static boolean deleteFiles(
HoodieEngineContext context, String basePath, List<String> relativeFilePaths) {
StorageConfiguration<?> conf = context.getStorageConf();
return context.parallelize(relativeFilePaths)
.mapPartitions(iterator -> {
FileSystem fs = HadoopFSUtils.getFs(basePath, conf);
List<Boolean> results = new ArrayList<>();
iterator.forEachRemaining(relativeFilePath -> {
boolean success = false;
try {
success = fs.delete(new Path(basePath, relativeFilePath), false);
} catch (IOException e) {
LOG.warn("Failed to delete file " + relativeFilePath);
} finally {
results.add(success);
}
});
return results.iterator();
}, true)
.collectAsList()
.stream().reduce((a, b) -> a && b)
.orElse(true);
}
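// Note on deleteFiles: fs.delete(..., false) is non-recursive, so only the
// listed files themselves are removed; any directories left empty are kept.
// An empty input list reduces to true (nothing to delete counts as success).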
/**
* Does repair, either in REPAIR or DRY_RUN mode.
*
* @param startingInstantOption {@link Option} of starting instant for scanning, can be empty.
* @param endingInstantOption {@link Option} of ending instant for scanning, can be empty.
* @param isDryRun Is dry run.
* @throws IOException upon errors.
*/
boolean doRepair(
Option<String> startingInstantOption, Option<String> endingInstantOption, boolean isDryRun) throws IOException {
// Scans all partitions to find base and log files in the base path
List<StoragePath> allFilesInPartitions =
HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
// Buckets the files based on instant time
// instant time -> relative paths of base and log files to base path
Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(
metaClient.getBasePath().toString(), allFilesInPartitions);
List<String> instantTimesToRepair = instantToFilesMap.keySet().stream()
.filter(instant -> (!startingInstantOption.isPresent()
|| instant.compareTo(startingInstantOption.get()) >= 0)
&& (!endingInstantOption.isPresent()
|| instant.compareTo(endingInstantOption.get()) <= 0)
).collect(Collectors.toList());
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline();
// This assumes that the archived timeline only has completed instants so this is safe
archivedTimeline.loadCompletedInstantDetailsInMemory();
List<ImmutablePair<String, List<String>>> instantFilesToRemove =
context.parallelize(instantTimesToRepair)
.map(instantToRepair ->
new ImmutablePair<>(instantToRepair, RepairUtils.findInstantFilesToRemove(instantToRepair,
instantToFilesMap.get(instantToRepair), activeTimeline, archivedTimeline)))
.collectAsList();
List<ImmutablePair<String, List<String>>> instantsWithDanglingFiles =
instantFilesToRemove.stream().filter(e -> !e.getValue().isEmpty()).collect(Collectors.toList());
printRepairInfo(instantTimesToRepair, instantsWithDanglingFiles);
if (!isDryRun) {
List<String> relativeFilePathsToDelete = instantsWithDanglingFiles.stream()
.flatMap(e -> e.getValue().stream())
.collect(Collectors.toList());
if (!relativeFilePathsToDelete.isEmpty()) {
if (!backupFiles(relativeFilePathsToDelete)) {
LOG.error("Error backing up dangling files. Exiting...");
return false;
}
return deleteFiles(context, cfg.basePath, relativeFilePathsToDelete);
}
LOG.info(String.format("Table repair on %s is successful", cfg.basePath));
}
return true;
}
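// Illustration of the bucketing above with a hypothetical file name: a base
// file "2022/01/01/uuid1_1-0-1_20220101000000.parquet" is tagged with instant
// "20220101000000"; if no completed commit with that timestamp references the
// file, it is reported (DRY_RUN) or backed up and deleted (REPAIR).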
/**
* Undoes repair for UNDO mode.
*
* @throws IOException upon errors.
*/
boolean undoRepair() throws IOException {
HoodieStorage storage = metaClient.getStorage();
String backupPathStr = cfg.backupPath;
StoragePath backupPath = new StoragePath(backupPathStr);
if (!storage.exists(backupPath)) {
LOG.error("Cannot find backup path: " + backupPath);
return false;
}
List<String> allPartitionPaths = tableMetadata.getAllPartitionPaths();
if (allPartitionPaths.isEmpty()) {
LOG.error("Cannot get one partition path since there is no partition available");
return false;
}
int partitionLevels = getExpectedLevelBasedOnPartitionPath(allPartitionPaths.get(0));
List<String> relativeFilePaths = listFilesFromBasePath(
context, backupPathStr, partitionLevels, cfg.parallelism).stream()
.map(filePath ->
HadoopFSUtils.getRelativePartitionPath(new Path(backupPathStr), new Path(filePath)))
.collect(Collectors.toList());
return restoreFiles(relativeFilePaths);
}
int getExpectedLevelBasedOnPartitionPath(String partitionPath) {
if (StringUtils.isNullOrEmpty(partitionPath)) {
return 0;
}
String[] partitionParts = partitionPath.split("/");
return partitionParts.length;
}
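// e.g. "" -> 0 (non-partitioned table), "2022" -> 1, "2022/01/01" -> 3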
/**
* Verifies the backup path for repair.
* If no backup path is configured, creates a new one under the temp folder.
* If the backup path already has files, reports an error to the user.
* If the backup path is within the table base path, reports an error too.
*
* @return {@code 0} if successful; {@code -1} otherwise.
* @throws IOException upon errors.
*/
int checkBackupPathForRepair() throws IOException {
if (cfg.backupPath == null) {
SecureRandom random = new SecureRandom();
long randomLong = random.nextLong();
cfg.backupPath = "/tmp/" + BACKUP_DIR_PREFIX + randomLong;
}
StoragePath backupPath = new StoragePath(cfg.backupPath);
if (metaClient.getStorage().exists(backupPath)
&& metaClient.getStorage().listDirectEntries(backupPath).size() > 0) {
LOG.error(String.format("Cannot use backup path %s: it is not empty", cfg.backupPath));
return -1;
}
return checkBackupPathAgainstBasePath();
}
/**
* Verifies the backup path against table base path.
* If the backup path is within the table base path, reports an error.
*
* @return {@code 0} if successful; {@code -1} otherwise.
*/
int checkBackupPathAgainstBasePath() {
if (cfg.backupPath == null) {
LOG.error("Backup path is not configured");
return -1;
}
if (cfg.backupPath.contains(cfg.basePath)) {
LOG.error(String.format("Cannot use backup path %s: it resides in the base path %s",
cfg.backupPath, cfg.basePath));
return -1;
}
return 0;
}
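// Note: the check above uses plain substring containment, so a sibling path
// that merely embeds the base path as text (e.g. backup "/data/tbl_backup"
// against base "/data/tbl") is also rejected; pick a backup location whose
// path does not contain the base path string at all.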
/**
* Backs up dangling files from table base path to backup path.
*
* @param relativeFilePaths A {@link List} of relative file paths for backup.
* @return {@code true} if all successful; {@code false} otherwise.
*/
boolean backupFiles(List relativeFilePaths) {
return copyFiles(context, relativeFilePaths, cfg.basePath, cfg.backupPath);
}
/**
* Restores dangling files from backup path to table base path.
*
* @param relativeFilePaths A {@link List} of relative file paths for restoring.
* @return {@code true} if all successful; {@code false} otherwise.
*/
boolean restoreFiles(List relativeFilePaths) {
return copyFiles(context, relativeFilePaths, cfg.backupPath, cfg.basePath);
}
/**
* Prints the repair info.
*
* @param instantTimesToRepair A list of instant times in consideration for repair.
* @param instantsWithDanglingFiles A list of instants with dangling files.
*/
private void printRepairInfo(
List<String> instantTimesToRepair, List<ImmutablePair<String, List<String>>> instantsWithDanglingFiles) {
int numInstantsToRepair = instantsWithDanglingFiles.size();
LOG.warn("Number of instants verified based on the base and log files: "
+ instantTimesToRepair.size());
LOG.warn("Instant timestamps: " + instantTimesToRepair);
LOG.warn("Number of instants to repair: " + numInstantsToRepair);
if (numInstantsToRepair > 0) {
instantsWithDanglingFiles.forEach(e -> LOG.warn(" ** Removing files: " + e.getValue()));
}
}
/**
* Reads config from the file system.
*
* @param jsc {@link JavaSparkContext} instance.
* @param cfg {@link Config} instance.
* @return the {@link TypedProperties} instance.
*/
private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs)
.getProps(true);
}
public static class Config implements Serializable {
@Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true)
public String basePath = null;
@Parameter(names = {"--mode", "-m"}, description = "Set job mode: Set \"repair\" means repairing the table "
+ "by removing dangling data and log files not belonging to any commit; "
+ "Set \"dry_run\" means only looking for dangling data and log files; "
+ "Set \"undo\" means undoing the repair by copying back the files from backup directory", required = true)
public String runningMode = null;
@Parameter(names = {"--start-instant-time", "-si"}, description = "Starting Instant time "
+ "for repair (inclusive)", required = false)
public String startingInstantTime = null;
@Parameter(names = {"--end-instant-time", "-ei"}, description = "Ending Instant time "
+ "for repair (inclusive)", required = false)
public String endingInstantTime = null;
@Parameter(names = {"--backup-path", "-bp"}, description = "Backup path for storing dangling data "
+ "and log files from the table", required = false)
public String backupPath = null;
@Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for repair", required = false)
public int parallelism = 2;
@Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false)
public String sparkMaster = null;
@Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false)
public String sparkMemory = "1g";
@Parameter(names = {"--assume-date-partitioning", "-dp"}, description = "whether the partition path "
+ "is date with three levels", required = false)
public Boolean assumeDatePartitioning = false;
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
@Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
+ "hoodie client for table repair")
public String propsFilePath = null;
@Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
+ "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
splitter = IdentitySplitter.class)
public List<String> configs = new ArrayList<>();
}
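// Hypothetical usage of the repeated --hoodie-conf flag: any key that can be
// placed in the --props file can also be passed inline, e.g.
//   --hoodie-conf hoodie.metadata.enable=false --hoodie-conf key2=value2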
public enum Mode {
// Repairs the table by removing dangling data and log files not belonging to any commit
REPAIR,
// Dry run by only looking for dangling data and log files
DRY_RUN,
// Undoes the repair by copying back the files from backup directory
UNDO
}
}