/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.cli.commands;

import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.HoodiePrintHelper;
import org.apache.hudi.cli.HoodieTableHeaderFields;
import org.apache.hudi.cli.utils.InputStreamConsumer;
import org.apache.hudi.cli.utils.SparkUtil;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.storage.StoragePath;

import org.apache.avro.AvroRuntimeException;
import org.apache.spark.launcher.SparkLauncher;
import org.apache.spark.sql.hudi.DeDupeType;
import org.apache.spark.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.shell.standard.ShellComponent;
import org.springframework.shell.standard.ShellMethod;
import org.springframework.shell.standard.ShellOption;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
import java.util.stream.Collectors;

import scala.collection.JavaConverters;

/**
* CLI command to display and trigger repair options.
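 *
 * <p>Example usage from the Hudi CLI shell, after connecting to a table (paths and
 * partition values below are illustrative placeholders):
 * <pre>
 *   repair addpartitionmeta --dryrun true
 *   repair deduplicate --duplicatedPartitionPath 2016/03/15 --repairedOutputPath /tmp/repaired
 *   repair overwrite-hoodie-props --new-props-file /tmp/new.properties
 * </pre>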
*/
@ShellComponent
public class RepairsCommand {

  private static final Logger LOG = LoggerFactory.getLogger(RepairsCommand.class);

  public static final String DEDUPLICATE_RETURN_PREFIX = "Deduplicated files placed in: ";
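
  /**
   * Launches a Spark job to de-duplicate records in {@code duplicatedPartitionPath} and write
   * repaired files under {@code repairedOutputPath}. With {@code --dryrun} (the default) the
   * table itself is left untouched and only the staged output is produced.
   */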
@ShellMethod(key = "repair deduplicate",
value = "De-duplicate a partition path contains duplicates & produce repaired files to replace with")
public String deduplicate(
@ShellOption(value = {"--duplicatedPartitionPath"}, defaultValue = "", help = "Partition Path containing the duplicates")
final String duplicatedPartitionPath,
@ShellOption(value = {"--repairedOutputPath"}, help = "Location to place the repaired files")
final String repairedOutputPath,
@ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path",
defaultValue = "") String sparkPropertiesPath,
@ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master,
@ShellOption(value = "--sparkMemory", defaultValue = "4G",
help = "Spark executor memory") final String sparkMemory,
@ShellOption(value = {"--dryrun"},
help = "Should we actually remove duplicates or just run and store result to repairedOutputPath",
defaultValue = "true") final boolean dryRun,
@ShellOption(value = {"--dedupeType"}, help = "Valid values are - insert_type, update_type and upsert_type",
defaultValue = "insert_type") final String dedupeType)
throws Exception {
if (!DeDupeType.values().contains(DeDupeType.withName(dedupeType))) {
throw new IllegalArgumentException("Please provide valid dedupe type!");
}
if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) {
sparkPropertiesPath =
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
}
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
SparkMain.addAppArgs(sparkLauncher, SparkMain.SparkCommand.DEDUPLICATE, master, sparkMemory,
duplicatedPartitionPath, repairedOutputPath, HoodieCLI.basePath, String.valueOf(dryRun), dedupeType);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
if (exitCode != 0) {
return "Deduplication failed!";
}
if (dryRun) {
return DEDUPLICATE_RETURN_PREFIX + repairedOutputPath;
} else {
return DEDUPLICATE_RETURN_PREFIX + duplicatedPartitionPath;
}
}
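
  /**
   * Checks every partition for the partition metadata file and, unless {@code --dryrun} is set,
   * creates it where missing using the latest commit time on the active timeline.
   */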
@ShellMethod(key = "repair addpartitionmeta", value = "Add partition metadata to a table, if not present")
public String addPartitionMeta(
@ShellOption(value = {"--dryrun"}, help = "Should we actually add or just print what would be done",
defaultValue = "true") final boolean dryRun)
throws IOException {
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
String latestCommit =
client.getActiveTimeline().getCommitAndReplaceTimeline().lastInstant().get().requestedTime();
    List<String> partitionPaths =
        FSUtils.getAllPartitionFoldersThreeLevelsDown(HoodieCLI.storage, HoodieCLI.basePath);
StoragePath basePath = client.getBasePath();
String[][] rows = new String[partitionPaths.size()][];
int ind = 0;
for (String partition : partitionPaths) {
StoragePath partitionPath = FSUtils.constructAbsolutePath(basePath, partition);
String[] row = new String[3];
row[0] = partition;
row[1] = "Yes";
row[2] = "None";
if (!HoodiePartitionMetadata.hasPartitionMetadata(HoodieCLI.storage, partitionPath)) {
row[1] = "No";
if (!dryRun) {
HoodiePartitionMetadata partitionMetadata =
new HoodiePartitionMetadata(HoodieCLI.storage, latestCommit, basePath, partitionPath,
client.getTableConfig().getPartitionMetafileFormat());
partitionMetadata.trySave();
row[2] = "Repaired";
}
}
rows[ind++] = row;
}
return HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_PARTITION_PATH,
HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows);
}
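
  /**
   * Overwrites the table's hoodie.properties with the properties read from a local file,
   * carrying over the table's initial version, and prints each property's old and new value.
   */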
@ShellMethod(key = "repair overwrite-hoodie-props",
value = "Overwrite hoodie.properties with provided file. Risky operation. Proceed with caution!")
public String overwriteHoodieProperties(
@ShellOption(value = {"--new-props-file"},
help = "Path to a properties file on local filesystem to overwrite the table's hoodie.properties with")
final String overwriteFilePath) throws IOException {
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
Properties newProps = new Properties();
try (FileInputStream fileInputStream = new FileInputStream(overwriteFilePath)) {
newProps.load(fileInputStream);
}
    Map<String, String> oldProps = client.getTableConfig().propsMap();
// Copy Initial Version from old-props to new-props
if (oldProps.containsKey(HoodieTableConfig.INITIAL_VERSION.key())) {
newProps.put(HoodieTableConfig.INITIAL_VERSION.key(), oldProps.get(HoodieTableConfig.INITIAL_VERSION.key()));
}
HoodieTableConfig.create(client.getStorage(), client.getMetaPath(), newProps);
// reload new props as checksum would have been added
newProps =
HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()).getTableConfig().getProps();
    TreeSet<String> allPropKeys = new TreeSet<>();
allPropKeys.addAll(
newProps.keySet().stream().map(Object::toString).collect(Collectors.toSet()));
allPropKeys.addAll(oldProps.keySet());
String[][] rows = new String[allPropKeys.size()][];
int ind = 0;
for (String propKey : allPropKeys) {
String[] row = new String[] {
propKey,
oldProps.getOrDefault(propKey, "null"),
newProps.getOrDefault(propKey, "null").toString()
};
rows[ind++] = row;
}
return HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_HOODIE_PROPERTY,
HoodieTableHeaderFields.HEADER_OLD_VALUE, HoodieTableHeaderFields.HEADER_NEW_VALUE}, rows);
}
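
  /**
   * Walks the pending (requested/inflight) clean instants and deletes any instant file whose
   * cleaner plan can no longer be deserialized from Avro.
   */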
@ShellMethod(key = "repair corrupted clean files", value = "repair corrupted clean files")
public void removeCorruptedPendingCleanAction() {
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    HoodieTimeline cleanerTimeline = client.getActiveTimeline().getCleanerTimeline();
LOG.info("Inspecting pending clean metadata in timeline for corrupted files");
cleanerTimeline.filterInflightsAndRequested().getInstants().forEach(instant -> {
try {
CleanerUtils.getCleanerPlan(client, instant);
} catch (AvroRuntimeException e) {
LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant);
TimelineUtils.deleteInstantFile(client.getStorage(), client.getTimelinePath(),
instant, client.getInstantFileNameGenerator());
} catch (IOException ioe) {
if (ioe.getMessage().contains("Not an Avro data file")) {
LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant);
TimelineUtils.deleteInstantFile(client.getStorage(), client.getTimelinePath(),
instant, client.getInstantFileNameGenerator());
} else {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
});
}
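
  /**
   * Logs a warning for each completed instant whose commit metadata is empty, which indicates
   * a failed commit.
   */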
@ShellMethod(key = "repair show empty commit metadata", value = "show failed commits")
public void showFailedCommits() {
HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    activeTimeline.filterCompletedInstants().getInstantsAsStream()
        .filter(activeTimeline::isEmpty)
        .forEach(hoodieInstant -> LOG.warn("Empty Commit: " + hoodieInstant));
}
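
  /**
   * For each partition, writes the partition metafile in the table's base file format where only
   * the text-format file exists, deletes the old text-format file, and finally records the new
   * format in hoodie.properties. With {@code --dryrun} (the default), only reports what would be
   * done.
   */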
@ShellMethod(key = "repair migrate-partition-meta", value = "Migrate all partition meta file currently stored in text format "
+ "to be stored in base file format. See HoodieTableConfig#PARTITION_METAFILE_USE_DATA_FORMAT.")
public String migratePartitionMeta(
@ShellOption(value = {"--dryrun"}, help = "dry run without modifying anything.", defaultValue = "true")
final boolean dryRun)
throws IOException {
HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(HoodieCLI.conf);
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
List partitionPaths =
FSUtils.getAllPartitionPaths(engineContext, client.getStorage(), client.getBasePath(), false);
StoragePath basePath = client.getBasePath();
String[][] rows = new String[partitionPaths.size()][];
int ind = 0;
for (String partitionPath : partitionPaths) {
StoragePath partition =
FSUtils.constructAbsolutePath(client.getBasePath(), partitionPath);
      Option<StoragePath> textFormatFile =
          HoodiePartitionMetadata.textFormatMetaPathIfExists(HoodieCLI.storage, partition);
      Option<StoragePath> baseFormatFile =
          HoodiePartitionMetadata.baseFormatMetaPathIfExists(HoodieCLI.storage, partition);
String latestCommit =
client.getActiveTimeline().getCommitAndReplaceTimeline().lastInstant().get().requestedTime();
String[] row = new String[] {
partitionPath,
String.valueOf(textFormatFile.isPresent()),
String.valueOf(baseFormatFile.isPresent()),
textFormatFile.isPresent() ? "MIGRATE" : "NONE"
};
if (!dryRun) {
if (!baseFormatFile.isPresent()) {
HoodiePartitionMetadata partitionMetadata =
new HoodiePartitionMetadata(HoodieCLI.storage, latestCommit, basePath, partition,
Option.of(client.getTableConfig().getBaseFileFormat()));
partitionMetadata.trySave();
}
// delete it, in case we failed midway last time.
textFormatFile.ifPresent(path -> {
try {
HoodieCLI.storage.deleteFile(path);
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
});
row[3] = "MIGRATED";
}
rows[ind++] = row;
}
    // Record the new metafile format in hoodie.properties; a dry run must not modify the table.
    if (!dryRun) {
      Properties props = new Properties();
      props.setProperty(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(), "true");
      HoodieTableConfig.update(HoodieCLI.storage, client.getMetaPath(), props);
    }
return HoodiePrintHelper.print(new String[] {
HoodieTableHeaderFields.HEADER_PARTITION_PATH,
HoodieTableHeaderFields.HEADER_TEXT_METAFILE_PRESENT,
HoodieTableHeaderFields.HEADER_BASE_METAFILE_PRESENT,
HoodieTableHeaderFields.HEADER_ACTION
}, rows);
}
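
  /**
   * Launches a Spark job that re-writes data from the deprecated "default" partition into
   * {@link PartitionPathEncodeUtils#DEFAULT_PARTITION_PATH}.
   */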
@ShellMethod(key = "repair deprecated partition",
value = "Repair deprecated partition (\"default\"). Re-writes data from the deprecated partition into " + PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH)
public String repairDeprecatePartition(
@ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path",
defaultValue = "") String sparkPropertiesPath,
@ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master,
@ShellOption(value = "--sparkMemory", defaultValue = "4G",
help = "Spark executor memory") final String sparkMemory) throws Exception {
if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) {
sparkPropertiesPath =
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
}
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
SparkMain.addAppArgs(sparkLauncher, SparkMain.SparkCommand.REPAIR_DEPRECATED_PARTITION, master, sparkMemory,
HoodieCLI.basePath);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
if (exitCode != 0) {
return "Deduplication failed!";
}
return "Repair succeeded";
}
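
  /**
   * Launches a Spark job that re-writes the data of {@code --oldPartition} under
   * {@code --newPartition}.
   */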
@ShellMethod(key = "rename partition",
value = "Rename partition. Usage: rename partition --oldPartition --newPartition ")
public String renamePartition(
@ShellOption(value = {"--oldPartition"}, help = "Partition value to be renamed") String oldPartition,
@ShellOption(value = {"--newPartition"}, help = "New partition value after rename") String newPartition,
@ShellOption(value = {"--sparkProperties"}, help = "Spark Properties File Path",
defaultValue = "") String sparkPropertiesPath,
@ShellOption(value = "--sparkMaster", defaultValue = "", help = "Spark Master") String master,
@ShellOption(value = "--sparkMemory", defaultValue = "4G",
help = "Spark executor memory") final String sparkMemory) throws Exception {
if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) {
sparkPropertiesPath =
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
}
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
SparkMain.addAppArgs(sparkLauncher, SparkMain.SparkCommand.RENAME_PARTITION, master, sparkMemory,
HoodieCLI.basePath, oldPartition, newPartition);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Rename partition failed!";
    }
    return "Rename partition succeeded";
}
}