/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.utilities;
import org.apache.hudi.async.HoodieAsyncService;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.table.repair.RepairUtils;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.common.table.timeline.InstantComparison.LESSER_THAN;
import static org.apache.hudi.common.table.timeline.InstantComparison.compareTimestamps;
/**
* TODO: [HUDI-8294]
* A validator, run via spark-submit, that ensures there are no dangling data files in the data table:
* - No data files exist for commits prior to the start of the active timeline.
* - No extra data files exist for completed commits beyond what is recorded in the commit metadata.
*
* By default, this validator validates the data files only once.
*
* Example command:
* ```
* spark-submit \
* --class org.apache.hudi.utilities.HoodieDataTableValidator \
* --master spark://xxxx:7077 \
* --driver-memory 1g \
* --executor-memory 1g \
* $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
* --base-path basePath
* ```
*
*
* You can also pass `--continuous` to run this validator as a long-running process,
* and use `--min-validate-interval-seconds` to control the validation frequency (default: 10 minutes).
*
* Example command:
* ```
* spark-submit \
* --class org.apache.hudi.utilities.HoodieDataTableValidator \
* --master spark://xxxx:7077 \
* --driver-memory 1g \
* --executor-memory 1g \
* $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
* --base-path basePath \
* --continuous \
* --min-validate-interval-seconds 60
* ```
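*
* The validator can also be embedded programmatically; a minimal sketch (the local master URL and
* base path below are illustrative placeholders):
* <pre>{@code
* SparkConf sparkConf = UtilHelpers.buildSparkConf("Hoodie-Data-Table-Validator", "local[2]");
* try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
*   HoodieDataTableValidator.Config cfg = new HoodieDataTableValidator.Config();
*   cfg.basePath = "/path/to/hudi/table";
*   new HoodieDataTableValidator(jsc, cfg).run();
* }
* }</pre>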
*/
public class HoodieDataTableValidator implements Serializable {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(HoodieDataTableValidator.class);
// Spark context
private transient JavaSparkContext jsc;
// config
private Config cfg;
// Properties with source, hoodie client, key generator etc.
private TypedProperties props;
private HoodieTableMetaClient metaClient;
protected transient Option<AsyncDataTableValidateService> asyncDataTableValidateService;
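/**
* Creates a validator around an existing {@link HoodieTableMetaClient}.
*/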
public HoodieDataTableValidator(HoodieTableMetaClient metaClient) {
this.metaClient = metaClient;
}
public HoodieDataTableValidator(JavaSparkContext jsc, Config cfg) {
this.jsc = jsc;
this.cfg = cfg;
this.props = cfg.propsFilePath == null
? UtilHelpers.buildProperties(cfg.configs)
: readConfigFromFileSystem(jsc, cfg);
this.metaClient = HoodieTableMetaClient.builder()
.setConf(HadoopFSUtils.getStorageConfWithCopy(jsc.hadoopConfiguration()))
.setBasePath(cfg.basePath)
.setLoadActiveTimelineOnLoad(true)
.build();
this.asyncDataTableValidateService = cfg.continuous ? Option.of(new AsyncDataTableValidateService()) : Option.empty();
}
/**
* Reads config from the file system.
*
* @param jsc {@link JavaSparkContext} instance.
* @param cfg {@link Config} instance.
* @return the {@link TypedProperties} instance.
*/
private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs)
.getProps(true);
}
public static class Config implements Serializable {
@Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true)
public String basePath = null;
@Parameter(names = {"--continuous"}, description = "Run HoodieDataTableValidator in continuous mode. "
+ "Can use --min-validate-interval-seconds to control validation frequency", required = false)
public boolean continuous = false;
@Parameter(names = {"--min-validate-interval-seconds"},
description = "Minimum interval, in seconds, between validation runs when --continuous is set. Default is 10 minutes.")
public Integer minValidateIntervalSeconds = 10 * 60;
@Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for validation", required = false)
public int parallelism = 200;
@Parameter(names = {"--ignore-failed", "-ig"}, description = "Ignore data table validate failure and continue.", required = false)
public boolean ignoreFailed = false;
@Parameter(names = {"--assume-date-partitioning"}, description = "Should HoodieWriteClient assume the data is partitioned by dates, i.e., three levels from base path. "
+ "This is a stop-gap to support tables created by versions < 0.3.1. Will be removed eventually.", required = false)
public Boolean assumeDatePartitioning = false;
@Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false)
public String sparkMaster = null;
@Parameter(names = {"--spark-memory", "-sm"}, description = "Spark memory to use", required = false)
public String sparkMemory = "1g";
@Parameter(names = {"--props"}, description = "Path to a properties file on local FS or DFS, with configurations for "
+ "the hoodie client")
public String propsFilePath = null;
@Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
+ "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated",
splitter = IdentitySplitter.class)
public List<String> configs = new ArrayList<>();
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
@Override
public String toString() {
return "DataTableValidatorConfig {\n"
+ " --base-path " + basePath + ", \n"
+ " --continuous " + continuous + ", \n"
+ " --ignore-failed " + ignoreFailed + ", \n"
+ " --min-validate-interval-seconds " + minValidateIntervalSeconds + ", \n"
+ " --parallelism " + parallelism + ", \n"
+ " --spark-master " + sparkMaster + ", \n"
+ " --spark-memory " + sparkMemory + ", \n"
+ " --assume-date-partitioning " + assumeDatePartitioning + ", \n"
+ " --props " + propsFilePath + ", \n"
+ " --hoodie-conf " + configs
+ "\n}";
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
Config config = (Config) o;
return basePath.equals(config.basePath)
&& Objects.equals(continuous, config.continuous)
&& Objects.equals(minValidateIntervalSeconds, config.minValidateIntervalSeconds)
&& Objects.equals(parallelism, config.parallelism)
&& Objects.equals(ignoreFailed, config.ignoreFailed)
&& Objects.equals(sparkMaster, config.sparkMaster)
&& Objects.equals(sparkMemory, config.sparkMemory)
&& Objects.equals(assumeDatePartitioning, config.assumeDatePartitioning)
&& Objects.equals(propsFilePath, config.propsFilePath)
&& Objects.equals(configs, config.configs);
}
@Override
public int hashCode() {
return Objects.hash(basePath, continuous, minValidateIntervalSeconds, parallelism, ignoreFailed, sparkMaster, sparkMemory,
assumeDatePartitioning, propsFilePath, configs);
}
}
public static void main(String[] args) {
final Config cfg = new Config();
JCommander cmd = new JCommander(cfg, null, args);
if (cfg.help || args.length == 0) {
cmd.usage();
System.exit(1);
}
SparkConf sparkConf = UtilHelpers.buildSparkConf("Hoodie-Data-Table-Validator", cfg.sparkMaster);
sparkConf.set("spark.executor.memory", cfg.sparkMemory);
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
HoodieDataTableValidator validator = new HoodieDataTableValidator(jsc, cfg);
try {
validator.run();
} catch (Throwable throwable) {
LOG.error("Failed to run data table validation for " + validator.cfg, throwable);
} finally {
jsc.stop();
}
}
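/**
* Runs the validation once, or continuously when {@code --continuous} is set, and shuts down
* the async validation service on exit.
*/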
public void run() {
try {
LOG.info(cfg.toString());
if (cfg.continuous) {
LOG.info(" ****** do hoodie data table validation in CONTINUOUS mode ******");
doHoodieDataTableValidationContinuous();
} else {
LOG.info(" ****** do hoodie data table validation once ******");
doHoodieDataTableValidationOnce();
}
} catch (Exception e) {
throw new HoodieException("Unable to do hoodie data table validation in " + cfg.basePath, e);
} finally {
if (asyncDataTableValidateService.isPresent()) {
asyncDataTableValidateService.get().shutdown(true);
}
}
}
private void doHoodieDataTableValidationOnce() {
try {
doDataTableValidation();
} catch (HoodieValidationException e) {
LOG.error("Data table validation failed due to HoodieValidationException", e);
if (!cfg.ignoreFailed) {
throw e;
}
}
}
private void doHoodieDataTableValidationContinuous() {
asyncDataTableValidateService.ifPresent(service -> {
service.start(null);
try {
service.waitForShutdown();
} catch (Exception e) {
throw new HoodieException(e.getMessage(), e);
}
});
}
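/**
* Performs the two checks against the data table:
* 1. No data files exist with a commit time earlier than the first instant of the active timeline.
* 2. For every completed instant, no extra files exist on storage beyond those recorded in its commit metadata.
*
* Throws {@link HoodieValidationException} on failure unless {@code --ignore-failed} is set.
*/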
public void doDataTableValidation() {
boolean finalResult = true;
metaClient.reloadActiveTimeline();
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
try {
HoodieTableMetadata tableMetadata = new FileSystemBackedTableMetadata(
engineContext, metaClient.getTableConfig(), metaClient.getStorage(), cfg.basePath);
List<StoragePath> allDataFilePaths =
HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
// verify that no data files present with commit time < earliest commit in active timeline.
if (metaClient.getActiveTimeline().firstInstant().isPresent()) {
String earliestInstant = metaClient.getActiveTimeline().firstInstant().get().requestedTime();
List<StoragePath> danglingFilePaths = allDataFilePaths.stream().filter(path -> {
String instantTime = FSUtils.getCommitTime(path.getName());
return compareTimestamps(instantTime, LESSER_THAN,
earliestInstant);
}).collect(Collectors.toList());
if (!danglingFilePaths.isEmpty()) {
LOG.error("Data table validation failed due to " + danglingFilePaths.size()
+ " dangling files found before the start of the active timeline");
danglingFilePaths.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
finalResult = false;
if (!cfg.ignoreFailed) {
throw new HoodieValidationException(
"Data table validation failed due to dangling files " + danglingFilePaths.size());
}
}
// Verify that for every completed commit in active timeline, there are no extra files found apart from what is present in
// commit metadata.
Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(
metaClient.getBasePath().toString(), allDataFilePaths);
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
List<HoodieInstant> hoodieInstants = activeTimeline.filterCompletedInstants().getInstants();
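// Distribute the per-instant check across Spark, one task per completed instant: any file on
// storage tagged to an instant but absent from that instant's commit metadata is dangling.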
List<String> danglingFiles = engineContext.flatMap(hoodieInstants, instant -> {
Option<Set<String>> filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline(
activeTimeline, instant);
List<String> baseAndLogFilesFromFs = instantToFilesMap.containsKey(instant.requestedTime())
? instantToFilesMap.get(instant.requestedTime()) : Collections.emptyList();
if (!baseAndLogFilesFromFs.isEmpty()) {
Set<String> danglingInstantFiles = new HashSet<>(baseAndLogFilesFromFs);
if (filesFromTimeline.isPresent()) {
danglingInstantFiles.removeAll(filesFromTimeline.get());
}
return new ArrayList<>(danglingInstantFiles).stream();
} else {
return Stream.empty();
}
}, hoodieInstants.size()).stream().collect(Collectors.toList());
if (!danglingFiles.isEmpty()) {
LOG.error("Data table validation failed due to extra files found for completed commits " + danglingFiles.size());
danglingFiles.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
finalResult = false;
if (!cfg.ignoreFailed) {
throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFiles.size());
}
}
}
} catch (Exception e) {
LOG.error("Data table validation failed due to " + e.getMessage(), e);
if (!cfg.ignoreFailed) {
throw new HoodieValidationException("Data table validation failed due to " + e.getMessage(), e);
}
}
if (finalResult) {
LOG.info("Data table validation succeeded.");
} else {
LOG.warn("Data table validation failed.");
}
}
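/**
* Async service that runs the data table validation in a loop, sleeping between runs so that
* consecutive validations start at least {@code --min-validate-interval-seconds} apart.
*/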
public class AsyncDataTableValidateService extends HoodieAsyncService {
private final transient ExecutorService executor = Executors.newSingleThreadExecutor();
@Override
protected Pair<CompletableFuture, ExecutorService> startService() {
return Pair.of(CompletableFuture.supplyAsync(() -> {
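// Validate, then sleep out whatever remains of the configured minimum interval, forever.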
while (true) {
try {
long start = System.currentTimeMillis();
doDataTableValidation();
long toSleepMs = cfg.minValidateIntervalSeconds * 1000 - (System.currentTimeMillis() - start);
if (toSleepMs > 0) {
LOG.info("Last validate ran less than min validate interval: " + cfg.minValidateIntervalSeconds + " s, sleep: "
+ toSleepMs + " ms.");
Thread.sleep(toSleepMs);
}
} catch (HoodieValidationException e) {
LOG.error("Shutting down AsyncDataTableValidateService due to HoodieValidationException", e);
if (!cfg.ignoreFailed) {
throw e;
}
} catch (InterruptedException e) {
// ignore InterruptedException here.
}
}
}, executor), executor);
}
}
}